# Baseline linear regression model

## Download the data



In [10]:
import requests
import datetime

import pandas as pd
import zipfile

import mlflow
import os


from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [6]:
# Point the MLflow client at the remote tracking server.
# The host defaults to the external IP reserved in GCP; set the
# TRACKING_SERVER_HOST environment variable to target a different server
# without editing the notebook. An unset or empty variable falls back to
# the default so the resulting URI is never malformed.
TRACKING_SERVER_HOST = os.environ.get("TRACKING_SERVER_HOST") or "34.68.82.207"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

# Echo the URI the client actually resolved, as a quick sanity check.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://34.68.82.207:5000'


In [7]:
# Relative path to the parquet file that the training cell reads with
# pd.read_parquet and records as the `data_file` run parameter.
data_file = "../data/processed/202304-usage.parquet"

In [8]:
# Make "experiment-3" the active MLflow experiment for the runs started below.
# As the logged output shows, MLflow creates the experiment when no experiment
# with this name exists yet.
mlflow.set_experiment("experiment-3")

2024/07/24 13:48:32 INFO mlflow.tracking.fluent: Experiment with name 'experiment-3' does not exist. Creating a new experiment.


<Experiment: artifact_location='gs://mlops-divvy-experiment-tracking/mlruns/3', creation_time=1721846912312, experiment_id='3', last_update_time=1721846912312, lifecycle_stage='active', name='experiment-3', tags={}>

# Model training


In [11]:
# Train the baseline linear regression inside a single MLflow run so that the
# input file, the split settings, the test metric and the fitted model are all
# recorded against the same run.
with mlflow.start_run():
    # Load the prepared data and record which file this run was trained on.
    df = pd.read_parquet(data_file)
    mlflow.log_param("data_file", data_file)

    features = df[['station_name', 'hour', 'day_of_week']]
    target = df['net_usage']

    num_features = ['hour']
    cat_features = ['station_name', 'day_of_week']

    # Log the split settings so the train/test partition can be reproduced
    # from the run metadata alone.
    split_params = {"test_size": 0.2, "random_state": 42}
    mlflow.log_params(split_params)
    X_train, X_test, y_train, y_test = train_test_split(features, target, **split_params)

    # Scale the numeric feature. The scaler is fitted on the training split
    # only and then applied to the test split, so no test statistics leak in.
    # The original row index is kept so the frames align in the concat below.
    Standard_Scaler = StandardScaler()
    num_scaled_train = pd.DataFrame(
        Standard_Scaler.fit_transform(X_train[num_features]),
        columns=['hour_scaled'],
        index=X_train.index,
    )
    num_scaled_test = pd.DataFrame(
        Standard_Scaler.transform(X_test[num_features]),
        columns=['hour_scaled'],
        index=X_test.index,
    )

    # One-hot encode the categorical features. The encoder is fitted exactly
    # once, on the training split; categories that appear only in the test
    # split are encoded as all zeros (handle_unknown='ignore').
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(X_train[cat_features])
    ohe_feature_names = ohe.get_feature_names_out(cat_features)

    ohe_cols_train = pd.DataFrame(
        ohe.transform(X_train[cat_features]).toarray(),
        columns=ohe_feature_names,
        index=X_train.index,
    )
    ohe_cols_test = pd.DataFrame(
        ohe.transform(X_test[cat_features]).toarray(),
        columns=ohe_feature_names,
        index=X_test.index,
    )

    # Assemble the design matrices: scaled numeric column first, then the
    # one-hot columns, joined on the shared row index.
    transformed_X_train = pd.concat([num_scaled_train, ohe_cols_train], axis=1)
    transformed_X_test = pd.concat([num_scaled_test, ohe_cols_test], axis=1)

    lr = LinearRegression().fit(transformed_X_train, y_train)

    # Evaluate on the held-out split and record the metric on the run.
    y_pred = lr.predict(transformed_X_test)
    mse = mean_squared_error(y_test, y_pred)
    mlflow.log_metric("mse", mse)

    # NOTE(review): only the regressor is logged here; the fitted scaler and
    # encoder are not persisted with it, so anything loading this artifact
    # must reproduce the preprocessing above before calling predict.
    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

# Last expression of the cell: display the experiments known to the server.
mlflow.search_experiments()

default artifacts URI: 'gs://mlops-divvy-experiment-tracking/mlruns/3/80a9e48d19af4273a5c9c54f04ac1b84/artifacts'


[<Experiment: artifact_location='gs://mlops-divvy-experiment-tracking/mlruns/3', creation_time=1721846912312, experiment_id='3', last_update_time=1721846912312, lifecycle_stage='active', name='experiment-3', tags={}>,
 <Experiment: artifact_location='gs://mlops-divvy-experiment-tracking/mlruns/2', creation_time=1721227750156, experiment_id='2', last_update_time=1721227750156, lifecycle_stage='active', name='experiment-2', tags={}>,
 <Experiment: artifact_location='gs://mlops-divvy-experiment-tracking/mlruns/1', creation_time=1721225054205, experiment_id='1', last_update_time=1721225054205, lifecycle_stage='active', name='experiment-1', tags={}>,
 <Experiment: artifact_location='gs://mlops-divvy-experiment-tracking/mlruns/0', creation_time=1721165169497, experiment_id='0', last_update_time=1721165169497, lifecycle_stage='active', name='Default', tags={}>]