In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import mlflow

In [2]:
## switch to the project root path
os.chdir("C:\\_mlops\\vgp_predict_lab\\vgp-predict")

# e.g. City_id = 1, using selected data

## Step 1: get selected_df-data from csv

In [3]:
selected_df = pd.read_csv("./data/processed/filtered_data_city1.csv", sep='\t', header=0)
selected_df = selected_df[selected_df["city_id"]==1]
selected_df.head(3)

Unnamed: 0,date_event,type,value,city_id,timestamp,date,time,weekday,tdelta,tdelta_min
0,2023-07-04 22:12:39,0,0.105,1,2023-07-04 22:12:39,2023-07-04,22:12:39,2,0 days 00:00:02,0.033333
1,2023-07-10 12:25:05,0,0.020354,1,2023-07-10 12:25:05,2023-07-10,12:25:05,1,0 days 05:47:00,347.0
2,2023-07-10 15:17:35,0,0.105,1,2023-07-10 15:17:35,2023-07-10,15:17:35,1,0 days 02:52:30,172.5


## Step 2: Compute hour-delta index and mean congestion length by hour-delta

In [4]:
## convert timestamp to int
selected_df['unix_ts'] = pd.to_datetime(selected_df.timestamp).apply(lambda x: int(x.timestamp())).astype(int)

## datetime from 2023/7/11 (1689082398)
selected_df = selected_df[selected_df['unix_ts']>=1689082398].reset_index(drop=True)

## compute the hour-delta-index from the first row 
selected_df['hour_index'] = selected_df['unix_ts']//3600
selected_df['hour_index'] = selected_df['hour_index'] - selected_df['hour_index'].min()
# print(selected_df.head(3))

## calculate mean congestion length in hour (we can have multiple hour-delta-index=1, so we need mean value)
df = selected_df[['hour_index', 'value']]
df = df.groupby(['hour_index']).mean().reset_index()

In [5]:
## take a look "df"
df

Unnamed: 0,hour_index,value
0,0,0.187787
1,1,0.404619
2,2,0.441498
3,3,0.724283
4,4,0.193599
...,...,...
266,347,0.218163
267,348,0.558570
268,352,0.262792
269,353,0.315643


## Step 3: Padding the missing "hour_index" with mean_value=0 for selected_df

In [8]:
pad_df = []
for h in  np.arange(0, df.hour_index.max()): # possible hour index
    matching_row = df[df['hour_index'] == h]
    ## if missing h-delta, set v = 0, else get v value
    v = matching_row['value'].values[0] if not matching_row.empty else 0
    ## calculate weekday with hard coded formular according to initial hour
    day = ((h - 11) // 24 + 2) % 7 + 1 
    ## insert h(hour delta), v(mean congestion length), day(weekday) in list "pad_df"
    pad_df.append([h, v, day]) # hour and value

## convert list to dataframe 
pad_df = pd.DataFrame(pad_df, columns = ['hour_index', 'mean_value', 'day'])
pad_df

Unnamed: 0,hour_index,mean_value,day
0,0,0.187787,2
1,1,0.404619,2
2,2,0.441498,2
3,3,0.724283,2
4,4,0.193599,2
...,...,...,...
349,349,0.000000,3
350,350,0.000000,3
351,351,0.000000,3
352,352,0.262792,3


## Step 4: Re-arrange X and y with autoRegression strategy.
### Use last 24h data (mean value) to predict actual mean value.

In [9]:
feat_df = pad_df.copy()

## last 24h data
for i in range(1, 25):
    new_column_name = f'value_shift_{i}'
    feat_df[new_column_name] = feat_df['mean_value'].shift(i)
feat_df = feat_df.dropna().reset_index()
feat_df = pd.get_dummies(feat_df, columns=['day'], dtype=int)
feat_df

Unnamed: 0,index,hour_index,mean_value,value_shift_1,value_shift_2,value_shift_3,value_shift_4,value_shift_5,value_shift_6,value_shift_7,...,value_shift_22,value_shift_23,value_shift_24,day_1,day_2,day_3,day_4,day_5,day_6,day_7
0,24,24,0.365277,0.295500,0.333460,0.314829,0.402290,0.389791,0.330464,0.330314,...,0.441498,0.404619,0.187787,0,0,1,0,0,0,0
1,25,25,0.391379,0.365277,0.295500,0.333460,0.314829,0.402290,0.389791,0.330464,...,0.724283,0.441498,0.404619,0,0,1,0,0,0,0
2,26,26,0.440046,0.391379,0.365277,0.295500,0.333460,0.314829,0.402290,0.389791,...,0.193599,0.724283,0.441498,0,0,1,0,0,0,0
3,27,27,0.362597,0.440046,0.391379,0.365277,0.295500,0.333460,0.314829,0.402290,...,0.154464,0.193599,0.724283,0,0,1,0,0,0,0
4,28,28,0.243996,0.362597,0.440046,0.391379,0.365277,0.295500,0.333460,0.314829,...,0.214764,0.154464,0.193599,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,349,349,0.000000,0.558570,0.218163,0.428380,0.177547,0.212264,0.266966,0.244087,...,0.000000,0.000000,0.396077,0,0,1,0,0,0,0
326,350,350,0.000000,0.000000,0.558570,0.218163,0.428380,0.177547,0.212264,0.266966,...,0.297351,0.000000,0.000000,0,0,1,0,0,0,0
327,351,351,0.000000,0.000000,0.000000,0.558570,0.218163,0.428380,0.177547,0.212264,...,0.523222,0.297351,0.000000,0,0,1,0,0,0,0
328,352,352,0.262792,0.000000,0.000000,0.000000,0.558570,0.218163,0.428380,0.177547,...,0.294434,0.523222,0.297351,0,0,1,0,0,0,0


In [10]:
X = feat_df.iloc[:, 3:].values
X.shape

(330, 31)

In [11]:
y = feat_df['mean_value'].values
y.shape

(330,)

## Step 5: Launch MLFlow UI

With your conda or terminal (where mlflow installed), tape:

`mlflow ui --default-artifact-root=file:mlruns` 

to launch MLFlow ui server --> so you can set, remote_server_uri = "http://127.0.0.1:5000/" (for exemple)

## Step 6: Start Training

In [20]:
## mlflow_config:
artifacts_dir = "artifacts"
experiment_name = "my_experiment"
run_name = "linear_regression"
registered_model_name = "linear_regression_model"
remote_server_uri = "http://127.0.0.1:5000/"

In [21]:
## start mlflow service by creating experiment
try: 
    mlflow.create_experiment(experiment_name)
except:
    mlflow.set_experiment(experiment_name)

In [22]:
## set remote_server_uri, here you can use a local server or a remote server(recommanded if collaborate)
mlflow.set_tracking_uri(remote_server_uri)

## get all launched runs (in the beginning it will be None or zero)
runs = mlflow.search_runs(experiment_names=[experiment_name])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.mae,params.fit_intercept,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.git.commit
0,7ea774e559ad480dab4220af24e54f55,962530624777546567,FINISHED,file:///C:/_mlops/vgp_predict_lab/vgp-predict/...,2023-09-07 14:00:06.073000+00:00,2023-09-07 14:00:12.026000+00:00,0.095324,False,C:\_mlops\vgp_predict_lab\vgp-predict\src/mode...,LOCAL,"[{""run_id"": ""7ea774e559ad480dab4220af24e54f55""...",redlab,linear_regression,a57573d6a85318f56f6161aaefd2f2cbb12c66a6
1,90916f14ff644f6d9e517862aa7b08cf,962530624777546567,FINISHED,file:///C:/_mlops/vgp_predict_lab/vgp-predict/...,2023-09-07 07:57:05.816000+00:00,2023-09-07 07:57:11.584000+00:00,0.095324,False,C:\_mlops\vgp_predict_lab\vgp-predict\src/mode...,LOCAL,"[{""run_id"": ""90916f14ff644f6d9e517862aa7b08cf""...",redlab,linear_regression,a57573d6a85318f56f6161aaefd2f2cbb12c66a6
2,4976ab14d90e4f2da78ad0296b834273,962530624777546567,FINISHED,file:///C:/_mlops/vgp_predict_lab/vgp-predict/...,2023-09-06 16:47:23.772000+00:00,2023-09-06 16:47:29.358000+00:00,0.095266,True,C:\_mlops\vgp_predict_lab\vgp-predict\src/mode...,LOCAL,"[{""run_id"": ""4976ab14d90e4f2da78ad0296b834273""...",redlab,linear_regression,a57573d6a85318f56f6161aaefd2f2cbb12c66a6
3,169422d1f26e491abb6a537e47d9f4ed,962530624777546567,FINISHED,file:///C:/_mlops/vgp_predict_lab/vgp-predict/...,2023-09-06 16:44:23.720000+00:00,2023-09-06 16:44:27.157000+00:00,0.095266,True,C:\_mlops\vgp_predict_lab\vgp-predict\src/mode...,LOCAL,"[{""run_id"": ""169422d1f26e491abb6a537e47d9f4ed""...",redlab,linear_regression,a57573d6a85318f56f6161aaefd2f2cbb12c66a6
4,18c18eb28a0643cd99941d37aa8a93b4,962530624777546567,FINISHED,file:///C:/_mlops/vgp_predict_lab/vgp-predict/...,2023-09-06 16:38:27.366000+00:00,2023-09-06 16:38:32.943000+00:00,0.095266,True,C:\_mlops\vgp_predict_lab\vgp-predict\src/mode...,LOCAL,"[{""run_id"": ""18c18eb28a0643cd99941d37aa8a93b4""...",redlab,linear_regression,a57573d6a85318f56f6161aaefd2f2cbb12c66a6
5,49f75ae5341049cdbffcb1bb04942424,962530624777546567,FINISHED,file:///C:/_mlops/vgp_predict_lab/vgp-predict/...,2023-09-06 14:33:58.324000+00:00,2023-09-06 14:34:01.799000+00:00,0.095266,True,C:\_mlops\vgp_predict_lab\vgp-predict\src/mode...,LOCAL,"[{""run_id"": ""49f75ae5341049cdbffcb1bb04942424""...",redlab,linear_regression,a57573d6a85318f56f6161aaefd2f2cbb12c66a6


In [23]:
print(mlflow.tracking.MlflowClient().get_experiment_by_name(experiment_name))

<Experiment: artifact_location='file:///C:/_mlops/vgp_predict_lab/vgp-predict/mlruns/962530624777546567', creation_time=1694009898447, experiment_id='962530624777546567', last_update_time=1694009898447, lifecycle_stage='active', name='my_experiment', tags={}>


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from urllib.parse import urlparse

## prepare train test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
size_X_train = X_train.shape[0]

################### MLFLOW ###############################
with mlflow.start_run(run_name=run_name) as mlops_run:
    ## initialize the LR model
    model = LinearRegression(fit_intercept=True)  # fit_intercept=True == bias
    ## train the model
    model.fit(X_train, y_train)
    ## predict with test set
    y_pred = model.predict(X_test)*
    ## get model score
    mae = mean_absolute_error(y_test, y_pred)
    print("Mean Absolute Error on Test Set : %.4f \n"%mae)
    
    ## rigistry mae score in mlflow 
    mlflow.log_metric("mae", mae)
    
    ########### RF model ###########
#     model = RandomForestClassifier(max_depth=max_depth,n_estimators=n_estimators)
#     model.fit(train_x, train_y)
#     y_pred = model.predict(test_x)
#     accuracy,precision,recall,f1score = accuracymeasures(test_y,y_pred,'weighted')
#     mlflow.log_param("max_depth",max_depth)
#     mlflow.log_param("n_estimators", n_estimators)
#     mlflow.log_metric("accuracy", accuracy)
#     mlflow.log_metric("precision", precision)
#     mlflow.log_metric("recall", recall)
#     mlflow.log_metric("f1_score", f1score)

    ## save the model in MlFlow
    tracking_url_type_store = urlparse(mlflow.get_artifact_uri()).scheme
    if tracking_url_type_store == "file":
        mlflow.sklearn.log_model(
            model, 
            "model", 
            registered_model_name=registered_model_name)
    else:
        mlflow.sklearn.load_model(model, "model")
        
mlflow.end_run()

Mean Absolute Error on Test Set : 0.0953 



Registered model 'linear_regression_model' already exists. Creating a new version of this model...
2023/11/23 14:35:54 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: linear_regression_model, version 8
Created version '8' of model 'linear_regression_model'.
