# MLFLOW EXPERIMENT TRACKING

## SET UP ENVIRONMENT

In [1]:
# Print the version of the `python` found on the shell PATH (this may differ
# from the interpreter the kernel itself is running).
!python -V

Python 3.9.12


In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
# Silences every warning for the rest of the session, including library
# deprecation notices; narrow or remove this filter when debugging.
warnings.filterwarnings("ignore")

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

## SET UP MLFLOW

In [6]:
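# %%bash
# Left commented out: `mlflow ui` keeps running until it is interrupted, so start it from a separate terminal instead.
# mlflow ui --backend-store-uri sqlite:///mlflow.db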

[2022-06-18 21:37:45 +0700] [47213] [INFO] Starting gunicorn 20.1.0
[2022-06-18 21:37:45 +0700] [47213] [INFO] Listening at: http://127.0.0.1:5000 (47213)
[2022-06-18 21:37:45 +0700] [47213] [INFO] Using worker: sync
[2022-06-18 21:37:45 +0700] [47218] [INFO] Booting worker with pid: 47218
[2022-06-18 21:38:23 +0700] [47213] [INFO] Handling signal: int


Error while terminating subprocess (pid=47207): 


[2022-06-18 21:38:23 +0700] [47218] [INFO] Worker exiting (pid: 47218)

Aborted!
[2022-06-18 21:38:23 +0700] [47213] [INFO] Shutting down: Master


In [4]:
import mlflow

# Runs are recorded in a local SQLite database, grouped under one named
# experiment (set_experiment returns the Experiment, which the cell displays).
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "trip_prediction-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='./mlruns/6', experiment_id='6', lifecycle_stage='active', name='trip_prediction-experiment', tags={}>

## PREPARE DATASET

In [5]:
def read_dataframe(filename):
    """Load a green-taxi trip file and prepare it for modelling.

    Reads the parquet file, derives each trip's ``duration`` in minutes from
    the ``lpep_*`` pickup/dropoff timestamps, keeps only trips lasting between
    1 and 60 minutes (inclusive), and casts the pickup/dropoff location IDs
    to strings.

    Args:
        filename: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A ``pandas.DataFrame`` with the filtered trips and the added
        ``duration`` column.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes; avoids a Python-level call per row.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than a slice of ``df`` (assigning into the
    # slice is what raises pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [6]:
# Going by the file names, the 2021-01 file is used for training and the
# 2021-02 file for validation.
train_path = './data/green_tripdata_2021-01.parquet'
valid_path = './data/green_tripdata_2021-02.parquet'

In [7]:
# Load and clean both splits with the same preprocessing function.
df_train, df_val = read_dataframe(train_path), read_dataframe(valid_path)

In [8]:
# Row counts of each split as returned by read_dataframe (i.e. after its
# 1-60 minute duration filter).
print(f'train_dataset: {len(df_train)}')
print(f'valid_dataset: {len(df_val)}')

train_dataset: 73908
valid_dataset: 61921


## FEATURE ENGINEERING

In [14]:
# Combine pickup and dropoff zone into one "route" feature on both splits.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [15]:
# The combined route column replaces the separate 'PULocationID' and
# 'DOLocationID' columns as the categorical input.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training split only, then reuse the fitted
# feature mapping for the validation split.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [16]:
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

from pathlib import Path

# The pickled models and the preprocessor are written under ./models/, so
# create the directory up front (a no-op when it already exists).
model_path = Path('./models/')
model_path.mkdir(parents=True, exist_ok=True)

## MODEL TRAINING AND EXPERIMENT

### LINEAR REGRESSION - BASELINE

In [18]:
# Baseline: plain linear regression, tracked as its own MLflow run.
with mlflow.start_run(run_name="base_model"):

    mlflow.set_tag("developer", "surawut")
    mlflow.set_tag("model", "linear regression")

    # Log the paths as plain strings. Braces outside an f-string build a
    # one-element set, which MLflow would record as "{'./data/...'}"
    # instead of the path itself.
    mlflow.log_param("train_path", train_path)
    mlflow.log_param("valid_path", valid_path)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    mlflow.sklearn.log_model(lr, artifact_path="sklearn-model")

    y_pred = lr.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # Persist the model and the fitted DictVectorizer locally, then attach
    # both files to the run so the preprocessing travels with the model.
    with open('./models/linear_regression.bin', 'wb') as f_out:
        pickle.dump(lr, f_out)

    with open('./models/preprocessor.b', 'wb') as dv_out:
        pickle.dump(dv, dv_out)

    mlflow.log_artifact('./models/linear_regression.bin', artifact_path='models_pickle')
    mlflow.log_artifact('./models/preprocessor.b', artifact_path='preprocessor')

### LINEAR REGRESSION - LASSO

In [19]:
# Lasso (L1-regularised linear regression), tracked as its own MLflow run.
with mlflow.start_run(run_name="model_selection"):

    mlflow.set_tag("developer", "surawut")
    # Tag the run with the estimator actually trained here; the previous
    # value was copied from the baseline cell and misdescribed this run.
    mlflow.set_tag("model", "lasso")

    # Log the paths as plain strings. Braces outside an f-string build a
    # one-element set, which MLflow would record as "{'./data/...'}"
    # instead of the path itself.
    mlflow.log_param("train_path", train_path)
    mlflow.log_param("valid_path", valid_path)

    alpha = 0.1
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    mlflow.log_param("alpha", alpha)
    mlflow.sklearn.log_model(lr, artifact_path='sklearn_model')

    y_pred = lr.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # Persist the model and the fitted DictVectorizer locally, then attach
    # both files to the run so the preprocessing travels with the model.
    with open('./models/lasso.bin', 'wb') as f_out:
        pickle.dump(lr, f_out)

    with open('./models/preprocessor.b', 'wb') as dv_out:
        pickle.dump(dv, dv_out)

    mlflow.log_artifact('./models/lasso.bin', artifact_path='models_pickle')
    mlflow.log_artifact('./models/preprocessor.b', artifact_path='preprocessor')

### XGBOOST

In [20]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [21]:
# Wrap the vectorized features and targets in XGBoost DMatrix objects, the
# input type passed to xgb.train and Booster.predict in the cells below.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

### HYPEROPT FOR FINE-TUNING

In [22]:
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE to hyperopt.

    Every call opens its own MLflow run, logs ``params``, trains for up to
    1000 boosting rounds on ``train`` with early stopping (50 rounds) watched
    on ``valid``, and logs the resulting RMSE.

    Args:
        params: Hyperparameter dict sampled from the search space; handed
            unchanged to ``xgb.train``.

    Returns:
        Dict with ``loss`` (the validation RMSE that hyperopt minimises) and
        ``status`` set to ``STATUS_OK``.
    """
    with mlflow.start_run(run_name="fine tuning"):
        for tag_key, tag_value in (("developer", "surawut"), ("model", "hyperopt")):
            mlflow.set_tag(tag_key, tag_value)
        mlflow.log_params(params)

        watchlist = [(valid, 'validation')]
        candidate = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=50,
        )

        predictions = candidate.predict(valid)
        validation_rmse = mean_squared_error(y_val, predictions, squared=False)
        mlflow.log_metric("rmse", validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}

In [23]:
# Hyperopt search space. hp.loguniform(label, low, high) samples
# exp(uniform(low, high)), so the bounds below are exponents.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),      # integers 4..100
    'learning_rate': hp.loguniform('learning_rate', -3, 0),           # ~0.05 .. 1
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),                  # ~0.0067 .. 0.37
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),                # ~0.0025 .. 0.37
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),     # ~0.37 .. 20
    # 'reg:linear' is the deprecated name of this same squared-error objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run 10 TPE-guided trials; each trial is logged as its own MLflow run by
# objective().
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

[0]	validation-rmse:18.84759                          
[1]	validation-rmse:16.84099                          
[2]	validation-rmse:15.12949                          
[3]	validation-rmse:13.67865                          
[4]	validation-rmse:12.44926                          
[5]	validation-rmse:11.41718                          
[6]	validation-rmse:10.55411                          
[7]	validation-rmse:9.83176                           
[8]	validation-rmse:9.23514                           
[9]	validation-rmse:8.74410                           
[10]	validation-rmse:8.33857                          
[11]	validation-rmse:8.00520                          
[12]	validation-rmse:7.73236                          
[13]	validation-rmse:7.51069                          
[14]	validation-rmse:7.32968                          
[15]	validation-rmse:7.18104                          
[16]	validation-rmse:7.05946                          
[17]	validation-rmse:6.96044                          
[18]	valid

### XGBOOST WITH HYPERPARAMETERS TUNED BY HYPEROPT

In [24]:
# Retrain XGBoost with fixed hyperparameters and log the model, its pickle
# and the preprocessor to a single MLflow run.
with mlflow.start_run(run_name="tuned_hyper_params"):

    # Everything in this run is logged explicitly, so keep autologging off.
    mlflow.xgboost.autolog(disable=True)
    mlflow.set_tag("developer", "surawut")
    mlflow.set_tag("model", "xgboost")

    # NOTE(review): presumably copied from the best hyperopt trial above;
    # confirm against the MLflow UI.
    best_params = {
        'learning_rate': 0.130837160621154,
        'max_depth': 78,
        'min_child_weight': 1.2331058504757806,
        # 'reg:linear' is the deprecated name of this same squared-error objective.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.12390743789664009,
        'reg_lambda': 0.20084334664548684,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer and the booster locally, then attach
    # both files to the run.
    with open('./models/preprocessor.b', 'wb') as dv_out:
        pickle.dump(dv, dv_out)

    with open('./models/xgboost.bin', 'wb') as f_out:
        pickle.dump(booster, f_out)

    mlflow.log_artifact('./models/xgboost.bin', artifact_path='models_pickle')
    mlflow.log_artifact('./models/preprocessor.b', artifact_path='preprocessor')

    # Also log in MLflow's XGBoost flavor so the model can be reloaded with
    # mlflow.xgboost.load_model / mlflow.pyfunc.load_model.
    mlflow.xgboost.log_model(booster, artifact_path='models_xgboost')

[0]	validation-rmse:18.84759
[1]	validation-rmse:16.84099
[2]	validation-rmse:15.12949
[3]	validation-rmse:13.67865
[4]	validation-rmse:12.44926
[5]	validation-rmse:11.41718
[6]	validation-rmse:10.55411
[7]	validation-rmse:9.83176
[8]	validation-rmse:9.23514
[9]	validation-rmse:8.74410
[10]	validation-rmse:8.33857
[11]	validation-rmse:8.00520
[12]	validation-rmse:7.73236
[13]	validation-rmse:7.51069
[14]	validation-rmse:7.32968
[15]	validation-rmse:7.18104
[16]	validation-rmse:7.05946
[17]	validation-rmse:6.96044
[18]	validation-rmse:6.87649
[19]	validation-rmse:6.80804
[20]	validation-rmse:6.75185
[21]	validation-rmse:6.70400
[22]	validation-rmse:6.66231
[23]	validation-rmse:6.62794
[24]	validation-rmse:6.59977
[25]	validation-rmse:6.57550
[26]	validation-rmse:6.55541
[27]	validation-rmse:6.53720
[28]	validation-rmse:6.52171
[29]	validation-rmse:6.50789
[30]	validation-rmse:6.49716
[31]	validation-rmse:6.48705
[32]	validation-rmse:6.47836
[33]	validation-rmse:6.47064
[34]	validation-r

### MODEL SELECTION

- RandomForestRegressor
- GradientBoostingRegressor
- ExtraTreesRegressor
- LinearSVR

In [25]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Enable sklearn autologging once, before any run starts, instead of
# re-enabling it on every loop iteration.
mlflow.sklearn.autolog()

# Train each candidate estimator with default settings in its own MLflow run.
for model_class in (RandomForestRegressor,
                    GradientBoostingRegressor,
                    ExtraTreesRegressor,
                    LinearSVR):
    with mlflow.start_run(run_name="model_selection"):

        mlflow.set_tag("developer", "surawut")
        # Tag with the class name. The previous `{model_class()}` built a
        # throwaway estimator inside a set, so the tag read e.g.
        # "{RandomForestRegressor()}".
        mlflow.set_tag("model", model_class.__name__)

        # Log the paths as plain strings; `{train_path}` is a set literal,
        # not string interpolation.
        mlflow.log_param("train_path", train_path)
        mlflow.log_param("valid_path", valid_path)

        model = model_class()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

        # Write the preprocessor first and log it once afterwards; it was
        # previously also logged before being written, which uploaded
        # whatever file an earlier cell had left on disk.
        with open('./models/preprocessor.b', 'wb') as dv_out:
            pickle.dump(dv, dv_out)

        mlflow.log_artifact('./models/preprocessor.b', artifact_path='preprocessor')
        

## SELECT THE BEST MODEL FROM MLFLOW AND EVALUATE IT

## LOAD MODEL FOR PREDICTION

In [28]:
# MLflow URI of a model logged under the 'models_xgboost' artifact path.
# The run ID is specific to the local tracking store; replace it with the ID
# of the run selected in the MLflow UI.
logged_model = 'runs:/c1db82cc4b45493a98c63bf0c8cc89e4/models_xgboost'

In [29]:
# Load the logged model through MLflow's generic pyfunc interface and
# display its metadata.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model



mlflow.pyfunc.loaded_model:
  artifact_path: models_xgboost
  flavor: mlflow.xgboost
  run_id: c1db82cc4b45493a98c63bf0c8cc89e4

In [30]:
# Load the same artifact with the XGBoost flavor to get the native Booster.
# NOTE(review): the variable name `xgboost` is also the library's module name
# (imported in this notebook as `xgb`); a more specific name would be clearer.
xgboost = mlflow.xgboost.load_model(logged_model)
xgboost



<xgboost.core.Booster at 0x284b70f70>

In [31]:
# Predict on the validation DMatrix and preview the first ten predicted
# durations (minutes, the unit of the `duration` target).
y_pred = xgboost.predict(valid)
y_pred[:10]

array([14.730225,  6.844986, 15.907544, 24.345303,  9.241478, 17.365046,
       10.806662,  8.045458,  9.816886, 19.79216 ], dtype=float32)