# MLFLOW EXPERIMENT TRACKING

## SET UP ENVIRONMENT

## Scenario 3: Multiple data scientists working on multiple ML models

MLflow setup:
* Tracking server: yes, remote server (EC2).
* Backend store: postgresql database.
* Artifacts store: s3 bucket.

The experiments can be explored by accessing the remote server.

The example uses AWS to host a remote server. To run it you'll need an AWS account. Follow the steps described in the file `mlflow_on_aws.md` to create a new AWS account and launch the tracking server. 

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

## SET UP MLFLOW

In [6]:
import os

# Imported in this cell so it runs on a fresh kernel; the notebook otherwise
# first imports mlflow in a later cell, which made this cell raise NameError.
import mlflow

# AWS credentials profile (the artifact store for this scenario is an S3 bucket).
os.environ["AWS_PROFILE"] = "" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

TRACKING_SERVER_HOST = "" # fill in with the public DNS of the EC2 instance
# Point the MLflow client at the remote tracking server (port 5000).
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

[2022-06-18 21:37:45 +0700] [47213] [INFO] Starting gunicorn 20.1.0
[2022-06-18 21:37:45 +0700] [47213] [INFO] Listening at: http://127.0.0.1:5000 (47213)
[2022-06-18 21:37:45 +0700] [47213] [INFO] Using worker: sync
[2022-06-18 21:37:45 +0700] [47218] [INFO] Booting worker with pid: 47218
[2022-06-18 21:38:23 +0700] [47213] [INFO] Handling signal: int


Error while terminating subprocess (pid=47207): 


[2022-06-18 21:38:23 +0700] [47218] [INFO] Worker exiting (pid: 47218)

Aborted!
[2022-06-18 21:38:23 +0700] [47213] [INFO] Shutting down: Master


In [5]:
import mlflow

# Backend store and experiment used by every run logged below.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "taxi_trip_prediction-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
# Last expression of the cell, so the resulting Experiment object is displayed.
mlflow.set_experiment(EXPERIMENT_NAME)

2022/07/09 20:56:56 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2022/07/09 20:56:56 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='taxi_trip_prediction-experiment', tags={}>

## PREPARE DATASET

In [6]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps:
      1. Parse the pickup and drop-off columns as datetimes.
      2. Add a ``duration`` column holding the trip length in minutes.
      3. Keep only trips lasting between 1 and 60 minutes (inclusive).
      4. Cast the pickup/drop-off location ids to strings so they can be
         treated as categorical features.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: the column assignment below
    # would otherwise write into a slice of the unfiltered frame
    # (SettingWithCopyWarning, silenced by the notebook's warning filter).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [7]:
# Parquet files for the two splits (per the file names: 2021-01 for training,
# 2021-02 for validation). These path strings are also logged as run params.
train_path = './data/green_tripdata_2021-01.parquet'
valid_path = './data/green_tripdata_2021-02.parquet'

In [8]:
# Read and clean both splits with the same preprocessing function.
df_train, df_val = (read_dataframe(path) for path in (train_path, valid_path))

In [9]:
# Row counts of each split after filtering, one per line.
print(
    f'train_dataset: {len(df_train)}',
    f'valid_dataset: {len(df_val)}',
    sep='\n',
)

train_dataset: 73908
valid_dataset: 61921


## FEATURE ENGINEERING

In [10]:
# Combine pickup and drop-off location ids into one "<pickup>_<dropoff>" route
# feature on both splits.
for _split in (df_train, df_val):
    _split['PU_DO'] = _split['PULocationID'] + '_' + _split['DOLocationID']

In [11]:
# Feature columns fed to the vectorizer: one categorical, one numerical.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

# One dict per row, as expected by DictVectorizer.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vocabulary on the training split only, then reuse it for validation.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [12]:
# Regression target: trip duration in minutes, extracted as plain arrays.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [14]:
from pathlib import Path

# Local folder that receives the pickled models and preprocessor before they
# are logged to MLflow as artifacts; created (with parents) if missing.
model_path = Path('.') / 'mlflow' / 'models'
model_path.mkdir(exist_ok=True, parents=True)

## MODEL TRAINING AND EXPERIMENT

### LINEAR REGRESSION - BASELINE

In [16]:
# Baseline run: plain linear regression on the vectorized features.
with mlflow.start_run(run_name="base_model"):

    mlflow.set_tag("developer", "surawut")
    mlflow.set_tag("model", "linear regession")

    # Log the path strings themselves. Wrapping them in braces built a
    # one-element set, so the stored value was "{'./data/...'}".
    mlflow.log_param("train_path", train_path)
    mlflow.log_param("valid_path", valid_path)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    mlflow.sklearn.log_model(lr, artifact_path="sklearn-model")

    y_pred = lr.predict(X_val)

    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in
    # scikit-learn 1.4 and removed in 1.6, so avoid depending on it.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Persist the model and the fitted vectorizer locally, then attach both
    # files to the run as artifacts.
    with open('./mlflow/models/linear_regression.bin', 'wb') as f_out:
        pickle.dump(lr, f_out)

    with open('./mlflow/models/preprocessor.b', 'wb') as dv_out:
        pickle.dump(dv, dv_out)

    mlflow.log_artifact('./mlflow/models/linear_regression.bin', artifact_path='models_pickle')
    mlflow.log_artifact('./mlflow/models/preprocessor.b', artifact_path='preprocessor')

### LINEAR REGRESSION - LASSO

In [18]:
# Lasso run: L1-regularised linear regression with a fixed alpha.
with mlflow.start_run(run_name="model_selection"):

    mlflow.set_tag("developer", "surawut")
    mlflow.set_tag("model", "linear regession")

    # Log the path strings themselves. Wrapping them in braces built a
    # one-element set, so the stored value was "{'./data/...'}".
    mlflow.log_param("train_path", train_path)
    mlflow.log_param("valid_path", valid_path)

    alpha = 0.1
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    mlflow.log_param("alpha", alpha)
    mlflow.sklearn.log_model(lr, artifact_path='sklearn_model')

    y_pred = lr.predict(X_val)

    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in
    # scikit-learn 1.4 and removed in 1.6, so avoid depending on it.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Persist the model and the fitted vectorizer locally, then attach both
    # files to the run as artifacts.
    with open('./mlflow/models/lasso.bin', 'wb') as f_out:
        pickle.dump(lr, f_out)

    with open('./mlflow/models/preprocessor.b', 'wb') as dv_out:
        pickle.dump(dv, dv_out)

    mlflow.log_artifact('./mlflow/models/lasso.bin', artifact_path='models_pickle')
    mlflow.log_artifact('./mlflow/models/preprocessor.b', artifact_path='preprocessor')

### XGBOOST

In [19]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [20]:
# Wrap the vectorized features and duration targets in DMatrix objects, the
# input format passed to xgb.train / Booster.predict in the cells below.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

### HYPER OPT FOR FINE TUNING 

In [21]:
def objective(params):
    """Hyperopt objective: train one XGBoost model and report its validation RMSE.

    Each call opens its own MLflow run, logs the sampled parameters and the
    resulting ``rmse`` metric, and trains on the module-level ``train``
    DMatrix while evaluating on ``valid``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the search space.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}`` as expected by
        ``hyperopt.fmin``.
    """
    with mlflow.start_run(run_name="fine tuning"):
        mlflow.set_tag("developer", "surawut")
        mlflow.set_tag("model", "hyperopt")
        mlflow.log_params(params)

        # Up to 1000 rounds, stopping once the validation metric has not
        # improved for 50 consecutive rounds.
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        y_pred = booster.predict(valid)
        # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in
        # scikit-learn 1.4 and removed in 1.6, so avoid depending on it.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [22]:
# Search space sampled by hyperopt. hp.loguniform(label, low, high) draws
# exp(uniform(low, high)), so e.g. learning_rate lies in [exp(-3), exp(0)].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias for the same squared-error objective;
    # use the current name to avoid a deprecation warning on every booster.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run 10 TPE-guided trials; each trial is logged as its own MLflow run by
# `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

[0]	validation-rmse:8.89696                           
[1]	validation-rmse:6.90666                           
[2]	validation-rmse:6.60273                           
[3]	validation-rmse:6.52069                           
[4]	validation-rmse:6.50768                           
[5]	validation-rmse:6.49449                           
[6]	validation-rmse:6.48705                           
[7]	validation-rmse:6.47447                           
[8]	validation-rmse:6.46961                           
[9]	validation-rmse:6.46218                           
[10]	validation-rmse:6.46143                          
[11]	validation-rmse:6.45725                          
[12]	validation-rmse:6.45400                          
[13]	validation-rmse:6.44865                          
[14]	validation-rmse:6.44326                          
[15]	validation-rmse:6.43999                          
[16]	validation-rmse:6.43487                          
[17]	validation-rmse:6.43368                          
[18]	valid

### XGBOOST WITH HYPERPARAMETERS TUNED BY HYPEROPT

In [31]:
# Final XGBoost run using a fixed parameter set.
with mlflow.start_run(run_name="tuned_hyper_params"):

    # Autologging is switched off; params, metric and model are logged
    # explicitly below.
    mlflow.xgboost.autolog(disable=True)
    mlflow.set_tag("developer", "surawut")
    mlflow.set_tag("model", "xgboost")

    # NOTE(review): presumably copied from the best hyperopt trial - confirm.
    best_params = {
        'learning_rate': 0.2053232728477489,
        'max_depth': 49,
        'min_child_weight': 2.744970249660763,
        # 'reg:linear' is a deprecated alias for the same squared-error
        # objective; use the current name.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.012966352401600332,
        'reg_lambda': 0.16922535923434157,
        'seed': 42
    }

    mlflow.log_params(best_params)

    # Up to 1000 rounds, stopping once the validation metric has not improved
    # for 50 consecutive rounds.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in
    # scikit-learn 1.4 and removed in 1.6, so avoid depending on it.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer and the booster locally, then attach both
    # files to the run as artifacts.
    with open('./mlflow/models/preprocessor.b', 'wb') as dv_out:
        pickle.dump(dv, dv_out)

    with open('./mlflow/models/xgboost.bin', 'wb') as f_out:
        pickle.dump(booster, f_out)

    mlflow.log_artifact('./mlflow/models/xgboost.bin', artifact_path='models_pickle')
    mlflow.log_artifact('./mlflow/models/preprocessor.b', artifact_path='preprocessor')

    # Also log the booster in MLflow's native XGBoost flavor so it can be
    # loaded back with mlflow.xgboost / mlflow.pyfunc.
    mlflow.xgboost.log_model(booster, artifact_path='models_xgboost')

[0]	validation-rmse:17.54287
[1]	validation-rmse:14.72296
[2]	validation-rmse:12.57072
[3]	validation-rmse:10.95021
[4]	validation-rmse:9.74298
[5]	validation-rmse:8.85627
[6]	validation-rmse:8.21168
[7]	validation-rmse:7.74695
[8]	validation-rmse:7.41107
[9]	validation-rmse:7.17202
[10]	validation-rmse:6.99769
[11]	validation-rmse:6.87153
[12]	validation-rmse:6.77912
[13]	validation-rmse:6.70846
[14]	validation-rmse:6.65632
[15]	validation-rmse:6.61415
[16]	validation-rmse:6.58222
[17]	validation-rmse:6.55632
[18]	validation-rmse:6.53600
[19]	validation-rmse:6.52031
[20]	validation-rmse:6.50897
[21]	validation-rmse:6.49880
[22]	validation-rmse:6.49085
[23]	validation-rmse:6.48648
[24]	validation-rmse:6.47979
[25]	validation-rmse:6.47616
[26]	validation-rmse:6.47225
[27]	validation-rmse:6.46930
[28]	validation-rmse:6.46560
[29]	validation-rmse:6.46267
[30]	validation-rmse:6.45940
[31]	validation-rmse:6.45636
[32]	validation-rmse:6.45441
[33]	validation-rmse:6.45184
[34]	validation-rmse

### MODEL SELECTION

- RandomForestRegressor
- GradientBoostingRegressor
- ExtraTreesRegressor
- LinearSVR

In [24]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Enable scikit-learn autologging once; it does not need to be re-enabled
# inside every run.
mlflow.sklearn.autolog()

# The fitted vectorizer is the same for every candidate model, so serialise it
# once up front instead of rewriting the file on each iteration.
with open('./mlflow/models/preprocessor.b', 'wb') as dv_out:
    pickle.dump(dv, dv_out)

# Train each candidate with its default hyperparameters in its own MLflow run.
for model_class in tqdm((RandomForestRegressor,
                   GradientBoostingRegressor,
                   ExtraTreesRegressor,
                   LinearSVR), desc="Training..."):
    with mlflow.start_run(run_name="model_selection"):

        mlflow.set_tag("developer", "surawut")
        # Tag with the class name. The previous value was a set wrapping a
        # throwaway estimator instance, which logged as "{Estimator()}".
        mlflow.set_tag("model", model_class.__name__)

        # Log the path strings themselves rather than one-element sets.
        mlflow.log_param("train_path", train_path)
        mlflow.log_param("valid_path", valid_path)

        model = model_class()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in
        # scikit-learn 1.4 and removed in 1.6, so avoid depending on it.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

        # Attach the preprocessor once per run (it was previously uploaded
        # twice: before and after training).
        mlflow.log_artifact('./mlflow/models/preprocessor.b', artifact_path='preprocessor')
        

Training...: 100%|██████████| 4/4 [27:18<00:00, 409.55s/it]


## SELECT THE BEST MODEL FROM MLFLOW AND EVALUATE IT

## LOAD MODEL FOR PREDICTION

In [32]:
# MLflow "runs:/<run_id>/<artifact_path>" URI of the model to load. The
# artifact path matches the one used for mlflow.xgboost.log_model above; the
# run id is hard-coded, so replace it with a run id from your own tracking store.
logged_model = 'runs:/3cd71efdccd4468f85d0278e8b062bea/models_xgboost'

In [33]:
# Load the logged model through MLflow's generic pyfunc interface; the bare
# name on the last line displays the loaded model's metadata.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model



mlflow.pyfunc.loaded_model:
  artifact_path: models_xgboost
  flavor: mlflow.xgboost
  run_id: 3cd71efdccd4468f85d0278e8b062bea

In [34]:
# Load the same artifact with the native XGBoost flavor, which returns the
# underlying Booster. Note: the variable is named `xgboost`, like the library
# itself (imported in this notebook under the alias `xgb`).
xgboost = mlflow.xgboost.load_model(logged_model)
xgboost



<xgboost.core.Booster at 0x154264fa0>

In [35]:
# Predict on the validation DMatrix with the reloaded Booster and show the
# first ten predictions.
y_pred = xgboost.predict(valid)
y_pred[:10]

array([15.052768,  6.983479, 16.258198, 24.636147,  9.48605 , 17.167553,
       10.363366,  7.713389,  9.043783, 20.177074], dtype=float32)