# Duration Prediction

In [1]:
# Load dependencies
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb
import mlflow
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from config import PATH_DATA_GT_JAN, PATH_DATA_GT_FEB
from src.load_data import load_data
from src.preprocess_data import preprocess

In [2]:
# Point MLflow at the SQLite tracking database and select the experiment to log under.
# set_experiment stays the last expression so the cell still displays the Experiment object.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc_taxi_experiment")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc_taxi_experiment', tags={}>

In [3]:
# Read the training frame (PATH_DATA_GT_JAN) and the validation frame (PATH_DATA_GT_FEB), in that order
df_train, df_val = (
    load_data(source_path)
    for source_path in (PATH_DATA_GT_JAN, PATH_DATA_GT_FEB)
)

In [4]:
# Feature engineering and vectorization

# Combine pick-up and drop-off location into a single "<PU>_<DO>" feature on both frames.
# NOTE(review): the `+ '_' +` concatenation presumably relies on load_data returning both
# ID columns as strings -- confirm.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

# Columns fed to the vectorizer
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One dict per row, the input format DictVectorizer consumes
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training rows only; validation rows are transformed with the fitted one
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Target vectors
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [5]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [6]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Parameters
    ----------
    params : dict
        Booster parameters for this trial. They are logged to MLflow and
        passed to ``xgb.train`` unchanged.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` where ``rmse`` is the RMSE of
        the booster's predictions on the validation set.

    Notes
    -----
    Reads the module-level ``train`` and ``valid`` DMatrix objects and the
    ``y_val`` target vector.
    """
    # Each trial is recorded as its own MLflow run
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # Take the square root of the MSE instead of passing `squared=False`:
        # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [7]:
# Define hyperparameter search space: http://hyperopt.github.io/hyperopt/getting-started/search_spaces/
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),  # int(Returns a value like round(uniform(low, high) / q) * q)
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # exp(-3) -> exp(0) = 1
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),  # exp(-5) -> exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),  # exp(-6) -> exp(-1)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),  # exp(-1) -> exp(3)
    'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear' alias
    'seed': 42
}

# Minimize the objective over the space.
# NOTE(review): no `rstate` is passed to fmin, so the sampled trials may differ between
# runs; consider seeding it (the expected type depends on the hyperopt version).
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,  # Tree of Parzen Estimators (TPE)
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:14.20420                          
[1]	validation-rmse:10.42119                          
[2]	validation-rmse:8.50697                           
[3]	validation-rmse:7.59187                           
[4]	validation-rmse:7.15661                           
[5]	validation-rmse:6.94493                           
[6]	validation-rmse:6.83501                           
[7]	validation-rmse:6.77409                           
[8]	validation-rmse:6.73985                           
[9]	validation-rmse:6.71633                           
[10]	validation-rmse:6.70171                          
[11]	validation-rmse:6.69429                          
[12]	validation-rmse:6.69038                          
[13]	validation-rmse:6.68630                          
[14]	validation-rmse:6.68304                          
[15]	validation-rmse:6.68111                          
[16]	validation-rmse:6.67910                          
[17]	validation-rmse:6.67825                          
[18]	valid

In [13]:
# Alternatively, MLflow can log XGBoost runs automatically -> remove `disable=True` to enable it.
mlflow.xgboost.autolog(disable=True)  # Autologging is switched off here.

In [14]:
# Train a model with the best hyperparameters found by the search and log it to MLflow
with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # Best hyperparameters, copied from the MLflow UI
    best_params = {
        'learning_rate': 0.44615334151561087,
        'max_depth': 16,
        'min_child_weight': 1.9389972491286867,
        'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear' alias
        'reg_alpha': 0.17546962748313016,
        'reg_lambda': 0.31620036723955675,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # Take the square root of the MSE instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Pickle the fitted DictVectorizer so the same feature encoding can be reused later.
    # NOTE(review): this assumes the ../models directory already exists -- confirm.
    with open("../models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    # Attach the preprocessor and the trained booster to this run
    mlflow.log_artifact(local_path="../models/preprocessor.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(xgb_model=booster, artifact_path="models_mlflow")


[0]	validation-rmse:13.46047
[1]	validation-rmse:9.65666
[2]	validation-rmse:7.93326
[3]	validation-rmse:7.19287
[4]	validation-rmse:6.87076
[5]	validation-rmse:6.72391
[6]	validation-rmse:6.65195
[7]	validation-rmse:6.61213
[8]	validation-rmse:6.59527
[9]	validation-rmse:6.58200
[10]	validation-rmse:6.57452
[11]	validation-rmse:6.56339
[12]	validation-rmse:6.55855
[13]	validation-rmse:6.55374
[14]	validation-rmse:6.55094
[15]	validation-rmse:6.54836
[16]	validation-rmse:6.54405
[17]	validation-rmse:6.54134
[18]	validation-rmse:6.53810
[19]	validation-rmse:6.53570
[20]	validation-rmse:6.53346
[21]	validation-rmse:6.53066
[22]	validation-rmse:6.52778
[23]	validation-rmse:6.52590
[24]	validation-rmse:6.52187
[25]	validation-rmse:6.51838
[26]	validation-rmse:6.51329
[27]	validation-rmse:6.51057
[28]	validation-rmse:6.51050
[29]	validation-rmse:6.50851
[30]	validation-rmse:6.50572
[31]	validation-rmse:6.50412
[32]	validation-rmse:6.50258
[33]	validation-rmse:6.50013
[34]	validation-rmse:6.



In [21]:
# Reload the logged model to verify it.
# NOTE(review): the run ID in this URI is hard-coded; it has to be replaced with the ID of
# a run that exists in the tracking store being used.
logged_model = 'runs:/bf06fdbba89040069a1a314b58407378/models_mlflow'

# The same artifact loaded through two flavours: generic PyFuncModel and native xgboost model
loaded_model = mlflow.pyfunc.load_model(logged_model)
xgboost_model = mlflow.xgboost.load_model(logged_model)
display(loaded_model, xgboost_model)

# Predict on the validation DMatrix with the native model and show the first ten predictions
y_preds_xgboost = xgboost_model.predict(valid)
y_preds_xgboost[:10]



mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: bf06fdbba89040069a1a314b58407378



<xgboost.core.Booster at 0x1f1cd166400>

array([14.939469,  7.21468 , 13.734897, 24.3241  ,  8.959513, 17.16547 ,
       11.413286,  8.66366 ,  9.011423, 19.73803 ], dtype=float32)