# Model selection
- Split the data into training and validation sets
- Explore candidate models
- Select a promising model and tune its hyperparameters
- Train the final model and save it to the `models` directory

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Project root: the absolute path of the parent of the current working directory.
PROJ_ROOT = os.path.abspath(os.pardir)

# Make the packages under `src` importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
SRC_DIR = os.path.join(PROJ_ROOT, 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Read processed data

In [4]:
from data.read import read_processed_data
# Load the processed training set (feature columns plus the log_trip_duration target).
df = read_processed_data('../data/processed/train.pkl')

In [5]:
# Check column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1329780 entries, 0 to 1329779
Data columns (total 9 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   trip_distance                1329780 non-null  float64
 1   pickup_datetime_date         1329780 non-null  float64
 2   pickup_datetime_day_of_week  1329780 non-null  float64
 3   pickup_datetime_hour         1329780 non-null  float64
 4   pickup_latitude              1329780 non-null  float64
 5   pickup_longitude             1329780 non-null  float64
 6   dropoff_latitude             1329780 non-null  float64
 7   dropoff_longitude            1329780 non-null  float64
 8   log_trip_duration            1329780 non-null  float64
dtypes: float64(9)
memory usage: 91.3 MB


# Train/validation split

In [7]:
# Separate the regression target from the feature columns.
y = df['log_trip_duration']
X = df.drop(columns=['log_trip_duration'])

In [35]:
# Alternative (not used): pass plain NumPy arrays so the estimator sees no column names
# np.array(X).shape
# y.ravel()

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
)

In [15]:
# Sanity check: the feature and target splits should have matching row counts.
print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)

(1196802, 8) (132978, 8)
(1196802,) (132978,)


# Explore models
- Evaluate each model by root mean squared error (RMSE) on `log_trip_duration`

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
def evaluate_regressor_skl(regressor, X_train, y_train, X_val, y_val):
    '''
    Fit a scikit-learn style regressor and report its RMSE.

    The regressor is fitted on (X_train, y_train), then used to predict both
    the training and the validation features. The root mean squared error of
    each set of predictions is printed.

    Returns a tuple: (fitted regressor, training predictions, validation predictions).
    '''
    def _rmse(actual, predicted):
        # Root mean squared error between observed and predicted targets.
        return np.sqrt(mean_squared_error(actual, predicted))

    regressor.fit(X_train, y_train)

    # Score on the data the model was fitted on.
    train_predictions = regressor.predict(X_train)
    train_rmse = _rmse(y_train, train_predictions)

    # Score on the held-out validation data.
    val_predictions = regressor.predict(X_val)
    val_rmse = _rmse(y_val, val_predictions)

    print(f'Predicting with {regressor}')
    for heading, score in (('Training error:', train_rmse),
                           ('Validation error:', val_rmse)):
        print(heading)
        print(f'Root mean squared error = {score}')

    return regressor, train_predictions, val_predictions

In [18]:
# try HistGradientBoostingRegressor (histogram-based gradient boosting)
from sklearn.ensemble import HistGradientBoostingRegressor

# Baseline run with the estimator's default hyperparameters (only the seed is fixed).
hgb_results = evaluate_regressor_skl(
    regressor=HistGradientBoostingRegressor(random_state=42),
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Predicting with HistGradientBoostingRegressor(random_state=42)
Training error:
Root mean squared error = 0.38732930810657246
Validation error:
Root mean squared error = 0.3946265400959257


# Cross validation

In [19]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated RMSE on the full training frame. The RMSE scorer is used
# directly (the same scorer as the grid search further down) instead of taking the
# square root of the negated MSE by hand. scikit-learn reports error metrics negated
# so that greater is better, hence the leading minus sign.
cv_scores_hgb = -cross_val_score(
    HistGradientBoostingRegressor(random_state=42),
    X, y,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

In [20]:
# Per-fold validation RMSE, then the average across folds.
print(cv_scores_hgb)
print(f'Mean = {np.mean(cv_scores_hgb)}')

[0.3913187  0.39114067 0.39033772 0.39144576 0.38939521]
Mean = 0.39072761412687507


# Hyperparameter tuning

In [21]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; 31 is the estimator's default max_leaf_nodes.
param_grid = {
    'max_leaf_nodes': [31, 150, 300],
}

# Exhaustive 3-fold search scored by (negated) RMSE; refit=True retrains the best
# configuration on all of X, y once the search is finished.
regressor = GridSearchCV(
    estimator=HistGradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    refit=True,
)
# fit() returns the fitted search object, so `search` and `regressor` are the same object.
search = regressor.fit(X, y)

In [22]:
# Larger max_leaf_nodes gave better scores: mean_test_score (negated RMSE, higher is
# better) improves from about -0.390 at 31 leaves to about -0.366 at 300.
search.cv_results_

{'mean_fit_time': array([19.53243907, 24.09316913, 34.82859087]),
 'std_fit_time': array([1.43024878, 0.13117922, 3.04362348]),
 'mean_score_time': array([1.86253564, 1.72625391, 2.16300265]),
 'std_score_time': array([0.31782028, 0.04289972, 0.36191248]),
 'param_max_leaf_nodes': masked_array(data=[31, 150, 300],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_leaf_nodes': 31},
  {'max_leaf_nodes': 150},
  {'max_leaf_nodes': 300}],
 'split0_test_score': array([-0.39124732, -0.37303398, -0.36574581]),
 'split1_test_score': array([-0.38992782, -0.37205564, -0.36575616]),
 'split2_test_score': array([-0.39015086, -0.37197654, -0.36521196]),
 'mean_test_score': array([-0.390442  , -0.37235539, -0.36557131]),
 'std_test_score': array([0.00057668, 0.00048092, 0.00025414]),
 'rank_test_score': array([3, 2, 1])}

In [23]:
# save model to model directory
from joblib import dump

# Persist the refit best model. The target directory is created first so the cell
# also works when `../models` does not exist yet (e.g. on a fresh checkout).
model_path = '../models/best_estimator.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(search.best_estimator_, model_path)

['../models/best_estimator.joblib']

In [39]:
# test we can load and use for making predictions
from joblib import load
from data.read import read_processed_test_data
from models.predict import make_predictions

# Reload the estimator from the same path it was saved to above.
loaded_model = load('../models/best_estimator.joblib')

# Processed test set, read through the project helper.
test = read_processed_test_data('../data/processed/test.pkl')

# NOTE(review): the returned values are in the hundreds to thousands, so make_predictions
# presumably maps the model's log-duration output back to a duration — confirm in models.predict.
make_predictions(loaded_model, test)

array([ 783.83034448,  639.28668273,  400.50694306, ..., 1463.95510792,
       1573.39523268, 1145.26212871])

In [40]:
# NOTE(review): the first rows here (826.73, 633.54, 430.69) do not match the array returned
# by make_predictions above (783.83, 639.29, 400.51), so this file may come from an earlier
# run — confirm which step writes it and regenerate it if stale.
pd.read_pickle('../models/predictions.pkl')

Unnamed: 0,trip_duration
0,826.733712
1,633.535294
2,430.694029
3,918.680135
4,350.398802
...,...
625129,292.325805
625130,1239.288231
625131,1512.261131
625132,1585.017629
