In [1]:
import numpy as np
import pandas as pd

from helpers.rmse import rmse, rmse_scorer

from sklearn.model_selection import ShuffleSplit, train_test_split, RandomizedSearchCV

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
# fix seed
SEED = 123
np.random.seed(SEED)

In [5]:
# Load preprocess train and test data
train = pd.read_csv('data/train_abt.csv')
test = pd.read_csv('data/test_abt.csv')

In [6]:
# Verify and check train data
train.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,log_trip_duration,haversine_distance,manhattan_distance,euclidean_distance,vendor_id_1,vendor_id_2,...,dropoff_cluster_90,dropoff_cluster_91,dropoff_cluster_92,dropoff_cluster_93,dropoff_cluster_94,dropoff_cluster_95,dropoff_cluster_96,dropoff_cluster_97,dropoff_cluster_98,dropoff_cluster_99
0,-73.982155,40.767937,-73.96463,40.765602,6.122493,1.498521,0.019859,0.01768,0,1,...,0,0,0,0,0,0,0,0,0,0
1,-73.980415,40.738564,-73.999481,40.731152,6.498282,1.805507,0.026478,0.020456,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-73.979027,40.763939,-74.005333,40.710087,7.661527,6.385098,0.080158,0.059934,0,1,...,0,0,0,0,0,0,0,0,0,0
3,-74.01004,40.719971,-74.012268,40.706718,6.063785,1.485498,0.01548,0.013438,0,1,...,0,0,0,0,0,0,0,0,1,0
4,-73.973053,40.793209,-73.972923,40.78252,6.077642,1.188588,0.010818,0.01069,0,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Verify and check train data
test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,haversine_distance,manhattan_distance,euclidean_distance,vendor_id_1,vendor_id_2,passenger_count_0,...,dropoff_cluster_90,dropoff_cluster_91,dropoff_cluster_92,dropoff_cluster_93,dropoff_cluster_94,dropoff_cluster_95,dropoff_cluster_96,dropoff_cluster_97,dropoff_cluster_98,dropoff_cluster_99
0,-73.988129,40.732029,-73.990173,40.75668,2.746426,0.026695,0.024735,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-73.964203,40.679993,-73.959808,40.655403,2.759239,0.028984,0.024979,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-73.997437,40.737583,-73.98616,40.729523,1.306155,0.019337,0.013861,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-73.95607,40.7719,-73.986427,40.730469,5.269088,0.071789,0.051363,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,-73.970215,40.761475,-73.96151,40.75589,0.960842,0.01429,0.010343,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X = train.drop('log_trip_duration', axis=1).values
y = train.log_trip_duration.values

X_test = test.values

## Linear Regression Benchmark
Now we are ready to train our models. However, we have to perform and create our milestone to be as a goal to beat first, Benchmark.

In [9]:
# Split training set for benchmark purpose only
X_train_benchmark, X_valid_benchmark, y_train_benchmark, y_valid_benchmark = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [8]:
# Initiate Linear Regression Instance and train model
reg = LinearRegression()
reg.fit(X_train_benchmark, y_train_benchmark)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# Local validation score 
valid_pred = reg.predict(X_valid_benchmark)
# rmse in this case is the same as rmsle because we did log transformation for trip duration already
rmse(y_valid_benchmark, valid_pred)

0.6029172150450034

In [10]:
# Test set score: submit to kaggle leaderboard
test_pred = reg.predict(X_test)

sub = pd.DataFrame()    
sub['id'] = pd.read_csv('data/test.csv').id
sub['trip_duration'] = np.expm1(test_pred)

sub.to_csv('sub/benchmark_sub.csv', index=False)

RMSLE benchmark on validation set is 0.6029172150450034 <br>
RMSLE benchmark on public leaderboard (30% of test set) is 0.60910 <br>
RMSLE benchmark on private leaderboard (70% of test set) is 0.72923

In [11]:
# Delete all unused variables 
del X_train_benchmark, X_valid_benchmark, y_train_benchmark, y_valid_benchmark, valid_pred, test_pred, sub

## Pick the best model
After setting up the benchmark, the next process is to train, evaluate and pick the best performing model in this data set. Our candidates are Ridge Regression, Random Forest, and Gradient Boosing tree.

In [12]:
# Create Instances and store in dictionary so that it is easy to loop later
models = {
    'xgb': xgb.XGBRegressor(tree_method='hist', n_jobs=-1, random_state=SEED),
    'ridge': Ridge(random_state=SEED),
    'rf': RandomForestRegressor(n_jobs=-1, random_state=SEED)
}

ridge_hyperparameters = {
    'alpha': [0.01, 0.1, 0.5, 1, 5]
}

rf_hyperparameters = {
    'n_estimators' : [200, 500, 1000],
    'max_features': ['auto'],
    'max_depth': [5, 7, 10]
}

xgb_hyperparameters = {
    'n_estimators': [500, 1000, 2000],
    'learning_rate' : [0.01, 0.1],
    'max_depth': [5, 7, 9],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Store hyperparameters
hyperparameters = {
    'ridge' : ridge_hyperparameters,
    'rf' : rf_hyperparameters,
    'xgb' : xgb_hyperparameters
}

In [13]:
# Train each model with 3 random combinations of hyperparameters
fitted_models = {}
shuffle = ShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)

for name, reg in models.items():
    model = RandomizedSearchCV(reg , hyperparameters[name], 
                               n_iter=3,
                               cv=shuffle, 
                               scoring=rmse_scorer,
                               verbose=5,
                               n_jobs=1)
    
    model.fit(X , y)
    # Store model in dictionary
    fitted_models[name] = model
    
    print(name, 'has done')

Fitting 1 folds for each of 3 candidates, totalling 3 fits
[CV] subsample=0.8, n_estimators=1000, max_depth=5, learning_rate=0.1, colsample_bytree=0.9 
[13:04:55] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[CV]  subsample=0.8, n_estimators=1000, max_depth=5, learning_rate=0.1, colsample_bytree=0.9, score=-0.40830638788893603, total= 4.9min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.2min remaining:    0.0s


[CV] subsample=0.7, n_estimators=500, max_depth=7, learning_rate=0.1, colsample_bytree=0.9 
[13:10:11] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[CV]  subsample=0.7, n_estimators=500, max_depth=7, learning_rate=0.1, colsample_bytree=0.9, score=-0.4067131487628946, total= 3.0min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.3min remaining:    0.0s


[CV] subsample=0.8, n_estimators=500, max_depth=9, learning_rate=0.01, colsample_bytree=0.9 
[13:13:18] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[CV]  subsample=0.8, n_estimators=500, max_depth=9, learning_rate=0.01, colsample_bytree=0.9, score=-0.4347221765148964, total= 5.1min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 13.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 13.7min finished


[13:18:37] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
xgb has done
Fitting 1 folds for each of 3 candidates, totalling 3 fits
[CV] alpha=5 .........................................................
[CV] ............... alpha=5, score=-0.6022829963174543, total=  25.6s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.5s remaining:    0.0s


[CV] ............. alpha=0.5, score=-0.6028501922562434, total=  10.8s
[CV] alpha=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   38.3s remaining:    0.0s


[CV] ............. alpha=0.1, score=-0.6029038342545151, total=  10.3s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   49.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   49.5s finished


ridge has done
Fitting 1 folds for each of 3 candidates, totalling 3 fits
[CV] n_estimators=200, max_features=auto, max_depth=7 ................
[CV]  n_estimators=200, max_features=auto, max_depth=7, score=-0.47716018767823215, total=29.4min
[CV] n_estimators=500, max_features=auto, max_depth=10 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 29.6min remaining:    0.0s


[CV]  n_estimators=500, max_features=auto, max_depth=10, score=-0.4622597021424335, total=98.6min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 128.5min remaining:    0.0s


[CV] n_estimators=500, max_features=auto, max_depth=7 ................
[CV]  n_estimators=500, max_features=auto, max_depth=7, score=-0.477107780732511, total=73.4min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 202.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 202.1min finished


rf has done


In [21]:
# Display rmsle on all validation sets
for name , model in fitted_models.items():
    print(name , model.cv_results_['mean_test_score'] )

xgb [-0.40830639 -0.40671315 -0.43472218]
ridge [-0.602283   -0.60285019 -0.60290383]
rf [-0.47716019 -0.4622597  -0.47710778]


In [19]:
# Print out the best score for each algorithm
for name , model in fitted_models.items():
    print(name , model.best_score_ )

xgb -0.4067131487628946
ridge -0.6022829963174543
rf -0.46225970214243345


According to the score above, Ridge regression, regularized linear regression, has the same performance to our benchmark; it is not a good model to beat benchmark in this situation. However, with tree models, they both have performed better than benchmark model but Gradient boosting tree is doing a lot better comparing to Random Forrest. Thus, we are going to use gradient boosting tree (XGBoost) for the rest of the process. The next part is to find the optimal hyperparameters for xgboost which can be found on searching hyperparameters notebook in the same folder.

In [23]:
# Save test prediciton to evaluate on the kaggle leaderboard
for name , model in fitted_models.items():
    pred = model.predict(X_test)
    
    sub = pd.DataFrame()    
    sub['id'] = pd.read_csv('data/test.csv').id
    sub['trip_duration'] = np.expm1(pred)
    sub.to_csv('sub/' + name + '_sub.csv', index=False)