https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/

# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
import math

from IPython.display import display

In [3]:
import xgboost as xgb

In [4]:
PATH = 'data/nyc-taxi/'

# Helper Functions

In [5]:
def proc_df(df, y_fld, subset=None, seed=42):
    
    if subset: df = df.sample(n=subset,random_state=seed)

    y = df[y_fld].values
    df.drop(y_fld, axis=1, inplace=True)
    
    return df, y

In [6]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_scores(model):
    pred_train = model.predict(DM_train)
    rmse_train = rmse(pred_train, y_train)
    
    pred_val = model.predict(DM_val)
    rmse_val = rmse(pred_val, y_val)
    
    print("\nTraining score")
    print(f'RMSE for training set: {rmse_train:.5f}')
    print(f'RMSE for validation set: {rmse_val:.5f}')

# Load clean dataframe

In [7]:
%%time
train_df = pd.read_feather('tmp/v2/taxi-v2-train')
val_df = pd.read_feather('tmp/v2/taxi-v2-val')
test_df = pd.read_feather('tmp/v2/taxi-test-v2')

Wall time: 238 ms


In [8]:
%%time
X_train, y_train = proc_df(train_df,'fare_amount') 

Wall time: 66.6 ms


In [9]:
X_val, y_val = proc_df(val_df,'fare_amount')

In [10]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((3445082, 16), (9674, 16), (3445082,), (9674,))

# XGBoost

In [11]:
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_val = xgb.DMatrix(data=X_val, label=y_val)
DM_test = xgb.DMatrix(data=test_df)

In [12]:
evals_result = {}
watchlist = [(DM_train, "training"), (DM_val, "validation")]

In [41]:
params_native = {
    'objective': 'reg:linear', 
    'booster':'gbtree',        
    'silent': 1,               
    'eta': 0.1,                 
    'gamma': 0,                 
    'max-depth': 4,             
    'min_child_weight': 1,      
    'max_delta_step': 0,        
    'subsample': 0.8,             
    'colsample_bytree': 0.6,      
    'colsample_bylevel': 0.6,     
    'lambda': 1,                
    'alpha': 0,                 
    'scale_pos_weight': 1,      
    'base_score': 0.5,          
    'eval_metric':'rmse',      
    'seed': 42                  
}

## Train

In [42]:
%%time
xgb_native = xgb.train(params=params_native, 
                            dtrain=DM_train,
                            num_boost_round=100_000,
                            evals=watchlist,
                            early_stopping_rounds=10,
                            evals_result=evals_result,
                            verbose_eval=1)

[0]	training-rmse:12.9663	validation-rmse:12.6968
Multiple eval metrics have been passed: 'validation-rmse' will be used for early stopping.

Will train until validation-rmse hasn't improved in 10 rounds.
[1]	training-rmse:11.8579	validation-rmse:11.5766
[2]	training-rmse:10.8208	validation-rmse:10.5315
[3]	training-rmse:9.90417	validation-rmse:9.603
[4]	training-rmse:9.09686	validation-rmse:8.78586
[5]	training-rmse:8.36481	validation-rmse:8.04846
[6]	training-rmse:7.72703	validation-rmse:7.40451
[7]	training-rmse:7.16532	validation-rmse:6.83983
[8]	training-rmse:6.68426	validation-rmse:6.34758
[9]	training-rmse:6.25305	validation-rmse:5.91083
[10]	training-rmse:5.89879	validation-rmse:5.55307
[11]	training-rmse:5.59757	validation-rmse:5.2489
[12]	training-rmse:5.31213	validation-rmse:4.96302
[13]	training-rmse:5.0684	validation-rmse:4.72071
[14]	training-rmse:4.85637	validation-rmse:4.5113
[15]	training-rmse:4.67494	validation-rmse:4.32912
[16]	training-rmse:4.52077	validation-rmse:4

[158]	training-rmse:3.37684	validation-rmse:3.1886
[159]	training-rmse:3.37563	validation-rmse:3.18804
[160]	training-rmse:3.37513	validation-rmse:3.18775
[161]	training-rmse:3.37315	validation-rmse:3.18561
[162]	training-rmse:3.37214	validation-rmse:3.18512
[163]	training-rmse:3.37148	validation-rmse:3.18478
[164]	training-rmse:3.37068	validation-rmse:3.18466
[165]	training-rmse:3.37029	validation-rmse:3.18469
[166]	training-rmse:3.36988	validation-rmse:3.18437
[167]	training-rmse:3.36818	validation-rmse:3.18314
[168]	training-rmse:3.3675	validation-rmse:3.18321
[169]	training-rmse:3.36682	validation-rmse:3.18305
[170]	training-rmse:3.36577	validation-rmse:3.18288
[171]	training-rmse:3.36514	validation-rmse:3.18262
[172]	training-rmse:3.36447	validation-rmse:3.1824
[173]	training-rmse:3.36397	validation-rmse:3.18241
[174]	training-rmse:3.36298	validation-rmse:3.18248
[175]	training-rmse:3.36183	validation-rmse:3.18172
[176]	training-rmse:3.36116	validation-rmse:3.1814
[177]	training-r

[317]	training-rmse:3.2638	validation-rmse:3.12431
[318]	training-rmse:3.26356	validation-rmse:3.12435
[319]	training-rmse:3.26344	validation-rmse:3.1244
[320]	training-rmse:3.26335	validation-rmse:3.1244
[321]	training-rmse:3.26321	validation-rmse:3.1244
[322]	training-rmse:3.26284	validation-rmse:3.12423
[323]	training-rmse:3.26037	validation-rmse:3.12135
[324]	training-rmse:3.25963	validation-rmse:3.12127
[325]	training-rmse:3.25927	validation-rmse:3.12113
[326]	training-rmse:3.25888	validation-rmse:3.12115
[327]	training-rmse:3.25828	validation-rmse:3.12097
[328]	training-rmse:3.25811	validation-rmse:3.12109
[329]	training-rmse:3.25728	validation-rmse:3.12124
[330]	training-rmse:3.25688	validation-rmse:3.12096
[331]	training-rmse:3.25653	validation-rmse:3.12061
[332]	training-rmse:3.25623	validation-rmse:3.12059
[333]	training-rmse:3.25599	validation-rmse:3.12051
[334]	training-rmse:3.25568	validation-rmse:3.11987
[335]	training-rmse:3.2548	validation-rmse:3.11894
[336]	training-rm

In [43]:
print("Best iteration\t{}".format(xgb_native.best_iteration))
print("Best tree limit\t{}".format(xgb_native.best_ntree_limit))
print("Best RMSE score\t{}".format(xgb_native.best_score))

print_scores(xgb_native)

Best iteration	415
Best tree limit	416
Best RMSE score	3.105657

Training score
RMSE for training set: 3.21947
RMSE for validation set: 3.10600


# Predict on test data

In [44]:
# We only need 'key' for the test submission file
test_df_raw = pd.read_csv(f'{PATH}test.csv', usecols=['key'])

In [45]:
%%time
test_y_predictions = xgb_native.predict(DM_test)

Wall time: 30.7 ms


# Submit

In [46]:
from datetime import datetime as dt

In [47]:
outdir = 'submissions'
os.makedirs(outdir, exist_ok=True)
    
cur_dt = dt.now().strftime("%Y%m%d_%H%M%S")
desc = 'v06_XGB_' # description of submission for reference
subm_fn =  desc + cur_dt + '.csv'
subm_path = f'{outdir}/{subm_fn}'

# Write the predictions to a CSV file which we can submit to the competition.
submission = pd.DataFrame(
    {'key': test_df_raw.key, 'fare_amount': test_y_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv(subm_path, index = False)

subm_path

'submissions/v06_XGB_20180731_011309.csv'

Kaggle Score:

# Save model

In [48]:
import pickle

In [49]:
model_fn_save = 'models/xgb_test.pkl'

In [50]:
pickle.dump(xgb_native, open(model_fn_save, "wb"))

In [57]:
xgb_native.save_model(model_fn_save)

# Load Model

In [61]:
model_fn_open = 'models/xgb_test.pkl'

In [52]:
clf2 = pickle.load(open(model_fn_open, "rb"))

In [69]:
clf2 = xgb.Booster(params=params_native) 
clf2.load_model(model_fn_open) 

{'best_iteration': '415',
 'best_msg': '[415]\ttraining-rmse:3.224\tvalidation-rmse:3.10566',
 'best_score': '3.105657'}

# Compare

In [59]:
preds1 = xgb_native.predict(DM_train)

In [63]:
preds2 = clf2.predict(DM_train)

In [67]:
np.sum(preds1 - preds2)

1.0131798

# Results

| Validation | Leader Board | Specs |
| --- | --- | --- |
| 3.13952 | 3.14550 | v06_XGB_20180731_001818 |