https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/

# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

from IPython.display import display

In [3]:
import xgboost as xgb

In [4]:
PATH = 'data/nyc-taxi/'

# Helper Functions

In [16]:
def proc_df(df, y_fld, subset=None, seed=42):
    
    if subset: df = df.sample(n=subset,random_state=seed)

    y = df[y_fld].values
    df.drop(y_fld, axis=1, inplace=True)
    
    return df, y

In [None]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_scores(model):
    pred_train = model.predict(DM_train)
    rmse_train = rmse(pred_train, y_train)
    
    pred_val = model.predict(DM_val)
    rmse_val = rmse(pred_val, y_val)
    
    print("\nTraining score")
    print(f'RMSE for training set: {rmse_train:.5f}')
    print(f'RMSE for validation set: {rmse_val:.5f}')

# Load clean dataframe

In [5]:
%%time
train_df = pd.read_feather('tmp/taxi-train-chkpt4')

Wall time: 8.94 s


In [6]:
test_df = pd.read_feather('tmp/taxi-test-chkpt4')

In [7]:
all_val_index_df = pd.read_csv('tmp/validation_list1.csv', header=None)

# Split Training-Validation Data

In [8]:
unique_val_list = list(set(all_val_index_df[0].tolist())) # get unique values

In [9]:
len(all_val_index_df), len(unique_val_list),

(9914, 9154)

In [10]:
n_valid = len(unique_val_list)
n_valid

9154

In [11]:
val_df = train_df.iloc[unique_val_list].copy()

In [12]:
%%time
pruned_train_df = train_df.drop(unique_val_list, axis='rows')

Wall time: 9.4 s


In [13]:
assert len(pruned_train_df) + len(val_df) == len(train_df)

In [14]:
pruned_train_df.shape, val_df.shape

((54051828, 17), (9154, 17))

In [15]:
del train_df

In [17]:
%%time
subsample = 25_000_000

X_train, y_train = proc_df(pruned_train_df, 'fare_amount', subsample)
X_val, y_val = proc_df(val_df,'fare_amount')

Wall time: 12.8 s


In [18]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((25000000, 16), (9154, 16), (25000000,), (9154,))

# XGBoost

In [19]:
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_val = xgb.DMatrix(data=X_val, label=y_val)
DM_test = xgb.DMatrix(data=test_df)

In [20]:
evals_result = {}
watchlist = [(DM_train, "training"), (DM_val, "validation")]

In [24]:
params_native = {
    'objective': 'reg:linear', 
    'booster':'gbtree',        
    'silent': 1,               
    'eta': 0.01,                 
    'gamma': 0,                 
    'max-depth': 8,             
    'min_child_weight': 1,      
    'max_delta_step': 0,        
    'subsample': 0.8,             
    'colsample_bytree': 0.6,      
    'colsample_bylevel': 0.7,     
    'lambda': 0.9,                
    'alpha': 0,                 
    'scale_pos_weight': 1,      
    'base_score': 0.5,          
    'eval_metric':'rmse',      
    'seed': 42                  
}

## Train

In [None]:
%%time
xgb_native = xgb.train(params=params_native, 
                            dtrain=DM_train,
                            num_boost_round=100_000,
                            evals=watchlist,
                            early_stopping_rounds=5,
                            evals_result=evals_result,
                            verbose_eval=10)

[0]	training-rmse:14.3367	validation-rmse:14.5123
Multiple eval metrics have been passed: 'validation-rmse' will be used for early stopping.

Will train until validation-rmse hasn't improved in 5 rounds.
[10]	training-rmse:13.1167	validation-rmse:13.2918
[20]	training-rmse:12.0137	validation-rmse:12.1921
[30]	training-rmse:11.0347	validation-rmse:11.2172
[40]	training-rmse:10.1732	validation-rmse:10.3609
[50]	training-rmse:9.39052	validation-rmse:9.58479
[60]	training-rmse:8.72103	validation-rmse:8.92153
[70]	training-rmse:8.10529	validation-rmse:8.31506
[80]	training-rmse:7.56673	validation-rmse:7.78633
[90]	training-rmse:7.08601	validation-rmse:7.31644
[100]	training-rmse:6.66247	validation-rmse:6.90711
[110]	training-rmse:6.28604	validation-rmse:6.5424
[120]	training-rmse:5.95569	validation-rmse:6.2265
[130]	training-rmse:5.67512	validation-rmse:5.95802
[140]	training-rmse:5.42643	validation-rmse:5.72158
[150]	training-rmse:5.21454	validation-rmse:5.52124


In [23]:
print("Best iteration\t{}".format(xgb_native.best_iteration))
print("Best tree limit\t{}".format(xgb_native.best_ntree_limit))
print("Best RMSE score\t{}".format(xgb_native.best_score))

print_scores(xgb_native)

NameError: name 'xgb_native' is not defined

# Predict on test data

In [None]:
# We need 'key' for the test submission file
test_df_raw = pd.read_csv(f'{PATH}test.csv', usecols=['key'])

In [None]:
%%time
test_y_predictions = xgb_native_full_data.predict(DM_test)

# Submit

In [None]:
from datetime import datetime as dt

In [None]:
outdir = 'submissions'
os.makedirs(outdir, exist_ok=True)
    
cur_dt = dt.now().strftime("%Y%m%d_%H%M%S")
desc = 'v06_XGB_' # description of submission for reference
subm_fn =  desc + cur_dt + '.csv'
subm_path = f'{outdir}/{subm_fn}'

# Write the predictions to a CSV file which we can submit to the competition.
submission = pd.DataFrame(
    {'key': test_df_raw.key, 'fare_amount': test_y_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv(subm_path, index = False)

subm_path

Kaggle Score:

# Save model

In [None]:
del pruned_train_df

In [None]:
model_fn_save = 'models/v05_RF_20180729_121842.pkl'

In [None]:
%time pickle.dump(m, open(model_fn_save, 'wb'))

# Load Model

In [None]:
model_fn_open = 'models/<>'

In [None]:
%time m = pickle.load(open(model_fn_open, 'rb'))

# Feature Importance

In [None]:
fi = rf_feat_importance(m, X_train); 
fi

In [None]:
pd.DataFrame({'cols':X_train.columns, 'imp':m.feature_importances_}).sort_values(by='imp', ascending=False)

In [None]:
def plot_fi(fi): 
    return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)

In [None]:
plot_fi(fi)