# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import os
import math

In [3]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from datetime import datetime as dt
import pytz

In [4]:
PATH = 'data/nyc-taxi/'

# Helper functions

In [5]:
def split_df(df, y_fld, subset=None, random_state=42):
    
    if subset: df = df.sample(n=subset, random_state=random_state)

    y = df[y_fld].values
    df.drop(y_fld, axis=1, inplace=True)
    
    return df, y

# Load data

In [6]:
seed = 42 # random seed for replication

In [7]:
%%time
similar_df = pd.read_feather("tmp/similar_mask")
train_df = pd.read_feather('tmp/taxi-train-v6-chkpt5')
test_df = pd.read_feather('tmp/taxi-test-v6-chkpt5')
test_df_raw = pd.read_csv(f'{PATH}test.csv', usecols=['key'])

CPU times: user 1.93 s, sys: 2.54 s, total: 4.47 s
Wall time: 4.46 s


In [8]:
similar_df.shape, train_df.shape, test_df.shape

((54053965, 1), (54053965, 24), (9914, 23))

In [9]:
%%time
val_df = train_df[similar_df.similar.values==1]
train_df = train_df[similar_df.similar.values==0]

CPU times: user 3.69 s, sys: 3.11 s, total: 6.8 s
Wall time: 6.8 s


In [10]:
%%time
subsample = 100_000

X_train, y_train = split_df(train_df, 'fare_amount', subset=subsample, random_state=seed)
# X_train, y_train = split_df(train_df, 'fare_amount')

X_val, y_val = split_df(val_df, 'fare_amount')

CPU times: user 4.62 s, sys: 263 ms, total: 4.88 s
Wall time: 4.87 s


In [11]:
X_train.shape, y_train.shape, test_df.shape, X_val.shape, y_val.shape

((100000, 23), (100000,), (9914, 23), (99825, 23), (99825,))

In [12]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):  
    
    # in scikit-learn, random forest regressor,score is the r-squared value
    train_scr = m.score(X_train, y_train)
    val_scr = m.score(X_val, y_val)
    
    # RMSE is what we are more interested in
    train_rmse = rmse(m.predict(X_train), y_train)
    val_rmse = rmse(m.predict(X_val), y_val)
    if hasattr(m, 'oob_score_'): oob_scr = m.oob_score_
        
    res = f"Training-Score:   {train_scr:.5f}\tTraining-rmse:   {train_rmse:.5f}\n"
    res += f"Validation-score: {val_scr:.5f}\tValidation-rmse: {val_rmse:.5f}"   
    if hasattr(m, 'oob_score_'): res += f"\nOOB-score:\t  {oob_scr:.5f}"
        
    print(res)

In [19]:
m2 = RandomForestRegressor(n_estimators=160, min_samples_leaf=3, max_features='sqrt', n_jobs=-1, oob_score=True)

In [20]:
%%time
m.fit(X_val, y_val)

CPU times: user 5min 4s, sys: 378 ms, total: 5min 5s
Wall time: 22.9 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=160, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [None]:
%%time
# print_score(m)

In [15]:
test_y_predictions = m.predict(test_df)

In [16]:
from datetime import datetime as dt
import pytz

In [17]:
outdir = 'submissions'
os.makedirs(outdir, exist_ok=True)
tz = pytz.timezone('Asia/Singapore')
    
cur_dt = dt.now(tz).strftime("%Y%m%d_%H%M%S")
desc = 'v13_RF_' # description of submission for reference
subm_fn =  desc + cur_dt + '.csv'
subm_path = f'{outdir}/{subm_fn}'

# Write the predictions to a CSV file which we can submit to the competition.
submission = pd.DataFrame(
    {'key': test_df_raw.key, 'fare_amount': test_y_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv(subm_path, index = False)

subm_path

'submissions/v13_RF_20180806_230745.csv'

In [18]:
!kaggle competitions submit -c new-york-city-taxi-fare-prediction -f submissions/v13_RF_20180806_230745.csv -m "160 trees with maxfeat 0.5 train similar val data"

Successfully submitted to New York City Taxi Fare Prediction

# Save model

In [13]:
import pickle

In [14]:
model_fn_save = 'models/<>'

In [39]:
%time pickle.dump(m, open(model_fn_save, 'wb'))

# Load Model

In [None]:
model_fn_open = 'models/<>.pkl'

In [15]:
%time rf = pickle.load(open(model_fn_open, 'rb'))

Wall time: 5.53 s
