# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [16]:
import numpy as np 
import pandas as pd 
import os
import math

In [3]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from datetime import datetime as dt
import pytz

In [4]:
PATH = 'data/nyc-taxi/'

# Helper functions

In [5]:
def split_df(df, y_fld, subset=None, random_state=42):
    
    if subset: df = df.sample(n=subset, random_state=random_state)

    y = df[y_fld].values
    df.drop(y_fld, axis=1, inplace=True)
    
    return df, y

# Load data

In [6]:
seed = 42 # random seed for replication

In [7]:
%%time
similar_df = pd.read_feather("tmp/similar_mask")
train_df = pd.read_feather('tmp/taxi-train-v6-chkpt5')
test_df = pd.read_feather('tmp/taxi-test-v6-chkpt5')
test_df_raw = pd.read_csv(f'{PATH}test.csv', usecols=['key'])

Wall time: 2.98 s


In [8]:
similar_df.shape, train_df.shape, test_df.shape

((54053965, 1), (54053965, 24), (9914, 23))

In [9]:
%%time
val_df = train_df[similar_df.similar.values==1]
train_df = train_df[similar_df.similar.values==0]

Wall time: 4.13 s


In [10]:
%%time
subsample = 2_000_000

X_train, y_train = split_df(train_df, 'fare_amount', subset=subsample, random_state=seed)
# X_train, y_train = split_df(train_df, 'fare_amount')

X_val, y_val = split_df(val_df, 'fare_amount')

Wall time: 4.36 s


In [11]:
X_train.shape, y_train.shape, test_df.shape, X_val.shape, y_val.shape

((2000000, 23), (2000000,), (9914, 23), (94411, 23), (94411,))

In [12]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):  
    
    # in scikit-learn, random forest regressor,score is the r-squared value
    train_scr = m.score(X_train, y_train)
    val_scr = m.score(X_val, y_val)
    
    # RMSE is what we are more interested in
    train_rmse = rmse(m.predict(X_train), y_train)
    val_rmse = rmse(m.predict(X_val), y_val)
    if hasattr(m, 'oob_score_'): oob_scr = m.oob_score_
        
    res = f"Training-Score:   {train_scr:.5f}\tTraining-rmse:   {train_rmse:.5f}\n"
    res += f"Validation-score: {val_scr:.5f}\tValidation-rmse: {val_rmse:.5f}"   
    if hasattr(m, 'oob_score_'): res += f"\nOOB-score:\t  {oob_scr:.5f}"
        
    print(res)

In [18]:
m = RandomForestRegressor(n_estimators=8, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)

In [19]:
%%time
m.fit(X_train, y_train)

Wall time: 1min 4s


  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [20]:
%%time
print_score(m)

Training-Score:   0.94091	Training-rmse:   2.33090
Validation-score: 0.86001	Validation-rmse: 3.99697
OOB-score:	  0.77414
Wall time: 9.99 s
