# Imports

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension and reload all imported modules before
# each cell runs, so edits to local .py files are picked up without a restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import os
import math

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [4]:
# Data directory for the NYC taxi dataset.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
PATH = 'data/nyc-taxi/'

# Helper functions

In [5]:
def split_df(df, y_fld, subset=None, random_state=42):
    """Split a DataFrame into a feature frame and a target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing the target column. It is never modified.
    y_fld : str
        Name of the target column.
    subset : int, optional
        If truthy, draw this many rows at random (without replacement)
        before splitting; otherwise every row is used.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the subsample is repeatable.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The frame without ``y_fld`` and the values of ``y_fld``.
    """
    if subset: df = df.sample(n=subset, random_state=random_state)

    y = df[y_fld].values
    # Non-inplace drop: with subset=None `df` is the caller's own object, and
    # an inplace drop would silently remove the target column from it, making
    # any later call on the same frame fail with a KeyError.
    X = df.drop(y_fld, axis=1)

    return X, y

In [6]:
def rmse(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    The element-wise difference must support ``** 2`` and ``.mean()``
    (e.g. NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_model_score(m):
    """Print score and RMSE of a fitted model on the train/validation split.

    This report was originally also named ``print_score`` and was silently
    shadowed by the cross-validation summary defined below it; it now has its
    own name so both remain callable.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_val`` and
    ``y_val`` and uses ``rmse`` from this notebook. ``m`` must provide
    ``score`` and ``predict``; if it exposes ``oob_score_`` that value is
    appended to the report.
    """
    train_scr = m.score(X_train, y_train)
    val_scr = m.score(X_val, y_val)

    train_rmse = rmse(m.predict(X_train), y_train)
    val_rmse = rmse(m.predict(X_val), y_val)

    res = f"Training-Score:   {train_scr:.5f}\tTraining-rmse:   {train_rmse:.5f}\n"
    res += f"Validation-score: {val_scr:.5f}\tValidation-rmse: {val_rmse:.5f}"
    # oob_score_ only exists on estimators that expose it after fitting.
    if hasattr(m, 'oob_score_'): res += f"\nOOB-score:\t  {m.oob_score_:.5f}"

    print(res)


def print_score(score):
    """Print per-fold scores together with their mean and standard deviation.

    Parameters
    ----------
    score : array-like with ``mean()`` and ``std()`` methods
        Per-fold scores, e.g. the RMSE array derived from ``cross_val_score``.
    """
    # Use the argument itself rather than a global named `scores`, so the
    # summary always describes the values that were actually passed in.
    print(f"All scores:{score}\nMean:{score.mean()}\nStandard Deviation:{score.std()}")

# Load data

In [17]:
# Shared seed passed to the row sampling and the train/validation split below.
seed = 42 # random seed for replication

In [36]:
%%time
# Load the training frame from a Feather checkpoint file.
train_df = pd.read_feather('tmp/taxi-train-v8-chkpt3NoDistance')

Wall time: 1.37 s


In [37]:
# (rows, columns) of the loaded training frame.
train_df.shape

(54057746, 13)

In [38]:
%%time
# Small sample used for the quick cross-validation runs.
subsample = 100_000

# Xs: features, ys: 'fare_amount' targets of the sampled rows.
Xs, ys = split_df(train_df, 'fare_amount', subset=subsample, random_state=seed)

Wall time: 3.25 s


In [39]:
%%time
# Larger sample used for the slower cross-validation runs.
# Rebinds `subsample`, so the value set for the small sample is overwritten.
subsample = 1_000_000

# X: features, y: 'fare_amount' targets of the sampled rows.
X, y = split_df(train_df, 'fare_amount', subset=subsample, random_state=seed)

Wall time: 3.2 s


In [40]:
# Hold out 25% of the larger sample as a validation set; seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=seed)

In [41]:
# Sanity check: feature/target shapes of the training and validation parts.
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((750000, 12), (750000,), (250000, 12), (250000,))

In [42]:
# Random-forest regressor shared by every cross-validation run below.
# random_state=seed pins the forest's internal randomness so repeated runs
# give the same scores, consistent with the seeded sampling and
# train/validation split above.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5,
                          n_jobs=-1, random_state=seed)

## No Distance (Worse) - Distance is important!

In [43]:
%%time
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.73207096 3.64971172 3.78855846 3.70434415 3.75709017]
Mean:3.726355092469748
Standard Deviation:0.047339757272736475
Wall time: 6min 15s


In [44]:
%%time
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.18556465 4.09533261 4.11378021 3.91492525 3.8575003 ]
Mean:4.033420603872139
Standard Deviation:0.12524190846900796
Wall time: 28.9 s


## No Price Flag - Slightly worse, meaning flag helps a bit

In [34]:
%%time
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.63846212 3.5470505  3.70158152 3.62338888 3.67458945]
Mean:3.6370144943106792
Standard Deviation:0.052653680121059726
Wall time: 6min 35s


In [35]:
%%time
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.15191537 3.838425   3.89519449 3.82329984 3.74014949]
Mean:3.889796835750949
Standard Deviation:0.14013600889515976
Wall time: 29.8 s


## Simple Distance with Dropped Features - Worse

In [25]:
%%time
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.65201921 3.57102133 3.72271901 3.65535705 3.69958211]
Mean:3.6601397429458418
Standard Deviation:0.05196686419026182
Wall time: 5min 59s


In [26]:
%%time
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.11515771 3.82088883 3.92610813 3.82803286 3.73762595]
Mean:3.8855626929784477
Standard Deviation:0.1294105692233161
Wall time: 27.6 s


## Adaptive Distance Results - Almost the same as baseline

In [15]:
%%time
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.63706619 3.54593837 3.70929003 3.61300545 3.66439731]
Mean:3.6339394689207567
Standard Deviation:0.054401452590344075
Wall time: 6min 40s


In [16]:
%%time
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.10145912 3.85218782 3.89292442 3.7888432  3.74667721]
Mean:3.876418353477573
Standard Deviation:0.12329240623805218
Wall time: 30.9 s


## Straight Distance Results - Baseline for comparison

In [24]:
%%time
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.62629522 3.54219699 3.69463875 3.62451808 3.66437177]
Mean:3.630404162840859
Standard Deviation:0.051209209746410786
Wall time: 6min 34s


In [25]:
%%time
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.10352632 3.86554431 3.88060332 3.79846667 3.74304178]
Mean:3.8782364803056653
Standard Deviation:0.12293334306792435
Wall time: 30.5 s


## Manhattan Distance Results - Almost the same as baseline

In [15]:
%%time
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.63766713 3.55590959 3.68609126 3.61896021 3.65826552]
Mean:3.631378740620408
Standard Deviation:0.043830845785230284
Wall time: 6min 46s


In [16]:
%%time
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.11236619 3.87212126 3.89425008 3.79124005 3.71353413]
Mean:3.876702340338427
Standard Deviation:0.13401047052087886
Wall time: 29.1 s


# Save model

In [None]:
import pickle

In [None]:
# Destination for the pickled model; replace <> with the model name.
# Uses the same '.pkl' suffix as the load path so that a name filled in for
# saving can be loaded back without renaming the file.
model_fn_save = 'models/<>.pkl'

In [None]:
%time pickle.dump(m, open(model_fn_save, 'wb'))

# Load Model

In [None]:
# Path of the pickled model to load; replace <> with the model name.
model_fn_open = 'models/<>.pkl'

In [None]:
%time rf = pickle.load(open(model_fn_open, 'rb'))