# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import os
import math

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [4]:
PATH = 'data/nyc-taxi/'

# Helper functions

In [5]:
def split_df(df, y_fld, subset=None, random_state=42):
    """Split a DataFrame into a feature frame and a target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is never modified by this function.
    y_fld : str
        Name of the target column to separate out.
    subset : int, optional
        If truthy, draw this many rows at random with ``DataFrame.sample``
        before splitting. ``None`` (or ``0``) keeps every row.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so subsets are reproducible.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without ``y_fld`` and ``y`` is
        the ``.values`` array of ``y_fld``, row-aligned with ``X``.
    """
    if subset:
        df = df.sample(n=subset, random_state=random_state)

    y = df[y_fld].values
    # Non-inplace drop: the previous in-place drop stripped the target column
    # from the caller's own frame whenever no subset was requested.
    X = df.drop(y_fld, axis=1)

    return X, y

In [6]:
def rmse(x, y):
    """Return the root-mean-square error between ``x`` and ``y``.

    ``x - y`` must support ``** 2`` and the result must expose ``.mean()``
    (NumPy arrays do). The result is a plain Python float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a score report for a fitted model or for an array of CV scores.

    Parameters
    ----------
    m : estimator or array-like
        * An object with a ``score`` method is treated as a fitted model.
          Its ``score`` and RMSE are reported on the notebook-level
          ``X_train``/``y_train`` and ``X_val``/``y_val`` globals, which must
          already be defined. ``oob_score_`` is appended when the model
          exposes it.
        * Anything else is treated as a sequence of cross-validation scores
          and summarised as all scores, mean and standard deviation.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Several cells pass the ndarray returned by cross_val_score; it has no
    # ``score``/``predict``, so summarise it instead of failing.
    if not hasattr(m, 'score'):
        scores = np.asarray(m)
        print(f"All scores:{scores}\nMean:{scores.mean()}\nStandard Deviation:{scores.std()}")
        return

    train_scr = m.score(X_train, y_train)
    val_scr = m.score(X_val, y_val)

    train_rmse = rmse(m.predict(X_train), y_train)
    val_rmse = rmse(m.predict(X_val), y_val)

    res = f"Training-Score:   {train_scr:.5f}\tTraining-rmse:   {train_rmse:.5f}\n"
    res += f"Validation-score: {val_scr:.5f}\tValidation-rmse: {val_rmse:.5f}"
    # Only models fitted with out-of-bag scoring carry this attribute.
    if hasattr(m, 'oob_score_'):
        res += f"\nOOB-score:\t  {m.oob_score_:.5f}"

    print(res)

In [7]:
def _cv_rmse_report(features, target):
    """Print a labelled 5-fold CV RMSE summary for the global model ``m`` and return the per-fold RMSEs."""
    # Label comes from the data so it stays correct if the sample sizes change.
    print(f"{len(features):,} subsample")
    # sklearn reports negated MSE; flip the sign and take the root to get RMSE.
    scores = np.sqrt(-cross_val_score(m, features, target, cv=5, scoring='neg_mean_squared_error'))
    print(f"All scores:{scores}\nMean:{scores.mean()}\nStandard Deviation:{scores.std()}")
    return scores


def run_tests(dfs, df):
    """Cross-validate the global model ``m`` on two feature frames and report RMSE.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Features of the smaller sample; scored against the global target ``ys``.
    df : pandas.DataFrame
        Features of the larger sample; scored against the global target ``y``.

    Returns
    -------
    float
        Mean of the 5-fold RMSE scores obtained on ``df`` (the second frame).

    Notes
    -----
    Reads ``m``, ``ys`` and ``y`` from the notebook namespace, so the rows of
    ``dfs``/``df`` must still line up with those target arrays.
    """
    _cv_rmse_report(dfs, ys)

    print()

    return _cv_rmse_report(df, y).mean()

# Prepare Data

In [8]:
seed = 101 # random seed for replication

In [9]:
%%time
# Load the training data from a Feather file; the path is a literal relative path.
# NOTE(review): the PATH constant defined earlier is not used here — confirm which location is intended.
train_df = pd.read_feather('tmp/taxi-train-v8-Baseline')

Wall time: 1.63 s


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54057746 entries, 0 to 54057745
Data columns (total 15 columns):
fare_amount           float32
pickup_longitude      float32
pickup_latitude       float32
dropoff_longitude     float32
dropoff_latitude      float32
passenger_count       uint8
year                  uint8
month                 uint8
week                  uint8
dayofweek             uint8
day                   uint8
hour                  uint8
longitude_distance    float32
latitude_distance     float32
fare_increased        bool
dtypes: bool(1), float32(7), uint8(7)
memory usage: 1.8 GB


In [11]:
train_df.shape

(54057746, 15)

In [12]:
%%time
# Smaller working sample: 200,000 randomly drawn rows of train_df, split into
# features (Xs) and the fare_amount target (ys). Seeded for reproducibility.
subsamples = 200_000
Xs, ys = split_df(train_df, 'fare_amount', subset=subsamples, random_state=seed)

Wall time: 3.2 s


In [13]:
%%time
# Larger working sample: 1,000,000 randomly drawn rows of train_df, split into
# features (X) and the fare_amount target (y). Uses the same seed as the smaller draw.
subsample = 1_000_000
X, y = split_df(train_df, 'fare_amount', subset=subsample, random_state=seed)

Wall time: 3.46 s


In [14]:
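# NOTE: print_score(model) reads X_train, X_val, y_train and y_val as globals; uncomment the split below before scoring a fitted model.
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=seed)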

# Model

In [15]:
# Random forest shared by every experiment below; seeded so runs are comparable.
m = RandomForestRegressor(
    n_estimators=40,
    min_samples_leaf=3,
    max_features=0.5,
    n_jobs=-1,
    random_state=seed,
)

# Various Features

## Clean Baseline

In [16]:
%%time
# Baseline: run_tests returns the mean CV RMSE on its second argument (X);
# later experiments subtract their own score from this value.
benchmark_score = run_tests(Xs,X)

200,000 subsample
All scores:[3.70243413 3.89522313 3.66540262 3.6205772  3.64173309]
Mean:3.7050740361043326
Standard Deviation:0.09888301830641673

1,000,000 subsample
All scores:[3.55749296 3.58194821 3.62336068 3.67396847 3.66222735]
Mean:3.619799534490087
Standard Deviation:0.044859573380195056
Wall time: 7min 32s


## Manhattan Distance

In [17]:
def add_manhattan_dist(df):
    """Return a copy of ``df`` with an added ``manhattan_dist`` column.

    The column is ``|longitude_distance| + |latitude_distance|``. The input
    frame is left untouched.
    """
    abs_lon = df['longitude_distance'].abs()
    abs_lat = df['latitude_distance'].abs()
    return df.assign(manhattan_dist=abs_lon + abs_lat)

In [18]:
# Drop any augmented frames left from an earlier run before building new ones.
Xs_add = X_add = None

Xs_add, X_add = add_manhattan_dist(Xs), add_manhattan_dist(X)

In [19]:
%%time
test_score = run_tests(Xs_add,X_add)

diff = benchmark_score-test_score
if diff>0: print("\nBETTER by", diff) 
else: print("\nWORSE by", diff)
    
Xs_add = None
X_add = None

200,000 subsample
All scores:[3.70846434 3.89737825 3.66097535 3.64430964 3.62515459]
Mean:3.707256433897386
Standard Deviation:0.09898759172474675

1,000,000 subsample
All scores:[3.54952014 3.58757212 3.62328773 3.68453646 3.66474905]
Mean:3.621933098709799
Standard Deviation:0.049352881759103774

WORSE by -0.002133564219712003
Wall time: 7min 34s


## Euclidean Distance

In [20]:
def add_euclidean_dist(df):
    """Return a copy of ``df`` with an added ``euclidean_dist`` column.

    The column is ``sqrt(longitude_distance**2 + latitude_distance**2)``.
    The input frame is left untouched.
    """
    df_new = df.copy()
    # Column was previously mislabelled 'manhattan_dist' (copy-paste slip).
    # Squaring already discards the sign, so no abs() is needed.
    df_new['euclidean_dist'] = np.sqrt(df_new.longitude_distance**2 + df_new.latitude_distance**2)

    return df_new

In [21]:
# Drop any augmented frames left from an earlier run before building new ones.
Xs_add = X_add = None

Xs_add, X_add = add_euclidean_dist(Xs), add_euclidean_dist(X)

In [22]:
%%time
test_score = run_tests(Xs_add,X_add)

diff = benchmark_score-test_score
if diff>0: print("\nBETTER by", diff) 
else: print("\nWORSE by", diff)
    
Xs_add = None
X_add = None

200,000 subsample
All scores:[3.71844683 3.91284922 3.67402372 3.62148579 3.63389177]
Mean:3.7121394655956963
Standard Deviation:0.10594129530943752

1,000,000 subsample
All scores:[3.55083359 3.59121719 3.63248357 3.67738491 3.67776328]
Mean:3.6259365088749123
Standard Deviation:0.04944025853479972

WORSE by -0.006136974384825411
Wall time: 7min 32s


## Drop One Feature at a time

In [23]:
features_to_drop = ['passenger_count', 'fare_increased','year', 'month', 'day', 'week', 'dayofweek', 'hour']

In [24]:
%%time
for feature in features_to_drop:
    
    Xs_drop = Xs.drop(feature, axis=1)
    X_drop = X.drop(feature, axis=1)
    print("Drop", feature)
    assert feature not in Xs_drop.columns
    
    test_score = run_tests(Xs_drop,X_drop)

    diff = benchmark_score-test_score
    if diff>0: print("\nBETTER by", diff) 
    else: print("\nWORSE by", diff)
    print("\n\n\n")

Drop passenger_count
200,000 subsample
All scores:[3.7145035  3.90644737 3.68157383 3.6117234  3.6312096 ]
Mean:3.7090915398195192
Standard Deviation:0.10514940552277423

1,000,000 subsample
All scores:[3.5411621  3.579412   3.61913385 3.66702261 3.66710252]
Mean:3.6147666158016953
Standard Deviation:0.04930797664973391

BETTER by 0.005032918688391508




Drop fare_increased
200,000 subsample
All scores:[3.71108295 3.91818558 3.69972599 3.62045207 3.65117893]
Mean:3.720125106397881
Standard Deviation:0.10432078054233636

1,000,000 subsample
All scores:[3.54980705 3.5731588  3.61907192 3.6878341  3.66939594]
Mean:3.619853558360294
Standard Deviation:0.05322279036730862

WORSE by -5.402387020714272e-05




Drop year
200,000 subsample
All scores:[3.68662367 3.88903533 3.6550436  3.61964856 3.62897467]
Mean:3.695865166826149
Standard Deviation:0.09934769984165118

1,000,000 subsample
All scores:[3.54667651 3.58720601 3.6205853  3.69313262 3.66550422]
Mean:3.6226209317859017
Standard Deviat

## Drop Multi Feature Based on Previous

In [25]:
features_to_drop = ['passenger_count', 'year', 'dayofweek']

In [26]:
%%time
Xs_drop = None
X_drop = None

Xs_drop = Xs.drop(features_to_drop, axis=1)
X_drop = X.drop(features_to_drop, axis=1)
print("Drop", features_to_drop)

test_score = run_tests(Xs_drop, X_drop)

diff = benchmark_score-test_score
if diff>0: print("\nBETTER by", diff) 
else: print("\nWORSE by", diff)
    
Xs_drop = None
X_drop = None

Drop ['passenger_count', 'year', 'dayofweek']
200,000 subsample
All scores:[3.70235478 3.90569159 3.67521124 3.61724302 3.65061806]
Mean:3.7102237381707455
Standard Deviation:0.10167962301124032

1,000,000 subsample
All scores:[3.57624366 3.61092069 3.64774475 3.69007779 3.68239868]
Mean:3.6414771112274487
Standard Deviation:0.04304644564517624

WORSE by -0.021677576737361814
Wall time: 6min 6s


## Drop Multi Feature and Add Manhattan Distance

# Previous Results

## No Distance (Worse) - Distance important!

In [43]:
%%time
# 5-fold cross-validated RMSE of model m on X/y (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.73207096 3.64971172 3.78855846 3.70434415 3.75709017]
Mean:3.726355092469748
Standard Deviation:0.047339757272736475
Wall time: 6min 15s


In [44]:
%%time
# 5-fold cross-validated RMSE of model m on Xs/ys (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.18556465 4.09533261 4.11378021 3.91492525 3.8575003 ]
Mean:4.033420603872139
Standard Deviation:0.12524190846900796
Wall time: 28.9 s


## No Price Flag - Slightly worse, meaning the flag helps a bit

In [34]:
%%time
# 5-fold cross-validated RMSE of model m on X/y (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.63846212 3.5470505  3.70158152 3.62338888 3.67458945]
Mean:3.6370144943106792
Standard Deviation:0.052653680121059726
Wall time: 6min 35s


In [35]:
%%time
# 5-fold cross-validated RMSE of model m on Xs/ys (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.15191537 3.838425   3.89519449 3.82329984 3.74014949]
Mean:3.889796835750949
Standard Deviation:0.14013600889515976
Wall time: 29.8 s


## Simple Distance with Dropped Features - Worse

In [25]:
%%time
# 5-fold cross-validated RMSE of model m on X/y (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.65201921 3.57102133 3.72271901 3.65535705 3.69958211]
Mean:3.6601397429458418
Standard Deviation:0.05196686419026182
Wall time: 5min 59s


In [26]:
%%time
# 5-fold cross-validated RMSE of model m on Xs/ys (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.11515771 3.82088883 3.92610813 3.82803286 3.73762595]
Mean:3.8855626929784477
Standard Deviation:0.1294105692233161
Wall time: 27.6 s


## Adaptive Distance Results - Almost the same

In [15]:
%%time
# 5-fold cross-validated RMSE of model m on X/y (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.63706619 3.54593837 3.70929003 3.61300545 3.66439731]
Mean:3.6339394689207567
Standard Deviation:0.054401452590344075
Wall time: 6min 40s


In [16]:
%%time
# 5-fold cross-validated RMSE of model m on Xs/ys (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.10145912 3.85218782 3.89292442 3.7888432  3.74667721]
Mean:3.876418353477573
Standard Deviation:0.12329240623805218
Wall time: 30.9 s


## Straight Distance Results - Baseline for comparison

In [24]:
%%time
# 5-fold cross-validated RMSE of model m on X/y (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.62629522 3.54219699 3.69463875 3.62451808 3.66437177]
Mean:3.630404162840859
Standard Deviation:0.051209209746410786
Wall time: 6min 34s


In [25]:
%%time
# 5-fold cross-validated RMSE of model m on Xs/ys (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.10352632 3.86554431 3.88060332 3.79846667 3.74304178]
Mean:3.8782364803056653
Standard Deviation:0.12293334306792435
Wall time: 30.5 s


## Manhattan Distance Results - Almost the same

In [15]:
%%time
# 5-fold cross-validated RMSE of model m on X/y (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, X, y, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[3.63766713 3.55590959 3.68609126 3.61896021 3.65826552]
Mean:3.631378740620408
Standard Deviation:0.043830845785230284
Wall time: 6min 46s


In [16]:
%%time
# 5-fold cross-validated RMSE of model m on Xs/ys (root of the negated neg-MSE scores).
scores = np.sqrt(-cross_val_score(m, Xs, ys, cv=5, scoring='neg_mean_squared_error'))
print_score(scores)

All scores:[4.11236619 3.87212126 3.89425008 3.79124005 3.71353413]
Mean:3.876702340338427
Standard Deviation:0.13401047052087886
Wall time: 29.1 s


# Save model

In [None]:
import pickle

In [None]:
# Placeholder: replace <> with the model name. The .pkl extension matches the
# path used by the load cell so a saved model can be read back by the same name.
model_fn_save = 'models/<>.pkl'

In [None]:
%time pickle.dump(m, open(model_fn_save, 'wb'))

# Load Model

In [None]:
model_fn_open = 'models/<>.pkl'

In [None]:
%time rf = pickle.load(open(model_fn_open, 'rb'))