https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/

# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

In [3]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

In [4]:
PATH = 'data/nyc-taxi/'

The entire dataset has about 55,423,857 rows

# Load clean dataframe

In [5]:
%%time
train_df = pd.read_feather('tmp/v2/taxi-train-v2-chkpt4')

Wall time: 7.06 s


TODO: Remove val from train_df using the xls

In [6]:
val_df = pd.read_feather('tmp/v2/taxi-v2-val')

In [7]:
test_df = pd.read_feather('tmp/v2/taxi-test-v2-chkpt4')

In [8]:
%%time
subsample = 20_000_000

X_train, y_train, _ = proc_df(train_df, 'fare_amount', subset=subsample)

Wall time: 35.9 s


In [9]:
X_val, y_val, _ = proc_df(val_df, 'fare_amount', subset=subsample)

In [10]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape, test_df.shape

((20000000, 16), (9674, 16), (20000000,), (9674,), (9914, 16))

# Train

## Helper function to calculate evaluation metric

In [11]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):  
    
    # in scikit-learn, random forest regressor,score is the r-squared value
    train_scr = m.score(X_train, y_train)
    val_scr = m.score(X_val, y_val)
    
    # RMSE is what we are more interested in
    train_rmse = rmse(m.predict(X_train), y_train)
    val_rmse = rmse(m.predict(X_val), y_val)
    if hasattr(m, 'oob_score_'): oob_scr = m.oob_score_
        
    res = f"Training-Score:   {train_scr:.5f}\tTraining-rmse:   {train_rmse:.5f}\n"
    res += f"Validation-score: {val_scr:.5f}\tValidation-rmse: {val_rmse:.5f}"   
    if hasattr(m, 'oob_score_'): res += f"\nOOB-score:\t  {oob_scr:.5f}"
        
    print(res)

## Fit

In [12]:
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, max_features=0.5, n_jobs=-1)

In [13]:
%time m.fit(X_train, y_train)

Wall time: 2h 25min 56s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
%time print_score(m)

Training-Score:   0.95189	Training-rmse:   2.08844
Validation-score: 0.91975	Validation-rmse: 2.60518
Wall time: 44min


# Predict on test data

In [15]:
# We need 'key' for the test submission file
test_df_raw = pd.read_csv(f'{PATH}test.csv', usecols=['key'])

In [16]:
test_y_predictions = m.predict(test_df)

# Submit

In [17]:
from datetime import datetime as dt

In [18]:
outdir = 'submissions'
os.makedirs(outdir, exist_ok=True)
    
cur_dt = dt.now().strftime("%Y%m%d_%H%M%S")
desc = 'v05_RF_' # description of submission for reference
subm_fn =  desc + cur_dt + '.csv'
subm_path = f'{outdir}/{subm_fn}'

# Write the predictions to a CSV file which we can submit to the competition.
submission = pd.DataFrame(
    {'key': test_df_raw.key, 'fare_amount': test_y_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv(subm_path, index = False)

subm_path

'submissions/v05_RF_20180731_202241.csv'

Kaggle Score: 2.99720

# Save model

In [None]:
model_fn_save = 'models/<>'

In [None]:
%time pickle.dump(m, open(model_fn_save, 'wb'))

# Load Model

In [None]:
model_fn_open = 'models/<>'

In [None]:
%time m = pickle.load(open(model_fn_open, 'rb'))