https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/

# Imports

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

In [3]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from datetime import datetime as dt

In [1]:
# Directory holding the raw competition data; test.csv is read from here.
PATH = 'data/nyc-taxi/'
# Output directory for the submission CSV files written by the batch-fit loop.
CSV_PATH = 'submissions/v09_ensemble_10trees_50M_rows'

The entire dataset has 55,423,857 rows (about 55.4 million).

# Helper functions

In [26]:
def split_df(df, y_fld, subset=None, random_state=42):
    """Split a DataFrame into a feature frame and a target array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
        It is left unmodified.
    y_fld : str
        Name of the target column.
    subset : int, optional
        If truthy, this many rows are drawn at random with ``DataFrame.sample``
        before splitting; otherwise every row is used.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so the subsample is repeatable.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a new DataFrame without the ``y_fld`` column
        and ``y`` is the array of target values (``df[y_fld].values``).
    """
    if subset: df = df.sample(n=subset, random_state=random_state)

    y = df[y_fld].values
    # Drop into a new frame instead of in place: an in-place drop strips the
    # target column from the caller's DataFrame when no subsample is taken,
    # so re-running the calling cell would fail with a KeyError.
    X = df.drop(y_fld, axis=1)

    return X, y

In [27]:
def rmse(x, y):
    """Return the root-mean-squared error between ``x`` and ``y``.

    Both arguments must support element-wise subtraction and the result must
    expose a ``.mean()`` method (e.g. NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print score and RMSE of model ``m`` on the training and validation sets.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    If the model exposes ``oob_score_`` it is reported as well.
    """
    # in scikit-learn, random forest regressor's score is the r-squared value
    r2_train = m.score(X_train, y_train)
    r2_val = m.score(X_val, y_val)

    # RMSE is what we are more interested in
    err_train = rmse(m.predict(X_train), y_train)
    err_val = rmse(m.predict(X_val), y_val)

    report = [
        f"Training-Score:   {r2_train:.5f}\tTraining-rmse:   {err_train:.5f}",
        f"Validation-score: {r2_val:.5f}\tValidation-rmse: {err_val:.5f}",
    ]
    if hasattr(m, 'oob_score_'):
        report.append(f"OOB-score:\t  {m.oob_score_:.5f}")

    print("\n".join(report))

# Load data

In [28]:
seed = 42 # random seed for reproducibility

In [29]:
# Load the train / validation / test frames from feather files
# (presumably checkpoints from an earlier preprocessing step — confirm).
train_df = pd.read_feather('tmp/v3/taxi-train-v3-chkpt4')
val_df = pd.read_feather('tmp/v3/taxi-v3-val')
test_df = pd.read_feather('tmp/v3/taxi-test-v3-chkpt4')
# Only the `key` column of the raw test CSV is read; it labels the submission rows.
test_df_raw = pd.read_csv(f'{PATH}test.csv', usecols=['key'])

In [30]:
# Show (rows, columns) of each loaded frame.
train_df.shape, val_df.shape, test_df.shape

((54054206, 17), (9674, 17), (9914, 16))

In [31]:
%%time
# Number of training rows to draw at random before fitting.
subsample = 50_000_000

# Sample `subsample` rows from train_df and separate the features from the fare_amount target.
X_train, y_train = split_df(train_df, 'fare_amount', subset=subsample, random_state=seed)

Wall time: 3.26 s


In [32]:
# Validation set is used in full (no subsampling); fare_amount is the target.
X_val, y_val = split_df(val_df, 'fare_amount')

In [33]:
# Show the shapes of the split data next to the test frame.
# NOTE(review): the recorded output shows 100,000 training rows, which does not
# match subsample = 50_000_000 — the output looks stale; re-run to refresh.
X_train.shape, X_val.shape, y_train.shape, y_val.shape, test_df.shape

((100000, 16), (9674, 16), (100000,), (9674,), (9914, 16))

# Batch fit

In [34]:
# Directory that receives one submission CSV per fitted forest.
outdir = CSV_PATH
os.makedirs(outdir, exist_ok=True)
# Test-set predictions of every forest, appended in fit order.
preds = []
# Number of forests to fit in the batch loop.
n_forest=1

In [None]:
%%time
# Settings shared by every forest; defined once so the log line, the estimator
# and the file names cannot drift apart.
n_trees = 10  # trees per forest
desc = 'v08_RF_' # description of submission for reference
# NOTE(review): prefix says v08 while CSV_PATH (the output directory) says v09 — confirm which is intended.
ensemb_desc = '_01x10_50M'

for i in range(n_forest):

    # Offset the base seed per forest so each one gets a different random state.
    new_seed = seed + i + 3

    print(f"Random Forest [{i+1}/{n_forest}] of {n_trees} trees with random seed {new_seed}")
    m = RandomForestRegressor(n_estimators=n_trees, min_samples_leaf=3, max_features=0.5, n_jobs=-1, random_state=new_seed)

    print("Fitting on training data...")
    m.fit(X_train, y_train)

    print("Predicting on test data...")
    test_y_predictions = m.predict(test_df)
    # Keep every forest's predictions so they can be combined afterwards.
    preds.append(test_y_predictions)

    print("Creating submission file...")

    # The timestamp is taken per forest so each submission gets its own file name.
    cur_dt = dt.now().strftime("%Y%m%d_%H%M%S")
    subm_fn = desc + cur_dt + ensemb_desc + '.csv'
    subm_path = f'{outdir}/{subm_fn}'

    # Write the predictions to a CSV file which we can submit to the competition.
    submission = pd.DataFrame(
        {'key': test_df_raw.key, 'fare_amount': test_y_predictions},
        columns=['key', 'fare_amount'])
    submission.to_csv(subm_path, index=False)

    print(f"Submission: {subm_fn}")
    print()

In [None]:
# preds[0]

In [None]:
# test_y_predictions = np.mean(np.vstack([preds]),axis=0)

# Results