In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
from utils import preprocess

In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
DATA_PATH = os.path.join('../data')
TRAIN_FILE = os.path.join(DATA_PATH, 'train.csv')
TEST_FILE = os.path.join(DATA_PATH, 'test.csv')

In [5]:
train_features = preprocess(TRAIN_FILE)

In [6]:
train_features.columns

Index([u'id', u'vendor_id', u'pickup_datetime', u'dropoff_datetime',
       u'passenger_count', u'pickup_longitude', u'pickup_latitude',
       u'dropoff_longitude', u'dropoff_latitude', u'store_and_fwd_flag',
       u'trip_duration', u'pickup_hr', u'pickup_min', u'pickup_sec',
       u'pickup_day', u'pickup_date', u'pickup_mon', u'quarter', u'weekday',
       u'holiday', u'st_dist', u'h_dist', u'travel_dir', u'bearing',
       u's_pickup_latitude', u's_pickup_longitude', u's_dropoff_latitude',
       u's_dropoff_longitude', u'flag', u'month_end', u'month_start'],
      dtype='object')

In [7]:
features = train_features[[u'vendor_id', 
       u'passenger_count', u'pickup_hr', u'pickup_min', u'pickup_sec',
       u'pickup_day', u'pickup_date', u'pickup_mon', u'weekday',
       u'holiday', u'st_dist', u'h_dist', u'bearing',
       u's_pickup_latitude', u's_pickup_longitude', u's_dropoff_latitude',
       u's_dropoff_longitude', u'flag', u'month_end', u'month_start']].as_matrix()
targets = train_features['trip_duration']

In [8]:
print features.shape
print targets.shape

(1458644, 20)
(1458644,)


In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [11]:
from matplotlib import pyplot as plt

In [12]:
samples_idx = np.random.choice(features.shape[0], features.shape[0]/2)

In [13]:
X = features[samples_idx]
y = targets[samples_idx]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape

(583457, 20) (145865, 20)
(583457,) (145865,)


In [23]:
dtree = DecisionTreeRegressor(criterion='mse', max_features=20)

In [25]:
parameters = {
    'max_depth': [5, 10, 50, 75, 100, 150, 200],
    'min_samples_leaf': [10, 20, 50]
}

In [14]:
def scorer(est, X, y):
    y_hat = est.predict(X)
    loss = []
    for yi, yj in zip(y, y_hat):
        diff = np.log(yi + 1.0) - np.log(yj + 1.0)
        loss.append(diff ** 2)
    return -np.sqrt(np.mean(loss))

In [30]:
grid = GridSearchCV(dtree, parameters,
                    scoring=scorer, cv=10).fit(X_train, y_train)

In [31]:
grid.cv_results_

{'mean_fit_time': array([  4.15106609,   4.21588173,   4.76327405,   7.81479044,
          7.75697925,   7.5461813 ,  13.85494843,  13.07857499,
         11.23773658,  11.00132673,  10.20325999,  11.09947827,
         13.10630965,  12.44403067,  11.1873482 ,  13.34735091,
         12.47981491,  11.15741374,  13.21314371,  12.55158722,  11.27366269]),
 'mean_score_time': array([ 0.3940829 ,  0.39602225,  0.53074393,  0.39382751,  0.3892127 ,
         0.37535284,  0.40974808,  0.40243511,  0.38749526,  0.35936165,
         0.34999092,  0.42077112,  0.41107745,  0.3884094 ,  0.38892183,
         0.406141  ,  0.40053794,  0.39319613,  0.4056812 ,  0.3947109 ,
         0.3870729 ]),
 'mean_test_score': array([-0.60393728, -0.60322375, -0.59702199, -0.57401923, -0.57530147,
        -0.57770236, -0.53315677, -0.54705647, -0.56473117, -0.53316808,
        -0.54704328, -0.56473482, -0.53315943, -0.54702545, -0.56473298,
        -0.53311665, -0.54702201, -0.56473064, -0.53312342, -0.54700317,
  

In [32]:
grid.best_params_

{'max_depth': 150, 'min_samples_leaf': 10}

In [34]:
print scorer(grid.best_estimator_, X_test, y_test)

-0.52411113335


In [35]:
trace = pd.DataFrame(grid.cv_results_)

In [36]:
trace

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_min_samples_leaf,params,rank_test_score,split0_test_score,split0_train_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,4.151066,0.394083,-0.603937,-0.603252,5,10,"{u'max_depth': 5, u'min_samples_leaf': 10}",21,-0.599521,-0.600978,...,-0.598505,-0.599211,-0.603183,-0.600253,-0.600952,-0.599172,0.204245,0.025074,0.006109,0.00315
1,4.215882,0.396022,-0.603224,-0.602714,5,20,"{u'max_depth': 5, u'min_samples_leaf': 20}",20,-0.600246,-0.601658,...,-0.59744,-0.598635,-0.601748,-0.598818,-0.60065,-0.599056,0.368394,0.056148,0.006091,0.003214
2,4.763274,0.530744,-0.597022,-0.596606,5,50,"{u'max_depth': 5, u'min_samples_leaf': 50}",19,-0.597118,-0.59839,...,-0.589475,-0.591529,-0.600545,-0.597947,-0.598246,-0.595912,0.926754,0.345415,0.005249,0.002327
3,7.81479,0.393828,-0.574019,-0.567659,10,10,"{u'max_depth': 10, u'min_samples_leaf': 10}",16,-0.570287,-0.568363,...,-0.572871,-0.566801,-0.576418,-0.56749,-0.574952,-0.566342,0.306837,0.029347,0.003523,0.003036
4,7.756979,0.389213,-0.575301,-0.569021,10,20,"{u'max_depth': 10, u'min_samples_leaf': 20}",17,-0.57331,-0.56923,...,-0.573719,-0.566445,-0.578688,-0.570194,-0.577865,-0.569426,0.452426,0.032557,0.003471,0.002583
5,7.546181,0.375353,-0.577702,-0.571023,10,50,"{u'max_depth': 10, u'min_samples_leaf': 50}",18,-0.577459,-0.574281,...,-0.575879,-0.568373,-0.581943,-0.572771,-0.580471,-0.570392,0.235361,0.00311,0.004493,0.002525
6,13.854948,0.409748,-0.533157,-0.409575,50,10,"{u'max_depth': 50, u'min_samples_leaf': 10}",3,-0.523687,-0.40871,...,-0.534129,-0.408912,-0.537439,-0.407784,-0.537018,-0.408194,0.64769,0.011689,0.004512,0.00132
7,13.078575,0.402435,-0.547056,-0.472203,50,20,"{u'max_depth': 50, u'min_samples_leaf': 20}",10,-0.541187,-0.471181,...,-0.551515,-0.470049,-0.541154,-0.470786,-0.552951,-0.470459,1.200177,0.020769,0.005146,0.002175
8,11.237737,0.387495,-0.564731,-0.526436,50,50,"{u'max_depth': 50, u'min_samples_leaf': 50}",12,-0.557126,-0.527336,...,-0.567337,-0.522916,-0.55962,-0.524071,-0.569428,-0.527804,0.31794,0.010442,0.004237,0.002469
9,11.001327,0.359362,-0.533168,-0.409575,75,10,"{u'max_depth': 75, u'min_samples_leaf': 10}",5,-0.523781,-0.40871,...,-0.534006,-0.408913,-0.537555,-0.407781,-0.536882,-0.408194,0.209988,0.00945,0.004504,0.001321


In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
forest = RandomForestRegressor(criterion='mse', max_features=20, 
                               max_depth=50, min_samples_leaf=10,
                               n_estimators=100
                              ).fit(X_train, y_train)

In [17]:
scorer(forest, X_test, y_test)

-0.5324625358923355

In [19]:
parameters = {
    'n_estimators': range(100, 600, 100)
}

In [None]:
grid = GridSearchCV(forest, parameters,
                    scoring=scorer, cv=5).fit(X_train, y_train)