In [9]:
import pandas as pd
import data_cleaning

# Columns that read_csv should parse as datetimes rather than strings.
time_attrs = ['pickup_datetime', 'dropoff_datetime']

# Load the raw training data, push it through the project cleaning pipeline,
# and keep an independent copy of the regression target.
taxi = pd.read_csv('data/train.csv', parse_dates=time_attrs)
taxi_cleaned = data_cleaning.pipeline.fit_transform(taxi)
taxi_labels = taxi['trip_duration'].copy()

# Names for the first eight columns of the pipeline output, listed in
# positional order (index 0 first).
# NOTE(review): this ordering is assumed to match data_cleaning.pipeline — confirm.
# NOTE(review): dropoff_datetime is kept as a feature; combined with
# pickup_datetime it may leak trip_duration — confirm this is intended.
feature_names = [
    'pickup_datetime',
    'dropoff_datetime',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'distance',
]

# Labelled view of the cleaned array, one column per feature name.
taxi_cleaned_pd = pd.DataFrame({
    name: taxi_cleaned[:, position]
    for position, name in enumerate(feature_names)
})

In [10]:
# Summary statistics for the cleaned feature frame. In the recorded output every
# column has mean ~0 and std 1, so the features appear to be standardized, yet
# the coordinate and distance columns still reach hundreds of standard
# deviations from the mean (extreme outliers survive the cleaning step).
taxi_cleaned_pd.describe()

Unnamed: 0,distance,dropoff_datetime,dropoff_latitude,dropoff_longitude,passenger_count,pickup_datetime,pickup_latitude,pickup_longitude
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.1076020000000001e-17,-1.205094e-14,1.014493e-13,3.885016e-14,-6.459772000000001e-17,-2.63953e-14,3.492914e-15,-3.314967e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.800846,-1.773205,-238.7999,-678.9027,-1.266532,-1.773048,-194.3734,-676.4262
25%,-0.5141414,-0.8476855,-0.4434273,-0.2535194,-0.5056372,-0.8477397,-0.4128277,-0.2592424
50%,-0.3135425,0.005459914,0.07591738,-0.08968501,-0.5056372,0.005466079,0.09670856,-0.1164505
75%,0.1011218,0.8481785,0.5018093,0.1472646,0.2552578,0.8482028,0.5303712,0.08681542
max,288.0152,1.775168,88.30261,178.8973,5.581523,1.756776,338.4965,178.2458


In [11]:
# Build a separate frame holding the cleaned features plus the raw target so
# the two can be inspected together; taxi_cleaned_pd itself is left untouched.
# The target is aligned on the row index, exactly as a column assignment would.
taxi_cleaned_pd_label = taxi_cleaned_pd.assign(
    trip_duration=taxi['trip_duration'],
)

In [12]:
# Same summary with the target column appended. trip_duration is not rescaled:
# in the recorded output it spans 1 to 3526282 with a median of 662 and a mean
# of ~959, i.e. it is strongly right-skewed by a few very large values.
taxi_cleaned_pd_label.describe()

Unnamed: 0,distance,dropoff_datetime,dropoff_latitude,dropoff_longitude,passenger_count,pickup_datetime,pickup_latitude,pickup_longitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.1076020000000001e-17,-1.205094e-14,1.014493e-13,3.885016e-14,-6.459772000000001e-17,-2.63953e-14,3.492914e-15,-3.314967e-14,959.4923
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5237.432
min,-0.800846,-1.773205,-238.7999,-678.9027,-1.266532,-1.773048,-194.3734,-676.4262,1.0
25%,-0.5141414,-0.8476855,-0.4434273,-0.2535194,-0.5056372,-0.8477397,-0.4128277,-0.2592424,397.0
50%,-0.3135425,0.005459914,0.07591738,-0.08968501,-0.5056372,0.005466079,0.09670856,-0.1164505,662.0
75%,0.1011218,0.8481785,0.5018093,0.1472646,0.2552578,0.8482028,0.5303712,0.08681542,1075.0
max,288.0152,1.775168,88.30261,178.8973,5.581523,1.756776,338.4965,178.2458,3526282.0


In [13]:
# Pairwise correlations between every feature and the target, then the
# target's column ranked from most positive to most negative.
corr_matrix = taxi_cleaned_pd_label.corr()
corr_matrix.loc[:, "trip_duration"].sort_values(ascending=False)

trip_duration        1.000000
distance             0.094777
pickup_longitude     0.026542
dropoff_longitude    0.014678
passenger_count      0.008471
dropoff_datetime     0.007819
pickup_datetime      0.006644
dropoff_latitude    -0.020677
pickup_latitude     -0.029204
Name: trip_duration, dtype: float64

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline: fit an unconstrained decision tree and score it on the very rows
# it was trained on. This is a training-set RMSE, not a generalization
# estimate — a value of 0.0 only shows that the tree memorised the data.
# random_state is fixed so that ties between equally good splits are broken
# the same way on every run, keeping the fitted tree reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(taxi_cleaned, taxi_labels)
tree_predictions = tree_reg.predict(taxi_cleaned)
tree_mse = mean_squared_error(taxi_labels, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [18]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print per-fold RMSE values together with their mean and spread.

    Parameters
    ----------
    scores : array-like of numbers
        Negated mean-squared-error values, such as those produced by
        ``cross_val_score(..., scoring="neg_mean_squared_error")``. Plain
        lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Coerce to an array first: unary minus is undefined for lists/tuples.
    # The scores are negated MSEs, so flip the sign before taking the root.
    rmse_scores = np.sqrt(-np.asarray(scores))
    print("Scores:", rmse_scores)
    print("Mean:", rmse_scores.mean())
    print("Standard deviation:", rmse_scores.std())
    
# 10-fold cross-validation of a fresh, unconstrained tree. The scorer returns
# negated MSE (higher is better), which display_scores converts back to RMSE.
# random_state is fixed so the per-fold scores are reproducible across runs.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_scores = cross_val_score(
    tree_reg,
    taxi_cleaned,
    taxi_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
display_scores(tree_scores)

Scores: [  4412.82583894   4299.74111742   6714.11620977   4208.50937834
   7022.34362978   4438.05889609  11539.14918485   4385.91554089
   6700.14671706   4507.71951653]
Mean: 5822.85260297
Standard deviation: 2197.8395301


In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate depth limits to compare; shallower trees trade fit for less
# overfitting than the unconstrained tree evaluated earlier.
param_grid = [
    {'max_depth': [3, 5, 10]},
]

# random_state is fixed so that every candidate is fitted deterministically
# and the selected depth does not change between runs because of tie-breaking.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_grid_search = GridSearchCV(
    tree_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

tree_grid_search.fit(taxi_cleaned, taxi_labels)

tree_grid_search.best_params_

{'max_depth': 5}

In [20]:
# Report the cross-validated RMSE next to each parameter combination tried.
# mean_test_score holds negated MSE, so negate before taking the square root.
cvres = tree_grid_search.cv_results_
for idx, candidate in enumerate(cvres['params']):
    neg_mse = cvres['mean_test_score'][idx]
    print(np.sqrt(-neg_mse), candidate)

5208.92988043 {'max_depth': 3}
5207.40653419 {'max_depth': 5}
5474.82326864 {'max_depth': 10}
