# New York City Taxi Trip Duration

In this competition, Kaggle is challenging you to build a model that predicts the total ride duration of taxi trips in New York City. Your primary dataset is one released by the NYC Taxi and Limousine Commission, which includes pickup time, geo-coordinates, number of passengers, and several other variables.

The evaluation metric for this competition is Root Mean Squared Logarithmic Error.

The RMSLE is calculated as

$$\epsilon = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( \log(p_i + 1) - \log(a_i + 1) \right)^2}$$

Where:

$\epsilon$ is the RMSLE value (score),<br>
$n$ is the total number of observations in the (public/private) data set,<br>
$p_i$ is your prediction of trip duration,<br>
$a_i$ is the actual trip duration for observation $i$, and<br>
$\log(x)$ is the natural logarithm of $x$.<br>

Submission File<br>

For every row in the dataset, submission files should contain two columns: `id` and `trip_duration`. The `id` corresponds to the `id` column in `test.csv`. The file should contain a header and have the following format:

|id |trip_duration|
|:-:|:-:|
|id00001|978|
|id00002|978|
|id00003|978|
|id00004|978|


In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
import plotly.offline as py_offline
import plotly.plotly as py
import plotly.graph_objs as go

py_offline.init_notebook_mode(connected=True)

In [2]:
# Read the training set into a DataFrame and show a quick preview of it.
df = pd.read_csv('train.csv')

# Print the first rows, followed by the (rows, columns) shape.
for preview in (df.head(), df.shape):
    print(preview)

          id  vendor_id      pickup_datetime     dropoff_datetime  \
0  id2875421          2  2016-03-14 17:24:55  2016-03-14 17:32:30   
1  id2377394          1  2016-06-12 00:43:35  2016-06-12 00:54:38   
2  id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
3  id3504673          2  2016-04-06 19:32:31  2016-04-06 19:39:40   
4  id2181028          2  2016-03-26 13:30:55  2016-03-26 13:38:10   

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude store_and_fwd_flag  trip_duration  
0         40.765602                  N            455  
1         40.731

In [3]:
# %%time
# test = df.take(np.random.permutation(len(df))[:2000])



# X_pick = df['pickup_longitude'].iloc[0:1500]
# y_pick = df['pickup_latitude'].iloc[0:1500]



# X_drop = df['dropoff_longitude'].iloc[0:1500]
# y_drop = df['dropoff_latitude'].iloc[0:1500]

# trace = go.Scattergeo(locationmode = 'USA-states',
#         lon = X_pick,
#         lat = y_pick,
#         text = "Pick Up Location",
#         mode = 'markers',
#         marker = dict(
#             size = 8,
#             opacity = 0.8,
#             reversescale = True,
#             autocolorscale = False,
#             symbol = 'square',
#             line = dict(
#                 width=1,
#                 color='rgba(102, 102, 102)'
#             ),
       
#         )
#     )

# trace_2 = go.Scattergeo(locationmode = 'USA-states',
#         lon = X_drop,
#         lat = y_drop,
#         text = "Drop Off Location",
#         mode = 'markers',
#         marker = dict(
#             size = 8,
#             opacity = 0.8,
#             reversescale = True,
#             autocolorscale = False,
#             symbol = 'circle',
#             line = dict(
#                 width=1,
#                 color='rgba(255, 0, 0, 0.4)'
#             ),
       
#         )
#     )

# layout = dict(
#         title = 'NYC Taxi pick/drop location',
# #         colorbar = True,
#         geo = dict(
#             scope='usa',
#             projection=dict( type='albers usa' ),
# #             projection=dict( type = 'Mercator' ),
#             showland = True,
#             landcolor = "rgb(250, 250, 250)",
#             subunitcolor = "rgb(217, 217, 217)",
#             countrycolor = "rgb(217, 217, 217)",
#             countrywidth = 0.5,
#             subunitwidth = 0.5,
            
#             lonaxis = dict( range= [ -74.05, -73] ),
#             lataxis = dict( range= [40.5, 41] ),
            
#             center = dict(lon= -73.980415, lat =40.763939)
            
#         ),
#     )

# fig = dict( data=[trace, trace_2], layout=layout )
# py_offline.iplot(fig, validate=False,)

# fig = plt.figure()
# ax1 = fig.add_subplot(111)

# ax1.scatter(X_pick, y_pick, c='b', marker="s", label='Pick up location')
# ax1.scatter(X_drop,y_drop, c='r', marker="o", label='Drop off location')
# plt.legend(loc='upper left');
# plt.show()


In [12]:
# Target: the column the model has to predict.
y = df['trip_duration']

# Feature matrix. 'trip_duration' is dropped together with the identifier/text
# columns: leaving the target among the features leaks the answer into the model
# and makes every validation score meaningless. 'dropoff_datetime' stays excluded
# as well, since together with 'pickup_datetime' it reveals the duration directly.
X = df.drop(
    ['id', 'store_and_fwd_flag', 'pickup_datetime', 'dropoff_datetime', 'trip_duration'],
    axis=1,
)

# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [13]:
# Error function: RMSLE (Root Mean Squared Logarithmic Error)
from sklearn.metrics import mean_squared_error
def calculate_error(y_test, y_pred):
    """Return the Root Mean Squared Logarithmic Error (RMSLE) of two value sets.

    Computes ``sqrt(mean((log(y_test + 1) - log(y_pred + 1)) ** 2))``. The metric
    is symmetric in its two arguments.

    Parameters
    ----------
    y_test : array-like
        Actual values (list, tuple, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_test``.
        Values are paired by position, never by pandas index label.

    Returns
    -------
    numpy.float64
        The RMSLE score (0.0 for a perfect prediction).

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, contain NaN/inf, or contain
        values <= -1 (for which ``log(x + 1)`` is undefined).
    """
    # Coerce to plain float arrays so lists work and Series are compared positionally.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Treat 1-D input as a single column so (n,) and (n, 1) pair up element-wise
    # instead of silently broadcasting to an (n, n) matrix.
    if actual.ndim == 1:
        actual = actual.reshape(-1, 1)
    if predicted.ndim == 1:
        predicted = predicted.reshape(-1, 1)

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred must have the same shape, got {} and {}".format(
                actual.shape, predicted.shape
            )
        )
    if actual.size == 0:
        raise ValueError("y_test and y_pred must not be empty")
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("y_test and y_pred must not contain NaN or infinity")
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("RMSLE is undefined for values <= -1")

    # log1p(x) == log(1 + x) but keeps precision when x is close to zero.
    log_diff = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(np.square(log_diff)))


In [14]:
# Random search over a few RandomForestRegressor hyper-parameters.
# The import lives in this cell because no earlier cell brings RFR into scope;
# with it commented out the loop fails with NameError on a fresh kernel.
from sklearn.ensemble import RandomForestRegressor as RFR

num_iterations = 15

# Dedicated, seeded generator so the sampled configurations are identical on
# every Restart & Run All (the global np.random state is never seeded here).
search_rng = np.random.RandomState(2)

for i in range(num_iterations):
    # randint draws from [low, high): e.g. n_estimators is sampled from 10..19.
    num_est = search_rng.randint(10, 20)
    max_depth = search_rng.randint(5, 12)
    min_samples_split = search_rng.randint(2, 50)
    regr = RFR(n_estimators=num_est,
               max_depth=max_depth,
               min_samples_split=min_samples_split,
               verbose=2,
               random_state=2)

    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    # Arguments follow the calculate_error(y_test, y_pred) signature. The value
    # is an RMSLE (errors are taken on log(x + 1)), so it is labelled as such.
    rmlse = calculate_error(y_test, y_pred)
    print("RMSLE for n_est = {}, max_depth = {}, min_sam = {} is: {}".format(
        num_est, max_depth, min_samples_split, rmlse))
    

In [16]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor as RFR
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import make_scorer

# param_grid = { 
#     'n_estimators': [80, 100, 150],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth': [10, 20, 30],
#     'min_samples_split': [20, 30, 40, 50],   
# }

# rfr = RFR(verbose = 2,
#           random_state = 2)

# my_scorer = make_scorer(calculate_error, greater_is_better = False)
# CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5, scoring=my_scorer)
# CV_rfr.fit(X_train, y_train)
# y_pred = CV_rfr.predict(X_test)
# print (CV_rfr.best_params_)