<a href="https://colab.research.google.com/github/yunhui666/Kaggle_NYC_Taxi_Duration_Prediction_Project/blob/main/Modeling%2BResults.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Data

In [None]:
# Mount Google Drive so the shared-drive dataset paths read below are accessible (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

In [None]:
# Load the training data from the mounted shared drive.
data = pd.read_csv("/content/drive/Shareddrives/Big Data Final/NYC Taxi Data/train.csv")

In [None]:
# Preview the first rows rather than rendering the whole training frame.
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


## Data Transformation

In [None]:
#Datetime transformation & feature extraction
# Parse the pickup timestamp once, then derive the calendar features with the
# vectorized .dt accessor instead of calling a Python lambda for every row.
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])

# Columns are created in the order month, weekday, hour; the dummy-coding step
# later emits its indicator columns in this same order.
data['pickup_month'] = data['pickup_datetime'].dt.month
data['pickup_weekday'] = data['pickup_datetime'].dt.weekday  # Monday=0 ... Sunday=6
data['pickup_hour'] = data['pickup_datetime'].dt.hour

In [None]:
# Remove the identifier and the raw timestamp columns; the calendar features
# extracted from pickup_datetime stay in the frame.
data=data.drop(columns=['dropoff_datetime','id','pickup_datetime'])

In [None]:
#Eliminate outliers
# Keep only the trips whose trip_duration is strictly below 8000.
data = data.loc[data['trip_duration'] < 8000]

In [None]:
# For computational efficiency a random 100,000-row subsample can be used instead
# (df = data.sample(n=100000)); this cell currently keeps every filtered row.
df=data.copy()

In [None]:
#Dummy coding (Step 1).
# Categorical predictors are expanded into indicator columns; numeric
# variables (including the target) are cast to float64.
cvar_list = ['vendor_id', 'store_and_fwd_flag', 'pickup_weekday', 'pickup_hour', 'pickup_month']
nvar_list = ['passenger_count', 'pickup_longitude', 'pickup_latitude',
             'dropoff_longitude', 'dropoff_latitude', 'trip_duration']

# Cast both groups in a single pass, then one-hot encode every category column.
df = df.astype({**dict.fromkeys(cvar_list, 'category'),
                **dict.fromkeys(nvar_list, 'float64')})
df = pd.get_dummies(df, prefix_sep='_')

In [None]:
# Remove the redundant dummies (Step 2 of dummy coding)
# One level per categorical variable is dropped and serves as the reference level.
rdummies = [
    'vendor_id_2',
    'store_and_fwd_flag_N',
    'pickup_weekday_4',
    'pickup_hour_18',
    'pickup_month_1',
]
df = df.drop(rdummies, axis='columns')
print(df.columns.values)

['passenger_count' 'pickup_longitude' 'pickup_latitude'
 'dropoff_longitude' 'dropoff_latitude' 'trip_duration' 'vendor_id_1'
 'store_and_fwd_flag_Y' 'pickup_month_2' 'pickup_month_3' 'pickup_month_4'
 'pickup_month_5' 'pickup_month_6' 'pickup_weekday_0' 'pickup_weekday_1'
 'pickup_weekday_2' 'pickup_weekday_3' 'pickup_weekday_5'
 'pickup_weekday_6' 'pickup_hour_0' 'pickup_hour_1' 'pickup_hour_2'
 'pickup_hour_3' 'pickup_hour_4' 'pickup_hour_5' 'pickup_hour_6'
 'pickup_hour_7' 'pickup_hour_8' 'pickup_hour_9' 'pickup_hour_10'
 'pickup_hour_11' 'pickup_hour_12' 'pickup_hour_13' 'pickup_hour_14'
 'pickup_hour_15' 'pickup_hour_16' 'pickup_hour_17' 'pickup_hour_19'
 'pickup_hour_20' 'pickup_hour_21' 'pickup_hour_22' 'pickup_hour_23']


## Machine Learning

In [None]:
# Data Partition
# Hold out 20% of the rows as the test partition; the remainder is used for training.

from sklearn.model_selection import train_test_split

df_partition = df
testpart_size = 0.2

# random_state specifies the seed for the random number generator.
# random_state = 1 unless otherwise noted
df_nontestData, df_testData = train_test_split(df_partition, test_size=testpart_size, random_state=1)

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Prepare train and test sets.
# DV names the dependent variable; every remaining column is a predictor.
DV = 'trip_duration'
xtrain, ytrain = df_nontestData.drop(columns=DV), df_nontestData[DV]
xtest, ytest = df_testData.drop(columns=DV), df_testData[DV]

## Neural network

In [None]:
# Baseline neural network with the default architecture, fitted on the training
# partition. xtrain/ytrain come from the train/test preparation cell, so this
# cell runs on a fresh kernel without relying on leftover variables.
# NOTE(review): the inputs are not standardized; MLPs are sensitive to feature
# scale, which may explain the near-zero R-squared - consider scaling first.
clf = MLPRegressor(max_iter=2000, random_state=1).fit(xtrain, ytrain)


In [None]:
# R-squared of the baseline network on the held-out test partition.
clf.score(xtest, ytest)

-0.0003051070742809969

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter values explored by the neural-network grid search
# (3 architectures x 3 activations x 2 alphas x 2 learning-rate schedules x 1 solver = 36 candidates).
# NOTE(review): (100,1) is a 100-unit layer followed by a 1-unit layer; if a
# single hidden layer was intended it should be (100,) - confirm.
# NOTE(review): scikit-learn documents learning_rate as only used by the 'sgd'
# solver, so with 'adam' both schedules presumably train identical models - confirm.
param_grid = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100, 1)],
    activation=['relu', 'tanh', 'logistic'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
    solver=['adam'],
)

In [None]:
# Cross validation on Neural Network.
# GridSearchCV clones the estimator for every candidate/fold, so an unfitted
# MLPRegressor is passed; fitting it beforehand would only waste a training run.
# The search is fitted on the training partition (xtrain/ytrain), which keeps
# the test partition unseen and lets the cell run on a fresh kernel.
gridsearch = GridSearchCV(MLPRegressor(max_iter=2000, random_state=1),
                          param_grid=param_grid,
                          scoring=["r2", "neg_root_mean_squared_error"],
                          refit="r2", cv=5, n_jobs=-1, verbose=4)
gridsearch.fit(xtrain, ytrain)
# With refit="r2" the best-R-squared configuration is refitted on all of xtrain.
clf_NN = gridsearch.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [None]:
#R-squared of the tuned network on the held-out test partition
clf_NN.score(xtest, ytest)

0.004385652767810444

## XGBoost

In [None]:
# If xgboost is not already available in the runtime, install it first: %pip install xgboost



In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor with fixed hyperparameters; these settings equal the best
# estimator printed after the grid search further down.
xgb_full=XGBRegressor(gamma=0.1, max_depth=8, n_estimators=500, random_state=2)

In [None]:
# Fit the fixed-hyperparameter model on the training partition.
xgb_full_fit=xgb_full.fit(xtrain, ytrain)



In [None]:
# R-squared of the fixed-hyperparameter model on the held-out test partition.
xgb_full_fit.score(xtest,ytest)

0.7981180363340135

In [None]:
#Instantiate the base XGBoost regressor whose hyperparameters the grid search below tunes.
xgb=XGBRegressor(random_state=2)

In [None]:
# Hyperparameter values explored by the XGBoost grid search
# (3 x 3 x 2 x 2 = 36 candidate configurations).
search_space = dict(
    n_estimators=[300, 400, 500],
    max_depth=[4, 6, 8],
    gamma=[0.01, 0.1],
    learning_rate=[0.01, 0.1],
)

In [None]:
#XGBoost: run 5-fold cross validation over the search space.
# R-squared and negative RMSE are both recorded for every candidate; the
# configuration with the best R-squared is refitted on the full training partition.
GS = GridSearchCV(
    xgb,
    search_space,
    scoring=["r2", "neg_root_mean_squared_error"],
    refit="r2",
    cv=5,
    verbose=4,
)
GS.fit(xtrain, ytrain)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END gamma=0.01, learning_rate=0.01, max_depth=4, n_estimators=300; neg_root_mean_squared_error: (test=-459.175) r2: (test=0.493) total time=  27.9s
[CV 2/5] END gamma=0.01, learning_rate=0.01, max_depth=4, n_estimators=300; neg_root_mean_squared_error: (test=-455.757) r2: (test=0.509) total time=  28.9s
[CV 3/5] END gamma=0.01, learning_rate=0.01, max_depth=4, n_estimators=300; neg_root_mean_squared_error: (test=-466.057) r2: (test=0.501) total time=  28.0s
[CV 4/5] END gamma=0.01, learning_rate=0.01, max_depth=4, n_estimators=300; neg_root_mean_squared_error: (test=-466.615) r2: (test=0.501) total time=  27.7s
[CV 5/5] END gamma=0.01, learning_rate=0.01, max_depth=4, n_estimators=300; neg_root_mean_squared_error: (test=-468.998) r2: (test=0.488) total time=  27.6s
[CV 1/5] END gamma=0.01, learning_rate=0.01, max_depth=4, n_estimators=400; neg_root_mean_squared_error: (test=-445.707) r2: (test=0.522) total time=  36

GridSearchCV(cv=5, estimator=XGBRegressor(random_state=2),
             param_grid={'gamma': [0.01, 0.1], 'learning_rate': [0.01, 0.1],
                         'max_depth': [4, 6, 8],
                         'n_estimators': [300, 400, 500]},
             refit='r2', scoring=['r2', 'neg_root_mean_squared_error'],
             verbose=4)

In [None]:
#Evaluate the best grid-search model on the held-out test partition (out-of-sample) and calculate R-squared.
GS_XGB=GS.best_estimator_
GS_XGB.score(xtest, ytest)

0.7769506588314067

In [None]:
# Show the hyperparameters of the model selected by the grid search.
print(GS_XGB)

XGBRegressor(gamma=0.1, max_depth=8, n_estimators=500, random_state=2)


## Predict on test sample for Kaggle submission

In [None]:
#Data transformation
# Apply to the Kaggle test file the same feature engineering used for the training data.
td = pd.read_csv("/content/drive/Shareddrives/Big Data Final/NYC Taxi Data/test.csv")

# Work on a copy so td keeps its original columns (including id) for the submission file.
test_data = td.copy()

test_data['pickup_datetime'] = pd.to_datetime(test_data['pickup_datetime'])

# Vectorized calendar features, created in the same order as for the training data.
test_data['pickup_month'] = test_data['pickup_datetime'].dt.month
test_data['pickup_weekday'] = test_data['pickup_datetime'].dt.weekday  # Monday=0 ... Sunday=6
test_data['pickup_hour'] = test_data['pickup_datetime'].dt.hour

test_data = test_data.drop(columns=['id', 'pickup_datetime'])

cvar_list = ['vendor_id', 'store_and_fwd_flag', 'pickup_weekday', 'pickup_hour', 'pickup_month']
# trip_duration is not listed here: it is the value being predicted for these rows.
nvar_list = ['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

test_data[cvar_list] = test_data[cvar_list].astype('category')
test_data[nvar_list] = test_data[nvar_list].astype('float64')

test_data = pd.get_dummies(test_data, prefix_sep='_')

# Remove the redundant dummies (Step 2 of dummy coding)
# errors='ignore' keeps this from failing if a reference level never occurs in the test file.
rdummies = ['vendor_id_2', 'store_and_fwd_flag_N', 'pickup_weekday_4', 'pickup_hour_18', 'pickup_month_1']
test_data = test_data.drop(columns=rdummies, errors='ignore')

# Align with the training feature matrix: same columns in the same order.
# A level missing from the test file becomes an all-zero indicator column, and
# a level the models never saw is dropped, so predict() receives exactly the
# features the models were fitted on.
test_data = test_data.reindex(columns=xtrain.columns, fill_value=0)



In [None]:
# Predict trip durations for the Kaggle test features with the grid-search winner.
predicted = GS_XGB.predict(test_data)

In [None]:
# Predict with the fixed-hyperparameter model instead. When the cells run in order this
# overwrites `predicted` from the previous cell, so these are the values that get exported.
predicted = xgb_full_fit.predict(test_data)

In [None]:
# Inspect the predicted values.
predicted

array([ 820.7733 ,  594.09106,  397.80933, ..., 1456.2723 , 1685.7429 ,
       1178.881  ], dtype=float32)

In [None]:
# Attach the predictions to the unmodified copy of the test file so each id is paired with its prediction.
td['trip_duration']=predicted

In [None]:
# Preview the first rows of the submission frame rather than rendering all of it.
td.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.756680,N,820.773315
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,594.091064
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.986160,40.729523,N,397.809326
3,id2150126,2,2016-06-30 23:59:41,1,-73.956070,40.771900,-73.986427,40.730469,N,992.547363
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.961510,40.755890,N,336.543335
...,...,...,...,...,...,...,...,...,...,...
625129,id3008929,1,2016-01-01 00:02:52,1,-74.003464,40.725105,-74.001251,40.733643,N,254.828262
625130,id3700764,1,2016-01-01 00:01:52,1,-74.006363,40.743782,-73.953407,40.782467,N,1273.664795
625131,id2568735,1,2016-01-01 00:01:24,2,-73.972267,40.759865,-73.876602,40.748665,N,1456.272339
625132,id1384355,1,2016-01-01 00:00:28,1,-73.976501,40.733562,-73.854263,40.891788,N,1685.742920


In [None]:
# Save the predictions locally. index=False keeps the DataFrame row index out of
# the file, matching the export performed in the download cell.
td.to_csv('test_result_full.csv', index=False)

In [None]:
# Write the result file without the row index and trigger a browser download (Colab only).
# NOTE(review): td still carries every input column; confirm whether the Kaggle
# submission should contain only id and trip_duration.
from google.colab import files
td.to_csv('test_result_full.csv', index=False)
files.download('test_result_full.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>