NYC Taxi duration analysis from Kaggle

In [39]:
'''
Part 1

In this notebook I:
Verify the data integrity of the dataset
Drop unnecessary columns
Drop passenger counts that are 0,7,8 & 9
Add column for distance between pickup & dropoff
'''

'\nPart 1\n\nIn this notebook I:\nVerify the data integrity of the dataset\nDrop unnecessary columns\nDrop passenger counts that are 0,7,8 & 9\nAdd column for distance between pickup & dropoff\n'

In [1]:
kaggle_url = 'https://www.kaggle.com/c/nyc-taxi-trip-duration'

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
txorig = pd.read_csv('train.csv')

In [5]:
txorig.head(10)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,443
6,id1813257,1,2016-06-17 22:34:59,2016-06-17 22:40:40,4,-73.969017,40.757839,-73.957405,40.765896,N,341
7,id1324603,2,2016-05-21 07:54:58,2016-05-21 08:20:49,1,-73.969276,40.797779,-73.92247,40.760559,N,1551
8,id1301050,1,2016-05-27 23:12:23,2016-05-27 23:16:38,1,-73.999481,40.7384,-73.985786,40.732815,N,255
9,id0012891,2,2016-03-10 21:45:01,2016-03-10 22:05:26,1,-73.981049,40.744339,-73.973,40.789989,N,1225


In [6]:
txorig['id'].count()

1458644

In [7]:
# verify there are no repeats in 'id'

# Boolean mask flagging every 'id' that repeats an earlier row.
check_dups = txorig['id'].duplicated()

# Displays False when no id occurs more than once.
bool(check_dups.any())

False

In [8]:
# there are only 2 vendor ids

txorig['vendor_id'].value_counts()

2    780302
1    678342
Name: vendor_id, dtype: int64

In [9]:
# there are 10 distinct passenger counts, 0-9 (I may drop counts for 0,7,8,9)

txorig['passenger_count'].value_counts().sort_index()

0         60
1    1033540
2     210318
3      59896
4      28404
5      78088
6      48333
7          3
8          1
9          1
Name: passenger_count, dtype: int64

In [10]:
# column description - whether the trip record was held in vehicle memory before sending to the vendor

txorig['store_and_fwd_flag'].value_counts()

N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [11]:
txorig['trip_duration'].describe()

count    1.458644e+06
mean     9.594923e+02
std      5.237432e+03
min      1.000000e+00
25%      3.970000e+02
50%      6.620000e+02
75%      1.075000e+03
max      3.526282e+06
Name: trip_duration, dtype: float64

In [12]:
# Ten largest trip durations, longest first.
txorig['trip_duration'].sort_values(ascending=False).head(10)

978383     3526282
924150     2227612
680594     2049578
355003     1939736
1234291      86392
295382       86391
73816        86390
59891        86387
1360439      86385
753765       86379
Name: trip_duration, dtype: int64

In [13]:
# Inspect the record with the longest trip_duration. Looking the row up with
# idxmax() avoids hard-coding a row number copied from the previous output,
# which would silently point at the wrong record if the data changed.
txorig.loc[txorig['trip_duration'].idxmax()]

id                              id0053347
vendor_id                               1
pickup_datetime       2016-02-13 22:46:52
dropoff_datetime      2016-03-25 18:18:14
passenger_count                         1
pickup_longitude                 -73.7839
pickup_latitude                   40.6486
dropoff_longitude                -73.9783
dropoff_latitude                  40.7502
store_and_fwd_flag                      N
trip_duration                     3526282
Name: 978383, dtype: object

In [14]:
# will need to cross verify pickup-dropoff time with trip duration due to severe outliers

In [15]:
from dateutil import parser

In [16]:
# pickup & dropoff time for index 0
x = '2016-03-14 17:24:55'
y = '2016-03-14 17:32:30'

In [17]:
new_x = parser.parse(x)
new_y = parser.parse(y)

In [18]:
diff = new_y - new_x

# Verify the claim instead of leaving it to a comment: the elapsed time in
# seconds must equal the trip_duration recorded for index 0.
assert diff.total_seconds() == txorig.loc[0, 'trip_duration']

In [19]:
def time_diff(row):
    """Return the elapsed seconds between a row's pickup and dropoff.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'pickup_datetime' and 'dropoff_datetime', either as
        date-time strings or as already-parsed datetime objects.

    Returns
    -------
    float
        Dropoff minus pickup, in seconds (negative if dropoff is earlier).
    """
    # pd.Timestamp accepts strings as well as datetime objects, so this keeps
    # working if the columns are converted with pd.to_datetime beforehand;
    # dateutil's parser.parse only accepts strings.
    pickup = pd.Timestamp(row['pickup_datetime'])
    dropoff = pd.Timestamp(row['dropoff_datetime'])
    return (dropoff - pickup).total_seconds()

In [20]:
# create an editing copy

txedit = txorig.copy()

In [21]:
# this shows that all durations == dropoff time - pickup time
'''
txedit['new_duration'] = txedit.apply(time_diff,axis=1)

txedit['duration_match'] = txedit['trip_duration'] - txedit['new_duration']

txedit['duration_match'].value_counts()
'''

"\ntxedit['new_duration'] = txedit.apply(time_diff,axis=1)\n\ntxedit['duration_match'] = txedit['trip_duration'] - txedit['new_duration']\n\ntxedit['duration_match'].value_counts()\n"

In [22]:
'''
Cleaning up the dataset, including:
(Done) drop unnecessary columns
(Done) drop passenger counts for 0,7,8,9
'''

'\nCleaning up the dataset, including:\n(Done) drop unnecessary columns\n(Done) drop passenger counts for 0,7,8,9\n'

In [23]:
'''
Add to current dataset, including:
dummy variables for day of the week
dummy variable for hour / time of day; is ML able to identify clusters of times (i.e. 1am to 3am) that affect duration?
(Done) lat long distance calculations
'''

'\nAdd to current dataset, including:\ndummy variables for day of the week\ndummy variable for hour / time of day; is ML able to identify clusters of times (i.e. 1am to 3am) that affect duration?\n(Done) lat long distance calculations\n'

In [24]:
txedit.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [25]:
# drop 'store_and_fwd_flag' column

# Non-in-place drop with errors='ignore' so the cell can be re-run safely;
# an in-place drop raises KeyError the second time it is executed.
txedit = txedit.drop(['store_and_fwd_flag'], axis=1, errors='ignore')

txedit.sample(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
116766,id3930302,1,2016-02-11 08:28:02,2016-02-11 08:37:01,3,-73.978638,40.752323,-73.963264,40.757462,539
474655,id1885264,1,2016-03-05 01:47:23,2016-03-05 02:09:46,1,-74.001617,40.730976,-73.895248,40.744579,1343
768775,id2573917,1,2016-04-05 17:00:37,2016-04-05 17:14:53,1,-73.974068,40.758324,-73.960365,40.778595,856
703403,id2830550,2,2016-01-29 01:49:16,2016-01-29 01:58:41,1,-73.990921,40.761028,-73.966347,40.754761,565
926811,id1276402,2,2016-01-07 17:06:40,2016-01-07 17:19:40,1,-73.969971,40.763016,-73.948776,40.773678,780


In [26]:
# drop 'vendor_id' column

# Non-in-place drop with errors='ignore' so the cell can be re-run safely;
# an in-place drop raises KeyError the second time it is executed.
txedit = txedit.drop(['vendor_id'], axis=1, errors='ignore')

txedit.sample(5)

Unnamed: 0,id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
880412,id3892154,2016-06-25 07:29:13,2016-06-25 07:35:22,2,-73.977776,40.746311,-73.958626,40.773155,369
86466,id3819079,2016-06-22 23:13:54,2016-06-22 23:38:47,1,-73.999199,40.738449,-73.931351,40.854725,1493
1440023,id3904203,2016-04-09 17:40:26,2016-04-09 17:54:51,2,-73.986183,40.77845,-73.97364,40.757462,865
1319018,id3179466,2016-06-10 18:51:52,2016-06-10 19:09:06,2,-73.97567,40.752209,-73.994331,40.72448,1034
325937,id2464908,2016-06-16 07:22:04,2016-06-16 07:27:56,1,-73.990372,40.756821,-73.979591,40.75898,352


In [27]:
# drop 'id' column

# Non-in-place drop with errors='ignore' so the cell can be re-run safely;
# an in-place drop raises KeyError the second time it is executed.
txedit = txedit.drop(['id'], axis=1, errors='ignore')

txedit.sample(5)

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
508127,2016-06-01 23:33:17,2016-06-01 23:45:59,5,-73.975166,40.76144,-74.003937,40.747688,762
1405061,2016-05-21 13:31:08,2016-05-21 13:48:50,4,-73.969635,40.673744,-74.002251,40.658485,1062
67369,2016-04-17 19:50:16,2016-04-17 19:55:59,2,-73.960854,40.775871,-73.981461,40.778679,343
1067053,2016-04-26 17:09:44,2016-04-26 17:19:28,5,-73.988808,40.748531,-73.978668,40.752796,584
730914,2016-02-12 09:39:51,2016-02-12 09:47:29,1,-73.981285,40.733448,-73.992149,40.734543,458


In [28]:
# drop passenger counts for 0,7,8,9

# Keep only trips carrying 1-6 passengers. The explicit .copy() makes txedit
# an independent frame, so adding columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
txedit = txedit[(txedit['passenger_count'] > 0) & (txedit['passenger_count'] < 7)].copy()

In [29]:
txedit['passenger_count'].value_counts().sort_index()

1    1033540
2     210318
3      59896
4      28404
5      78088
6      48333
Name: passenger_count, dtype: int64

In [30]:
# lat long distance calculation with library geopy

import geopy.distance

In [31]:
# testing geopy 1

loc1 = (40.742905, -73.982307)
loc2 = (40.745155, -74.005501)

# geopy 2.0 removed `vincenty`; use `geodesic` when the installed geopy
# provides it and fall back to `vincenty` only on old releases.
ellipsoid_distance = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty
ellipsoid_distance(loc1, loc2).km

1.9748315883621774

In [32]:
# testing geopy 2

loc1 = (40.777134, -73.955231)
loc2 = (40.641472, -73.788750)

# geopy 2.0 removed `vincenty`; use `geodesic` when the installed geopy
# provides it and fall back to `vincenty` only on old releases.
ellipsoid_distance = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty
ellipsoid_distance(loc1, loc2).km

20.61240065004596

In [33]:
# calculate pickup to dropoff distance in km

def calc_dist(row):
    """Return the pickup-to-dropoff distance of one trip in kilometres.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude'.

    Returns
    -------
    float
        Distance between the two points as computed by geopy, in km.
    """
    # geopy expects (latitude, longitude) ordering.
    pickup = (row['pickup_latitude'], row['pickup_longitude'])
    dropoff = (row['dropoff_latitude'], row['dropoff_longitude'])

    # geopy 2.0 removed `vincenty`; use `geodesic` when the installed geopy
    # provides it and fall back to `vincenty` only on old releases.
    measure = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty
    return measure(pickup, dropoff).km

In [34]:
# Row-wise apply: calc_dist is called once per trip, so this cell takes a
# long time on the full dataset.
txedit['distance'] = txedit.apply(calc_dist,axis=1)

txedit.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,distance
0,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,455,1.502172
1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,663,1.80866
2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,2124,6.379687
3,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,429,1.483632
4,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,435,1.187038


In [35]:
txedit['distance'].describe()

count    1.458579e+06
mean     3.442291e+00
std      4.300269e+00
min      0.000000e+00
25%      1.232329e+00
50%      2.094129e+00
75%      3.874775e+00
max      1.240510e+03
Name: distance, dtype: float64

In [38]:
# save edit_1 copy to save 'distance' due to long processing time
'''
txedit.to_csv('edit_1.csv')
'''

In [None]:
# continue to part 1.5