# [New York City Taxi Fare Prediction](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)

## Import packages

In [1]:
import numpy as np 
import pandas as pd
from geopy.distance import geodesic, great_circle
import reverse_geocoder as rg
import holidays

## Import data

In [2]:
%%time
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

CPU times: user 1min 38s, sys: 1min 8s, total: 2min 47s
Wall time: 2min 57s


In [3]:
train.shape

(55423856, 8)

In [5]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [6]:
data = [train, test]
for df in data:
    print(df.shape)

(55423856, 8)
(9914, 7)


In [7]:
train.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [8]:
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,55423860.0,55423860.0,55423860.0,55423480.0,55423480.0,55423860.0
mean,11.34505,-72.50968,39.91979,-72.51121,39.92068,1.68538
std,20.71083,12.84888,9.642353,12.7822,9.633346,1.327664
min,-300.0,-3442.06,-3492.264,-3442.025,-3547.887,0.0
25%,6.0,-73.99207,40.73493,-73.9914,40.73403,1.0
50%,8.5,-73.9818,40.75265,-73.98015,40.75316,1.0
75%,12.5,-73.96708,40.76713,-73.96367,40.7681,2.0
max,93963.36,3457.626,3408.79,3457.622,3537.133,208.0


In [9]:
train.apply(lambda x: x.nunique())

key                  55423855
fare_amount              9969
pickup_datetime      26173536
pickup_longitude       349893
pickup_latitude        350945
dropoff_longitude      447504
dropoff_latitude       450505
passenger_count            15
dtype: int64

# Data preprocessing

## Checking null values

In [10]:
def print_null(df):
    """
    prints null value of the given data
    """
    print(df.isnull().sum()[df.isnull().sum() != 0])

print_null(train)
print("------------")
print_null(test)

dropoff_longitude    376
dropoff_latitude     376
dtype: int64
------------
Series([], dtype: int64)


In [11]:
train = train.dropna(how='any', axis=0)

In [12]:
print_null(train)
print("------------")
print_null(test)

Series([], dtype: int64)
------------
Series([], dtype: int64)


## Natural Constraint

No more than 6 passengers are allowed in 1 trip.

In [13]:
max(train['passenger_count'])

208

In [14]:
train[train['passenger_count'] > 6]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
929022,2009-07-30 11:54:00.000000193,3.30,2009-07-30 11:54:00 UTC,0.000000,0.000000,0.000000,0.000000,208
1007609,2014-06-24 15:13:00.00000040,104.00,2014-06-24 15:13:00 UTC,-74.015780,40.715420,-74.170280,40.708340,9
2154045,2010-12-16 11:21:00.000000209,3.30,2010-12-16 11:21:00 UTC,0.000000,0.000000,0.000000,0.000000,208
2198549,2010-12-15 14:20:00.00000010,3.30,2010-12-15 14:20:00 UTC,0.000000,0.000000,0.000000,0.000000,208
2910347,2010-12-16 06:44:00.00000039,4.50,2010-12-16 06:44:00 UTC,0.000000,0.000000,0.000000,0.000000,208
...,...,...,...,...,...,...,...,...
53368487,2009-04-14 12:53:00.00000038,2.90,2009-04-14 12:53:00 UTC,-73.937927,40.758262,-73.937908,40.758263,208
53781948,2015-02-18 21:51:45.0000007,98.55,2015-02-18 21:51:45 UTC,-73.846512,40.927399,-73.846512,40.927399,9
53979230,2009-05-11 12:34:00.00000075,3.10,2009-05-11 12:34:00 UTC,-73.937743,40.758267,-73.937717,40.758262,208
54581863,2014-10-12 23:43:38.0000001,15.83,2014-10-12 23:43:38 UTC,-73.936762,40.804035,-73.917953,40.770468,9


In [15]:
train = train[train['passenger_count'] <= 6]
train.shape

(55423364, 8)

Fares are expected to be larger than 0

In [16]:
train[train.fare_amount < 0]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2039,2010-03-09 23:37:10.0000005,-2.9,2010-03-09 23:37:10 UTC,-73.789450,40.643498,-73.788665,40.641952,1
2486,2015-03-22 05:14:27.0000001,-2.5,2015-03-22 05:14:27 UTC,-74.000031,40.720631,-73.999809,40.720539,1
13032,2013-08-30 08:57:10.0000002,-3.0,2013-08-30 08:57:10 UTC,-73.995062,40.740755,-73.995885,40.741357,4
28839,2013-08-11 13:39:10.0000001,-2.5,2013-08-11 13:39:10 UTC,-73.785260,40.648442,0.000000,0.000000,1
36722,2015-04-30 15:19:45.0000003,-2.5,2015-04-30 15:19:45 UTC,-73.952187,40.790112,-73.950043,40.792839,1
...,...,...,...,...,...,...,...,...
55319537,2010-03-31 10:13:10.0000002,-12.1,2010-03-31 10:13:10 UTC,-73.989360,40.772995,-73.947843,40.772062,1
55326598,2010-03-27 00:41:10.0000004,-19.3,2010-03-27 00:41:10 UTC,-73.984128,40.739660,-73.904713,40.772890,1
55337328,2015-02-07 10:48:18.0000006,-2.5,2015-02-07 10:48:18 UTC,-73.961197,40.711250,-73.961517,40.711300,6
55360368,2015-06-21 00:32:32.0000004,-2.5,2015-06-21 00:32:32 UTC,-73.999550,40.738613,-73.999550,40.738613,1


In [17]:
train = train[train.fare_amount >= 0]
train.shape

(55420910, 8)

The max and min for latitude and longitude ranges from -90 to 90 and -180 to 180.

In [18]:
train[(train.pickup_latitude > 90) | (train.pickup_latitude < -90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
5686,2011-07-30 11:15:00.00000082,3.3,2011-07-30 11:15:00 UTC,-73.947235,401.083332,-73.951392,40.778927,1
150559,2012-08-03 07:43:00.000000176,25.3,2012-08-03 07:43:00 UTC,0.000000,-3116.285383,-73.953600,40.787998,1
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.601160,1703.092772,-1251.195890,-1189.615440,1
272439,2011-04-23 02:55:00.00000012,9.3,2011-04-23 02:55:00 UTC,-74.002497,405.350000,-73.978600,40.739962,1
436233,2012-03-11 01:56:00.000000100,4.1,2012-03-11 01:56:00 UTC,-2986.242495,-880.627428,-3383.296608,-2559.748913,1
...,...,...,...,...,...,...,...,...
55114105,2011-07-02 17:54:00.000000115,13.3,2011-07-02 17:54:00 UTC,-1305.734592,-778.582130,1196.891612,3359.841550,1
55186918,2011-08-10 10:05:00.00000018,7.7,2011-08-10 10:05:00 UTC,-1267.942923,2165.889693,-2593.367637,1598.247970,1
55191164,2011-06-03 18:40:00.00000046,4.9,2011-06-03 18:40:00 UTC,1.510003,-2541.548795,804.010087,3292.725070,1
55383247,2011-11-10 23:25:00.000000223,2.5,2011-11-10 23:25:00 UTC,-2959.462187,2371.279908,770.672687,-774.447875,1


In [19]:
train[(train.pickup_longitude > 180) | (train.pickup_longitude < -180)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
60442,2012-01-12 13:36:00.000000186,4.9,2012-01-12 13:36:00 UTC,-736.550000,40.738230,-73.988742,40.748847,1
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.601160,1703.092772,-1251.195890,-1189.615440,1
217355,2012-06-03 23:21:00.00000077,6.1,2012-06-03 23:21:00 UTC,-740.000000,40.747620,0.000000,0.000000,6
243342,2012-08-02 10:38:00.000000111,7.3,2012-08-02 10:38:00 UTC,-736.333333,40.766480,-73.987928,40.751742,3
351119,2012-02-03 07:53:00.000000135,4.1,2012-02-03 07:53:00 UTC,-736.483332,40.766512,-73.981992,40.771672,1
...,...,...,...,...,...,...,...,...
55186918,2011-08-10 10:05:00.00000018,7.7,2011-08-10 10:05:00 UTC,-1267.942923,2165.889693,-2593.367637,1598.247970,1
55236494,2012-07-12 14:57:00.000000147,14.1,2012-07-12 14:57:00 UTC,-734.633332,40.688300,-73.951830,40.723345,1
55289327,2012-06-04 18:41:00.000000216,6.1,2012-06-04 18:41:00 UTC,-736.383332,40.755492,-73.987450,40.739705,1
55369218,2012-07-20 22:38:00.000000124,4.9,2012-07-20 22:38:00 UTC,-234.639982,40.740250,-73.988170,40.750022,2


In [20]:
train[(train.dropoff_latitude > 90) | (train.dropoff_latitude < -90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
92310,2011-09-27 11:54:00.000000127,28.9,2011-09-27 11:54:00 UTC,-74.014595,40.681880,-73.973310,404.616667,1
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.601160,1703.092772,-1251.195890,-1189.615440,1
181973,2012-01-03 09:04:00.000000130,6.5,2012-01-03 09:04:00 UTC,-74.008918,40.717827,-74.000855,404.133332,1
335675,2012-02-26 00:25:00.00000085,19.3,2012-02-26 00:25:00 UTC,-74.006457,40.743865,-73.981283,404.133332,1
436233,2012-03-11 01:56:00.000000100,4.1,2012-03-11 01:56:00 UTC,-2986.242495,-880.627428,-3383.296608,-2559.748913,1
...,...,...,...,...,...,...,...,...
55186918,2011-08-10 10:05:00.00000018,7.7,2011-08-10 10:05:00 UTC,-1267.942923,2165.889693,-2593.367637,1598.247970,1
55191164,2011-06-03 18:40:00.00000046,4.9,2011-06-03 18:40:00 UTC,1.510003,-2541.548795,804.010087,3292.725070,1
55256617,2012-06-15 20:20:00.00000088,4.5,2012-06-15 20:20:00 UTC,-74.005990,40.712027,-0.198333,474.007988,1
55306277,2012-03-24 12:38:00.0000009,28.9,2012-03-24 12:38:00 UTC,-73.997035,40.737092,-73.885225,405.016667,1


In [21]:
train[(train.dropoff_longitude > 180) | (train.dropoff_longitude < -180)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.601160,1703.092772,-1251.195890,-1189.615440,1
269695,2012-05-24 09:00:00.000000101,17.7,2012-05-24 09:00:00 UTC,-73.997572,40.720945,-1329.621332,40.773717,1
282374,2011-04-25 13:09:00.000000123,5.7,2011-04-25 13:09:00 UTC,-73.987105,40.755732,-732.600000,40.744832,1
436233,2012-03-11 01:56:00.000000100,4.1,2012-03-11 01:56:00 UTC,-2986.242495,-880.627428,-3383.296608,-2559.748913,1
449263,2012-01-28 16:51:00.00000088,6.9,2012-01-28 16:51:00 UTC,-73.983142,40.741987,-736.500000,40.755255,1
...,...,...,...,...,...,...,...,...
55191164,2011-06-03 18:40:00.00000046,4.9,2011-06-03 18:40:00 UTC,1.510003,-2541.548795,804.010087,3292.725070,1
55194997,2012-06-17 13:02:00.00000084,4.5,2012-06-17 13:02:00 UTC,-73.991467,40.749787,-740.016667,40.749142,1
55364552,2011-11-18 08:21:00.000000232,8.5,2011-11-18 08:21:00 UTC,-73.980002,40.770077,-731.516667,40.756565,2
55383247,2011-11-10 23:25:00.000000223,2.5,2011-11-10 23:25:00 UTC,-2959.462187,2371.279908,770.672687,-774.447875,1


In [22]:
train = train[(train.pickup_latitude < 90) & (train.pickup_latitude > -90)]
train = train[(train.pickup_longitude < 180) & (train.pickup_longitude > -180)]
train = train[(train.dropoff_latitude < 90) & (train.dropoff_latitude > -90)]
train = train[(train.dropoff_longitude < 180) & (train.dropoff_longitude > -180)]
train.shape

(55418166, 8)

# Feature engineering

## Manhattan distance

In [23]:
%%time
def m_dist_fe(df):
    df['abs_long_diff'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_lat_diff'] = (df.dropoff_latitude - df.pickup_latitude).abs()
    df['manhattan_dist'] = df['abs_long_diff'] + df['abs_lat_diff']
    return df[['abs_long_diff', 'abs_lat_diff', 'manhattan_dist']]

m_dist_fe(train)
m_dist_fe(test)

CPU times: user 12 s, sys: 38.3 s, total: 50.3 s
Wall time: 28.3 s


Unnamed: 0,abs_long_diff,abs_lat_diff,manhattan_dist
0,0.008110,0.019970,0.028080
1,0.012024,0.019817,0.031841
2,0.002870,0.005121,0.007991
3,0.009288,0.016172,0.025460
4,0.022519,0.045348,0.067867
...,...,...,...
9909,0.012482,0.016609,0.029091
9910,0.014702,0.027229,0.041931
9911,0.201859,0.079597,0.281456
9912,0.046394,0.066299,0.112694


## Euclidean distance

In [24]:
%%time
def e_dist_fe(df):
    df['sqrt_long_diff'] = (df.dropoff_longitude - df.pickup_longitude).pow(2)
    df['sqrt_lat_diff'] = (df.dropoff_latitude - df.pickup_latitude).pow(2)
    df['euclidean_dist'] = (df['sqrt_long_diff'] + df['sqrt_lat_diff']).pow(0.5)
    return df[['sqrt_long_diff', 'sqrt_lat_diff', 'euclidean_dist']]

e_dist_fe(train)
e_dist_fe(test)

CPU times: user 22.8 s, sys: 39.8 s, total: 1min 2s
Wall time: 31.4 s


Unnamed: 0,sqrt_long_diff,sqrt_lat_diff,euclidean_dist
0,0.000066,0.000399,0.021554
1,0.000145,0.000393,0.023180
2,0.000008,0.000026,0.005870
3,0.000086,0.000262,0.018649
4,0.000507,0.002056,0.050631
...,...,...,...
9909,0.000156,0.000276,0.020776
9910,0.000216,0.000741,0.030945
9911,0.040747,0.006336,0.216985
9912,0.002152,0.004396,0.080920


## Geopy distance

In [26]:
%%time
def geopy_dist_fe(df):
    df['geodesic_km_dist'] = df.apply(lambda x: geodesic((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
    df['great_circle_km_dist'] = df.apply(lambda x: great_circle((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
    return df[['geodesic_km_dist', 'great_circle_km_dist']]

geopy_dist_fe(train)
geopy_dist_fe(test)

CPU times: user 5h 22min 43s, sys: 40min 56s, total: 6h 3min 39s
Wall time: 7h 9min 50s


Unnamed: 0,geodesic_km_dist,great_circle_km_dist
0,2.320991,2.323263
1,2.423802,2.425356
2,0.618182,0.618629
3,1.959671,1.961035
4,5.382833,5.387309
...,...,...
9909,2.124110,2.124877
9910,3.268511,3.270974
9911,19.217032,19.183968
9912,8.339644,8.343498


## [Haversine distance](https://www.kaggle.com/madhurisivalenka/cleansing-eda-modelling-lgbm-xgboost-starters)

In [27]:
def h_dist_fe(lat1, long1, lat2, long2):
    data = [train, test]
    for i in data:
        R = 6371  #radius of earth in kilometers
        #R = 3959 #radius of earth in miles
        phi1 = np.radians(i[lat1])
        phi2 = np.radians(i[lat2])
    
        delta_phi = np.radians(i[lat2]-i[lat1])
        delta_lambda = np.radians(i[long2]-i[long1])
    
        #a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    
        #c = 2 * atan2( √a, √(1−a) )
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
        #d = R*c
        d = (R * c) #in kilometers
        i['haversine_dist'] = d
    return d

In [28]:
%%time
h_dist_fe('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

CPU times: user 16.1 s, sys: 16.9 s, total: 33 s
Wall time: 12.8 s


0        2.323260
1        2.425353
2        0.618628
3        1.961033
4        5.387301
          ...    
9909     2.124874
9910     3.270969
9911    19.183941
9912     8.343486
9913     1.180825
Length: 9914, dtype: float64

## [Center point](https://www.kdnuggets.com/2018/12/feature-building-techniques-tricks-kaggle.html)

In [29]:
%%time
def center_fe(df):
    df['center_lat'] = (df['pickup_latitude'].values + df['dropoff_latitude'].values) / 2
    df['center_long'] = (df['pickup_longitude'].values + df['dropoff_longitude'].values) / 2
    return df[['center_lat', 'center_long']]

center_fe(train)
center_fe(test)

CPU times: user 14.7 s, sys: 48.1 s, total: 1min 2s
Wall time: 1min 16s


Unnamed: 0,center_lat,center_long
0,40.753820,-73.977375
1,40.729292,-73.992874
2,40.748700,-73.981089
3,40.759721,-73.985804
4,40.767101,-73.977305
...,...,...
9909,40.788692,-73.961884
9910,40.789986,-73.952862
9911,40.686810,-73.890671
9912,40.768581,-73.962376


## Datetime

## Convert column types

In [30]:
%%time
data = [train, test]
for df in data:
    df["key"] = pd.to_datetime(df["key"])
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

In [31]:
%%time
def dt_fe(df):
    df['year'] = df['pickup_datetime'].dt.year
    df['month'] = df['pickup_datetime'].dt.month
    df['week'] = df['pickup_datetime'].dt.week
    df['day'] = df['pickup_datetime'].dt.day
    df['hour'] = df['pickup_datetime'].dt.hour
    df['minute'] = df['pickup_datetime'].dt.minute
    df['day_of_week'] = df['pickup_datetime'].dt.dayofweek
    df['day_of_year'] = df['pickup_datetime'].dt.dayofyear
    df['quarter'] = df['pickup_datetime'].dt.quarter
    return df[['year', 'month', 'week', 'day', 'hour', 
               'minute', 'day_of_week', 'day_of_year', 'quarter']]

dt_fe(train)
dt_fe(test)

CPU times: user 39.5 s, sys: 43.1 s, total: 1min 22s
Wall time: 54.7 s


Unnamed: 0,year,month,week,day,hour,minute,day_of_week,day_of_year,quarter
0,2015,1,5,27,13,8,1,27,1
1,2015,1,5,27,13,8,1,27,1
2,2011,10,40,8,11,53,5,281,4
3,2012,12,48,1,21,12,5,336,4
4,2012,12,48,1,21,12,5,336,4
...,...,...,...,...,...,...,...,...,...
9909,2015,5,19,10,12,37,6,130,2
9910,2015,1,3,12,17,5,0,12,1
9911,2015,4,16,19,20,44,6,109,2
9912,2015,1,5,31,1,5,5,31,1


In [32]:
print(train.year.unique())

[2009 2010 2011 2012 2013 2014 2015]


In [33]:
%%time
def part_of_day(t):
    #early morning: 5-7
    if t > 4  and t <= 7:
        return 0
    #morning: 8-10
    elif t > 7   and t <= 10:
        return 1
    #midday/noon: 11-14
    elif t > 10   and t <= 14:
        return 2
    #afternoon: 15-17
    elif t > 14   and t <= 17:
        return 3
    #evening: 18-21
    elif t > 17   and t <= 21:
        return 4
    #night: 22-4
    else:
        return 5  

def part_of_day_fe(df):
    df['part_of_day'] = df['pickup_datetime'].dt.hour.apply(lambda x: part_of_day(x))
    return df['part_of_day']

part_of_day_fe(train)
part_of_day_fe(test)

CPU times: user 31.3 s, sys: 2.3 s, total: 33.6 s
Wall time: 33.3 s


0       2
1       2
2       2
3       4
4       4
       ..
9909    2
9910    3
9911    4
9912    5
9913    2
Name: part_of_day, Length: 9914, dtype: int64

In [34]:
%%time
def is_fe(df):
    df['is_weekday'] = (df['pickup_datetime'].dt.dayofweek < 5).astype(int)
    df['is_weekend'] = (df['pickup_datetime'].dt.dayofweek >= 5).astype(int)
    df['is_month_start'] = df['pickup_datetime'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['pickup_datetime'].dt.is_month_end.astype(int)
    df['is_quarter_start'] = df['pickup_datetime'].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df['pickup_datetime'].dt.is_quarter_end.astype(int)
    df['is_year_start'] = df['pickup_datetime'].dt.is_year_start.astype(int)
    df['is_year_end'] = df['pickup_datetime'].dt.is_year_end.astype(int)
    df['is_leap_year'] = df['pickup_datetime'].dt.is_leap_year.astype(int)
    df['is_holiday'] = df['pickup_datetime'].apply(lambda x: 0 if holidays.US().get(x) is None else 1)
    return df[['is_weekday', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 
               'is_year_start', 'is_year_end', 'is_leap_year', 'is_holiday']] 

is_fe(train)
is_fe(test)

CPU times: user 2h 6min 59s, sys: 2min 25s, total: 2h 9min 25s
Wall time: 2h 10min 11s


Unnamed: 0,is_weekday,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,is_leap_year,is_holiday
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,1,1,0,0,0,0,0,1,0
4,0,1,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
9909,0,1,0,0,0,0,0,0,0,0
9910,1,0,0,0,0,0,0,0,0,0
9911,0,1,0,0,0,0,0,0,0,0
9912,0,1,0,1,0,0,0,0,0,0


# reverse_geocoder

In [36]:
def rg_fe(df):
    df_pickup = df[['pickup_latitude', 'pickup_longitude']]
    df_dropoff = df[['dropoff_latitude', 'dropoff_longitude']]

    pickup_results = rg.search([tuple(x) for x in df_pickup.values])
    dropoff_results = rg.search([tuple(x) for x in df_dropoff.values])

    pickup_place = [pickup_results[i]['name'] for i in range(len(pickup_results))]
    pickup_admin1 = [pickup_results[i]['admin1'] for i in range(len(pickup_results))]
    pickup_admin2 = [pickup_results[i]['admin2'] for i in range(len(pickup_results))]
    dropoff_place = [dropoff_results[i]['name'] for i in range(len(dropoff_results))]
    dropoff_admin1 = [dropoff_results[i]['admin1'] for i in range(len(pickup_results))]
    dropoff_admin2 = [dropoff_results[i]['admin2'] for i in range(len(pickup_results))]

    df['pickup_place'] = pd.Series(pickup_place).values
    df['pickup_admin1'] = pd.Series(pickup_admin1).values
    df['pickup_admin2'] = pd.Series(pickup_admin2).values
    df['dropoff_place'] = pd.Series(dropoff_place).values
    df['dropoff_admin1'] = pd.Series(dropoff_admin1).values
    df['dropoff_admin2'] = pd.Series(dropoff_admin2).values
    
    df.loc[df['pickup_admin1'].str.len() == 0, 'pickup_admin1'] = 'None'
    df.loc[df['pickup_admin2'].str.len() == 0, 'pickup_admin2'] = 'None'
    df.loc[df['pickup_place'].str.len() == 0, 'pickup_place'] = 'None'
    df.loc[df['dropoff_admin1'].str.len() == 0, 'dropoff_admin1'] = 'None'
    df.loc[df['dropoff_admin2'].str.len() == 0, 'dropoff_admin2'] = 'None'
    df.loc[df['dropoff_place'].str.len() == 0, 'dropoff_place'] = 'None'
    
    return df[['pickup_place', 'pickup_admin1', 'pickup_admin2', 
               'dropoff_place', 'dropoff_admin1', 'dropoff_admin2']]

In [37]:
%%time
rg_fe(train)
rg_fe(test)

Loading formatted geocoded file...
CPU times: user 6min 19s, sys: 1min 20s, total: 7min 40s
Wall time: 8min 30s


Unnamed: 0,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,New York City,New York,,New York City,New York,
2,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,Manhattan,New York,New York County,New York City,New York,
...,...,...,...,...,...,...
9909,Manhattan,New York,New York County,Manhattan,New York,New York County
9910,Manhattan,New York,New York County,Manhattan,New York,New York County
9911,New York City,New York,,Jamaica,New York,Queens County
9912,New York City,New York,,Manhattan,New York,New York County


In [38]:
print(train.shape)
print(train.pickup_place.unique())
print(train.dropoff_place.unique())

(55418166, 45)
['Borough of Queens' 'New York City' 'Manhattan' ... 'Woodbury Heights'
 'Beduido' 'Shippensburg']
['Borough of Queens' 'Manhattan' 'Weehawken' ... 'Kure Beach'
 'National Park' 'Hell-Ville']


In [39]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2009-06-15 17:26:21.000000100,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,0.002701,0.009041,...,0,0,0,0,Borough of Queens,New York,Queens County,Borough of Queens,New York,Queens County
1,2010-01-05 16:52:16.000000200,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,0.03678,0.070701,...,0,0,0,0,New York City,New York,,Manhattan,New York,New York County
2,2011-08-18 00:35:00.000000490,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,0.008504,0.010708,...,0,0,0,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
3,2012-04-21 04:30:42.000000100,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,0.004437,0.024949,...,0,0,1,0,New York City,New York,,Weehawken,New Jersey,Hudson County
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,0.01144,0.015754,...,0,0,0,0,Manhattan,New York,New York County,Manhattan,New York,New York County


In [40]:
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2015-01-27 13:08:24.000000200,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,0.00811,0.01997,0.02808,...,0,0,0,0,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,2015-01-27 13:08:24.000000300,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,0.012024,0.019817,0.031841,...,0,0,0,0,New York City,New York,,New York City,New York,
2,2011-10-08 11:53:44.000000200,2011-10-08 11:53:44+00:00,-73.982524,40.75126,-73.979654,40.746139,1,0.00287,0.005121,0.007991,...,0,0,0,0,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,2012-12-01 21:12:12.000000200,2012-12-01 21:12:12+00:00,-73.98116,40.767807,-73.990448,40.751635,1,0.009288,0.016172,0.02546,...,0,0,1,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,2012-12-01 21:12:12.000000300,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,0.022519,0.045348,0.067867,...,0,0,1,0,Manhattan,New York,New York County,New York City,New York,


In [41]:
data = [train, test]
for df in data:
    print(df.shape)

(55418166, 45)
(9914, 44)


# Output Data
output data as csv

In [42]:
train.to_csv("data/train5.csv", index=False)
test.to_csv("data/test5.csv", index=False)