# [New York City Taxi Fare Prediction](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)

## Import packages

In [1]:
import numpy as np 
import pandas as pd
from geopy.distance import geodesic, great_circle
import reverse_geocoder as rg
import holidays

## Import data

In [2]:
%%time
train2 = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

CPU times: user 1min 44s, sys: 1min 46s, total: 3min 30s
Wall time: 4min 55s


In [3]:
train2.shape

(55423856, 8)

In [4]:
train = train2.iloc[1000000:21000000]

In [5]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1000000,2010-03-06 21:13:04.0000003,10.1,2010-03-06 21:13:04 UTC,-73.993113,40.755552,-73.969351,40.797908,1
1000001,2010-03-10 16:19:00.000000128,3.7,2010-03-10 16:19:00 UTC,-73.948102,40.770608,-73.952923,40.768025,1
1000002,2014-02-27 08:50:19.0000006,11.0,2014-02-27 08:50:19 UTC,-73.977236,40.743045,-73.997854,40.719837,1
1000003,2011-02-11 17:46:17.0000005,7.3,2011-02-11 17:46:17 UTC,-73.95406,39.603285,-73.977015,40.776712,3
1000004,2011-07-25 12:48:00.00000074,6.9,2011-07-25 12:48:00 UTC,-73.978575,40.753067,-73.964632,40.764258,1


In [6]:
data = [train, test]
for df in data:
    print(df.shape)

(20000000, 8)
(9914, 7)


In [7]:
train.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [8]:
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,20000000.0,20000000.0,20000000.0,19999860.0,19999860.0,20000000.0
mean,11.3424,-72.51049,39.92014,-72.50928,39.91876,1.6853
std,16.89884,13.1048,9.605009,13.02184,9.734502,1.322029
min,-107.75,-3439.245,-3492.264,-3442.025,-3547.887,0.0
25%,6.0,-73.99207,40.73491,-73.9914,40.73403,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75315,1.0
75%,12.5,-73.96709,40.76712,-73.96368,40.76808,2.0
max,61550.86,3457.626,3406.008,3457.622,3400.392,208.0


In [9]:
train.apply(lambda x: x.nunique())

key                  20000000
fare_amount              6853
pickup_datetime      11369179
pickup_longitude       257887
pickup_latitude        279653
dropoff_longitude      335860
dropoff_latitude       368703
passenger_count            15
dtype: int64

# Data preprocessing

## Checking null values

In [10]:
def print_null(df):
    """
    prints null value of the given data
    """
    print(df.isnull().sum()[df.isnull().sum() != 0])

print_null(train)
print("------------")
print_null(test)

dropoff_longitude    136
dropoff_latitude     136
dtype: int64
------------
Series([], dtype: int64)


In [11]:
train = train.dropna(how='any', axis=0)

In [12]:
print_null(train)
print("------------")
print_null(test)

Series([], dtype: int64)
------------
Series([], dtype: int64)


## Natural Constraint

No more than 6 passengers are allowed in 1 trip.

In [13]:
max(train['passenger_count'])

208

In [14]:
train[train['passenger_count'] > 6]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1007609,2014-06-24 15:13:00.00000040,104.0,2014-06-24 15:13:00 UTC,-74.01578,40.71542,-74.17028,40.70834,9
2154045,2010-12-16 11:21:00.000000209,3.3,2010-12-16 11:21:00 UTC,0.0,0.0,0.0,0.0,208
2198549,2010-12-15 14:20:00.00000010,3.3,2010-12-15 14:20:00 UTC,0.0,0.0,0.0,0.0,208
2910347,2010-12-16 06:44:00.00000039,4.5,2010-12-16 06:44:00 UTC,0.0,0.0,0.0,0.0,208
3107489,2009-05-12 14:50:00.000000175,2.7,2009-05-12 14:50:00 UTC,-73.937818,40.75826,-73.937827,40.75825,208
3323791,2011-08-27 01:24:00.000000168,8.5,2011-08-27 01:24:00 UTC,-73.987858,40.76034,-74.000682,40.752413,129
4095440,2015-06-14 08:56:16.0000001,37.04,2015-06-14 08:56:16 UTC,-73.982094,40.756252,-73.872482,40.774506,9
4103745,2010-12-22 12:11:00.000000230,23.7,2010-12-22 12:11:00 UTC,0.0,0.0,0.0,0.0,208
4432483,2009-05-11 13:56:00.00000088,11.1,2009-05-11 13:56:00 UTC,-73.937733,40.758267,-73.937737,40.758273,208
4467314,2015-01-01 21:32:16.0000007,8.5,2015-01-01 21:32:16 UTC,-74.005867,40.740643,-73.988045,40.7351,7


In [15]:
train = train[train['passenger_count'] <= 6]
train.shape

(19999830, 8)

Fares are expected to be larger than 0

In [16]:
train[train.fare_amount < 0]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1032448,2015-03-28 21:59:19.0000007,-6.0,2015-03-28 21:59:19 UTC,-73.987518,40.735992,-73.992729,40.729916,4
1042337,2010-03-24 14:42:10.0000003,-5.3,2010-03-24 14:42:10 UTC,-73.984802,40.764207,-73.986053,40.762030,5
1054606,2010-02-10 10:32:10.0000002,-45.0,2010-02-10 10:32:10 UTC,-73.980128,40.745807,-73.980535,40.745232,1
1071092,2010-02-20 20:51:10.0000002,-2.9,2010-02-20 20:51:10 UTC,-73.974512,40.680023,-73.974535,40.680132,2
1083722,2015-06-19 12:13:11.0000004,-4.5,2015-06-19 12:13:11 UTC,-73.955826,40.772327,-73.961792,40.770283,1
...,...,...,...,...,...,...,...,...
20933978,2015-02-21 04:43:14.0000001,-3.5,2015-02-21 04:43:14 UTC,-73.986938,40.723648,-73.991432,40.727066,1
20943964,2010-02-09 19:08:10.0000002,-2.5,2010-02-09 19:08:10 UTC,-73.793600,40.657057,-73.793600,40.657055,1
20972031,2015-04-06 15:26:45.0000004,-3.5,2015-04-06 15:26:45 UTC,-73.989113,40.721470,-73.986061,40.726471,1
20990244,2015-02-24 07:56:55.0000004,-2.5,2015-02-24 07:56:55 UTC,-73.918495,40.743359,0.000000,0.000000,1


In [17]:
train = train[train.fare_amount >= 0]
train.shape

(19998983, 8)

The max and min for latitude and longitude ranges from -90 to 90 and -180 to 180.

In [18]:
train[(train.pickup_latitude > 90) | (train.pickup_latitude < -90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1019868,2011-05-19 00:34:00.000000159,7.7,2011-05-19 00:34:00 UTC,351.052520,1669.582038,1717.003405,1989.728077,1
1197804,2012-04-18 12:12:00.000000148,7.3,2012-04-18 12:12:00 UTC,-73.962313,404.233332,-73.969750,40.763007,1
1237988,2011-12-30 09:10:00.00000087,49.8,2011-12-30 09:10:00 UTC,-3007.205450,284.006283,-844.140168,2950.087992,1
1258028,2012-01-16 12:53:00.000000205,6.5,2012-01-16 12:53:00 UTC,1721.172660,1283.652605,2469.149042,2193.823087,1
1273485,2011-07-01 01:55:00.00000070,12.5,2011-07-01 01:55:00 UTC,2417.411523,825.064365,2537.705273,3305.291182,1
...,...,...,...,...,...,...,...,...
20748719,2012-06-10 14:11:00.000000139,16.9,2012-06-10 14:11:00 UTC,-74.015395,402.816667,-73.964407,40.757272,1
20840323,2012-01-26 20:24:00.000000164,11.3,2012-01-26 20:24:00 UTC,-1748.898883,-2173.827827,-1748.898883,-2173.827827,1
20942426,2012-07-21 17:33:00.0000005,22.9,2012-07-21 17:33:00 UTC,-73.776737,404.350000,-73.788507,40.712267,6
20988556,2011-12-28 02:44:00.00000010,6.9,2011-12-28 02:44:00 UTC,-2055.062392,-1772.561112,2873.225260,2544.755355,1


In [19]:
train[(train.pickup_longitude > 180) | (train.pickup_longitude < -180)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1019868,2011-05-19 00:34:00.000000159,7.70,2011-05-19 00:34:00 UTC,351.052520,1669.582038,1717.003405,1989.728077,1
1043169,2013-03-15 11:11:00.00000019,36.83,2013-03-15 11:11:00 UTC,-736.333333,40.755922,-73.871502,40.774202,1
1055960,2011-11-05 23:26:00.000000309,33.70,2011-11-05 23:26:00 UTC,-735.200000,40.770092,-73.980187,40.765530,1
1111257,2012-07-29 15:40:00.00000083,14.90,2012-07-29 15:40:00 UTC,-736.266667,40.777002,0.000000,0.000000,2
1237988,2011-12-30 09:10:00.00000087,49.80,2011-12-30 09:10:00 UTC,-3007.205450,284.006283,-844.140168,2950.087992,1
...,...,...,...,...,...,...,...,...
20952365,2012-01-22 10:37:00.00000013,5.30,2012-01-22 10:37:00 UTC,-736.416665,40.770262,-73.955862,40.764515,5
20958675,2012-07-04 19:42:00.00000049,3.70,2012-07-04 19:42:00 UTC,-773.975650,0.050000,-73.979585,40.776350,2
20988556,2011-12-28 02:44:00.00000010,6.90,2011-12-28 02:44:00 UTC,-2055.062392,-1772.561112,2873.225260,2544.755355,1
20991946,2011-12-08 12:42:00.00000072,10.50,2011-12-08 12:42:00 UTC,2553.034222,332.964298,3060.559903,1580.062852,1


In [20]:
train[(train.dropoff_latitude > 90) | (train.dropoff_latitude < -90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1019868,2011-05-19 00:34:00.000000159,7.7,2011-05-19 00:34:00 UTC,351.052520,1669.582038,1717.003405,1989.728077,1
1041515,2012-06-11 14:36:00.0000009,14.9,2012-06-11 14:36:00 UTC,-73.949868,40.822197,0.000000,1903.111567,1
1227322,2011-09-30 20:20:00.000000205,7.3,2011-09-30 20:20:00 UTC,-74.004168,40.707692,-74.004062,433.483332,1
1237988,2011-12-30 09:10:00.00000087,49.8,2011-12-30 09:10:00 UTC,-3007.205450,284.006283,-844.140168,2950.087992,1
1254291,2012-04-18 11:08:00.00000021,6.1,2012-04-18 11:08:00 UTC,-74.000640,40.728432,0.000000,-2515.672033,1
...,...,...,...,...,...,...,...,...
20695967,2012-07-29 10:18:00.00000087,4.1,2012-07-29 10:18:00 UTC,-73.983760,40.721617,-73.979810,404.550000,1
20798274,2012-11-19 17:06:00.000000120,9.0,2012-11-19 17:06:00 UTC,-73.979678,40.784485,0.000000,1646.418480,1
20840323,2012-01-26 20:24:00.000000164,11.3,2012-01-26 20:24:00 UTC,-1748.898883,-2173.827827,-1748.898883,-2173.827827,1
20988556,2011-12-28 02:44:00.00000010,6.9,2011-12-28 02:44:00 UTC,-2055.062392,-1772.561112,2873.225260,2544.755355,1


In [21]:
train[(train.dropoff_longitude > 180) | (train.dropoff_longitude < -180)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
1019868,2011-05-19 00:34:00.000000159,7.7,2011-05-19 00:34:00 UTC,351.052520,1669.582038,1717.003405,1989.728077,1
1177891,2012-01-14 08:35:00.00000046,17.7,2012-01-14 08:35:00 UTC,-73.946155,40.788698,-1183.275658,40.773920,5
1237988,2011-12-30 09:10:00.00000087,49.8,2011-12-30 09:10:00 UTC,-3007.205450,284.006283,-844.140168,2950.087992,1
1258028,2012-01-16 12:53:00.000000205,6.5,2012-01-16 12:53:00 UTC,1721.172660,1283.652605,2469.149042,2193.823087,1
1268218,2012-07-30 15:54:00.00000049,8.1,2012-07-30 15:54:00 UTC,-73.964177,40.807482,-773.941040,40.805880,6
...,...,...,...,...,...,...,...,...
20870232,2012-03-14 21:07:00.00000060,28.9,2012-03-14 21:07:00 UTC,-73.862885,40.768800,-740.100000,40.708382,1
20889457,2012-06-10 00:21:00.00000094,10.1,2012-06-10 00:21:00 UTC,-73.964355,40.760732,-736.583333,40.730117,5
20986820,2012-04-05 08:45:00.00000093,10.1,2012-04-05 08:45:00 UTC,-73.960595,40.757190,-738.816667,40.740217,1
20988556,2011-12-28 02:44:00.00000010,6.9,2011-12-28 02:44:00 UTC,-2055.062392,-1772.561112,2873.225260,2544.755355,1


In [22]:
train = train[(train.pickup_latitude < 90) & (train.pickup_latitude > -90)]
train = train[(train.pickup_longitude < 180) & (train.pickup_longitude > -180)]
train = train[(train.dropoff_latitude < 90) & (train.dropoff_latitude > -90)]
train = train[(train.dropoff_longitude < 180) & (train.dropoff_longitude > -180)]
train.shape

(19997960, 8)

# Feature engineering

## Manhattan distance

In [23]:
%%time
def m_dist_fe(df):
    df['abs_long_diff'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_lat_diff'] = (df.dropoff_latitude - df.pickup_latitude).abs()
    df['manhattan_dist'] = df['abs_long_diff'] + df['abs_lat_diff']
    return df[['abs_long_diff', 'abs_lat_diff', 'manhattan_dist']]

m_dist_fe(train)
m_dist_fe(test)

CPU times: user 3.17 s, sys: 4.25 s, total: 7.43 s
Wall time: 4.33 s


Unnamed: 0,abs_long_diff,abs_lat_diff,manhattan_dist
0,0.008110,0.019970,0.028080
1,0.012024,0.019817,0.031841
2,0.002870,0.005121,0.007991
3,0.009288,0.016172,0.025460
4,0.022519,0.045348,0.067867
...,...,...,...
9909,0.012482,0.016609,0.029091
9910,0.014702,0.027229,0.041931
9911,0.201859,0.079597,0.281456
9912,0.046394,0.066299,0.112694


## Euclidean distance

In [24]:
def e_dist_fe(df):
    df['sqrt_long_diff'] = (df.dropoff_longitude - df.pickup_longitude).pow(2)
    df['sqrt_lat_diff'] = (df.dropoff_latitude - df.pickup_latitude).pow(2)
    df['euclidean_dist'] = (df['sqrt_long_diff'] + df['sqrt_lat_diff']).pow(0.5)
    return df[['sqrt_long_diff', 'sqrt_lat_diff', 'euclidean_dist']]

e_dist_fe(train)
e_dist_fe(test)

Unnamed: 0,sqrt_long_diff,sqrt_lat_diff,euclidean_dist
0,0.000066,0.000399,0.021554
1,0.000145,0.000393,0.023180
2,0.000008,0.000026,0.005870
3,0.000086,0.000262,0.018649
4,0.000507,0.002056,0.050631
...,...,...,...
9909,0.000156,0.000276,0.020776
9910,0.000216,0.000741,0.030945
9911,0.040747,0.006336,0.216985
9912,0.002152,0.004396,0.080920


## Geopy distance

In [25]:
%%time
def geopy_dist_fe(df):
    df['geodesic_km_dist'] = df.apply(lambda x: geodesic((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
    df['great_circle_km_dist'] = df.apply(lambda x: great_circle((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
    return df[['geodesic_km_dist', 'great_circle_km_dist']]

geopy_dist_fe(train)
geopy_dist_fe(test)

CPU times: user 1h 55min 11s, sys: 5min 53s, total: 2h 1min 4s
Wall time: 2h 7min 2s


Unnamed: 0,geodesic_km_dist,great_circle_km_dist
0,2.320991,2.323263
1,2.423802,2.425356
2,0.618182,0.618629
3,1.959671,1.961035
4,5.382833,5.387309
...,...,...
9909,2.124110,2.124877
9910,3.268511,3.270974
9911,19.217032,19.183968
9912,8.339644,8.343498


## [Haversine distance](https://www.kaggle.com/madhurisivalenka/cleansing-eda-modelling-lgbm-xgboost-starters)

In [26]:
def h_dist_fe(lat1, long1, lat2, long2):
    data = [train, test]
    for i in data:
        R = 6371  #radius of earth in kilometers
        #R = 3959 #radius of earth in miles
        phi1 = np.radians(i[lat1])
        phi2 = np.radians(i[lat2])
    
        delta_phi = np.radians(i[lat2]-i[lat1])
        delta_lambda = np.radians(i[long2]-i[long1])
    
        #a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    
        #c = 2 * atan2( √a, √(1−a) )
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
        #d = R*c
        d = (R * c) #in kilometers
        i['haversine_dist'] = d
    return d

In [27]:
%%time
h_dist_fe('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

CPU times: user 6.32 s, sys: 6.67 s, total: 13 s
Wall time: 5.94 s


0        2.323260
1        2.425353
2        0.618628
3        1.961033
4        5.387301
          ...    
9909     2.124874
9910     3.270969
9911    19.183941
9912     8.343486
9913     1.180825
Length: 9914, dtype: float64

## [Center point](https://www.kdnuggets.com/2018/12/feature-building-techniques-tricks-kaggle.html)

In [28]:
%%time
def center_fe(df):
    df['center_lat'] = (df['pickup_latitude'].values + df['dropoff_latitude'].values) / 2
    df['center_long'] = (df['pickup_longitude'].values + df['dropoff_longitude'].values) / 2
    return df[['center_lat', 'center_long']]

center_fe(train)
center_fe(test)

CPU times: user 4.96 s, sys: 16 s, total: 20.9 s
Wall time: 11 s


Unnamed: 0,center_lat,center_long
0,40.753820,-73.977375
1,40.729292,-73.992874
2,40.748700,-73.981089
3,40.759721,-73.985804
4,40.767101,-73.977305
...,...,...
9909,40.788692,-73.961884
9910,40.789986,-73.952862
9911,40.686810,-73.890671
9912,40.768581,-73.962376


## Datetime

## Convert column types

In [29]:
data = [train, test]
for df in data:
    df["key"] = pd.to_datetime(df["key"])
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

In [30]:
%%time
def dt_fe(df):
    df['year'] = df['pickup_datetime'].dt.year
    df['month'] = df['pickup_datetime'].dt.month
    df['week'] = df['pickup_datetime'].dt.week
    df['day'] = df['pickup_datetime'].dt.day
    df['hour'] = df['pickup_datetime'].dt.hour
    df['minute'] = df['pickup_datetime'].dt.minute
    df['day_of_week'] = df['pickup_datetime'].dt.dayofweek
    df['day_of_year'] = df['pickup_datetime'].dt.dayofyear
    df['quarter'] = df['pickup_datetime'].dt.quarter
    return df[['year', 'month', 'week', 'day', 'hour', 
               'minute', 'day_of_week', 'day_of_year', 'quarter']]

dt_fe(train)
dt_fe(test)

CPU times: user 14.4 s, sys: 5.26 s, total: 19.7 s
Wall time: 12.9 s


Unnamed: 0,year,month,week,day,hour,minute,day_of_week,day_of_year,quarter
0,2015,1,5,27,13,8,1,27,1
1,2015,1,5,27,13,8,1,27,1
2,2011,10,40,8,11,53,5,281,4
3,2012,12,48,1,21,12,5,336,4
4,2012,12,48,1,21,12,5,336,4
...,...,...,...,...,...,...,...,...,...
9909,2015,5,19,10,12,37,6,130,2
9910,2015,1,3,12,17,5,0,12,1
9911,2015,4,16,19,20,44,6,109,2
9912,2015,1,5,31,1,5,5,31,1


In [31]:
print(train.year.unique())

[2010 2014 2011 2015 2009 2013 2012]


In [32]:
%%time
def is_fe(df):
    df['is_weekday'] = (df['pickup_datetime'].dt.dayofweek < 5).astype(int)
    df['is_weekend'] = (df['pickup_datetime'].dt.dayofweek >= 5).astype(int)
    df['is_month_start'] = df['pickup_datetime'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['pickup_datetime'].dt.is_month_end.astype(int)
    df['is_quarter_start'] = df['pickup_datetime'].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df['pickup_datetime'].dt.is_quarter_end.astype(int)
    df['is_year_start'] = df['pickup_datetime'].dt.is_year_start.astype(int)
    df['is_year_end'] = df['pickup_datetime'].dt.is_year_end.astype(int)
    df['is_leap_year'] = df['pickup_datetime'].dt.is_leap_year.astype(int)
    df['is_holiday'] = df['pickup_datetime'].apply(lambda x: 0 if holidays.US().get(x) is None else 1)
    return df[['is_weekday', 'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 
               'is_year_start', 'is_year_end', 'is_leap_year', 'is_holiday']] 

is_fe(train)
is_fe(test)

CPU times: user 41min 7s, sys: 33.4 s, total: 41min 40s
Wall time: 41min 36s


Unnamed: 0,is_weekday,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,is_leap_year,is_holiday
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,1,1,0,0,0,0,0,1,0
4,0,1,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
9909,0,1,0,0,0,0,0,0,0,0
9910,1,0,0,0,0,0,0,0,0,0
9911,0,1,0,0,0,0,0,0,0,0
9912,0,1,0,1,0,0,0,0,0,0


# reverse_geocoder

In [33]:
def rg_fe(df):
    df_pickup = df[['pickup_latitude', 'pickup_longitude']]
    df_dropoff = df[['dropoff_latitude', 'dropoff_longitude']]

    pickup_results = rg.search([tuple(x) for x in df_pickup.values])
    dropoff_results = rg.search([tuple(x) for x in df_dropoff.values])

    pickup_place = [pickup_results[i]['name'] for i in range(len(pickup_results))]
    pickup_admin1 = [pickup_results[i]['admin1'] for i in range(len(pickup_results))]
    pickup_admin2 = [pickup_results[i]['admin2'] for i in range(len(pickup_results))]
    dropoff_place = [dropoff_results[i]['name'] for i in range(len(dropoff_results))]
    dropoff_admin1 = [dropoff_results[i]['admin1'] for i in range(len(pickup_results))]
    dropoff_admin2 = [dropoff_results[i]['admin2'] for i in range(len(pickup_results))]

    df['pickup_place'] = pd.Series(pickup_place).values
    df['pickup_admin1'] = pd.Series(pickup_admin1).values
    df['pickup_admin2'] = pd.Series(pickup_admin2).values
    df['dropoff_place'] = pd.Series(dropoff_place).values
    df['dropoff_admin1'] = pd.Series(dropoff_admin1).values
    df['dropoff_admin2'] = pd.Series(dropoff_admin2).values
    
    df.loc[df['pickup_admin1'].str.len() == 0, 'pickup_admin1'] = 'None'
    df.loc[df['pickup_admin2'].str.len() == 0, 'pickup_admin2'] = 'None'
    df.loc[df['pickup_place'].str.len() == 0, 'pickup_place'] = 'None'
    df.loc[df['dropoff_admin1'].str.len() == 0, 'dropoff_admin1'] = 'None'
    df.loc[df['dropoff_admin2'].str.len() == 0, 'dropoff_admin2'] = 'None'
    df.loc[df['dropoff_place'].str.len() == 0, 'dropoff_place'] = 'None'
    
    return df[['pickup_place', 'pickup_admin1', 'pickup_admin2', 
               'dropoff_place', 'dropoff_admin1', 'dropoff_admin2']]

In [34]:
%%time
rg_fe(train)
rg_fe(test)

Loading formatted geocoded file...
CPU times: user 2min 14s, sys: 17.3 s, total: 2min 32s
Wall time: 2min 44s


Unnamed: 0,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,New York City,New York,,New York City,New York,
2,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,Manhattan,New York,New York County,New York City,New York,
...,...,...,...,...,...,...
9909,Manhattan,New York,New York County,Manhattan,New York,New York County
9910,Manhattan,New York,New York County,Manhattan,New York,New York County
9911,New York City,New York,,Jamaica,New York,Queens County
9912,New York City,New York,,Manhattan,New York,New York County


In [35]:
print(train.shape)
print(train.pickup_place.unique())
print(train.dropoff_place.unique())

(19997960, 44)
['Weehawken' 'Manhattan' 'Long Island City' ... 'Port Carbon' 'Renfrew'
 'Ciruelas']
['Manhattan' 'New York City' 'Takoradi' ... 'Chute-aux-Outardes'
 'Port Carbon' 'Colonel Hill']


In [36]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
1000000,2010-03-06 21:13:04.000000300,10.1,2010-03-06 21:13:04+00:00,-73.993113,40.755552,-73.969351,40.797908,1,0.023762,0.042356,...,0,0,0,0,Weehawken,New Jersey,Hudson County,Manhattan,New York,New York County
1000001,2010-03-10 16:19:00.000000128,3.7,2010-03-10 16:19:00+00:00,-73.948102,40.770608,-73.952923,40.768025,1,0.004821,0.002583,...,0,0,0,0,Manhattan,New York,New York County,Manhattan,New York,New York County
1000002,2014-02-27 08:50:19.000000600,11.0,2014-02-27 08:50:19+00:00,-73.977236,40.743045,-73.997854,40.719837,1,0.020618,0.023208,...,0,0,0,0,Long Island City,New York,Queens County,New York City,New York,
1000003,2011-02-11 17:46:17.000000500,7.3,2011-02-11 17:46:17+00:00,-73.95406,39.603285,-73.977015,40.776712,3,0.022954,1.173426,...,0,0,0,0,Surf City,New Jersey,Ocean County,Manhattan,New York,New York County
1000004,2011-07-25 12:48:00.000000740,6.9,2011-07-25 12:48:00+00:00,-73.978575,40.753067,-73.964632,40.764258,1,0.013943,0.011191,...,0,0,0,0,Long Island City,New York,Queens County,Manhattan,New York,New York County


In [37]:
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,...,is_year_start,is_year_end,is_leap_year,is_holiday,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2015-01-27 13:08:24.000000200,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,0.00811,0.01997,0.02808,...,0,0,0,0,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,2015-01-27 13:08:24.000000300,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,0.012024,0.019817,0.031841,...,0,0,0,0,New York City,New York,,New York City,New York,
2,2011-10-08 11:53:44.000000200,2011-10-08 11:53:44+00:00,-73.982524,40.75126,-73.979654,40.746139,1,0.00287,0.005121,0.007991,...,0,0,0,0,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,2012-12-01 21:12:12.000000200,2012-12-01 21:12:12+00:00,-73.98116,40.767807,-73.990448,40.751635,1,0.009288,0.016172,0.02546,...,0,0,1,0,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,2012-12-01 21:12:12.000000300,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,0.022519,0.045348,0.067867,...,0,0,1,0,Manhattan,New York,New York County,New York City,New York,


In [38]:
data = [train, test]
for df in data:
    print(df.shape)

(19997960, 44)
(9914, 43)


# Output Data
output data as csv

In [39]:
train.to_csv("train2.csv", index=False)
test.to_csv("test2.csv", index=False)