# [New York City Taxi Fare Prediction](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)

## Import packages

In [1]:
import numpy as np 
import pandas as pd
from geopy.distance import geodesic, great_circle
import reverse_geocoder as rg

## Import data

In [35]:
%%time
train2 = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

CPU times: user 1min 51s, sys: 1min 59s, total: 3min 50s
Wall time: 5min 33s


In [36]:
train2.shape

(55423856, 8)

In [37]:
train = train2.iloc[150000:500000]

In [38]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
150000,2010-04-23 03:55:07.0000001,8.5,2010-04-23 03:55:07 UTC,-74.008438,40.719055,-73.995641,40.733019,2
150001,2013-04-20 13:00:00.00000010,22.0,2013-04-20 13:00:00 UTC,-73.957793,40.761445,-74.00519,40.715377,1
150002,2013-11-08 22:16:00.000000127,21.5,2013-11-08 22:16:00 UTC,-73.981947,40.757987,-74.017322,40.705052,1
150003,2010-01-14 22:49:00.00000026,31.3,2010-01-14 22:49:00 UTC,-74.010195,40.704812,-73.85324,40.727048,3
150004,2013-03-17 02:45:00.000000167,34.5,2013-03-17 02:45:00 UTC,-74.00608,40.723872,-73.912677,40.607827,5


In [5]:
data = [train, test]
for df in data:
    print(df.shape)

(150000, 8)
(9914, 7)


In [6]:
train.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [7]:
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,150000.0,150000.0,150000.0,149999.0,149999.0,150000.0
mean,11.357902,-72.545746,39.934732,-72.551808,39.925912,1.6884
std,9.994606,12.018591,11.078622,11.400394,6.956032,1.312452
min,-6.5,-740.0,-3116.285383,-1329.621332,-1189.61544,0.0
25%,6.0,-73.992076,40.735011,-73.991447,40.733989,1.0
50%,8.5,-73.981787,40.752752,-73.980193,40.753127,1.0
75%,12.5,-73.967198,40.767072,-73.963661,40.768103,2.0
max,495.0,2140.60116,1703.092772,40.82531,404.133332,6.0


In [8]:
train.apply(lambda x: x.nunique())

key                  150000
fare_amount            1086
pickup_datetime      146214
pickup_longitude      62924
pickup_latitude       72929
dropoff_longitude     67444
dropoff_latitude      77605
passenger_count           7
dtype: int64

# Data preprocessing

## Checking null values

In [9]:
def print_null(df):
    """
    prints null value of the given data
    """
    print(df.isnull().sum()[df.isnull().sum() != 0])

print_null(train)
print("------------")
print_null(test)

dropoff_longitude    1
dropoff_latitude     1
dtype: int64
------------
Series([], dtype: int64)


In [10]:
train = train.dropna(how='any', axis=0)

In [11]:
print_null(train)
print("------------")
print_null(test)

Series([], dtype: int64)
------------
Series([], dtype: int64)


## Natural Constraint

No more than 6 passengers are allowed in 1 trip.

In [12]:
max(train['passenger_count'])

6

In [13]:
train[train['passenger_count'] > 6]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count


In [14]:
train = train[train['passenger_count'] <= 6]
train.shape

(149999, 8)

Fares are expected to be larger than 0

In [15]:
train[train.fare_amount < 0]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
165147,2015-05-31 10:23:50.0000003,-2.5,2015-05-31 10:23:50 UTC,-73.982162,40.773621,-73.982094,40.773811,2
179311,2015-04-21 22:45:11.0000002,-3.0,2015-04-21 22:45:11 UTC,-73.944504,40.791683,-73.94516,40.795979,1
182341,2015-02-10 01:34:08.0000002,-5.0,2015-02-10 01:34:08 UTC,-73.990974,40.750374,-73.982193,40.757858,3
288960,2015-02-13 00:36:36.0000002,-4.5,2015-02-13 00:36:36 UTC,-74.006142,40.740425,-74.009827,40.738804,1
298412,2010-03-12 12:17:10.0000001,-6.5,2010-03-12 12:17:10 UTC,-73.989493,40.730085,-73.975762,40.749307,1


In [16]:
train = train[train.fare_amount >= 0]
train.shape

(149994, 8)

The max and min for latitude and longitude ranges from -90 to 90 and -180 to 180.

In [17]:
train[(train.pickup_latitude > 90) | (train.pickup_latitude < -90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
150559,2012-08-03 07:43:00.000000176,25.3,2012-08-03 07:43:00 UTC,0.0,-3116.285383,-73.9536,40.787998,1
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.60116,1703.092772,-1251.19589,-1189.61544,1
272439,2011-04-23 02:55:00.00000012,9.3,2011-04-23 02:55:00 UTC,-74.002497,405.35,-73.9786,40.739962,1


In [18]:
train[(train.pickup_longitude > 180) | (train.pickup_longitude < -180)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.60116,1703.092772,-1251.19589,-1189.61544,1
217355,2012-06-03 23:21:00.00000077,6.1,2012-06-03 23:21:00 UTC,-740.0,40.74762,0.0,0.0,6
243342,2012-08-02 10:38:00.000000111,7.3,2012-08-02 10:38:00 UTC,-736.333333,40.76648,-73.987928,40.751742,3


In [19]:
train[(train.dropoff_latitude > 90) | (train.dropoff_latitude < -90)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.60116,1703.092772,-1251.19589,-1189.61544,1
181973,2012-01-03 09:04:00.000000130,6.5,2012-01-03 09:04:00 UTC,-74.008918,40.717827,-74.000855,404.133332,1


In [20]:
train[(train.dropoff_longitude > 180) | (train.dropoff_longitude < -180)]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
174356,2011-11-21 21:36:00.00000081,9.7,2011-11-21 21:36:00 UTC,2140.60116,1703.092772,-1251.19589,-1189.61544,1
269695,2012-05-24 09:00:00.000000101,17.7,2012-05-24 09:00:00 UTC,-73.997572,40.720945,-1329.621332,40.773717,1
282374,2011-04-25 13:09:00.000000123,5.7,2011-04-25 13:09:00 UTC,-73.987105,40.755732,-732.6,40.744832,1


In [21]:
train = train[(train.pickup_latitude < 90) & (train.pickup_latitude > -90)]
train = train[(train.pickup_longitude < 180) & (train.pickup_longitude > -180)]
train = train[(train.dropoff_latitude < 90) & (train.dropoff_latitude > -90)]
train = train[(train.dropoff_longitude < 180) & (train.dropoff_longitude > -180)]
train.shape

(149986, 8)

# Feature engineering

## Manhattan distance

In [22]:
%%time
def m_dist_fe(df):
    df['abs_long_diff'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_lat_diff'] = (df.dropoff_latitude - df.pickup_latitude).abs()
    df['manhattan_dist'] = df['abs_long_diff'] + df['abs_lat_diff']
    return df[['abs_long_diff', 'abs_lat_diff', 'manhattan_dist']]

m_dist_fe(train)
m_dist_fe(test)

CPU times: user 73.9 ms, sys: 21.7 ms, total: 95.5 ms
Wall time: 42.1 ms


Unnamed: 0,abs_long_diff,abs_lat_diff,manhattan_dist
0,0.008110,0.019970,0.028080
1,0.012024,0.019817,0.031841
2,0.002870,0.005121,0.007991
3,0.009288,0.016172,0.025460
4,0.022519,0.045348,0.067867
...,...,...,...
9909,0.012482,0.016609,0.029091
9910,0.014702,0.027229,0.041931
9911,0.201859,0.079597,0.281456
9912,0.046394,0.066299,0.112694


## Euclidean distance

In [23]:
def e_dist_fe(df):
    df['sqrt_long_diff'] = (df.dropoff_longitude - df.pickup_longitude).pow(2)
    df['sqrt_lat_diff'] = (df.dropoff_latitude - df.pickup_latitude).pow(2)
    df['euclidean_dist'] = (df['sqrt_long_diff'] + df['sqrt_lat_diff']).pow(0.5)
    return df[['sqrt_long_diff', 'sqrt_lat_diff', 'euclidean_dist']]

e_dist_fe(train)
e_dist_fe(test)

Unnamed: 0,sqrt_long_diff,sqrt_lat_diff,euclidean_dist
0,0.000066,0.000399,0.021554
1,0.000145,0.000393,0.023180
2,0.000008,0.000026,0.005870
3,0.000086,0.000262,0.018649
4,0.000507,0.002056,0.050631
...,...,...,...
9909,0.000156,0.000276,0.020776
9910,0.000216,0.000741,0.030945
9911,0.040747,0.006336,0.216985
9912,0.002152,0.004396,0.080920


## Geopy distance

In [24]:
%%time
def geopy_dist_fe(df):
    df['geodesic_km_dist'] = df.apply(lambda x: geodesic((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
    df['great_circle_km_dist'] = df.apply(lambda x: great_circle((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
    return df[['geodesic_km_dist', 'great_circle_km_dist']]

geopy_dist_fe(train)
geopy_dist_fe(test)

CPU times: user 58.4 s, sys: 873 ms, total: 59.3 s
Wall time: 58.6 s


Unnamed: 0,geodesic_km_dist,great_circle_km_dist
0,2.320991,2.323263
1,2.423802,2.425356
2,0.618182,0.618629
3,1.959671,1.961035
4,5.382833,5.387309
...,...,...
9909,2.124110,2.124877
9910,3.268511,3.270974
9911,19.217032,19.183968
9912,8.339644,8.343498


## [Haversine distance](https://www.kaggle.com/madhurisivalenka/cleansing-eda-modelling-lgbm-xgboost-starters)

In [25]:
def h_dist_fe(lat1, long1, lat2, long2):
    data = [train, test]
    for i in data:
        R = 6371  #radius of earth in kilometers
        #R = 3959 #radius of earth in miles
        phi1 = np.radians(i[lat1])
        phi2 = np.radians(i[lat2])
    
        delta_phi = np.radians(i[lat2]-i[lat1])
        delta_lambda = np.radians(i[long2]-i[long1])
    
        #a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    
        #c = 2 * atan2( √a, √(1−a) )
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
        #d = R*c
        d = (R * c) #in kilometers
        i['haversine_dist'] = d
    return d

In [26]:
%%time
h_dist_fe('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

CPU times: user 42.7 ms, sys: 12.5 ms, total: 55.2 ms
Wall time: 30.6 ms


0        2.323260
1        2.425353
2        0.618628
3        1.961033
4        5.387301
          ...    
9909     2.124874
9910     3.270969
9911    19.183941
9912     8.343486
9913     1.180825
Length: 9914, dtype: float64

## Datetime

## Convert column types

In [27]:
data = [train, test]
for df in data:
    df["key"] = pd.to_datetime(df["key"])
    df["pickup_datetime"] = pd.to_datetime(df["pickup_datetime"])

In [28]:
data = [train,test]
for df in data:
    df['Year'] = df['pickup_datetime'].dt.year
    df['Month'] = df['pickup_datetime'].dt.month
    df['Date'] = df['pickup_datetime'].dt.day
    df['Day of Week'] = df['pickup_datetime'].dt.dayofweek
    df['Hour'] = df['pickup_datetime'].dt.hour

# reverse_geocoder

In [29]:
def rg_fe(df):
    df_pickup = df[['pickup_latitude', 'pickup_longitude']]
    df_dropoff = df[['dropoff_latitude', 'dropoff_longitude']]

    pickup_results = rg.search([tuple(x) for x in df_pickup.values])
    dropoff_results = rg.search([tuple(x) for x in df_dropoff.values])

    pickup_place = [pickup_results[i]['name'] for i in range(len(pickup_results))]
    pickup_admin1 = [pickup_results[i]['admin1'] for i in range(len(pickup_results))]
    pickup_admin2 = [pickup_results[i]['admin2'] for i in range(len(pickup_results))]
    dropoff_place = [dropoff_results[i]['name'] for i in range(len(dropoff_results))]
    dropoff_admin1 = [dropoff_results[i]['admin1'] for i in range(len(pickup_results))]
    dropoff_admin2 = [dropoff_results[i]['admin2'] for i in range(len(pickup_results))]

    df['pickup_place'] = pd.Series(pickup_place).values
    df['pickup_admin1'] = pd.Series(pickup_admin1).values
    df['pickup_admin2'] = pd.Series(pickup_admin2).values
    df['dropoff_place'] = pd.Series(dropoff_place).values
    df['dropoff_admin1'] = pd.Series(dropoff_admin1).values
    df['dropoff_admin2'] = pd.Series(dropoff_admin2).values
    
    df.loc[df['pickup_admin1'].str.len() == 0, 'pickup_admin1'] = 'None'
    df.loc[df['pickup_admin2'].str.len() == 0, 'pickup_admin2'] = 'None'
    df.loc[df['pickup_place'].str.len() == 0, 'pickup_place'] = 'None'
    df.loc[df['dropoff_admin1'].str.len() == 0, 'dropoff_admin1'] = 'None'
    df.loc[df['dropoff_admin2'].str.len() == 0, 'dropoff_admin2'] = 'None'
    df.loc[df['dropoff_place'].str.len() == 0, 'dropoff_place'] = 'None'
    
    return df[['pickup_place', 'pickup_admin1', 'pickup_admin2', 
               'dropoff_place', 'dropoff_admin1', 'dropoff_admin2']]

In [30]:
%%time
rg_fe(train)
rg_fe(test)

Loading formatted geocoded file...
CPU times: user 2.75 s, sys: 564 ms, total: 3.31 s
Wall time: 3.36 s


Unnamed: 0,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,New York City,New York,,New York City,New York,
2,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,Manhattan,New York,New York County,New York City,New York,
...,...,...,...,...,...,...
9909,Manhattan,New York,New York County,Manhattan,New York,New York County
9910,Manhattan,New York,New York County,Manhattan,New York,New York County
9911,New York City,New York,,Jamaica,New York,Queens County
9912,New York City,New York,,Manhattan,New York,New York County


In [31]:
print(train.shape)
print(train.pickup_place.unique())
print(train.dropoff_place.unique())

(149986, 28)
['New York City' 'Long Island City' 'Manhattan' 'Weehawken' 'Brooklyn'
 'The Bronx' 'Ansonia' 'Borough of Queens' 'Guttenberg' 'Takoradi'
 'Hoboken' 'Englewood' 'Edgewater' 'Jamaica' 'Inwood' 'West New York'
 'Newark' 'Secaucus' 'Port-aux-Francais' 'Fairview' 'Passaic'
 'Bellerose Terrace' 'Bensonhurst' 'Surf City' 'Herbes' 'Aguiar da Beira'
 'Fort Lee' 'White Plains' 'Beach Haven' 'Wood-Ridge' 'Woodland Park'
 'La Calera' 'West Orange' 'Carlstadt' 'Milford' 'East New York'
 'New Cassel' 'Atlantic Beach' 'Jersey City' 'Rubielos de la Cerida'
 'Lincoln Park' 'Westbury' 'Cliffside Park' 'Harbor Isle' 'Torre de Arcas'
 'Chatham' 'North Bergen' 'Union Beach' 'Old Tappan' 'Englewood Cliffs'
 'Coney Island' 'Woodbury' 'Bernardsville' 'Bloomfield' 'Guayabetal'
 'Rutherford' 'South Valley Stream' 'Lake Success' 'Cedar Grove'
 'Ho-Ho-Kus' 'Buarcos' 'Union City' 'Leonia' 'Dover' 'Mount Joy'
 'Elizabeth' 'East Setauket' 'Port Chester' 'East Atlantic Beach'
 'Penalba' 'Point Lookout' 

In [32]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,...,Month,Date,Day of Week,Hour,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
150000,2010-04-23 03:55:07.000000100,8.5,2010-04-23 03:55:07+00:00,-74.008438,40.719055,-73.995641,40.733019,2,0.012797,0.013964,...,4,23,4,3,New York City,New York,,New York City,New York,
150001,2013-04-20 13:00:00.000000100,22.0,2013-04-20 13:00:00+00:00,-73.957793,40.761445,-74.00519,40.715377,1,0.047397,0.046068,...,4,20,5,13,Long Island City,New York,Queens County,New York City,New York,
150002,2013-11-08 22:16:00.000000127,21.5,2013-11-08 22:16:00+00:00,-73.981947,40.757987,-74.017322,40.705052,1,0.035375,0.052935,...,11,8,4,22,Manhattan,New York,New York County,New York City,New York,
150003,2010-01-14 22:49:00.000000260,31.3,2010-01-14 22:49:00+00:00,-74.010195,40.704812,-73.85324,40.727048,3,0.156955,0.022236,...,1,14,3,22,New York City,New York,,Borough of Queens,New York,Queens County
150004,2013-03-17 02:45:00.000000167,34.5,2013-03-17 02:45:00+00:00,-74.00608,40.723872,-73.912677,40.607827,5,0.093403,0.116045,...,3,17,6,2,New York City,New York,,Brooklyn,New York,Kings County


In [33]:
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_long_diff,abs_lat_diff,manhattan_dist,...,Month,Date,Day of Week,Hour,pickup_place,pickup_admin1,pickup_admin2,dropoff_place,dropoff_admin1,dropoff_admin2
0,2015-01-27 13:08:24.000000200,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1,0.00811,0.01997,0.02808,...,1,27,1,13,Manhattan,New York,New York County,Long Island City,New York,Queens County
1,2015-01-27 13:08:24.000000300,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,0.012024,0.019817,0.031841,...,1,27,1,13,New York City,New York,,New York City,New York,
2,2011-10-08 11:53:44.000000200,2011-10-08 11:53:44+00:00,-73.982524,40.75126,-73.979654,40.746139,1,0.00287,0.005121,0.007991,...,10,8,5,11,Long Island City,New York,Queens County,Long Island City,New York,Queens County
3,2012-12-01 21:12:12.000000200,2012-12-01 21:12:12+00:00,-73.98116,40.767807,-73.990448,40.751635,1,0.009288,0.016172,0.02546,...,12,1,5,21,Manhattan,New York,New York County,Weehawken,New Jersey,Hudson County
4,2012-12-01 21:12:12.000000300,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,0.022519,0.045348,0.067867,...,12,1,5,21,Manhattan,New York,New York County,New York City,New York,


In [None]:
data = [train, test]
for df in data:
    print(df.shape)

# Output Data
output data as csv

In [None]:
train_df.to_csv("train2.csv", index=False)
test_df.to_csv("test2.csv", index=False)