In [35]:
import numpy as np 
import pandas as pd
import scipy as scipy
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import datetime as dt
from math import sqrt

In [2]:
train_df =  pd.read_csv('train.csv',nrows = 5000000, usecols=[1,2,3,4,5,6])
test_df =  pd.read_csv('test.csv')

In [22]:
def Submit_model(model, fname):
    Predictions_test = model.predict(XTEST)
    submission_df = pd.read_csv('sample_submission.csv')
    submission_df['fare_amount'] = Predictions_test
    submission_df.to_csv(fname, index=None)
    return submission_df

In [4]:
def Correction_df(df):
    return df[(df.fare_amount > 0)  & (df.fare_amount <= 500) &
           ((df.pickup_longitude != 0) & (df.pickup_latitude != 0) & (df.dropoff_longitude != 0) & (df.dropoff_latitude != 0) )]


In [5]:
def Haversine_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    Radius_earth = 6371
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon, 
                                                              dropoff_lat, dropoff_lon])
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2
    return 2 * Radius_earth * np.arcsin(np.sqrt(a))
def Bearing_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    Radius_earth = 6371
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon, 
                                                              dropoff_lat, dropoff_lon])
    dlat = dropoff_lat - pickup_lat
    dlon = pickup_lon - dropoff_lon
    a = np.arctan2(np.sin(dlon * np.cos(dropoff_lat)),np.cos(pickup_lat) * np.sin(dropoff_lat) - np.sin(pickup_lat) * np.cos(dropoff_lat) * np.cos(dlon))
    return a 

In [6]:
def Airport_dist(data):
    jfk_coord = (40.639722, -73.778889)
    ewr_coord = (40.6925, -74.168611)
    lga_coord = (40.77725, -73.872611)
    sol_coord = (40.6892,-74.0445) 
    nyc_coord = (40.7141667,-74.0063889) 
    pickup_lat = data['pickup_latitude']
    dropoff_lat = data['dropoff_latitude']
    pickup_lon = data['pickup_longitude']
    dropoff_lon = data['dropoff_longitude']
    pickup_jfk = Haversine_dist(pickup_lat, pickup_lon, jfk_coord[0], jfk_coord[1]) 
    dropoff_jfk = Haversine_dist(jfk_coord[0], jfk_coord[1], dropoff_lat, dropoff_lon) 
    pickup_ewr = Haversine_dist(pickup_lat, pickup_lon, ewr_coord[0], ewr_coord[1])
    dropoff_ewr = Haversine_dist(ewr_coord[0], ewr_coord[1], dropoff_lat, dropoff_lon) 
    pickup_lga = Haversine_dist(pickup_lat, pickup_lon, lga_coord[0], lga_coord[1]) 
    dropoff_lga = Haversine_dist(lga_coord[0], lga_coord[1], dropoff_lat, dropoff_lon)
    pickup_sol = Haversine_dist(pickup_lat, pickup_lon, sol_coord[0], sol_coord[1]) 
    dropoff_sol = Haversine_dist(sol_coord[0], sol_coord[1], dropoff_lat, dropoff_lon)
    pickup_nyc = Haversine_dist(pickup_lat, pickup_lon, nyc_coord[0], nyc_coord[1]) 
    dropoff_nyc = Haversine_dist(nyc_coord[0], nyc_coord[1], dropoff_lat, dropoff_lon)
    data['jfk_dist'] = pickup_jfk + dropoff_jfk
    data['ewr_dist'] = pickup_ewr + dropoff_ewr
    data['lga_dist'] = pickup_lga + dropoff_lga
    data['sol_dist'] = pickup_sol + dropoff_sol
    data['nyc_dist'] = pickup_nyc + dropoff_nyc
    return data

In [7]:
def Time_Date_info(dataset):
    dataset['year'] = dataset.pickup_datetime.dt.year
    dataset['month'] = dataset.pickup_datetime.dt.month
    dataset['day'] = dataset.pickup_datetime.dt.day
    dataset['hour'] = dataset.pickup_datetime.dt.hour
    dataset['weekday'] = dataset.pickup_datetime.dt.weekday
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'],format="%Y-%m-%d %H:%M:%S UTC")
    return dataset

In [8]:
cols = train_df.columns
train_df = train_df.dropna(how = 'any', axis = 'rows')
train_df = Correction_df(train_df)
train_df = Time_Date_info(train_df)
train_df = Airport_dist(train_df)
train_df['pickup_latitude'] = np.radians(train_df['pickup_latitude'])
train_df['pickup_longitude'] = np.radians(train_df['pickup_longitude'])
train_df['dropoff_latitude'] = np.radians(train_df['dropoff_latitude'])
train_df['dropoff_longitude'] = np.radians(train_df['dropoff_longitude'])
train_df['distance'] = Haversine_dist(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                   train_df['dropoff_latitude'] , train_df['dropoff_longitude']) 
train_df['bearing'] = Bearing_dist(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                   train_df['dropoff_latitude'] , train_df['dropoff_longitude']) 
train_df.drop(columns=[ 'pickup_datetime'], inplace=True)

In [9]:
print(test_df.head())
test_df = Time_Date_info(test_df)
test_df = Airport_dist(test_df)
test_df['pickup_latitude'] = np.radians(test_df['pickup_latitude'])
test_df['pickup_longitude'] = np.radians(test_df['pickup_longitude'])
test_df['dropoff_latitude'] = np.radians(test_df['dropoff_latitude'])
test_df['dropoff_longitude'] = np.radians(test_df['dropoff_longitude'])
test_df['distance'] = Haversine_dist(test_df['pickup_latitude'], test_df['pickup_longitude'], 
                                   test_df['dropoff_latitude'] , test_df['dropoff_longitude'])
test_df['bearing'] = Bearing_dist(test_df['pickup_latitude'], test_df['pickup_longitude'], 
                                    test_df['dropoff_latitude'] , test_df['dropoff_longitude'])  
test_key = test_df['key']
test_df = test_df.drop(columns=['key', 'pickup_datetime'])

                           key          pickup_datetime  pickup_longitude  \
0  2015-01-27 13:08:24.0000002  2015-01-27 13:08:24 UTC        -73.973320   
1  2015-01-27 13:08:24.0000003  2015-01-27 13:08:24 UTC        -73.986862   
2  2011-10-08 11:53:44.0000002  2011-10-08 11:53:44 UTC        -73.982524   
3  2012-12-01 21:12:12.0000002  2012-12-01 21:12:12 UTC        -73.981160   
4  2012-12-01 21:12:12.0000003  2012-12-01 21:12:12 UTC        -73.966046   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
0        40.763805         -73.981430         40.743835                1  
1        40.719383         -73.998886         40.739201                1  
2        40.751260         -73.979654         40.746139                1  
3        40.767807         -73.990448         40.751635                1  
4        40.789775         -73.988565         40.744427                1  


In [10]:
train_df.head(10)

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,4.5,-1.288826,0.710721,-1.288779,0.710563,17,15,6,0,2009,20.26584,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897
1,16.9,-1.291824,0.710546,-1.291182,0.71178,16,5,1,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217
2,5.7,-1.291242,0.711418,-1.291391,0.711231,0,18,8,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961
3,7.7,-1.291319,0.710927,-1.291396,0.711363,4,21,4,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.79927,0.133905
4,5.3,-1.290987,0.711536,-1.290787,0.711811,7,9,3,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703
5,12.1,-1.29156,0.710901,-1.291071,0.711365,9,6,1,3,2011,42.335622,32.82493,20.648176,15.741133,7.649541,3.787239,-0.674251
6,7.5,-1.291195,0.711251,-1.291086,0.711481,20,20,11,1,2012,42.563234,35.482608,18.113693,19.12667,10.99315,1.555807,-0.34226
7,16.5,-1.290694,0.711643,-1.291371,0.71124,17,4,1,2,2012,42.533214,36.829343,16.949505,20.548489,12.449213,4.155444,2.236596
8,9.0,-1.291656,0.710815,-1.291423,0.710901,13,3,12,0,2012,42.240568,29.608237,23.942272,11.667601,3.637447,1.253232,-1.11947
9,8.9,-1.291206,0.71094,-1.291396,0.711364,1,2,9,2,2009,42.218767,33.106804,20.538553,16.193381,8.129651,2.849627,0.327463


In [11]:
del train_df['pickup_longitude']
del train_df['pickup_latitude']
del train_df['dropoff_longitude']
del train_df['dropoff_latitude']
del test_df['pickup_longitude']
del test_df['pickup_latitude']
del test_df['dropoff_longitude']
del test_df['dropoff_latitude']

In [12]:
train_df.head(10)

Unnamed: 0,fare_amount,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,4.5,17,15,6,0,2009,20.26584,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897
1,16.9,16,5,1,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217
2,5.7,0,18,8,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961
3,7.7,4,21,4,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.79927,0.133905
4,5.3,7,9,3,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703
5,12.1,9,6,1,3,2011,42.335622,32.82493,20.648176,15.741133,7.649541,3.787239,-0.674251
6,7.5,20,20,11,1,2012,42.563234,35.482608,18.113693,19.12667,10.99315,1.555807,-0.34226
7,16.5,17,4,1,2,2012,42.533214,36.829343,16.949505,20.548489,12.449213,4.155444,2.236596
8,9.0,13,3,12,0,2012,42.240568,29.608237,23.942272,11.667601,3.637447,1.253232,-1.11947
9,8.9,1,2,9,2,2009,42.218767,33.106804,20.538553,16.193381,8.129651,2.849627,0.327463


In [13]:
train_df.describe()

Unnamed: 0,fare_amount,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
count,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0,4899925.0
mean,11.32513,13.51161,15.72073,6.267325,3.041796,2011.736,59.62866,53.44829,37.36567,36.69523,28.96633,4.604715,0.3027828
std,9.706493,6.514948,8.684655,3.43577,1.949249,1.862334,693.791,694.8623,695.1187,695.0503,695.2878,85.60587,1.822971
min,0.01,0.0,1.0,1.0,0.0,2009.0,0.5927744,0.8571477,0.2145543,0.5325453,0.0175207,0.0,-3.141593
25%,6.0,9.0,8.0,3.0,1.0,2010.0,41.31249,32.12026,17.04948,14.81033,7.066689,1.253016,-0.8820057
50%,8.5,14.0,16.0,6.0,3.0,2012.0,42.48428,34.6667,19.51264,18.20959,10.32588,2.152419,-0.1000955
75%,12.5,19.0,23.0,9.0,5.0,2013.0,43.69001,38.05301,22.03715,22.13763,14.17214,3.912644,2.241403
max,500.0,23.0,31.0,12.0,6.0,2015.0,34829.37,34814.21,34850.43,34821.83,34829.25,19108.8,3.141593


In [14]:
test_df.describe()

Unnamed: 0,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
count,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0,9914.0
mean,1.671273,13.46742,16.19417,6.857979,2.852834,2011.815816,41.909917,35.796937,19.653573,19.017516,11.2909,3.435371,0.262679
std,1.278747,6.868584,8.838482,3.353272,1.994451,1.803347,6.139945,7.294679,5.935953,7.653985,7.663088,3.972374,1.825297
min,1.0,0.0,1.0,1.0,0.0,2009.0,1.149244,15.034963,1.213073,5.961436,0.551142,0.0,-3.14063
25%,1.0,8.0,9.0,4.0,1.0,2010.0,41.282474,32.153504,17.021703,14.811264,7.131034,1.298277,-0.919187
50%,1.0,15.0,16.0,7.0,3.0,2012.0,42.48175,34.689158,19.547526,18.204584,10.376138,2.217412,-0.156683
75%,2.0,19.0,25.0,10.0,5.0,2014.0,43.690225,38.116042,22.067134,22.184392,14.208813,4.045302,2.20143
max,6.0,23.0,31.0,12.0,6.0,2015.0,270.963212,299.147633,253.458889,286.448564,278.129873,99.99604,3.14013


In [15]:
test_df.head(10)

Unnamed: 0,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,1,13,27,1,1,2015,42.055277,35.042878,18.501273,18.309374,10.095252,2.32326,2.843096
1,1,13,27,1,1,2015,41.244373,30.827012,23.023659,12.664639,4.599471,2.425353,0.430894
2,1,11,8,10,5,2011,41.831497,33.993679,19.353797,17.018308,8.797452,0.618628,-2.740065
3,1,21,1,12,5,2012,43.964285,34.268523,19.525661,18.544904,10.709378,1.961033,2.731208
4,1,21,1,12,5,2012,44.128523,36.440152,18.414857,20.732468,12.752865,5.387301,2.781733
5,1,21,1,12,5,2012,41.008939,36.107423,17.437571,18.919292,10.555825,3.222549,2.645876
6,1,12,6,10,3,2011,41.729422,40.213344,13.810117,23.88977,15.589303,0.929601,1.850815
7,1,12,6,10,3,2011,22.670048,50.556068,26.276216,32.299854,26.064641,21.540102,0.948493
8,1,12,6,10,3,2011,42.754126,28.763656,25.154591,10.565089,3.992362,3.873962,-0.425797
9,1,15,18,2,1,2014,43.703994,36.699447,17.399029,21.116902,13.149156,1.099794,1.016426


In [16]:
train_df = train_df.loc[train_df['fare_amount']> 2.5]

In [17]:
train_df.describe()

Unnamed: 0,fare_amount,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
count,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0,4881945.0
mean,11.35766,13.51462,15.72065,6.267161,3.041795,2011.738,59.67678,53.4751,37.40062,36.723,28.9927,4.616509,0.303239
std,9.709511,6.514787,8.684593,3.435878,1.949189,1.861801,694.6277,695.7014,695.9577,695.8893,696.1271,85.68505,1.823661
min,2.51,0.0,1.0,1.0,0.0,2009.0,0.5927744,0.8571477,0.2145543,0.5325453,0.0175207,0.0,-3.141593
25%,6.0,9.0,8.0,3.0,1.0,2010.0,41.31604,32.12014,17.05168,14.8095,7.065533,1.260383,-0.8825535
50%,8.5,14.0,16.0,6.0,3.0,2012.0,42.48545,34.6637,19.51251,18.20581,10.32159,2.159208,-0.1021593
75%,12.5,19.0,23.0,9.0,5.0,2013.0,43.68986,38.04554,22.03425,22.13181,14.16548,3.921065,2.243307
max,500.0,23.0,31.0,12.0,6.0,2015.0,34829.37,34814.21,34850.43,34821.83,34829.25,19108.8,3.141593


In [18]:
y = train_df['fare_amount']
train_df = train_df.drop(columns=['fare_amount'])
X_train,X_test,y_train,y_test = train_test_split(train_df,y,random_state=123,test_size=0.10)
del train_df
del y

In [23]:
lr = LinearRegression(normalize=True)
lr.fit(X_train,y_train)
XTEST = test_df.drop(['passenger_count'],axis=1)
y_pred = lr.predict(X_test)
print(lr.score(X_test,y_test))
RMSE = sqrt(mean_squared_error(y_test,y_pred))
print('RMSE for Linear Regression :',RMSE)
Submit_model(lr, 'lr.csv')

0.3806545129503843
RMSE for Linear Regression : 7.57483647462791


Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,11.363739
1,2015-01-27 13:08:24.0000003,9.653300
2,2011-10-08 11:53:44.0000002,8.840779
3,2012-12-01 21:12:12.0000002,10.661012
4,2012-12-01 21:12:12.0000003,11.106242
...,...,...
9909,2015-05-10 12:37:51.0000002,13.357842
9910,2015-01-12 17:05:51.0000001,13.769773
9911,2015-04-19 20:44:15.0000001,37.026550
9912,2015-01-31 01:05:19.0000005,12.151921


In [24]:
xgb = XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42,
                               n_estimators=500, max_depth=5, learning_rate=0.1, 
                               subsample=0.8, colsample_bytree=0.8)
xgb.fit(X_train, y_train)
XTEST = test_df.drop(['passenger_count'],axis=1)
y_pred = xgb.predict(X_test)
RMSE = sqrt(mean_squared_error(y_test,y_pred))
print('RMSE for XG Boost Regressor :',RMSE)
Submit_model(xgb, 'xgb_modelsubmission.csv')

RMSE for XG Boost Regressor : 3.7789418711790015


Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.719008
1,2015-01-27 13:08:24.0000003,11.351537
2,2011-10-08 11:53:44.0000002,4.871806
3,2012-12-01 21:12:12.0000002,8.391089
4,2012-12-01 21:12:12.0000003,15.470722
...,...,...
9909,2015-05-10 12:37:51.0000002,9.363904
9910,2015-01-12 17:05:51.0000001,10.792893
9911,2015-04-19 20:44:15.0000001,52.083069
9912,2015-01-31 01:05:19.0000005,19.072939


In [51]:
import lightgbm as lgbm
params = {
    'learning_rate':0.05,
    'application':'regression',
    'max_depth':7,
    'num_leaves':200,
    'verbosity':-1,
    'metric':'RMSE',
}
train_set= lgbm.Dataset(X_train,y_train,silent=True)
lb=lgbm.train(params,train_set=train_set)
y_pred=lb.predict(X_test)
RMSE = sqrt(mean_squared_error(y_test,y_pred))
print('RMSE for lightgbm :',RMSE)
XTEST = test_df.drop(['passenger_count'],axis=1)
Submit_model(lb, 'lightgbm_modelsubmission.csv')
model = lgbm.train(params, train_set = train_set, num_boost_round=10000,verbose_eval=500)

RMSE for lightgbm : 3.7491920085977934


In [27]:
gradient_boost = GradientBoostingRegressor(random_state=0)
gradient_boost.fit(X_train,y_train)
XTEST = test_df.drop(['passenger_count'],axis=1)
y_pred = gradient_boost.predict (X_test)
RMSE = sqrt(mean_squared_error(y_test,y_pred))
print('RMSE for Gradient boosting Regressor :',RMSE)
Submit_model(gradient_boost, 'gradient_boost_modelsubmission.csv')

RMSE for Gradient boosting Regressor : 3.909091383336689


Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,9.863660
1,2015-01-27 13:08:24.0000003,10.901917
2,2011-10-08 11:53:44.0000002,5.148622
3,2012-12-01 21:12:12.0000002,7.859740
4,2012-12-01 21:12:12.0000003,14.565746
...,...,...
9909,2015-05-10 12:37:51.0000002,9.221788
9910,2015-01-12 17:05:51.0000001,11.984941
9911,2015-04-19 20:44:15.0000001,54.009346
9912,2015-01-31 01:05:19.0000005,21.135122


In [29]:
rfg = RandomForestRegressor(max_depth=10, n_jobs=-1, random_state=7, n_estimators=50)
rfg.fit(X_train, y_train)
XTEST = test_df.drop(['passenger_count'],axis=1)
y_pred = rfg.predict (X_test)
RMSE = sqrt(mean_squared_error(y_test,y_pred))
print('RMSE for Random forest regressor :',RMSE)
Submit_model(rfg, 'randomforest_submission.csv')

RMSE for Random forest regressor : 3.887173648487748


Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,9.199643
1,2015-01-27 13:08:24.0000003,10.516933
2,2011-10-08 11:53:44.0000002,5.031999
3,2012-12-01 21:12:12.0000002,7.041899
4,2012-12-01 21:12:12.0000003,14.216866
...,...,...
9909,2015-05-10 12:37:51.0000002,9.919696
9910,2015-01-12 17:05:51.0000001,11.922394
9911,2015-04-19 20:44:15.0000001,55.083852
9912,2015-01-31 01:05:19.0000005,23.188777


In [52]:

prediction = model.predict(test_df, num_iteration = model.best_iteration,predict_disable_shape_check= True)      
submission = pd.DataFrame({
        "key": test_key,
        "fare_amount": prediction
})

submission.to_csv('Newyork_Taxi_Fare_Prediction_submission.csv',index=False)