In [78]:
import numpy as np 
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc

In [79]:
def clean_df(df):
    """
    Return the rows of ``df`` that have a plausible fare and non-zero coordinates.

    A row is kept when ``0 < fare_amount <= 500`` and none of the four
    pickup/dropoff longitude/latitude values equals 0.

    The result is an explicit copy rather than a bare boolean-indexed slice, so
    code that later adds or overwrites columns on it does not trigger pandas'
    ``SettingWithCopyWarning``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fare_amount``, ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude`` columns.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df``; the original index labels are preserved.
    """
    fare_ok = (df.fare_amount > 0) & (df.fare_amount <= 500)
    # The passenger_count range filter is currently disabled:
    # (df.passenger_count >= 0) & (df.passenger_count <= 8)
    coords_ok = (
        (df.pickup_longitude != 0) & (df.pickup_latitude != 0)
        & (df.dropoff_longitude != 0) & (df.dropoff_latitude != 0)
    )
    return df[fare_ok & coords_ok].copy()

In [80]:
# Haversine (great-circle) distance helper
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle (haversine) distance between pickup and dropoff.

    All four arguments are coordinates in degrees; they are passed through
    ``np.radians`` so scalars, NumPy arrays and pandas Series are all accepted.
    The result is expressed in kilometres (Earth radius taken as 6371 km).
    """
    earth_radius_km = 6371

    # Work in radians from here on.
    lat1, lon1, lat2, lon2 = (
        np.radians(angle)
        for angle in (pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    )

    # Half-angle differences used by the haversine formula.
    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0

    hav = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))


In [81]:
def sphere_dist_bear(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the initial great-circle bearing from pickup to dropoff, in radians.

    All four arguments are coordinates in degrees (scalars, NumPy arrays or
    pandas Series). The result lies in ``[-pi, pi]`` with 0 meaning due north.

    Sign convention: the longitude difference is taken as ``pickup - dropoff``,
    so the angle is measured counter-clockwise from north (a trip heading due
    west yields ``+pi/2``, due east ``-pi/2``). This is the negative of the usual
    compass bearing and is kept so the feature keeps its existing orientation.
    """
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon,
                                                              dropoff_lat, dropoff_lon])
    dlon = pickup_lon - dropoff_lon

    # Forward-azimuth formula: atan2(sin(dlon) * cos(lat2), ...).
    # The cosine must multiply sin(dlon); it must not be applied inside the sine's argument.
    y = np.sin(dlon) * np.cos(dropoff_lat)
    x = (np.cos(pickup_lat) * np.sin(dropoff_lat)
         - np.sin(pickup_lat) * np.cos(dropoff_lat) * np.cos(dlon))
    return np.arctan2(y, x)


In [82]:
def radian_conv(degree):
    """
    Convert an angle given in degrees to radians.

    Thin wrapper around ``np.radians``; accepts whatever that function accepts
    (scalar, array, Series) and returns the converted value.
    """
    in_radians = np.radians(degree)
    return in_radians


In [83]:
def add_airport_dist(dataset):
    """
    Add landmark-distance features to ``dataset`` in place and return it.

    For every landmark below a column is written that holds the distance
    pickup -> landmark plus landmark -> dropoff, both legs computed with
    ``sphere_dist``:

    jfk_dist: John F. Kennedy International Airport
    ewr_dist: Newark Liberty International Airport
    lga_dist: LaGuardia Airport
    sol_dist: Statue of Liberty
    nyc_dist: New York city centre

    ``dataset`` must provide ``pickup_latitude``, ``pickup_longitude``,
    ``dropoff_latitude`` and ``dropoff_longitude`` in degrees.
    """
    # (output column, (latitude, longitude)) in the order the columns are added.
    landmarks = [
        ('jfk_dist', (40.639722, -73.778889)),
        ('ewr_dist', (40.6925, -74.168611)),
        ('lga_dist', (40.77725, -73.872611)),
        ('sol_dist', (40.6892, -74.0445)),
        ('nyc_dist', (40.7141667, -74.0063889)),
    ]

    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']

    # Compute every total first so the frame is only modified once all of them succeeded.
    totals = [
        (column,
         sphere_dist(pickup_lat, pickup_lon, lat, lon)
         + sphere_dist(lat, lon, dropoff_lat, dropoff_lon))
        for column, (lat, lon) in landmarks
    ]

    for column, total in totals:
        dataset[column] = total

    return dataset

In [84]:
def add_datetime_info(dataset):
    """
    Parse ``pickup_datetime`` and add calendar features to ``dataset`` in place.

    The ``pickup_datetime`` column is replaced by its parsed datetime form
    (expected text layout: ``YYYY-MM-DD HH:MM:SS UTC``), then the columns
    ``hour``, ``day``, ``month``, ``weekday`` and ``year`` are added from it.
    The same (mutated) frame is returned.
    """
    # Parse the timestamp strings into datetimes.
    dataset['pickup_datetime'] = pd.to_datetime(
        dataset['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC"
    )

    stamps = dataset['pickup_datetime'].dt
    # Each feature column is named after the datetime accessor it is read from.
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[part] = getattr(stamps, part)

    return dataset

In [85]:
# Training subset of the NY taxi fare data, read straight from object storage.
TRAIN_CSV_URL = 'https://s3.wasabisys.com/iguazio/data/Taxi/ny_taxi_train_subset.csv'
train_df = pd.read_csv(TRAIN_CSV_URL)

In [86]:
# Preview the raw training frame as loaded.
train_df

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2013-07-10 19:50:38.0000004,10.0,2013-07-10 19:50:38 UTC,-73.990625,40.746708,-73.972030,40.763642,1
1,2012-01-28 16:13:24.0000004,10.5,2012-01-28 16:13:24 UTC,-73.972224,40.762571,-74.000314,40.742623,1
2,2013-02-26 08:24:00.000000242,10.0,2013-02-26 08:24:00 UTC,-73.976538,40.765152,-73.991300,40.739217,1
3,2010-09-30 22:04:00.000000248,13.7,2010-09-30 22:04:00 UTC,-73.975998,40.781397,-73.974398,40.736822,1
4,2013-04-09 16:47:00.000000115,7.0,2013-04-09 16:47:00 UTC,-73.782105,40.644887,-73.985012,40.757882,2
...,...,...,...,...,...,...,...,...
999995,2015-01-22 01:28:07.0000003,15.5,2015-01-22 01:28:07 UTC,-73.959343,40.814499,-73.958977,40.763290,1
999996,2013-01-28 12:39:00.000000117,6.5,2013-01-28 12:39:00 UTC,-73.970227,40.793900,-73.978307,40.777952,1
999997,2014-12-24 11:01:00.00000094,6.5,2014-12-24 11:01:00 UTC,-73.996990,40.745555,-74.008135,40.745050,3
999998,2009-11-06 17:32:00.000000202,24.9,2009-11-06 17:32:00 UTC,-73.975290,40.756642,-73.873037,40.774488,1


In [87]:
# Discard every row that has at least one missing value.
train_df = train_df.dropna(axis=0, how='any')

In [88]:
# Keep only rows with a fare in (0, 500] and non-zero pickup/dropoff coordinates.
train_df = clean_df(train_df)

In [89]:
# Calendar and landmark-distance features (both helpers update the frame they are given).
train_df = add_airport_dist(add_datetime_info(train_df))

coordinate_columns = ['pickup_latitude', 'pickup_longitude',
                      'dropoff_latitude', 'dropoff_longitude']

# Trip distance and bearing take coordinates in degrees, so they must be
# computed before the coordinate columns are overwritten with radians below.
train_df['distance'] = sphere_dist(*(train_df[name] for name in coordinate_columns))
train_df['bearing'] = sphere_dist_bear(*(train_df[name] for name in coordinate_columns))

for _column in coordinate_columns:
    train_df[_column] = radian_conv(train_df[_column])

# Separate the target from the features and drop the identifier / raw timestamp columns.
y = train_df['fare_amount']
train_df = train_df.drop(columns=['key', 'pickup_datetime', 'fare_amount'])

In [90]:
# Inspect the engineered feature matrix (key, pickup_datetime and fare_amount removed).
train_df

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,-1.291380,0.711164,-1.291055,0.711460,1,19,10,7,2,2013,42.781743,34.522584,19.011441,18.129112,10.071057,2.449245,-0.693731
1,-1.291059,0.711441,-1.291549,0.711093,1,16,28,1,5,2012,43.161762,33.530732,19.970109,17.192294,9.309011,3.243245,2.323757
2,-1.291134,0.711486,-1.291392,0.711033,1,8,26,2,1,2013,42.779232,33.910031,19.709084,17.348085,9.264046,3.140503,2.734410
3,-1.291125,0.711770,-1.291097,0.710992,1,22,30,9,3,2010,42.600874,36.096412,18.397860,19.699045,11.591554,4.958346,-3.114401
4,-1.287741,0.709387,-1.291282,0.711359,2,16,9,4,1,2013,22.419926,50.119795,26.285322,31.808353,25.605667,21.223600,0.936099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,-1.290834,0.712347,-1.290828,0.711454,1,1,22,1,3,2015,45.152154,41.582243,15.830866,26.616559,18.606191,5.694208,-3.136176
999996,-1.291024,0.711988,-1.291165,0.711710,1,12,28,1,0,2013,46.313125,38.798934,17.324667,24.552892,16.851320,1.899339,2.775249
999997,-1.291491,0.711144,-1.291686,0.711135,3,11,24,12,2,2014,44.430794,30.353257,23.015190,14.361739,7.016164,0.940570,1.630470
999998,-1.291112,0.711338,-1.289328,0.711649,1,17,6,11,4,2009,38.008386,44.304671,9.254886,26.782352,18.485385,8.837186,-1.343733


In [91]:
# Target vector: fare_amount of the cleaned training rows.
y

0         10.0
1         10.5
2         10.0
3         13.7
4          7.0
          ... 
999995    15.5
999996     6.5
999997     6.5
999998    24.9
999999     7.3
Name: fare_amount, Length: 980020, dtype: float64

In [92]:
# Hold out 10% of the rows; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train_df,
    y,
    test_size=0.10,
    random_state=123,
)

In [93]:
# The split frames are used from here on; drop the full frame and target, then collect garbage.
del train_df, y
gc.collect()

140

In [94]:
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':50000
    }

In [95]:
# Calendar columns handed to LightGBM as categorical features.
calendar_categoricals = ('year', 'month', 'day', 'weekday')

train_set = lgbm.Dataset(
    x_train,
    y_train,
    silent=False,
    categorical_feature=list(calendar_categoricals),
)
valid_set = lgbm.Dataset(
    x_test,
    y_test,
    silent=False,
    categorical_feature=list(calendar_categoricals),
)

# The hold-out split serves as the early-stopping validation set;
# progress is logged every 500 rounds.
model = lgbm.train(
    params,
    train_set=train_set,
    num_boost_round=10000,
    early_stopping_rounds=500,
    verbose_eval=500,
    valid_sets=valid_set,
)

# Release the notebook's own references to the split frames.
del x_train, y_train, x_test, y_test
gc.collect()



Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.94419
[1000]	valid_0's rmse: 3.90426
[1500]	valid_0's rmse: 3.88816
[2000]	valid_0's rmse: 3.87736
[2500]	valid_0's rmse: 3.87228
[3000]	valid_0's rmse: 3.86966
[3500]	valid_0's rmse: 3.86657
[4000]	valid_0's rmse: 3.86669
Early stopping, best iteration is:
[3703]	valid_0's rmse: 3.86525


550

In [96]:
# Unlabelled rows to predict fares for, read straight from object storage.
TEST_CSV_URL = 'https://s3.wasabisys.com/iguazio/data/Taxi/ny_taxi_train_test.csv'
test_df = pd.read_csv(TEST_CSV_URL)

In [97]:
# Preview the raw test frame as loaded.
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.751260,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.981160,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51 UTC,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15 UTC,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6


In [98]:
# Apply the same feature steps that were applied to the training frame.
test_df = add_airport_dist(add_datetime_info(test_df))

trip_coordinate_columns = ['pickup_latitude', 'pickup_longitude',
                           'dropoff_latitude', 'dropoff_longitude']

# Distance and bearing take degrees, so compute them before the coordinate
# columns are replaced by their radian values.
test_df['distance'] = sphere_dist(*(test_df[name] for name in trip_coordinate_columns))
test_df['bearing'] = sphere_dist_bear(*(test_df[name] for name in trip_coordinate_columns))

for _column in trip_coordinate_columns:
    test_df[_column] = radian_conv(test_df[_column])

In [99]:
# Inspect the test frame after feature engineering.
test_df

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24,-1.291078,0.711463,-1.291220,0.711114,1,13,27,1,1,2015,42.055277,35.042878,18.501273,18.309374,10.095252,2.323260,2.843096
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24,-1.291314,0.710687,-1.291524,0.711033,1,13,27,1,1,2015,41.244373,30.827012,23.023659,12.664639,4.599471,2.425353,0.430894
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44,-1.291239,0.711244,-1.291189,0.711154,1,11,8,10,5,2011,41.831497,33.993679,19.353797,17.018308,8.797452,0.618628,-2.740065
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12,-1.291215,0.711532,-1.291377,0.711250,1,21,1,12,5,2012,43.964285,34.268523,19.525661,18.544904,10.709378,1.961033,2.731208
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12,-1.290951,0.711916,-1.291344,0.711124,1,21,1,12,5,2012,44.128523,36.440152,18.414857,20.732468,12.752865,5.387301,2.781733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51,-1.290987,0.712042,-1.290769,0.711752,6,12,10,5,6,2015,45.269299,40.932932,15.335748,26.208435,18.272382,2.124874,-2.624228
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51,-1.290593,0.712157,-1.290849,0.711682,6,17,12,1,0,2015,44.556623,42.372609,14.177726,27.226723,19.124297,3.270969,2.753453
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15,-1.291397,0.710813,-1.287874,0.709424,6,20,19,4,6,2015,21.595562,47.745807,27.574859,28.090684,21.598123,19.183941,-2.049182
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19,-1.291292,0.710967,-1.290482,0.712125,6,1,31,1,5,2015,42.936139,38.975927,16.820474,22.487426,14.206953,8.343486,-0.487073


In [100]:
# Keep the row identifiers for the submission, then strip the non-feature columns.
non_feature_columns = ['key', 'pickup_datetime']
test_key = test_df['key']
test_df = test_df.drop(columns=non_feature_columns)

In [101]:
# Check the test features after dropping the key / pickup_datetime columns.
test_df

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,-1.291078,0.711463,-1.291220,0.711114,1,13,27,1,1,2015,42.055277,35.042878,18.501273,18.309374,10.095252,2.323260,2.843096
1,-1.291314,0.710687,-1.291524,0.711033,1,13,27,1,1,2015,41.244373,30.827012,23.023659,12.664639,4.599471,2.425353,0.430894
2,-1.291239,0.711244,-1.291189,0.711154,1,11,8,10,5,2011,41.831497,33.993679,19.353797,17.018308,8.797452,0.618628,-2.740065
3,-1.291215,0.711532,-1.291377,0.711250,1,21,1,12,5,2012,43.964285,34.268523,19.525661,18.544904,10.709378,1.961033,2.731208
4,-1.290951,0.711916,-1.291344,0.711124,1,21,1,12,5,2012,44.128523,36.440152,18.414857,20.732468,12.752865,5.387301,2.781733
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,-1.290987,0.712042,-1.290769,0.711752,6,12,10,5,6,2015,45.269299,40.932932,15.335748,26.208435,18.272382,2.124874,-2.624228
9910,-1.290593,0.712157,-1.290849,0.711682,6,17,12,1,0,2015,44.556623,42.372609,14.177726,27.226723,19.124297,3.270969,2.753453
9911,-1.291397,0.710813,-1.287874,0.709424,6,20,19,4,6,2015,21.595562,47.745807,27.574859,28.090684,21.598123,19.183941,-2.049182
9912,-1.291292,0.710967,-1.290482,0.712125,6,1,31,1,5,2015,42.936139,38.975927,16.820474,22.487426,14.206953,8.343486,-0.487073


In [102]:
# Score the test rows with the model as of its best early-stopping iteration.
prediction = model.predict(test_df, num_iteration=model.best_iteration)

# Pair each prediction with the key of the row it belongs to.
submission_columns = {"key": test_key, "fare_amount": prediction}
submission = pd.DataFrame(submission_columns)

In [103]:
# Write the predictions (without the DataFrame index) to the shared data mount.
SUBMISSION_PATH = '/v3io/bigdata/ny-taxi/taxi_fare_submission.csv'
submission.to_csv(SUBMISSION_PATH, index=False)