In [1]:
import numpy as np
import pandas as pd

n = 55000000 # Number of total rows
s = 100000 # Desired sample size
skip = sorted(np.random.choice(range(n), n-s, replace=False))
skip[0] = 1
train = pd.read_csv('train.csv', skiprows=skip, header=0)

test = pd.read_csv('test.csv')

train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])

train['month'] = train['pickup_datetime'].dt.month
train['hour'] = train['pickup_datetime'].dt.hour
train['day'] = train['pickup_datetime'].dt.day
train['weekday'] = train['pickup_datetime'].dt.weekday
train = train.drop(['pickup_datetime'],axis=1)

test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

test['month'] = test['pickup_datetime'].dt.month
test['hour'] = test['pickup_datetime'].dt.hour
test['day'] = test['pickup_datetime'].dt.day
test['weekday'] = test['pickup_datetime'].dt.weekday
test = test.drop(['pickup_datetime'],axis=1)

train = train.dropna()

print(train.head())

                             key  fare_amount  pickup_longitude  \
0    2011-03-19 15:00:27.0000001          8.9        -73.981030   
1    2010-04-16 20:35:21.0000004          8.5        -73.944602   
2    2013-01-24 10:15:00.0000002          4.0        -73.976085   
3  2010-06-16 21:29:00.000000168          6.5        -73.984437   
4   2014-05-03 19:57:00.00000036         15.0        -73.994265   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  \
0        40.729782         -73.993011         40.752803                1   
1        40.815676         -73.917888         40.806313                1   
2        40.755877         -73.979065         40.750322                6   
3        40.750902         -73.977672         40.760995                1   
4        40.750880         -73.988083         40.737902                1   

   month  hour  day  weekday  
0      3    15   19        5  
1      4    20   16        4  
2      1    10   24        3  
3      6    21  

In [2]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

In [3]:
cols = ['month','hour','day','weekday']
for col in cols:
    train = create_dummies(train, col)
    test = create_dummies(test, col)


In [4]:
train = train.drop(['month','hour','day','weekday'],axis=1)
train.head()

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month_1,month_2,month_3,...,day_29,day_30,day_31,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,2011-03-19 15:00:27.0000001,8.9,-73.98103,40.729782,-73.993011,40.752803,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,2010-04-16 20:35:21.0000004,8.5,-73.944602,40.815676,-73.917888,40.806313,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2013-01-24 10:15:00.0000002,4.0,-73.976085,40.755877,-73.979065,40.750322,6,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2010-06-16 21:29:00.000000168,6.5,-73.984437,40.750902,-73.977672,40.760995,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2014-05-03 19:57:00.00000036,15.0,-73.994265,40.75088,-73.988083,40.737902,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
cols = list(train.columns)
features = cols[2:]
target = ['fare_amount']
features

['pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8',
 'month_9',
 'month_10',
 'month_11',
 'month_12',
 'hour_0',
 'hour_1',
 'hour_2',
 'hour_3',
 'hour_4',
 'hour_5',
 'hour_6',
 'hour_7',
 'hour_8',
 'hour_9',
 'hour_10',
 'hour_11',
 'hour_12',
 'hour_13',
 'hour_14',
 'hour_15',
 'hour_16',
 'hour_17',
 'hour_18',
 'hour_19',
 'hour_20',
 'hour_21',
 'hour_22',
 'hour_23',
 'day_1',
 'day_2',
 'day_3',
 'day_4',
 'day_5',
 'day_6',
 'day_7',
 'day_8',
 'day_9',
 'day_10',
 'day_11',
 'day_12',
 'day_13',
 'day_14',
 'day_15',
 'day_16',
 'day_17',
 'day_18',
 'day_19',
 'day_20',
 'day_21',
 'day_22',
 'day_23',
 'day_24',
 'day_25',
 'day_26',
 'day_27',
 'day_28',
 'day_29',
 'day_30',
 'day_31',
 'weekday_0',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6']

In [6]:
#Clean up the trian dataset to eliminate out of range values
train = train[train['fare_amount'] > 0]
train = train[train['pickup_longitude'] < -72]
train = train[(train['pickup_latitude'] > 40) & (train['pickup_latitude'] < 44)]
train = train[train['dropoff_longitude'] < -72]
train = train[(train['dropoff_latitude'] > 40) & (train['dropoff_latitude'] < 44)]
train = train[(train['passenger_count'] > 0) & (train['passenger_count'] < 10)]
train.dtypes

key                   object
fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
month_1                uint8
month_2                uint8
month_3                uint8
month_4                uint8
month_5                uint8
month_6                uint8
month_7                uint8
month_8                uint8
month_9                uint8
month_10               uint8
month_11               uint8
month_12               uint8
hour_0                 uint8
hour_1                 uint8
hour_2                 uint8
hour_3                 uint8
hour_4                 uint8
hour_5                 uint8
hour_6                 uint8
hour_7                 uint8
hour_8                 uint8
hour_9                 uint8
hour_10                uint8
                      ...   
day_9                  uint8
day_10                 uint8
day_11                 uint8
day_12        

In [7]:
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month_1,month_2,month_3,month_4,...,day_29,day_30,day_31,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
count,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,...,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0,511029.0
mean,11.335625,-73.9822,40.751051,-73.986551,40.75138,1.690677,0.088946,0.084115,0.094153,0.092284,...,0.029838,0.029362,0.018739,0.12812,0.139769,0.145236,0.14923,0.152878,0.1527,0.132067
std,9.656648,2.086415,0.030315,3.760865,0.033439,1.305468,0.284666,0.27756,0.292042,0.289427,...,0.17014,0.16882,0.135601,0.334224,0.346748,0.352339,0.356316,0.35987,0.359698,0.338564
min,0.01,-740.55,40.081896,-2216.325133,40.027649,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,-73.99232,40.736592,-73.991582,40.73562,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.5,-73.982101,40.753462,-73.980605,40.753841,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,12.5,-73.968292,40.767557,-73.965396,40.768373,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,300.0,-72.176306,43.043032,-72.179201,43.267723,9.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

train_X, test_X, train_y, test_y = train_test_split(
    train[features], train[target], test_size=0.4,random_state=0)

clf = LinearRegression().fit(train_X, train_y)
predictions = clf.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

9.398519127304494

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

clf1 = RandomForestRegressor(n_estimators=10, criterion='mse')
grid_values = {'max_features': [i for i in range(1,15)],
              'max_depth' : [i for i in range(1,5)],
              'min_samples_split' : [i for i in range(2,5)]}
grid_clf_acc = GridSearchCV(clf1, param_grid = grid_values)
grid_clf_acc.fit(train_X, train_y)
print('Grid best parameter (max. score): ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)

  from numpy.core.umath_tests import inner1d
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


('Grid best parameter (max. score): ', {'max_features': 13, 'min_samples_split': 4, 'max_depth': 4})
('Grid best score: ', 0.445440588620677)


In [11]:
clf1 = RandomForestRegressor(n_estimators=10, criterion='mse',max_features=grid_clf_acc.best_params_['max_features'], 
                             min_samples_split= grid_clf_acc.best_params_['min_samples_split'], 
                             max_depth= grid_clf_acc.best_params_['max_depth']).fit(train_X,train_y)

predictions = clf1.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

  This is separate from the ipykernel package so we can avoid doing imports until


7.315263080618265

In [12]:
test.head()

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month,hour,day,weekday,...,day_29,day_30,day_31,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,1,13,27,1,...,0,0,0,0,1,0,0,0,0,0
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,1,13,27,1,...,0,0,0,0,1,0,0,0,0,0
2,2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,10,11,8,5,...,0,0,0,0,0,0,0,0,1,0
3,2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,12,21,1,5,...,0,0,0,0,0,0,0,0,1,0
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,12,21,1,5,...,0,0,0,0,0,0,0,0,1,0


In [38]:
from sklearn.ensemble import GradientBoostingRegressor

clf2 = GradientBoostingRegressor().fit(train_X, train_y)

predictions = clf2.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

4.810585458372285

In [22]:
#Predict final fares for submission
GBPredictions = clf2.predict(test[features])
GBPredictions = np.round(GBPredictions, decimals=2)
GBPredictions

array([ 8.14,  8.7 ,  8.11, ..., 48.8 , 19.65,  8.13])

In [30]:
#Check predictions have the correct dimensions
GBPredictions.size


9914

In [31]:
#Set up predictions for a submittable dataframe
GB_submission = pd.DataFrame({"key": test['key'],"fare_amount": GBPredictions},columns = ['key','fare_amount'])

In [36]:
GB_submission.to_csv('GB_submission.csv',index=False)

Gradient Boosting scores 4.45136

In [33]:
#Predict final fares for submission
RFPredictions = clf1.predict(test[features])
RFPredictions = np.round(RFPredictions, decimals=2)
RFPredictions

array([10.  , 10.  , 10.17, ..., 31.21, 10.12, 10.02])

In [34]:
#Set up predictions for a submittable dataframe
RF_submission = pd.DataFrame({"key": test['key'],"fare_amount": RFPredictions},columns = ['key','fare_amount'])

In [35]:
RF_submission.to_csv('RF_submission.csv',index=False)

Random Forest scoes 7.028

In [39]:
from sklearn.neighbors import KNeighborsRegressor

clf3 = KNeighborsRegressor().fit(train_X, train_y)

predictions = clf3.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

9.37727119729122

In [40]:
from sklearn.linear_model import SGDRegressor

clf4 = SGDRegressor().fit(train_X, train_y)

predictions = clf4.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse



2010462310755.5288

In [42]:
from sklearn.neural_network import MLPRegressor

clf5 = MLPRegressor(hidden_layer_sizes = [100, 100]).fit(train_X, train_y)

predictions = clf5.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

9.60386159488703

In [43]:
from sklearn.linear_model import Ridge

clf6 = Ridge().fit(train_X, train_y)

predictions = clf6.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

9.398542489749142

In [44]:
from sklearn.linear_model import Lasso

clf7 = Lasso().fit(train_X, train_y)

predictions = clf7.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

9.628351727206448

In [46]:
from sklearn.svm import SVR

clf8 = SVR().fit(train_X, train_y)

predictions = clf8.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse

9.419686045391552

Using GridSearch to find the best parameters for Gradient Boosting Regressor

In [47]:
clf9 = GradientBoostingRegressor()

grid_values = {'loss': ['ls', 'lad', 'huber', 'quantile'],
              'criterion' : ['friedman_mse','mse'],
              'learning_rate' : [0.01,0.1,1],
              'min_samples_leaf': [i for i in range(1,5)],
              'min_samples_split': [i for i in range(2,5)]}
grid_clf_acc = GridSearchCV(clf9, param_grid = grid_values)
grid_clf_acc.fit(train_X, train_y)
print('Grid best parameter (max. score): ', grid_clf_acc.best_params_)
print('Grid best score: ', grid_clf_acc.best_score_)

('Grid best parameter (max. score): ', {'min_samples_split': 2, 'loss': 'huber', 'learning_rate': 1, 'criterion': 'friedman_mse', 'min_samples_leaf': 2})
('Grid best score: ', 0.7806194193640434)


In [48]:
clf9 = GradientBoostingRegressor(min_samples_split=grid_clf_acc.best_params_['min_samples_split'],
                                loss=grid_clf_acc.best_params_['loss'],
                                learning_rate=grid_clf_acc.best_params_['learning_rate'],
                                criterion=grid_clf_acc.best_params_['criterion'],
                                min_samples_leaf=grid_clf_acc.best_params_['min_samples_leaf']).fit(train_X, train_y)

predictions = clf9.predict(test_X)
lrmse = np.sqrt(metrics.mean_squared_error(test_y, predictions))
lrmse


4.423267789310712

In [49]:
#Predict final fares for submission
GB_New_Predictions = clf9.predict(test[features])
GB_New_Predictions = np.round(GB_New_Predictions, decimals=2)
#Set up predictions for a submittable dataframe
GB_New_submission = pd.DataFrame({"key": test['key'],"fare_amount": GB_New_Predictions},
                                 columns = ['key','fare_amount'])
GB_New_submission.to_csv('GB_New_submission.csv',index=False)

Gradient Boosting with new parameters scores 3.8543