In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read only the first 100,000 rows of the training CSV, then show (rows, columns).
train = pd.read_csv('../input/train.csv', nrows = 100000)
train.shape

(100000, 8)

In [3]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
# Read the full test CSV (the rows to predict), then show (rows, columns).
test = pd.read_csv('../input/test.csv')
test.shape

(9914, 7)

In [5]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [6]:
# Count missing values per column.
train.isna().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [7]:
# Discard any row containing a missing value, then summarise the numeric columns.
train = train.dropna()
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,11.354652,-72.494682,39.914481,-72.490967,39.919053,1.67382
std,9.716777,10.693934,6.225686,10.471386,6.213427,1.300171
min,-44.9,-736.55,-74.00767,-84.654241,-74.006377,0.0
25%,6.0,-73.992041,40.734996,-73.991215,40.734182,1.0
50%,8.5,-73.981789,40.752765,-73.98,40.753243,1.0
75%,12.5,-73.966982,40.767258,-73.963433,40.768166,2.0
max,200.0,40.787575,401.083332,40.851027,404.616667,6.0


In [8]:
# Keep only rides whose fare is strictly positive.
train = train.query('fare_amount > 0')
train.shape

(99988, 8)

In [9]:
def distance(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two coordinate pairs.

    Inputs are treated as degrees (each is scaled by pi/180 before the
    trigonometric calls) and may be scalars or array-likes that NumPy can
    broadcast, e.g. pandas Series. The result is
    ``0.6213712 * 12742 * arcsin(sqrt(a))`` where ``a`` is the haversine
    term; 12742 matches Earth's mean diameter in km and 0.6213712 the
    km-to-mile factor, so the value is presumably in miles.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    earth_diameter = 12742
    unit_scale = 0.6213712

    # Latitude and longitude contributions to the haversine term.
    lat_term = 0.5 - np.cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad) * (1 - np.cos((lon2 - lon1) * deg_to_rad)) / 2
    haversine = lat_term + lon_term

    return unit_scale * earth_diameter * np.arcsin(np.sqrt(haversine))

In [10]:
# Add the pickup-to-dropoff distance as a feature on both frames.
for frame in (train, test):
    frame['distance'] = distance(
        frame['pickup_latitude'],
        frame['pickup_longitude'],
        frame['dropoff_latitude'],
        frame['dropoff_longitude'],
    )

In [11]:
# Keep training rides shorter than 15 distance units; NaN distances are dropped too.
train = train.loc[train['distance'].lt(15)]
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
count,99690.0,99690.0,99690.0,99690.0,99690.0,99690.0,99690.0
mean,11.31335,-72.554007,39.944326,-72.552828,39.944712,1.673959,2.015638
std,9.561078,10.260582,6.011748,10.260455,6.011816,1.300275,2.188023
min,0.01,-75.414728,-74.00767,-75.354559,-74.006377,0.0,0.0
25%,6.0,-73.992052,40.735095,-73.99122,40.734237,1.0,0.753459
50%,8.5,-73.981814,40.752824,-73.98003,40.753261,1.0,1.312705
75%,12.5,-73.967093,40.767276,-73.963563,40.768162,2.0,2.398401
max,200.0,40.787575,42.160275,40.851027,42.168717,6.0,14.991043


In [12]:
# Drop rides with zero passengers or with ten or more.
train = train[train['passenger_count'].ne(0) & train['passenger_count'].lt(10)]

In [13]:
# Derive pickup hour and year for both frames.
# Each timestamp column is parsed once, in vectorised form, rather than
# calling pd.to_datetime per row through .apply (twice per frame), which is
# orders of magnitude slower on 100k rows.
for frame in (train, test):
    pickup_time = pd.to_datetime(frame['pickup_datetime'], utc=True)
    frame['hour'] = pickup_time.dt.hour
    frame['year'] = pickup_time.dt.year

In [14]:
# Model inputs are limited to these two columns; the target is the fare.
feat_cols = ['distance', 'passenger_count']
X, y = train[feat_cols], train['fare_amount']

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. random_state is fixed so the split,
# and every metric and submission derived from it, is identical after a
# kernel restart and full re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [16]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split and predict fares for the
# submission rows (fit() returns the estimator itself).
regressor = LinearRegression().fit(X_train, y_train)
y_pred_lin_reg = regressor.predict(test[feat_cols])

In [17]:
# Pair each test key with its linear-regression prediction (columns: key, fare_amount).
submission = test[['key']].assign(fare_amount=y_pred_lin_reg)
submission.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,9.228636
1,2015-01-27 13:08:24.0000003,9.459609
2,2011-10-08 11:53:44.0000002,5.372132
3,2012-12-01 21:12:12.0000002,8.409145
4,2012-12-01 21:12:12.0000003,16.160625


In [18]:
# Write the linear-regression submission to disk without the index column.
submission.to_csv('Linear_regression.csv', index = False)

In [19]:
from sklearn.ensemble import RandomForestRegressor

# 500-tree random forest fitted on the training split.
# random_state pins the bootstrap and feature sampling so predictions are
# reproducible across runs; n_jobs=-1 builds trees on all cores and does not
# change the fitted model.
r_reg = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
r_reg.fit(X_train, y_train)
y_pred_rand_forest = r_reg.predict(test[feat_cols])

In [20]:
# Pair each test key with its random-forest prediction (columns: key, fare_amount).
submission = test[['key']].assign(fare_amount=y_pred_rand_forest)
submission.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,8.4092
1,2015-01-27 13:08:24.0000003,9.5956
2,2011-10-08 11:53:44.0000002,4.86
3,2012-12-01 21:12:12.0000002,6.9996
4,2012-12-01 21:12:12.0000003,9.78


In [21]:
# Write the random-forest submission to disk without the index column.
submission.to_csv('Random_Forest_regression.csv', index = False)

In [22]:
import xgboost as xgb

def XGBoost(X_train,X_test,y_train,y_test,num_rounds=500):
    """Train an XGBoost regressor with early stopping on a hold-out set.

    X_train / y_train are used for fitting; X_test / y_test form the
    evaluation set (reported as 'test') on which RMSE is tracked. Training
    runs for at most ``num_rounds`` boosting rounds and stops early once the
    evaluation metric has not improved for 20 rounds.

    Returns whatever ``xgb.train`` returns (the trained booster).
    """
    booster_params = {'objective': 'reg:linear', 'eval_metric': 'rmse'}
    fit_matrix = xgb.DMatrix(X_train, label=y_train)
    holdout_matrix = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(holdout_matrix, 'test')]
    return xgb.train(
        params=booster_params,
        dtrain=fit_matrix,
        num_boost_round=num_rounds,
        evals=watchlist,
        early_stopping_rounds=20,
    )

# Train the booster, then predict the submission rows using only the trees up
# to the best early-stopping iteration.
xgbm = XGBoost(X_train, X_test, y_train, y_test)
submission_matrix = xgb.DMatrix(test[feat_cols])
xgbm_pred = xgbm.predict(submission_matrix, ntree_limit=xgbm.best_ntree_limit)

  if getattr(data, 'base', None) is not None and \


[0]	test-rmse:10.4988
Will train until test-rmse hasn't improved in 20 rounds.
[1]	test-rmse:8.04751
[2]	test-rmse:6.52163
[3]	test-rmse:5.64071
[4]	test-rmse:5.16336
[5]	test-rmse:4.91795
[6]	test-rmse:4.79996
[7]	test-rmse:4.74465
[8]	test-rmse:4.70885
[9]	test-rmse:4.6995
[10]	test-rmse:4.6938
[11]	test-rmse:4.69278
[12]	test-rmse:4.6887
[13]	test-rmse:4.68845
[14]	test-rmse:4.69132
[15]	test-rmse:4.69216
[16]	test-rmse:4.6945
[17]	test-rmse:4.69754
[18]	test-rmse:4.70064
[19]	test-rmse:4.70166
[20]	test-rmse:4.7002
[21]	test-rmse:4.70298
[22]	test-rmse:4.70376
[23]	test-rmse:4.70507
[24]	test-rmse:4.70462
[25]	test-rmse:4.70679
[26]	test-rmse:4.70766
[27]	test-rmse:4.70906
[28]	test-rmse:4.71091
[29]	test-rmse:4.71233
[30]	test-rmse:4.71293
[31]	test-rmse:4.71484
[32]	test-rmse:4.7185
[33]	test-rmse:4.72231
Stopping. Best iteration:
[13]	test-rmse:4.68845



In [23]:
# Pair each test key with its XGBoost prediction (columns: key, fare_amount).
submission = test[['key']].assign(fare_amount=xgbm_pred)
submission.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,8.880149
1,2015-01-27 13:08:24.0000003,9.059376
2,2011-10-08 11:53:44.0000002,4.798314
3,2012-12-01 21:12:12.0000002,7.916535
4,2012-12-01 21:12:12.0000003,15.403738


In [24]:
# Write the XGBoost submission to disk without the index column.
submission.to_csv('XGboost_regression.csv', index = False)