In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [100]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [101]:
train_normal = train[(train['pickup_longitude'] > -74.023822) & (train['pickup_latitude'] < 40.886345) & (train['pickup_longitude'] < -73.901599) & (train['pickup_latitude'] > 40.698160)]

In [102]:
#sns.regplot(x = 'pickup_longitude', y = 'pickup_latitude', data = train_normal, scatter = True, scatter_kws={"s": 10})

In [103]:
train['passenger_count'].value_counts()

1    1033540
2     210318
5      78088
3      59896
6      48333
4      28404
0         60
7          3
9          1
8          1
Name: passenger_count, dtype: int64

In [104]:
train['vendor_id'].value_counts()

2    780302
1    678342
Name: vendor_id, dtype: int64

In [105]:
train['store_and_fwd_flag'].value_counts()

N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [106]:
train['vendor_id'].corr(train['trip_duration'])

0.020304177490054781

In [107]:
train.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [108]:
train.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [109]:
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])
train.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
dtype: object

## Month and hour are the only features I care about

In [110]:
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,17
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,11
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,19
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,13


In [111]:
train['pickup_month'] = train['pickup_datetime'].dt.month
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,pickup_month
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,17,3
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,0,6
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,11,1
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,19,4
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,13,3


In [112]:
train['day_of_week'] = train['pickup_datetime'].dt.dayofweek

## day_of_week, hour and month are categorical data, transform them to dummies

In [113]:
train['day_of_week'] = train['day_of_week'].astype('category')
day_of_week = pd.get_dummies(train['day_of_week'], prefix='day_of_week')

In [114]:
day_of_week.head()

Unnamed: 0,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0


In [115]:
train['pickup_hour'] = train['pickup_hour'].astype('category')
pickup_hour = pd.get_dummies(train['pickup_hour'], prefix='hour')

In [116]:
train['pickup_month'] = train['pickup_month'].astype('category')
pickup_month = pd.get_dummies(train['pickup_month'], prefix='month')

In [117]:
dt = [train, pickup_hour, pickup_month, day_of_week]

In [118]:
train = pd.concat(dt, axis=1)

In [119]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,month_4,month_5,month_6,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,1,0,0,0,0,0,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,1,0,0,0,0,0,0,1
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,0,0,0,1,0,0,0,0,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,1,0,0,0,0,1,0,0,0,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,0,1,0


## Get the difference between Log and Lat of Pickup and Dropoff

In [120]:
train['longitude_difference'] = train['dropoff_longitude'] - train['pickup_longitude']
train['latitude_difference'] = train['dropoff_latitude'] - train['pickup_latitude']
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,month_6,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,longitude_difference,latitude_difference
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,1,0,0,0,0,0,0,0.017525,-0.002335
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,1,0,0,0,0,0,0,1,-0.019066,-0.007412
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,0,1,0,0,0,0,0,-0.026306,-0.053852
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,1,0,0,0,0,-0.002228,-0.013252
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,1,0,0.00013,-0.010689


In [121]:
train.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_hour', 'pickup_month', 'day_of_week', 'hour_0',
       'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'longitude_difference',
       'latitude_difference'],
      dtype='object')

In [122]:
train.dtypes

id                              object
vendor_id                        int64
pickup_datetime         datetime64[ns]
dropoff_datetime        datetime64[ns]
passenger_count                  int64
pickup_longitude               float64
pickup_latitude                float64
dropoff_longitude              float64
dropoff_latitude               float64
store_and_fwd_flag              object
trip_duration                    int64
pickup_hour                   category
pickup_month                  category
day_of_week                   category
hour_0                           uint8
hour_1                           uint8
hour_2                           uint8
hour_3                           uint8
hour_4                           uint8
hour_5                           uint8
hour_6                           uint8
hour_7                           uint8
hour_8                           uint8
hour_9                           uint8
hour_10                          uint8
hour_11                  

In [123]:
features = ['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'longitude_difference', 'latitude_difference', 'hour_0',
       'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6']
target = 'trip_duration'

In [124]:
X_train = train[features]
y_train = train[target]
X_train.shape

(1458644, 44)

## Fit and Predict

In [125]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [126]:
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

In [127]:
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['day_of_week'] = test['pickup_datetime'].dt.dayofweek

In [128]:
test['longitude_difference'] = test['dropoff_longitude'] - test['pickup_longitude']
test['latitude_difference'] = test['dropoff_latitude'] - test['pickup_latitude']

In [129]:
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_month,pickup_hour,day_of_week,longitude_difference,latitude_difference
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,6,23,3,-0.002045,0.024651
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,6,23,3,0.004395,-0.02459
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,6,23,3,0.011276,-0.00806
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,6,23,3,-0.030357,-0.041431
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,6,23,3,0.008705,-0.005585


In [130]:
test['day_of_week'] = test['day_of_week'].astype('category')
day_of_week = pd.get_dummies(test['day_of_week'], prefix='day_of_week')

In [131]:
test['pickup_hour'] = test['pickup_hour'].astype('category')
pickup_hour = pd.get_dummies(test['pickup_hour'], prefix='hour')
test['pickup_month'] = test['pickup_month'].astype('category')
pickup_month = pd.get_dummies(test['pickup_month'], prefix='month')

In [132]:
dt2 = [test, pickup_hour, pickup_month, day_of_week]
test = pd.concat(dt2, axis = 1)

In [133]:
test.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'pickup_month', 'pickup_hour',
       'day_of_week', 'longitude_difference', 'latitude_difference', 'hour_0',
       'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6'],
      dtype='object')

In [134]:
X_test = test[features]
X_test.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,longitude_difference,latitude_difference,hour_0,hour_1,hour_2,...,month_4,month_5,month_6,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,1,-73.988129,40.732029,-73.990173,40.75668,-0.002045,0.024651,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,-73.964203,40.679993,-73.959808,40.655403,0.004395,-0.02459,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,-73.997437,40.737583,-73.98616,40.729523,0.011276,-0.00806,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,-73.95607,40.7719,-73.986427,40.730469,-0.030357,-0.041431,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,1,-73.970215,40.761475,-73.96151,40.75589,0.008705,-0.005585,0,0,0,...,0,0,1,0,0,0,1,0,0,0


## XGBoost

In [135]:
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [136]:
predictions = model.predict(X_test)

In [137]:
X_test['trip_duration'] = np.absolute(predictions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [138]:
X_test.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,longitude_difference,latitude_difference,hour_0,hour_1,hour_2,...,month_5,month_6,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,trip_duration
0,1,-73.988129,40.732029,-73.990173,40.75668,-0.002045,0.024651,0,0,0,...,0,1,0,0,0,1,0,0,0,958.982544
1,1,-73.964203,40.679993,-73.959808,40.655403,0.004395,-0.02459,0,0,0,...,0,1,0,0,0,1,0,0,0,934.089661
2,1,-73.997437,40.737583,-73.98616,40.729523,0.011276,-0.00806,0,0,0,...,0,1,0,0,0,1,0,0,0,580.370605
3,1,-73.95607,40.7719,-73.986427,40.730469,-0.030357,-0.041431,0,0,0,...,0,1,0,0,0,1,0,0,0,1392.856689
4,1,-73.970215,40.761475,-73.96151,40.75589,0.008705,-0.005585,0,0,0,...,0,1,0,0,0,1,0,0,0,537.424438


In [139]:
result = pd.concat([test, X_test], axis = 1)

In [42]:
result.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_month,...,pickup_hour,pickup_month.1,day_of_week,pickup_longitude.1,pickup_latitude.1,dropoff_longitude.1,dropoff_latitude.1,longitude_difference,latitude_difference,trip_duration
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,6,...,23,6,3,-73.988129,40.732029,-73.990173,40.75668,-0.002045,0.024651,1047.276245
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,6,...,23,6,3,-73.964203,40.679993,-73.959808,40.655403,0.004395,-0.02459,891.1297
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,6,...,23,6,3,-73.997437,40.737583,-73.98616,40.729523,0.011276,-0.00806,565.81543
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,6,...,23,6,3,-73.95607,40.7719,-73.986427,40.730469,-0.030357,-0.041431,1396.858398
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,6,...,23,6,3,-73.970215,40.761475,-73.96151,40.75589,0.008705,-0.005585,497.615967


In [140]:
submit = result[['id', 'trip_duration']]

In [51]:
submit.head()

Unnamed: 0,id,trip_duration
0,id3004672,1093.755537
1,id3505355,1115.73949
2,id1217141,583.309216
3,id2150126,1421.304341
4,id1598245,583.309216


In [141]:
submit.to_csv('submissions.csv', sep=',', index = False)

In [142]:
submit[submit['trip_duration'] < 0]

Unnamed: 0,id,trip_duration


In [57]:
predictions_id = test['id']
predictions_id.shape

(625134,)

In [58]:
predictions_id[:20]

0     id3004672
1     id3505355
2     id1217141
3     id2150126
4     id1598245
5     id0668992
6     id1765014
7     id0898117
8     id3905224
9     id1543102
10    id3024712
11    id3665810
12    id1836461
13    id3457080
14    id3376065
15    id3008739
16    id0902216
17    id3564824
18    id0820280
19    id0775088
Name: id, dtype: object

In [59]:
predictions_id = np.asarray(predictions_id)

In [65]:
predictions_id = predictions_id.reshape(len(predictions_id), 1)

In [68]:
predictions_id.shape

(625134, 1)

In [74]:
predictions.shape

(625134, 1)

In [75]:
predictions_id.shape

(625134, 1)

In [76]:
predictions_id[:10]

array([['id3004672'],
       ['id3505355'],
       ['id1217141'],
       ['id2150126'],
       ['id1598245'],
       ['id0668992'],
       ['id1765014'],
       ['id0898117'],
       ['id3905224'],
       ['id1543102']], dtype=object)

array([[ 1030.46522693],
       [  983.56059708],
       [  981.45641874],
       [ 1040.45682435],
       [  988.25047959],
       [ 1013.53964154],
       [  940.47679908],
       [ 1003.43816107],
       [  912.8443156 ],
       [  990.64978806]])

In [None]:
def rmsle(predicted,real):
    sum=0.0
    for x in range(len(predicted)):
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5