In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [3]:
train['passenger_count'].value_counts()

1    1033540
2     210318
5      78088
3      59896
6      48333
4      28404
0         60
7          3
9          1
8          1
Name: passenger_count, dtype: int64

In [4]:
train['vendor_id'].value_counts()

2    780302
1    678342
Name: vendor_id, dtype: int64

In [5]:
train['store_and_fwd_flag'].value_counts()

N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [6]:
train['vendor_id'].corr(train['trip_duration'])

0.020304177490054781

In [7]:
train.isnull().sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [8]:
train.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [9]:
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])
train.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
dtype: object

## Month and hour are the only features I care about

In [10]:
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,17
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,11
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,19
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,13


In [11]:
train['pickup_month'] = train['pickup_datetime'].dt.month
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,pickup_month
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,17,3
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,0,6
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,11,1
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,19,4
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,13,3


In [12]:
train['day_of_week'] = train['pickup_datetime'].dt.dayofweek

## Get the difference between Log and Lat of Pickup and Dropoff

In [13]:
train['longitude_difference'] = train['dropoff_longitude'] - train['pickup_longitude']
train['latitude_difference'] = train['dropoff_latitude'] - train['pickup_latitude']
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,pickup_month,day_of_week,longitude_difference,latitude_difference
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,17,3,0,0.017525,-0.002335
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,0,6,6,-0.019066,-0.007412
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,11,1,1,-0.026306,-0.053852
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,19,4,2,-0.002228,-0.013252
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,13,3,5,0.00013,-0.010689


In [14]:
features = ['passenger_count', 'pickup_hour', 'pickup_month', 'day_of_week', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'longitude_difference', 'latitude_difference']
target = 'trip_duration'

In [15]:
X_train = train[features]
y_train = train[target]
X_train.shape

(1458644, 10)

In [88]:
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators = 10, min_samples_leaf = 2)

In [89]:
RF.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [90]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [91]:
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

In [92]:
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['day_of_week'] = test['pickup_datetime'].dt.dayofweek

In [93]:
test['longitude_difference'] = test['dropoff_longitude'] - test['pickup_longitude']
test['latitude_difference'] = test['dropoff_latitude'] - test['pickup_latitude']

In [94]:
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_month,pickup_hour,day_of_week,longitude_difference,latitude_difference
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,6,23,3,-0.002045,0.024651
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,6,23,3,0.004395,-0.02459
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,6,23,3,0.011276,-0.00806
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,6,23,3,-0.030357,-0.041431
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,6,23,3,0.008705,-0.005585


In [95]:
X_test = test[features]
X_test.head()

Unnamed: 0,passenger_count,pickup_hour,pickup_month,day_of_week,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,longitude_difference,latitude_difference
0,1,23,6,3,-73.988129,40.732029,-73.990173,40.75668,-0.002045,0.024651
1,1,23,6,3,-73.964203,40.679993,-73.959808,40.655403,0.004395,-0.02459
2,1,23,6,3,-73.997437,40.737583,-73.98616,40.729523,0.011276,-0.00806
3,1,23,6,3,-73.95607,40.7719,-73.986427,40.730469,-0.030357,-0.041431
4,1,23,6,3,-73.970215,40.761475,-73.96151,40.75589,0.008705,-0.005585


In [96]:
predictions = RF.predict(X_test)

In [97]:
X_test['trip_duration'] = np.absolute(predictions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [98]:
X_test.head()

Unnamed: 0,passenger_count,pickup_hour,pickup_month,day_of_week,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,longitude_difference,latitude_difference,trip_duration
0,1,23,6,3,-73.988129,40.732029,-73.990173,40.75668,-0.002045,0.024651,942.955556
1,1,23,6,3,-73.964203,40.679993,-73.959808,40.655403,0.004395,-0.02459,571.532381
2,1,23,6,3,-73.997437,40.737583,-73.98616,40.729523,0.011276,-0.00806,395.673333
3,1,23,6,3,-73.95607,40.7719,-73.986427,40.730469,-0.030357,-0.041431,1129.45
4,1,23,6,3,-73.970215,40.761475,-73.96151,40.75589,0.008705,-0.005585,345.767857


In [99]:
result = pd.concat([test, X_test], axis = 1)

In [100]:
result.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_month,...,pickup_hour,pickup_month.1,day_of_week,pickup_longitude.1,pickup_latitude.1,dropoff_longitude.1,dropoff_latitude.1,longitude_difference,latitude_difference,trip_duration
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,6,...,23,6,3,-73.988129,40.732029,-73.990173,40.75668,-0.002045,0.024651,942.955556
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,6,...,23,6,3,-73.964203,40.679993,-73.959808,40.655403,0.004395,-0.02459,571.532381
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,6,...,23,6,3,-73.997437,40.737583,-73.98616,40.729523,0.011276,-0.00806,395.673333
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,6,...,23,6,3,-73.95607,40.7719,-73.986427,40.730469,-0.030357,-0.041431,1129.45
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,6,...,23,6,3,-73.970215,40.761475,-73.96151,40.75589,0.008705,-0.005585,345.767857


In [101]:
submit = result[['id', 'trip_duration']]

In [102]:
submit.head()

Unnamed: 0,id,trip_duration
0,id3004672,942.955556
1,id3505355,571.532381
2,id1217141,395.673333
3,id2150126,1129.45
4,id1598245,345.767857


In [103]:
submit.to_csv('submissions.csv', sep=',', index = False)

In [104]:
submit[submit['trip_duration'] < 0]

Unnamed: 0,id,trip_duration


In [57]:
predictions_id = test['id']
predictions_id.shape

(625134,)

In [58]:
predictions_id[:20]

0     id3004672
1     id3505355
2     id1217141
3     id2150126
4     id1598245
5     id0668992
6     id1765014
7     id0898117
8     id3905224
9     id1543102
10    id3024712
11    id3665810
12    id1836461
13    id3457080
14    id3376065
15    id3008739
16    id0902216
17    id3564824
18    id0820280
19    id0775088
Name: id, dtype: object

In [59]:
predictions_id = np.asarray(predictions_id)

In [65]:
predictions_id = predictions_id.reshape(len(predictions_id), 1)

In [68]:
predictions_id.shape

(625134, 1)

In [74]:
predictions.shape

(625134, 1)

In [75]:
predictions_id.shape

(625134, 1)

In [76]:
predictions_id[:10]

array([['id3004672'],
       ['id3505355'],
       ['id1217141'],
       ['id2150126'],
       ['id1598245'],
       ['id0668992'],
       ['id1765014'],
       ['id0898117'],
       ['id3905224'],
       ['id1543102']], dtype=object)

array([[ 1030.46522693],
       [  983.56059708],
       [  981.45641874],
       [ 1040.45682435],
       [  988.25047959],
       [ 1013.53964154],
       [  940.47679908],
       [ 1003.43816107],
       [  912.8443156 ],
       [  990.64978806]])