In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [31]:
# Read the raw training trips from train.csv (path is relative to the
# notebook's working directory).
df = pd.read_csv("train.csv")
# Display the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# Pre-processing

In [32]:
# Bounding box (decimal degrees) that pickups must fall strictly inside.
# NOTE(review): the box appears to roughly cover Manhattan — confirm intent.
PICKUP_LON_MIN, PICKUP_LON_MAX = -74.023822, -73.901599
PICKUP_LAT_MIN, PICKUP_LAT_MAX = 40.698160, 40.886345

# Strict inequalities: rows lying exactly on the boundary are dropped.
pickup_in_box = (
    (df['pickup_longitude'] > PICKUP_LON_MIN)
    & (df['pickup_longitude'] < PICKUP_LON_MAX)
    & (df['pickup_latitude'] > PICKUP_LAT_MIN)
    & (df['pickup_latitude'] < PICKUP_LAT_MAX)
)

# .copy() detaches the filtered frame from the raw one so that the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df = df[pickup_in_box].copy()

## Get pickup_month and pickup_hour via to_datetime

In [33]:
# Convert both timestamp columns from strings to datetime64 so the .dt
# accessor can be used on them later.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Show the resulting dtypes to confirm the conversion.
df.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
dtype: object

In [34]:
# Pull the calendar month and the hour of day out of the pickup timestamp.
pickup_dt = df['pickup_datetime'].dt
df['pickup_month'] = pickup_dt.month
df['pickup_hour'] = pickup_dt.hour

# Confirm the two new integer columns are present.
df.dtypes

id                            object
vendor_id                      int64
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
pickup_longitude             float64
pickup_latitude              float64
dropoff_longitude            float64
dropoff_latitude             float64
store_and_fwd_flag            object
trip_duration                  int64
pickup_month                   int64
pickup_hour                    int64
dtype: object

In [35]:
# Day of the week of the pickup as an integer (pandas convention:
# Monday = 0 ... Sunday = 6).
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

## Get the longitude and latitude differences between dropoff and pickup

In [36]:
# Signed displacement from pickup to dropoff along each axis, in degrees
# (dropoff minus pickup).
df['longitude_difference'] = df['dropoff_longitude'].sub(df['pickup_longitude'])
df['latitude_difference'] = df['dropoff_latitude'].sub(df['pickup_latitude'])

## Get dummies value for day_of_week, month, and hour

In [37]:
# Mark the three calendar features as categorical ...
for cat_col in ['pickup_month', 'pickup_hour', 'day_of_week']:
    df[cat_col] = df[cat_col].astype('category')

# ... then build one indicator column per observed value of each feature.
# The prefixes determine the generated column names (month_1, hour_0, ...).
pickup_month = pd.get_dummies(df['pickup_month'], prefix='month')
pickup_hour = pd.get_dummies(df['pickup_hour'], prefix='hour')
day_of_week = pd.get_dummies(df['day_of_week'], prefix='day_of_week')

In [38]:
# Append the indicator columns to the right of the existing frame.
# NOTE: this cell reassigns `df`, so running it twice duplicates the
# indicator columns.
frame = [df, pickup_month, pickup_hour, day_of_week]
df = pd.concat(objs=frame, axis='columns')

df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,1,0,0,0,0,0,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,0,0,0,0,0,0,0,1
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,0,0,0,1,0,0,0,0,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,0,0,1,0,0,0,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,0,1,0


## Calculate Distance

In [58]:
def haversine_np(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees) with the haversine formula.

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : scalar or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
    radius_km : float, optional
        Radius of the sphere, in kilometres, used to scale the central
        angle. Defaults to 6367, the value this function previously
        hard-coded.

    All coordinate args must be of equal length.

    Returns
    -------
    Distance(s) in kilometres (central angle multiplied by ``radius_km``).
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # `a` lies in [0, 1] mathematically, but floating-point rounding can push
    # it marginally outside that range, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    # Central angle between the two points, in radians.
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

In [59]:
# Great-circle distance from pickup to dropoff for every trip, in kilometres
# (haversine_np takes longitude before latitude for each point).
df['distance'] = haversine_np(df['pickup_longitude'], df['pickup_latitude'], df['dropoff_longitude'], df['dropoff_latitude'])

## Randomly split the data into a training set and a test set

In [70]:
# Draw a random 80% of the rows as the training set; the fixed random_state
# makes the draw repeatable.
train = df.sample(frac=0.8, random_state=1)
train.shape

(1093876, 54)

In [71]:
# Every row whose index label was not sampled into `train` becomes the
# hold-out test set.
held_out = ~df.index.isin(train.index)
test = df.loc[held_out]

test.shape

(273469, 54)

In [81]:
# Preview the sampled training rows (note the shuffled index labels).
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,distance
1343857,id0738517,2,2016-01-10 17:52:27,2016-01-10 17:58:51,2,-73.978043,40.74987,-73.990768,40.749699,N,...,0,0,0,0,0,0,0,0,1,1.071491
229973,id3416723,2,2016-04-09 15:09:32,2016-04-09 15:19:15,1,-73.994843,40.73999,-73.991089,40.73362,N,...,0,0,0,0,0,0,0,1,0,0.775279
1120869,id1271250,1,2016-01-30 10:11:56,2016-01-30 10:17:07,1,-73.961273,40.760609,-73.97403,40.760166,N,...,0,0,0,0,0,0,0,1,0,1.074844
1453840,id2816332,2,2016-03-04 11:42:42,2016-03-04 12:00:09,5,-73.959602,40.7761,-73.973389,40.75565,N,...,0,0,0,0,0,0,1,0,0,2.551652
1018243,id1970083,1,2016-01-01 13:31:15,2016-01-01 13:45:01,1,-73.971741,40.79443,-73.928986,40.860878,N,...,0,0,0,0,0,0,1,0,0,8.212749


## Extract useful features and get the training set and test set

In [72]:
# List every available column so the model features can be picked from it.
train.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_month', 'pickup_hour', 'day_of_week',
       'longitude_difference', 'latitude_difference', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
       'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14',
       'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20',
       'hour_21', 'hour_22', 'hour_23', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6', 'distance'],
      dtype='object')

In [73]:
# Model inputs, grouped by kind. The order of `features` fixes the column
# order of the matrices handed to the regressor.
coordinate_cols = ['pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude']
difference_cols = ['longitude_difference', 'latitude_difference']
month_cols = ['month_{}'.format(m) for m in range(1, 7)]            # month_1 .. month_6
hour_cols = ['hour_{}'.format(h) for h in range(24)]                # hour_0 .. hour_23
weekday_cols = ['day_of_week_{}'.format(d) for d in range(7)]       # day_of_week_0 .. day_of_week_6

features = (
    ['passenger_count']
    + coordinate_cols
    + difference_cols
    + month_cols
    + hour_cols
    + weekday_cols
    + ['distance']
)

# Column the model is trained to predict.
target = ['trip_duration']

In [74]:
# Split each partition into its feature matrix and its target frame.
# `target` is a list, so the *_target objects are single-column DataFrames.
train_set, train_target = train[features], train[target]
test_set, test_target = test[features], test[target]

## Use gridsearch to find best parameters

In [14]:
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameter grid for a grid search over the regressor.
# NOTE(review): within this notebook `parameters` is only referenced from a
# commented-out line, so no grid search is actually executed.
parameters = {'learning_rate': [0.1, 0.15, 0.2, 0.25], 'n_estimators': [5, 10, 15, 20, 25, 30]}

## Select algorithm, train, and fit

In [108]:
from xgboost import XGBRegressor

# Boosting hyper-parameters.
# N_ESTIMATORS is 350 because the recorded fit output, the reported score and
# the submission filename ('..._350.csv') were all produced with 350 trees;
# the cell previously said 50, so a fresh run would not have reproduced them.
LEARNING_RATE = 0.15
N_ESTIMATORS = 350

xgb = XGBRegressor(learning_rate=LEARNING_RATE, n_estimators=N_ESTIMATORS)
# To tune instead of using fixed values, wrap a fresh estimator, e.g.
#   xgb = GridSearchCV(XGBRegressor(), parameters)

In [109]:
# Fit the regressor on the selected feature columns; the target is passed as
# the one-dimensional 'trip_duration' column of the training partition.
xgb.fit(train_set, train['trip_duration'])

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.15, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=350, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [98]:
#xgb.best_estimator_

In [110]:
# Predict trip durations for the hold-out partition.
predictions = xgb.predict(test_set)

In [20]:
def rmsle(predicted, real):
    """Root mean squared logarithmic error between two sets of values.

    Computes ``sqrt(mean((log(predicted + 1) - log(real + 1)) ** 2))``.

    Parameters
    ----------
    predicted : array-like
        Predicted values. Must be greater than -1 for the logarithm to be
        defined.
    real : array-like
        Observed values, with the same number of elements as ``predicted``.

    Both inputs are flattened and paired by position, so lists, NumPy arrays
    of shape ``(n,)`` or ``(n, 1)``, and pandas objects with any index are
    all accepted.

    Returns
    -------
    float
        The RMSLE as a plain scalar.

    Raises
    ------
    ValueError
        If the inputs are empty or contain different numbers of elements.
    """
    # Flatten first: subtracting an (n,) array from an (n, 1) array would
    # otherwise broadcast to an (n, n) matrix.
    predicted = np.asarray(predicted, dtype=float).ravel()
    real = np.asarray(real, dtype=float).ravel()

    if predicted.size != real.size:
        raise ValueError(
            "predicted and real must contain the same number of elements "
            "(got %d and %d)" % (predicted.size, real.size)
        )
    if predicted.size == 0:
        raise ValueError("rmsle is undefined for empty input")

    # log1p(x) == log(x + 1), evaluated accurately for small x.
    log_error = np.log1p(predicted) - np.log1p(real)
    return float(np.sqrt(np.mean(log_error ** 2)))

In [111]:
# Element-wise absolute value of the hold-out predictions.
# NOTE(review): presumably this keeps log(prediction + 1) in rmsle defined
# when the regressor outputs a negative duration — confirm. It mirrors
# negative values to positive ones rather than clipping them at zero.
prediction = np.absolute(predictions)

In [112]:
# Underlying NumPy array of the hold-out target (one column, so shape (n, 1)).
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values returns the same array on every pandas version.
test_target_temp = test_target.values

In [113]:
# Score the hold-out predictions with the root mean squared logarithmic
# error (lower is better) and print the result.
score = rmsle(prediction, test_target_temp)
print(score)

[ 0.5428737]


## TEST ORIGINAL FILE

In [114]:
def build_trip_features(trips):
    """Return a copy of ``trips`` with the engineered model inputs appended.

    The steps are the same ones applied to the training data: parse the
    pickup timestamp, derive month / hour / day-of-week, compute the
    dropoff-minus-pickup coordinate differences and the haversine distance,
    then one-hot encode the three calendar features.

    Parameters
    ----------
    trips : pandas.DataFrame
        Must contain 'pickup_datetime', 'pickup_longitude',
        'pickup_latitude', 'dropoff_longitude' and 'dropoff_latitude'.

    Returns
    -------
    pandas.DataFrame
        The input columns, the derived columns, and the 'month_*', 'hour_*'
        and 'day_of_week_*' indicator columns for the values present.
    """
    # Work on a copy so the caller's frame is left untouched.
    trips = trips.copy()

    trips['pickup_datetime'] = pd.to_datetime(trips['pickup_datetime'])
    pickup_dt = trips['pickup_datetime'].dt
    trips['pickup_month'] = pickup_dt.month.astype('category')
    trips['pickup_hour'] = pickup_dt.hour.astype('category')
    trips['day_of_week'] = pickup_dt.dayofweek.astype('category')

    trips['longitude_difference'] = trips['dropoff_longitude'] - trips['pickup_longitude']
    trips['latitude_difference'] = trips['dropoff_latitude'] - trips['pickup_latitude']

    trips['distance'] = haversine_np(trips['pickup_longitude'], trips['pickup_latitude'],
                                     trips['dropoff_longitude'], trips['dropoff_latitude'])

    # (source column, prefix of the generated indicator columns)
    encodings = [('pickup_month', 'month'),
                 ('pickup_hour', 'hour'),
                 ('day_of_week', 'day_of_week')]
    indicator_frames = [pd.get_dummies(trips[col], prefix=prefix) for col, prefix in encodings]

    return pd.concat([trips] + indicator_frames, axis=1)


test_original = build_trip_features(pd.read_csv("test.csv"))

# get_dummies only creates columns for values that occur in the data. If
# test.csv lacks some month, hour or weekday, add that indicator as all
# zeros so that selecting `features` (the list the model was trained with)
# cannot fail with a KeyError.
for feature_col in features:
    if feature_col not in test_original.columns:
        test_original[feature_col] = 0

In [115]:
# Predict durations for the test.csv trips, using the same feature columns
# (in the same order) as the training matrix.
predictions_test_origin = xgb.predict(test_original[features])

In [116]:
# Store the absolute value of each prediction as the submitted duration, so
# no negative value is written out.
test_original['trip_duration'] = np.absolute(predictions_test_origin)

In [117]:
# Keep only the trip id and its predicted duration for the submission file.
result = test_original[['id', 'trip_duration']]

In [118]:
# Preview the submission rows before writing them out.
result.head()

Unnamed: 0,id,trip_duration
0,id3004672,1033.678467
1,id3505355,1344.416992
2,id1217141,562.044922
3,id2150126,1266.525757
4,id1598245,488.40567


## Export csv file

In [119]:
# Write the submission as a comma-separated file (the to_csv default),
# without the DataFrame index column.
result.to_csv('submissions_add_distance_350.csv', index=False)