In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns
from sklearn.cluster import MiniBatchKMeans
from math import radians, cos, sin, asin, sqrt
%matplotlib inline

In [42]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# Pre-processing

In [43]:
df = df[(df['pickup_longitude'] > -74.023822) & (df['pickup_latitude'] < 40.886345) & (df['pickup_longitude'] < -73.901599) & (df['pickup_latitude'] > 40.698160)]
m = np.mean(df['trip_duration'])
s = np.std(df['trip_duration'])
df = df[(df['trip_duration'] <= m + 2*s) & (df['trip_duration'] >= m -2*s)]

## get pickup_month and pickup_hour by to_datetime

In [44]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

In [45]:
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_hour'] = df['pickup_datetime'].dt.hour

In [46]:
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

## Get the difference between longtitude and latitude

In [47]:
# df['longitude_difference'] = df['dropoff_longitude'] - df['pickup_longitude']
# df['latitude_difference'] = df['dropoff_latitude'] - df['pickup_latitude']

## Get dummies value for day_of_week, month, and hour

In [48]:
df['pickup_month'] = df['pickup_month'].astype('category')
pickup_month = pd.get_dummies(df['pickup_month'], prefix='month')
df['pickup_hour'] = df['pickup_hour'].astype('category')
pickup_hour = pd.get_dummies(df['pickup_hour'], prefix='hour')
df['day_of_week'] = df['day_of_week'].astype('category')
day_of_week = pd.get_dummies(df['day_of_week'], prefix='day_of_week')

In [49]:
frame = [df, pickup_month, pickup_hour, day_of_week]
df = pd.concat(frame, axis = 1)
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,1,0,0,0,0,0,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,0,0,0,0,0,0,0,1
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,0,0,0,1,0,0,0,0,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,0,0,1,0,0,0,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,0,1,0


## Calculte Distance

In [50]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km
def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    a = haversine_array(lat1, lng1, lat1, lng2)
    b = haversine_array(lat1, lng1, lat2, lng1)
    return a + b

def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [51]:
coords = np.vstack((df[['pickup_latitude', 'pickup_longitude']],
                    df[['dropoff_latitude', 'dropoff_longitude']]))

In [52]:
sample_ind = np.random.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])

In [53]:
df['pickup_cluster'] = kmeans.predict(df[['pickup_latitude', 'pickup_longitude']])
df['dropoff_cluster'] = kmeans.predict(df[['dropoff_latitude', 'dropoff_longitude']])

In [54]:
df['distance'] = haversine_np(df['pickup_longitude'], df['pickup_latitude'], df['dropoff_longitude'], df['dropoff_latitude'])

In [55]:
cluster_pickup_train = pd.get_dummies(df['pickup_cluster'], prefix='p', prefix_sep='_')
cluster_dropoff_train = pd.get_dummies(df['dropoff_cluster'], prefix='d', prefix_sep='_')
df2 = [df, cluster_pickup_train, cluster_dropoff_train]
df = pd.concat(df2, axis = 1)

In [56]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,d_90,d_91,d_92,d_93,d_94,d_95,d_96,d_97,d_98,d_99
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,0,0,0,0,0,0,0,0,0,0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,0,0,1,0,0,0,0,0,0,0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,0,0,0,0,0,0,0,0,0,0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,0,0,0,0,0,0,0,0,0,0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,0,0,0,0,0,0,0,0,0,0


## Randomly shuffle training set and test set

In [57]:
train = df.sample(frac=0.9, random_state=1)
train.shape

(1228868, 229)

In [58]:
test = df.loc[~df.index.isin(train.index)]
test.shape

(136541, 229)

In [59]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,d_90,d_91,d_92,d_93,d_94,d_95,d_96,d_97,d_98,d_99
1360248,id2465156,1,2016-05-25 12:17:55,2016-05-25 12:29:15,2,-73.974045,40.747536,-73.992203,40.73444,N,...,0,0,0,0,0,0,0,0,0,0
804703,id1176352,1,2016-02-17 20:51:25,2016-02-17 20:56:07,2,-73.985947,40.721096,-73.984123,40.71069,N,...,0,0,0,0,0,0,0,0,0,0
46078,id0099358,1,2016-03-28 20:08:08,2016-03-28 20:13:20,1,-73.996986,40.752548,-73.998528,40.740223,N,...,0,0,0,0,0,0,0,0,0,0
658305,id0011763,1,2016-04-23 01:39:15,2016-04-23 01:47:49,4,-73.986626,40.760738,-73.991859,40.735432,N,...,0,0,0,0,0,0,0,0,0,0
1072443,id2844502,2,2016-03-10 15:28:25,2016-03-10 15:39:33,1,-73.972733,40.759998,-73.980171,40.748501,N,...,0,0,0,0,0,0,0,0,0,0


## Extract useful features and get the training set and test set

In [60]:
#list(train.columns)

In [61]:
target = ['trip_duration']

In [62]:
train_set = train.drop(['id','vendor_id','passenger_count','store_and_fwd_flag','pickup_datetime',
 'dropoff_datetime','trip_duration',
 'pickup_month', 'pickup_hour', 'day_of_week',  'pickup_cluster', 'dropoff_cluster','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis = 1)
train_target = train[target]
test_set = test.drop(['id','vendor_id','passenger_count','store_and_fwd_flag','pickup_datetime',
 'dropoff_datetime','trip_duration',
 'pickup_month', 'pickup_hour', 'day_of_week',  'pickup_cluster', 'dropoff_cluster','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis = 1)
test_target = test[target]

In [63]:
train.shape

(1228868, 229)

In [64]:
target = df['trip_duration']
train_all = df.drop(['id','vendor_id','passenger_count','store_and_fwd_flag','pickup_datetime',
 'dropoff_datetime','trip_duration',
 'pickup_month', 'pickup_hour', 'day_of_week',  'pickup_cluster', 'dropoff_cluster','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'], axis = 1)

In [65]:
#list(train_all.columns)

## Use gridsearch to find best parameters

In [66]:
# from sklearn.model_selection import GridSearchCV
# parameters = {'learning_rate': [0.1, 0.15, 0.2, 0.25], 'n_estimators': [5, 10, 15, 20, 25, 30]}

## Select algorithm, train, and fit

In [67]:
# from xgboost import XGBRegressor
# xgb = XGBRegressor(learning_rate=0.15, n_estimators=30)

In [68]:
# xgb.fit(train_all, target)

In [69]:
Y_train = train_target
Y_test = test_target
dtrain = xgb.DMatrix(train_set, label=Y_train)
dvalid = xgb.DMatrix(test_set, label=Y_test)
dtest = xgb.DMatrix(test_set)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [89]:
list(train_set.columns)

['month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'hour_0',
 'hour_1',
 'hour_2',
 'hour_3',
 'hour_4',
 'hour_5',
 'hour_6',
 'hour_7',
 'hour_8',
 'hour_9',
 'hour_10',
 'hour_11',
 'hour_12',
 'hour_13',
 'hour_14',
 'hour_15',
 'hour_16',
 'hour_17',
 'hour_18',
 'hour_19',
 'hour_20',
 'hour_21',
 'hour_22',
 'hour_23',
 'day_of_week_0',
 'day_of_week_1',
 'day_of_week_2',
 'day_of_week_3',
 'day_of_week_4',
 'day_of_week_5',
 'day_of_week_6',
 'distance',
 'p_0',
 'p_1',
 'p_3',
 'p_4',
 'p_5',
 'p_6',
 'p_8',
 'p_9',
 'p_12',
 'p_13',
 'p_14',
 'p_16',
 'p_17',
 'p_18',
 'p_19',
 'p_22',
 'p_23',
 'p_24',
 'p_25',
 'p_26',
 'p_27',
 'p_28',
 'p_30',
 'p_31',
 'p_32',
 'p_33',
 'p_34',
 'p_35',
 'p_38',
 'p_39',
 'p_40',
 'p_41',
 'p_42',
 'p_43',
 'p_45',
 'p_46',
 'p_48',
 'p_49',
 'p_50',
 'p_51',
 'p_52',
 'p_55',
 'p_57',
 'p_59',
 'p_60',
 'p_61',
 'p_62',
 'p_63',
 'p_64',
 'p_65',
 'p_67',
 'p_69',
 'p_70',
 'p_71',
 'p_72',
 'p_73',
 'p_74',
 'p_77

In [150]:
xgb_pars = {'min_child_weight': 1, 'eta': 0.15, 'colsample_bytree': 0.9, 
            'max_depth': 10,
'subsample': 0.9, 'lambda': 1., 'nthread': -1, 'booster' : 'gbtree', 'silent': 1,
'eval_metric': 'rmse', 'objective': 'reg:linear'}
model = xgb.train(xgb_pars, dtrain, 20, watchlist, early_stopping_rounds=2,
      maximize=False, verbose_eval=1)
print('Modeling RMSLE %.5f' % model.best_score)

[0]	train-rmse:850.978	valid-rmse:852.952
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2 rounds.
[1]	train-rmse:745.448	valid-rmse:747.422
[2]	train-rmse:658.723	valid-rmse:660.832
[3]	train-rmse:587.616	valid-rmse:589.887
[4]	train-rmse:529.794	valid-rmse:532.099
[5]	train-rmse:483.511	valid-rmse:486.008
[6]	train-rmse:446.632	valid-rmse:449.289
[7]	train-rmse:417.18	valid-rmse:419.992
[8]	train-rmse:399.699	valid-rmse:402.548
[9]	train-rmse:380.629	valid-rmse:383.634
[10]	train-rmse:365.793	valid-rmse:368.983
[11]	train-rmse:354.246	valid-rmse:357.58
[12]	train-rmse:345.456	valid-rmse:348.921
[13]	train-rmse:338.553	valid-rmse:342.137
[14]	train-rmse:334.208	valid-rmse:337.859
[15]	train-rmse:329.628	valid-rmse:333.39
[16]	train-rmse:325.878	valid-rmse:329.791
[17]	train-rmse:322.482	valid-rmse:326.501
[18]	train-rmse:320.561	valid-rmse:324.587
[19]	train-rmse:318.245	valid-rmse:322.396
Modeling 

In [72]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.grid_search import GridSearchCV
# param_grid = {
#     'n_estimators': [8, 10, 12, 15],
#     'min_samples_leaf': [1, 3, 5, 7]
# }
# rf = GridSearchCV(RandomForestRegressor(), param_grid)
# #xgb = GridSearchCV(xgbst, parameters)

In [141]:
pred = model.predict(dtest)

In [74]:
#xgb.best_estimator_

In [75]:
#rf.best_estimator_

In [76]:
#predictions = rf.predict(test_set)

In [77]:
def rmsle(predicted,real):
    sum=0.0
    for x in range(len(predicted)):
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

In [142]:
prediction = np.absolute(pred).ravel()

In [143]:
test_target_temp = test_target.as_matrix()

In [144]:
score = rmsle(prediction, test_target_temp)
print(score)

[ 0.42597062]


In [69]:
print(model.score(test_set, test['trip_duration']))

AttributeError: 'Booster' object has no attribute 'score'

## TEST ORIGINAL FILE

In [91]:
test_original = pd.read_csv("test.csv")
test_original['pickup_datetime'] = pd.to_datetime(test_original['pickup_datetime'])
#test_original['dropoff_datetime'] = pd.to_datetime(test_original['dropoff_datetime'])
test_original['pickup_month'] = test_original['pickup_datetime'].dt.month
test_original['pickup_hour'] = test_original['pickup_datetime'].dt.hour
test_original['day_of_week'] = test_original['pickup_datetime'].dt.dayofweek
test_original['longitude_difference'] = test_original['dropoff_longitude'] - test_original['pickup_longitude']
test_original['latitude_difference'] = test_original['dropoff_latitude'] - test_original['pickup_latitude']
test_original['pickup_month'] = test_original['pickup_month'].astype('category')
pickup_month = pd.get_dummies(test_original['pickup_month'], prefix='month')
test_original['pickup_hour'] = test_original['pickup_hour'].astype('category')
pickup_hour = pd.get_dummies(test_original['pickup_hour'], prefix='hour')
test_original['day_of_week'] = test_original['day_of_week'].astype('category')
day_of_week = pd.get_dummies(test_original['day_of_week'], prefix='day_of_week')

test_original['distance'] = haversine_np(test_original['pickup_longitude'], test_original['pickup_latitude'], test_original['dropoff_longitude'], test_original['dropoff_latitude'])

frame2 = [test_original, pickup_month, pickup_hour, day_of_week]
test_original = pd.concat(frame2, axis = 1)
features_to_drop = ['id','vendor_id','passenger_count','store_and_fwd_flag','pickup_datetime',
 
 'pickup_month', 'pickup_hour', 'day_of_week',  'pickup_cluster', 'dropoff_cluster','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']
target = ['trip_duration']

coords = np.vstack((test_original[['pickup_latitude', 'pickup_longitude']],
                    test_original[['dropoff_latitude', 'dropoff_longitude']]))
sample_ind = np.random.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])
test_original['pickup_cluster'] = kmeans.predict(test_original[['pickup_latitude', 'pickup_longitude']])
test_original['dropoff_cluster'] = kmeans.predict(test_original[['dropoff_latitude', 'dropoff_longitude']])
test_original['distance'] = haversine_np(test_original['pickup_longitude'], test_original['pickup_latitude'], test_original['dropoff_longitude'], test_original['dropoff_latitude'])
cluster_pickup_test = pd.get_dummies(test_original['pickup_cluster'], prefix='p', prefix_sep='_')
cluster_dropoff_test = pd.get_dummies(test_original['dropoff_cluster'], prefix='d', prefix_sep='_')
df3 = [test_original, cluster_pickup_test, cluster_dropoff_test]
test_final = pd.concat(df3, axis = 1)
id_frame = test_final['id']

In [92]:
list(test_final.columns)

['id',
 'vendor_id',
 'pickup_datetime',
 'passenger_count',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'store_and_fwd_flag',
 'pickup_month',
 'pickup_hour',
 'day_of_week',
 'longitude_difference',
 'latitude_difference',
 'distance',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'hour_0',
 'hour_1',
 'hour_2',
 'hour_3',
 'hour_4',
 'hour_5',
 'hour_6',
 'hour_7',
 'hour_8',
 'hour_9',
 'hour_10',
 'hour_11',
 'hour_12',
 'hour_13',
 'hour_14',
 'hour_15',
 'hour_16',
 'hour_17',
 'hour_18',
 'hour_19',
 'hour_20',
 'hour_21',
 'hour_22',
 'hour_23',
 'day_of_week_0',
 'day_of_week_1',
 'day_of_week_2',
 'day_of_week_3',
 'day_of_week_4',
 'day_of_week_5',
 'day_of_week_6',
 'pickup_cluster',
 'dropoff_cluster',
 'p_0',
 'p_1',
 'p_2',
 'p_3',
 'p_4',
 'p_5',
 'p_6',
 'p_7',
 'p_8',
 'p_9',
 'p_10',
 'p_11',
 'p_12',
 'p_13',
 'p_14',
 'p_15',
 'p_16',
 'p_17',
 'p_18',
 'p_19',
 'p_20',
 'p_21',
 'p_22',
 'p_23',


In [93]:
test_final = test_final[['month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'hour_0',
 'hour_1',
 'hour_2',
 'hour_3',
 'hour_4',
 'hour_5',
 'hour_6',
 'hour_7',
 'hour_8',
 'hour_9',
 'hour_10',
 'hour_11',
 'hour_12',
 'hour_13',
 'hour_14',
 'hour_15',
 'hour_16',
 'hour_17',
 'hour_18',
 'hour_19',
 'hour_20',
 'hour_21',
 'hour_22',
 'hour_23',
 'day_of_week_0',
 'day_of_week_1',
 'day_of_week_2',
 'day_of_week_3',
 'day_of_week_4',
 'day_of_week_5',
 'day_of_week_6',
 'distance',
 'p_0',
 'p_1',
 'p_3',
 'p_4',
 'p_5',
 'p_6',
 'p_8',
 'p_9',
 'p_12',
 'p_13',
 'p_14',
 'p_16',
 'p_17',
 'p_18',
 'p_19',
 'p_22',
 'p_23',
 'p_24',
 'p_25',
 'p_26',
 'p_27',
 'p_28',
 'p_30',
 'p_31',
 'p_32',
 'p_33',
 'p_34',
 'p_35',
 'p_38',
 'p_39',
 'p_40',
 'p_41',
 'p_42',
 'p_43',
 'p_45',
 'p_46',
 'p_48',
 'p_49',
 'p_50',
 'p_51',
 'p_52',
 'p_55',
 'p_57',
 'p_59',
 'p_60',
 'p_61',
 'p_62',
 'p_63',
 'p_64',
 'p_65',
 'p_67',
 'p_69',
 'p_70',
 'p_71',
 'p_72',
 'p_73',
 'p_74',
 'p_77',
 'p_79',
 'p_81',
 'p_82',
 'p_83',
 'p_84',
 'p_85',
 'p_87',
 'p_88',
 'p_89',
 'p_90',
 'p_92',
 'p_93',
 'p_94',
 'p_95',
 'p_96',
 'p_97',
 'p_98',
 'd_0',
 'd_1',
 'd_2',
 'd_3',
 'd_4',
 'd_5',
 'd_6',
 'd_7',
 'd_8',
 'd_9',
 'd_10',
 'd_11',
 'd_12',
 'd_13',
 'd_14',
 'd_15',
 'd_16',
 'd_17',
 'd_18',
 'd_19',
 'd_20',
 'd_21',
 'd_22',
 'd_23',
 'd_24',
 'd_25',
 'd_26',
 'd_27',
 'd_28',
 'd_29',
 'd_30',
 'd_31',
 'd_32',
 'd_33',
 'd_34',
 'd_35',
 'd_36',
 'd_37',
 'd_38',
 'd_39',
 'd_40',
 'd_41',
 'd_42',
 'd_43',
 'd_44',
 'd_45',
 'd_46',
 'd_47',
 'd_48',
 'd_49',
 'd_50',
 'd_51',
 'd_52',
 'd_53',
 'd_54',
 'd_55',
 'd_56',
 'd_57',
 'd_58',
 'd_59',
 'd_60',
 'd_61',
 'd_62',
 'd_63',
 'd_64',
 'd_65',
 'd_66',
 'd_67',
 'd_68',
 'd_69',
 'd_70',
 'd_71',
 'd_72',
 'd_73',
 'd_74',
 'd_75',
 'd_76',
 'd_77',
 'd_78',
 'd_79',
 'd_80',
 'd_81',
 'd_82',
 'd_83',
 'd_84',
 'd_85',
 'd_86',
 'd_87',
 'd_88',
 'd_89',
 'd_90',
 'd_91',
 'd_92',
 'd_93',
 'd_94',
 'd_95',
 'd_96',
 'd_97',
 'd_98',
 'd_99']]

In [94]:
ddtest = xgb.DMatrix(test_final)

In [145]:
predictions_test_origin = model.predict(ddtest)

In [96]:
#prediction_xgb = xgb.predict(test_final)

In [100]:
#test_original['trip_duration'] = np.absolute(predictions_test_origin)
predictions_test_origin

array([  811.23956299,   728.87414551,   464.56454468, ...,  1531.82836914,
        2638.63671875,  1261.11437988], dtype=float32)

In [146]:
predictions_test_origin = np.absolute(predictions_test_origin)

In [147]:
predict_frame = pd.DataFrame(predictions_test_origin)
con = [test_final, predict_frame]
#result = pd.concat(con, axis = 1)
predict_frame.head()

Unnamed: 0,0
0,590.47699
1,743.217346
2,489.713165
3,1262.988403
4,363.124695


In [148]:
id_df = pd.DataFrame(id_frame)

In [136]:
con = [id_df, predict_frame]
result = pd.concat(con, axis = 1)

In [137]:
result.head()

Unnamed: 0,id,0
0,id3004672,633.563599
1,id3505355,752.042297
2,id1217141,509.640839
3,id2150126,1269.369141
4,id1598245,395.357391


In [138]:
result.head()

Unnamed: 0,id,0
0,id3004672,633.563599
1,id3505355,752.042297
2,id1217141,509.640839
3,id2150126,1269.369141
4,id1598245,395.357391


## Export csv file

In [149]:
result.to_csv('submissions_depth_10_repeat_30.csv', sep=',', index = False)