# New York City Taxi Trip Duration 02 - Machine Learning

###   
### 목표 : 데이터를 가지고 택시 주행 시간(trip_duration) 예측 모델 구축
###   
###   

## 1. 데이터 전처리 

In [1]:
import pandas as pd
import numpy as np

# Load the three competition files. Datetime columns are parsed while reading;
# 'dropoff_datetime' is only requested for train.csv.
train = pd.read_csv(
    "train.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
)
test = pd.read_csv(
    "test.csv",
    parse_dates=["pickup_datetime"],
)
submission = pd.read_csv("sample_submission.csv")

def _add_model_features(df):
    """Add pickup-time parts and boolean indicator columns to *df* in place.

    The same transformation is applied to the train and test frames so that
    both always end up with identical derived columns, in this order:

    * ``pickup_year`` ... ``pickup_dayofweek`` taken from ``pickup_datetime``
    * ``store_and_fwd_flag_Y`` / ``store_and_fwd_flag_N``
    * ``vendor_1`` / ``vendor_2``
    * ``passenger_0`` ... ``passenger_9``

    Indicator columns are plain boolean comparisons against the source column.
    """
    pickup = df['pickup_datetime'].dt
    for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'):
        df['pickup_' + part] = getattr(pickup, part)

    for flag in ('Y', 'N'):
        df['store_and_fwd_flag_' + flag] = df['store_and_fwd_flag'] == flag

    for vendor in (1, 2):
        df['vendor_{}'.format(vendor)] = df['vendor_id'] == vendor

    # One indicator per passenger count 0..9, whether or not the value occurs.
    for count in range(10):
        df['passenger_{}'.format(count)] = df['passenger_count'] == count


_add_model_features(train)
_add_model_features(test)


####     
####  

## 2. 변수 선택 및 데이터 조정

### 1) 변수 선택

In [2]:
# Show the shape and the column names of the training frame, including the derived columns.
print(train.shape)
train.columns

(1458644, 32)


Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_year', 'pickup_month', 'pickup_day',
       'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
       'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 'vendor_1', 'vendor_2',
       'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
       'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7',
       'passenger_8', 'passenger_9'],
      dtype='object')

In [3]:
# Show the shape and the column names of the test frame, including the derived columns.
print(test.shape)
test.columns

(625134, 30)


Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'pickup_year', 'pickup_month',
       'pickup_day', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_dayofweek', 'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
       'vendor_1', 'vendor_2', 'passenger_0', 'passenger_1', 'passenger_2',
       'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6',
       'passenger_7', 'passenger_8', 'passenger_9'],
      dtype='object')

#### 모델링에 필요한 변수들 선택

In [None]:
# Full ("basic") feature set: raw vendor_id / passenger_count together with
# their indicator columns, plus pickup location, flag indicators and time parts.
feature_names = [
    # pickup location
    'pickup_longitude',
    'pickup_latitude',
    # store-and-forward flag indicators
    'store_and_fwd_flag_Y',
    'store_and_fwd_flag_N',
    # pickup time parts
    'pickup_year',
    'pickup_month',
    'pickup_day',
    'pickup_hour',
    'pickup_minute',
    'pickup_second',
    'pickup_dayofweek',
    # vendor: raw id and indicators
    'vendor_id',
    'vendor_1',
    'vendor_2',
    # passengers: raw count and indicators
    'passenger_count',
    'passenger_0',
    'passenger_1',
    'passenger_2',
    'passenger_3',
    'passenger_4',
    'passenger_5',
    'passenger_6',
    'passenger_7',
    'passenger_8',
    'passenger_9',
]

#### 'id'는 단순 구분을 위한 값이고,  'pickup_datetime'의 경우 날짜형태라 제외
#### 'dropoff_datetime'는 test 데이터에 없어 사용할 수 없음. 'dropoff_longitude', 'dropoff_latitude'는 test 데이터에도 있지만 여기서는 변수로 선택하지 않음
#### 'store_and_fwd_flag'의 경우 Y와 N이 문자열이기 때문에 계산을 못함. 원핫인코딩만 사용
#### 'trip_duration'은 목표값이니 제외

In [None]:
basic_features : 모든 변수 포함
feature_names =['pickup_longitude', 'pickup_latitude',  
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
                  'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
                  'passenger_9']

features_1 : vendor_id와 passenger_count의 경우 원핫인코딩 제외
features_1 = ['pickup_longitude', 'pickup_latitude',  
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
              'vendor_id',
              'passenger_count']
    
features_2 : vendor_id와 passenger_count의 경우 원핫인코딩만 사용
features_2 = ['pickup_longitude', 'pickup_latitude',  
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
              'vendor_1', 'vendor_2',
              'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
              'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
              'passenger_9']
    
features_3 : 초(pickup_second) 제외 / vendor_id와 passenger_count의 경우 원핫인코딩만 사용
features_3 = ['pickup_longitude', 'pickup_latitude',  
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_dayofweek',
              'vendor_1', 'vendor_2',
              'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
              'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
              'passenger_9']

In [21]:
# Feature set without 'pickup_second'; vendor and passenger count enter only
# through their indicator columns (22 columns in total).
feature_names = (
    ['pickup_longitude', 'pickup_latitude']
    + ['store_and_fwd_flag_' + flag for flag in ('Y', 'N')]
    + ['pickup_' + part
       for part in ('year', 'month', 'day', 'hour', 'minute', 'dayofweek')]
    + ['vendor_{}'.format(vendor) for vendor in (1, 2)]
    + ['passenger_{}'.format(count) for count in range(10)]
)


In [15]:
# Full feature set (25 columns): raw vendor_id / passenger_count plus every
# indicator column and every pickup time part.
# NOTE(review): when the cells are run top to bottom, this assignment replaces
# the 22-column selection made in the previous cell, while the x_train/x_test
# outputs further down show 22 columns -- confirm which selection is intended.
feature_names = (
    ['pickup_longitude', 'pickup_latitude']
    + ['store_and_fwd_flag_Y', 'store_and_fwd_flag_N']
    + ['pickup_year', 'pickup_month', 'pickup_day']
    + ['pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek']
    + ['vendor_id', 'vendor_1', 'vendor_2']
    + ['passenger_count']
    + ['passenger_{}'.format(count) for count in range(10)]
)

### 2) 데이터 조정

In [22]:
# Model input for training: the columns of `train` listed in feature_names.
x_train = train[feature_names]
print(x_train.shape)
x_train.head()

(1458644, 22)


Unnamed: 0,pickup_longitude,pickup_latitude,store_and_fwd_flag_Y,store_and_fwd_flag_N,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_dayofweek,...,passenger_0,passenger_1,passenger_2,passenger_3,passenger_4,passenger_5,passenger_6,passenger_7,passenger_8,passenger_9
0,-73.982155,40.767937,False,True,2016,3,14,17,24,0,...,False,True,False,False,False,False,False,False,False,False
1,-73.980415,40.738564,False,True,2016,6,12,0,43,6,...,False,True,False,False,False,False,False,False,False,False
2,-73.979027,40.763939,False,True,2016,1,19,11,35,1,...,False,True,False,False,False,False,False,False,False,False
3,-74.01004,40.719971,False,True,2016,4,6,19,32,2,...,False,True,False,False,False,False,False,False,False,False
4,-73.973053,40.793209,False,True,2016,3,26,13,30,5,...,False,True,False,False,False,False,False,False,False,False


In [23]:
# Model input for prediction: the same feature_names columns taken from `test`.
x_test = test[feature_names]
print(x_test.shape)
x_test.head()

(625134, 22)


Unnamed: 0,pickup_longitude,pickup_latitude,store_and_fwd_flag_Y,store_and_fwd_flag_N,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_dayofweek,...,passenger_0,passenger_1,passenger_2,passenger_3,passenger_4,passenger_5,passenger_6,passenger_7,passenger_8,passenger_9
0,-73.988129,40.732029,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
1,-73.964203,40.679993,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
2,-73.997437,40.737583,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
3,-73.95607,40.7719,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
4,-73.970215,40.761475,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False


In [7]:
# The regression target is the 'trip_duration' column of the training frame.
label_name = "trip_duration"

y_train = train[label_name]
print(y_train.shape)
y_train.head()

(1458644,)


0     455
1     663
2    2124
3     429
4     435
Name: trip_duration, dtype: int64

In [8]:
# Train on log1p(trip_duration); each model's predictions are mapped back with
# np.expm1 before they are written to a submission file.
# The transform is computed from the raw column rather than from y_train itself,
# so re-running this cell cannot apply the logarithm a second time.
y_train = np.log1p(train[label_name])
y_train.head()

0    6.122493
1    6.498282
2    7.661527
3    6.063785
4    6.077642
Name: trip_duration, dtype: float64

####   
####    

## 3. Linear Regression

In [12]:
from sklearn import linear_model

# Fit a linear regression on the log1p-transformed target.
LR_model = linear_model.LinearRegression()
LR_model.fit(x_train, y_train)

# Predict in log space, then undo the log1p transform in one step.
predictions = np.expm1(LR_model.predict(x_test))
predictions[:10]

array([722.56916785, 897.60609149, 699.99670213, 670.02295395,
       665.14265145, 675.24490863, 707.08564857, 828.40092795,
       787.31657492, 694.75568353])

In [13]:
# Store the linear-regression predictions in the sample-submission frame and write it without the index.
submission['trip_duration'] = predictions
submission.to_csv('LR4.csv', index = False)

#### 결과보고서

In [None]:
LR / basic_features : 0.77677
LR / features_1     : 0.77714
LR / features_2     : 0.77677
LR / features_3     : 0.77677

####  
####  

## 4. Decision Tree Regressor

In [43]:
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the fitted tree reproducible: scikit-learn permutes
# the candidate features at every split, so without a seed two runs on the same
# data can differ and the scores for different feature sets are not comparable.
DT_model = DecisionTreeRegressor(random_state=42)
DT_model.fit(x_train, y_train)

# Predict in log space, then undo the log1p transform applied to the target.
predictions = DT_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([  72.,  192., 1311., 1290.,  974.,  814.,  464.,  265.,  338.,
        748.])

In [44]:
# Store the decision-tree predictions in the sample-submission frame and write it without the index.
submission['trip_duration'] = predictions
submission.to_csv('DC8.csv', index = False)

#### 결과보고서 

In [None]:
DT / basic_features : 1.07356
DT / features_1     : 1.07007
DT / features_2     : 1.07381
DT / features_3     : 1.07070

####   
####   

## 5. Random Forest Regressor

In [22]:
from sklearn.ensemble import RandomForestRegressor

# random_state fixes the bootstrap samples and per-split feature draws, so the
# scores reported for the different feature sets can be compared between runs.
# n_jobs=-1 builds the trees on all available cores; it does not change the
# fitted model.
RF_model = RandomForestRegressor(random_state=42, n_jobs=-1)
RF_model.fit(x_train, y_train)

# Predict in log space, then undo the log1p transform applied to the target.
predictions = RF_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([ 607.59663335,  542.54899759,  842.85520849,  730.95257284,
        659.34357294,  726.39376415,  470.19939027,  536.68291401,
        419.325969  , 1887.22982695])

In [23]:
# Store the random-forest predictions in the sample-submission frame and write it without the index.
submission['trip_duration'] = predictions
submission.to_csv('RF4.csv', index = False)

#### 결과보고서

In [None]:
RF / basic_features : 0.77799
RF / features_1     : 0.77918
RF / features_2     : 0.77863
RF / features_3     : 0.77942

####   
####   

## 6. XGBoost Regressor 

In [24]:
import xgboost as xgb

# XGBoost regressor with the library's default hyperparameters, trained on the
# log1p-transformed target.
XGB_model = xgb.XGBRegressor()
XGB_model.fit(x_train, y_train)

# Predict in log space and map back to the original scale in one step.
predictions = np.expm1(XGB_model.predict(x_test))
predictions[:10]

array([643.61865, 628.2386 , 643.61865, 546.9475 , 594.29193, 661.1439 ,
       636.3633 , 899.7914 , 681.5244 , 664.80994], dtype=float32)

In [25]:
# Store the XGBoost predictions in the sample-submission frame and write it without the index.
submission['trip_duration'] = predictions
submission.to_csv('XGB4.csv', index = False)

#### 결과보고서

In [None]:
XGB / basic_features : 0.74462
XGB / features_1     : 0.74462
XGB / features_2     : 0.74448
XGB / features_3     : 0.74448