# New York City Taxi Trip Duration 02 - Machine Learning

###   
### 목표 : 데이터를 가지고 택시 주행 시간(trip_duration) 예측 모델 구축
###   
###   

## 1. 데이터 전처리 

In [55]:
import pandas as pd
import numpy as np

# Load the training, test and sample-submission CSVs. The timestamp columns
# are parsed as datetimes so the .dt accessor can be used on them afterwards.
train = pd.read_csv(
    "train.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
)
test = pd.read_csv("test.csv", parse_dates=["pickup_datetime"])
submission = pd.read_csv("sample_submission.csv")

def _add_model_features(df):
    """Add pickup-time parts and indicator columns to ``df`` in place.

    The same columns are created, in the same order, for whichever frame is
    passed in:

    * ``pickup_year`` ... ``pickup_dayofweek`` taken from ``pickup_datetime``
    * ``store_and_fwd_flag_Y`` / ``store_and_fwd_flag_N``
    * ``vendor_1`` / ``vendor_2``
    * ``passenger_0`` ... ``passenger_9``

    All indicator columns are boolean (result of an ``==`` comparison).
    """
    # Split the pickup timestamp into its individual components.
    pickup = df['pickup_datetime'].dt
    for part in ('year', 'month', 'day', 'hour', 'minute', 'second',
                 'dayofweek'):
        df['pickup_' + part] = getattr(pickup, part)

    # One-hot encode the string flag so it can be used numerically.
    for flag in ('Y', 'N'):
        df['store_and_fwd_flag_' + flag] = df['store_and_fwd_flag'] == flag

    # One-hot encode the vendor id.
    for vendor in (1, 2):
        df['vendor_' + str(vendor)] = df['vendor_id'] == vendor

    # One-hot encode the passenger count for the values 0 through 9.
    for count in range(10):
        df['passenger_' + str(count)] = df['passenger_count'] == count


# Run one shared routine on both frames so train and test cannot drift apart.
_add_model_features(train)
_add_model_features(test)


####     
####  

## 2. 변수 선택 및 데이터 조정

### 1) 변수 선택

In [4]:
# Show the size of the training frame and list its columns after preprocessing.
print(train.shape)
train.columns

(1458644, 32)


Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_year', 'pickup_month', 'pickup_day',
       'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
       'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 'vendor_1', 'vendor_2',
       'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
       'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7',
       'passenger_8', 'passenger_9'],
      dtype='object')

In [5]:
# Show the size of the test frame and list its columns after preprocessing.
print(test.shape)
test.columns

(625134, 30)


Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'pickup_year', 'pickup_month',
       'pickup_day', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_dayofweek', 'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
       'vendor_1', 'vendor_2', 'passenger_0', 'passenger_1', 'passenger_2',
       'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6',
       'passenger_7', 'passenger_8', 'passenger_9'],
      dtype='object')

#### 모델링에 필요한 변수들 선택

In [46]:
# Full candidate feature set: pickup location, flag indicators, pickup-time
# parts, vendor (raw id plus indicators) and passenger count (raw value plus
# indicators for 0 through 9).
feature_names = (
    ['pickup_longitude', 'pickup_latitude']
    + ['store_and_fwd_flag_' + flag for flag in ('Y', 'N')]
    + ['pickup_' + part
       for part in ('year', 'month', 'day', 'hour', 'minute', 'second',
                    'dayofweek')]
    + ['vendor_id', 'vendor_1', 'vendor_2']
    + ['passenger_count']
    + ['passenger_' + str(count) for count in range(10)]
)

#### 'id'는 단순 구분을 위한 값이고,  'pickup_datetime'의 경우 날짜형태라 제외
#### 'dropoff_datetime'은 test 데이터에 없어서 제외. 'dropoff_longitude', 'dropoff_latitude'는 test 데이터에도 있지만 여기서는 사용하지 않음
#### 'store_and_fwd_flag'의 경우 Y와 N이 문자열이기 때문에 계산을 못함. 원핫인코딩만 사용
#### 'trip_duration'은 목표값이니 제외

In [None]:
# basic_features: every candidate variable, including the one-hot columns.
feature_names = ['pickup_longitude', 'pickup_latitude',
                 'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
                 'pickup_year', 'pickup_month', 'pickup_day',
                 'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                 'vendor_id', 'vendor_1', 'vendor_2',
                 'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
                 'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8',
                 'passenger_9']

# features_1: vendor_id and passenger_count are kept as raw values only;
# their one-hot encoded columns are left out.
features_1 = ['pickup_longitude', 'pickup_latitude',
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
              'vendor_id',
              'passenger_count']
    

In [56]:
# Feature set actually used for the model below: the raw vendor_id and
# passenger_count values are kept, their one-hot columns are not.
feature_names = (
    ['pickup_longitude', 'pickup_latitude']
    + ['store_and_fwd_flag_' + flag for flag in ('Y', 'N')]
    + ['pickup_' + part
       for part in ('year', 'month', 'day', 'hour', 'minute', 'second',
                    'dayofweek')]
    + ['vendor_id', 'passenger_count']
)
    

### 2) 데이터 조정

In [57]:
# Build the training feature matrix from the selected columns and preview it.
x_train = train[feature_names]
print(x_train.shape)
x_train.head()

(1458644, 13)


Unnamed: 0,pickup_longitude,pickup_latitude,store_and_fwd_flag_Y,store_and_fwd_flag_N,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,vendor_id,passenger_count
0,-73.982155,40.767937,False,True,2016,3,14,17,24,55,0,2,1
1,-73.980415,40.738564,False,True,2016,6,12,0,43,35,6,1,1
2,-73.979027,40.763939,False,True,2016,1,19,11,35,24,1,2,1
3,-74.01004,40.719971,False,True,2016,4,6,19,32,31,2,2,1
4,-73.973053,40.793209,False,True,2016,3,26,13,30,55,5,2,1


In [58]:
# Build the test feature matrix with the same columns, in the same order, as x_train.
x_test = test[feature_names]
print(x_test.shape)
x_test.head()

(625134, 13)


Unnamed: 0,pickup_longitude,pickup_latitude,store_and_fwd_flag_Y,store_and_fwd_flag_N,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_dayofweek,vendor_id,passenger_count
0,-73.988129,40.732029,False,True,2016,6,30,23,59,58,3,1,1
1,-73.964203,40.679993,False,True,2016,6,30,23,59,53,3,1,1
2,-73.997437,40.737583,False,True,2016,6,30,23,59,47,3,1,1
3,-73.95607,40.7719,False,True,2016,6,30,23,59,41,3,2,1
4,-73.970215,40.761475,False,True,2016,6,30,23,59,33,3,1,1


In [59]:
# Name of the target column; it is present in train but not in test.
label_name = "trip_duration"

# Extract the target as a Series and inspect its shape and first values.
y_train = train[label_name]
print(y_train.shape)
y_train.head()

(1458644,)


0     455
1     663
2    2124
3     429
4     435
Name: trip_duration, dtype: int64

In [60]:
# Train on log1p(trip_duration); predictions are mapped back with np.expm1
# after the model has been fit.
y_train = np.log1p(y_train)
y_train.head()

0    6.122493
1    6.498282
2    7.661527
3    6.063785
4    6.077642
Name: trip_duration, dtype: float64

####   
####    

## 3. Linear Regression

In [61]:
from sklearn import linear_model

# Fit an ordinary least-squares model on the log1p-transformed target.
LR_model = linear_model.LinearRegression().fit(x_train, y_train)

# The model predicts in log1p space, so invert with expm1 to get values on
# the original trip_duration scale.
predictions = np.expm1(LR_model.predict(x_test))
predictions[:10]

array([729.80673705, 906.99512094, 707.05120278, 675.30260338,
       671.79502179, 682.08701454, 714.38791486, 797.50526717,
       755.98802116, 700.34034333])

In [62]:
# Put the predictions into the sample-submission frame and write it to disk
# without the index column.
submission['trip_duration'] = predictions
submission.to_csv('DC2.csv', index = False)

#### 결과보고서

In [None]:
# LR / basic_features : 0.77677
# LR / features_1     : 0.77714