# New York City Taxi Trip Duration 05 - HyperParameter in XGBoost

###   
### 목표 : XGBoost 모델을 HyperParmeter 조정을 통해 가장 좋은 모델로 업그레이드 한다.
###   
###   

### ● 그동안 놓쳤던 변수 'dropoff_longitude'과 'dropoff_latitude' 변수를 같이 활용한다.
### ● Parameter 값 조정을 통해 최소 수준의 성능의 모델을 완성한다.  
###   

## 1.  데이터 전처리

In [9]:
import pandas as pd
import numpy as np
from math import sin, cos, sqrt, atan2, radians 
import time
start = time.time()
print("시작")

train = pd.read_csv("train.csv", parse_dates = ['pickup_datetime', 'dropoff_datetime'])
test = pd.read_csv("test.csv", parse_dates = ['pickup_datetime'])
weather = pd.read_csv("weather.csv", parse_dates = ['datetime'])  # 데이터 전처리한 파일
submission = pd.read_csv("sample_submission.csv")

# 날짜데이터 분할

train['pickup_year'] = train['pickup_datetime'].dt.year  
train['pickup_month'] = train['pickup_datetime'].dt.month
train['pickup_day'] = train['pickup_datetime'].dt.day
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_minute'] = train['pickup_datetime'].dt.minute
train['pickup_second'] = train['pickup_datetime'].dt.second
train["pickup_dayofweek"] = train["pickup_datetime"].dt.dayofweek 

test['pickup_year'] = test['pickup_datetime'].dt.year  
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_day'] = test['pickup_datetime'].dt.day
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['pickup_minute'] = test['pickup_datetime'].dt.minute
test['pickup_second'] = test['pickup_datetime'].dt.second
test["pickup_dayofweek"] = test["pickup_datetime"].dt.dayofweek 

# 원핫인코딩

train['store_and_fwd_flag_Y'] = train['store_and_fwd_flag'] == 'Y'
train['store_and_fwd_flag_N'] = train['store_and_fwd_flag'] == 'N'

test['store_and_fwd_flag_Y'] = test['store_and_fwd_flag'] == 'Y'
test['store_and_fwd_flag_N'] = test['store_and_fwd_flag'] == 'N'


train['vendor_1'] = train['vendor_id'] == 1
train['vendor_2'] = train['vendor_id'] == 2

test['vendor_1'] = test['vendor_id'] == 1
test['vendor_2'] = test['vendor_id'] == 2


train['passenger_0'] = train['passenger_count'] == 0
train['passenger_1'] = train['passenger_count'] == 1
train['passenger_2'] = train['passenger_count'] == 2
train['passenger_3'] = train['passenger_count'] == 3
train['passenger_4'] = train['passenger_count'] == 4
train['passenger_5'] = train['passenger_count'] == 5
train['passenger_6'] = train['passenger_count'] == 6
train['passenger_7'] = train['passenger_count'] == 7
train['passenger_8'] = train['passenger_count'] == 8
train['passenger_9'] = train['passenger_count'] == 9

test['passenger_0'] = test['passenger_count'] == 0
test['passenger_1'] = test['passenger_count'] == 1
test['passenger_2'] = test['passenger_count'] == 2
test['passenger_3'] = test['passenger_count'] == 3
test['passenger_4'] = test['passenger_count'] == 4
test['passenger_5'] = test['passenger_count'] == 5
test['passenger_6'] = test['passenger_count'] == 6
test['passenger_7'] = test['passenger_count'] == 7
test['passenger_8'] = test['passenger_count'] == 8
test['passenger_9'] = test['passenger_count'] == 9

# 요일 원핫인코딩 추가

train["mon"] = train["pickup_dayofweek"] == 0
train["tue"] = train["pickup_dayofweek"] == 1
train["wed"] = train["pickup_dayofweek"] == 2
train["thu"] = train["pickup_dayofweek"] == 3
train["fri"] = train["pickup_dayofweek"] == 4
train["sat"] = train["pickup_dayofweek"] == 5
train["sun"] = train["pickup_dayofweek"] == 6

test["mon"] = test["pickup_dayofweek"] == 0
test["tue"] = test["pickup_dayofweek"] == 1
test["wed"] = test["pickup_dayofweek"] == 2
test["thu"] = test["pickup_dayofweek"] == 3
test["fri"] = test["pickup_dayofweek"] == 4
test["sat"] = test["pickup_dayofweek"] == 5
test["sun"] = test["pickup_dayofweek"] == 6

# 이동거리 추가(하버사인 + 맨하탄)

def haversine(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def haversine(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * np.arcsin(sqrt(a)) 
    haversine_km = 6367 * c
    return haversine_km

def manhattan_distance(train):
    lon1 = train['pickup_longitude']
    lat1 = train['pickup_latitude']
    lon2 = train['dropoff_longitude']
    lat2 = train['dropoff_latitude']
    manhattan_km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return manhattan_km

def manhattan_distance(test):
    lon1 = test['pickup_longitude']
    lat1 = test['pickup_latitude']
    lon2 = test['dropoff_longitude']
    lat2 = test['dropoff_latitude']
    km = (abs(lon1 - lon2)*111) + (abs(lat1 - lat2)*85.397)
    return km

train['haversine_distance'] = train.apply(lambda train: haversine(train), axis=1)
test['haversine_distance'] = test.apply(lambda test: haversine(test), axis=1)
train['manhattan_distance'] = train.apply(lambda train: manhattan_distance(train), axis=1)
test['manhattan_distance'] = test.apply(lambda test: manhattan_distance(test), axis=1)


# 날씨데이터 추가

weather['year'] = weather['datetime'].dt.year  
weather['month'] = weather['datetime'].dt.month
weather['day'] = weather['datetime'].dt.day

weather = weather.rename(columns = {'year' : 'pickup_year', 'month' : 'pickup_month', 'day' : 'pickup_day'})

train = pd.merge(train, weather)
test = pd.merge(test, weather)


print("완료!")
print('Running Time : %.02f초' % (time.time() - start))


시작
완료!
Running Time : 618.67초


####  
####   

## 2. 변수 선택 및 데이터 조정

### 1) 변수 선택

#### 지난 번 모델에서 빠졌던 변수 'dropoff_longitude', 'dropoff_latitude'와 '요일 별 원핫인코딩'이 추가했다.

#### 사용할 전체 변수

In [None]:
feature_names = [  'pickup_longitude', 'pickup_latitude',
                   'dropoff_longitude', 'dropoff_latitude',
                 
                   'pickup_year', 'pickup_month', 'pickup_day',
                   'pickup_hour', 'pickup_minute', 'pickup_second', 
                 
                   'pickup_dayofweek', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
                 
                   'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                   
                   'vendor_id', 'vendor_1', 'vendor_2',
                  
                   'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                   'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                   'passenger_7', 'passenger_8', 'passenger_9', 
                    
                
                   'haversine_distance', 'manhattan_distance', 
                 
                   'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                   'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                   'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                   'Precip. (mm)', 'rain', 'snow', 'fog']

In [None]:
basic_features = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']

features_2  :  원핫인코딩 이외 제거(vendor_id 와 passenger_count,  pickup_dayofweek)
features_2 =     ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second',
                  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_1', 'vendor_2',
                  'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']


features_3  : 원핫인코딩 제거
features_3 = ['pickup_longitude', 'pickup_latitude',
              'dropoff_longitude', 'dropoff_latitude',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
              'vendor_id', 'passenger_count', 
              'haversine_distance', 'manhattan_distance', 
              'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
              'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
              'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
              'Precip. (mm)', 'rain', 'snow', 'fog']

features_4  : 초 제거
features_4 = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_dayofweek',
                  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']


features_5  : 요일 제거 
features_5 = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']

features_6  : 요일 제거 + 초제거 
features_6 = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_dayofweek',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']

In [28]:
feature_names = features_4 = ['pickup_longitude', 'pickup_latitude',
                  'dropoff_longitude', 'dropoff_latitude',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_dayofweek',
                  'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 
                  'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6', 
                  'passenger_7', 'passenger_8', 'passenger_9', 
                  'haversine_distance', 'manhattan_distance', 
                  'T_high', 'T_avg', 'T_low', 'D_high', 'D_avg', 'D_low', 
                  'H_high', 'H_avg', 'H_low', 'S_high', 'S_avg', 'S_low', 
                  'V_high', 'V_avg', 'V_low', 'W_high', 'W_avg', 'W_high.1',
                  'Precip. (mm)', 'rain', 'snow', 'fog']

### 2) 데이터 조정

In [29]:
x_train = train[feature_names]
x_test = test[feature_names]
label_name = "trip_duration"
y_train = train[label_name]
y_train = np.log1p(y_train)

In [30]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(1458644, 57)
(625134, 57)
(1458644,)


####   
####  

## 3. XGBoost Regressor

#### 학습 및 예측

In [31]:
start = time.time()
print("시작")
import xgboost as xgb

XGB_model = xgb.XGBRegressor(max_depth = 12, n_estimators = 2000, min_child_weight = 8, 
                             gamma = 0.4, colsample_bytree = 0.6, seed = 25, 
                             learning_rate = 0.02, subsample = 0.8, 
                             nthread = 5).fit(x_train, y_train)

predictions = XGB_model.predict(x_test)
predictions = np.expm1(predictions)
print("완료")
print('학습 및 예측시간 : %.02f초' % (time.time() - start))
predictions[0:10]

시작
완료
학습 및 예측시간 : 5102.02초


array([ 848.9599 ,  597.46765,  420.14587,  956.4998 ,  347.9488 ,
        930.9195 , 1225.124  ,  866.6667 , 2754.348  ,  525.80725],
      dtype=float32)

In [None]:
f4 서브샘플 0.8

#### 결과 제출

In [32]:
submission['trip_duration'] = predictions
submission.to_csv('XGB102.csv', index = False)

####  
####  

## 4. 결과보고서 

#### 지난 'New York City Taxi Trip Duration 04 - 2nd Machine Learning'에선 가장 좋은 결과는 '0.42278'이었다.
#### 'dropoff_longitude', 'dropoff_latitude', '요일 원핫 인코딩'을 제외한 basic_features 변수들을 취하고,
#### max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4, colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9 의 hyperparameter 값을 가진다.
#### nthread의 경우 cpu 사용 갯수를 지정하는 조건이니 제외하고 나머지 hyperparameter 값을 좀 더 조장한다.

In [None]:
basic_features

max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4, 
colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9    : 0.38801
    
# subsample 만 조정
subsample = 0.8 : 0.38840

In [None]:
features_2

max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4, 
colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9    : 0.38825

# subsample 만 조정
subsample = 0.8    : 0.38881

In [None]:
features_3

max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4, 
colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9    : 0.38852

In [None]:
features_4

max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4, 
colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9    : 0.38773
    
# subsample 만 조정

subsample = 0.8    : 0.38814

In [None]:
features_5

max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4, 
colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9    : 0.38822

In [None]:
features_6

max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4, 
colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9    : 0.38779

In [None]:
basic_features

max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, seed = 25, learning_rate = 0.02, subsample = 0.9  : 0.42594
    
# n_estimators  : 2000        
# learning_rate : 0.02


    
# subsample 만 조정
  
subsample = 0.5 : 

    
# colsample_bytree 만 조정

colsample_bytree = 0.7 : 


# gamma 만 조정

gamma = 0.3 : 
    
# min_child_weight 만 조정

min_child_weight = 9 : 0.72810
    
# seed만 조정  

seed = 30 : 0.72804
    
