# New York City Taxi Trip Duration 02 - Machine Learning

###   
### 목표 : 데이터를 가지고 택시 주행거리 예측 모델 구축
###   
###   

## 1. 데이터 전처리 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train = pd.read_csv("train.csv", parse_dates = ['pickup_datetime', 'dropoff_datetime'])
test = pd.read_csv("test.csv", parse_dates = ['pickup_datetime'])
submission = pd.read_csv("sample_submission.csv")

train['pickup_year'] = train['pickup_datetime'].dt.year  
train['pickup_month'] = train['pickup_datetime'].dt.month
train['pickup_day'] = train['pickup_datetime'].dt.day
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_minute'] = train['pickup_datetime'].dt.minute
train['pickup_second'] = train['pickup_datetime'].dt.second
train["pickup_dayofweek"] = train["pickup_datetime"].dt.dayofweek 

test['pickup_year'] = test['pickup_datetime'].dt.year  
test['pickup_month'] = test['pickup_datetime'].dt.month
test['pickup_day'] = test['pickup_datetime'].dt.day
test['pickup_hour'] = test['pickup_datetime'].dt.hour
test['pickup_minute'] = test['pickup_datetime'].dt.minute
test['pickup_second'] = test['pickup_datetime'].dt.second
test["pickup_dayofweek"] = test["pickup_datetime"].dt.dayofweek 


train['store_and_fwd_flag_Y'] = train['store_and_fwd_flag'] == 'Y'
train['store_and_fwd_flag_N'] = train['store_and_fwd_flag'] == 'N'

test['store_and_fwd_flag_Y'] = test['store_and_fwd_flag'] == 'Y'
test['store_and_fwd_flag_N'] = test['store_and_fwd_flag'] == 'N'


train['vendor_1'] = train['vendor_id'] == 1
train['vendor_2'] = train['vendor_id'] == 2

test['vendor_1'] = test['vendor_id'] == 1
test['vendor_2'] = test['vendor_id'] == 2


train['passenger_0'] = train['passenger_count'] == 0
train['passenger_1'] = train['passenger_count'] == 1
train['passenger_2'] = train['passenger_count'] == 2
train['passenger_3'] = train['passenger_count'] == 3
train['passenger_4'] = train['passenger_count'] == 4
train['passenger_5'] = train['passenger_count'] == 5
train['passenger_6'] = train['passenger_count'] == 6
train['passenger_7'] = train['passenger_count'] == 7
train['passenger_8'] = train['passenger_count'] == 8
train['passenger_9'] = train['passenger_count'] == 9

test['passenger_0'] = test['passenger_count'] == 0
test['passenger_1'] = test['passenger_count'] == 1
test['passenger_2'] = test['passenger_count'] == 2
test['passenger_3'] = test['passenger_count'] == 3
test['passenger_4'] = test['passenger_count'] == 4
test['passenger_5'] = test['passenger_count'] == 5
test['passenger_6'] = test['passenger_count'] == 6
test['passenger_7'] = test['passenger_count'] == 7
test['passenger_8'] = test['passenger_count'] == 8
test['passenger_9'] = test['passenger_count'] == 9

print("완료!")

완료!


####     
####  

## 2. 변수 선택 및 데이터 조정

### 1) 변수 선택

In [2]:
print(train.shape)
train.columns

(1458644, 32)


Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_year', 'pickup_month', 'pickup_day',
       'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
       'store_and_fwd_flag_Y', 'store_and_fwd_flag_N', 'vendor_1', 'vendor_2',
       'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
       'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7',
       'passenger_8', 'passenger_9'],
      dtype='object')

In [3]:
print(test.shape)
test.columns

(625134, 30)


Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'pickup_year', 'pickup_month',
       'pickup_day', 'pickup_hour', 'pickup_minute', 'pickup_second',
       'pickup_dayofweek', 'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
       'vendor_1', 'vendor_2', 'passenger_0', 'passenger_1', 'passenger_2',
       'passenger_3', 'passenger_4', 'passenger_5', 'passenger_6',
       'passenger_7', 'passenger_8', 'passenger_9'],
      dtype='object')

#### 모델링에 필요한 변수들 선택

In [None]:
feature_names = [ 'pickup_longitude', 'pickup_latitude',  
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
                  'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
                  'passenger_9']

#### 'id'는 단순 구분을 위한 값이고,  'pickup_datetime'의 경우 날짜형태라 제외
#### 'dropoff_longitude', 'dropoff_latitude', 'dropoff_datetime'는 test 데이터에 없음
#### 'store_and_fwd_flag'의 경우 Y와 N이 문자열이기 때문에 계산을 못함. 원핫인코딩만 사용
#### 'trip_duration'은 목표값이니 제외

In [None]:
basic_features : 모든 변수 포함
feature_names =['pickup_longitude', 'pickup_latitude',  
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
                  'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
                  'passenger_9']

feautres_1 : vendor_id와 passenger_count의 경우 원핫인코딩 제외
feautres_1 = ['pickup_longitude', 'pickup_latitude',  
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
              'vendor_id',
              'passenger_count']
    
feautres_2 : vendor_id와 passenger_count의 경우 원핫인코딩만 사용
feautres_2 = ['pickup_longitude', 'pickup_latitude',  
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
              'vendor_1', 'vendor_2',
              'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
              'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
              'passenger_9']
    
feautres_3 :  초 제외 /vendor_id와 passenger_count의 경우 원핫인코딩만 사용
feautres_3 = ['pickup_longitude', 'pickup_latitude',  
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_year', 'pickup_month', 'pickup_day',
              'pickup_hour', 'pickup_minute', 'pickup_dayofweek',
              'vendor_1', 'vendor_2',
              'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
              'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
              'passenger_9']

feautres_4 :  날짜 제외 / vendor_id와 passenger_count의 경우 원핫인코딩만 사용
feautres_4 = ['pickup_longitude', 'pickup_latitude',  
              'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
              'pickup_hour', 'pickup_minute', 'pickup_dayofweek',
              'vendor_1', 'vendor_2',
              'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
              'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
              'passenger_9']

In [2]:
feature_names =['pickup_longitude', 'pickup_latitude',  
                  'store_and_fwd_flag_Y', 'store_and_fwd_flag_N',
                  'pickup_year', 'pickup_month', 'pickup_day',
                  'pickup_hour', 'pickup_minute', 'pickup_second', 'pickup_dayofweek',
                  'vendor_id', 'vendor_1', 'vendor_2',
                  'passenger_count', 'passenger_0', 'passenger_1', 'passenger_2', 'passenger_3',
                  'passenger_4', 'passenger_5', 'passenger_6', 'passenger_7', 'passenger_8', 
                  'passenger_9']  # basic

# RF

### 2) 데이터 조정

In [3]:
x_train = train[feature_names]
print(x_train.shape)
x_train.head()

(1458644, 22)


Unnamed: 0,pickup_longitude,pickup_latitude,store_and_fwd_flag_Y,store_and_fwd_flag_N,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_dayofweek,...,passenger_0,passenger_1,passenger_2,passenger_3,passenger_4,passenger_5,passenger_6,passenger_7,passenger_8,passenger_9
0,-73.982155,40.767937,False,True,2016,3,14,17,24,0,...,False,True,False,False,False,False,False,False,False,False
1,-73.980415,40.738564,False,True,2016,6,12,0,43,6,...,False,True,False,False,False,False,False,False,False,False
2,-73.979027,40.763939,False,True,2016,1,19,11,35,1,...,False,True,False,False,False,False,False,False,False,False
3,-74.01004,40.719971,False,True,2016,4,6,19,32,2,...,False,True,False,False,False,False,False,False,False,False
4,-73.973053,40.793209,False,True,2016,3,26,13,30,5,...,False,True,False,False,False,False,False,False,False,False


In [4]:
x_test = test[feature_names]
print(x_test.shape)
x_test.head()

(625134, 22)


Unnamed: 0,pickup_longitude,pickup_latitude,store_and_fwd_flag_Y,store_and_fwd_flag_N,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_dayofweek,...,passenger_0,passenger_1,passenger_2,passenger_3,passenger_4,passenger_5,passenger_6,passenger_7,passenger_8,passenger_9
0,-73.988129,40.732029,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
1,-73.964203,40.679993,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
2,-73.997437,40.737583,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
3,-73.95607,40.7719,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False
4,-73.970215,40.761475,False,True,2016,6,30,23,59,3,...,False,True,False,False,False,False,False,False,False,False


In [5]:
label_name = "trip_duration"

y_train = train[label_name]
print(y_train.shape)
y_train.head()

(1458644,)


0     455
1     663
2    2124
3     429
4     435
Name: trip_duration, dtype: int64

In [6]:
y_train = np.log1p(y_train)
y_train.head()

0    6.122493
1    6.498282
2    7.661527
3    6.063785
4    6.077642
Name: trip_duration, dtype: float64

####   
####    

## 3. Linear Regression

#### 기본

In [8]:
from sklearn import linear_model

LR_model = linear_model.LinearRegression()
LR_model.fit(x_train, y_train)

predictions = LR_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([722.4012868 , 897.43401767, 699.8967529 , 669.96028581,
       665.1235513 , 675.24198506, 707.16894828, 828.53860806,
       787.15288412, 694.66236882])

#### hyperparameters 조정

In [2]:
from sklearn import linear_model
LR_model = linear_model.LinearRegression()
LR_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
from sklearn import linear_model
LR_model = linear_model.LinearRegression(fit_intercept=False)
LR_model.fit(x_train, y_train)

predictions = LR_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([722.40128681, 897.43401768, 699.89675291, 669.96028582,
       665.12355131, 675.24198507, 707.16894829, 828.53860807,
       787.15288413, 694.66236883])

#### 제출

In [17]:
submission['trip_duration'] = predictions
submission.to_csv('LR8.csv', index = False)

#### 결과보고서

In [None]:
LR / basic_features : 0.77677
LR / features_1     : 0.77714
LR / features_2     : 0.77677
LR / features_3     : 0.77677
LR / features_4     : 0.77762
    
LR / basic_features / normalize = True : 0.95757
LR / basic_features / normalize=True  fit_intercept=False : 0.77677
LR / basic_features / fit_intercept=False : 0.77677    

####  
####  

## 4. Decision Tree Regressor

#### 기본

In [9]:
from sklearn.tree import DecisionTreeRegressor

DT_model = DecisionTreeRegressor()
DT_model.fit(x_train, y_train)

predictions = DT_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([ 452.,  279., 2214., 1025.,  249., 1464.,  760.,    3.,  184.,
        222.])

#### hyperparameters 조정

In [18]:
from sklearn.tree import DecisionTreeRegressor

DT_model = DecisionTreeRegressor()
DT_model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [None]:
from sklearn.tree import DecisionTreeRegressor

DT_model = DecisionTreeRegressor(criterion='mae',
                                 max_depth = 10, random_state = 50)     # 12시 35분 시작
DT_model.fit(x_train, y_train)

predictions = DT_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

#### 제출

In [37]:
submission['trip_duration'] = predictions
submission.to_csv('DC19.csv', index = False)

#### 결과보고서 

In [None]:
DT / basic_features : 1.07356
DT / features_1     : 1.07007
DT / features_2     : 1.07381
DT / features_3     : 1.07070
DT / features_4     : 1.06416
    
DT / basic_features / max_depth = 12 random_state = 50 : 0.74603
DT / basic_features / max_depth = 10 random_state = 50 : 0.74505
DT / basic_features / max_depth = 8 random_state = 50 :  0.74690
DT / basic_features / max_depth = 9 random_state = 50 :  0.74551

DT / features_4  / max_depth = 10 random_state = 50 : 0.74460
DT / features_4  / max_depth = 10 random_state = 50 max_features = 'auto' :  0.74460
DT / features_4  / max_depth = 10 random_state = 50 max_features = 'sqrt' :  0.75377
DT / features_4  / max_depth = 10 random_state = 50 max_features = 'log2' :  0.75377
    
DT / features_4  / max_depth = 10 random_state = 50 : 0.74460 criterion='friedman_mse'  : 0.75377   
DT / features_4  / max_depth = 10 random_state = 50 : 0.74460 criterion='mae'  : 너무 오래걸려 실패

####   
####   

## 5. Random Forest Regressor

#### 기본

In [11]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model.fit(x_train, y_train)

predictions = RF_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([ 493.97920857,  471.46022727, 1365.54733309, 1155.00641377,
        720.39310441,  924.81350014,  808.66244501,  349.61084641,
        638.72083385,  700.85066106])

#### hyperparameters 조정

In [7]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [33]:
import time
start = time.time()

from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor(n_estimators = 75)

RF_model.fit(x_train, y_train)

predictions = RF_model.predict(x_test)
predictions = np.expm1(predictions)


print('Running Time : %.02f초' % (time.time() - start))
predictions[0:10]

Running Time : 1061.50초


array([613.54855286, 695.37340227, 620.47647402, 515.89588524,
       568.42534459, 596.16213353, 542.25022547, 613.21597493,
       585.26054582, 578.17163974])

#### 제출

In [34]:
submission['trip_duration'] = predictions
submission.to_csv('RF20.csv', index = False)

#### 결과보고서

In [None]:
RF / basic_features : 0.77799
RF / features_1     : 0.77918
RF / features_2     : 0.77863
RF / features_3     : 0.77942
RF / features_4     : 0.78734
    
RF / basic_features / random_state =  50 : 0.77764
RF / basic_features / n_estimators = 100 random_state =  50 : 0.74300
RF / basic_features / n_estimators = 100 random_state =  50 max_depth = 20 max_features = 8 : 0.73541
RF / basic_features / n_estimators = 100 random_state =  50 max_depth = 20 max_features = 10 : 0.73423
RF / basic_features / n_estimators = 100 random_state =  50 max_depth = 30 max_features = 10 : 0.73765   
RF / basic_features / n_estimators = 100 random_state =  50 max_depth = 20 max_features = 12 : 0.73380 ☆
RF / basic_features / n_estimators = 100 random_state =  40 max_depth = 20 max_features = 12 : 0.73385
RF / basic_features / n_estimators = 100 random_state =  60 max_depth = 20 max_features = 12 : 0.73388

    
RF / basic_features / n_estimators = 200 random_state =  50 : 0.74087
RF / basic_features / n_estimators = 200 random_state =  50 max_depth = 20 max_features = 10 : 0.73379 
    
RF / basic_features / n_estimators = 300 random_state =  50 : 메모리 오류
    
RF / basic_features / n_estimators = 50 random_state =  50 max_depth = 20 max_features = 12 : 0.73416  
RF / basic_features / n_estimators = 75 random_state =  50 max_depth = 20 max_features = 12 : 0.73359 ☆ 
RF / basic_features / n_estimators = 75 : 0.74591

####   
####   

## 6. Gradient Boost Regressor

#### 기본

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

GB_model = GradientBoostingRegressor()
GB_model.fit(x_train, y_train)

predictions = GB_model.predict(x_test)
predcitions = np.expm1(predictions)
predictions[0:10]

array([6.47258578, 6.43930132, 6.47258578, 6.31684105, 6.39497078,
       6.50748801, 6.46367237, 6.80874323, 6.514045  , 6.50444774])

#### hyperparameters 조정

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

GB_model = GradientBoostingRegressor()
GB_model

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [27]:
import time
start = time.time()

from sklearn.ensemble import GradientBoostingRegressor

GB_model = GradientBoostingRegressor(n_estimators = 550, random_state =  50)
GB_model.fit(x_train, y_train)

predictions = GB_model.predict(x_test)
predcitions = np.expm1(predictions)

print('Running Time : %.02f초' % (time.time() - start))
predictions[0:10]

Running Time : 1045.19초


array([6.43344207, 6.25316501, 6.39557329, 6.22814576, 6.33882377,
       6.55647984, 6.40817036, 6.84669248, 6.51057977, 6.47189709])

#### 제출

In [28]:
submission['trip_duration'] = predictions
submission.to_csv('GB5.csv', index = False)

#### 결과보고서

In [None]:
GB / basic_features : 4.52659
GB / features_1     : 4.52659
    
GB / basic_features / n_estimators = 750 random_state =  50 : 4.52634
GB / basic_features / n_estimators = 550 random_state =  50 : 4.52636

####   
####   

## 7. XGBoost Regressor 

#### 기본

In [11]:
import xgboost as xgb

XGB_model = xgb.XGBRegressor()
XGB_model.fit(x_train, y_train)

predictions = XGB_model.predict(x_test)
predictions = np.expm1(predictions)
predictions[0:10]

array([643.61865, 628.2386 , 643.61865, 546.9475 , 594.29193, 661.1439 ,
       636.3633 , 899.7914 , 681.5244 , 664.80994], dtype=float32)

#### hyperparameters 조정

In [12]:
import xgboost as xgb

XGB_model = xgb.XGBRegressor()
XGB_model

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [25]:
import time
start = time.time()

import xgboost as xgb

XGB_model = xgb.XGBRegressor(max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
                            colsample_bytree = 0.6, nthread = 5, subsample = 0.6, seed = 25, learning_rate = 0.1)
XGB_model.fit(x_train, y_train)

predictions = XGB_model.predict(x_test)
predictions = np.expm1(predictions)

print('Running Time : %.02f초' % (time.time() - start))
predictions[0:10]

Running Time : 286.08초


array([721.39355, 591.198  , 648.9896 , 602.868  , 721.0041 , 682.9645 ,
       594.0169 , 942.6936 , 696.6244 , 654.05756], dtype=float32)

#### 제출

In [26]:
submission['trip_duration'] = predictions
submission.to_csv('XGB18.csv', index = False)

#### 결과보고서

In [None]:
XGB / basic_features : 0.74462
XGB / features_1     : 0.74462
XGB / features_2     : 0.74448 ☆
XGB / features_3     : 0.74448 ☆
XGB / features_4     : 0.74536

    
XGB / feautres_2 / max_depth = 12, n_estimators = 2000, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5, subsample = 0.6, seed = 25,learning_rate = 0.05       : 0.73437
XGB / feautres_2 / max_depth = 12, n_estimators = 1500, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5, subsample = 0.6, seed = 25,learning_rate = 0.05       : 0.73231 ☆
XGB / feautres_2 / n_estimators = 1500, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5, subsample = 0.6, seed = 25,learning_rate = 0.05       : 0.73822
XGB / feautres_2 / max_depth = 14, n_estimators = 1500, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5, subsample = 0.6, seed = 25,learning_rate = 0.05       : 0.73705
    
# n_estimators만 조정 
max_depth = 12, min_child_weight = 8, gamma = 0.4, learning_rate = 0.05
colsample_bytree = 0.6, nthread = 5, subsample = 0.6, seed = 25

XGB / feautres_3 / n_estimators = 1500 : 0.73231
XGB / feautres_3 / n_estimators = 1400 : 0.73093
XGB / feautres_3 / n_estimators = 1200 : 0.73031
XGB / feautres_3 / n_estimators = 1000 : 0.72973
XGB / feautres_3 / n_estimators = 500  : 0.72891   
XGB / feautres_3 / n_estimators = 300  : 0.72880 ☆
XGB / feautres_3 / n_estimators = 100  : 0.73256    
XGB / feautres_3 / n_estimators = 200  : 0.72922  

# learning_rate만 조정
max_depth = 12, n_estimators = 300, min_child_weight = 8, gamma = 0.4,
colsample_bytree = 0.6, nthread = 5, subsample = 0.6, seed = 25

XGB / feautres_3 / learning_rate = 0.01       : 0.79178
XGB / feautres_3 / learning_rate = 0.1        : 0.73046   