# XGBoost(Extreme Gradient Boosting) 
### : 고성능 그레디언트 부스팅 머신(GBM)

- 그레디언트 부스팅 : 앙상블 학습법, 여러 개의 약한 모델을 순차적으로 학습시켜 강한 모델을 만든다. 각 모델은 이전 모델이 가진 오류를 줄이도록 학습된다.

- 초기 단순한 모델 -> 오류 계산 -> 새 모델 학습 -> 모델 결합

1. 정규화 : 모델의 복잡도를 낮춘다.(과적합을 방지) 모델의 복잡도의 패널티를 부여하여 단순한 모델을 설계할 수 있도록 유도
  
2. 희소성 : 트리 분할 시 결측값을 처리하는 기능이 내장되어 있어 희소 데이터를 효율적으로 처리
3. 대규모 : 대규모 데이터 셋을 효율적으로 분할하기 위해 가중치 분위 스케치를 사용한다.
4. 병렬화 : 트리의 각 노트를 병렬로 처리하여 학습속도가 빠르다.ㅑ

In [44]:
import xgboost as xgb
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

In [45]:
boston = fetch_openml(name='boston', version=1, as_frame=True)

In [46]:
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target

In [47]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [48]:
y[:10]

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
5    28.7
6    22.9
7    27.1
8    16.5
9    18.9
Name: MEDV, dtype: float64

In [49]:
# CHAS, RAD 데이터 카테고리 형식으로 수정
X = pd.get_dummies(X)

X.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,...,CHAS_1,RAD_1,RAD_2,RAD_24,RAD_3,RAD_4,RAD_5,RAD_6,RAD_7,RAD_8
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,296.0,15.3,396.9,...,False,True,False,False,False,False,False,False,False,False
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,...,False,False,True,False,False,False,False,False,False,False
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,...,False,False,True,False,False,False,False,False,False,False
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,...,False,False,False,False,True,False,False,False,False,False
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,...,False,False,False,False,True,False,False,False,False,False


In [50]:
# 데이터 분할(test, train)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
print(xgb)

<module 'xgboost' from '/Users/jeon-yewon/miniforge3/envs/tensorflow/lib/python3.9/site-packages/xgboost/__init__.py'>


In [52]:
import xgboost as xgb

In [53]:
# XGBoost 전용 데이터 셋 : DMatrix
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [54]:
params = {
    'objective' : 'reg:squarederror',
    'max_depth' : 4,
    'eta' : 0.1,
    'sumsample' : 0.8,
    'colsample_bytree' : 0.8
}

In [55]:
bst = xgb.train(params, dtrain, num_boost_round = 100)
preds = bst.predict(dtest)

Parameters: { "sumsample" } are not used.



In [56]:
print(preds[:10], y_test[:10])

[24.973385 32.36859  16.751245 24.43808  16.980234 22.029192 18.818548
 14.781816 20.804693 20.602318] 173    23.6
274    32.4
491    13.6
72     22.8
452    16.1
76     20.0
316    17.8
140    14.0
471    19.6
500    16.8
Name: MEDV, dtype: float64


In [57]:
import pandas as pd
import numpy as np
import os
import json

In [63]:
DATA_IN_PATH = '/Users/jeon-yewon/Desktop/데이터 분석 강의/부트캠프/11주차/'
DATA_OUT_PATH = '/Users/jeon-yewon/Desktop/데이터 분석 강의/부트캠프/11주차/'

train_q1_data_file = 'train_q1.npy'
train_q2_data_file = 'train_q2.npy'
train_label_data_file = 'train_label.npy'

train_q1_data = np.load(open(DATA_IN_PATH + train_q1_data_file, 'rb'))
train_q2_data = np.load(open(DATA_IN_PATH + train_q2_data_file, 'rb'))
train_labels = np.load(open(DATA_IN_PATH + train_label_data_file, 'rb'))

In [64]:
train_input = np.stack((train_q1_data, train_q2_data), axis=1)

In [65]:
print(train_input.shape)

(298526, 2, 31)


In [67]:
from sklearn.model_selection import train_test_split

train_input, eval_input, train_label, eval_label = train_test_split(train_input, train_labels, test_size=0.2, random_state=42)

In [69]:
train_data = xgb.DMatrix(train_input.sum(axis=1), label=train_label)
eval_data = xgb.DMatrix(eval_input.sum(axis=1), label=eval_label)

In [70]:
train_data, eval_data

(<xgboost.core.DMatrix at 0x15c9a0490>, <xgboost.core.DMatrix at 0x15c9b67f0>)

In [71]:
params = {
    'objective' : 'binary:logistic',
    'eval_metric' : 'rmse'
}

In [73]:
data_list = [(train_data, 'train'), (eval_data, 'valid')]
bst = xgb.train(params, train_data, num_boost_round=1000, early_stopping_rounds=10, evals=data_list)

[0]	train-rmse:0.48416	valid-rmse:0.48442
[1]	train-rmse:0.47384	valid-rmse:0.47438
[2]	train-rmse:0.46718	valid-rmse:0.46804
[3]	train-rmse:0.46261	valid-rmse:0.46370
[4]	train-rmse:0.45931	valid-rmse:0.46055
[5]	train-rmse:0.45628	valid-rmse:0.45788
[6]	train-rmse:0.45438	valid-rmse:0.45614
[7]	train-rmse:0.45203	valid-rmse:0.45393
[8]	train-rmse:0.45041	valid-rmse:0.45254
[9]	train-rmse:0.44936	valid-rmse:0.45163
[10]	train-rmse:0.44872	valid-rmse:0.45112
[11]	train-rmse:0.44710	valid-rmse:0.44963
[12]	train-rmse:0.44533	valid-rmse:0.44810
[13]	train-rmse:0.44477	valid-rmse:0.44771
[14]	train-rmse:0.44435	valid-rmse:0.44740
[15]	train-rmse:0.44355	valid-rmse:0.44673
[16]	train-rmse:0.44225	valid-rmse:0.44556
[17]	train-rmse:0.44193	valid-rmse:0.44544
[18]	train-rmse:0.44160	valid-rmse:0.44527
[19]	train-rmse:0.44128	valid-rmse:0.44509
[20]	train-rmse:0.44084	valid-rmse:0.44471
[21]	train-rmse:0.43961	valid-rmse:0.44362
[22]	train-rmse:0.43919	valid-rmse:0.44336
[23]	train-rmse:0.438

In [74]:
test_q1_data_file = 'test_q1.npy'
test_q2_data_file = 'test_q2.npy'
test_id_data_file = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + test_q1_data_file, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + test_q2_data_file, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + test_id_data_file, 'rb'), allow_pickle=True)

In [75]:
test_input = np.stack((test_q1_data, test_q2_data), axis=1)
test_data = xgb.DMatrix(test_input.sum(axis=1))

In [76]:
test_pred = bst.predict(test_data)

In [77]:
test_pred

array([0.29748943, 0.6804871 , 0.7502862 , ..., 0.5774315 , 0.48275614,
       0.00831276], dtype=float32)

In [78]:
test_pred = (test_pred >= 0.5).astype(int)
test_pred

array([0, 1, 1, ..., 1, 0, 0])

In [79]:
output = pd.DataFrame({'test_id':test_id_data, 'is_duplicate':test_pred})
output.to_csv(DATA_OUT_PATH + 'simple_xgb.csv', index=False)