# XGBoost - Python Native

In [2]:
import xgboost as xgb

In [3]:
print(xgb.__version__)

2.0.0


In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

cancer = load_breast_cancer()

data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df['target'] = cancer.target
data_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
  data_df.drop('target', axis=1),
  data_df['target'],
  random_state=42
)

X_train, X_valid, y_train, y_valid = train_test_split(
  X_train,
  y_train,
  random_state=42
)

## DMatrix 변화
- 넘파이 배열, 판다스 데이터프레임에서도 변환이 가능

In [7]:
dtr = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_valid, label=y_valid)
dtest = xgb.DMatrix(data=X_test, label=y_test)

## 하이퍼 파라미터 설정

In [8]:
params = {
  'max_depth': 3,
  'eta': 0.05,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss'
}

num_rounds = 400

### 학습 데이터 세트는 'train', 평가(검증) 데이터 세트는 'eval'

In [9]:
eval_list = [
  (dtr, 'train'),
  (dval, 'eval')
]

xgb_model = xgb.train(
  params=params,
  dtrain=dtr,
  num_boost_round=10000,
  early_stopping_rounds=50, # 성능 개선이 50라운드 이내에 이루어지지 않으면 학습을 종료
  evals=eval_list
)

[0]	train-logloss:0.62706	eval-logloss:0.60229
[1]	train-logloss:0.59030	eval-logloss:0.57112
[2]	train-logloss:0.55534	eval-logloss:0.54548
[3]	train-logloss:0.52351	eval-logloss:0.52160
[4]	train-logloss:0.49440	eval-logloss:0.50040
[5]	train-logloss:0.46768	eval-logloss:0.48104
[6]	train-logloss:0.44391	eval-logloss:0.45909
[7]	train-logloss:0.42107	eval-logloss:0.44270
[8]	train-logloss:0.39994	eval-logloss:0.42712
[9]	train-logloss:0.38036	eval-logloss:0.41490
[10]	train-logloss:0.36265	eval-logloss:0.40124
[11]	train-logloss:0.34564	eval-logloss:0.39045
[12]	train-logloss:0.32978	eval-logloss:0.38058
[13]	train-logloss:0.31499	eval-logloss:0.37156
[14]	train-logloss:0.30116	eval-logloss:0.36372
[15]	train-logloss:0.28823	eval-logloss:0.35494
[16]	train-logloss:0.27603	eval-logloss:0.34791
[17]	train-logloss:0.26459	eval-logloss:0.34123
[18]	train-logloss:0.25401	eval-logloss:0.33128
[19]	train-logloss:0.24382	eval-logloss:0.32515
[20]	train-logloss:0.23430	eval-logloss:0.31550
[2

In [10]:
import numpy as np

pred_props = xgb_model.predict(dtest)
print(np.round(pred_props[:10], 3))

[0.932 0.001 0.001 0.999 1.    0.001 0.002 0.682 0.476 0.997]


In [11]:
preds = [1 if x>0.5 else 0 for x in pred_props]
preds[:10]

[1, 0, 0, 1, 1, 0, 0, 1, 0, 1]

## XGBoost - Scikit Learn Wrapper

In [12]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(
  n_estimators=400,
  learning_rate=0.05,
  max_depth=3,
  eval_metrics='logloss'
)

In [13]:
xgb_clf.fit(
  X_train, y_train,
  verbose=True
)

In [14]:
preds = xgb_clf.predict(X_test)
preds[:10]

array([1, 0, 0, 1, 1, 0, 0, 1, 0, 1])

In [15]:
pred_proba = xgb_clf.predict_proba(X_test)
pred_proba[:10]

array([[7.3437452e-02, 9.2656255e-01],
       [9.9860471e-01, 1.3952850e-03],
       [9.9924982e-01, 7.5019279e-04],
       [1.5795231e-03, 9.9842048e-01],
       [2.6434660e-04, 9.9973565e-01],
       [9.9915171e-01, 8.4831589e-04],
       [9.9709845e-01, 2.9015488e-03],
       [3.3829182e-01, 6.6170818e-01],
       [5.5896002e-01, 4.4103998e-01],
       [3.5931468e-03, 9.9640685e-01]], dtype=float32)

In [16]:
# Early Stopping
xgb_clf = XGBClassifier(
  n_estimators=400,
  learning_rate=0.05,
  max_depth=3
)

In [17]:
# 검증할 세트를 따로 지정

eval_sets = [
  (X_train, y_train),
  (X_valid, y_valid)
]

xgb_clf.fit(
  X_train, y_train,
  early_stopping_rounds=50,
  eval_set = eval_sets,
  verbose=True
)

[0]	validation_0-logloss:0.62706	validation_1-logloss:0.60229
[1]	validation_0-logloss:0.59030	validation_1-logloss:0.57112
[2]	validation_0-logloss:0.55534	validation_1-logloss:0.54548
[3]	validation_0-logloss:0.52351	validation_1-logloss:0.52160
[4]	validation_0-logloss:0.49440	validation_1-logloss:0.50040
[5]	validation_0-logloss:0.46768	validation_1-logloss:0.48104
[6]	validation_0-logloss:0.44391	validation_1-logloss:0.45909
[7]	validation_0-logloss:0.42107	validation_1-logloss:0.44270
[8]	validation_0-logloss:0.39994	validation_1-logloss:0.42712
[9]	validation_0-logloss:0.38036	validation_1-logloss:0.41490
[10]	validation_0-logloss:0.36265	validation_1-logloss:0.40124
[11]	validation_0-logloss:0.34564	validation_1-logloss:0.39045
[12]	validation_0-logloss:0.32978	validation_1-logloss:0.38058
[13]	validation_0-logloss:0.31499	validation_1-logloss:0.37156
[14]	validation_0-logloss:0.30116	validation_1-logloss:0.36372
[15]	validation_0-logloss:0.28823	validation_1-logloss:0.35494
[1