# XGBoost - Python Native

In [None]:
!pip install xgboost==1.5.0

Collecting xgboost==1.5.0
  Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.5/173.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.7.6
    Uninstalling xgboost-1.7.6:
      Successfully uninstalled xgboost-1.7.6
Successfully installed xgboost-1.5.0


In [1]:
import xgboost as xgb
print(xgb.__version__)

1.7.6


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

cancer = load_breast_cancer()

data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df['target'] = cancer.target
data_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [4]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(
    data_df.drop("target", axis=1),
    data_df['target'],
    random_state=42
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    random_state=42
)

# DMatrix 변환
- 넘파이 배열, 판다스 데이터프레임에서도 변환이 가능

In [5]:
dtr = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_valid, label=y_valid)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# 하이퍼 파라미터 설정

In [6]:
params = {
    "max_depth": 3,
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

num_rounds = 400

# 학습 데이터 세트는 'train', 평가(검증) 데이터 세트는 'eval'

In [7]:
eval_list = [
    (dtr, 'train'),
    (dval, 'eval')
]

xgb_model = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=10000,
    early_stopping_rounds=50, # 성능 개선이 50라운드 이내에 이루어지지 않으면 학습을 종료
    evals=eval_list
)

[0]	train-logloss:0.65124	eval-logloss:0.65404
[1]	train-logloss:0.61200	eval-logloss:0.62023
[2]	train-logloss:0.57754	eval-logloss:0.58633
[3]	train-logloss:0.54459	eval-logloss:0.55826
[4]	train-logloss:0.51435	eval-logloss:0.53266
[5]	train-logloss:0.48771	eval-logloss:0.50902
[6]	train-logloss:0.46188	eval-logloss:0.48738
[7]	train-logloss:0.43893	eval-logloss:0.46707
[8]	train-logloss:0.41664	eval-logloss:0.44913
[9]	train-logloss:0.39683	eval-logloss:0.43179
[10]	train-logloss:0.37746	eval-logloss:0.41644
[11]	train-logloss:0.36018	eval-logloss:0.40182
[12]	train-logloss:0.34321	eval-logloss:0.38998
[13]	train-logloss:0.32734	eval-logloss:0.37724
[14]	train-logloss:0.31250	eval-logloss:0.36721
[15]	train-logloss:0.29931	eval-logloss:0.35642
[16]	train-logloss:0.28622	eval-logloss:0.34786
[17]	train-logloss:0.27395	eval-logloss:0.34002
[18]	train-logloss:0.26244	eval-logloss:0.33208
[19]	train-logloss:0.25146	eval-logloss:0.32362
[20]	train-logloss:0.24146	eval-logloss:0.31627
[2

In [8]:
import numpy as np

pred_props = xgb_model.predict(dtest)
print(np.round(pred_props[:10], 3))

[0.935 0.001 0.    0.999 1.    0.001 0.002 0.642 0.521 0.997]


In [9]:
preds = [1 if x > 0.5 else 0 for x in pred_props ]
preds[:10]

[1, 0, 0, 1, 1, 0, 0, 1, 1, 1]

# XGBoost-Scikit Learn Wrapper

In [None]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=3,
    eval_metric='logloss'
)

In [None]:
xgb_clf.fit(
    X_train, y_train,
    verbose=True
)

In [None]:
preds= xgb_clf.predict(X_test)
preds[:10]

array([1, 0, 0, 1, 1, 0, 0, 1, 0, 1])

In [None]:
pred_proba = xgb_clf.predict_proba(X_test)
pred_proba[:10]

array([[7.0421875e-02, 9.2957813e-01],
       [9.9837631e-01, 1.6236610e-03],
       [9.9944514e-01, 5.5486610e-04],
       [1.5223622e-03, 9.9847764e-01],
       [2.8401613e-04, 9.9971598e-01],
       [9.9914682e-01, 8.5319200e-04],
       [9.9682111e-01, 3.1789127e-03],
       [4.3057954e-01, 5.6942046e-01],
       [5.5072081e-01, 4.4927916e-01],
       [3.7894249e-03, 9.9621058e-01]], dtype=float32)

In [None]:
# Early Stopping
xgb_clf = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=3
)

In [None]:
# 검증할 세트를 따로 지정
eval_sets = [
    (X_train, y_train),
    (X_valid, y_valid)
]

xgb_clf.fit(
    X_train, y_train,
    early_stopping_rounds=50,
    eval_set=eval_sets,
    verbose=True
)

[0]	validation_0-logloss:0.65124	validation_1-logloss:0.65404
[1]	validation_0-logloss:0.61200	validation_1-logloss:0.62023
[2]	validation_0-logloss:0.57754	validation_1-logloss:0.58633
[3]	validation_0-logloss:0.54460	validation_1-logloss:0.55826
[4]	validation_0-logloss:0.51435	validation_1-logloss:0.53266
[5]	validation_0-logloss:0.48771	validation_1-logloss:0.50902
[6]	validation_0-logloss:0.46188	validation_1-logloss:0.48738
[7]	validation_0-logloss:0.43893	validation_1-logloss:0.46707
[8]	validation_0-logloss:0.41664	validation_1-logloss:0.44913
[9]	validation_0-logloss:0.39683	validation_1-logloss:0.43179
[10]	validation_0-logloss:0.37746	validation_1-logloss:0.41644
[11]	validation_0-logloss:0.36018	validation_1-logloss:0.40182
[12]	validation_0-logloss:0.34321	validation_1-logloss:0.38998
[13]	validation_0-logloss:0.32734	validation_1-logloss:0.37724
[14]	validation_0-logloss:0.31250	validation_1-logloss:0.36721
[15]	validation_0-logloss:0.29930	validation_1-logloss:0.35642
[1