## 深入理解xgboost十一

### 交叉验证

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [2]:
# Load the breast-cancer dataset from scikit-learn and pull out the
# feature matrix (X) and the target vector (y).
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

In [3]:
# Hold out 20% of the samples for testing; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Wrap each partition, together with its labels, in a DMatrix so it can
# be passed to xgb.cv / xgb.train below.
xgb_train = xgb.DMatrix(data=X_train, label=y_train)
xgb_test = xgb.DMatrix(data=X_test, label=y_test)

In [4]:
# Booster settings shared by the cross-validation and training cells.
params = dict(
    objective="binary:logistic",
    booster="gbtree",
    eta=0.1,
    max_depth=6,
)

# Number of boosting rounds used for both xgb.cv and xgb.train.
num_round = 80

# Datasets to evaluate after each round, paired with the name under
# which their metric is reported.
watch_list = [
    (xgb_train, "training"),
    (xgb_test, "testing"),
]

In [5]:
# 5-fold cross-validation on the training DMatrix, scored with AUC.
# `seed` fixes the fold assignment so the table is reproducible.
# No callbacks are needed here: the call only collects the per-round
# metric table that xgb.cv returns.
res = xgb.cv(
    params=params,
    dtrain=xgb_train,
    num_boost_round=num_round,
    nfold=5,
    metrics=["auc"],
    seed=0,
)

In [6]:
# Show the cross-validation table: one row per boosting round, with the
# mean and standard deviation of AUC on the train and test folds.
print(res)

    train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0         0.998011       0.001436       0.965799      0.022197
1         0.999015       0.000287       0.979118      0.019201
2         0.999201       0.000389       0.977854      0.018810
3         0.999246       0.000332       0.982150      0.016045
4         0.999525       0.000405       0.982782      0.014338
..             ...            ...            ...           ...
75        1.000000       0.000000       0.991569      0.006516
76        1.000000       0.000000       0.991666      0.006586
77        1.000000       0.000000       0.991445      0.006534
78        1.000000       0.000000       0.991445      0.006534
79        1.000000       0.000000       0.991668      0.006576

[80 rows x 4 columns]


In [7]:
# Train a booster on the training DMatrix for num_round rounds; the
# metric for every dataset in watch_list is printed after each round.
model = xgb.train(
    params=params,
    dtrain=xgb_train,
    num_boost_round=num_round,
    evals=watch_list,
)

[0]	training-logloss:0.60594	testing-logloss:0.61639
[1]	training-logloss:0.53422	testing-logloss:0.55362
[2]	training-logloss:0.47394	testing-logloss:0.50049
[3]	training-logloss:0.42265	testing-logloss:0.45501
[4]	training-logloss:0.37862	testing-logloss:0.41682
[5]	training-logloss:0.33972	testing-logloss:0.38017
[6]	training-logloss:0.30625	testing-logloss:0.35091
[7]	training-logloss:0.27646	testing-logloss:0.32244
[8]	training-logloss:0.25069	testing-logloss:0.30262
[9]	training-logloss:0.22759	testing-logloss:0.28135
[10]	training-logloss:0.20748	testing-logloss:0.26679
[11]	training-logloss:0.18936	testing-logloss:0.25174
[12]	training-logloss:0.17325	testing-logloss:0.23866
[13]	training-logloss:0.15915	testing-logloss:0.22750
[14]	training-logloss:0.14593	testing-logloss:0.21555
[15]	training-logloss:0.13443	testing-logloss:0.20662
[16]	training-logloss:0.12374	testing-logloss:0.19843
[17]	training-logloss:0.11466	testing-logloss:0.19028
[18]	training-logloss:0.10652	testing-

In [8]:
# Keep the evaluation results: xgb.train fills this dict in place with
# the per-round metric values for each dataset in watch_list.
evals_result = {}
model = xgb.train(
    params=params,
    dtrain=xgb_train,
    num_boost_round=num_round,
    evals=watch_list,
    evals_result=evals_result,
)

[0]	training-logloss:0.60594	testing-logloss:0.61639
[1]	training-logloss:0.53422	testing-logloss:0.55362
[2]	training-logloss:0.47394	testing-logloss:0.50049
[3]	training-logloss:0.42265	testing-logloss:0.45501
[4]	training-logloss:0.37862	testing-logloss:0.41682
[5]	training-logloss:0.33972	testing-logloss:0.38017
[6]	training-logloss:0.30625	testing-logloss:0.35091
[7]	training-logloss:0.27646	testing-logloss:0.32244
[8]	training-logloss:0.25069	testing-logloss:0.30262
[9]	training-logloss:0.22759	testing-logloss:0.28135
[10]	training-logloss:0.20748	testing-logloss:0.26679
[11]	training-logloss:0.18936	testing-logloss:0.25174
[12]	training-logloss:0.17325	testing-logloss:0.23866
[13]	training-logloss:0.15915	testing-logloss:0.22750
[14]	training-logloss:0.14593	testing-logloss:0.21555
[15]	training-logloss:0.13443	testing-logloss:0.20662
[16]	training-logloss:0.12374	testing-logloss:0.19843
[17]	training-logloss:0.11466	testing-logloss:0.19028
[18]	training-logloss:0.10652	testing-

In [9]:
# Inspect the recorded metrics: keyed first by watch-list name
# ("training" / "testing"), then by metric name, each holding one value
# per boosting round.
print(evals_result)

{'training': OrderedDict([('logloss', [0.605935958322588, 0.5342157895748432, 0.4739384930212419, 0.42265164937291827, 0.37861661865161017, 0.3397195373262678, 0.3062464347252479, 0.2764594297487657, 0.2506903997161886, 0.2275873405265284, 0.20748347977360526, 0.18936423043628314, 0.17325016004698618, 0.15915230502794078, 0.14592915919128355, 0.13443403604266407, 0.12374464859674265, 0.11466306484007574, 0.10652392390337619, 0.09836366433691192, 0.09106403808672349, 0.08461510216469294, 0.07871330427926976, 0.07338011152632944, 0.06828002264047717, 0.06414230740987338, 0.06026462849516135, 0.05634501818772201, 0.05283418204840068, 0.04972030706703663, 0.04681452486131873, 0.04430488567021522, 0.0419534796910299, 0.03979800549837259, 0.037914514242784, 0.03606641457549163, 0.03436067118872325, 0.03280692179943179, 0.03147625419833176, 0.03016150916752579, 0.02893287350147308, 0.02783186084457806, 0.02678891582885286, 0.02597774205381399, 0.02515399700214902, 0.02431924116316733, 0.02340

In [10]:
# Per-round logloss recorded for the "training" entry of the watch list.
print(evals_result["training"]["logloss"])

[0.605935958322588, 0.5342157895748432, 0.4739384930212419, 0.42265164937291827, 0.37861661865161017, 0.3397195373262678, 0.3062464347252479, 0.2764594297487657, 0.2506903997161886, 0.2275873405265284, 0.20748347977360526, 0.18936423043628314, 0.17325016004698618, 0.15915230502794078, 0.14592915919128355, 0.13443403604266407, 0.12374464859674265, 0.11466306484007574, 0.10652392390337619, 0.09836366433691192, 0.09106403808672349, 0.08461510216469294, 0.07871330427926976, 0.07338011152632944, 0.06828002264047717, 0.06414230740987338, 0.06026462849516135, 0.05634501818772201, 0.05283418204840068, 0.04972030706703663, 0.04681452486131873, 0.04430488567021522, 0.0419534796910299, 0.03979800549837259, 0.037914514242784, 0.03606641457549163, 0.03436067118872325, 0.03280692179943179, 0.03147625419833176, 0.03016150916752579, 0.02893287350147308, 0.02783186084457806, 0.02678891582885286, 0.02597774205381399, 0.02515399700214902, 0.02431924116316733, 0.02340259959327651, 0.02265646231391437, 0.0