## 深入理解xgboost九

### 将DMatrix保存为二进制

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [2]:
# 加载数据集
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [4]:
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)

In [5]:
params = {
    "objective":"binary:logistic",
    "booster":"gbtree",
    "eta":0.05,
    "max_depth":5
}

num_round = 80
watch_list = [(xgb_train, "training"), (xgb_test, "testing")]

In [6]:
model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=num_round, evals=watch_list)

[0]	training-logloss:0.64845	testing-logloss:0.65376
[1]	training-logloss:0.60802	testing-logloss:0.61785
[2]	training-logloss:0.57111	testing-logloss:0.58557
[3]	training-logloss:0.53729	testing-logloss:0.55530
[4]	training-logloss:0.50619	testing-logloss:0.52757
[5]	training-logloss:0.47751	testing-logloss:0.50274
[6]	training-logloss:0.45100	testing-logloss:0.47952
[7]	training-logloss:0.42644	testing-logloss:0.45837
[8]	training-logloss:0.40363	testing-logloss:0.43853
[9]	training-logloss:0.38283	testing-logloss:0.42101
[10]	training-logloss:0.36302	testing-logloss:0.40390
[11]	training-logloss:0.34478	testing-logloss:0.38779
[12]	training-logloss:0.32709	testing-logloss:0.37103
[13]	training-logloss:0.31055	testing-logloss:0.35520
[14]	training-logloss:0.29524	testing-logloss:0.34180
[15]	training-logloss:0.28069	testing-logloss:0.32786
[16]	training-logloss:0.26721	testing-logloss:0.31616
[17]	training-logloss:0.25441	testing-logloss:0.30479
[18]	training-logloss:0.24254	testing-

In [7]:
xgb_test.save_binary("model/data_test.buffer")
xgb_test_2 = xgb.DMatrix("model/data_test.buffer")
y_pred = model.predict(xgb_test_2)
y_pred = np.round(y_pred)

In [8]:
from sklearn.metrics import classification_report

In [9]:
print(classification_report(y_true=y_test, y_pred=y_pred, labels=[1, 0], target_names=["恶性", "良性"]))

              precision    recall  f1-score   support

          恶性       0.95      0.99      0.97        72
          良性       0.97      0.90      0.94        42

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114

