In [1]:
import pickle
import xgboost as xgb

import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, fetch_california_housing

In [2]:
# Seeded RandomState that the shuffled KFold splitters in later cells share.
SEED = 31337
rng = np.random.RandomState(SEED)

Zeros and Ones from the Digits dataset: binary classification.

In [3]:
# Load only two classes of the digits dataset (the zeros and ones, per the
# notebook text above) so the task is binary classification.
N_DIGIT_CLASSES = 2
digits = load_digits(n_class=N_DIGIT_CLASSES)

In [4]:
# Features and labels of the loaded digits data.
x, y = digits["data"], digits["target"]

In [5]:
# Two-fold shuffled cross-validation: train a fresh classifier on each
# training half and print the confusion matrix for the held-out half.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(x):
    # fit() returns the fitted estimator, so construction and training chain.
    xgb_model = xgb.XGBClassifier(n_jobs=1).fit(x[train_index], y[train_index])
    actuals = y[test_index]
    predictions = xgb_model.predict(x[test_index])
    print(confusion_matrix(y_true=actuals, y_pred=predictions))

[[87  0]
 [ 1 92]]
[[91  0]
 [ 2 87]]


### 鸢尾花多分类任务

In [6]:
# Load the iris dataset and rebind x / y to its features and class labels.
iris = load_iris()
x, y = iris["data"], iris["target"]

In [7]:
# Two-fold shuffled cross-validation on the iris data: train a fresh
# classifier per fold and print the confusion matrix for the held-out half.
# The splitter is bound to `kf`, the name the loop iterates over, so this
# cell builds and uses its own splitter instead of relying on a `kf` left
# behind by an earlier cell.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(x):
    xgb_model = xgb.XGBClassifier(n_jobs=1)
    xgb_model.fit(x[train_index], y[train_index])
    predictions = xgb_model.predict(x[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))

[[19  0  0]
 [ 0 31  3]
 [ 0  1 21]]
[[31  0  0]
 [ 0 16  0]
 [ 0  1 27]]


### 加利福尼亚房价回归

In [8]:
# Load the California housing data and rebind x / y to its feature matrix
# and regression target.
housing = fetch_california_housing()
x, y = housing["data"], housing["target"]

In [9]:
# Two-fold shuffled cross-validation for regression: train a fresh regressor
# per fold and print the mean squared error on the held-out half.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(x):
    # fit() returns the fitted estimator, so construction and training chain.
    model = xgb.XGBRegressor(n_jobs=1).fit(x[train_index], y[train_index])
    actuals = y[test_index]
    predictions = model.predict(x[test_index])
    print(mean_squared_error(y_true=actuals, y_pred=predictions))

0.23842942225357922
0.24043143227480607


### 参数最优化

In [None]:
# Exhaustive search over tree depth and number of boosting rounds, then
# report the best cross-validated score and the parameters that produced it.
param_grid = {
    "max_depth": [2, 4, 6],
    "n_estimators": [50, 100, 200],
}
model = xgb.XGBRegressor(n_jobs=1)
clf = GridSearchCV(estimator=model, param_grid=param_grid, verbose=1, n_jobs=1)
clf.fit(x, y)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


### 提前停止

In [None]:
# Early stopping on the binary digits data: hold out a test split and pass it
# to fit() as the evaluation set.
x = digits["data"]
y = digits["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
# early_stopping_rounds and eval_metric are estimator settings. Passing them
# to fit() was deprecated in XGBoost 1.6 and is rejected from 2.0 on, so they
# are configured on the constructor instead.
clf = xgb.XGBClassifier(n_jobs=1, early_stopping_rounds=10, eval_metric="auc")
clf.fit(x_train, y_train, eval_set=[(x_test, y_test)])