## 深入理解xgboost七

### 小麦钟子多分类问题

In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb

In [2]:
data = pd.read_csv("dataset/seeds/seeds_dataset.txt", header=None, sep="\s+", converters={7:lambda x: int(x) - 1})

In [3]:
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,0
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,0
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,0
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,0
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,0


In [4]:
data.rename(columns={7:"label"}, inplace=True)
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,label
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,0
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,0
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,0
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,0
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,0


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)

data.shape, data_train.shape, data_test.shape 

((210, 8), (168, 8), (42, 8))

In [7]:
xgb_train = xgb.DMatrix(data_train.iloc[:, :6], data_train["label"])
xgb_test = xgb.DMatrix(data_test.iloc[:, :6], data_test["label"])

In [8]:
params = {"objective":"multi:softmax",
          "booster":"gbtree",
          "num_class":3,
          "eta":0.05,
          "max_depth":5}

In [9]:
watch_list = [(xgb_train, "training"), (xgb_test, "testing")]
num_round = 50

In [10]:
model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=num_round, evals=watch_list)

[0]	training-mlogloss:1.03486	testing-mlogloss:1.04236
[1]	training-mlogloss:0.97661	testing-mlogloss:0.99264
[2]	training-mlogloss:0.92288	testing-mlogloss:0.94710
[3]	training-mlogloss:0.87294	testing-mlogloss:0.90375
[4]	training-mlogloss:0.82683	testing-mlogloss:0.86525
[5]	training-mlogloss:0.78345	testing-mlogloss:0.82644
[6]	training-mlogloss:0.74350	testing-mlogloss:0.79361
[7]	training-mlogloss:0.70575	testing-mlogloss:0.75971
[8]	training-mlogloss:0.67087	testing-mlogloss:0.73156
[9]	training-mlogloss:0.63813	testing-mlogloss:0.70459
[10]	training-mlogloss:0.60718	testing-mlogloss:0.67808
[11]	training-mlogloss:0.57828	testing-mlogloss:0.65310
[12]	training-mlogloss:0.55132	testing-mlogloss:0.63193
[13]	training-mlogloss:0.52586	testing-mlogloss:0.61171
[14]	training-mlogloss:0.50163	testing-mlogloss:0.59077
[15]	training-mlogloss:0.47913	testing-mlogloss:0.57406
[16]	training-mlogloss:0.45779	testing-mlogloss:0.55739
[17]	training-mlogloss:0.43765	testing-mlogloss:0.54084
[1

In [11]:
# 预测
y_pred = model.predict(xgb_test)

In [12]:
# 评测
error_rate = np.sum(y_pred != data_test.label) / data_test.shape[0]

In [13]:
error_rate

0.09523809523809523

### 多分类为概率

In [14]:
# 设置参数
params["objective"] = "multi:softprob"

In [15]:
model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=num_round, evals=watch_list)

[0]	training-mlogloss:1.03486	testing-mlogloss:1.04236
[1]	training-mlogloss:0.97661	testing-mlogloss:0.99264
[2]	training-mlogloss:0.92288	testing-mlogloss:0.94710
[3]	training-mlogloss:0.87294	testing-mlogloss:0.90375
[4]	training-mlogloss:0.82683	testing-mlogloss:0.86525
[5]	training-mlogloss:0.78345	testing-mlogloss:0.82644
[6]	training-mlogloss:0.74350	testing-mlogloss:0.79361
[7]	training-mlogloss:0.70575	testing-mlogloss:0.75971
[8]	training-mlogloss:0.67087	testing-mlogloss:0.73156
[9]	training-mlogloss:0.63813	testing-mlogloss:0.70459
[10]	training-mlogloss:0.60718	testing-mlogloss:0.67808
[11]	training-mlogloss:0.57828	testing-mlogloss:0.65310
[12]	training-mlogloss:0.55132	testing-mlogloss:0.63193
[13]	training-mlogloss:0.52586	testing-mlogloss:0.61171
[14]	training-mlogloss:0.50163	testing-mlogloss:0.59077
[15]	training-mlogloss:0.47913	testing-mlogloss:0.57406
[16]	training-mlogloss:0.45779	testing-mlogloss:0.55739
[17]	training-mlogloss:0.43765	testing-mlogloss:0.54084
[1

In [16]:
y_pred_prob = model.predict(xgb_test)
y_pred_prob

array([[0.79855704, 0.11397931, 0.08746363],
       [0.04125691, 0.03817596, 0.9205671 ],
       [0.07750168, 0.8617733 , 0.06072503],
       [0.04363484, 0.03824778, 0.91811734],
       [0.84569675, 0.10991898, 0.04438426],
       [0.04206957, 0.9209339 , 0.03699661],
       [0.9258295 , 0.03706415, 0.03710629],
       [0.04363484, 0.03824778, 0.91811734],
       [0.05066575, 0.03871052, 0.91062367],
       [0.04147132, 0.03816742, 0.9203613 ],
       [0.0407352 , 0.92221665, 0.03704814],
       [0.8705673 , 0.08374319, 0.04568953],
       [0.86869556, 0.06561491, 0.06568953],
       [0.9039212 , 0.0543628 , 0.04171607],
       [0.13954651, 0.07218403, 0.78826946],
       [0.04125691, 0.03817596, 0.9205671 ],
       [0.04125691, 0.03817596, 0.9205671 ],
       [0.90981895, 0.04506492, 0.04511617],
       [0.92227495, 0.03884043, 0.0388846 ],
       [0.9258295 , 0.03706415, 0.03710629],
       [0.72918797, 0.22802965, 0.04278241],
       [0.18569647, 0.7718561 , 0.04244742],
       [0.

In [17]:
pred_label = np.argmax(y_pred_prob, axis=1)
pred_label

array([0, 2, 1, 2, 0, 1, 0, 2, 2, 2, 1, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 1,
       1, 2, 0, 2, 0, 2, 0, 1, 1, 1, 1, 0, 0, 1, 2, 1, 0, 2, 2, 2])

In [18]:
# 计算错误率
error_rate = np.sum(pred_label != data_test.label) / data_test.shape[0]
error_rate

0.09523809523809523