## 深入理解xgboost八

### 水泥强度回归问题

In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb

In [2]:
data = pd.read_excel("dataset/concrete/Concrete_Data.xls")
data.shape, data.columns

((1030, 9),
 Index(['Cement (component 1)(kg in a m^3 mixture)',
        'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
        'Fly Ash (component 3)(kg in a m^3 mixture)',
        'Water  (component 4)(kg in a m^3 mixture)',
        'Superplasticizer (component 5)(kg in a m^3 mixture)',
        'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
        'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
        'Concrete compressive strength(MPa, megapascals) '],
       dtype='object'))

In [3]:
data.head(5)

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [4]:
data.rename(columns={"Concrete compressive strength(MPa, megapascals) ": "label"}, inplace=True)
data.head(5)

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),label
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)

In [7]:
xgb_train = xgb.DMatrix(data_train.iloc[:, :7], data_train.label)
xgb_test = xgb.DMatrix(data_test.iloc[:, :7], data_test.label)

In [8]:
params = {"objective": "reg:squarederror",
          "booster": "gbtree",
          "max_depth": 5,
          "eta": 0.05}

In [9]:
num_round = 100
watch_list = [(xgb_train, "training"), (xgb_test, "testing")]

In [10]:
model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=num_round, evals=watch_list)

[0]	training-rmse:37.01497	testing-rmse:38.56051
[1]	training-rmse:35.37934	testing-rmse:36.88037
[2]	training-rmse:33.83223	testing-rmse:35.29172
[3]	training-rmse:32.36947	testing-rmse:33.79478
[4]	training-rmse:30.98706	testing-rmse:32.36994
[5]	training-rmse:29.67443	testing-rmse:31.01225
[6]	training-rmse:28.43419	testing-rmse:29.73952
[7]	training-rmse:27.26346	testing-rmse:28.52738
[8]	training-rmse:26.15695	testing-rmse:27.39201
[9]	training-rmse:25.11309	testing-rmse:26.31911
[10]	training-rmse:24.12539	testing-rmse:25.32935
[11]	training-rmse:23.19530	testing-rmse:24.40633
[12]	training-rmse:22.31845	testing-rmse:23.52685
[13]	training-rmse:21.49397	testing-rmse:22.71979
[14]	training-rmse:20.71589	testing-rmse:21.96205
[15]	training-rmse:19.98516	testing-rmse:21.23396
[16]	training-rmse:19.29807	testing-rmse:20.56927
[17]	training-rmse:18.64954	testing-rmse:19.93395
[18]	training-rmse:18.04500	testing-rmse:19.33588
[19]	training-rmse:17.47770	testing-rmse:18.79267
[20]	train

In [11]:
y_pred = model.predict(xgb_test)

In [12]:
from sklearn.metrics import mean_squared_error

In [13]:
print(mean_squared_error(y_true=data_test.label, y_pred=y_pred))

159.17557679351535


In [14]:
model.save_model("model/concrete.xgb")

In [15]:
# 重新加载模型
booster = xgb.Booster()
booster.load_model("model/concrete.xgb")

In [16]:
pred = booster.predict(xgb_test)
print(mean_squared_error(y_true=data_test.label, y_pred=pred))

159.17557679351535


In [17]:
# 导出模型
booster.dump_model("model/concrete.json")