In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

In [2]:
from sklearn.datasets import load_boston

In [3]:
data, target = load_boston(return_X_y=True)
data.shape, target.shape


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

((506, 13), (506,))

In [4]:
# 拆分数据集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

######  XGBRegressor

In [5]:
# 创建模型
xgbr = XGBRegressor()

- 参数调优

- XGBoost模型的参数有很多，这里选取几个进行调优

- max_depth：弱学习器决策树的最大深度，默认为 3

- n_estimators：弱学习器的个数，或者说弱学习器的最大迭代次数，默认为 100

- learning_rate：学习率，又称为每个弱学习器的权重缩减系数，取值范围为 (0, 1]，取值较小意味着要达到一定的学习效果，需要更多迭代次数和更多弱学习器，默认取 0.1，通常 n_estimators 和 learning_rate 一起决定算法的拟合效果，所以这两个参数要一起调优。

In [6]:
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth': [1, 3, 5],
             'n_estimators': [50, 100, 150],
             'learning_rate': [0.01, 0.05, 0.1, 0.2]}

gv = GridSearchCV(xgbr, parameters,n_jobs=-1)

In [7]:
gv.fit(x_train,y_train)

In [8]:
# 最佳参数
gv.best_params_

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150}

In [9]:
# 最佳得分
gv.best_score_

0.858877932086696

In [10]:
# 最佳模型
best_xg = gv.best_estimator_

In [11]:
best_xg.feature_importances_

array([0.03404512, 0.00472149, 0.00905133, 0.02933691, 0.07306802,
       0.35349125, 0.01473324, 0.05707052, 0.017681  , 0.03687775,
       0.08238003, 0.00895172, 0.27859172], dtype=float32)

In [11]:
# 预测
y_pred = best_xg.predict(x_test)
y_pred

array([12.185475, 38.421406, 21.720942, 14.137365, 23.482157, 13.566648,
       11.631475, 17.823881, 38.93545 , 33.982414, 15.743959, 24.252289,
       10.340374, 23.762968, 35.460907,  9.168151,  8.732341, 22.880802,
       19.981281, 23.867733,  9.00175 , 20.632627, 10.064144, 18.559296,
       34.15566 , 22.308971, 30.833427, 20.917812,  6.692421, 27.536451,
       18.692007, 16.58305 , 18.961786, 20.933899, 22.605364, 18.19265 ,
       27.378187, 28.91216 , 21.449835, 15.110621, 40.835117, 47.594208,
       18.228502, 20.401974, 29.549093, 15.544526, 36.81953 , 20.649395,
       17.72191 , 19.864054, 26.928663, 23.063057, 17.084686, 40.51509 ,
       44.58927 , 21.228718, 18.615341, 22.74261 , 22.361956, 20.949318,
       21.282188, 19.980446, 30.987883, 19.891958, 13.782396, 47.103813,
       31.436825, 16.277624, 21.289263, 19.974277, 31.470558, 13.883835,
       21.455063, 15.979523, 44.709717,  8.136894, 25.442686, 20.504946,
       25.396288, 16.914999, 22.412933, 19.540255, 

In [12]:
from sklearn.metrics import mean_squared_error as mse

In [13]:
mse(y_test, y_pred)

11.845874480532155

 - mse在10左右正常