In [14]:
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [15]:
def notEmpty(s):
    """Return True when `s` differs from the empty string.

    Intended as a `filter` predicate for dropping the empty tokens that
    appear when a string is split on single spaces.
    """
    empty_token = ''
    return s != empty_token

In [16]:
# Global matplotlib settings: select the SimHei sans-serif font (presumably so
# that Chinese text in figures renders) and disable the Unicode minus glyph,
# which such fonts may not provide.
mpl.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})

##### 1、加载数据

In [17]:
# Names of the 13 feature columns; each record carries one extra trailing
# value, which is used as the regression target.
names = ['CRIM','ZN', 'INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
path = "datas/boston_housing.data"

# The file has no header row. Every record is read as one raw string in
# column 0 and tokenised below (presumably the file is whitespace-delimited,
# so read_csv's default comma separator never splits it).
fd = pd.read_csv(path, header=None)

# str.split() without a separator splits on any run of whitespace (spaces or
# tabs) and never yields empty tokens, so no helper predicate is needed to
# filter them out.
data = np.array([record.split() for record in fd[0]], dtype=float)

# Fail with a clear message if a record does not have features + target.
n_features = len(names)
if data.ndim != 2 or data.shape[1] != n_features + 1:
    raise ValueError(
        "expected %d values per record in %s, got array of shape %s"
        % (n_features + 1, path, data.shape)
    )

# Leading columns are the features, the last column is the target vector.
x, y = np.split(data, (n_features,), axis=1)
y = y.ravel()

print("样本数据量:%d, 特征个数：%d" % x.shape)
print("target样本数据量:%d" % y.shape[0])

样本数据量:506, 特征个数：13
target样本数据量:506


##### 2、数据分割

In [18]:
# Split into 80% training / 20% test data with a fixed seed so the partition
# is reproducible.
split_parts = train_test_split(x, y, train_size=0.8, random_state=14)

# Keep the split under the "...1" names and expose the same arrays under the
# working names used by the modelling cells.
x_train1, x_test1, y_train1, y_test1 = split_parts
x_train, x_test, y_train, y_test = split_parts

print("训练数据集样本数目：%d, 测试数据集样本数目：%d" % (len(x_train), len(x_test)))

训练数据集样本数目：404, 测试数据集样本数目：102




##### 3、构建线性回归 + 训练 + 预测 + 评估

In [19]:
# Baseline: plain linear regression fitted on the training split
# (fit() returns the estimator itself, so the call can be chained).
lr = LinearRegression().fit(x_train, y_train)

# Predictions and score on the held-out test split.
lr_y_test_hat = lr.predict(x_test)
lr_score = lr.score(x_test, y_test)
print("lr:", lr_score)

lr: 0.6177265992293737


## 使用集成学习方法构建多迭代的线性回归模型

##### 3.1、构建Bagging的线性模型 + 训练 + 预测 + 评估
—— `BaggingRegressor(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0)`

- `base_estimator`：在数据集的随机子集上进行拟合的基础模型。默认为 None，此时使用决策树回归器（CART 决策树）

- `max_samples`：从 X 中抽取、用于训练每个基础模型的样本数量（整数表示个数，浮点数表示占比）

- `max_features`：从 X 中抽取、用于训练每个基础模型的特征数量（整数表示个数，浮点数表示占比）

- `bootstrap`：抽取样本时是否有放回

In [20]:
# Bagging ensemble whose base estimator is the LinearRegression model `lr`
# (a fresh LinearRegression() could be passed in its place).
bagging_params = dict(
    n_estimators=100,
    max_samples=0.7,
    max_features=0.8,
    random_state=28,
)
bg = BaggingRegressor(lr, **bagging_params)
bg.fit(x_train, y_train)

# Predictions and score on the held-out test split.
bg_y_test_hat = bg.predict(x_test)
bg_score = bg.score(x_test, y_test)
print("Bagging:", bg_score)

# The underlying learner is still linear regression; BaggingRegressor fits
# n_estimators copies of it. In the recorded run the test score is slightly
# higher than that of the single linear model.
# n_estimators, max_samples and max_features are usually tuned with
# cross-validation.

Bagging: 0.6274480726092534


##### 3.2、构建AdaBoost的线性模型 + 训练 + 预测 + 评估

In [23]:
# AdaBoost ensemble built on the LinearRegression model `lr`
# (AdaBoostRegressor is imported in the notebook's top import cell).
abr = AdaBoostRegressor(lr, n_estimators=50, learning_rate=0.0001, random_state=28)
abr.fit(x_train, y_train)

# Predictions and score on the held-out test split.
abr_y_test_hat = abr.predict(x_test)
abr_score = abr.score(x_test, y_test)

# Label the score with the model actually evaluated (it previously printed
# "Bagging:", copied from the bagging cell).
print("AdaBoost:", abr_score)

Bagging: 0.6148348568514753


##### 4、构建GBDT模型（以回归树为基学习器，并非线性模型） + 训练 + 预测 + 评估

In [22]:
# Gradient boosting regressor (GradientBoostingRegressor is imported in the
# notebook's top import cell). Unlike the bagging and AdaBoost cells, no base
# model is passed in: this estimator does not take one.
gbdt = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, random_state=28)
gbdt.fit(x_train, y_train)

# Predictions and score on the held-out test split.
gbdt_y_test_hat = gbdt.predict(x_test)
gbdt_score = gbdt.score(x_test, y_test)

# Label the score with the model actually evaluated (it previously printed
# "Bagging:", copied from the bagging cell).
print("GBDT:", gbdt_score)

Bagging: 0.8668154381095832
