In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV

In [63]:
def notEmpty(s):
    return s != ''

In [64]:
# 数据加载
boston = pd.read_csv(filepath_or_buffer='datas/boston_housing.data',header=None)
data = np.empty((len(boston),14))
# enumerate：将可遍历的对象组成一个返回索引序列和对应值
# boston.values 以每行得之组成的二维数组
for index,values in enumerate(boston.values):
    data[index] = list(filter(notEmpty,values[0].split(' ')))   

In [65]:
X,Y = np.split(data,(13,),axis=1)
Y = Y.reshape(-1)
print ("样本数据量:%d, 特征个数：%d" % X.shape)
print ("target样本数据量:%d" % Y.shape[0])

样本数据量:506, 特征个数：13
target样本数据量:506


In [66]:
# 训练集和测试集划分
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2,random_state = 214)
print ("训练数据集样本数目：%d, 测试数据集样本数目：%d" % (X_train.shape[0], Y_test.shape[0]))

训练数据集样本数目：404, 测试数据集样本数目：102


In [67]:
# 标准化 对特征属性进行标准化操作
# x - min /(max - min) 缩放到0-1之间
mm = MinMaxScaler()
X_train = mm.fit_transform(X_train,Y_train)
X_test = mm.transform(X_test)

print ("原始数据各个特征属性的调整最小值:",mm.min_)
print ("原始数据各个特征属性的缩放数据值:",mm.scale_)

原始数据各个特征属性的调整最小值: [ -8.59539075e-05   0.00000000e+00  -1.68621701e-02   0.00000000e+00
  -7.92181070e-01  -6.82314620e-01  -2.98661174e-02  -1.02719857e-01
  -4.34782609e-02  -3.56870229e-01  -1.34042553e+00  -8.06898986e-04
  -4.77373068e-02]
原始数据各个特征属性的缩放数据值: [  1.36003018e-02   1.00000000e-02   3.66568915e-02   1.00000000e+00
   2.05761317e+00   1.91607588e-01   1.02986612e-02   9.09347180e-02
   4.34782609e-02   1.90839695e-03   1.06382979e-01   2.52155933e-03
   2.75938190e-02]


In [68]:
# 模型对象创建(回归)
model = DecisionTreeRegressor(criterion='mae',max_depth=7)
# 模型训练
model.fit(X_train,Y_train)

DecisionTreeRegressor(criterion='mae', max_depth=7, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [69]:
# 模型效果评估
print('模型效果Score：{}'.format(model.score(X_test,Y_test)))

模型效果Score：0.7713772019125358


In [79]:
# 线性回归
linea_model = LinearRegression()
linea_model.fit(X_train,Y_train)
print('线性回归模型Score：{}'.format(linea_model.score(X_test,Y_test)))
print('线性回归参数：')
print(linea_model.coef_)

# lass回归
lass_model = LassoCV(alphas=np.logspace(-3,1,20))
lass_model.fit(X_train,Y_train)
print('lass回归模型Score：{}'.format(lass_model.score(X_test,Y_test)))
print('lass回归参数：')
print(lass_model.coef_)

# ridge回归
ridge_model = RidgeCV(alphas=np.logspace(-3,1,20))
ridge_model.fit(X_train,Y_train)
print('rigde回归模型Score：{}'.format(ridge_model.score(X_test,Y_test)))
print('rigde回归参数：')
print(ridge_model.coef_)

线性回归模型Score：0.6695223194488773
线性回归参数：
[ -5.86991024   3.69428421   0.03334099   2.80386891  -7.10340807
  21.76235208  -1.32508942 -14.81083325   5.1801444   -5.41420265
  -9.15446403   3.3243909  -15.9303055 ]
lass回归模型Score：0.6684691324119698
lass回归参数：
[ -5.67447233   3.61494365  -0.           2.80548345  -7.00469852
  21.76024083  -1.30650309 -14.61709082   5.026602    -5.27658138
  -9.14305336   3.30979484 -15.95343155]
rigde回归模型Score：0.6584215078884722
rigde回归参数：
[ -4.85699215   3.1578429   -0.45926748   2.94946373  -6.02470574
  20.42827283  -1.24783757 -12.89674199   4.29977518  -4.64263667
  -9.13024411   3.24798737 -15.91383085]


In [83]:
# 模型的参数组合
pipes = [
    Pipeline([
        ('mms', MinMaxScaler()), ## 归一化操作
        ('pca', PCA()), ## 降纬
        ('decision', DecisionTreeRegressor(criterion='mse'))
    ]),
    
       Pipeline([
        ('mms', MinMaxScaler()), ## 归一化操作
        ('decision', DecisionTreeRegressor(criterion='mse'))
    ]),
       Pipeline([
        ('decision', DecisionTreeRegressor(criterion='mse'))
    ]),
]
parameters = [
    {
    "pca__n_components": [0.25,0.5,0.75,1],
    "decision__max_depth":  np.linspace(1,20,20).astype(np.int8)
    },
    {
    "decision__max_depth":  np.linspace(1,20,20).astype(np.int8)
    },
    {
    "decision__max_depth":  np.linspace(1,20,20).astype(np.int8)
    }
]
for p,para in zip(pipes,parameters):    
    algo = GridSearchCV(p, param_grid=para)
    algo.fi

NameError: name 'PCA' is not defined