# 1.导入库和模块

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.externals import joblib

# 2.导入数据

In [2]:

data = pd.read_csv('D:\my_dateset\winequality-white.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


# 3.把数据分为训练集和测试集

In [3]:
y= data.quality
x=data.drop( 'quality',axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0,stratify=y)

1.train_test__spilt是用来随机划分测试集训练集的函数需要从sklearn中调用

2.test_size=* 表示训练集与测试集的比例，一般*为0-1，如果为整数则表示测试集的数量

3.random_state=* 表示随机数种子，用来控制随机状态

4.stratify=y 就是按y的比例分配训练集与测试集

# 4.声明数据预处理步骤

In [4]:
pipeline = make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators= 100))

1.StandardScaler（）函数先通过计算训练集中特征的均值、标准差，对每个特征进行独立居中和缩放。然后，将平均值和标准偏差存储起来，在以后的测试集上有相同比例来缩放。

2.n_estimators越大越好，但占用的内存与训练和预测的时间也会相应增长，且边际效益是递减的，所以要在可承受的内存/时间内选取尽可能大的n_estimators。而在sklearn中，n_estimators默认为10。

# 5.声明超参数

In [5]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],'randomforestregressor__max_depth': [ None,10 , 3, 1]}

1.选取的特征子集中特征的个数通过max_features参数来控制，max_features越小，随机森林中的树就越不相同，但过小（取1时）会导致在划分时无法选择对哪个特征进行测试。而在sklearn中，max_features有以下几种选取方法："auto", "sqrt", "log2", None。auto与sqrt都是取特征总数的开方，log2取特征总数的对数，None则是令max_features直接等于特征总数，而max_features的默认值是"auto"。

2.hyperparameters 超参数

# 6.模型调优

In [6]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 10, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

1.GridSearchCV，它存在的意义就是自动调参，只要把参数输进去，就能给出最优化的结果和参数
#注：适合于小数据集，一旦数据的量级上去了，很难得出结果

2.交叉验证（Cross-validation）：通过使用相同方法多次训练和评估模型来可靠地估计构建模型的方法的性能的过程

3.交叉验证参数cv，默认None，使用三折交叉验证。指定fold数量，默认为3，也可以是yield产生训练/测试数据的生成器,cv=10即十折交叉验证

4.clf.fit(x_train,y_train) 用训练集数据训练模型

# 7.评估模型并预测

In [7]:
y_pred = clf.predict(x_test)
print(r2_score(y_test,y_pred))
print(mean_squared_error(y_test,y_pred))

0.5622273238539899
0.3436542857142857


1.R2 决定系数（拟合优度）R2越趋于1拟合度越好 越趋于0拟合度越差
2.均方差（mean-squared-error）

In [8]:
y_pred_train = clf.predict(x_train) #在测试集合上预测
y_pred_test = clf.predict(x_test) #在测试集合上预测

print("训练集合上R^2 = {:.3f}".format(r2_score(y_train, y_pred_train)))
print("测试集合上R^2 = {:.3f} ".format(r2_score(y_test,y_pred_test)))

训练集合上R^2 = 0.934
测试集合上R^2 = 0.562 
