In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

plt.style.use({'figure.figsize':(25,20)})
plt.rcParams['font.sans-serif']=['SimHei'] #用来显示中文
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号

In [2]:
df=pd.read_csv('housing_all.csv')
X=df.drop(['MEDV'],axis=1)
y=df['MEDV']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [5]:
#从机器学习与数据挖掘工具库sklearn中导入随机森林模型
from sklearn.ensemble import RandomForestRegressor

#导入网格搜索交叉验证，网格搜索可以让模型参数按我们给定的列表遍历，找到效果最好的模型
#交叉验证可以充分评估回归模型的准确性
from sklearn.model_selection import GridSearchCV

#构造参数字典，我们让这三个参数按列表中的顺序排列组合遍历一遍
param_grid={
    'n_estimators':[5,10,20,50,100,200],#决策树的个数
    'max_depth':[3,5,7],#最大树深，树太深会造成过拟合
    'max_features':[0.6,0.7,0.8,1]#决策划分时考虑的最大特征数
}

#实例化随机森林回归器
rf=RandomForestRegressor()

#以随机森林回归器为基础构造网格搜索回归器
grid=GridSearchCV(rf,param_grid=param_grid,cv=3)

#在训练集上训练
grid.fit(X_train,y_train)


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7],
       

### 选取最优参数对应模型

In [6]:
#查看效果最好的参数
grid.best_params_

{'max_depth': 7, 'max_features': 0.6, 'n_estimators': 20}

In [8]:
#指定模型为效果最好参数的模型
rf_reg=grid.best_estimator_

In [9]:
rf_reg

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
                      max_features=0.6, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

### 可视化其中一棵树

In [10]:
from sklearn import tree

In [12]:
import pydotplus

In [13]:
from IPython.display import Image, display

In [14]:
estimator=rf_reg.estimators_[3]
dot_data=tree.export_graphviz(estimator,
                             out_file=None,
                             filled=True,
                             rounded=True
                             )
graph=pydotplus.graph_from_dot_data(dot_data)
display(Image(graph.create_png()))

InvocationException: GraphViz's executables not found