In [31]:
'''
使用决策树对鸢尾花数据集进行分类
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score

#鸢尾花数据集
iris = load_iris()
print(iris.data.shape)
print(iris.feature_names)
print(iris.target_names)
print(iris.target)

data = pd.DataFrame(iris.data)
print(data)

print('columns:', data.columns)
print('-' * 100)
print(data.info())
print('-' * 100)
data.columns = iris.feature_names

print('-' * 100)
data['Species'] = iris.target
print(data)

#切分数据,选择花瓣的长宽两个特征
x = data.iloc[:, 2:4]
print(x)
y = data.iloc[:, -1]
print(y)
#将样本分为训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

decisionTree = DecisionTreeClassifier(max_depth=5, criterion='gini', random_state=123)
decisionTree.fit(x_train, y_train)
#导出树图
export_graphviz(decisionTree, out_file='./tree.dot', feature_names=iris.feature_names[:2],
                class_names=iris.target_names, rounded=True, filled=True)

#模型评估
y_predict = decisionTree.predict(x_test)
print('score:', accuracy_score(y_test, y_predict))

#预测
xx_ = pd.DataFrame([[1.4, 0.2]])
xx_.columns = iris.feature_names[2:4]
y_hat = decisionTree.predict_proba(xx_)
print(y_hat)



(150, 4)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
       0    1    2    3
0    5.1  3.5  1.4  0.2
1    4.9  3.0  1.4  0.2
2    4.7  3.2  1.3  0.2
3    4.6  3.1  1.5  0.2
4    5.0  3.6  1.4  0.2
..   ...  ...  ...  ...
145  6.7  3.0  5.2  2.3
146  6.3  2.5  5.0  1.9
147  6.5  3.0  5.2  2.0
148  6.2  3.4  5.4  2.3
149  5.9  3.0  5.1  1.8

[150 rows x 4 columns]
columns: RangeIndex(start=0, stop=4, step=1)
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #  

In [13]:
'''
使用鸢尾花数据训练随机森林模型
'''

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
x=iris.data[:,:2]
y=iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

'''
参数详解：
    n_estimators: 随机森林中基础决策树的数量，较大的数量能提升模型的稳定性，但是会增加计算量
    criterion: 衡量节点分裂效果的标准
        - gini 基尼系数
        - entropy 信息增益（基于熵）
        - log_loss 基于对数损失计算（适用于概率预测）
    max_depth：最大树深度，用于防止过拟合，对于大数据集，建议使用 max_depth=10~20
    min_samples_split：节点分裂的最小样本数，作用是控制内部节点必须包含的最小样本数，低于该值不会进行分裂，较大的值可以防止过拟合
    max_features:选择特征的最大数量，控制每棵树在分列时考虑的最大特征数，增加多样性
        - sqrt sqrt(n_features)
        - log2 log2(n_features)
        - None 所有特征
    bootstrap:是否对数据进行有放回采样，提高模型的泛化能力
'''
rfc = RandomForestClassifier(n_estimators=1000, criterion='gini', random_state=123,max_depth=5,bootstrap=True)
rfc.fit(x_train, y_train)


y_predict=rfc.predict(x_test)

print('score:',accuracy_score(y_test, y_predict))

#返回预测的类别
print(rfc.predict([[1.4, 0.2]]))
#返回属于每个类别的概率
print(rfc.predict_proba([[1.4, 0.2]]))

#查看特征重要性
fis=rfc.feature_importances_
print(fis)

fi_name=[(name,round(i,3)) for i,name in zip(fis,iris.feature_names[:2])]
fi_name.sort(key=lambda x:x[1], reverse=True)

print(fi_name)


score: 0.8333333333333334
[0]
[[0.61758631 0.31960749 0.0628062 ]]
[0.64672025 0.35327975]
[('sepal width (cm)', 0.353), ('sepal length (cm)', 0.647)]
