In [1]:
import numpy as np 
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# 读入数据集，并划分成训练集、测试集

In [10]:
iris=load_iris()
df=pd.DataFrame(iris.data,columns=iris.feature_names)
df['species']=pd.Categorical.from_codes(iris.target,iris.target_names)
df['is_train']=np.random.uniform(0,1,len(df))<=.75 #在0-1的均匀分布中采样，若<=0.75则记为true
train=df[df['is_train']==True]
test=df[df['is_train']==False]

print(len(train),len(test))
train.head()

117 33


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
5,5.4,3.9,1.7,0.4,setosa,True
6,4.6,3.4,1.4,0.3,setosa,True


# 数据预处理

In [13]:
features=df.columns[:4]
y=pd.factorize(train['species'])[0] #因为类别是文字，要将类别进行编码
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

# 训练随机森林分类器

In [15]:
clf=RandomForestClassifier(n_jobs=2)
#print(len(train[features]),len(y))
clf.fit(train[features],y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# 预测与评估

In [16]:
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2])

In [17]:
clf.predict_proba(test[features])

array([[ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0.1,  0.9],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  1. ,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0.1,  0.9],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0.9,  0.1],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0.5,  0.5],
       [ 0. ,  0. ,  1. ]])

In [20]:
preds=iris.target_names[clf.predict(test[features])]#将预测出的数字转换成类别
pd.crosstab(test['species'],preds,rownames=['实际类别'],colnames=['预测类别'])

预测类别,setosa,versicolor,virginica
实际类别,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,7,0,0
versicolor,0,9,1
virginica,0,2,14


## 哪个特征比较重要?

In [24]:
list(zip(train[features],clf.feature_importances_))

[('sepal length (cm)', 0.19701643438454083),
 ('sepal width (cm)', 0.017667576077822919),
 ('petal length (cm)', 0.38144546953245545),
 ('petal width (cm)', 0.40387052000518076)]