## 基本模型

In [17]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

# Synthetic classification dataset with 1000 samples; every other
# generator option is left at the library default.
x, y = datasets.make_classification(n_samples=1000)

# Random forest with default hyper-parameters. The fit call is kept as the
# last expression so the fitted estimator's repr is shown as the cell output.
rf = RandomForestClassifier()
rf.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [35]:
import pandas as pd

# Summary statistics (count, mean, std, quartiles, min/max) for each column
# of the feature matrix x.
feature_frame = pd.DataFrame(x)
feature_frame.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.000831,0.023142,-0.048985,-0.001653,0.038837,0.015671,-0.047048,-0.053508,-0.032693,0.010008,0.006062,0.031505,0.023624,-0.011891,0.032356,-0.026536,0.018286,0.044938,-0.009244,-0.001288
std,1.248403,1.052042,1.312073,0.970187,1.287405,0.969283,1.59278,1.029535,1.00004,1.060475,1.020656,0.985442,0.96971,0.963993,0.98952,1.01996,0.982175,1.005101,0.999307,0.966813
min,-2.975477,-3.542198,-3.648496,-3.918041,-4.884116,-2.794579,-3.652313,-2.986601,-2.918837,-2.934463,-3.406244,-3.238744,-2.959043,-2.968874,-3.149839,-3.862588,-3.620017,-3.495458,-3.004282,-3.107939
25%,-0.982275,-0.670869,-1.104434,-0.627872,-0.714643,-0.618169,-1.570357,-0.779154,-0.735838,-0.709533,-0.703586,-0.665865,-0.631204,-0.637514,-0.653985,-0.696424,-0.672606,-0.637786,-0.712219,-0.665167
50%,-0.034344,0.01516,0.097427,-0.009837,0.064614,-0.009455,-0.093197,-0.078419,-0.028528,0.004389,0.005184,0.003021,0.03437,-0.018775,0.00894,-0.034147,0.030505,0.030585,0.011268,-0.014629
75%,0.933771,0.727298,1.017631,0.632537,0.972297,0.712999,1.333776,0.638344,0.636124,0.669298,0.715268,0.702941,0.689555,0.658253,0.692168,0.654612,0.674509,0.720226,0.663296,0.680776
max,3.987713,3.117381,4.090919,3.42345,2.945085,3.345936,3.593939,3.144467,3.144694,3.880467,3.003783,3.734356,3.580756,3.389002,3.288061,2.78757,3.459474,3.240767,3.132041,2.770057


In [19]:
# Fraction of samples in x whose predicted label equals the true label.
# NOTE(review): this appears to be the same x the model was fitted on, so the
# figure would be a training accuracy rather than a generalisation estimate — confirm.
predicted_labels = rf.predict(x)
print('Accuracy:', (predicted_labels == y).mean())

Accuracy: 0.997


In [20]:
# Number of samples in x whose predicted label equals the true label.
correct_mask = rf.predict(x) == y
print('Total correct:', correct_mask.sum())

Total correct: 997


In [31]:
import pandas as pd
from pyecharts import Bar

# Predicted class probabilities for every sample; the two columns are
# labelled '0' and '1' by this notebook.
probs = rf.predict_proba(x)
probs_df = pd.DataFrame(probs,columns=['0','1'])
# True where the hard prediction agrees with the label.
probs_df['is_correct'] = y == rf.predict(x)

# Mean correctness for each distinct value in column '0'.
# Computed once and reused for both the x-axis (index) and the bar heights
# (values) instead of running the same groupby twice.
accuracy_by_prob = probs_df.groupby('0').is_correct.mean()

bar = Bar("分类为0的正确率",width=600,height=300)
bar.add('',accuracy_by_prob.index,accuracy_by_prob.values)
bar

In [29]:
# Inspect the importance of each feature.
# A fresh forest is fitted on (x, y) and its per-feature importances are
# plotted against the feature's column position.
rf = RandomForestClassifier()
rf.fit(x,y)

importances = rf.feature_importances_
feature_positions = list(range(len(importances)))

bar = Bar('特征重要程度',width=600,height=300)
bar.add('',feature_positions,importances)
bar

## 调参  
通过调参来优化模型表现

In [55]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Fixed seed so the generated dataset, the train/test mask and the fitted
# forest are identical on every Restart & Run All.
SEED = 0

# Generate data: 10000 samples, 20 features of which 15 are informative,
# label-flip fraction 0.5 and class weights 0.8 / 0.2.
x,y = datasets.make_classification(n_samples=10000,
                                  n_features=20,
                                  n_informative=15,
                                  flip_y=.5,
                                  weights=[.8,.2],
                                  random_state=SEED)

# Boolean mask with one entry per sample: True (probability 0.8) puts the
# sample in the training set, False keeps it for evaluation.
training = np.random.RandomState(SEED).choice([True,False],p=[.8,.2],size=y.shape)

# Baseline forest with default hyper-parameters, scored on the held-out rows.
rf = RandomForestClassifier(random_state=SEED)
rf.fit(x[training],y[training])
preds = rf.predict(x[~training])
print("正确率：",(preds==y[~training]).mean())

正确率： 0.676530612245


* max_features 特征变量数量

In [103]:
from sklearn.metrics import confusion_matrix

# max_features sets how many feature variables are used when building a
# decision tree, which keeps the individual trees different from each other.
max_feature_params = [None,'auto','sqrt','log2',.01,.5,.99]

held_out = ~training  # rows not used for fitting

# Maps each max_features setting to the flattened confusion matrix obtained
# on the held-out rows.
confusion_matrixes = {}
for max_feature in max_feature_params:
    rf = RandomForestClassifier(max_features=max_feature)
    rf.fit(x[training],y[training])
    held_out_preds = rf.predict(x[held_out])
    confusion_matrixes[max_feature] = confusion_matrix(y[held_out],held_out_preds).ravel()

In [97]:
# One bar series per max_features setting; each series holds the four
# flattened confusion-matrix cells for that setting.
# The loop variable is deliberately NOT named `confusion_matrix`: that would
# rebind the function imported from sklearn.metrics and make any later call
# to confusion_matrix(...) fail.
bar = Bar('混淆矩阵',width=600,height=300)
cell_labels = ['0->0','0->1','1->0','1->1']
for max_feature_key, cell_counts in confusion_matrixes.items():
    bar.add(str(max_feature_key),cell_labels,cell_counts)
bar

* n_estimators 决策树数量

In [111]:
# Sweep the number of trees and record the held-out accuracy for each value.
n_estimator_params = range(1,20)
accuracy_matrix = {}
held_out = ~training  # rows not used for fitting
for n_estimator in n_estimator_params:
    rf = RandomForestClassifier(n_estimators=n_estimator)
    rf.fit(x[training],y[training])
    # Predict once per model. The fraction of matching labels equals
    # trace(confusion matrix) / sum(confusion matrix), without building the
    # matrix twice or depending on the confusion_matrix name still being bound
    # to the sklearn function.
    held_out_preds = rf.predict(x[held_out])
    accuracy_matrix[n_estimator] = (held_out_preds == y[held_out]).mean()

In [125]:
# Accuracy as a function of the number of trees.
bar = Bar(width=600,height=300)
tree_counts = list(n_estimator_params)
# Look each accuracy up by its tree count so the bar heights are guaranteed
# to line up with the x-axis, independent of dict iteration order.
accuracies = [accuracy_matrix[tree_count] for tree_count in tree_counts]
bar.add('',tree_counts,accuracies,xaxis_name='决策树数量',yaxis_name='准确率',yaxis_name_pos='end')
bar

* bootstrap 有放回的抽取样本

* n_jobs 加快训练过程

In [135]:
# Fit the forest with 4 parallel jobs; verbose output reports the progress
# of the parallel workers.
parallel_options = dict(n_jobs=4, verbose=True)
rf = RandomForestClassifier(**parallel_options)
rf.fit(x[training], y[training])

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=4, oob_score=False, random_state=None,
            verbose=True, warm_start=False)

In [136]:
# Accuracy of the current forest on the rows excluded from training.
held_out = ~training
(rf.predict(x[held_out]) == y[held_out]).mean()

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


0.66734693877551021