In [129]:
"""
随机森林学习
"""

'\n随机森林学习\n'

In [130]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import label_binarize
from sklearn import metrics

In [131]:
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

In [132]:
path = "data/risk_factors_cervical_cancer.csv"  # 数据文件路径
data = pd.read_csv(path)
data.head(2)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [133]:
colums = data.columns.values
colums.shape

(36,)

In [134]:
X =  data.loc[:,colums[0:-4]]
X.head(1)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,?,?,0,0,0,0


In [135]:
Y = data.loc[:,colums[-4:]]
dict(Y['Biopsy'].value_counts())

{0: 803, 1: 55}

In [136]:
"""
=======================================数据预处理=================================
https://blog.csdn.net/sinat_35512245/article/details/79685891
"""



In [137]:
#### 1.查看是否有空数据

In [138]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
Age                                   858 non-null int64
Number of sexual partners             858 non-null object
First sexual intercourse              858 non-null object
Num of pregnancies                    858 non-null object
Smokes                                858 non-null object
Smokes (years)                        858 non-null object
Smokes (packs/year)                   858 non-null object
Hormonal Contraceptives               858 non-null object
Hormonal Contraceptives (years)       858 non-null object
IUD                                   858 non-null object
IUD (years)                           858 non-null object
STDs                                  858 non-null object
STDs (number)                         858 non-null object
STDs:condylomatosis                   858 non-null object
STDs:cervical condylomatosis          858 non-null object
STDs:vaginal condylomatosi

In [139]:
### 数据中有 ？ 的处理

In [140]:
X = X.replace("?", np.NAN)

In [141]:
imputer = Imputer(missing_values="NaN")
X = imputer.fit_transform(X, Y)
X[0] ###series 取行操作

array([18.        ,  4.        , 15.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  6.14084507,  5.81690141,  0.        ,  0.        ,
        0.        ,  0.        ])

In [142]:
####数据分隔
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print ("训练样本数量:%d,特征属性数目:%d,目标属性数目:%d" % (x_train.shape[0],x_train.shape[1],y_train.shape[1]))
print ("测试样本数量:%d" % x_test.shape[0])

训练样本数量:686,特征属性数目:32,目标属性数目:4
测试样本数量:172


In [143]:
###最大最小化

In [144]:
ss = MinMaxScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

In [145]:
####降维
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [146]:
x_train.shape

(686, 2)

In [29]:
"""
随机森林模型，参数分两部分：
RF 框架参数：
n_estimators: 最大迭代次数，或者说最大的弱学习器的个数
oob_score: 是否采用袋外样本来评估模型的好坏，默认为False。
criterion: CART 做划分时，对特征的评价标准。
            分类RF 对应的CART 分类树默认的基尼系数gini，另外一个可选择的标准是信息增益
            回归RF 对应的CART 回归树默认是均方差mse, 另一个可选的标准是mae(绝对值误差)。
决策树参数：
max_features: RF 划分时考虑的最大特征数。 
max_depth: 决策树的最大深度。推荐这个最大深度的值为10~100
min_samples_split： 内部节点再划分需要的最小样本数；默认是2；
                    如果某节点的样本数小于min_samples_split， 则不会继续再尝试选择最优特征来进行划分
min_samples_leaf： 这个值限制了叶子节点最少的样本数，如果节点的样本数量小于这个值， 
                    则会和兄弟节点一起被剪枝。  
"""

'\n随机森林模型，参数分两部分：\nRF 框架参数：\nn_estimators: 最大迭代次数，或者说最大的弱学习器的个数\noob_score: 是否采用袋外样本来评估模型的好坏，默认为False。\ncriterion: CART 做划分时，对特征的评价标准。\n            分类RF 对应的CART 分类树默认的基尼系数gini，另外一个可选择的标准是信息增益\n            回归RF 对应的CART 回归树默认是均方差mse, 另一个可选的标准是mae(绝对值误差)。\n决策树参数：\nmax_features: RF 划分时考虑的最大特征数。 \nmax_depth: 决策树的最大深度。推荐这个最大深度的值为10~100\nmin_samples_split： 内部节点再划分需要的最小样本数；默认是2；\n                    如果某节点的样本数小于min_samples_split， 则不会继续再尝试选择最优特征来进行划分\nmin_samples_leaf： 这个值限制了叶子节点最少的样本数，如果节点的样本数量小于这个值， \n                    则会和兄弟节点一起被剪枝。  \n'

In [30]:
forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=1, random_state=0)
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [31]:
"""
模型效果评估： AUC和ROC
"""

'\n模型效果评估： AUC和ROC\n'

In [32]:
score = forest.score(x_test,y_test)
print("准确率：%.2f%%" % (score * 100))

准确率：89.53%


In [88]:
####模型预测
forest_y_score = forest.predict_proba(x_test)

(172, 2)

In [84]:
#计算ROC值
forest_fpr1, forest_tpr1, _ = metrics.roc_curve(label_binarize(y_test[colums[-4]],classes=(0,1,2)).T[0:-1].T.ravel(), forest_y_score[0].ravel())
forest_fpr2, forest_tpr2, _ = metrics.roc_curve(label_binarize(y_test[colums[-3]],classes=(0,1,2)).T[0:-1].T.ravel(), forest_y_score[1].ravel())
forest_fpr3, forest_tpr3, _ = metrics.roc_curve(label_binarize(y_test[colums[-2]],classes=(0,1,2)).T[0:-1].T.ravel(), forest_y_score[2].ravel())
forest_fpr4, forest_tpr4, _ = metrics.roc_curve(label_binarize(y_test[colums[-1]],classes=(0,1,2)).T[0:-1].T.ravel(), forest_y_score[3].ravel())

In [87]:
forest_fpr1.shape

(36,)

In [89]:
#AUC值
auc1 = metrics.auc(forest_fpr1, forest_tpr1)
auc2 = metrics.auc(forest_fpr2, forest_tpr2)
auc3 = metrics.auc(forest_fpr3, forest_tpr3)
auc4 = metrics.auc(forest_fpr4, forest_tpr4)

print ("Hinselmann目标属性AUC值：", auc1)
print ("Schiller目标属性AUC值：", auc2)
print ("Citology目标属性AUC值：", auc3)
print ("Biopsy目标属性AUC值：", auc4)

Hinselmann目标属性AUC值： 0.9901974040021634
Schiller目标属性AUC值： 0.9559221200648998
Citology目标属性AUC值： 0.9637979989183343
Biopsy目标属性AUC值： 0.9568685776095187


In [90]:
"""
参数调整，比较效果
"""

'\n参数调整，比较效果\n'

In [None]:
###GridSearch 调整模型的迭代次数： n_estimators


In [94]:
param_test1 = {'n_estimators':list(range(10,100,10))}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,max_depth=1,random_state=0), 
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(x_train, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.56764, std: 0.04135, params: {'n_estimators': 10},
  mean: 0.55097, std: 0.05901, params: {'n_estimators': 20},
  mean: 0.55312, std: 0.05771, params: {'n_estimators': 30},
  mean: 0.56682, std: 0.08755, params: {'n_estimators': 40},
  mean: 0.56311, std: 0.08798, params: {'n_estimators': 50},
  mean: 0.55925, std: 0.08358, params: {'n_estimators': 60},
  mean: 0.54874, std: 0.06371, params: {'n_estimators': 70},
  mean: 0.54708, std: 0.06375, params: {'n_estimators': 80},
  mean: 0.54679, std: 0.06346, params: {'n_estimators': 90}],
 {'n_estimators': 10},
 0.567641597211095)

In [101]:
#### 调整树的深度， 和n_estimators 一起学
param_test2 = {'n_estimators':list(range(10,110,10)),'max_depth': list(range(1,14,2))}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,random_state=0), 
                       param_grid = param_test2, scoring='roc_auc',cv=5)
gsearch2.fit(x_train, y_train)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.56764, std: 0.04135, params: {'max_depth': 1, 'n_estimators': 10},
  mean: 0.55097, std: 0.05901, params: {'max_depth': 1, 'n_estimators': 20},
  mean: 0.55312, std: 0.05771, params: {'max_depth': 1, 'n_estimators': 30},
  mean: 0.56682, std: 0.08755, params: {'max_depth': 1, 'n_estimators': 40},
  mean: 0.56311, std: 0.08798, params: {'max_depth': 1, 'n_estimators': 50},
  mean: 0.55925, std: 0.08358, params: {'max_depth': 1, 'n_estimators': 60},
  mean: 0.54874, std: 0.06371, params: {'max_depth': 1, 'n_estimators': 70},
  mean: 0.54708, std: 0.06375, params: {'max_depth': 1, 'n_estimators': 80},
  mean: 0.54679, std: 0.06346, params: {'max_depth': 1, 'n_estimators': 90},
  mean: 0.54916, std: 0.06262, params: {'max_depth': 1, 'n_estimators': 100},
  mean: 0.56905, std: 0.07986, params: {'max_depth': 3, 'n_estimators': 10},
  mean: 0.54655, std: 0.04575, params: {'max_depth': 3, 'n_estimators': 20},
  mean: 0.54682, std: 0.04490, params: {'max_depth': 3, 'n_estimators': 30}

In [102]:
####通过选择的最好参数来查看AUC 
forest1 = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=11, random_state=0)
forest1.fit(x_train, y_train)
score = forest1.score(x_test,y_test)
print("准确率：%.2f%%" % (score * 100))

准确率：87.79%


In [None]:
"""
在选择好的参数基础上调整min_samples_split 和 min_samples_leaf
"""

In [103]:
param_test3 = {'min_samples_split':list(range(80,150,20)),'min_samples_leaf': list(range(10,60,10))}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=11, random_state=0), 
                       param_grid = param_test3, scoring='roc_auc',cv=5)
gsearch3.fit(x_train, y_train)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.59136, std: 0.06060, params: {'min_samples_leaf': 10, 'min_samples_split': 80},
  mean: 0.58690, std: 0.06048, params: {'min_samples_leaf': 10, 'min_samples_split': 100},
  mean: 0.58559, std: 0.05548, params: {'min_samples_leaf': 10, 'min_samples_split': 120},
  mean: 0.58643, std: 0.05950, params: {'min_samples_leaf': 10, 'min_samples_split': 140},
  mean: 0.57729, std: 0.06430, params: {'min_samples_leaf': 20, 'min_samples_split': 80},
  mean: 0.57748, std: 0.06139, params: {'min_samples_leaf': 20, 'min_samples_split': 100},
  mean: 0.57709, std: 0.06237, params: {'min_samples_leaf': 20, 'min_samples_split': 120},
  mean: 0.57625, std: 0.06590, params: {'min_samples_leaf': 20, 'min_samples_split': 140},
  mean: 0.58283, std: 0.06892, params: {'min_samples_leaf': 30, 'min_samples_split': 80},
  mean: 0.58216, std: 0.07170, params: {'min_samples_leaf': 30, 'min_samples_split': 100},
  mean: 0.58387, std: 0.07130, params: {'min_samples_leaf': 30, 'min_samples_split': 120},
  

In [104]:
####通过选择的最好参数来查看AUC 
forest1 = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=11, random_state=0,min_samples_leaf=10,min_samples_split=80)
forest1.fit(x_train, y_train)
score = forest1.score(x_test,y_test)
print("准确率：%.2f%%" % (score * 100))

准确率：89.53%


In [148]:
forest1.oob_score_

AttributeError: 'RandomForestClassifier' object has no attribute 'oob_score_'

In [None]:
"""
调整样本数：max_features
"""

In [147]:
param_test4 = {'max_features':list(range(1,2,1))}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=11, random_state=0,min_samples_leaf=10,min_samples_split=80), 
                       param_grid = param_test4, scoring='roc_auc',cv=5)
gsearch4.fit(x_train, y_train)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.59136, std: 0.06060, params: {'max_features': 1}],
 {'max_features': 1},
 0.5913592622550055)