In [1]:
"""
随机森林学习
"""

'\n随机森林学习\n'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import label_binarize
from sklearn import metrics



In [3]:
# Plot configuration: use the SimHei font so CJK labels render, and keep the
# minus sign drawable when that font is active.
mpl.rcParams.update({
    'font.sans-serif': [u'SimHei'],
    'axes.unicode_minus': False,
})

In [4]:
# Path of the cervical-cancer risk-factor CSV.
path = "datas/risk_factors_cervical_cancer.csv"

# Read the whole file into a DataFrame.
data = pd.read_csv(path)

# Preview the first two rows.
data.head(n=2)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [10]:
# Column labels as a NumPy array; the following cells slice it to separate
# feature columns from target columns.
# (The name is a misspelling of "columns"; it is kept because other cells use it.)
colums = data.columns.values

In [13]:
# Feature matrix: every column except the last four, which are the targets.
X =  data.loc[:,colums[0:-4]]
# Display one row as a quick check of the selected columns.
X.head(1)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,?,?,0,0,0,0


In [16]:
# Target matrix: the last four columns (Hinselmann, Schiller, Citology, Biopsy),
# i.e. one label per diagnostic test, which makes this a multi-output problem.
Y = data.loc[:,colums[-4:]]
# Display one row as a quick check of the selected columns.
Y.head(1)

Unnamed: 0,Hinselmann,Schiller,Citology,Biopsy
0,0,0,0,0


In [None]:
"""
=======================================数据预处理=================================
https://blog.csdn.net/sinat_35512245/article/details/79685891
"""

In [19]:
#### 1.查看是否有空数据

In [20]:
# Summarise column dtypes and non-null counts.
# NOTE: a full non-null count does not mean the data is complete here: the file
# encodes missing values as the string "?", which pandas counts as a regular
# (object-typed) value rather than as null.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
Age                                   858 non-null int64
Number of sexual partners             858 non-null object
First sexual intercourse              858 non-null object
Num of pregnancies                    858 non-null object
Smokes                                858 non-null object
Smokes (years)                        858 non-null object
Smokes (packs/year)                   858 non-null object
Hormonal Contraceptives               858 non-null object
Hormonal Contraceptives (years)       858 non-null object
IUD                                   858 non-null object
IUD (years)                           858 non-null object
STDs                                  858 non-null object
STDs (number)                         858 non-null object
STDs:condylomatosis                   858 non-null object
STDs:cervical condylomatosis          858 non-null object
STDs:vaginal condylomatosi

In [23]:
### 数据中有 ? 的处理（"?" 表示缺失值）

In [24]:
# Missing entries are stored in the CSV as the literal string "?"; convert them
# to real NaN values so that they can be imputed afterwards.
# `np.nan` is the canonical constant: the upper-case `np.NAN` alias is
# discouraged by NumPy and is not available in NumPy 2.x, whereas `np.nan`
# works on every NumPy version.
X = X.replace("?", np.nan)

In [28]:
# Replace every NaN using the imputer's default strategy (the column mean).
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split, so held-out rows influence the fill values. Consider
# fitting it on the training split only.
imputer = Imputer(missing_values="NaN")
imputer.fit(X, Y)  # Y is passed as in the original call; imputation only uses X
X = imputer.transform(X)
# X is now a plain NumPy array (no longer a DataFrame), so X[0] is its first row.
X[0]

array([18.        ,  4.        , 15.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  6.14084507,  5.81690141,  0.        ,  0.        ,
        0.        ,  0.        ])

In [29]:
#### Train/test split
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# Report (training samples, feature count, target count), then the test size.
print("训练样本数量:%d,特征属性数目:%d,目标属性数目:%d" % (x_train.shape + y_train.shape[1:]))
print("测试样本数量:%d" % len(x_test))

训练样本数量:686,特征属性数目:32,目标属性数目:4
测试样本数量:172


In [31]:
###最大最小化

In [32]:
# Rescale each feature to the [0, 1] range.
# The per-feature min/max are learned from the training split only and then
# applied to both splits, so the test data does not influence the scaling.
ss = MinMaxScaler()
ss.fit(x_train)
x_train, x_test = ss.transform(x_train), ss.transform(x_test)

In [33]:
#### Dimensionality reduction
# Project the features onto the first two principal components.
# The projection is learned from the training split only and then re-applied
# unchanged to the test split.
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [34]:
"""
随机森林模型，参数分两部分：
RF 框架参数：
n_estimators: 最大迭代次数，或者说最大的弱学习器的个数
oob_score: 是否采用袋外样本来评估模型的好坏，默认为False。
criterion: CART 做划分时，对特征的评价标准。
            分类RF 对应的CART 分类树默认的基尼系数gini，另外一个可选择的标准是信息增益
            回归RF 对应的CART 回归树默认是均方差mse, 另一个可选的标准是mae(绝对值误差)。
决策树参数：
max_features: RF 划分时考虑的最大特征数。 
max_depth: 决策树的最大深度。推荐这个最大深度的值为10~100
min_samples_split： 内部节点再划分需要的最小样本数；默认是2；
                    如果某节点的样本数小于min_samples_split， 则不会继续再尝试选择最优特征来进行划分
min_samples_leaf： 这个值限制了叶子节点最少的样本数，如果节点的样本数量小于这个值， 
                    则会和兄弟节点一起被剪枝。  
"""

'\n随机森林模型，参数分两部分：\nRF 框架参数：\nn_estimators: 最大迭代次数，或者说最大的弱学习器的个数\noob_score: 是否采用袋外样本来评估模型的好坏，默认为False。\ncriterion: CART 做划分时，对特征的评价标准。\n            分类RF 对应的CART 分类树默认的基尼系数gini，另外一个可选择的标准是信息增益\n            回归RF 对应的CART 回归树默认是均方差mse, 另一个可选的标准是mae(绝对值误差)。\n决策树参数：\nmax_features: RF 划分时考虑的最大特征数。 \nmax_depth: 决策树的最大深度。推荐这个最大深度的值为10~100\nmin_samples_split： 内部节点再划分需要的最小样本数；默认是2；\n                    如果某节点的样本数小于min_samples_split， 则不会继续再尝试选择最优特征来进行划分\nmin_samples_leaf： 这个值限制了叶子节点最少的样本数，如果节点的样本数量小于这个值， \n                    则会和兄弟节点一起被剪枝。  \n'

In [36]:
# Random forest of 100 Gini-criterion trees, each limited to a single split
# (max_depth=1); the seed is fixed for reproducibility.
forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=1,
    random_state=0,
)
# y_train holds four target columns, so this fits a multi-output classifier.
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [37]:
"""
模型效果评估： AUC和ROC
"""

'\n模型效果评估： AUC和ROC\n'

In [38]:
score = forest.score(x_test,y_test)
print("准确率：")