In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import metrics

1.获取数据 在线下载

2.数据基本处理 缺失值处理，划分数据集

3.特征工程 数据标准化

4.机器学习 逻辑回归的建模 

5.模型评估 二分类 准确率 精确率 召回率 F1_score AUC

# 获取数据 在线下载

## pandas在线下载数据

In [2]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data')

## 查看数据

In [3]:
data.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


## 修改列的名字

In [4]:
#给列名字
names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape','Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin','Normal Nucleoli', 'Mitoses', 'Class']

In [5]:
#给data增加一个names参数
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',names = names)

In [6]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## 列名解释

- Sample code number 样本编号 用不到
- Clump Thickness 肿瘤特征1
- Uniformity of Cell Size 肿瘤特征2
- Uniformity of Cell Shape 肿瘤特征3
- Marginal Adhesion 肿瘤特征4
- Single Epithelial Cell Size 肿瘤特征5
- Bare Nuclei 肿瘤特征6
- Bland Chromatin 肿瘤特征7
- Normal Nucleoli 肿瘤特征8
- Mitoses 肿瘤特征9
- Class 肿瘤的种类

在实际工作中，要弄清楚每一个肿瘤特征代表什么含义，这样才能做好异常值缺失值的处理

## 查看目标值

In [7]:
data.Class#说明：2表示良性，4表示恶性

0      2
1      2
2      2
3      2
4      2
5      4
6      2
7      2
8      2
9      2
10     2
11     2
12     4
13     2
14     4
15     4
16     2
17     2
18     4
19     2
20     4
21     4
22     2
23     4
24     2
25     4
26     2
27     2
28     2
29     2
      ..
669    4
670    4
671    2
672    2
673    2
674    2
675    2
676    2
677    2
678    2
679    2
680    4
681    4
682    2
683    2
684    2
685    2
686    2
687    2
688    2
689    2
690    2
691    4
692    2
693    2
694    2
695    2
696    4
697    4
698    4
Name: Class, Length: 699, dtype: int64

# 数据基本处理 缺失值处理，划分数据集

## ？标志的缺失值

In [8]:
#替换缺失值
data = data.replace(to_replace='?',value=np.nan)

In [9]:
## 删除缺失值的样本
data = data.dropna() #删除有np.nan的行

## 划分数据集

In [10]:
#特征值
x = data.iloc[:,1:10]
x.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [11]:
#目标值
y = data.iloc[:,-1]
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

# 特征工程 标准化

In [13]:
transform = StandardScaler()#实例化转换器
#标准化
x_train = transform.fit_transform(x_train)
x_test = transform.fit_transform(x_test)

# 逻辑回归机器学习建模

## 建立模型

In [14]:
estimate = LogisticRegression()#用默认的就行

## 训练模型

In [15]:
estimate.fit(x_train,y_train)#得到了模型

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# 模型评估

## 计算准确率

In [16]:
estimate.score(x_train,y_train)

0.9688644688644689

In [17]:
estimate.score(x_test,y_test)

0.9635036496350365

## 分类评估报告api

In [18]:
#获取预测值
y_predict = estimate.predict(x_test)

In [19]:
y_predict

array([4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2,
       2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4,
       2, 4, 2, 4, 4, 4, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 4, 4, 2, 2, 4, 4, 4, 2,
       4, 2, 4, 2, 4, 2, 2, 4, 2, 4, 2, 2, 4, 2, 4, 2, 4, 4, 2, 4, 2, 2,
       2, 2, 4, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4,
       2, 4, 4, 4, 2], dtype=int64)

In [20]:
res = classification_report(y_true=y_test, y_pred=y_predict,labels= [2,4],target_names=['良性','恶性'])

In [21]:
res

'             precision    recall  f1-score   support\n\n         良性       0.94      1.00      0.97        84\n         恶性       1.00      0.91      0.95        53\n\navg / total       0.97      0.96      0.96       137\n'

In [22]:
type(res)

str

In [23]:
print(res)# support指的是样本

             precision    recall  f1-score   support

         良性       0.94      1.00      0.97        84
         恶性       1.00      0.91      0.95        53

avg / total       0.97      0.96      0.96       137



## ROC曲线和AUC指标判断模型的好坏

roc_auc_score(y_true=y_test,y_score=y_predict)#会报错，因为我们没有设置dtype和pos_label

In [48]:
y_test

y_predict

y_a = y_test

y_b = y_predict

# # 尝试1 不行
# roc_auc_score(y_true=y_a,y_score=y_b)

# #尝试2 不行
# y_c = np.array(y_a,dtype='float64')

# y_d = np.array(y_b,dtype='float64')

# roc_auc_score(y_true=y_c,y_score=y_d)

# 尝试3 可以
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_a, y_b, pos_label=2)
metrics.auc(fpr, tpr)

# 尝试4 可以
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_c, y_d, pos_label=2)
metrics.auc(fpr, tpr)






0.047169811320754707

综上，没有设置pos_label=2的情况下直接用roc_auc_score是肯定不行的，需要这样：


import numpy as np

from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_c, y_d, pos_label=2)

metrics.auc(fpr, tpr)