In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

## 获取数据

In [2]:
names = ["编号", "特征1", "特征2", "特征3", "特征4", "特征5", "特征6", "特征7", "特征8", "特征9", "class", ]
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data", 
            names=names)

In [3]:
data.head()

Unnamed: 0,编号,特征1,特征2,特征3,特征4,特征5,特征6,特征7,特征8,特征9,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


## 基本数据处理

### 缺失值处理

In [4]:
data = data.replace(to_replace="?", value=np.nan)
data = data.dropna()

### 确定特征值和目标值

In [5]:
x = data.iloc[:, 1:10]
x

Unnamed: 0,特征1,特征2,特征3,特征4,特征5,特征6,特征7,特征8,特征9
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [6]:
y = data["class"]
y

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: class, Length: 683, dtype: int64

### 分割数据

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)

## 特征工程(标准化)

In [8]:
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.fit_transform(x_test)

## 模型训练---逻辑回归

In [9]:
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

LogisticRegression()

## 模型评估

### 准确率

In [10]:
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

准确率为：
 0.9854014598540146


### 预测值

In [11]:
y_pre = estimator.predict(x_test)
print("预测值为：\n", y_pre)

预测值为：
 [2 4 4 2 2 2 2 2 2 2 2 2 2 4 2 2 4 4 4 2 4 2 4 4 4 2 4 2 2 2 2 2 4 2 2 2 4
 2 2 2 2 4 2 4 4 4 4 2 4 4 2 2 2 2 2 4 2 2 2 2 4 4 4 4 2 4 2 2 4 2 2 2 2 4
 2 2 2 2 2 2 4 4 4 2 4 4 4 4 2 2 2 4 2 4 2 2 2 2 2 2 4 2 2 4 2 2 4 2 4 4 2
 2 2 2 4 2 2 2 2 2 2 4 2 4 2 2 2 4 2 4 2 2 2 4 2 2 2]


### 精确率与召回率

In [14]:
ret = classification_report(y_test, y_pre, labels=(2, 4), target_names=("良性", "恶性"))
print(ret)

              precision    recall  f1-score   support

          良性       0.99      0.99      0.99        89
          恶性       0.98      0.98      0.98        48

    accuracy                           0.99       137
   macro avg       0.98      0.98      0.98       137
weighted avg       0.99      0.99      0.99       137



### auc指标计算

In [16]:
y_test = np.where(y_test>3, 1, 0)

In [18]:
roc_auc_score(y_test, y_pre)

0.9839653558052434