In [2]:
# 逻辑回归案例，调整参数，感受准确率等各种指标变化
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

column = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
          'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
          'Mitoses', 'Class']# 列名
data = pd.read_csv(
    "./data/breast-cancer-wisconsin.csv",
    names=column)  # 读取数据，指定列名
print(data)# 打印数据
print(data.info())# 查看数据概况
data.describe(include='all')  # 查看数据概况

     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
694              776715                3                        1   
695              841769                2                        1   
696              888820                5                       10   
697              897471                4                        8   
698              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
unique,,,,,,,11.0,,,,
top,,,,,,,1.0,,,,
freq,,,,,,,402.0,,,,
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,,5.0,4.0,1.0,4.0


In [3]:
data = data.replace(to_replace='?', value=np.nan)# 将？替换为NaN
data = data.dropna()#直接删除，哪一行有空值，就删除对应的样本
print(data.shape)# 查看数据概况

(683, 11)


In [4]:
data.info()# 查看数据概况

<class 'pandas.core.frame.DataFrame'>
Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample code number           683 non-null    int64 
 1   Clump Thickness              683 non-null    int64 
 2   Uniformity of Cell Size      683 non-null    int64 
 3   Uniformity of Cell Shape     683 non-null    int64 
 4   Marginal Adhesion            683 non-null    int64 
 5   Single Epithelial Cell Size  683 non-null    int64 
 6   Bare Nuclei                  683 non-null    object
 7   Bland Chromatin              683 non-null    int64 
 8   Normal Nucleoli              683 non-null    int64 
 9   Mitoses                      683 non-null    int64 
 10  Class                        683 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 64.0+ KB


In [5]:
data[column[10]].unique()# 查看数据概况

array([2, 4])

In [6]:
data[column[6]] = data[column[6]].astype('int16')# 将Bare Nuclei列的类型转换为int16

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data[column[1:10]], data[column[10]], test_size=0.25, random_state=1)
# 划分训练集和测试集，随机种子为1
std = StandardScaler()# 标准化
x_train = std.fit_transform(x_train) #训练集标准化
x_test = std.transform(x_test) #测试集标准化
x_train[0]# 查看数据概况

array([-1.21629973, -0.70863282, -0.75174943,  0.04301674, -0.55657068,
       -0.71054972, -0.99312055, -0.62911518, -0.36280962])

In [8]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression(C=0.5, solver='lbfgs')# 逻辑回归模型，C为正则化参数，solver为优化算法
lg.fit(x_train, y_train)# 训练模型
print(lg.coef_)# 查看模型参数
y_predict = lg.predict(x_test)# 预测测试集
print("准确率：", lg.score(x_test, y_test))# 查看准确率
print(y_test[0:5])# 查看测试集真实值
print(lg.predict_proba(x_test)[0:5])  #得出对应分类的概率

[[1.11400191 0.25293086 0.78938469 0.60986034 0.0728013  1.10834397
  0.7794668  0.64312128 0.67692658]]
准确率： 0.9824561403508771
444    2
24     2
195    2
49     4
375    2
Name: Class, dtype: int64
[[0.94893919 0.05106081]
 [0.99494175 0.00505825]
 [0.98365149 0.01634851]
 [0.02707911 0.97292089]
 [0.99732446 0.00267554]]


In [9]:
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"]))# 查看分类报告
print("AUC指标：", roc_auc_score(y_test, y_predict))# 查看AUC指标

              precision    recall  f1-score   support

          良性       0.97      1.00      0.99       111
          恶性       1.00      0.95      0.97        60

    accuracy                           0.98       171
   macro avg       0.99      0.97      0.98       171
weighted avg       0.98      0.98      0.98       171

AUC指标： 0.975
