In [8]:
# 导入所需的库
import pandas as pd
import numpy as np

# 读取breast-cancer数据集
data = pd.read_csv('./data/breast-cancer-wisconsin.csv',header = None)

data = data.replace('?',np.nan)

# 检查数据中的空值
print("数据集中的空值数量:")
print(data.isna().sum())
print("原始数据集大小:", data.shape) 

# 删除包含缺失值的行
data = data.dropna()

print(data.shape)   
# 再次检查确认所有空值都已处理
print("\n处理后的空值数量:")
print(data.isna().sum())
print(data.info())


数据集中的空值数量:
0      0
1      0
2      0
3      0
4      0
5      0
6     16
7      0
8      0
9      0
10     0
dtype: int64
原始数据集大小: (699, 11)
(683, 11)

处理后的空值数量:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       683 non-null    int64 
 1   1       683 non-null    int64 
 2   2       683 non-null    int64 
 3   3       683 non-null    int64 
 4   4       683 non-null    int64 
 5   5       683 non-null    int64 
 6   6       683 non-null    object
 7   7       683 non-null    int64 
 8   8       683 non-null    int64 
 9   9       683 non-null    int64 
 10  10      683 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 64.0+ KB
None


In [10]:
# 重命名列以便更容易理解
columns = ['ID', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 
           'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 
           'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
data.columns = columns
data

Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [13]:
from sklearn.model_selection import train_test_split
x = data.drop(['ID','Class'],axis=1) #DataFrame.drop() takes from 1 to 2 positional arguments but 3 positional arguments,注意传参
y = data['Class']
y = y.replace(2,0)
y = y.replace(4,1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [15]:
# 使用逻辑回归模型
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

# 创建逻辑回归模型
# 使用较小的C值增加正则化强度，并使用L2正则化
model = LogisticRegression(C=0.1, penalty='l2', solver='liblinear', max_iter=1000)

# 训练模型
model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test) # 预测概率

# 评估模型
print("\n模型评估:")
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")

print(y_pred[0:5])
print(y_pred_prob[0:5]) 


模型评估:
准确率: 0.9124
[1 1 0 0 0]
[[0.04506915 0.95493085]
 [0.06565168 0.93434832]
 [0.92579728 0.07420272]
 [0.89191197 0.10808803]
 [0.881855   0.118145  ]]
