## 逻辑回归

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 读取数据文件
df = pd.read_csv('../data/breast-cancer-wisconsin.csv', header=None)

# 检查数据中是否有空值
# 首先检查是否有'?'值，因为在查看上下文中发现数据里有'?'表示缺失值
# print("数据中是否包含'?'值:", df == '?')

# 将'?'替换为NaN，以便使用pandas的空值检测函数
df = df.replace('?', np.nan)

# 显示每列缺失值的数量
print("\n每列缺失值数量:")
print(df.isna().sum())

# 显示数据的基本信息
print("\n数据基本信息:")
print(df.info())
# 删除包含缺失值的行
df_clean = df.dropna()
print(f"\n删除缺失值后的数据形状: {df_clean.shape}")

# 重命名列以便更容易理解
columns = ['ID', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 
           'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 
           'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
df_clean.columns = columns

# 查看数据的基本统计信息
print("\n数据的基本统计信息:")
print(df_clean.describe())

# 将目标变量转换为二分类问题（2为良性，4为恶性）
# 将4转换为1（恶性），2转换为0（良性）
# df_clean.loc[:, 'Class'] = df_clean.loc[:, 'Class'].map({2: 0, 4: 1})
print("\n类别分布:")
print(df_clean['Class'].value_counts())

# 准备特征和目标变量
X = df_clean.drop(['ID', 'Class'], axis=1)  # 删除ID列和目标变量
y = df_clean['Class']

# 划分训练集和测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\n训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")

# 创建逻辑回归模型
# 使用较小的C值增加正则化强度，并使用L2正则化
model = LogisticRegression(C=0.1, penalty='l2', solver='liblinear', max_iter=1000)

# 训练模型
model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

# 评估模型
print("\n模型评估:")
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")

print(y_pred[0:5])


每列缺失值数量:
0      0
1      0
2      0
3      0
4      0
5      0
6     16
7      0
8      0
9      0
10     0
dtype: int64

数据基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       699 non-null    int64 
 1   1       699 non-null    int64 
 2   2       699 non-null    int64 
 3   3       699 non-null    int64 
 4   4       699 non-null    int64 
 5   5       699 non-null    int64 
 6   6       683 non-null    object
 7   7       699 non-null    int64 
 8   8       699 non-null    int64 
 9   9       699 non-null    int64 
 10  10      699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB
None

删除缺失值后的数据形状: (683, 11)

数据的基本统计信息:
                 ID  Clump Thickness  Uniformity of Cell Size  \
count  6.830000e+02       683.000000               683.000000   
mean   1.076720e+06         4.442167                 3.150805   
std    6

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
