# 导入所需要的包

In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sympy import im
import warnings
import logging
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')

| Variable     | Definition       | Key                                            |
|--------------|------------------|------------------------------------------------|
| index        | 病人编号         |                                                |
| Source       | 病人来源         | 0：国外数据集 1：国内数据集                    |
| Sex          | 性别             | 0：男 1：女                                    |
| Age          | 年龄             |                                                |
| BMI          | 身体质量指数     |                                                |
| Diagnostic   | 癌症类型         |                                                |
| Stage        | 癌症分期         |                                                |
| NLR          | 中性/淋巴        |                                                |
| HGB          | 血红蛋白         |                                                |
| Surgury      | 之前是否进行手术 | 0：未手术 1：已手术                            |
| Chemo        | 是否同时进行化疗 | 0：否；1：是                                   |
| Radiotherapy | 是否同时进行放疗 | 0：否；1：是                                   |
| Drug         | 免疫药物         | 0：PD1/PDL1orCTLA4；1：Combo                   |
| MSI          | 微卫星不稳定性   | 0：MSS； 1：MSI-H                              |
| GeneMutation | 基因突变         | 0.0：HER2阴性；1.0：HER2阳性；2.0：K-RAS阳性   |
| CPS          | 联合阳性评分     |                                                |
| ki67         | 细胞增殖标志物   |                                                |
| Response     | 免疫应答         | 0：无应答；1：有应答                           |

In [73]:
# Load the patient table from CSV and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="digestive_cancer.csv")
data.head(n=5)

Unnamed: 0,Source,Sex,Age,BMI,Diagnostic,Stage,NLR,HGB,Surgury,Chemo,Radiotherapy,Drug,MSI,GeneMutation,CPS,ki67,Response
0,0,1,43.871321,30.1,Esophageal,4,2.44,120.0,0,0,0,0,0.0,,,,0
1,0,0,70.603696,22.3,Esophageal,4,3.4,127.0,0,0,0,0,0.0,,,,1
2,0,1,63.586585,28.4,Esophageal,4,8.0,96.0,0,0,0,0,0.0,,,,0
3,0,0,58.896646,19.4,Esophageal,4,8.71,115.0,0,0,0,0,0.0,,,,0
4,0,1,60.692676,37.4,Esophageal,4,2.63,142.0,0,0,0,0,0.0,,,,0


In [74]:
# Summarize every column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Source        506 non-null    int64  
 1   Sex           506 non-null    int64  
 2   Age           506 non-null    float64
 3   BMI           505 non-null    float64
 4   Diagnostic    506 non-null    object 
 5   Stage         506 non-null    int64  
 6   NLR           506 non-null    float64
 7   HGB           506 non-null    float64
 8   Surgury       506 non-null    int64  
 9   Chemo         506 non-null    int64  
 10  Radiotherapy  506 non-null    int64  
 11  Drug          506 non-null    int64  
 12  MSI           427 non-null    float64
 13  GeneMutation  184 non-null    float64
 14  CPS           137 non-null    float64
 15  ki67          67 non-null     float64
 16  Response      506 non-null    int64  
dtypes: float64(8), int64(8), object(1)
memory usage: 67.3+ KB


In [75]:
# Group the columns by role: categorical features, numeric features, the
# cohort indicator and the prediction target.
category_columns = [
    'Sex', 'Diagnostic', 'Stage', 'Surgury', 'Chemo',
    'Radiotherapy', 'Drug', 'MSI', 'GeneMutation',
]
numeric_columns = ['Age', 'BMI', 'NLR', 'HGB', 'CPS', 'ki67']
hospital_source = ['Source']
target = ['Response']

# Replace the cancer-type strings with integer codes; the fitted encoder is
# kept so the codes can be mapped back to the original names later.
diagnostic_encoder = LabelEncoder()
diagnostic_codes = diagnostic_encoder.fit_transform(data['Diagnostic'])
data['Diagnostic'] = diagnostic_codes

In [76]:
# Class-imbalance ratio: number of negative samples (Response == 0) divided by
# the number of positive samples (Response == 1).
y = data['Response']
negative_count = (y == 0).sum()
positive_count = (y == 1).sum()
scale_pos_weight = negative_count / positive_count

# 划分训练集和测试集

In [77]:
# The table merges a domestic cohort (Source == 1) and a foreign cohort
# (Source == 0) that differ markedly, so each cohort is split on its own and
# the resulting parts are concatenated afterwards.
data_in = data.loc[data['Source'] == 1]
data_out = data.loc[data['Source'] == 0]


def _strata_key(cohort):
    """Return one 'Diagnostic_Response' string per row of *cohort*."""
    diagnostic_text = cohort['Diagnostic'].astype(str)
    response_text = cohort['Response'].astype(str)
    return diagnostic_text + '_' + response_text


# Stratifying on the combined key keeps both the response balance and the
# cancer-type mix inside each response class equal across train and test.
data_in_stratify = _strata_key(data_in)
data_out_stratify = _strata_key(data_out)

# Separate the features from the target; the cohort indicator is not a feature.
non_feature_columns = ['Response', 'Source']
data_in_X, data_in_y = data_in.drop(columns=non_feature_columns), data_in['Response']
data_out_X, data_out_y = data_out.drop(columns=non_feature_columns), data_out['Response']

# 80/20 stratified split of each cohort with a fixed seed.
split_settings = {'test_size': 0.2, 'random_state': 42}
data_in_train_X, data_in_test_X, data_in_train_y, data_in_test_y = train_test_split(
    data_in_X, data_in_y, stratify=data_in_stratify, **split_settings
)
data_out_train_X, data_out_test_X, data_out_train_y, data_out_test_y = train_test_split(
    data_out_X, data_out_y, stratify=data_out_stratify, **split_settings
)

# Stack the two cohorts back together (domestic rows first) with a fresh index.
data_train_X = pd.concat([data_in_train_X, data_out_train_X], ignore_index=True)
data_train_y = pd.concat([data_in_train_y, data_out_train_y], ignore_index=True)
data_test_X = pd.concat([data_in_test_X, data_out_test_X], ignore_index=True)
data_test_y = pd.concat([data_in_test_y, data_out_test_y], ignore_index=True)

# Report the per-column missing-value counts of both splits.
for split_features in (data_train_X, data_test_X):
    print(split_features.isnull().sum())

Sex               0
Age               0
BMI               1
Diagnostic        0
Stage             0
NLR               0
HGB               0
Surgury           0
Chemo             0
Radiotherapy      0
Drug              0
MSI              63
GeneMutation    257
CPS             296
ki67            352
dtype: int64
Sex              0
Age              0
BMI              0
Diagnostic       0
Stage            0
NLR              0
HGB              0
Surgury          0
Chemo            0
Radiotherapy     0
Drug             0
MSI             16
GeneMutation    65
CPS             73
ki67            87
dtype: int64


# 填充数据集

In [78]:
# # 完成训练集、测试集的划分之后，开始填充数据集
# # 将训练数据集中的缺失值使用miceforest库进行填充，填充训练完成以后再对测试集的数据进行填充
# # 统计训练集和测试集的缺失值情况
# # print(data_train_X.isnull().sum())
# # print(data_test_X.isnull().sum())
# # kernal = mf.ImputationKernel(data=data_train_X, datasets=1, save_all_iterations=True, random_state=42)
# # kernal.mice(1)
# # # 获取填补后的数据
# # data_train_X = kernal.complete_data(dataset=0)
# # # 对测试集的数据进行填补
# # data_test_X = kernal.impute_new_data(new_data=data_test_X, random_state=42)
# # # 统计训练集和测试集的缺失值情况
# # print(data_train_X.isnull().sum())
# # print(data_test_X.isnull().sum())
# # 由于上述插值过程太过耗时，所以改用KNN插值法补充数据
# # print(data['Diagnostic'].dtype)
# from sklearn.impute import KNNImputer
# # 对类别变量进行OneHot编码
# onehot = OneHotEncoder(sparse=False)
# # 对训练集的类别变量进行编码
# categorical_train = onehot.fit_transform(data_train_X[category_columns])
# categorical_train_df = pd.DataFrame(
#     categorical_train,
#     columns=onehot.get_feature_names_out(category_columns)
# )

# # 对测试集的类别变量进行编码
# categorical_test = onehot.transform(data_test_X[category_columns])
# categorical_test_df = pd.DataFrame(
#     categorical_test,
#     columns=onehot.get_feature_names_out(category_columns)
# )
# print("实际的列名:", data_train_X.columns.tolist())
# # 将数值型变量与编码后的类别变量合并
# data_train_encoded = pd.concat([data_train_X[numeric_columns], categorical_train_df], axis=1)
# data_test_encoded = pd.concat([data_test_X[numeric_columns], categorical_test_df], axis=1)
# # 创建KNNImputer对象，指定邻居数
# knn_imputer = KNNImputer(n_neighbors=5)

# # 对训练集进行填充
# data_train_X_filled = pd.DataFrame(
#     knn_imputer.fit_transform(data_train_X),
#     columns=data_train_X.columns
# )

# # 对测试集进行填充
# data_test_X_filled = pd.DataFrame(
#     knn_imputer.transform(data_test_X),
#     columns=data_test_X.columns
# )

# # 检查填充后的数据集是否还有缺失值

# print(data_train_X_filled.isnull().sum())
# print(data_test_X_filled.isnull().sum())



In [79]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import numpy as np

need_impute_category_columns = ['MSI', 'GeneMutation']
# 1. Label-encode (not one-hot encode) the categorical columns that need
#    imputation, so the imputer sees one numeric column per variable.
data_train_encoded = data_train_X.copy()
data_test_encoded = data_test_X.copy()
label_encoders = {}

for col in need_impute_category_columns:
    encoder = LabelEncoder()
    label_encoders[col] = encoder
    # Only observed values are encoded; missing entries stay NaN for the imputer.
    train_observed = data_train_X[col].notna()
    test_observed = data_test_X[col].notna()
    # The encoder is fitted on the training split and merely applied to the test split.
    data_train_encoded.loc[train_observed, col] = encoder.fit_transform(data_train_X.loc[train_observed, col])
    data_test_encoded.loc[test_observed, col] = encoder.transform(data_test_X.loc[test_observed, col])

# Print the missing-value counts per column after encoding.
for encoded_split in (data_train_encoded, data_test_encoded):
    print(encoded_split.isnull().sum())

# 2. Build the imputer. A single IterativeImputer with a random-forest
#    regressor as its estimator is used for every column that has gaps.
forest_regressor = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)
numeric_imputer = IterativeImputer(
    estimator=forest_regressor,
    max_iter=10,
    random_state=42,
    verbose=1,
)

# 3. Joint imputation: all features are passed to the imputer together.
#    The imputer is fitted on the training split and then applied to the test
#    split, so no test information enters the fit.
train_filled_values = numeric_imputer.fit_transform(data_train_encoded)
data_train_filled = pd.DataFrame(
    train_filled_values,
    index=data_train_encoded.index,
    columns=data_train_encoded.columns,
)

test_filled_values = numeric_imputer.transform(data_test_encoded)
data_test_filled = pd.DataFrame(
    test_filled_values,
    index=data_test_encoded.index,
    columns=data_test_encoded.columns,
)

# 4. Turn the imputed (continuous) values of the categorical columns back into
#    integer codes, and
# 5. map those codes back to the original category labels.
for col in need_impute_category_columns:
    encoder = label_encoders[col]
    # Valid codes are 0 .. len(classes_) - 1. Clipping after rounding guards
    # against an imputed value that lands outside that range, which would make
    # inverse_transform raise ValueError.
    highest_code = len(encoder.classes_) - 1
    for filled in (data_train_filled, data_test_filled):
        codes = np.clip(np.round(filled[col]), 0, highest_code).astype(int)
        filled[col] = encoder.inverse_transform(codes)

# 6. One-hot encode the categorical columns (fitted on the training split only).
#    `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
#    1.4, so try the new keyword first and fall back for older releases.
try:
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

categorical_train = onehot.fit_transform(data_train_filled[category_columns])
categorical_train_df = pd.DataFrame(
    categorical_train,
    columns=onehot.get_feature_names_out(category_columns),
    index=data_train_filled.index
)

# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
categorical_test = onehot.transform(data_test_filled[category_columns])
categorical_test_df = pd.DataFrame(
    categorical_test,
    columns=onehot.get_feature_names_out(category_columns),
    index=data_test_filled.index
)

# 7. Assemble the final feature matrices: numeric columns followed by the
#    one-hot columns.
data_train_final = pd.concat([data_train_filled[numeric_columns], categorical_train_df], axis=1)
data_test_final = pd.concat([data_test_filled[numeric_columns], categorical_test_df], axis=1)

# 8. Sanity check: print the total number of remaining missing values per split.
print("训练集缺失值情况：")
print(data_train_final.isnull().sum().sum())
print("\n测试集缺失值情况：")
print(data_test_final.isnull().sum().sum())

Sex               0
Age               0
BMI               1
Diagnostic        0
Stage             0
NLR               0
HGB               0
Surgury           0
Chemo             0
Radiotherapy      0
Drug              0
MSI              63
GeneMutation    257
CPS             296
ki67            352
dtype: int64
Sex              0
Age              0
BMI              0
Diagnostic       0
Stage            0
NLR              0
HGB              0
Surgury          0
Chemo            0
Radiotherapy     0
Drug             0
MSI             16
GeneMutation    65
CPS             73
ki67            87
dtype: int64
[IterativeImputer] Completing matrix with shape (404, 15)
[IterativeImputer] Change: 55.130149229736524, scaled tolerance: 0.166 
[IterativeImputer] Change: 26.526402231028342, scaled tolerance: 0.166 
[IterativeImputer] Change: 24.02249501058944, scaled tolerance: 0.166 
[IterativeImputer] Change: 11.49922344217877, scaled tolerance: 0.166 
[IterativeImputer] Change: 12.835722555471836

In [80]:
# Show the final feature names: the numeric columns followed by the one-hot columns.
print(data_train_final.columns)

Index(['Age', 'BMI', 'NLR', 'HGB', 'CPS', 'ki67', 'Sex_0.0', 'Sex_1.0',
       'Diagnostic_0.0', 'Diagnostic_1.0', 'Diagnostic_2.0', 'Stage_1.0',
       'Stage_2.0', 'Stage_3.0', 'Stage_4.0', 'Surgury_0.0', 'Surgury_1.0',
       'Chemo_0.0', 'Chemo_1.0', 'Radiotherapy_0.0', 'Radiotherapy_1.0',
       'Drug_0.0', 'Drug_1.0', 'MSI_0.0', 'MSI_1.0', 'GeneMutation_0.0',
       'GeneMutation_1.0', 'GeneMutation_2.0'],
      dtype='object')


In [81]:
# Preview the first rows of the final training feature matrix.
data_train_final.head()

Unnamed: 0,Age,BMI,NLR,HGB,CPS,ki67,Sex_0.0,Sex_1.0,Diagnostic_0.0,Diagnostic_1.0,...,Chemo_1.0,Radiotherapy_0.0,Radiotherapy_1.0,Drug_0.0,Drug_1.0,MSI_0.0,MSI_1.0,GeneMutation_0.0,GeneMutation_1.0,GeneMutation_2.0
0,71.0,16.73,4.333,99.0,2.0,65.159918,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,75.0,19.67,1.857,143.0,17.170976,63.601667,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,71.0,20.98,3.833,124.0,9.386995,56.021,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,63.0,21.519,2.357,105.0,0.0,66.483489,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,76.0,18.78,2.945,105.0,13.270881,62.586606,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


# 定义5种不同的机器学习模型，并进行训练

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Five classifiers to compare. Each one provides predict_proba, which the
# evaluation step uses to obtain class probabilities.
logistic_regression = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)

k_nearest_neighbors = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='minkowski',
    n_jobs=-1,
)

random_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# priors=None: no class priors are supplied explicitly.
naive_bayes = GaussianNB(priors=None, var_smoothing=1e-9)

quadratic_discriminant = QuadraticDiscriminantAnalysis(
    priors=None,            # no explicit class priors
    reg_param=0.0,          # regularization of the covariance estimates
    store_covariance=True,  # keep the covariance matrices on the fitted model
    tol=1e-4,               # tolerance passed to the estimator
)

# Display name -> estimator; the display name is also used to build file names.
models = {
    'Logistic Regression': logistic_regression,
    'K-Nearest Neighbors': k_nearest_neighbors,
    'Random Forest': random_forest,
    'Naive Bayes': naive_bayes,
    'QDA': quadratic_discriminant,
}

import pickle
import os

# Make sure the directory for the pickled models exists. exist_ok=True avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs('weight', exist_ok=True)

# Fit every model on the final training features and persist it to disk.
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Train the model
    model.fit(data_train_final, data_train_y)

    # Save the fitted model; the file name is the lower-cased display name
    # with spaces replaced by underscores.
    save_path = f'weight/{model_name.lower().replace(" ", "_")}.pkl'
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to {save_path}")

Training Logistic Regression...
Model saved to weight/logistic_regression.pkl
Training K-Nearest Neighbors...
Model saved to weight/k-nearest_neighbors.pkl
Training Random Forest...
Model saved to weight/random_forest.pkl
Training Naive Bayes...
Model saved to weight/naive_bayes.pkl
Training QDA...
Model saved to weight/qda.pkl


# 预测测试集，并计算相应指标，绘图等

In [83]:
def cal_bin_metrics(y_true, y_pred):
    """
    Calculate threshold-based metrics for binary classification: accuracy,
    sensitivity, specificity, PPV and NPV, plus the confusion matrix.
    AUC is not computed here; it needs the raw scores rather than the
    arg-max labels.

    :param y_true: 1-D array-like of ground-truth labels, e.g. [1, 0, 1, ...];
        labels are expected to be 0 (negative) and 1 (positive)
    :param y_pred: 2-D array of per-class scores, shape [num_sample, 2]; the
        predicted label is the arg-max over axis 1
    :return: list [accuracy, sensitivity, specificity, ppv, npv,
        confusion_matrix]; the confusion matrix is always 2x2 (rows = true
        label, columns = predicted label) and a ratio whose denominator is
        zero is reported as NaN
    """
    y_pred = np.argmax(y_pred, axis=1)
    binary_accracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    # Fix the label order so the matrix stays 2x2 even when only one class
    # occurs in y_true / y_pred; without it sklearn returns a 1x1 matrix and
    # the [1][1] lookup fails.
    binary_confusion_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    tn, fp, fn, tp = binary_confusion_matrix.ravel()

    def _ratio(numerator, denominator):
        # An undefined ratio (no samples in the denominator) is reported as NaN.
        return numerator / denominator if denominator else np.nan

    sensitivity = _ratio(tp, tp + fn)
    specifity = _ratio(tn, tn + fp)
    ppv = _ratio(tp, tp + fp)
    npv = _ratio(tn, tn + fn)
    metrics = [binary_accracy, sensitivity, specifity, ppv, npv, binary_confusion_matrix]
    return metrics

# Reload every saved model, predict on the test set and collect the metrics.
results = {}
metrics_names = ['Accuracy', 'Sensitivity', 'Specificity', 'PPV', 'NPV']

# np.save does not create missing directories, so make sure the output
# directory for the predictions exists before the loop starts.
os.makedirs('prediction', exist_ok=True)

for model_name in models.keys():
    print(f"\n评估 {model_name} 模型...")

    # File-name stem shared by the pickled model and its saved predictions.
    model_slug = model_name.lower().replace(" ", "_")

    # Load the model. NOTE: pickle must only be used on trusted files; these
    # are expected to be the ones written by the training step.
    model_path = f'weight/{model_slug}.pkl'
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(data_test_final)
    # Persist the raw probabilities
    np.save(f'prediction/{model_slug}_prediction.npy', predictions)

    # AUC is computed from the probability of the positive class (column 1)
    auc = roc_auc_score(y_true=data_test_y, y_score=predictions[:, 1])

    # Remaining metrics are derived from the arg-max labels
    metrics = cal_bin_metrics(y_true=data_test_y, y_pred=predictions)

    # Store the results; the last entry of `metrics` is the confusion matrix
    results[model_name] = {
        'AUC': auc,
        'Metrics': dict(zip(metrics_names, metrics[:-1])),
        'Confusion Matrix': metrics[-1]
    }

    # Print the results
    print(f"AUC: {auc:.4f}")
    for metric_name, value in results[model_name]['Metrics'].items():
        print(f"{metric_name}: {value:.4f}")
    print("\n混淆矩阵:")
    print(results[model_name]['Confusion Matrix'])


评估 Logistic Regression 模型...
AUC: 0.7921
Accuracy: 0.7451
Sensitivity: 0.7667
Specificity: 0.7143
PPV: 0.7931
NPV: 0.6818

混淆矩阵:
[[30 12]
 [14 46]]

评估 K-Nearest Neighbors 模型...
AUC: 0.6758
Accuracy: 0.6176
Sensitivity: 0.6833
Specificity: 0.5238
PPV: 0.6721
NPV: 0.5366

混淆矩阵:
[[22 20]
 [19 41]]

评估 Random Forest 模型...
AUC: 0.8075
Accuracy: 0.7451
Sensitivity: 0.8333
Specificity: 0.6190
PPV: 0.7576
NPV: 0.7222

混淆矩阵:
[[26 16]
 [10 50]]

评估 Naive Bayes 模型...
AUC: 0.7849
Accuracy: 0.7353
Sensitivity: 0.7500
Specificity: 0.7143
PPV: 0.7895
NPV: 0.6667

混淆矩阵:
[[30 12]
 [15 45]]

评估 QDA 模型...
AUC: 0.7685
Accuracy: 0.6863
Sensitivity: 0.7167
Specificity: 0.6429
PPV: 0.7414
NPV: 0.6136

混淆矩阵:
[[27 15]
 [17 43]]
