# 导入所需要的包

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sympy import im
import xgboost
import lightgbm
import catboost
import optuna
import gc
import warnings
import logging
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from optuna.samplers import TPESampler
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')

| Variable     | Definition       | Key                                            |
|--------------|------------------|------------------------------------------------|
| index        | 病人编号         |                                                |
| Source       | 病人来源         | 0：国外数据集 1：国内数据集                    |
| Sex          | 性别             | 0：男 1：女                                    |
| Age          | 年龄             |                                                |
| BMI          | 身高体重比       |                                                |
| Diagnostic   | 癌症类型         |                                                |
| Stage        | 癌症分期         |                                                |
| NLR          | 中性/淋巴        |                                                |
| HGB          | 血红蛋白         |                                                |
| Surgury      | 之前是否进行手术 | 0：未手术 1：已手术                            |
| Chemo        | 是否同时进行化疗 | 0：否；1：是                                   |
| Radiotherapy | 是否同时进行放疗 | 0：否；1：是                                   |
| Drug         | 免疫药物         | 0：PD1/PDL1orCTLA4；1：Combo                   |
| MSI          | 微卫星不稳定性   | 0：MSS； 1：MSI-H                              |
| GeneMutation | 基因突变         | 0.0：HER2阴性；1.0：HER2阳性；2.0：K-RAS阳性   |
| CPS          | 联合阳性评分     |                                                |
| Ki67         | 细胞增殖标志物   |                                                |
| Response     | 免疫应答         | 0：无应答；1：有应答                           |

In [None]:
# Load the merged patient table and preview its first rows.
csv_path = "digestive_cancer.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print the frame summary (column dtypes and non-null counts) to spot missing values.
data.info()

In [26]:
# Column groups used throughout the notebook: which features are categorical,
# which are numeric, which column identifies the data source, and which
# column is the prediction target.
category_columns = [
    'Sex',
    'Diagnostic',
    'Stage',
    'Surgury',
    'Chemo',
    'Radiotherapy',
    'Drug',
    'MSI',
    'GeneMutation',
]
numeric_columns = [
    'Age',
    'BMI',
    'NLR',
    'HGB',
    'CPS',
    'ki67',
]
hospital_source = ['Source']
target = ['Response']

In [27]:
# Class-imbalance weight for the positive class:
# (number of negative samples) / (number of positive samples).
# NOTE(review): this is computed on the full table, before the train/test
# split below, so the test labels contribute to the ratio - confirm intended.
y = data['Response']
negative_count = np.sum(y == 0)
positive_count = np.sum(y == 1)
scale_pos_weight = negative_count / positive_count

# 划分训练集和测试集

In [None]:
# The table merges a domestic cohort (Source == 1) and a foreign cohort
# (Source == 0). Per the author, the two cohorts differ noticeably, so each
# one is split into train/test on its own and the parts are merged afterwards.
data_in = data[data['Source'] == 1]
data_out = data[data['Source'] == 0]

# Stratify on the joint "Diagnostic_Response" key so that both the response
# distribution and the cancer-type distribution inside each response class
# stay the same between the train and test parts.
# NOTE(review): train_test_split needs at least two rows per stratum - confirm
# that no Diagnostic/Response combination is a singleton in either cohort.
data_in_stratify = (
    data_in['Diagnostic'].astype(str) + '_' + data_in['Response'].astype(str)
)
data_out_stratify = (
    data_out['Diagnostic'].astype(str) + '_' + data_out['Response'].astype(str)
)

# Separate features from the target; Source is dropped from the features.
non_feature_columns = ['Response', 'Source']
data_in_X = data_in.drop(columns=non_feature_columns)
data_in_y = data_in['Response']
data_out_X = data_out.drop(columns=non_feature_columns)
data_out_y = data_out['Response']

# 80/20 split of each cohort with a fixed seed.
split_options = dict(test_size=0.2, random_state=42)
(
    data_in_train_X,
    data_in_test_X,
    data_in_train_y,
    data_in_test_y,
) = train_test_split(
    data_in_X, data_in_y, stratify=data_in_stratify, **split_options
)
(
    data_out_train_X,
    data_out_test_X,
    data_out_train_y,
    data_out_test_y,
) = train_test_split(
    data_out_X, data_out_y, stratify=data_out_stratify, **split_options
)

# Merge the two cohorts back together; the index is renumbered from 0.
data_train_X = pd.concat(
    [data_in_train_X, data_out_train_X], ignore_index=True
)
data_train_y = pd.concat(
    [data_in_train_y, data_out_train_y], ignore_index=True
)
data_test_X = pd.concat(
    [data_in_test_X, data_out_test_X], ignore_index=True
)
data_test_y = pd.concat(
    [data_in_test_y, data_out_test_y], ignore_index=True
)
data_train_X.describe().T

### 将训练数据集和测试数据集的类别特征进行编码，无序的采用OneHotEncoder，有序的采用OrdinalEncoder

In [29]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Encode the categorical features of the train and test sets. Both encoders
# are fitted on the training set only and then applied to the test set.
# NOTE: this cell overwrites data_train_X / data_test_X with the encoded
# frames, which no longer contain the raw nominal columns, so it cannot be
# re-run without first re-running the split cell.

# Ordered vs. unordered categorical features.
ordinal_features = ['Stage']  # ordered -> OrdinalEncoder
nominal_features = [col for col in category_columns if col not in ordinal_features]  # unordered -> OneHotEncoder

# Build the encoders.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name, so try the new keyword first and fall back to
# the old one on releases that do not know it yet.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# NOTE(review): with default settings OrdinalEncoder orders categories by
# sorted value - confirm that this matches the clinical order of Stage.
ordinal_encoder = OrdinalEncoder()

# --- Training set ---
# 1. One-hot encoding of the nominal features.
onehot_encoded = onehot_encoder.fit_transform(data_train_X[nominal_features])
onehot_feature_names = onehot_encoder.get_feature_names_out(nominal_features)
onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_feature_names)

# 2. Ordinal encoding of the ordered features.
ordinal_encoded = ordinal_encoder.fit_transform(data_train_X[ordinal_features])
ordinal_encoded_df = pd.DataFrame(ordinal_encoded, columns=ordinal_features)

# 3. Concatenate numeric, one-hot and ordinal columns side by side.
#    Indexes are reset so that the three frames align row by row.
data_train_final = pd.concat([
    data_train_X[numeric_columns].reset_index(drop=True),
    onehot_encoded_df.reset_index(drop=True),
    ordinal_encoded_df.reset_index(drop=True)
], axis=1)

# --- Test set (reuses the encoders fitted on the training set) ---
# 1. One-hot encoding; categories unseen during fit become all-zero rows
#    because of handle_unknown='ignore'.
test_onehot_encoded = onehot_encoder.transform(data_test_X[nominal_features])
test_onehot_encoded_df = pd.DataFrame(test_onehot_encoded, columns=onehot_feature_names)

# 2. Ordinal encoding.
test_ordinal_encoded = ordinal_encoder.transform(data_test_X[ordinal_features])
test_ordinal_encoded_df = pd.DataFrame(test_ordinal_encoded, columns=ordinal_features)

# 3. Concatenate numeric, one-hot and ordinal columns side by side.
data_test_final = pd.concat([
    data_test_X[numeric_columns].reset_index(drop=True),
    test_onehot_encoded_df.reset_index(drop=True),
    test_ordinal_encoded_df.reset_index(drop=True)
], axis=1)

# From here on the feature matrices are the encoded versions.
data_train_X = data_train_final
data_test_X = data_test_final

# 定义目标优化函数，以便使用optuna进行参数选择

In [30]:
# Optuna objective function for the XGBoost model
def train_model_category(trial, data_x, data_y):
    """Score one Optuna trial of XGBoost hyper-parameters by cross-validation.

    The hyper-parameters suggested by ``trial`` are used to train one
    classifier per fold of a 5-fold stratified split. The positive-class
    probabilities predicted on each validation fold are collected into a
    single out-of-fold vector, and the ROC AUC of that vector against
    ``data_y`` is printed and returned.

    The module-level ``scale_pos_weight`` is read for the class weighting.

    Args:
        trial: Optuna trial object that supplies the hyper-parameter values.
        data_x: Training features; indexed positionally with ``.iloc``.
        data_y: Training labels; indexed positionally with ``.iloc``.

    Returns:
        The out-of-fold ROC AUC over all rows of ``data_x``.
    """
    # Cross-validation strategy (fixed seed so every trial sees the same folds).
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # XGBoost parameters; the suggest_* calls define the search space.
    param_grid = {
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 20, 200, step=5),
        # NOTE(review): (0.3 - 0.01) is not a multiple of the 0.05 step, so the
        # upper bound 0.3 itself is not on the sampling grid - confirm intended.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.05),
        'max_depth': trial.suggest_int('max_depth', 5, 7, step=1),
        'random_state': 42,
        'subsample': trial.suggest_float('subsample', 0.2, 1, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1, step=0.05),
        'scale_pos_weight': scale_pos_weight,
        'n_jobs': 20
    }

    # Out-of-fold positive-class probabilities, one slot per training row.
    total_preds = np.zeros(data_x.shape[0])
    for train_idx, valid_idx in kfold.split(data_x, data_y):
        train_x, train_y = data_x.iloc[train_idx], data_y.iloc[train_idx]
        valid_x = data_x.iloc[valid_idx]

        clf = xgboost.XGBClassifier(**param_grid)
        # No eval_set is passed: there is no early stopping here and the
        # per-round evaluation history was never read, so it only cost time.
        clf.fit(train_x, train_y, verbose=False)

        total_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]

        # Release the fold's objects before the next fold is built.
        del clf, train_x, train_y, valid_x
        gc.collect()

    auc = roc_auc_score(data_y, total_preds)
    print('Full AUC score %.6f' % auc)
    return auc


In [31]:
# # # lightgbm模型的目标函数
# def train_model_category(trial, data_x, data_y):
#     """
#     trial: optuna自带的默认参数, 用于提供各个参数调整范围
#     data_x: 训练数据的输入特征
#     data_y: 训练数据的标签
#     """
#     # 定义交叉验证策略
#     kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     # 定义模型需要调整的参数范围
#     # lightgbm的参数网络
#     param_grid = {
#         'n_estimators': trial.suggest_int('n_estimators', 20, 200,step=5),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3,step=0.05),
#         'num_leaves': trial.suggest_int('num_leaves', 2**2, 2**5, step=4),
#         'max_depth': trial.suggest_int('max_depth', 3, 12, step=2),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1, step=0.05),
#         'subsample': trial.suggest_float('subsample', 0.2, 1, step=0.05),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.2, 1, step=0.1),
#         'random_state': 42,
#     }

#     total_preds = np.zeros(data_x.shape[0])
#     # 对5折数据进行for循环
#     for n_fold, (train_idx, valid_idx) in enumerate(kfold.split(data_x, data_y)):
#         train_x, train_y = data_x.iloc[train_idx], data_y.iloc[train_idx]
#         valid_x, valid_y = data_x.iloc[valid_idx], data_y.iloc[valid_idx]
#         clf = lightgbm.LGBMClassifier(**param_grid)
#         clf.fit(train_x, train_y,
#                 eval_set=[(train_x, train_y), (valid_x, valid_y)]
#         )
        
#         total_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        
#         del clf, train_x, train_y, valid_x, valid_y
#         gc.collect()
    
#     auc = roc_auc_score(data_y, total_preds)
#     print('Full AUC score %.6f' % auc) 
#     return auc

In [32]:
# # randomforest模型的目标函数
# def train_model_category(trial, data_x, data_y):
#     '''
#     trial: optuna自带的默认参数, 用于提供各个参数调整范围
#     data_x: 训练数据的输入特征
#     data_y: 训练数据的标签
#     '''
#     # 定义交叉验证策略
#     kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     # 定义模型需要调整的参数范围
#     # randomforest的参数网格
#     param_grid = {
#         'n_estimators': trial.suggest_int('n_estimators', 20, 200, step=5),
#         'class_weight': 'balanced',
#         'random_state': 42,
#         'n_jobs': -1
#     }

#     total_preds = np.zeros(data_x.shape[0])
#     # 对5折数据进行for循环
#     for n_fold, (train_idx, valid_idx) in enumerate(kfold.split(data_x, data_y)):
#         train_x, train_y = data_x.iloc[train_idx], data_y.iloc[train_idx]
#         valid_x, valid_y = data_x.iloc[valid_idx], data_y.iloc[valid_idx]
#         clf = RandomForestClassifier(**param_grid)
#         clf.fit(train_x, train_y)
        
#         total_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        
#         del clf, train_x, train_y, valid_x, valid_y
#         gc.collect()
    
#     auc = roc_auc_score(data_y, total_preds)
#     print('Full AUC score %.6f' % auc) 
#     return auc

In [None]:
# Create the Optuna study and run the hyper-parameter search.
def func(trial):
    """Objective bound to the training split: returns the trial's CV AUC."""
    return train_model_category(trial, data_train_X, data_train_y)


study = optuna.create_study(
    direction='maximize',
    study_name='xgboost',
    sampler=TPESampler(seed=42),
)
study.optimize(func, n_trials=300)
print('最佳参数: ', study.best_params)
print('最佳trial: ', study.best_trial)

# 使用最优参数定义模型，并进行训练

In [None]:
import os
import pickle

# Refit XGBoost on the whole training set with the best parameters found by
# the Optuna study, then pickle the fitted model.
clf = xgboost.XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight, n_jobs=-1, **study.best_params)
clf.fit(data_train_X, data_train_y,
        verbose=False
)

# Save the trained model.
model_name = 'xgboost'
save_path = f'weight/{model_name.lower().replace(" ", "_")}.pkl'
# Create the output directory if needed so that the result of the long
# search and training is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
    pickle.dump(clf, f)
print(f"Model saved to {save_path}")

In [35]:
# 定义lightgbm模型
# clf = lightgbm.LGBMClassifier(random_state=42, scale_pos_weight=scale_pos_weight, n_jobs=-1, **study.best_params)
# clf.fit(data_train_X, data_train_y
# )
# # 保存训练好的模型
# clf.booster_.save_model('weight/lightgbm_model.txt')

In [36]:
# 定义random forest模型
# clf = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1, **study.best_params)
# clf.fit(data_train_X, data_train_y)
# # 保存训练好的模型
# # # 保存训练好的模型
# clf.save_model('weight/randomforest_model.bin')

# 预测测试集，并计算相应指标，绘图等

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def cal_bin_metrics(y_true, y_pred):
    """Compute binary-classification metrics from class scores.

    The predicted class of each sample is the argmax over the two columns of
    ``y_pred``. AUC is not computed here.

    :param y_true: 1-D array of ground-truth labels, e.g. [1, 0, 1, ...]
    :param y_pred: 2-D array of per-class scores, shape [num_sample, 2]
    :return: list ``[accuracy, sensitivity, specificity, ppv, npv,
        confusion_matrix]``; a ratio whose denominator is zero is NaN
    """
    y_pred = np.argmax(y_pred, axis=1)
    binary_accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    # Pin the label order so the matrix is always 2x2, even when only one
    # class occurs in y_true and y_pred; otherwise indexing the positive
    # row/column would raise IndexError.
    binary_confusion_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    # Rows are true labels and columns are predicted labels, so the flattened
    # matrix reads tn, fp, fn, tp.
    tn, fp, fn, tp = binary_confusion_matrix.ravel()
    sensitivity = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    ppv = _safe_ratio(tp, tp + fp)
    npv = _safe_ratio(tn, tn + fn)
    metrics = [binary_accuracy, sensitivity, specificity, ppv, npv, binary_confusion_matrix]
    return metrics

import os

# Predict class probabilities for the held-out test set.
predictions = clf.predict_proba(data_test_X)
# Save the predictions; create the output directory first so np.save does not
# fail when it is missing.
os.makedirs('prediction', exist_ok=True)
np.save('prediction/xgboost_prediction.npy', predictions)
# AUC is computed from the positive-class probability column.
auc = roc_auc_score(y_true=data_test_y, y_score=predictions[:, 1])
print(auc)
# The remaining metrics come from the argmax over the two probability columns.
metrics = cal_bin_metrics(y_true=data_test_y, y_pred=predictions)
print(metrics)