In [10]:
import gc
import logging
import os
import warnings

import catboost
import lightgbm
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost
from imblearn.over_sampling import SMOTE
from optuna.samplers import TPESampler
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `im` is not referenced in the visible cells - possibly an accidental IDE auto-import; confirm before removing.
from sympy import im
warnings.filterwarnings('ignore')

| Variable     | Definition       | Key                                            |
|--------------|------------------|------------------------------------------------|
| index        | 病人编号         |                                                |
| Source       | 病人来源         | 0：国外数据集 1：国内数据集                    |
| Sex          | 性别             | 0：男 1：女                                    |
| Age          | 年龄             |                                                |
| BMI          | 身高体重比       |                                                |
| Diagnostic   | 癌症类型         |                                                |
| Stage        | 癌症分期         |                                                |
| NLR          | 中性/淋巴        |                                                |
| HGB          | 血红蛋白         |                                                |
| Surgury      | 之前是否进行手术 | 0：未手术 1：已手术                            |
| Chemo        | 是否同时进行化疗 | 0：否；1：是                                   |
| Radiotherapy | 是否同时进行放疗 | 0：否；1：是                                   |
| Drug         | 免疫药物         | 0：PD1/PDL1orCTLA4；1：Combo                   |
| MSI          | 微卫星不稳定性   | 0：MSS； 1：MSI-H                              |
| GeneMutation | 基因突变         | 0.0：HER2阴性；1.0：HER2阳性；2.0：K-RAS阳性   |
| CPS          | 联合阳性评分     |                                                |
| Ki67         | 细胞增殖标志物   |                                                |
| Response     | 免疫应答         | 0：无应答；1：有应答                           |

In [None]:
# Read the merged digestive-cancer cohort from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="digestive_cancer.csv")
# Display the first five rows as the cell output for a quick sanity check.
data.head(n=5)

In [12]:
# Column roles: which columns are categorical, which are numeric,
# which one flags the source cohort, and which one is the label.
category_columns = [
    'Sex', 'Diagnostic', 'Stage', 'Surgury', 'Chemo',
    'Radiotherapy', 'Drug', 'MSI', 'GeneMutation',
]
numeric_columns = ['Age', 'BMI', 'NLR', 'HGB', 'CPS', 'Ki67']
hospital_source = ['Source']
target = ['Response']
# MSI and GeneMutation are categorical, but (per the original author's note)
# they contain missing values and are therefore read as float64. Cast them to
# str so they can be encoded as categories.
for _column in ('MSI', 'GeneMutation'):
    data[_column] = data[_column].astype(str)

In [13]:
# Class weight for the positive class: (#negative samples) / (#positive samples).
# NOTE(review): this is computed on the full dataset, before the train/test
# split further down - confirm that using test-set labels here is acceptable.
y = data['Response']
scale_pos_weight = y.eq(0).sum() / y.eq(1).sum()

# 划分训练集和测试集

In [None]:
# The dataset merges a domestic cohort (Source == 1) and a foreign cohort
# (Source == 0). Per the original author's note the two differ noticeably, so
# each cohort is split into train/test on its own and the parts are merged after.
data_in = data[data['Source'] == 1]
data_out = data[data['Source'] == 0]

# Stratify on the joint "<Diagnostic>_<Response>" key so that both the response
# rate and the cancer-type mix are preserved in the train and test parts.
data_in_stratify = data_in['Diagnostic'].astype(str) + '_' + data_in['Response'].astype(str)
data_out_stratify = data_out['Diagnostic'].astype(str) + '_' + data_out['Response'].astype(str)

# Columns that must not reach the model: the label, the cohort flag and the
# patient identifier. The variable table lists an `index` (patient number)
# column; if it is present in the CSV it is dropped here so the ID is not used
# as a feature. errors='ignore' makes this a no-op when the column is absent.
non_feature_columns = ['Response', 'Source', 'index']
data_in_X = data_in.drop(columns=non_feature_columns, errors='ignore')
data_in_y = data_in['Response']
data_out_X = data_out.drop(columns=non_feature_columns, errors='ignore')
data_out_y = data_out['Response']

# 80/20 stratified split of each cohort with a fixed seed.
data_in_train_X, data_in_test_X, data_in_train_y, data_in_test_y = train_test_split(
    data_in_X, data_in_y, test_size=0.2, random_state=42, stratify=data_in_stratify
)
data_out_train_X, data_out_test_X, data_out_train_y, data_out_test_y = train_test_split(
    data_out_X, data_out_y, test_size=0.2, random_state=42, stratify=data_out_stratify
)

# Merge the two cohorts back together (domestic rows first, index reset).
data_train_X = pd.concat([data_in_train_X, data_out_train_X], ignore_index=True)
data_train_y = pd.concat([data_in_train_y, data_out_train_y], ignore_index=True)
data_test_X = pd.concat([data_in_test_X, data_out_test_X], ignore_index=True)
data_test_y = pd.concat([data_in_test_y, data_out_test_y], ignore_index=True)

# Summary statistics of the training features as the cell output.
data_train_X.describe().T


# 定义目标优化函数，以便使用optuna进行参数选择

In [15]:
# Fixed CatBoost settings that are merged into every Optuna trial's parameters
# and into the final model's parameters.
pram = dict(
    cat_features=category_columns,        # columns CatBoost treats as categorical
    loss_function='Logloss',
    random_seed=42,
    scale_pos_weight=scale_pos_weight,    # negative/positive ratio computed in an earlier cell
)

def train_model_category(trial, data_x, data_y):
    """Optuna objective: out-of-fold ROC AUC of a CatBoost classifier.

    Samples one hyper-parameter set from ``trial``, merges it with the fixed
    settings in the module-level ``pram`` dict (which also supplies
    ``cat_features``), and evaluates it with 5-fold stratified cross-validation.

    Args:
        trial: Optuna trial object used to sample the hyper-parameters.
        data_x: Training features; rows are selected positionally with ``.iloc``.
        data_y: Training labels; selected with ``.iloc`` using the same positions.

    Returns:
        ROC AUC computed over the out-of-fold positive-class probabilities.
    """
    # Cross-validation scheme: 5 stratified, shuffled folds with a fixed seed.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # CatBoost hyper-parameter search space.
    # NOTE: 0.3 is not reachable from 0.01 in steps of 0.05, so the largest
    # learning rate Optuna can propose is expected to be 0.26.
    param_grid = {
        'iterations':  trial.suggest_int('iterations', 20, 200, step=5),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.05),
        'depth': trial.suggest_int('depth', 5, 7, step=1),
        'rsm': trial.suggest_float('rsm', 0.4, 1, step=0.05),
    }

    param_grid.update(pram)
    print(param_grid)

    # Out-of-fold probability of the positive class for every training row.
    total_preds = np.zeros(data_x.shape[0])
    for train_idx, valid_idx in kfold.split(data_x, data_y):
        train_x, train_y = data_x.iloc[train_idx], data_y.iloc[train_idx]
        valid_x = data_x.iloc[valid_idx]

        clf = catboost.CatBoostClassifier(**param_grid)
        # Deliberately no eval_set: supplying one makes CatBoost default to
        # use_best_model=True, which would truncate the model at the iteration
        # that scores best on the very fold evaluated below. That inflates the
        # AUC and no longer reflects how the final model (fitted without an
        # eval_set) uses `iterations`.
        # cat_features comes from `pram` via the constructor, as in the final fit.
        clf.fit(train_x, train_y, verbose=False)

        total_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]

        del clf, train_x, train_y, valid_x
        gc.collect()

    auc = roc_auc_score(data_y, total_preds)
    print('Full AUC score %.6f' % auc)
    return auc

In [None]:
# Create the Optuna study: maximise the objective with a seeded TPE sampler.
study = optuna.create_study(
    study_name='catboost classifier',
    direction='maximize',
    sampler=TPESampler(seed=42),
)


def func(trial):
    """Objective bound to the training split built in the earlier cell."""
    return train_model_category(trial, data_train_X, data_train_y)


# Run 220 trials, then report the best parameter set and the best trial.
study.optimize(func, n_trials=220)
print('最佳参数: ', study.best_params)
print('最佳trial: ', study.best_trial)

# 使用最优参数定义模型，并进行训练

In [None]:
# Final CatBoost model: the best hyper-parameters found by Optuna merged with
# the fixed settings in `pram`.
best_params = study.best_params
best_params.update(pram)
print(best_params)
clf = catboost.CatBoostClassifier(**best_params)
clf.fit(data_train_X, data_train_y, verbose=False)

# Persist the trained model. Create the output directory first so that a fresh
# checkout does not fail at save time after the long hyper-parameter search.
os.makedirs('weight', exist_ok=True)
clf.save_model('weight/catboost_model.bin')

# 预测测试集，并计算相应指标，绘图等

In [None]:
def cal_bin_metrics(y_true, y_pred):
    """
    Compute binary-classification metrics from per-class scores.

    The predicted label of each sample is the argmax over the two score columns.

    :param y_true: 1-D array of ground-truth labels, e.g. [1, 0, 1, ...]
    :param y_pred: 2-D array of per-class scores with shape [num_sample, 2]
    :return: list ``[accuracy, sensitivity, specificity, PPV, NPV, confusion_matrix]``;
        a ratio whose denominator is zero is returned as nan. AUC is not
        computed here.
    """
    def _ratio(numerator, denominator):
        # Return nan instead of dividing by zero when a class/prediction is absent.
        return numerator / denominator if denominator else float('nan')

    y_pred = np.argmax(y_pred, axis=1)
    binary_accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    # Pin the label order so the matrix is always 2x2, even when only one class
    # occurs in y_true and y_pred (otherwise indexing row/column 1 would fail).
    binary_confusion_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    # Rows are true labels, columns are predicted labels.
    tn, fp, fn, tp = binary_confusion_matrix.ravel()
    sensitivity = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)
    ppv = _ratio(tp, tp + fp)
    npv = _ratio(tn, tn + fn)
    metrics = [binary_accuracy, sensitivity, specificity, ppv, npv, binary_confusion_matrix]
    return metrics

# Predict class probabilities for the held-out test set.
predictions = clf.predict_proba(data_test_X)
# Save the raw probabilities; create the output directory first so np.save
# does not fail when it is missing.
os.makedirs('prediction', exist_ok=True)
np.save('prediction/catboost_prediction.npy', predictions)
# Test-set AUC uses the second probability column (index 1).
auc = roc_auc_score(y_true=data_test_y, y_score=predictions[:, 1])
print(auc)
# Remaining metrics are computed from the argmax labels inside cal_bin_metrics.
metrics = cal_bin_metrics(y_true=data_test_y, y_pred=predictions)
print(metrics)