# In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# import xgboost as xgb
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# Render all figure text in Arial.
rcParams.update({'font.family': 'Arial'})

# Fix the global random seeds so runs are repeatable.
import random
random.seed(42)  # Python standard-library RNG
np.random.seed(42)  # NumPy global RNG


# Model names accepted by train_model_100_times.
_SUPPORTED_MODEL_TYPES = (
    'RandomForest',
    'LogisticRegression',
    'GradientBoosting',
    'MLPClassifier',
)


def _build_model(model_type, seed):
    """Return an unfitted classifier for ``model_type`` seeded with ``seed``."""
    if model_type == 'RandomForest':
        return RandomForestClassifier(
            random_state=seed,
            max_depth=3,
            min_samples_split=10,
            min_samples_leaf=5,
            n_estimators=500,
            class_weight='balanced',
        )
    if model_type == 'LogisticRegression':
        return LogisticRegression(
            random_state=seed,
            max_iter=1000,
            class_weight='balanced',
        )
    if model_type == 'GradientBoosting':
        return GradientBoostingClassifier(
            random_state=seed,
            max_depth=3,
            min_samples_split=10,
            min_samples_leaf=5,
            n_estimators=500,
        )
    if model_type == 'MLPClassifier':
        return MLPClassifier(
            random_state=seed,
            hidden_layer_sizes=(100, 50),
            max_iter=1000,
            activation='relu',
            solver='adam',
        )
    raise ValueError(
        f"Unsupported model_type {model_type!r}; "
        f"expected one of {_SUPPORTED_MODEL_TYPES}"
    )


def _extract_feature_importance(model, model_type):
    """Return per-feature importance of a fitted model, or None if unavailable."""
    if model_type in ('RandomForest', 'GradientBoosting'):
        # Importances exposed by the fitted tree ensemble.
        return model.feature_importances_
    if model_type == 'LogisticRegression':
        # Absolute value of the coefficients of the first (only, for binary) row.
        return np.abs(model.coef_[0])
    return None


def train_model_100_times(data, target_column, model_type, n_iterations=100, test_size=0.3):
    """Train ``model_type`` on repeated stratified splits and track the best run.

    For every iteration ``i`` a stratified shuffle split (seeded with ``i``)
    divides the data into train and test parts, a fresh classifier (also
    seeded with ``i``) is fitted on the train part, and the ROC AUC is
    computed on both parts from the predicted probability of class 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column.
    target_column : str
        Name of the column in ``data`` holding the class labels.
    model_type : str
        One of 'RandomForest', 'LogisticRegression', 'GradientBoosting',
        'MLPClassifier'.
    n_iterations : int, default 100
        Number of train/test repetitions.
    test_size : float, default 0.3
        Fraction of samples held out for testing in each split.

    Returns
    -------
    dict
        "AUC_Matrix": array of shape (n_iterations, 2) with [train, test] AUC
        per iteration; "Best_Model": fitted model with the highest test AUC;
        "TPR" / "FPR": ROC curve of that model on its test split;
        "Best_Test_AUC": its test AUC; "Feature_Importance": DataFrame with
        columns 'Feature' and 'Importance' sorted in descending order, or
        None when the model type exposes no importance (MLPClassifier).

    Raises
    ------
    ValueError
        If ``model_type`` is not supported or ``n_iterations`` is below 1.
    """
    # Fail fast with a clear message instead of an UnboundLocalError at fit time.
    if model_type not in _SUPPORTED_MODEL_TYPES:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            f"expected one of {_SUPPORTED_MODEL_TYPES}"
        )
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be >= 1, got {n_iterations}")

    # Separate features from the target variable.
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Result containers.
    auc_matrix = np.zeros((n_iterations, 2))
    # Start below any possible AUC so the first iteration always registers,
    # even when its test AUC is exactly 0.
    best_test_auc = -np.inf
    best_model = None
    best_feature_importance = None  # importance of the best model, if any
    best_fpr = None
    best_tpr = None

    for i in range(n_iterations):
        # Stratified split keeps the class distribution equal in both parts;
        # seeding with the iteration index makes every split reproducible.
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=i)
        train_indices, test_indices = next(sss.split(X, y))
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        model = _build_model(model_type, i)
        model.fit(X_train, y_train)

        # AUC on both parts, from the predicted probability of class 1.
        train_pred = model.predict_proba(X_train)[:, 1]
        train_auc = roc_auc_score(y_train, train_pred)
        test_pred = model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, test_pred)

        auc_matrix[i, :] = [train_auc, test_auc]

        # Keep the model (and its ROC curve / importances) with the best test AUC.
        if test_auc > best_test_auc:
            best_test_auc = test_auc
            best_model = model
            best_fpr, best_tpr, _ = roc_curve(y_test, test_pred)
            best_feature_importance = _extract_feature_importance(model, model_type)

    # Pair feature names with the importances of the best model.
    if best_feature_importance is not None:
        feature_importance = pd.DataFrame(
            {'Feature': X.columns, 'Importance': best_feature_importance}
        ).sort_values(by='Importance', ascending=False)
    else:
        feature_importance = None

    return {
        "AUC_Matrix": auc_matrix,
        "Best_Model": best_model,
        "TPR": best_tpr,
        "FPR": best_fpr,
        "Best_Test_AUC": best_test_auc,
        "Feature_Importance": feature_importance,
    }


# Example run: evaluate every model type on every RNA dataset.
# Each CSV is read with its first column as the index and must contain a
# 'target' column whose labels are 'MAL' / 'BEN' (see the mapping below).
data_list = {
    'tRNA': '../../DATA-cfRNA/model-data/tRNA-target.csv',
    'rsRNA': '../../DATA-cfRNA/model-data/rsRNA-target.csv',
    'piRNA': '../../DATA-cfRNA/model-data/piRNA-target.csv',
    'mRNA': '../../DATA-cfRNA/model-data/mRNA-target.csv',
    'lncRNA': '../../DATA-cfRNA/model-data/lncRNA-target.csv',
    'ysRNA': '../../DATA-cfRNA/model-data/ysRNA-target.csv',
    'miRNA': '../../DATA-cfRNA/model-data/miRNA-target.csv',
    'allRNA': '../../DATA-cfRNA/model-data/allRNA-target.csv',
}

# Custom colours, paired with the entries of data_list by position.
colors = ['#849989', '#3673b1', '#7eace0', '#aea7c6', '#bfb73c', '#d8744f', '#93c665', '#a3d4d6']

model_types = ['RandomForest', 'LogisticRegression', 'GradientBoosting']

# Top-10 mRNA features, one DataFrame per model type.
mRNA_feature_importance_list = []

# Rows of [RNA_Type, Model_Type, Test_AUC] for every model, dataset and split.
all_test_auc_data = []

# Reset matplotlib to its default style once, before any figure is drawn.
# style.use('default') restores rcParams defaults (font.family included), so
# calling it inside the loop silently dropped the Arial setting for every
# figure after the first ROC plot; re-apply the font right after the reset.
plt.style.use('default')
rcParams['font.family'] = 'Arial'

for model_type in model_types:
    all_auc_data = []
    labels = []
    roc_data = []
    feature_importance_list = []  # top-10 features per RNA type for this model

    for (rna_type, file_path), color in zip(data_list.items(), colors):
        data = pd.read_csv(file_path)
        data = data.set_index(data.columns[0])
        # Encode the target labels as integers before training.
        data['target'] = data['target'].map({'MAL': 0, 'BEN': 1})
        if data['target'].isna().any():
            # Any label other than 'MAL'/'BEN' maps to NaN; report it here
            # rather than failing later inside scikit-learn.
            raise ValueError(
                f"{file_path}: 'target' contains labels other than 'MAL'/'BEN'"
            )

        result = train_model_100_times(data, 'target', model_type)
        auc_matrix = result["AUC_Matrix"]
        all_auc_data.extend([auc_matrix[:, 0], auc_matrix[:, 1]])
        labels.extend([f"{rna_type}_Train", f"{rna_type}_Test"])
        roc_data.append((result["FPR"], result["TPR"], result["Best_Test_AUC"], rna_type, color))

        # Keep the ten most important features of the best model.
        feature_importance = result["Feature_Importance"]
        if feature_importance is not None:
            # Copy so the new columns are added to an independent frame
            # (avoids pandas' SettingWithCopyWarning on the head() slice).
            top_10_features = feature_importance.head(10).copy()
            top_10_features['RNA_Type'] = rna_type
            top_10_features['Model_Type'] = model_type
            feature_importance_list.append(top_10_features)

            if rna_type == 'mRNA':
                mRNA_feature_importance_list.append(top_10_features)

        # Record the test-set AUC of every split.
        # The loop variable is not named `auc`, to avoid shadowing sklearn.metrics.auc.
        test_auc = auc_matrix[:, 1]
        for auc_value in test_auc:
            all_test_auc_data.append([rna_type, model_type, auc_value])

    # Combine the feature importances of all RNA types (optionally save as CSV).
    if feature_importance_list:
        all_feature_importance = pd.concat(feature_importance_list)
        # all_feature_importance.to_csv(f"../../DATA-cfRNA/figure/fig2/Top_10_Features_{model_type}_MeanDecreaseGini.csv", index=False)

    # Box plot of train/test AUC for all datasets.
    plt.figure(figsize=(12, 8))
    # Each RNA type contributes two boxes (train and test), so repeat each colour twice.
    box_colors = [color for color in colors for _ in range(2)]
    sns.boxplot(data=pd.DataFrame(np.array(all_auc_data).T, columns=labels), palette=box_colors)
    plt.title(f"{model_type}: Train and Test AUC over 100 iterations for all datasets", fontsize=14)
    plt.ylabel("AUC", fontsize=12)
    plt.xticks(rotation=45, fontsize=10)
    plt.yticks(fontsize=10)
    plt.tight_layout()
    # plt.savefig(f"../../DATA-cfRNA/figure/fig2/{model_type}_AUC_Boxplot.pdf", format='pdf', bbox_inches='tight')  # save as PDF
    plt.show()

    # Best ROC curve of every dataset.
    plt.figure(figsize=(10, 8))
    for fpr, tpr, auc_score, rna_type, color in roc_data:
        plt.plot(fpr, tpr, lw=2, color=color, label=f'{rna_type} ROC Curve (AUC = {auc_score:.3f})')
    plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f"{model_type}: Best ROC Curves for all datasets", fontsize=14)
    plt.legend(loc="lower right", fontsize=10)
    plt.gca().set_facecolor('white')  # white background
    plt.grid(False)  # no grid lines
    plt.tight_layout()
    # plt.savefig(f"../../DATA-cfRNA/figure/fig2/{model_type}_ROC_Curves.pdf", format='pdf', bbox_inches='tight')  # save as PDF
    plt.show()

# Combine the mRNA feature importances of all models.
if mRNA_feature_importance_list:
    all_mRNA_feature_importance = pd.concat(mRNA_feature_importance_list)
    # all_mRNA_feature_importance.to_csv("../../DATA-cfRNA/figure/fig2/All_Model_mRNA_Top_10_Features.csv", index=False)

# Long-format table used for the combined box plot.
test_auc_df = pd.DataFrame(all_test_auc_data, columns=['RNA_Type', 'Model_Type', 'Test_AUC'])

# Combined box plot: test AUC per RNA type, grouped by model type.
plt.figure(figsize=(12, 8))
sns.boxplot(x='RNA_Type', y='Test_AUC', hue='Model_Type', data=test_auc_df, palette='Set3')
plt.title('Test AUC over 100 iterations for all models and RNA types', fontsize=14)
plt.ylabel("AUC", fontsize=12)
plt.xlabel("RNA Type", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
plt.legend(title='Model Type', loc='upper left', fontsize=10)
plt.tight_layout()
# plt.savefig("../../DATA-cfRNA/figure/fig2/All_Models_Test_AUC_Boxplot.pdf", format='pdf', bbox_inches='tight')
plt.show()

