In [9]:
# Python import
import os
import copy
import random
import collections
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import svm
import warnings
import joblib
from sklearn.model_selection import train_test_split,RandomizedSearchCV
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, average_precision_score
import csv

In [3]:
def LR_kftest(X_train, y_train, X_test, y_test, SEED):
    """Tune a logistic-regression classifier on one split and score it on the other.

    The regularisation strength ``C`` is selected with a 5-fold
    ``RandomizedSearchCV`` (scored by ROC AUC) fitted on the training split;
    the best estimator found is then evaluated on the test split.

    :param X_train: training features handed to the search's ``fit``
    :param y_train: training labels handed to the search's ``fit``
    :param X_test: features the tuned model predicts on
    :param y_test: true labels the predictions are scored against
    :param SEED: seed used both as the model's ``random_state`` candidate and
        as the ``random_state`` of the search itself
    :return: tuple ``(accuracy, AUROC, F1, average precision)``
    """
    # Candidate hyper-parameters: only C actually varies.
    # NOTE(review): the space holds 13 combinations, fewer than n_iter=100 —
    # confirm how RandomizedSearchCV handles that (warnings are silenced
    # elsewhere in this notebook).
    search_space = dict(
        penalty=["l2"],
        C=[1e-3, 5e-3, 1e-2, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000],
        solver=["liblinear"],
        random_state=[SEED],
    )

    search = RandomizedSearchCV(
        LogisticRegression(),
        search_space,
        n_iter=100,
        cv=5,
        verbose=0,
        scoring="roc_auc",
        random_state=SEED,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # Hard class predictions feed accuracy/F1; column 1 of predict_proba feeds
    # the ranking metrics (presumably the positive class for 0/1 labels — confirm).
    predicted_labels = tuned_model.predict(X_test)
    positive_scores = tuned_model.predict_proba(X_test)[:, 1]

    return (
        accuracy_score(y_test, predicted_labels),
        roc_auc_score(y_test, positive_scores),
        f1_score(y_test, predicted_labels),
        average_precision_score(y_test, positive_scores),
    )

In [4]:
# Write the test-set evaluation summary to a text file
def eval_output(model_perf, path):
    """Save the evaluation metrics in ``model_perf`` to ``Evaluate_Result_TestSet.txt``.

    The file is created (or overwritten) inside ``path`` and contains one
    tab-separated line of scalar metrics, a short explanatory note, and the
    classification report.

    :param model_perf: mapping with the keys ``auroc``, ``auprc``,
        ``accuracy``, ``mcc``, ``recall``, ``precision``, ``f1`` and
        ``class_report`` (text, written as-is)
    :param path: directory the report file is written into
    """
    report_path = os.path.join(path, "Evaluate_Result_TestSet.txt")
    # Order matches the placeholders of the summary line below
    metric_keys = ("auroc", "auprc", "accuracy", "mcc", "recall", "precision", "f1")

    with open(report_path, 'w') as report:
        scores = tuple(model_perf[key] for key in metric_keys)
        report.write("AUROC=%s\tAUPRC=%s\tAccuracy=%s\tMCC=%s\tRecall=%s\tPrecision=%s\tf1_score=%s\n" % scores)
        for note_line in (
            "\n######NOTE#######\n",
            "#According to help_documentation of sklearn.metrics.classification_report:in binary classification, recall of the positive class is also known as sensitivity; recall of the negative class is specificity#\n\n",
        ):
            report.write(note_line)
        report.write(model_perf["class_report"])

In [7]:
def equal_kind(data_df, label_col='label', random_state=42):
    """Balance a binary-labelled DataFrame by down-sampling the larger class.

    Rows labelled 0 and rows labelled 1 are separated, the larger group is
    randomly down-sampled to the size of the smaller one, and the two groups
    are concatenated and shuffled. Rows whose label is neither 0 nor 1
    (including missing labels) match neither group and are therefore dropped.
    If one class is absent the result is empty.

    :param data_df: DataFrame containing the label column
    :param label_col: name of the 0/1 label column (default ``'label'``)
    :param random_state: seed used for both the down-sampling and the final
        shuffle (default ``42``, the value that used to be hard-coded)
    :return: balanced, shuffled DataFrame with a fresh 0..n-1 index
    """
    # One sub-frame per class, negative class first
    class_frames = [data_df[data_df[label_col] == value] for value in (0, 1)]

    # Size of the smaller class
    min_count = min(len(frame) for frame in class_frames)

    # Down-sample only the class that is larger than the target size
    balanced_parts = [
        frame.sample(n=min_count, random_state=random_state) if len(frame) > min_count else frame
        for frame in class_frames
    ]

    # Combine the classes, then shuffle the rows and renumber the index
    balanced_df = pd.concat(balanced_parts)
    balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return balanced_df

In [19]:
def _load_csv_folder(folder_path):
    """Read every ``.csv`` file in ``folder_path`` (sorted by name) into one DataFrame."""
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    if not csv_names:
        raise ValueError(f"No CSV files found in {folder_path!r}")
    return pd.concat(
        [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names],
        axis=0,
    )


# No train/test split here: all files are merged into a single dataset
def prepare_full_dataset(label_folder_path, sample_folder_path, features_num):
    """
    Merge all sample and label files into one balanced dataset and return
    its features and labels.

    Every CSV in the two folders is loaded. The files are combined by
    concatenation and joined on the ``sample`` column, so the folders do not
    need to contain the same number of files. The merged table is balanced
    with ``equal_kind`` before features and labels are extracted.

    :param label_folder_path: folder containing the label CSV files
    :param sample_folder_path: folder containing the sample CSV files
    :param features_num: number of feature columns to use; the columns at
        positions ``1 .. features_num`` of the sample table are taken
        (position 0 is skipped — presumably the ``sample`` id column)
    :return: ``(features, labels)`` as numpy arrays
    :raises ValueError: if a folder contains no CSV file, or if
        ``features_num`` is not between 1 and the number of available
        feature columns
    """
    # Load both folders independently; pairing the files one-to-one is not
    # needed and would silently skip files when the folders differ in size.
    all_labels_df = _load_csv_folder(label_folder_path)
    all_samples_df = _load_csv_folder(sample_folder_path)

    # The positional slice below must stay inside the sample columns; in the
    # merged table the label columns come right after them, so an oversized
    # features_num would otherwise pull the label into the feature matrix.
    available_features = all_samples_df.shape[1] - 1
    if not 1 <= features_num <= available_features:
        raise ValueError(
            f"features_num={features_num} is out of range: the sample files provide "
            f"{available_features} feature column(s)"
        )

    # Attach labels to samples; samples without a label get a missing value
    merged_df = pd.merge(all_samples_df, all_labels_df, on='sample', how='left')

    # Balance the two classes (this also drops rows without a 0/1 label)
    merged_df = equal_kind(merged_df)

    # Features by position, labels by name (the same column equal_kind balances on)
    features = merged_df.iloc[:, 1:features_num + 1].values
    labels = merged_df['label'].values

    return features, labels

In [10]:
# Plot AUROC of model
def plot_AUROC(model_perf, path):
    """Save the ROC curve of a model as a table and as a PDF figure.

    Writes ``AUROC_info.txt`` (tab-separated FPR/TPR columns) and
    ``AUROC_TestSet.pdf`` into ``path``. The figure is left open after
    saving, so an interactive/notebook backend can still display it.

    :param model_perf: mapping with ``auroc`` (number shown in the legend)
        and ``auroc_curve``, a 3-item sequence unpacked as
        ``(fpr, tpr, thresholds)`` — presumably the output of
        ``sklearn.metrics.roc_curve``; confirm against the caller
    :param path: directory both output files are written into
    """
    roc_auc = model_perf["auroc"]
    # Thresholds are part of the stored curve but are not needed here
    fpr, tpr, _ = model_perf["auroc_curve"]

    # Dump the curve coordinates next to the figure
    curve_df = pd.DataFrame({"FPR": fpr, "TPR": tpr})
    curve_df.to_csv(os.path.join(path, "AUROC_info.txt"), header=True, index=False, sep='\t')

    # Create exactly one figure; an extra bare plt.figure() call would leave
    # an empty, unused figure behind on every invocation.
    lw = 2
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(fpr, tpr, color='darkorange',
            lw=lw, label='AUROC (area = %0.2f)' % roc_auc)
    # Diagonal reference line
    ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("AUROC of Models")
    ax.legend(loc="lower right")
    fig.savefig(os.path.join(path, "AUROC_TestSet.pdf"), format="pdf")

In [6]:
# Random seed: fixes Python's `random` module and NumPy's global RNG.
# The same SEED is also passed explicitly to KFold and LR_kftest further down.
SEED = 100
random.seed(SEED)
np.random.seed(SEED)

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by libraries are hidden too — confirm that is intended.
warnings.filterwarnings(action='ignore')

In [20]:
label_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/NomalSamples/labels/'
sample_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/NomalSamples/samples/'
# Feature counts to evaluate; each run keeps the first `features_num` feature columns
features_nums = [640, 600, 560, 520, 480, 440, 400, 360, 320, 280, 240, 200, 160, 120, 80, 40]
# For a quick single run use e.g.: features_nums = [600]
if not features_nums:
    raise ValueError("features_nums must contain at least one feature count")

# Path of the CSV file the results are appended to
csv_file = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/ML_models/eight_sample_11features_test/5fold_features_ablation/LR_ablation_results.csv'

# Column order of the results CSV, declared explicitly so the writer does not
# depend on a variable left over from the loop below.
result_fields = [
    "features_num",
    "mean_accuracy", "accuracy_variance",
    "mean_auroc", "auroc_variance",
    "mean_F1", "F1_variance",
    "mean_aurp", "aurp_variance",
]

# Make sure the output folder exists before any model is trained, so a missing
# directory cannot discard the results after the whole run.
os.makedirs(os.path.dirname(csv_file), exist_ok=True)

# Load and balance the data once. prepare_full_dataset takes the feature
# columns by position and balances with fixed seeds, so the dataset for a
# smaller feature count is exactly the leading columns of the widest one.
X_full, y = prepare_full_dataset(label_folder_path, sample_folder_path,
                                 features_num=max(features_nums))

# Collected results, one dict per feature count
all_results = []

for features_num in features_nums:
    print(f"Running with features_num = {features_num}")

    # Feature matrix restricted to the first `features_num` columns
    X = X_full[:, :features_num]

    # 5-fold cross-validation (plain KFold, shuffled with the notebook seed)
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    fold_accuracies = []
    fold_auroc = []
    fold_F1 = []
    fold_aurp = []

    for fold, (train_index, val_index) in enumerate(kf.split(X)):
        print(f"Training fold {fold + 1}")

        # Training and validation parts of the current fold
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Tune on the training part, score on the validation part
        acc, auroc, F1, aurp = LR_kftest(X_train, y_train, X_val, y_val, SEED)

        fold_accuracies.append(acc)
        fold_auroc.append(auroc)
        fold_F1.append(F1)
        fold_aurp.append(aurp)

    # Mean and (population) variance over the five folds
    mean_acc = np.mean(fold_accuracies)
    acc_variance = np.var(fold_accuracies)
    mean_auroc = np.mean(fold_auroc)
    auroc_variance = np.var(fold_auroc)
    mean_F1 = np.mean(fold_F1)
    F1_variance = np.var(fold_F1)
    mean_aurp = np.mean(fold_aurp)
    aurp_variance = np.var(fold_aurp)

    # Result row of the current experiment
    results = {
        "features_num": features_num,
        "mean_accuracy": mean_acc,
        "accuracy_variance": acc_variance,
        "mean_auroc": mean_auroc,
        "auroc_variance": auroc_variance,
        "mean_F1": mean_F1,
        "F1_variance": F1_variance,
        "mean_aurp": mean_aurp,
        "aurp_variance": aurp_variance
    }

    all_results.append(results)

# The file is opened in append mode, so re-running this cell adds the rows
# again below the existing ones. The header is written only when the file is
# new or still empty.
file_exists = os.path.isfile(csv_file)
write_header = (not file_exists) or os.path.getsize(csv_file) == 0

with open(csv_file, mode='a', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=result_fields)

    if write_header:
        writer.writeheader()

    # One row per experiment
    writer.writerows(all_results)

print("Experiment completed and results saved.")

Running with features_num = 640
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 600
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 560
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 520
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 480
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 440
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 400
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 360
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 320
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training