In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_auc_score, auc

In [2]:
def set_matplotlib_style():
    """Write the notebook's shared plotting defaults into ``plt.rcParams``.

    The settings are global: every figure created after this call uses them.
    """
    style_overrides = {
        'figure.figsize': (12,8),      # figure size
        'axes.titlesize': 18,          # title font size
        'axes.labelsize': 15,          # x/y axis label font size
        'xtick.labelsize': 12,         # x tick label font size
        'ytick.labelsize': 12,         # y tick label font size
        'legend.fontsize': 12,         # legend font size
        'axes.linewidth': 2,           # axes border line width
        'xtick.major.size': 5,         # x major tick length
        'ytick.major.size': 5,         # y major tick length
        'xtick.major.width': 1.5,      # x major tick line width
        'ytick.major.width': 1.5,      # y major tick line width
        'lines.linewidth': 2,          # plotted line width
        'lines.markersize': 8,         # marker size
        'savefig.dpi': 300,            # resolution of saved figures
        'savefig.format': 'pdf',       # default format of saved figures
        'grid.alpha': 0.6,             # grid line transparency
        # 'grid.linestyle': '--',      # grid line style (disabled)
        # 'grid.linewidth': 0.7,       # grid line width (disabled)
        'axes.grid': False,            # grid off by default
        'axes.edgecolor': 'black',     # axes border colour
        'axes.titlepad': 15,           # gap between title and axes
        'legend.frameon': False,       # no frame around the legend
    }
    # Assign key by key so each value goes through rcParams' own item setter.
    for option, value in style_overrides.items():
        plt.rcParams[option] = value
    

def parse_log_file(file_path):
    """Extract step, loss and learning rate from a training log.

    Only lines containing
    ``Train Step <int> iter - loss : <number> / lr : <number>`` are used;
    every other line is ignored.

    Parameters:
    file_path (str): path of the text log to read.

    Returns:
    tuple: ``(steps, loss_list, lr_list)`` -- three lists of equal length
    holding ints, floats and floats respectively, in file order.
    """
    # A decimal number with optional sign, fraction and exponent. The exponent
    # part matters: learning rates are commonly logged as e.g. ``1e-05``, which
    # a plain digits-and-dots pattern would truncate to ``1``.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    pattern = re.compile(
        r'Train Step (\d+) iter - loss : (' + number + r') / lr : (' + number + r')'
    )

    steps = []
    loss_list = []
    lr_list = []

    with open(file_path, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match is None:
                continue
            steps.append(int(match.group(1)))
            loss_list.append(float(match.group(2)))
            lr_list.append(float(match.group(3)))
    return steps, loss_list, lr_list


def parse_eval_file(file_path):
    """Load a tab-separated evaluation table and return two of its columns.

    Parameters:
    file_path (str): path of a tab-separated file with ``step`` and ``loss`` columns.

    Returns:
    tuple: ``(steps, loss_list)`` -- the ``step`` and ``loss`` columns as lists.
    """
    eval_table = pd.read_csv(file_path, sep='\t')
    step_values, loss_values = (
        eval_table[column].tolist() for column in ('step', 'loss')
    )
    return step_values, loss_values


def downsample_and_smooth(data, steps_per_epoch, downsample_rate=20, sigma=2):
    """
    Keep every ``steps_per_epoch // downsample_rate``-th element of ``data``
    and smooth the kept values with a 1-D Gaussian filter.

    Parameters:
    data (sequence): values to downsample, indexable by position.
    steps_per_epoch (int): epoch length used to derive the sampling stride.
    downsample_rate (int): number of samples to keep per epoch.
    sigma (float): standard deviation passed to ``gaussian_filter1d``.

    Returns:
    tuple: ``(sampled_indices, smoothed_data)`` -- the list of positions in
    ``data`` that were kept, and a float array of the smoothed values at those
    positions.
    """
    if len(data) == 0:
        # Nothing to sample; skip the filter call on an empty array.
        return [], np.asarray([], dtype=float)

    # The integer division is 0 whenever steps_per_epoch < downsample_rate,
    # and range() rejects a zero step, so never go below a stride of 1.
    stride = max(1, steps_per_epoch // downsample_rate)
    sampled_indices = list(range(0, len(data), stride))

    # gaussian_filter1d returns an array of the input's dtype, so integer
    # input would be truncated; force floats before filtering.
    sampled_data = np.asarray([data[i] for i in sampled_indices], dtype=float)
    smoothed_data = gaussian_filter1d(sampled_data, sigma=sigma)
    return sampled_indices, smoothed_data
    

def plot_curves(log_path, eval_path, out_path):
    """Plot the loss curves and the learning-rate curve of one training run.

    Two files are written into ``out_path``: ``loss.pdf`` (smoothed training
    loss and evaluation loss) and ``learning_rates.pdf``.

    Parameters:
    log_path (str): training log, read with ``parse_log_file``.
    eval_path (str): evaluation table, read with ``parse_eval_file``.
    out_path (str): directory for the two PDF files; created if missing.
    """
    set_matplotlib_style()

    # Parse log and evaluation files
    train_steps, train_loss_list, lr_list = parse_log_file(log_path)
    eval_steps, eval_loss_list = parse_eval_file(eval_path)

    # Spacing between the first two evaluation steps is taken as one epoch;
    # with fewer than two evaluations fall back to the number of logged steps.
    steps_per_epoch = eval_steps[1] - eval_steps[0] if len(eval_steps) > 1 else len(train_steps)

    # Downsample and smooth the training loss
    sampled_indices, smoothed_train_loss = downsample_and_smooth(train_loss_list, steps_per_epoch)
    # The sampled indices are positions in the parsed lists, not step numbers.
    # Translate them so the training curve shares the x axis of the evaluation
    # curve even when the log does not contain one line per step starting at 0.
    sampled_steps = [train_steps[i] for i in sampled_indices]

    # savefig does not create missing directories.
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    # Plot loss curve
    loss_fig, loss_ax = plt.subplots()
    loss_ax.plot(sampled_steps, smoothed_train_loss, label='Training Loss', color='blue')
    loss_ax.plot(eval_steps, eval_loss_list, label='Test Loss', color='red')
    loss_ax.set_title('Loss Curve')
    loss_ax.set_xlabel('Steps')
    loss_ax.set_ylabel('Loss')
    loss_ax.grid(True)
    loss_ax.legend()
    loss_fig.savefig(os.path.join(out_path, 'loss.pdf'))
    plt.close(loss_fig)

    # Plot learning rate curve
    lr_fig, lr_ax = plt.subplots()
    lr_ax.plot(train_steps, lr_list, label='Learning Rate', color='orange')
    lr_ax.set_title('Learning Rate Curve')
    lr_ax.set_xlabel('Steps')
    lr_ax.set_ylabel('Learning Rate')
    lr_ax.grid(True)
    lr_ax.legend()
    lr_fig.savefig(os.path.join(out_path, 'learning_rates.pdf'))
    plt.close(lr_fig)

In [6]:
# Earlier dataset selection, kept for reference:
# dataset_list = ['dlbcl', 'lica', 'senescence', 'pl_wbc', 'pl_cfdna', 'pl_cfdna_sped9k']
dataset_list = ['dlbcl_val', 'leucocyte_val', 'pl_wbc_val']

# Write the loss and learning-rate figures of every selected dataset.
for dataset in dataset_list:
    dataset_dir = os.path.join('../results/benchmark', dataset)
    log_path = f'./benchmark_{dataset}.log'
    eval_path = os.path.join(dataset_dir, '1.finetune/bert.model/eval.csv')
    out_path = os.path.join(dataset_dir, '2.plot')
    plot_curves(log_path=log_path, eval_path=eval_path, out_path=out_path)

In [47]:
def plot_combined_curves(dataset_list, output_path):
    """
    Draw the loss curves of several datasets as panels of a single figure.

    For each dataset the training log ``./benchmark_<dataset>.log`` and the
    evaluation table under ``../results/benchmark/<dataset>/`` are parsed, and
    the smoothed training loss and the evaluation loss are drawn in one panel.
    The figure is saved as ``combined_loss_curves.pdf`` in ``output_path``.

    Parameters:
    dataset_list (list): dataset names, one panel each.
    output_path (str): directory for the combined figure; created if missing.
    """
    set_matplotlib_style()

    # Three panels per row. Up to six datasets keep the 2 x 3 layout and the
    # 18 x 10 figure; more datasets add rows instead of running past the grid.
    n_cols = 3
    n_rows = max(2, (len(dataset_list) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
    axes = axes.flatten()

    for idx, dataset in enumerate(dataset_list):
        log_path = f'./benchmark_{dataset}.log'
        eval_path = os.path.join('../results/benchmark', dataset, '1.finetune/bert.model/eval.csv')

        # Parse log and evaluation files (the learning rates are not plotted here)
        train_steps, train_loss_list, _ = parse_log_file(log_path)
        eval_steps, eval_loss_list = parse_eval_file(eval_path)

        # Spacing between the first two evaluation steps is taken as one epoch;
        # with fewer than two evaluations fall back to the number of logged steps.
        steps_per_epoch = eval_steps[1] - eval_steps[0] if len(eval_steps) > 1 else len(train_steps)

        # Downsample and smooth the training loss
        sampled_indices, smoothed_train_loss = downsample_and_smooth(train_loss_list, steps_per_epoch)
        # Sampled indices are list positions; translate them to logged step
        # numbers so both curves share the same x axis.
        sampled_steps = [train_steps[i] for i in sampled_indices]

        # Plot on the corresponding subplot
        ax = axes[idx]
        ax.plot(sampled_steps, smoothed_train_loss, label='Training Loss', color='blue')
        ax.plot(eval_steps, eval_loss_list, label='Test Loss', color='red')
        ax.set_title(f'Loss Curve - {dataset}', fontsize=12)
        ax.set_xlabel('Steps')
        ax.set_ylabel('Loss')
        ax.grid(True)
        ax.legend(fontsize=10)

    # Remove the panels that no dataset was drawn into
    for idx in range(len(dataset_list), len(axes)):
        fig.delaxes(axes[idx])

    # savefig does not create missing directories.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Adjust layout and save the figure
    fig.tight_layout()
    combined_plot_path = os.path.join(output_path, 'combined_loss_curves.pdf')
    fig.savefig(combined_plot_path)
    plt.close(fig)
    print(f"Combined loss curves saved at {combined_plot_path}")

# Save the combined figure for `dataset_list`, which must already have been
# assigned by a previously executed cell.
output_path = '../results/benchmark'
plot_combined_curves(dataset_list, output_path)

Combined loss curves saved at ../results/benchmark/combined_loss_curves.pdf


In [40]:
def calculate_metrics(predictions_df, threshold=0.5):
    """Compute accuracy, PR-AUC and ROC-AUC for binary predictions.

    Parameters:
    predictions_df (DataFrame): must contain a ``ctype`` column holding the
        labels ``'T'`` (treated as positive, 1) or ``'N'`` (negative, 0), and a
        ``P_ctype`` column with the score of the positive class
        (presumably a predicted probability -- confirm against the producer).
    threshold (float): scores strictly above this value count as a positive
        prediction when computing accuracy. Defaults to 0.5.

    Returns:
    dict: ``{"ACC": ..., "PR-AUC": ..., "ROC-AUC": ...}``.

    Raises:
    ValueError: if ``ctype`` contains a value other than ``'T'`` or ``'N'``.
    """
    label_codes = predictions_df['ctype'].map({'T': 1, 'N': 0})
    unmapped = label_codes.isna()
    if unmapped.any():
        # Unmapped labels would become NaN and only fail later, deep inside
        # the metric functions, with a message that does not name the cause.
        unknown = sorted(predictions_df.loc[unmapped, 'ctype'].astype(str).unique())
        raise ValueError(f"Unexpected values in 'ctype' (expected 'T' or 'N'): {unknown}")
    true_labels = label_codes.astype(int).values
    predicted_probs = predictions_df['P_ctype'].values

    predicted_labels = (predicted_probs > threshold).astype(int)

    acc = accuracy_score(true_labels, predicted_labels)

    # Area under the precision-recall curve, integrated over recall.
    precision, recall, _ = precision_recall_curve(true_labels, predicted_probs)
    pr_auc = auc(recall, precision)

    roc_auc = roc_auc_score(true_labels, predicted_probs)

    return {"ACC": acc, "PR-AUC": pr_auc, "ROC-AUC": roc_auc}

In [48]:
dataset_list = ['dlbcl', 'lica', 'senescence', 'pl_wbc', 'pl_cfdna', 'pl_cfdna_sped9k', 'dlbcl_mut', 'leucocyte']

# Print one line of classification metrics per dataset.
for dataset in dataset_list:
    res_path = os.path.join('../results/benchmark', dataset, '3.deconvolute/res.csv')
    res_df = pd.read_csv(res_path, sep='\t')
    metrics = calculate_metrics(res_df)
    print(f'The metrics of {dataset} are: ROC-AUC={metrics["ROC-AUC"]:.4f}, PR-AUC={metrics["PR-AUC"]:.4f}, ACC={metrics["ACC"]:.4f}')

The metrics of dlbcl are: ROC-AUC=0.8482, PR-AUC=0.7825, ACC=0.7760
The metrics of lica are: ROC-AUC=0.8334, PR-AUC=0.8303, ACC=0.7421
The metrics of senescence are: ROC-AUC=0.7528, PR-AUC=0.6780, ACC=0.7039
The metrics of pl_wbc are: ROC-AUC=0.8592, PR-AUC=0.9751, ACC=0.8601
The metrics of pl_cfdna are: ROC-AUC=0.9019, PR-AUC=0.9159, ACC=0.8217
The metrics of pl_cfdna_sped9k are: ROC-AUC=0.9053, PR-AUC=0.9128, ACC=0.8401
The metrics of dlbcl_mut are: ROC-AUC=0.8544, PR-AUC=0.8631, ACC=0.7544
The metrics of leucocyte are: ROC-AUC=0.6805, PR-AUC=0.4508, ACC=0.7939
