In [1]:
import pandas as pd
from sklearn import model_selection
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, average_precision_score
import numpy as np
from torch.optim.lr_scheduler import StepLR
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, random_split
from sklearn.model_selection import train_test_split
import torch.nn.init as init
import torch.nn.functional as F
import os
from sklearn.model_selection import KFold
import csv


In [2]:
def model_evaluate(net, test_dataloader):
    """Compute AUROC, F1 and AUPR for a single-logit binary classifier.

    The network output is passed through a sigmoid to obtain the positive-class
    probability; F1 is computed on ``np.round`` of those probabilities.

    :param net: model whose forward pass returns one logit per sample
    :param test_dataloader: iterable of batches; element 0 is the input tensor,
        element 1 the label tensor (one label per sample)
    :return: tuple ``(auc, f1, aupr)``
    """
    net.eval()  # disable dropout and other training-only behaviour
    label_rows = []
    score_rows = []

    with torch.no_grad():
        for batch in test_dataloader:
            features, targets = batch[0], batch[1]
            # Labels become a float column vector of shape (batch_size, 1).
            targets = targets.unsqueeze(1).float()

            # Logit -> probability of the positive class.
            positive_prob = torch.sigmoid(net(features))

            score_rows.extend(positive_prob.numpy())
            label_rows.extend(targets.numpy())

    y_true = np.array(label_rows)
    y_scores = np.array(score_rows)

    # Hard 0/1 predictions are only needed for F1; the ranking metrics use the scores.
    y_pred = np.round(y_scores)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_scores)
    aupr = average_precision_score(y_true, y_scores)

    return auc, f1, aupr

In [3]:
class ColumnAttention(nn.Module):
    """Single-head scaled dot-product self-attention with an output projection.

    Attention is computed between the entries of the second-to-last axis of the
    input; the last axis (size ``input_dim``) is the feature axis. The output has
    the same shape as the input.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Projections into the attention space, then back to the input width.
        self.query = nn.Linear(input_dim, hidden_dim)
        self.key = nn.Linear(input_dim, hidden_dim)
        self.value = nn.Linear(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, tokens, input_dim)."""
        queries = self.query(x)  # (batch, tokens, hidden_dim)
        keys = self.key(x)       # (batch, tokens, hidden_dim)
        values = self.value(x)   # (batch, tokens, hidden_dim)

        # Token-to-token similarity, scaled by sqrt(hidden_dim).
        scale = keys.size(-1) ** 0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / scale  # (batch, tokens, tokens)

        # Each row of weights sums to one.
        weights = F.softmax(scores, dim=-1)

        # Weighted sum of the values, projected back to input_dim.
        context = torch.matmul(weights, values)  # (batch, tokens, hidden_dim)
        return self.fc(context)                  # (batch, tokens, input_dim)
    
class RowAttention(nn.Module):
    """Single-head scaled dot-product self-attention with an output projection.

    Same computation as the column variant; it is meant to be fed the transposed
    matrix so that the last axis (size ``input_dim``) is the other dimension.
    The output has the same shape as the input.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.query = nn.Linear(input_dim, hidden_dim)
        self.key = nn.Linear(input_dim, hidden_dim)
        self.value = nn.Linear(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, tokens, input_dim)."""
        q_proj, k_proj, v_proj = self.query(x), self.key(x), self.value(x)

        # (batch, tokens, tokens) similarity matrix, scaled by sqrt(hidden_dim).
        logits = torch.matmul(q_proj, k_proj.transpose(-2, -1))
        logits = logits / (k_proj.size(-1) ** 0.5)

        attn = F.softmax(logits, dim=-1)

        mixed = torch.matmul(attn, v_proj)  # (batch, tokens, hidden_dim)
        return self.fc(mixed)               # (batch, tokens, input_dim)

class Feed_Forward(nn.Module):
    """Position-wise two-layer MLP with a residual connection.

    Computes ``x + fc2(dropout(relu(fc1(x))))`` along the last axis, so the
    output has the same shape as the input.
    """

    def __init__(self, input_dim, d_ff, dropout_rate=0.1):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, d_ff)    # expand: input_dim -> d_ff
        self.fc2 = nn.Linear(d_ff, input_dim)    # project back: d_ff -> input_dim
        self.dropout = nn.Dropout(dropout_rate)  # regularisation on the hidden layer
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``x`` plus the MLP transform of ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        # Residual connection: add the block input to the block output.
        return x + self.fc2(hidden)

class CrossAttentionModel(nn.Module):
    """Binary classifier that attends along both axes of a (seq_len, input_dim) matrix.

    Pipeline: column attention -> LayerNorm -> feed-forward -> transpose ->
    row attention -> transpose back -> LayerNorm -> feed-forward -> flatten ->
    two fully connected layers. The output is a raw logit (no sigmoid), so it
    should be paired with a logits-based loss.

    :param input_dim: number of features per sequence position
    :param seq_len: number of sequence positions
    :param hidden_dim: width of the attention projections (both attentions)
    :param output_dim: number of output logits
    :param d_ff: hidden width of the feed-forward blocks
    :param dropout_rate: dropout probability in the feed-forward blocks and the head
    """

    def __init__(self, input_dim=13, seq_len=40, hidden_dim=64, output_dim=1, d_ff= 32, dropout_rate = 0.15):
        super().__init__()
        self.seq_len = seq_len
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.column_attention = ColumnAttention(input_dim, hidden_dim)
        # Row attention works on the transposed matrix, so its feature width is
        # seq_len. It was previously hard-coded to 40/64, which broke any model
        # built with a different seq_len.
        self.row_attention = RowAttention(input_dim=seq_len, hidden_dim=hidden_dim)
        self.layer_norm1 = nn.LayerNorm(input_dim)
        self.layer_norm2 = nn.LayerNorm(input_dim)
        self.ffn1 = Feed_Forward(input_dim, d_ff, dropout_rate)
        self.ffn2 = Feed_Forward(input_dim, d_ff, dropout_rate)
        # Classification head.
        self.fc1 = nn.Linear(input_dim * seq_len, 128)
        self.fc2 = nn.Linear(128, output_dim)
        # Attribute name (including its spelling) is kept for compatibility.
        self.droupout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for ``x`` reshapeable to (batch, seq_len, input_dim)."""
        x = x.view(-1, self.seq_len, self.input_dim)

        # Step 1: attention between sequence positions.
        x = self.column_attention(x)
        x = self.layer_norm1(x)
        x = self.ffn1(x)

        # Step 2: attention between features, done on the transposed matrix.
        x = x.transpose(1, 2)      # (batch, input_dim, seq_len)
        x = self.row_attention(x)
        x = x.transpose(1, 2)      # back to (batch, seq_len, input_dim)
        x = self.layer_norm2(x)
        x = self.ffn2(x)

        # Flatten; reshape (rather than view) also accepts non-contiguous tensors.
        attention_output = x.reshape(-1, self.seq_len * self.input_dim)

        x = F.relu(self.fc1(attention_output))  # (batch, 128)
        x = self.droupout(x)
        x = self.fc2(x)                         # (batch, output_dim)

        return x

    def _initialize_weights(self):
        """Re-initialise every learnable parameter of the model.

        All sub-layers are first reset to their PyTorch defaults, so that calling
        this on an already-trained instance (e.g. between cross-validation folds)
        does not leave trained weights behind in the row attention, feed-forward
        or normalisation layers. The column-attention q/k/v projections and the
        head layers then get Kaiming-normal weights and zero biases, as before.
        """
        for module in self.modules():
            if module is not self and hasattr(module, "reset_parameters"):
                module.reset_parameters()

        kaiming_layers = (
            self.column_attention.query,
            self.column_attention.key,
            self.column_attention.value,
            self.fc1,
            self.fc2,
        )
        for layer in kaiming_layers:
            init.kaiming_normal_(layer.weight, nonlinearity='relu')
            if layer.bias is not None:
                init.constant_(layer.bias, 0)

class MultiColumnAttentionModel(nn.Module):
    """Binary classifier that applies column attention twice, with no row attention.

    Pipeline: column attention -> LayerNorm -> feed-forward -> column attention
    -> LayerNorm -> feed-forward -> flatten -> two fully connected layers. The
    output is a raw logit (no sigmoid).

    NOTE(review): both attention passes use the same ``column_attention`` module,
    i.e. they share weights, and ``row_attention`` is constructed but never used
    in ``forward``. Confirm whether a second, independent attention layer was
    intended.

    :param input_dim: number of features per sequence position
    :param seq_len: number of sequence positions
    :param hidden_dim: width of the attention projections
    :param output_dim: number of output logits
    :param d_ff: hidden width of the feed-forward blocks
    :param dropout_rate: dropout probability in the feed-forward blocks and the head
    """

    def __init__(self, input_dim=16, seq_len=40, hidden_dim=64, output_dim=1, d_ff= 32, dropout_rate = 0.15):
        # The original call named CrossAttentionModel here, which raises
        # TypeError because this class does not derive from it.
        super().__init__()
        self.seq_len = seq_len
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.column_attention = ColumnAttention(input_dim, hidden_dim)
        # Unused in forward(); sized from the arguments instead of hard-coded 40/64.
        self.row_attention = RowAttention(input_dim=seq_len, hidden_dim=hidden_dim)
        self.layer_norm1 = nn.LayerNorm(input_dim)
        self.layer_norm2 = nn.LayerNorm(input_dim)
        self.ffn1 = Feed_Forward(input_dim, d_ff, dropout_rate)
        self.ffn2 = Feed_Forward(input_dim, d_ff, dropout_rate)
        # Classification head.
        self.fc1 = nn.Linear(input_dim * seq_len, 128)
        self.fc2 = nn.Linear(128, output_dim)
        # Attribute name (including its spelling) is kept for compatibility.
        self.droupout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for ``x`` reshapeable to (batch, seq_len, input_dim)."""
        x = x.view(-1, self.seq_len, self.input_dim)

        # First attention pass.
        x = self.column_attention(x)
        x = self.layer_norm1(x)
        x = self.ffn1(x)            # (batch, seq_len, input_dim)

        # Second attention pass (same attention weights as the first).
        x = self.column_attention(x)
        x = self.layer_norm2(x)
        x = self.ffn2(x)            # (batch, seq_len, input_dim)

        # Flatten; reshape (rather than view) also accepts non-contiguous tensors.
        attention_output = x.reshape(-1, self.seq_len * self.input_dim)

        x = F.relu(self.fc1(attention_output))  # (batch, 128)
        x = self.droupout(x)
        x = self.fc2(x)                         # (batch, output_dim)

        return x

    def _initialize_weights(self):
        """Re-initialise every learnable parameter of the model.

        All sub-layers are first reset to their PyTorch defaults so that calling
        this on a trained instance leaves no trained weights behind. The
        column-attention q/k/v projections and the head layers then get
        Kaiming-normal weights and zero biases.
        """
        for module in self.modules():
            if module is not self and hasattr(module, "reset_parameters"):
                module.reset_parameters()

        kaiming_layers = (
            self.column_attention.query,
            self.column_attention.key,
            self.column_attention.value,
            self.fc1,
            self.fc2,
        )
        for layer in kaiming_layers:
            init.kaiming_normal_(layer.weight, nonlinearity='relu')
            if layer.bias is not None:
                init.constant_(layer.bias, 0)


In [4]:
class Accumulator:
    """Running sums over a fixed number of quantities."""

    def __init__(self, n):
        # One float slot per tracked quantity, all starting at zero.
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional argument to the slot at the same position."""
        updated = []
        for running_total, increment in zip(self.data, args):
            updated.append(running_total + float(increment))
        self.data = updated

    def reset(self):
        """Set every slot back to zero, keeping the number of slots."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running sum stored at ``idx``."""
        return self.data[idx]

In [5]:
def accuracy(y_hat, y):
    """Count the predictions that match the labels.

    Values in ``y_hat`` that are >= 0.5 are treated as class 1, all others as
    class 0, before being compared element-wise with ``y``.

    :param y_hat: tensor of scores, thresholded at 0.5
    :param y: tensor of 0/1 labels, broadcast-compatible with ``y_hat``
    :return: number of matching elements, as a Python float
    """
    hard_predictions = (y_hat >= 0.5).float()
    is_correct = hard_predictions.to(y.dtype).eq(y)
    return float(is_correct.to(y.dtype).sum())


def evaluate_accuracy(net, data_iter):
    """Compute classification accuracy of ``net`` over a data iterator.

    The models in this notebook return raw logits (they are trained with a
    logits-based loss), whereas ``accuracy`` thresholds at 0.5. The logits are
    therefore passed through a sigmoid first; thresholding the raw logits at 0.5
    would correspond to a probability cut-off of about 0.62 instead of 0.5.

    :param net: model returning one logit per sample
    :param data_iter: iterable of batches; element 0 is the input tensor,
        element 1 the label tensor (one label per sample)
    :return: fraction of correctly classified samples
    """
    if isinstance(net, torch.nn.Module):
        net.eval()  # switch off dropout and other training-only behaviour
    metric = Accumulator(2)  # (number correct, number seen)
    with torch.no_grad():
        for sample in data_iter:
            X, y = sample[0], sample[1]
            y = y.unsqueeze(1).float()          # (batch, 1) float labels
            probabilities = torch.sigmoid(net(X))
            metric.add(accuracy(probabilities, y), y.numel())
    return metric[0] / metric[1]

def train_epoch_ch3(net, train_iter, loss, updater):
    """Train the model for one epoch.

    :param net: model to train; expected to return one logit per sample
    :param train_iter: iterable of batches; element 0 is the input tensor,
        element 1 the label tensor (one label per sample)
    :param loss: loss callable taking ``(logits, labels)``
    :param updater: a ``torch.optim.Optimizer``, or a custom callable that is
        invoked with the batch size after ``backward()``
    :return: tuple ``(mean training loss per sample, training accuracy)``
    """
    if isinstance(net, torch.nn.Module):
        net.train()

    # Sum of training loss, number of correct predictions, number of samples.
    metric = Accumulator(3)
    # A mean-reduced loss returns the per-element average of the batch; it has
    # to be scaled back up before being accumulated, otherwise the final
    # division by the sample count divides by the batch size twice.
    loss_is_mean_reduced = getattr(loss, "reduction", None) == "mean"

    for sample in train_iter:
        X, y = sample[0], sample[1]
        y_hat = net(X)
        y = y.unsqueeze(1).float()
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            # Built-in PyTorch optimizer.
            updater.zero_grad()
            l.mean().backward()
            updater.step()
        else:
            # Custom updater: called with the batch size. The original code
            # called X.shape(), which raises TypeError (shape is not callable).
            l.sum().backward()
            updater(X.shape[0])

        batch_loss = float(l.sum())
        if loss_is_mean_reduced:
            batch_loss *= y.numel()
        # accuracy() thresholds at 0.5, so convert logits to probabilities first.
        correct = accuracy(torch.sigmoid(y_hat.detach()), y)
        metric.add(batch_loss, correct, y.numel())

    return metric[0] / metric[2], metric[1] / metric[2]

# ch6 variant: takes validation and test iterators; the test set is evaluated once training has finished.
def train_ch6(net, train_iter, val_iter, test_iter, loss, num_epochs, updater, scheduler = None, save_best=True, checkpoint_dir='./selfattention_checkpoints'):
    """Train with validation-AUROC model selection, then evaluate on the test set.

    After every epoch the validation AUROC is computed; whenever it improves, the
    weights are checkpointed and the previous best checkpoint is deleted. When
    training ends the best checkpoint is reloaded into ``net`` and scored on
    ``test_iter``.

    :param net: model returning one logit per sample
    :param train_iter: iterator over the training data
    :param val_iter: iterator over the validation data, used for model selection
    :param test_iter: iterator over the test data, used for the final scores
    :param loss: loss function
    :param num_epochs: number of training epochs
    :param updater: optimizer / parameter updater
    :param scheduler: optional learning-rate scheduler; stepped once per epoch
    :param save_best: currently unused; kept for interface compatibility
    :param checkpoint_dir: directory the best checkpoint is written to
    :return: ``(best_acc, auroc, F1, aupr)`` where ``best_acc`` is the validation
        accuracy at the selected epoch and the other three are test-set metrics
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    best_acc = 0.0               # validation accuracy at the best-AUROC epoch
    best_auc = 0.0               # best validation AUROC seen so far
    best_checkpoint_path = None  # file holding the weights of the best epoch

    for epoch in range(num_epochs):
        train_epoch_ch3(net, train_iter, loss, updater)
        if scheduler is not None:
            scheduler.step()

        val_acc = evaluate_accuracy(net, val_iter)

        # Validation AUROC. Raw logits are used as scores: AUROC only depends on
        # their ranking, so no sigmoid is needed.
        all_labels = []
        all_preds = []
        net.eval()
        with torch.no_grad():
            for X_batch, y_batch in val_iter:
                outputs = net(X_batch)
                all_labels.append(y_batch.detach().numpy())
                all_preds.append(outputs.detach().numpy())
        all_labels = np.concatenate(all_labels)
        all_preds = np.concatenate(all_preds)
        val_auc = roc_auc_score(all_labels, all_preds)

        if val_auc > best_auc:
            best_auc = val_auc
            best_acc = val_acc  # previously never updated, so 0.0 was always returned
            checkpoint_path = os.path.join(checkpoint_dir, f'best_model_epoch_{epoch + 1}_auc_{best_auc:.3f}.pth')
            # Drop the checkpoint of the previous best epoch, if there is one.
            if best_checkpoint_path is not None and os.path.exists(best_checkpoint_path):
                os.remove(best_checkpoint_path)
            torch.save(net.state_dict(), checkpoint_path)
            best_checkpoint_path = checkpoint_path

    # Restore the best weights. If no epoch produced a checkpoint (no epochs run,
    # or AUROC never rose above 0) the current weights are evaluated as they are.
    saved_model = net
    if best_checkpoint_path is not None:
        saved_model.load_state_dict(torch.load(best_checkpoint_path))
    saved_model.eval()
    best_auroc, best_F1, best_aurp = model_evaluate(saved_model, test_iter)
    return best_acc, best_auroc, best_F1, best_aurp


# ch7 variant: no test set; used for five-fold cross-validation without early stopping.
def train_ch7(net, train_iter, val_iter, loss, num_epochs, updater, scheduler = None,  checkpoint_dir='./selfattention_checkpoints'):
    """Train for a fixed number of epochs and score the final model on the validation fold.

    No model selection takes place: the weights after the last epoch are saved
    to ``checkpoint_dir`` and evaluated. Validation is therefore run once, after
    training, rather than after every epoch.

    :param net: model returning one logit per sample
    :param train_iter: iterator over the training data
    :param val_iter: iterator over the validation data
    :param loss: loss function
    :param num_epochs: number of training epochs (at least 1)
    :param updater: optimizer / parameter updater
    :param scheduler: optional learning-rate scheduler; stepped once per epoch
    :param checkpoint_dir: directory the final weights are written to
    :return: ``(accuracy, auroc, F1, aupr)`` of the final model on ``val_iter``
    :raises ValueError: if ``num_epochs`` is smaller than 1
    """
    if num_epochs < 1:
        # The original code hit an unbound checkpoint_path in this case.
        raise ValueError("num_epochs must be at least 1")

    os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(num_epochs):
        train_epoch_ch3(net, train_iter, loss, updater)
        if scheduler is not None:
            scheduler.step()

    # Final-epoch metrics on the validation fold.
    best_acc = evaluate_accuracy(net, val_iter)
    best_auroc, best_F1, best_aurp = model_evaluate(net, val_iter)

    # Keep the final weights on disk under the same naming scheme as before.
    checkpoint_path = os.path.join(checkpoint_dir, f'best_model_epoch_{num_epochs}_auc_{best_auroc:.3f}.pth')
    torch.save(net.state_dict(), checkpoint_path)

    return best_acc, best_auroc, best_F1, best_aurp



In [6]:
class CustomDataset_pos_encoded(Dataset):
    """Dataset yielding (seq_len, input_dim) float tensors with sinusoidal position encoding added.

    Each row of ``features`` is a flat vector laid out as ``input_dim``
    consecutive groups of ``seq_len`` values.
    """

    def __init__(self, features, labels, input_dim=16, seq_len=40):
        self.features = features
        self.labels = labels
        self.input_dim = input_dim  # features per sequence position
        self.seq_len = seq_len      # number of sequence positions
        self.position_encoding = self.create_position_encoding(seq_len, input_dim)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(sample, label)``: a float32 (seq_len, input_dim) tensor and an int64 label."""
        # Flat vector -> (input_dim, seq_len) grid -> transposed to (seq_len, input_dim).
        grid = self.features[idx].reshape(self.input_dim, self.seq_len)
        sample = grid.astype(np.float32).T
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        # Element-wise sum with the (seq_len, input_dim) encoding table.
        encoded = sample + self.position_encoding.numpy()
        return torch.tensor(encoded), label

    def create_position_encoding(self, seq_len, input_dim):
        """Build the (seq_len, input_dim) sinusoidal table: sin on even columns, cos on odd columns."""
        positions = np.arange(seq_len)[:, np.newaxis]
        frequencies = np.exp(np.arange(0, input_dim, 2) * -(np.log(10000.0) / input_dim))
        angles = positions * frequencies

        table = np.zeros((seq_len, input_dim))
        table[:, 0::2] = np.sin(angles)
        # There are input_dim // 2 odd columns; for an odd input_dim that is one
        # fewer than the number of frequencies, so the last frequency is unused.
        odd_columns = input_dim // 2
        table[:, 1::2] = np.cos(angles[:, :odd_columns])

        return torch.tensor(table, dtype=torch.float32)
    

In [7]:
class CustomDataset(Dataset):
    """Dataset yielding (seq_len, input_dim) float tensors without position encoding.

    Each row of ``features`` is a flat vector laid out as ``input_dim``
    consecutive groups of ``seq_len`` values.
    """

    def __init__(self, features, labels, input_dim=16, seq_len=40):
        self.features = features
        self.labels = labels
        self.input_dim = input_dim  # features per sequence position
        self.seq_len = seq_len      # number of sequence positions

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(sample, label)``: a float32 (seq_len, input_dim) tensor and an int64 label."""
        # Flat vector -> (input_dim, seq_len) grid -> transposed to (seq_len, input_dim).
        grid = self.features[idx].reshape(self.input_dim, self.seq_len)
        sample = grid.astype(np.float32).T
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return torch.tensor(sample), label

In [8]:
def equal_kind(data_df):
    """Balance a frame so that label 0 and label 1 have the same number of rows.

    The larger class is down-sampled (fixed seed) to the size of the smaller
    one; rows with any other label value are dropped. The result is shuffled
    (fixed seed) and re-indexed from 0.

    :param data_df: DataFrame with a ``label`` column
    :return: balanced, shuffled DataFrame
    """
    negatives = data_df[data_df['label'] == 0]
    positives = data_df[data_df['label'] == 1]

    # Size of the smaller class: the target size for both.
    target_size = min(len(negatives), len(positives))

    def _cap(group):
        """Down-sample ``group`` to target_size; return it untouched if it is not larger."""
        if len(group) > target_size:
            return group.sample(n=target_size, random_state=42)
        return group

    combined = pd.concat([_cap(negatives), _cap(positives)])
    # Shuffle so the two classes are interleaved, then renumber the rows.
    shuffled = combined.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

In [9]:
# Merge the sample/label CSVs and split them into a training set and a test set.
def prepare_dataset(label_folder_path, sample_folder_path, test_sample_name=None, random_split=True, test_size=0.1, features_num = 640):
    """Load all label/sample CSVs, join them on ``sample`` and return balanced train/test arrays.

    Three modes, checked in this order:

    1. ``test_sample_name`` given: every file pair whose label file name contains
       that string forms the test set, all other pairs form the training set.
       (This branch previously loaded the data and then returned ``None``.)
    2. ``random_split`` true: all files are merged, then split at random
       (``test_size``, fixed seed).
    3. otherwise: everything is training data; the test arrays are ``None``.

    Each resulting set is class-balanced with ``equal_kind``. Features are the
    ``features_num`` columns following the first (``sample``) column; labels
    come from the ``label`` column.

    Label and sample files are paired by position after sorting the file names,
    so both folders are assumed to use matching names.

    :param label_folder_path: folder containing the label CSVs
    :param sample_folder_path: folder containing the sample (feature) CSVs
    :param test_sample_name: substring identifying the label file(s) held out as test set
    :param random_split: whether to split at random when no test sample is named
    :param test_size: test fraction for the random split
    :param features_num: number of leading feature columns to keep
    :return: ``(train_features, train_labels, test_features, test_labels)`` as
        numpy arrays; the test entries are ``None`` when there is no test set
    :raises ValueError: if no CSV files are available for the training data
    """
    label_csv_files = sorted(f for f in os.listdir(label_folder_path) if f.endswith('.csv'))
    sample_csv_files = sorted(f for f in os.listdir(sample_folder_path) if f.endswith('.csv'))

    def _load_merged(label_files, sample_files):
        """Concatenate the given files and left-join labels onto samples; None if either list is empty."""
        if not label_files or not sample_files:
            return None
        labels_df = pd.concat(
            [pd.read_csv(os.path.join(label_folder_path, name)) for name in label_files], axis=0)
        samples_df = pd.concat(
            [pd.read_csv(os.path.join(sample_folder_path, name)) for name in sample_files], axis=0)
        return pd.merge(samples_df, labels_df, on='sample', how='left')

    def _balanced_arrays(merged_df):
        """Balance the classes and return (features, labels) arrays; (None, None) for no data."""
        if merged_df is None:
            return None, None
        balanced = equal_kind(merged_df)
        # Column 0 is 'sample'; the next features_num columns are the features.
        # The label is taken by name so it stays correct when features_num is
        # smaller than the number of feature columns in the files.
        features = balanced.iloc[:, 1:features_num + 1].values
        labels = balanced['label'].values
        return features, labels

    file_pairs = list(zip(label_csv_files, sample_csv_files))

    if test_sample_name:
        # Hold out the named sample(s); train on everything else.
        test_pairs = [pair for pair in file_pairs if test_sample_name in pair[0]]
        train_pairs = [pair for pair in file_pairs if test_sample_name not in pair[0]]
        train_df = _load_merged([lf for lf, _ in train_pairs], [sf for _, sf in train_pairs])
        if train_df is None:
            raise ValueError("No training CSV files left after holding out the test sample")
        test_df = _load_merged([lf for lf, _ in test_pairs], [sf for _, sf in test_pairs])
        train_features, train_labels = _balanced_arrays(train_df)
        test_features, test_labels = _balanced_arrays(test_df)
        return train_features, train_labels, test_features, test_labels

    if random_split:
        merged_df = _load_merged(label_csv_files, sample_csv_files)
        if merged_df is None:
            raise ValueError("No CSV files found to build the dataset from")
        # Split AFTER joining on 'sample': splitting the samples and labels
        # frames side by side is only correct if their rows happen to be in
        # the same order.
        merged_train_df, merged_test_df = train_test_split(
            merged_df, test_size=test_size, random_state=42
        )
        train_features, train_labels = _balanced_arrays(merged_train_df)
        test_features, test_labels = _balanced_arrays(merged_test_df)
        return train_features, train_labels, test_features, test_labels

    # No test set requested: everything is training data.
    merged_train_df = _load_merged([lf for lf, _ in file_pairs], [sf for _, sf in file_pairs])
    if merged_train_df is None:
        raise ValueError("No CSV files found to build the dataset from")
    train_features, train_labels = _balanced_arrays(merged_train_df)
    return train_features, train_labels, None, None
    
# No train/test split here: all files are merged into a single dataset.
def prepare_full_dataset(label_folder_path, sample_folder_path, features_num):
    """Merge every sample and label CSV into one class-balanced dataset.

    Label and sample files are paired by position after sorting the file names.
    The frames are concatenated, left-joined on ``sample`` and balanced with
    ``equal_kind``.

    :param label_folder_path: folder containing the label CSVs
    :param sample_folder_path: folder containing the sample (feature) CSVs
    :param features_num: number of leading feature columns to keep
    :return: ``(features, labels)`` numpy arrays; features are the
        ``features_num`` columns after the first column, labels the last column
    """
    label_names = sorted(name for name in os.listdir(label_folder_path) if name.endswith('.csv'))
    sample_names = sorted(name for name in os.listdir(sample_folder_path) if name.endswith('.csv'))

    # Read each (label, sample) pair in turn.
    frame_pairs = [
        (
            pd.read_csv(os.path.join(label_folder_path, label_name)),
            pd.read_csv(os.path.join(sample_folder_path, sample_name)),
        )
        for label_name, sample_name in zip(label_names, sample_names)
    ]

    # Stack all label frames and all sample frames row-wise.
    stacked_labels = pd.concat([label_frame for label_frame, _ in frame_pairs], axis=0)
    stacked_samples = pd.concat([sample_frame for _, sample_frame in frame_pairs], axis=0)

    # Attach the label to every sample row, then equalise the class sizes.
    balanced = equal_kind(pd.merge(stacked_samples, stacked_labels, on='sample', how='left'))

    features = balanced.iloc[:, 1:features_num + 1].values  # feature columns
    labels = balanced.iloc[:, -1].values                    # label column
    return features, labels

In [9]:
# Feature-ablation experiment WITH positional encoding: for each feature count,
# run 5-fold cross-validation and record mean/variance of the fold metrics.
label_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/SelfAttentionSamples/labels/'
sample_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/SelfAttentionSamples/samples/'
# Numbers of leading feature columns to evaluate (multiples of the sequence length).
features_nums = [640, 600, 560, 520, 480, 440, 400, 360, 320, 280, 240, 200, 160, 120, 80, 40]
# Collected per-experiment summaries.
all_results = []

# Training hyper-parameters (identical for every run).
seq_len = 40
num_epochs = 60
batch_size = 64
lr = 0.001

# Column order of the results CSV.
result_fields = [
    "features_num",
    "mean_accuracy", "accuracy_variance",
    "mean_auroc", "auroc_variance",
    "mean_F1", "F1_variance",
    "mean_aurp", "aurp_variance",
]

for features_num in features_nums:
    print(f"Running with features_num = {features_num}")
    input_dim = features_num // seq_len

    # Load the balanced dataset restricted to the first features_num columns.
    main_features, main_labels = prepare_full_dataset(label_folder_path, sample_folder_path, features_num=features_num)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_accuracies = []
    fold_auroc = []
    fold_F1 = []
    fold_aurp = []

    for fold, (train_index, val_index) in enumerate(kf.split(main_features)):
        print(f"Training fold {fold + 1}")

        # Training / validation split of the current fold.
        X_train, X_val = main_features[train_index], main_features[val_index]
        y_train, y_val = main_labels[train_index], main_labels[val_index]

        train_dataset = CustomDataset_pos_encoded(X_train, y_train, input_dim=input_dim, seq_len=seq_len)
        val_dataset = CustomDataset_pos_encoded(X_val, y_val, input_dim=input_dim, seq_len=seq_len)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Build a NEW model for every fold. Reusing one instance across folds
        # leaks information: the validation rows of this fold were training
        # rows of the earlier folds.
        torch.manual_seed(42 + fold)  # reproducible initialisation and shuffling
        model = CrossAttentionModel(input_dim=input_dim, seq_len=seq_len, hidden_dim=64, dropout_rate=0.15)
        model._initialize_weights()
        optimizer = torch.optim.Adam(model.parameters(), lr)
        loss_fn = torch.nn.BCEWithLogitsLoss()

        # Train and validate on the current fold.
        acc, auroc, F1, aurp = train_ch7(model, train_iter=train_loader, val_iter=val_loader, loss=loss_fn, num_epochs=num_epochs, updater=optimizer, checkpoint_dir=f'/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/ML_models/eight_sample_11features_test/DL_saved/single_column_11f_test0.1_findauc/fold{fold+1}/')

        fold_accuracies.append(acc)
        fold_auroc.append(auroc)
        fold_F1.append(F1)
        fold_aurp.append(aurp)

    # Mean and variance of each metric over the five folds.
    results = {
        "features_num": features_num,
        "mean_accuracy": np.mean(fold_accuracies),
        "accuracy_variance": np.var(fold_accuracies),
        "mean_auroc": np.mean(fold_auroc),
        "auroc_variance": np.var(fold_auroc),
        "mean_F1": np.mean(fold_F1),
        "F1_variance": np.var(fold_F1),
        "mean_aurp": np.mean(fold_aurp),
        "aurp_variance": np.var(fold_aurp),
    }
    all_results.append(results)

# Output file for the experiment summaries.
csv_file = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/ML_models/eight_sample_11features_test/5fold_features_ablation/cross_ablation_results.csv'

# The header is only written when the file is created by this run.
file_exists = os.path.isfile(csv_file)

# Append one row per experiment.
with open(csv_file, mode='a', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=result_fields)

    if not file_exists:
        writer.writeheader()

    writer.writerows(all_results)

print("Experiment completed and results saved.")

Running with features_num = 640


ValueError: The column label 'sample' is not unique.

In [10]:
# Feature-ablation experiment WITHOUT positional encoding: for each feature
# count, run 5-fold cross-validation and record mean/variance of the fold metrics.
# NOTE(review): this cell appends to the same results CSV and writes to the same
# checkpoint directories as the positional-encoding cell above, and the CSV has
# no column that tells the two experiments apart. Consider separate output paths.
label_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/SelfAttentionSamples/labels/'
sample_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/SelfAttentionSamples/samples/'
# Numbers of leading feature columns to evaluate (multiples of the sequence length).
features_nums = [640, 600, 560, 520, 480, 440, 400, 360, 320, 280, 240, 200, 160, 120, 80, 40]
# Collected per-experiment summaries.
all_results = []

# Training hyper-parameters (identical for every run).
seq_len = 40
num_epochs = 60
batch_size = 64
lr = 0.001

# Column order of the results CSV.
result_fields = [
    "features_num",
    "mean_accuracy", "accuracy_variance",
    "mean_auroc", "auroc_variance",
    "mean_F1", "F1_variance",
    "mean_aurp", "aurp_variance",
]

for features_num in features_nums:
    print(f"Running with features_num = {features_num}")
    input_dim = features_num // seq_len

    # Load the balanced dataset restricted to the first features_num columns.
    main_features, main_labels = prepare_full_dataset(label_folder_path, sample_folder_path, features_num=features_num)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_accuracies = []
    fold_auroc = []
    fold_F1 = []
    fold_aurp = []

    for fold, (train_index, val_index) in enumerate(kf.split(main_features)):
        print(f"Training fold {fold + 1}")

        # Training / validation split of the current fold.
        X_train, X_val = main_features[train_index], main_features[val_index]
        y_train, y_val = main_labels[train_index], main_labels[val_index]

        train_dataset = CustomDataset(X_train, y_train, input_dim=input_dim, seq_len=seq_len)
        val_dataset = CustomDataset(X_val, y_val, input_dim=input_dim, seq_len=seq_len)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Build a NEW model for every fold. Reusing one instance across folds
        # leaks information: the validation rows of this fold were training
        # rows of the earlier folds.
        torch.manual_seed(42 + fold)  # reproducible initialisation and shuffling
        model = CrossAttentionModel(input_dim=input_dim, seq_len=seq_len, hidden_dim=64, dropout_rate=0.15)
        model._initialize_weights()
        optimizer = torch.optim.Adam(model.parameters(), lr)
        loss_fn = torch.nn.BCEWithLogitsLoss()

        # Train and validate on the current fold.
        acc, auroc, F1, aurp = train_ch7(model, train_iter=train_loader, val_iter=val_loader, loss=loss_fn, num_epochs=num_epochs, updater=optimizer, checkpoint_dir=f'/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/ML_models/eight_sample_11features_test/DL_saved/single_column_11f_test0.1_findauc/fold{fold+1}/')

        fold_accuracies.append(acc)
        fold_auroc.append(auroc)
        fold_F1.append(F1)
        fold_aurp.append(aurp)

    # Mean and variance of each metric over the five folds.
    results = {
        "features_num": features_num,
        "mean_accuracy": np.mean(fold_accuracies),
        "accuracy_variance": np.var(fold_accuracies),
        "mean_auroc": np.mean(fold_auroc),
        "auroc_variance": np.var(fold_auroc),
        "mean_F1": np.mean(fold_F1),
        "F1_variance": np.var(fold_F1),
        "mean_aurp": np.mean(fold_aurp),
        "aurp_variance": np.var(fold_aurp),
    }
    all_results.append(results)

# Output file for the experiment summaries.
csv_file = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/ML_models/eight_sample_11features_test/5fold_features_ablation/cross_ablation_results.csv'

# The header is only written when the file is created by this run.
file_exists = os.path.isfile(csv_file)

# Append one row per experiment.
with open(csv_file, mode='a', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=result_fields)

    if not file_exists:
        writer.writeheader()

    writer.writerows(all_results)

print("Experiment completed and results saved.")

Running with features_num = 640


ValueError: The column label 'sample' is not unique.

In [11]:
# Ablation with the feature columns in reversed order, so that (per the original
# author's note) the coverage feature is the first one to be removed.
label_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/NomalSamples/labels/'
sample_folder_path = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/resources/NomalSamples/samples/'
# Numbers of leading feature columns to evaluate.
features_nums = [640, 600, 560, 520, 480, 440, 400, 360, 320, 280, 240, 200, 160, 120, 80, 40]
# features_nums = [600]
# Collected per-experiment summaries.
all_results = []

label_csv_files = sorted([f for f in os.listdir(label_folder_path) if f.endswith('.csv')])
sample_csv_files = sorted([f for f in os.listdir(sample_folder_path) if f.endswith('.csv')])

# DataFrames of all label files and all sample files.
all_labels = []
all_samples = []

# Load the files; label and sample files are paired by position in the sorted name lists.
for label_file, sample_file in zip(label_csv_files, sample_csv_files):
    all_labels.append(pd.read_csv(os.path.join(label_folder_path, label_file)))
    all_samples.append(pd.read_csv(os.path.join(sample_folder_path, sample_file)))

# Stack all labels and all samples row-wise.
all_labels_df = pd.concat(all_labels, axis=0)
all_samples_df = pd.concat(all_samples, axis=0)

# Left-join the labels onto the samples via the 'sample' column.
merged_df = pd.merge(all_samples_df, all_labels_df, on='sample', how='left')

# Balance the two classes and shuffle.
merged_df = equal_kind(merged_df)
# Reverse the order of the columns between the first and the last one; the first
# and last columns keep their positions.
# NOTE(review): this reverses every individual column, so the column order inside
# each 40-column feature group is reversed as well, not only the order of the
# groups. Confirm that this is intended.
first_col = merged_df.iloc[:, 0]
last_col = merged_df.iloc[:, -1]
middle_cols_reversed = merged_df.iloc[:, 1:-1].iloc[:, ::-1]
merged_df_reordered = pd.concat([first_col, middle_cols_reversed, last_col], axis=1)


# Training hyper-parameters; identical for every run, so set once up front.
num_epochs = 60
batch_size = 64
lr = 0.001
# Second shape argument passed to the model and dataset; input_dim is derived from it.
seq_len = 40

# 5-fold splitter. With an integer random_state every call to split()
# yields the same partition for a given number of rows.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for features_num in features_nums:
    print(f"Running with features_num = {features_num}")

    # Use the first `features_num` feature columns (column 0 is skipped) and
    # the last column as the label.
    main_features = merged_df_reordered.iloc[:, 1:features_num + 1].values
    main_labels = merged_df_reordered.iloc[:, -1].values
    input_dim = features_num // seq_len

    # Per-fold metrics returned by train_ch7.
    fold_accuracies = []
    fold_auroc = []
    fold_F1 = []
    fold_aurp = []

    for fold, (train_index, val_index) in enumerate(kf.split(main_features)):
        print(f"Training fold {fold + 1}")

        # Slice out this fold's training and validation rows.
        X_train, X_val = main_features[train_index], main_features[val_index]
        y_train, y_val = main_labels[train_index], main_labels[val_index]

        train_dataset = CustomDataset(X_train, y_train, input_dim=input_dim, seq_len=seq_len)
        val_dataset = CustomDataset(X_val, y_val, input_dim=input_dim, seq_len=seq_len)

        # Shuffle only the training batches; validation order stays fixed.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Build a fresh model for every fold. Reusing one instance and only
        # calling _initialize_weights() would carry over any state that method
        # does not reset, letting earlier folds influence later ones.
        model = CrossAttentionModel(input_dim=input_dim, seq_len=seq_len, hidden_dim=64, dropout_rate=0.15)
        model._initialize_weights()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        loss_fn = torch.nn.BCEWithLogitsLoss()

        # NOTE(review): the checkpoint directory depends only on the fold, so
        # runs with different features_num share (and presumably overwrite)
        # the same folder - confirm this is intended.
        checkpoint_dir = f'/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/ML_models/eight_sample_11features_test/DL_saved/single_column_11f_test0.1_findauc/fold{fold+1}/'

        # Train and validate on this fold.
        acc, auroc, F1, aurp = train_ch7(
            model,
            train_iter=train_loader,
            val_iter=val_loader,
            loss=loss_fn,
            num_epochs=num_epochs,
            updater=optimizer,
            checkpoint_dir=checkpoint_dir,
        )

        fold_accuracies.append(acc)
        fold_auroc.append(auroc)
        fold_F1.append(F1)
        fold_aurp.append(aurp)

    # Mean and variance of each metric across the folds
    # (np.var defaults to the population variance, ddof=0).
    mean_acc = np.mean(fold_accuracies)
    acc_variance = np.var(fold_accuracies)
    mean_auroc = np.mean(fold_auroc)
    auroc_variance = np.var(fold_auroc)
    mean_F1 = np.mean(fold_F1)
    F1_variance = np.var(fold_F1)
    mean_aurp = np.mean(fold_aurp)
    aurp_variance = np.var(fold_aurp)

    # Summary row for this features_num; key order defines the CSV column order.
    results = {
        "features_num": features_num,
        "mean_accuracy": mean_acc,
        "accuracy_variance": acc_variance,
        "mean_auroc": mean_auroc,
        "auroc_variance": auroc_variance,
        "mean_F1": mean_F1,
        "F1_variance": F1_variance,
        "mean_aurp": mean_aurp,
        "aurp_variance": aurp_variance
    }

    all_results.append(results)

# Destination CSV for the reversed-order ablation summary (one row per features_num).
csv_file = '/BioII/lulab_b/huangkeyun/zhangys/alkb-seq/ML_models/eight_sample_11features_test/5fold_features_ablation/cross_ablation_reveresd_results.csv'

# Whether the file was already present before this run.
file_exists = os.path.isfile(csv_file)
# A header is needed for a brand-new file and also for an existing but empty
# one (e.g. left behind by an earlier run that failed before writing a row).
needs_header = not file_exists or os.path.getsize(csv_file) == 0

if all_results:
    # Create the output folder if needed so hours of training results are not
    # lost to a missing directory.
    os.makedirs(os.path.dirname(csv_file), exist_ok=True)

    # Take the column names from the collected rows rather than from the loop
    # variable `results`, which is undefined (or stale from another cell run)
    # when the loop above produced nothing.
    fieldnames = list(all_results[0].keys())

    # Append mode keeps rows from earlier runs; re-running this cell therefore
    # adds the same rows again.
    with open(csv_file, mode='a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(all_results)

    print("Experiment completed and results saved.")
else:
    print("No experiment results to save.")

Running with features_num = 640
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 600
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 560
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 520
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 480
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 440
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 400
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 360
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Running with features_num = 320
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training