In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os
import random
import os

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)  # 你可以换成其它数字
# 数据集类
class ProteinNPYDataset(Dataset):
    def __init__(self, pos_path, neg_path):
        self.pos = np.load(pos_path, mmap_mode='r')
        self.neg = np.load(neg_path, mmap_mode='r')
        self.lengths = [len(self.pos), len(self.neg)]
        self.total_len = self.lengths[0] + self.lengths[1]

    def __len__(self):
        return self.total_len

    def __getitem__(self, idx):
        if idx < self.lengths[0]:
            x = self.pos[idx]
            y = 1
        else:
            x = self.neg[idx - self.lengths[0]]
            y = 0
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.long)

In [42]:
class MLPExperts(nn.Module):
    def __init__(self, d_model, d_ff, num_experts):
        super().__init__()
        self.num_experts = num_experts
        self.fc1 = nn.Linear(d_model, d_ff * num_experts, bias=True)
        self.fc2 = nn.Linear(d_ff, d_model, bias=True)
        self.d_ff = d_ff
    def forward(self, x, expert_idx):
        # x: [B*L, d_model], expert_idx: [B*L, k]
        all_hidden = self.fc1(x)  # [B*L, d_ff * num_experts]
        all_hidden = all_hidden.view(x.size(0), self.num_experts, self.d_ff)  # [B*L, num_experts, d_ff]
        out = []
        for i in range(expert_idx.size(1)):
            idx = expert_idx[:, i]  # [B*L]
            hidden = all_hidden[torch.arange(x.size(0)), idx]  # [B*L, d_ff]
            hidden = F.gelu(hidden)
            out_i = self.fc2(hidden)  # [B*L, d_model]
            out.append(out_i)
        out = torch.stack(out, dim=1)  # [B*L, k, d_model]
        return out
class NoisyTopKMoE(nn.Module):
    def __init__(self, d_model, d_ff, num_experts=30, k=2, noisy_std=1.0):
        super().__init__()
        self.num_experts = num_experts
        self.k = k
        self.noisy_std = noisy_std
        self.experts = MLPExperts(d_model, d_ff, num_experts)
        self.gate = nn.Linear(d_model, num_experts)
    def forward(self, x):
        # x: [B, L, d_model]
        B, L, D = x.shape
        x_flat = x.reshape(-1, D)  # [B*L, D]
        gate_logits = self.gate(x_flat)  # [B*L, num_experts]
        # Noisy gating
        if self.training and self.noisy_std > 0:
            noise = torch.randn_like(gate_logits) * self.noisy_std
            gate_logits = gate_logits + noise
        gate_scores = F.softmax(gate_logits, dim=-1)  # [B*L, num_experts]

          # 稀疏路由：只选top-k
        topk_val, topk_idx = torch.topk(gate_scores, self.k, dim=-1)  # [B*L, k]
        # 负载均衡损失（新版，防止爆炸）
        meangate = gate_scores.mean(dim=0)  # [num_experts]
        load_balance_loss = (meangate * meangate).sum() * (self.num_experts ** 2)
        # 专家并行输出
        expert_outs = self.experts(x_flat, topk_idx)  # [B*L, k, d_model]
        topk_val = topk_val / (topk_val.sum(dim=-1, keepdim=True) + 1e-9)
        moe_out = (expert_outs * topk_val.unsqueeze(-1)).sum(dim=1)  # [B*L, d_model]
        moe_out = moe_out.view(B, L, D)
        return moe_out, load_balance_loss

In [43]:
class TransformerMoEBlock(nn.Module):
    def __init__(self, d_model, nhead, d_ff, num_experts=30, k=3, dropout=0.1, noisy_std=1.0):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.moe = NoisyTopKMoE(d_model, d_ff, num_experts, k, noisy_std)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        attn_out, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_out)
        x = self.norm1(x)
        moe_out, load_balance_loss = self.moe(x)
        x = x + self.dropout(moe_out)
        x = self.norm2(x)
        return x, load_balance_loss

class TransformerMoE(nn.Module):
    def __init__(self, d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=2, dropout=0.1, noisy_std=1.0, num_classes=2):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerMoEBlock(d_model, nhead, d_ff, num_experts, k, dropout, noisy_std)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, num_classes)
        )
    def forward(self, x):
        total_load_balance_loss = 0
        for layer in self.layers:
            x, lb_loss = layer(x)
            total_load_balance_loss += lb_loss
        x = x.mean(dim=1)  # 池化
        logits = self.classifier(x)
        return logits, total_load_balance_loss

### 普通transformer

In [44]:

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        return self.fc2(self.dropout(F.gelu(self.fc1(x))))

class TransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.dropout = nn.Dropout(dropout)
        # ✅ 完全移除位置编码
    
    def forward(self, x):
        # ✅ 直接处理输入，不添加位置编码
        attn_out, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_out)
        x = self.norm1(x)
        
        ffn_out = self.ffn(x)
        x = x + self.dropout(ffn_out)
        x = self.norm2(x)
        
        return x, 0.0

class Transformer(nn.Module):
    def __init__(self, d_model=1152, nhead=8, d_ff=2048, num_layers=4, dropout=0.1, num_classes=2):
        super().__init__()
        # ✅ 不使用位置编码，与TransformerMoE保持一致
        self.layers = nn.ModuleList([
            TransformerBlock(d_model, nhead, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, num_classes)
        )
        
    def forward(self, x):
        total_load_balance_loss = 0.0
        for layer in self.layers:
            x, _ = layer(x)
        x = x.mean(dim=1)  # 全局平均池化
        logits = self.classifier(x)
        return logits, total_load_balance_loss

In [45]:
def eval_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    all_probs = []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            probs = torch.softmax(logits, dim=1)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
            all_probs.extend(probs[:, 1].cpu().numpy())  # 正类概率
    
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef,
        confusion_matrix, roc_auc_score, average_precision_score
    )
    
    # 计算混淆矩阵
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    
    # 计算所有指标
    acc = accuracy_score(all_labels, all_preds)
    pre = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    mcc = matthews_corrcoef(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)
    auprc = average_precision_score(all_labels, all_probs)
    sn = tp / (tp + fn) if (tp + fn) > 0 else 0  # 敏感性
    sp = tn / (tn + fp) if (tn + fp) > 0 else 0  # 特异性
    
    print(f"Test ACC: {acc:.4f}, PRE: {pre:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
    print(f"Test AUC: {auc:.4f}, AUPRC: {auprc:.4f}, SN: {sn:.4f}, SP: {sp:.4f}")
    return acc, pre, rec, f1, mcc, auc, auprc, sn, sp
def train_one_epoch(model, loader, optimizer, criterion, device, moe_loss_weight=0.01, scaler=None):
    model.train()
    total_loss = 0
    for x, y in tqdm(loader, desc="Training", leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with autocast():  # 开启混合精度
            logits, lb_loss = model(x)
            loss = criterion(logits, y) + moe_loss_weight * lb_loss
        scaler.scale(loss).backward()      # 用scaler缩放loss反向传播
        scaler.step(optimizer)             # 用scaler.step更新参数
        scaler.update()                    # 更新scaler状态
        total_loss += loss.item()
    return total_loss / len(loader)

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [47]:
from torch.cuda.amp import autocast, GradScaler
# 数据路径
train_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_train_embedding.npy'
train_neg = '/exp_data/sjx/star/gan_data/negative_train_all_combined.npy'

train_dataset = ProteinNPYDataset(train_pos, train_neg)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(
    d_model=1152, nhead=8, d_ff=2048, num_layers=4,dropout=0.1,num_classes=2
).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

scaler = GradScaler()  # 在训练前初始化


def train_one_epoch_putong(model, loader, optimizer, criterion, device, moe_loss_weight=0.01, scaler=None):
    model.train()
    total_loss = 0
    for x, y in tqdm(loader, desc="Training", leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with autocast():
            logits, lb_loss = model(x)
            # ❌ 普通Transformer的lb_loss=0.0，这里应该设为0
            loss = criterion(logits, y)  
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(loader)
# 初始化scaler
scaler = GradScaler()

# 训练主循环
epochs = 10
best_acc = 0
best_state = None
best_path = "/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_best.pth"
last_path = "/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth"

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    train_loss = train_one_epoch_putong(model, train_loader, optimizer, criterion, device, scaler=scaler)
    print(f"Train Loss: {train_loss:.4f}")
    # 保存最后一次模型权重
    torch.save(model.state_dict(), last_path)
    print(f"Last model saved at epoch {epoch+1} ({last_path})")

  scaler = GradScaler()  # 在训练前初始化
  scaler = GradScaler()



Epoch 1/10


  with autocast():
                                                           

Train Loss: 0.3351
Last model saved at epoch 1 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 2/10


                                                           

Train Loss: 0.1839
Last model saved at epoch 2 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 3/10


                                                           

Train Loss: 0.1823
Last model saved at epoch 3 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 4/10


                                                           

Train Loss: 0.1536
Last model saved at epoch 4 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 5/10


                                                           

Train Loss: 0.1451
Last model saved at epoch 5 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 6/10


                                                           

Train Loss: 0.1527
Last model saved at epoch 6 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 7/10


                                                           

Train Loss: 0.1298
Last model saved at epoch 7 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 8/10


                                                           

Train Loss: 0.1249
Last model saved at epoch 8 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 9/10


                                                           

Train Loss: 0.1106
Last model saved at epoch 9 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)

Epoch 10/10


                                                           

Train Loss: 0.0936
Last model saved at epoch 10 (/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth)


In [48]:
# 1. 加载模型
model = Transformer(
    d_model=1152, nhead=8, d_ff=2048, num_layers=4,dropout=0.1,num_classes=2
).to(device)
model.load_state_dict(torch.load('/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth', map_location=device))
model.eval()

test_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_test_embedding.npy'
test_neg = '/exp_data/sjx/star/first_data/ESM-embedding/negative_test_embedding.npy'
test_dataset = ProteinNPYDataset(test_pos, test_neg)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

def eval_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    all_probs = []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            probs = torch.softmax(logits, dim=1)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
            all_probs.extend(probs[:, 1].cpu().numpy())  # 正类概率
    
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef,
        confusion_matrix, roc_auc_score, average_precision_score
    )
    
    # 计算混淆矩阵
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    
    # 计算所有指标
    acc = accuracy_score(all_labels, all_preds)
    pre = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    mcc = matthews_corrcoef(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)
    auprc = average_precision_score(all_labels, all_probs)
    sn = tp / (tp + fn) if (tp + fn) > 0 else 0  # 敏感性
    sp = tn / (tn + fp) if (tn + fp) > 0 else 0  # 特异性
    
    print(f"Test ACC: {acc:.4f}, PRE: {pre:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
    print(f"Test AUC: {auc:.4f}, AUPRC: {auprc:.4f}, SN: {sn:.4f}, SP: {sp:.4f}")
    return acc, pre, rec, f1, mcc, auc, auprc, sn, sp


# 测试
eval_model(model, test_loader, device)

  model.load_state_dict(torch.load('/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_last.pth', map_location=device))


Test ACC: 0.9112, PRE: 0.9242, REC: 0.9535, F1: 0.9386, MCC: 0.7796
Test AUC: 0.9664, AUPRC: 0.9862, SN: 0.9535, SP: 0.8066


(0.9112271540469974,
 0.9241706161137441,
 0.9535452322738386,
 0.9386281588447654,
 0.7796373962089861,
 0.9663629514178713,
 0.9862288602159184,
 0.9535452322738386,
 0.8066465256797583)

In [51]:
# 重新训练普通Transformer - 修正版本（使用最后轮权重）
from torch.cuda.amp import autocast, GradScaler

# 数据加载
train_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_train_embedding.npy'
train_neg = '/exp_data/sjx/star/gan_data/negative_train_all_combined.npy'
test_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_test_embedding.npy'
test_neg = '/exp_data/sjx/star/first_data/ESM-embedding/negative_test_embedding.npy'

train_dataset = ProteinNPYDataset(train_pos, train_neg)
test_dataset = ProteinNPYDataset(test_pos, test_neg)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 创建普通Transformer模型
model = Transformer(
    d_model=1152, nhead=8, d_ff=2048, num_layers=4, dropout=0.1, num_classes=2
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
scaler = GradScaler()

# 修正的训练函数
def train_one_epoch_transformer(model, loader, optimizer, criterion, device, scaler=None):
    model.train()
    total_loss = 0
    for x, y in tqdm(loader, desc="Training", leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with autocast():
            logits, _ = model(x)  # 普通Transformer的lb_loss=0.0
            loss = criterion(logits, y)  # ✅ 只使用分类损失
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(loader)

print("=== 重新训练普通Transformer (消融实验baseline) ===")
epochs = 10

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    train_loss = train_one_epoch_transformer(model, train_loader, optimizer, criterion, device, scaler)
    print(f"Train Loss: {train_loss:.4f}")
    
    # ✅ 移除测试集验证，避免过拟合
    # 只在训练期间打印训练损失，不在测试集上选择模型

# ✅ 保存最后一轮训练后的模型权重
save_path = "/exp_data/sjx/star/experiments/xiaorongshiyan/transformer_baseline_final.pth"
torch.save(model.state_dict(), save_path)
print(f"\n✅ 普通Transformer最终权重保存至: {save_path}")

print("\n" + "="*70)
print("🔥 消融实验对比结果")
print("="*70)
print("📊 TransformerMoE (你的SOTA方法):")
print("   ACC: 0.9225, PRE: 0.9483, REC: 0.9425, F1: 0.9454, MCC: 0.8120")
print("   AUC: 0.9685, AUPRC: 0.9869, SN: 0.9425, SP: 0.8731")

print(f"\n📈 普通Transformer (训练15轮后的最终模型):")
# ✅ 使用训练完成后的最终模型进行测试
final_results = eval_model(model, test_loader, device)

print(f"\n🚀 MoE架构带来的性能提升:")
f1_improvement = 0.9454 - final_results[3]
mcc_improvement = 0.8120 - final_results[4]
auc_improvement = 0.9685 - final_results[5]

print(f"   F1提升: +{f1_improvement:.4f} ({(f1_improvement/final_results[3]*100):.1f}%)")
print(f"   MCC提升: +{mcc_improvement:.4f}")
print(f"   AUC提升: +{auc_improvement:.4f}")

if f1_improvement > 0.02:
    print("   ✨ MoE架构显著提升了模型性能!")
elif f1_improvement > 0.005:
    print("   👍 MoE架构有一定性能提升")
else:
    print("   🤔 MoE架构提升有限，需要进一步分析")

print(f"\n📋 实验设置说明:")
print(f"   - 普通Transformer: 训练{epochs}轮，使用最终权重测试")
print(f"   - TransformerMoE: 训练10轮，使用最终权重测试")
print(f"   - 两者使用相同的训练数据和超参数")
print(f"   - 避免基于测试集选择模型，确保公平对比")

  scaler = GradScaler()


=== 重新训练普通Transformer (消融实验baseline) ===

Epoch 1/10


  with autocast():
                                                           

Train Loss: 0.3558

Epoch 2/10


                                                           

Train Loss: 0.1823

Epoch 3/10


                                                           

Train Loss: 0.1662

Epoch 4/10


                                                           

Train Loss: 0.1516

Epoch 5/10


                                                           

Train Loss: 0.1483

Epoch 6/10


                                                           

Train Loss: 0.1452

Epoch 7/10


                                                           

Train Loss: 0.1369

Epoch 8/10


                                                           

Train Loss: 0.1247

Epoch 9/10


                                                           

Train Loss: 0.1181

Epoch 10/10


                                                           

Train Loss: 0.1033

✅ 普通Transformer最终权重保存至: /exp_data/sjx/star/experiments/xiaorongshiyan/transformer_baseline_final.pth

🔥 消融实验对比结果
📊 TransformerMoE (你的SOTA方法):
   ACC: 0.9225, PRE: 0.9483, REC: 0.9425, F1: 0.9454, MCC: 0.8120
   AUC: 0.9685, AUPRC: 0.9869, SN: 0.9425, SP: 0.8731

📈 普通Transformer (训练15轮后的最终模型):
Test ACC: 0.9199, PRE: 0.9211, REC: 0.9707, F1: 0.9452, MCC: 0.8005
Test AUC: 0.9631, AUPRC: 0.9831, SN: 0.9707, SP: 0.7946

🚀 MoE架构带来的性能提升:
   F1提升: +0.0002 (0.0%)
   MCC提升: +0.0115
   AUC提升: +0.0054
   🤔 MoE架构提升有限，需要进一步分析

📋 实验设置说明:
   - 普通Transformer: 训练10轮，使用最终权重测试
   - TransformerMoE: 训练10轮，使用最终权重测试
   - 两者使用相同的训练数据和超参数
   - 避免基于测试集选择模型，确保公平对比


### 不同层数

In [49]:
# 数据加载
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_train_embedding.npy'
train_neg = '/exp_data/sjx/star/gan_data/negative_train_all_combined.npy'
test_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_test_embedding.npy'
test_neg = '/exp_data/sjx/star/first_data/ESM-embedding/negative_test_embedding.npy'

train_dataset = ProteinNPYDataset(train_pos, train_neg)
test_dataset = ProteinNPYDataset(test_pos, test_neg)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

# 实验不同层数
layer_configs = [1, 2, 3, 4]
results = {}

for num_layers in layer_configs:
    print(f"\n{'='*50}")
    print(f"训练 {num_layers}层 TransformerMoE")
    print(f"{'='*50}")
    
    # 创建模型
    model = TransformerMoE(
        d_model=1152, 
        nhead=8, 
        d_ff=2048, 
        num_layers=num_layers, 
        num_experts=30, 
        k=3, 
        dropout=0.1, 
        noisy_std=1.0, 
        num_classes=2
    ).to(device)
    
    # 优化器和损失函数
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()
    
    # 训练
    epochs = 10
    
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device, 0.01, scaler)
        print(f"Train Loss: {train_loss:.4f}")
    
    # 保存训练完成后的最终模型权重
    save_path = f"/exp_data/sjx/star/experiments/xiaorongshiyan/cegnshu/transformer_moe_{num_layers}layer_final.pth"
    torch.save(model.state_dict(), save_path)
    print(f"\n{num_layers}层模型训练完成，权重保存至: {save_path}")
    
    # 测试最终模型性能
    print(f"\n{num_layers}层TransformerMoE最终测试结果:")
    final_metrics = eval_model(model, test_loader, device)
    results[num_layers] = {
        'acc': final_metrics[0], 'pre': final_metrics[1], 'rec': final_metrics[2],
        'f1': final_metrics[3], 'mcc': final_metrics[4], 'auc': final_metrics[5],
        'auprc': final_metrics[6], 'sn': final_metrics[7], 'sp': final_metrics[8]
    }

# 汇总所有结果
print(f"\n{'='*70}")
print("所有层数TransformerMoE结果汇总")
print(f"{'='*70}")
print(f"{'Layers':<8}{'ACC':<8}{'PRE':<8}{'REC':<8}{'F1':<8}{'MCC':<8}{'AUC':<8}{'AUPRC':<8}")
print(f"{'-'*70}")

for layers in layer_configs:
    r = results[layers]
    print(f"{layers:<8}{r['acc']:<8.4f}{r['pre']:<8.4f}{r['rec']:<8.4f}{r['f1']:<8.4f}{r['mcc']:<8.4f}{r['auc']:<8.4f}{r['auprc']:<8.4f}")
  



训练 1层 TransformerMoE


  scaler = GradScaler()



Epoch 1/10


  with autocast():  # 开启混合精度
                                                           

Train Loss: 0.6056

Epoch 2/10


                                                           

Train Loss: 0.4693

Epoch 3/10


                                                           

Train Loss: 0.4554

Epoch 4/10


                                                           

Train Loss: 0.4467

Epoch 5/10


                                                           

Train Loss: 0.4434

Epoch 6/10


                                                           

Train Loss: 0.4344

Epoch 7/10


                                                           

Train Loss: 0.4238

Epoch 8/10


                                                           

Train Loss: 0.4130

Epoch 9/10


                                                           

Train Loss: 0.4137

Epoch 10/10


                                                           

Train Loss: 0.3933

1层模型训练完成，权重保存至: /exp_data/sjx/star/experiments/xiaorongshiyan/cegnshu/transformer_moe_1layer_final.pth

1层TransformerMoE最终测试结果:
Test ACC: 0.9051, PRE: 0.9297, REC: 0.9377, F1: 0.9337, MCC: 0.7673
Test AUC: 0.9543, AUPRC: 0.9804, SN: 0.9377, SP: 0.8248

训练 2层 TransformerMoE


  scaler = GradScaler()



Epoch 1/10


  with autocast():  # 开启混合精度
                                                           

Train Loss: 0.8998

Epoch 2/10


                                                           

Train Loss: 0.7754

Epoch 3/10


                                                           

Train Loss: 0.7541

Epoch 4/10


                                                           

Train Loss: 0.7466

Epoch 5/10


                                                           

Train Loss: 0.7405

Epoch 6/10


                                                           

Train Loss: 0.7311

Epoch 7/10


                                                           

Train Loss: 0.7093

Epoch 8/10


                                                           

Train Loss: 0.7040

Epoch 9/10


                                                           

Train Loss: 0.6904

Epoch 10/10


                                                           

Train Loss: 0.6726

2层模型训练完成，权重保存至: /exp_data/sjx/star/experiments/xiaorongshiyan/cegnshu/transformer_moe_2layer_final.pth

2层TransformerMoE最终测试结果:
Test ACC: 0.9034, PRE: 0.9645, REC: 0.8973, F1: 0.9297, MCC: 0.7811
Test AUC: 0.9621, AUPRC: 0.9852, SN: 0.8973, SP: 0.9184

训练 3层 TransformerMoE


  scaler = GradScaler()



Epoch 1/10


  with autocast():  # 开启混合精度
                                                           

Train Loss: 1.2840

Epoch 2/10


                                                           

Train Loss: 1.0787

Epoch 3/10


                                                           

Train Loss: 1.0686

Epoch 4/10


                                                           

Train Loss: 1.0517

Epoch 5/10


                                                           

Train Loss: 1.0491

Epoch 6/10


                                                           

Train Loss: 1.0400

Epoch 7/10


                                                           

Train Loss: 1.0369

Epoch 8/10


                                                           

Train Loss: 1.0193

Epoch 9/10


                                                           

Train Loss: 1.0160

Epoch 10/10


                                                           

Train Loss: 0.9982

3层模型训练完成，权重保存至: /exp_data/sjx/star/experiments/xiaorongshiyan/cegnshu/transformer_moe_3layer_final.pth

3层TransformerMoE最终测试结果:
Test ACC: 0.9182, PRE: 0.9458, REC: 0.9389, F1: 0.9423, MCC: 0.8017
Test AUC: 0.9611, AUPRC: 0.9820, SN: 0.9389, SP: 0.8671

训练 4层 TransformerMoE


  scaler = GradScaler()



Epoch 1/10


  with autocast():  # 开启混合精度
                                                           

Train Loss: 1.5557

Epoch 2/10


                                                           

Train Loss: 1.3823

Epoch 3/10


                                                           

Train Loss: 1.3616

Epoch 4/10


                                                           

Train Loss: 1.3555

Epoch 5/10


                                                           

Train Loss: 1.3458

Epoch 6/10


                                                           

Train Loss: 1.3378

Epoch 7/10


                                                           

Train Loss: 1.3211

Epoch 8/10


                                                           

Train Loss: 1.3190

Epoch 9/10


                                                           

Train Loss: 1.3105

Epoch 10/10


                                                           

Train Loss: 1.3073

4层模型训练完成，权重保存至: /exp_data/sjx/star/experiments/xiaorongshiyan/cegnshu/transformer_moe_4layer_final.pth

4层TransformerMoE最终测试结果:
Test ACC: 0.9191, PRE: 0.9300, REC: 0.9584, F1: 0.9440, MCC: 0.7993
Test AUC: 0.9329, AUPRC: 0.9480, SN: 0.9584, SP: 0.8218

所有层数TransformerMoE结果汇总
Layers  ACC     PRE     REC     F1      MCC     AUC     AUPRC   
----------------------------------------------------------------------
1       0.9051  0.9297  0.9377  0.9337  0.7673  0.9543  0.9804  
2       0.9034  0.9645  0.8973  0.9297  0.7811  0.9621  0.9852  
3       0.9182  0.9458  0.9389  0.9423  0.8017  0.9611  0.9820  
4       0.9191  0.9300  0.9584  0.9440  0.7993  0.9329  0.9480  


In [54]:
# 严格控制随机性的训练对比
import torch
import numpy as np
import random
import os

def set_deterministic_seed(seed=42):
    """设置更严格的确定性种子"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
    # 更严格的确定性设置
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True, warn_only=True)

# 重新训练4层模型，使用更严格的随机控制
set_deterministic_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 创建模型
model_new = TransformerMoE(
    d_model=1152, nhead=8, d_ff=2048, num_layers=4, 
    num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2
).to(device)

# 使用相同的训练设置
optimizer = torch.optim.AdamW(model_new.parameters(), lr=2e-4, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
scaler = GradScaler()

print("=== 重新训练4层TransformerMoE (严格随机控制) ===")
epochs = 10

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    train_loss = train_one_epoch(model_new, train_loader, optimizer, criterion, device, 0.01, scaler)
    print(f"Train Loss: {train_loss:.4f}")

# 测试新训练的模型
print("\n=== 新训练的4层模型测试结果 ===")
new_results = eval_model(model_new, test_loader, device)

# 加载你的SOTA模型进行对比
model_sota = TransformerMoE(
    d_model=1152, nhead=8, d_ff=2048, num_layers=4, 
    num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2
).to(device)
model_sota.load_state_dict(torch.load('/exp_data/sjx/star/main_transformer_moe_weight/best_transformer_moe_last.pth', map_location=device))

print("\n=== SOTA模型测试结果 ===")
sota_results = eval_model(model_sota, test_loader, device)

print("\n" + "="*60)
print("📊 结果对比分析")
print("="*60)
print(f"SOTA模型 F1: {sota_results[3]:.4f}")
print(f"新训练模型 F1: {new_results[3]:.4f}")
print(f"F1差异: {abs(sota_results[3] - new_results[3]):.4f}")

if abs(sota_results[3] - new_results[3]) < 0.01:
    print("✅ 结果基本一致，差异在合理范围内")
elif abs(sota_results[3] - new_results[3]) < 0.02:
    print("⚠️ 有一定差异，但仍在可接受范围")
else:
    print("❗ 差异较大，可能存在其他因素")

  scaler = GradScaler()


=== 重新训练4层TransformerMoE (严格随机控制) ===

Epoch 1/10


  with autocast():  # 开启混合精度
  proj = linear(q, w, b)
  attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
  attn_output = torch.bmm(attn_output_weights, v)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
                                                           

Train Loss: 1.6783

Epoch 2/10


                                                           

Train Loss: 1.3769

Epoch 3/10


                                                           

Train Loss: 1.3604

Epoch 4/10


                                                           

Train Loss: 1.3480

Epoch 5/10


                                                           

Train Loss: 1.3414

Epoch 6/10


                                                           

Train Loss: 1.3197

Epoch 7/10


                                                           

Train Loss: 1.3178

Epoch 8/10


                                                           

Train Loss: 1.2931

Epoch 9/10


                                                           

Train Loss: 1.2949

Epoch 10/10


                                                           

Train Loss: 1.2835

=== 新训练的4层模型测试结果 ===


  return torch._native_multi_head_attention(


Test ACC: 0.9104, PRE: 0.9441, REC: 0.9291, F1: 0.9365, MCC: 0.7843


  model_sota.load_state_dict(torch.load('/exp_data/sjx/star/main_transformer_moe_weight/best_transformer_moe_last.pth', map_location=device))



=== SOTA模型测试结果 ===


  return torch._native_multi_head_attention(


Test ACC: 0.9225, PRE: 0.9483, REC: 0.9425, F1: 0.9454, MCC: 0.8120

📊 结果对比分析
SOTA模型 F1: 0.9454
新训练模型 F1: 0.9365
F1差异: 0.0089
✅ 结果基本一致，差异在合理范围内


In [None]:
  # 使用最佳模型进行最终测试
    model.load_state_dict(best_state)
    final_metrics = eval_model(model, test_loader, device)
    results[num_layers] = {
        'acc': final_metrics[0], 'pre': final_metrics[1], 'rec': final_metrics[2],
        'f1': final_metrics[3], 'mcc': final_metrics[4], 'auc': final_metrics[5],
        'auprc': final_metrics[6], 'sn': final_metrics[7], 'sp': final_metrics[8]
    }
    
    print(f"\n{num_layers}层TransformerMoE最终测试结果:")
    print(f"ACC: {final_metrics[0]:.4f}, PRE: {final_metrics[1]:.4f}, REC: {final_metrics[2]:.4f}")
    print(f"F1: {final_metrics[3]:.4f}, MCC: {final_metrics[4]:.4f}")
    print(f"AUC: {final_metrics[5]:.4f}, AUPRC: {final_metrics[6]:.4f}")
    print(f"SN: {final_metrics[7]:.4f}, SP: {final_metrics[8]:.4f}")

# 汇总所有结果
print(f"\n{'='*70}")
print("所有层数TransformerMoE结果汇总")
print(f"{'='*70}")
print(f"{'Layers':<8}{'ACC':<8}{'PRE':<8}{'REC':<8}{'F1':<8}{'MCC':<8}{'AUC':<8}{'AUPRC':<8}")
print(f"{'-'*70}")

for layers in layer_configs:
    r = results[layers]
    print(f"{layers:<8}{r['acc']:<8.4f}{r['pre']:<8.4f}{r['rec']:<8.4f}{r['f1']:<8.4f}{r['mcc']:<8.4f}{r['auc']:<8.4f}{r['auprc']:<8.4f}")

### 我的最佳模型测试集性能

In [53]:
# 假设模型结构和ProteinNPYDataset已定义，device已设置
import torch

# 1. 加载模型
model = TransformerMoE(
    d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2
).to(device)
model.load_state_dict(torch.load('/exp_data/sjx/star/main_transformer_moe_weight/best_transformer_moe_last.pth', map_location=device))
model.eval()

# 2. 加载测试集
test_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_test_embedding.npy'
test_neg = '/exp_data/sjx/star/first_data/ESM-embedding/negative_test_embedding.npy'
test_dataset = ProteinNPYDataset(test_pos, test_neg)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

# 3. 定义评估函数
def eval_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
    acc = accuracy_score(all_labels, all_preds)
    pre = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    mcc = matthews_corrcoef(all_labels, all_preds)
    print(f"Test ACC: {acc:.4f}, PRE: {pre:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
    return acc, pre, rec, f1, mcc

# 4. 测试
eval_model(model, test_loader, device)

  model.load_state_dict(torch.load('/exp_data/sjx/star/main_transformer_moe_weight/best_transformer_moe_last.pth', map_location=device))


Test ACC: 0.9225, PRE: 0.9483, REC: 0.9425, F1: 0.9454, MCC: 0.8120


(0.9225413402959095,
 0.948339483394834,
 0.9425427872860636,
 0.9454322501532803,
 0.8120485793877618)

# 记录
第一次参数设置 d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=2, dropout=0.1, noisy_std=1.0, num_classes=2 epoch=10

结果 Test ACC: 0.9173, PRE: 0.9199, REC: 0.9682, F1: 0.9434, MCC: 0.7939

第二次参数设置 d_model=1152, nhead=8, d_ff=2048, num_layers=5, num_experts=30, k=2, dropout=0.2, noisy_std=1.0, num_classes=2 epoch=20

结果 Test ACC: 0.8973, PRE: 0.9464, REC: 0.9071, F1: 0.9263, MCC: 0.7589

第三次参数设置 d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=2, dropout=0.1, noisy_std=1.0, num_classes=2 epoch=15
结果Test ACC: 0.9121, PRE: 0.9544, REC: 0.9205, F1: 0.9371, MCC: 0.7926

第四次参数设置d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=2, dropout=0.1, noisy_std=1.0, num_classes=2 epoch=10
数据集变成不加gan

结果：Test ACC: 0.8808, PRE: 0.9570, REC: 0.8716, F1: 0.9123, MCC: 0.7350

第五次参数设置d_model=1152, nhead=8, d_ff=2048, num_layers=3, num_experts=30, k=2, dropout=0.1, noisy_std=1.0, num_classes=2 epoch=10

结果：Test ACC: 0.9017, PRE: 0.9094, REC: 0.9572, F1: 0.9327, MCC: 0.7540

第六次参数设置 d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=2, dropout=0.1, noisy_std=1.0, num_classes=2 epoch=10 加上了一个kaming初始化

结果：Test ACC: 0.9130, PRE: 0.9367, REC: 0.9413, F1: 0.9390, MCC: 0.7871

第七次参数设置 d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=40, k=2, dropout=0.1, noisy_std=1.0, num_classes=2

结果：Test ACC: 0.8782, PRE: 0.9508, REC: 0.8741, F1: 0.9108, MCC: 0.7260

第八次参数设置 d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=35, k=2, dropout=0.1, noisy_std=1.0, num_classes=2

结果Test ACC: 0.9112, PRE: 0.9409, REC: 0.9340, F1: 0.9374, MCC: 0.7848

第九次参数设置d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=25, k=2, dropout=0.1, noisy_std=1.0, num_classes=2

结果Test ACC: 0.8982, PRE: 0.8899, REC: 0.9780, F1: 0.9319, MCC: 0.7452

第十次参数设置 d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=1, dropout=0.1, noisy_std=1.0, num_classes=2 epoch=10

结果：Test ACC: 0.9017, PRE: 0.9657, REC: 0.8936, F1: 0.9283, MCC: 0.7786

第十一次参数设置d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2

结果Test ACC: 0.9225, PRE: 0.9483, REC: 0.9425, F1: 0.9454, MCC: 0.8120


### 十折验证集
参数均使用前边的最佳模型的参数

结果：========== 10-Fold CV Results ==========
Mean ACC: 0.9401 ± 0.0058
Mean PRE: 0.9264
Mean REC: 0.9260
Mean F1:  0.9252
Mean MCC: 0.8523

### 十折测试集
结果：========== 10-Fold Test Results ==========
Mean ACC: 0.9155 ± 0.0048
Mean PRE: 0.9338
Mean REC: 0.9488
Mean F1:  0.9411
Mean MCC: 0.7925

### sota十折训练集
 每一折得分：
        ACC        F1       MCC        SN        SP    Recall  Precision  \
0  0.884783  0.918836  0.720346  0.917431  0.804511  0.917431   0.920245   
1  0.876087  0.915052  0.689696  0.938838  0.721805  0.938838   0.892442   
2  0.891304  0.923313  0.736793  0.920489  0.819549  0.920489   0.926154   
3  0.895425  0.927273  0.741669  0.935780  0.795455  0.935780   0.918919   
4  0.895425  0.928144  0.738638  0.948012  0.765152  0.948012   0.909091   
5  0.877996  0.913846  0.705075  0.908257  0.803030  0.908257   0.919505   
6  0.901961  0.932127  0.756676  0.944954  0.795455  0.944954   0.919643   
7  0.880174  0.917541  0.700500  0.935780  0.742424  0.935780   0.900000   
8  0.886710  0.920973  0.721165  0.926606  0.787879  0.926606   0.915408   
9  0.895425  0.928571  0.737651  0.954128  0.750000  0.954128   0.904348   
 AUC     AUPRC  
0  0.958152  0.983695  
1  0.936746  0.972714  
2  0.944264  0.974113  
3  0.946958  0.978492  
4  0.965110  0.986893  
5  0.922667  0.964155  
6  0.942174  0.974796  
7  0.942591  0.976072  
8  0.944213  0.976352  
9  0.932583  0.959434  

📈 各指标平均 ± 标准差：
ACC: 0.8885 ± 0.0087
F1: 0.9226 ± 0.0063
MCC: 0.7248 ± 0.0212
SN: 0.9330 ± 0.0146
SP: 0.7785 ± 0.0319
Recall: 0.9330 ± 0.0146
Precision: 0.9126 ± 0.0107
AUC: 0.9435 ± 0.0120
AUPRC: 0.9747 ± 0.0081

### 十折交叉验证

In [23]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import StratifiedKFold
from torch.cuda.amp import autocast, GradScaler
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, average_precision_score

import os
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [24]:
def train_one_epoch(model, loader, optimizer, criterion, device, moe_loss_weight=0.01, scaler=None):
    model.train()
    total_loss = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with autocast():
            logits, lb_loss = model(x)
            loss = criterion(logits, y) + moe_loss_weight * lb_loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(loader)

def eval_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
    acc = accuracy_score(all_labels, all_preds)
    pre = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    mcc = matthews_corrcoef(all_labels, all_preds)
    print(f"Val ACC: {acc:.4f}, PRE: {pre:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
    return acc, pre, rec, f1, mcc

In [16]:
train_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_train_embedding.npy'
train_neg = '/exp_data/sjx/star/gan_data/negative_train_all_combined.npy'

# 构建全体索引和标签
pos_len = np.load(train_pos, mmap_mode='r').shape[0]
neg_len = np.load(train_neg, mmap_mode='r').shape[0]
all_indices = np.concatenate([np.arange(pos_len), np.arange(neg_len) + pos_len])
all_labels = np.concatenate([np.ones(pos_len, dtype=int), np.zeros(neg_len, dtype=int)])

# 数据集
full_dataset = ProteinNPYDataset(train_pos, train_neg)

# K折分层
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
all_metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(all_indices, all_labels), 1):
    print(f"\n========== Fold {fold}/10 ==========")
    train_loader = DataLoader(Subset(full_dataset, train_idx), batch_size=64, shuffle=True, num_workers=2)
    val_loader = DataLoader(Subset(full_dataset, val_idx), batch_size=64, shuffle=False, num_workers=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TransformerMoE(
        d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2
    ).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()

    best_acc = 0
    best_state = None
    epochs = 10

    for epoch in range(epochs):
        print(f"\n[Fold {fold}] Epoch {epoch+1}/{epochs}")
        model.train()
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            with autocast():
                logits, lb_loss = model(x)
                loss = criterion(logits, y) + 0.01 * lb_loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        print(f"Train Loss: {total_loss / len(train_loader):.4f}")

        # 验证
        model.eval()
        all_preds, all_labels_fold = [], []
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                logits, _ = model(x)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels_fold.extend(y.cpu().numpy())
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
        acc = accuracy_score(all_labels_fold, all_preds)
        pre = precision_score(all_labels_fold, all_preds)
        rec = recall_score(all_labels_fold, all_preds)
        f1 = f1_score(all_labels_fold, all_preds)
        mcc = matthews_corrcoef(all_labels_fold, all_preds)
        print(f"Val ACC: {acc:.4f}, PRE: {pre:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")

        if acc > best_acc:
            best_acc = acc
            best_state = model.state_dict()
            torch.save(best_state, f"/exp_data/sjx/star/main_transformer_moe_weight/cv_point/best_fold{fold}.pth")
            print(f"Best model saved for fold {fold} at epoch {epoch+1}")

    all_metrics.append((best_acc, pre, rec, f1, mcc))
    print(f"[Fold {fold}] Best ACC: {best_acc:.4f}")

# 汇总结果
all_metrics = np.array(all_metrics)
print("\n========== 10-Fold CV Results ==========")
print(f"Mean ACC: {all_metrics[:,0].mean():.4f} ± {all_metrics[:,0].std():.4f}")
print(f"Mean PRE: {all_metrics[:,1].mean():.4f}± {all_metrics[:,1].std():.4f}")
print(f"Mean REC: {all_metrics[:,2].mean():.4f}± {all_metrics[:,2].std():.4f}")
print(f"Mean F1:  {all_metrics[:,3].mean():.4f}± {all_metrics[:,3].std():.4f}")
print(f"Mean MCC: {all_metrics[:,4].mean():.4f}± {all_metrics[:,4].std():.4f}")



[Fold 1] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.5652
Val ACC: 0.9115, PRE: 0.9327, REC: 0.8872, F1: 0.9094, MCC: 0.8239
Best model saved for fold 1 at epoch 1

[Fold 1] Epoch 2/10


  with autocast():


Train Loss: 1.3694
Val ACC: 0.9252, PRE: 0.9215, REC: 0.9299, F1: 0.9256, MCC: 0.8504
Best model saved for fold 1 at epoch 2

[Fold 1] Epoch 3/10


  with autocast():


Train Loss: 1.3596
Val ACC: 0.9328, PRE: 0.9383, REC: 0.9268, F1: 0.9325, MCC: 0.8657
Best model saved for fold 1 at epoch 3

[Fold 1] Epoch 4/10


  with autocast():


Train Loss: 1.3524
Val ACC: 0.9252, PRE: 0.9515, REC: 0.8963, F1: 0.9231, MCC: 0.8518

[Fold 1] Epoch 5/10


  with autocast():


Train Loss: 1.3447
Val ACC: 0.9298, PRE: 0.9462, REC: 0.9116, F1: 0.9286, MCC: 0.8601

[Fold 1] Epoch 6/10


  with autocast():


Train Loss: 1.3282
Val ACC: 0.9176, PRE: 0.8743, REC: 0.9756, F1: 0.9222, MCC: 0.8408

[Fold 1] Epoch 7/10


  with autocast():


Train Loss: 1.3202
Val ACC: 0.9237, PRE: 0.8798, REC: 0.9817, F1: 0.9280, MCC: 0.8531

[Fold 1] Epoch 8/10


  with autocast():


Train Loss: 1.3109
Val ACC: 0.9405, PRE: 0.9313, REC: 0.9512, F1: 0.9412, MCC: 0.8811
Best model saved for fold 1 at epoch 8

[Fold 1] Epoch 9/10


  with autocast():


Train Loss: 1.3105
Val ACC: 0.9191, PRE: 0.9231, REC: 0.9146, F1: 0.9188, MCC: 0.8382

[Fold 1] Epoch 10/10


  with autocast():


Train Loss: 1.3017
Val ACC: 0.9267, PRE: 0.9142, REC: 0.9421, F1: 0.9279, MCC: 0.8538
[Fold 1] Best ACC: 0.9405


[Fold 2] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.6760
Val ACC: 0.9405, PRE: 0.9706, REC: 0.9083, F1: 0.9384, MCC: 0.8827
Best model saved for fold 2 at epoch 1

[Fold 2] Epoch 2/10


  with autocast():


Train Loss: 1.3859
Val ACC: 0.9237, PRE: 0.9663, REC: 0.8777, F1: 0.9199, MCC: 0.8509

[Fold 2] Epoch 3/10


  with autocast():


Train Loss: 1.3672
Val ACC: 0.9313, PRE: 0.8895, REC: 0.9847, F1: 0.9347, MCC: 0.8676

[Fold 2] Epoch 4/10


  with autocast():


Train Loss: 1.3574
Val ACC: 0.9496, PRE: 0.9428, REC: 0.9572, F1: 0.9499, MCC: 0.8993
Best model saved for fold 2 at epoch 4

[Fold 2] Epoch 5/10


  with autocast():


Train Loss: 1.3480
Val ACC: 0.9481, PRE: 0.9681, REC: 0.9266, F1: 0.9469, MCC: 0.8970

[Fold 2] Epoch 6/10


  with autocast():


Train Loss: 1.3399
Val ACC: 0.9221, PRE: 0.9539, REC: 0.8869, F1: 0.9192, MCC: 0.8463

[Fold 2] Epoch 7/10


  with autocast():


Train Loss: 1.3293
Val ACC: 0.9145, PRE: 0.9721, REC: 0.8532, F1: 0.9088, MCC: 0.8352

[Fold 2] Epoch 8/10


  with autocast():


Train Loss: 1.3222
Val ACC: 0.9496, PRE: 0.9509, REC: 0.9480, F1: 0.9495, MCC: 0.8992

[Fold 2] Epoch 9/10


  with autocast():


Train Loss: 1.3238
Val ACC: 0.9237, PRE: 0.9425, REC: 0.9021, F1: 0.9219, MCC: 0.8481

[Fold 2] Epoch 10/10


  with autocast():


Train Loss: 1.3088
Val ACC: 0.9450, PRE: 0.9396, REC: 0.9511, F1: 0.9453, MCC: 0.8901
[Fold 2] Best ACC: 0.9496


[Fold 3] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.6411
Val ACC: 0.9312, PRE: 0.9490, REC: 0.9113, F1: 0.9298, MCC: 0.8631
Best model saved for fold 3 at epoch 1

[Fold 3] Epoch 2/10


  with autocast():


Train Loss: 1.3839
Val ACC: 0.9327, PRE: 0.9275, REC: 0.9388, F1: 0.9331, MCC: 0.8655
Best model saved for fold 3 at epoch 2

[Fold 3] Epoch 3/10


  with autocast():


Train Loss: 1.3561
Val ACC: 0.9358, PRE: 0.9331, REC: 0.9388, F1: 0.9360, MCC: 0.8716
Best model saved for fold 3 at epoch 3

[Fold 3] Epoch 4/10


  with autocast():


Train Loss: 1.3534
Val ACC: 0.9159, PRE: 0.9359, REC: 0.8930, F1: 0.9139, MCC: 0.8327

[Fold 3] Epoch 5/10


  with autocast():


Train Loss: 1.3500
Val ACC: 0.9297, PRE: 0.9049, REC: 0.9602, F1: 0.9318, MCC: 0.8609

[Fold 3] Epoch 6/10


  with autocast():


Train Loss: 1.3373
Val ACC: 0.9327, PRE: 0.9464, REC: 0.9174, F1: 0.9317, MCC: 0.8658

[Fold 3] Epoch 7/10


  with autocast():


Train Loss: 1.3219
Val ACC: 0.9235, PRE: 0.9511, REC: 0.8930, F1: 0.9211, MCC: 0.8487

[Fold 3] Epoch 8/10


  with autocast():


Train Loss: 1.3185
Val ACC: 0.9235, PRE: 0.9184, REC: 0.9297, F1: 0.9240, MCC: 0.8472

[Fold 3] Epoch 9/10


  with autocast():


Train Loss: 1.3068
Val ACC: 0.9297, PRE: 0.9404, REC: 0.9174, F1: 0.9288, MCC: 0.8596

[Fold 3] Epoch 10/10


  with autocast():


Train Loss: 1.3044
Val ACC: 0.9052, PRE: 0.9585, REC: 0.8471, F1: 0.8994, MCC: 0.8159
[Fold 3] Best ACC: 0.9358


[Fold 4] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.6252
Val ACC: 0.9495, PRE: 0.9401, REC: 0.9602, F1: 0.9501, MCC: 0.8993
Best model saved for fold 4 at epoch 1

[Fold 4] Epoch 2/10


  with autocast():


Train Loss: 1.3803
Val ACC: 0.9480, PRE: 0.9564, REC: 0.9388, F1: 0.9475, MCC: 0.8962

[Fold 4] Epoch 3/10


  with autocast():


Train Loss: 1.3782
Val ACC: 0.9251, PRE: 0.8819, REC: 0.9817, F1: 0.9291, MCC: 0.8556

[Fold 4] Epoch 4/10


  with autocast():


Train Loss: 1.3507
Val ACC: 0.9434, PRE: 0.9475, REC: 0.9388, F1: 0.9432, MCC: 0.8869

[Fold 4] Epoch 5/10


  with autocast():


Train Loss: 1.3403
Val ACC: 0.9343, PRE: 0.9226, REC: 0.9480, F1: 0.9351, MCC: 0.8688

[Fold 4] Epoch 6/10


  with autocast():


Train Loss: 1.3386
Val ACC: 0.9450, PRE: 0.9477, REC: 0.9419, F1: 0.9448, MCC: 0.8899

[Fold 4] Epoch 7/10


  with autocast():


Train Loss: 1.3254
Val ACC: 0.9037, PRE: 0.9074, REC: 0.8991, F1: 0.9032, MCC: 0.8074

[Fold 4] Epoch 8/10


  with autocast():


Train Loss: 1.3125
Val ACC: 0.9205, PRE: 0.9310, REC: 0.9083, F1: 0.9195, MCC: 0.8412

[Fold 4] Epoch 9/10


  with autocast():


Train Loss: 1.3007
Val ACC: 0.9343, PRE: 0.9176, REC: 0.9541, F1: 0.9355, MCC: 0.8692

[Fold 4] Epoch 10/10


  with autocast():


Train Loss: 1.3137
Val ACC: 0.9327, PRE: 0.9174, REC: 0.9511, F1: 0.9339, MCC: 0.8660
[Fold 4] Best ACC: 0.9495


[Fold 5] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.6743
Val ACC: 0.9220, PRE: 0.8966, REC: 0.9541, F1: 0.9244, MCC: 0.8458
Best model saved for fold 5 at epoch 1

[Fold 5] Epoch 2/10


  with autocast():


Train Loss: 1.3820
Val ACC: 0.9281, PRE: 0.9403, REC: 0.9144, F1: 0.9271, MCC: 0.8566
Best model saved for fold 5 at epoch 2

[Fold 5] Epoch 3/10


  with autocast():


Train Loss: 1.3556
Val ACC: 0.9373, PRE: 0.9206, REC: 0.9572, F1: 0.9385, MCC: 0.8753
Best model saved for fold 5 at epoch 3

[Fold 5] Epoch 4/10


  with autocast():


Train Loss: 1.3469
Val ACC: 0.9327, PRE: 0.9174, REC: 0.9511, F1: 0.9339, MCC: 0.8660

[Fold 5] Epoch 5/10


  with autocast():


Train Loss: 1.3418
Val ACC: 0.9266, PRE: 0.8974, REC: 0.9633, F1: 0.9292, MCC: 0.8555

[Fold 5] Epoch 6/10


  with autocast():


Train Loss: 1.3314
Val ACC: 0.9266, PRE: 0.8952, REC: 0.9664, F1: 0.9294, MCC: 0.8559

[Fold 5] Epoch 7/10


  with autocast():


Train Loss: 1.3155
Val ACC: 0.9098, PRE: 0.9408, REC: 0.8746, F1: 0.9065, MCC: 0.8216

[Fold 5] Epoch 8/10


  with autocast():


Train Loss: 1.3304
Val ACC: 0.9144, PRE: 0.8712, REC: 0.9725, F1: 0.9191, MCC: 0.8344

[Fold 5] Epoch 9/10


  with autocast():


Train Loss: 1.3061
Val ACC: 0.9220, PRE: 0.8988, REC: 0.9511, F1: 0.9242, MCC: 0.8455

[Fold 5] Epoch 10/10


  with autocast():


Train Loss: 1.3054
Val ACC: 0.9312, PRE: 0.9379, REC: 0.9235, F1: 0.9307, MCC: 0.8625
[Fold 5] Best ACC: 0.9373


[Fold 6] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.6754
Val ACC: 0.9251, PRE: 0.9696, REC: 0.8777, F1: 0.9213, MCC: 0.8540
Best model saved for fold 6 at epoch 1

[Fold 6] Epoch 2/10


  with autocast():


Train Loss: 1.3838
Val ACC: 0.9358, PRE: 0.9582, REC: 0.9113, F1: 0.9342, MCC: 0.8726
Best model saved for fold 6 at epoch 2

[Fold 6] Epoch 3/10


  with autocast():


Train Loss: 1.3685
Val ACC: 0.9373, PRE: 0.9583, REC: 0.9144, F1: 0.9358, MCC: 0.8755
Best model saved for fold 6 at epoch 3

[Fold 6] Epoch 4/10


  with autocast():


Train Loss: 1.3521
Val ACC: 0.9220, PRE: 0.8730, REC: 0.9878, F1: 0.9268, MCC: 0.8514

[Fold 6] Epoch 5/10


  with autocast():


Train Loss: 1.3478
Val ACC: 0.9434, PRE: 0.9290, REC: 0.9602, F1: 0.9444, MCC: 0.8874
Best model saved for fold 6 at epoch 5

[Fold 6] Epoch 6/10


  with autocast():


Train Loss: 1.3415
Val ACC: 0.9312, PRE: 0.9548, REC: 0.9052, F1: 0.9294, MCC: 0.8636

[Fold 6] Epoch 7/10


  with autocast():


Train Loss: 1.3325
Val ACC: 0.9251, PRE: 0.9696, REC: 0.8777, F1: 0.9213, MCC: 0.8540

[Fold 6] Epoch 8/10


  with autocast():


Train Loss: 1.3147
Val ACC: 0.9434, PRE: 0.9290, REC: 0.9602, F1: 0.9444, MCC: 0.8874

[Fold 6] Epoch 9/10


  with autocast():


Train Loss: 1.3241
Val ACC: 0.9495, PRE: 0.9623, REC: 0.9358, F1: 0.9488, MCC: 0.8994
Best model saved for fold 6 at epoch 9

[Fold 6] Epoch 10/10


  with autocast():


Train Loss: 1.3082
Val ACC: 0.9174, PRE: 0.9691, REC: 0.8624, F1: 0.9126, MCC: 0.8400
[Fold 6] Best ACC: 0.9495


[Fold 7] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.5767
Val ACC: 0.9312, PRE: 0.9147, REC: 0.9511, F1: 0.9325, MCC: 0.8631
Best model saved for fold 7 at epoch 1

[Fold 7] Epoch 2/10


  with autocast():


Train Loss: 1.3784
Val ACC: 0.9388, PRE: 0.9184, REC: 0.9633, F1: 0.9403, MCC: 0.8787
Best model saved for fold 7 at epoch 2

[Fold 7] Epoch 3/10


  with autocast():


Train Loss: 1.3571
Val ACC: 0.9190, PRE: 0.8806, REC: 0.9694, F1: 0.9229, MCC: 0.8422

[Fold 7] Epoch 4/10


  with autocast():


Train Loss: 1.3518
Val ACC: 0.9281, PRE: 0.9217, REC: 0.9358, F1: 0.9287, MCC: 0.8564

[Fold 7] Epoch 5/10


  with autocast():


Train Loss: 1.3414
Val ACC: 0.9297, PRE: 0.9460, REC: 0.9113, F1: 0.9283, MCC: 0.8599

[Fold 7] Epoch 6/10


  with autocast():


Train Loss: 1.3384
Val ACC: 0.9113, PRE: 0.8587, REC: 0.9847, F1: 0.9174, MCC: 0.8316

[Fold 7] Epoch 7/10


  with autocast():


Train Loss: 1.3220
Val ACC: 0.9297, PRE: 0.9377, REC: 0.9205, F1: 0.9290, MCC: 0.8595

[Fold 7] Epoch 8/10


  with autocast():


Train Loss: 1.3190
Val ACC: 0.9312, PRE: 0.9352, REC: 0.9266, F1: 0.9309, MCC: 0.8624

[Fold 7] Epoch 9/10


  with autocast():


Train Loss: 1.3087
Val ACC: 0.9373, PRE: 0.9333, REC: 0.9419, F1: 0.9376, MCC: 0.8747

[Fold 7] Epoch 10/10


  with autocast():


Train Loss: 1.3073
Val ACC: 0.9190, PRE: 0.8848, REC: 0.9633, F1: 0.9224, MCC: 0.8412
[Fold 7] Best ACC: 0.9388


[Fold 8] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.5816
Val ACC: 0.9343, PRE: 0.9410, REC: 0.9266, F1: 0.9337, MCC: 0.8686
Best model saved for fold 8 at epoch 1

[Fold 8] Epoch 2/10


  with autocast():


Train Loss: 1.3854
Val ACC: 0.9266, PRE: 0.8974, REC: 0.9633, F1: 0.9292, MCC: 0.8555

[Fold 8] Epoch 3/10


  with autocast():


Train Loss: 1.3647
Val ACC: 0.9220, PRE: 0.8730, REC: 0.9878, F1: 0.9268, MCC: 0.8514

[Fold 8] Epoch 4/10


  with autocast():


Train Loss: 1.3515
Val ACC: 0.9220, PRE: 0.9395, REC: 0.9021, F1: 0.9204, MCC: 0.8447

[Fold 8] Epoch 5/10


  with autocast():


Train Loss: 1.3449
Val ACC: 0.9159, PRE: 0.8676, REC: 0.9817, F1: 0.9211, MCC: 0.8391

[Fold 8] Epoch 6/10


  with autocast():


Train Loss: 1.3329
Val ACC: 0.9388, PRE: 0.9388, REC: 0.9388, F1: 0.9388, MCC: 0.8777
Best model saved for fold 8 at epoch 6

[Fold 8] Epoch 7/10


  with autocast():


Train Loss: 1.3322
Val ACC: 0.9450, PRE: 0.9396, REC: 0.9511, F1: 0.9453, MCC: 0.8900
Best model saved for fold 8 at epoch 7

[Fold 8] Epoch 8/10


  with autocast():


Train Loss: 1.3325
Val ACC: 0.9312, PRE: 0.9172, REC: 0.9480, F1: 0.9323, MCC: 0.8629

[Fold 8] Epoch 9/10


  with autocast():


Train Loss: 1.3134
Val ACC: 0.9297, PRE: 0.9518, REC: 0.9052, F1: 0.9279, MCC: 0.8604

[Fold 8] Epoch 10/10


  with autocast():


Train Loss: 1.3057
Val ACC: 0.9190, PRE: 0.9177, REC: 0.9205, F1: 0.9191, MCC: 0.8379
[Fold 8] Best ACC: 0.9450


[Fold 9] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.5858
Val ACC: 0.9266, PRE: 0.9240, REC: 0.9297, F1: 0.9268, MCC: 0.8532
Best model saved for fold 9 at epoch 1

[Fold 9] Epoch 2/10


  with autocast():


Train Loss: 1.3787
Val ACC: 0.9281, PRE: 0.9268, REC: 0.9297, F1: 0.9282, MCC: 0.8563
Best model saved for fold 9 at epoch 2

[Fold 9] Epoch 3/10


  with autocast():


Train Loss: 1.3666
Val ACC: 0.9343, PRE: 0.9410, REC: 0.9266, F1: 0.9337, MCC: 0.8686
Best model saved for fold 9 at epoch 3

[Fold 9] Epoch 4/10


  with autocast():


Train Loss: 1.3516
Val ACC: 0.9190, PRE: 0.9053, REC: 0.9358, F1: 0.9203, MCC: 0.8384

[Fold 9] Epoch 5/10


  with autocast():


Train Loss: 1.3365
Val ACC: 0.9297, PRE: 0.9120, REC: 0.9511, F1: 0.9311, MCC: 0.8601

[Fold 9] Epoch 6/10


  with autocast():


Train Loss: 1.3327
Val ACC: 0.9297, PRE: 0.9194, REC: 0.9419, F1: 0.9305, MCC: 0.8596

[Fold 9] Epoch 7/10


  with autocast():


Train Loss: 1.3212
Val ACC: 0.9251, PRE: 0.8971, REC: 0.9602, F1: 0.9276, MCC: 0.8523

[Fold 9] Epoch 8/10


  with autocast():


Train Loss: 1.3083
Val ACC: 0.9312, PRE: 0.8983, REC: 0.9725, F1: 0.9339, MCC: 0.8653

[Fold 9] Epoch 9/10


  with autocast():


Train Loss: 1.3090
Val ACC: 0.9235, PRE: 0.9134, REC: 0.9358, F1: 0.9245, MCC: 0.8473

[Fold 9] Epoch 10/10


  with autocast():


Train Loss: 1.2986
Val ACC: 0.9343, PRE: 0.9303, REC: 0.9388, F1: 0.9346, MCC: 0.8685
[Fold 9] Best ACC: 0.9343


[Fold 10] Epoch 1/10


  scaler = GradScaler()
  with autocast():


Train Loss: 1.5124
Val ACC: 0.9144, PRE: 0.9562, REC: 0.8685, F1: 0.9103, MCC: 0.8323
Best model saved for fold 10 at epoch 1

[Fold 10] Epoch 2/10


  with autocast():


Train Loss: 1.3734
Val ACC: 0.9220, PRE: 0.9313, REC: 0.9113, F1: 0.9212, MCC: 0.8442
Best model saved for fold 10 at epoch 2

[Fold 10] Epoch 3/10


  with autocast():


Train Loss: 1.3590
Val ACC: 0.9052, PRE: 0.9749, REC: 0.8318, F1: 0.8977, MCC: 0.8193

[Fold 10] Epoch 4/10


  with autocast():


Train Loss: 1.3600
Val ACC: 0.9235, PRE: 0.9397, REC: 0.9052, F1: 0.9221, MCC: 0.8477
Best model saved for fold 10 at epoch 4

[Fold 10] Epoch 5/10


  with autocast():


Train Loss: 1.3377
Val ACC: 0.9190, PRE: 0.9308, REC: 0.9052, F1: 0.9178, MCC: 0.8382

[Fold 10] Epoch 6/10


  with autocast():


Train Loss: 1.3370
Val ACC: 0.9235, PRE: 0.9315, REC: 0.9144, F1: 0.9228, MCC: 0.8472

[Fold 10] Epoch 7/10


  with autocast():


Train Loss: 1.3217
Val ACC: 0.9159, PRE: 0.8757, REC: 0.9694, F1: 0.9202, MCC: 0.8366

[Fold 10] Epoch 8/10


  with autocast():


Train Loss: 1.3195
Val ACC: 0.9312, PRE: 0.9434, REC: 0.9174, F1: 0.9302, MCC: 0.8627
Best model saved for fold 10 at epoch 8

[Fold 10] Epoch 9/10


  with autocast():


Train Loss: 1.3070
Val ACC: 0.9144, PRE: 0.9357, REC: 0.8899, F1: 0.9122, MCC: 0.8297

[Fold 10] Epoch 10/10


  with autocast():


Train Loss: 1.3082
Val ACC: 0.9235, PRE: 0.9369, REC: 0.9083, F1: 0.9224, MCC: 0.8475
[Fold 10] Best ACC: 0.9312

Mean ACC: 0.9411 ± 0.0065
Mean PRE: 0.9306± 0.0227
Mean REC: 0.9208± 0.0366
Mean F1:  0.9248± 0.0122
Mean MCC: 0.8524± 0.0195


In [25]:
train_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_train_embedding.npy'
train_neg = '/exp_data/sjx/star/gan_data/negative_train_all_combined.npy'

# 构建全体索引和标签
pos_len = np.load(train_pos, mmap_mode='r').shape[0]
neg_len = np.load(train_neg, mmap_mode='r').shape[0]
all_indices = np.concatenate([np.arange(pos_len), np.arange(neg_len) + pos_len])
all_labels = np.concatenate([np.ones(pos_len, dtype=int), np.zeros(neg_len, dtype=int)])

# 数据集
full_dataset = ProteinNPYDataset(train_pos, train_neg)

# K折分层
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
all_metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(all_indices, all_labels), 1):
    print(f"\n========== Fold {fold}/10 ==========")
    train_loader = DataLoader(Subset(full_dataset, train_idx), batch_size=64, shuffle=True, num_workers=2)
    val_loader = DataLoader(Subset(full_dataset, val_idx), batch_size=64, shuffle=False, num_workers=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TransformerMoE(
        d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2
    ).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()

    best_acc = 0
    best_state = None
    epochs = 10

    for epoch in range(epochs):
        print(f"\n[Fold {fold}] Epoch {epoch+1}/{epochs}")
        model.train()
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            with autocast():
                logits, lb_loss = model(x)
                loss = criterion(logits, y) + 0.01 * lb_loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        print(f"训练损失: {total_loss / len(train_loader):.4f}")

        # 验证
        model.eval()
        all_preds, all_labels_fold = [], []
        all_probs = []  # 存储预测概率用于AUC和AUPRC计算
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                logits, _ = model(x)
                probs = torch.softmax(logits, dim=1)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels_fold.extend(y.cpu().numpy())
                all_probs.extend(probs[:, 1].cpu().numpy())  # 正类的概率
        
        # 计算所有指标
        acc = accuracy_score(all_labels_fold, all_preds)
        pre = precision_score(all_labels_fold, all_preds)
        rec = recall_score(all_labels_fold, all_preds)
        f1 = f1_score(all_labels_fold, all_preds)
        mcc = matthews_corrcoef(all_labels_fold, all_preds)
        
        # 计算SN（敏感性/召回率）和SP（特异性）
        from sklearn.metrics import confusion_matrix
        tn, fp, fn, tp = confusion_matrix(all_labels_fold, all_preds).ravel()
        sn = tp / (tp + fn) if (tp + fn) > 0 else 0  # 敏感性
        sp = tn / (tn + fp) if (tn + fp) > 0 else 0  # 特异性
        
        # 计算AUC和AUPRC
        auc = roc_auc_score(all_labels_fold, all_probs)
        auprc = average_precision_score(all_labels_fold, all_probs)
        
        print(f"验证准确率: {acc:.4f}, 精确率: {pre:.4f}, 召回率: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
        print(f"敏感性(SN): {sn:.4f}, 特异性(SP): {sp:.4f}, AUC: {auc:.4f}, AUPRC: {auprc:.4f}")

        if acc > best_acc:
            best_acc = acc
            best_state = model.state_dict()
            # 确保保存目录存在
            save_dir = "/exp_data/sjx/star/main_transformer_moe_weight/cv_point"
            os.makedirs(save_dir, exist_ok=True)
            torch.save(best_state, f"{save_dir}/best_fold{fold}.pth")
            print(f"Fold {fold} 第 {epoch+1} 轮的最佳模型已保存")

    # 保存该fold的最佳指标
    all_metrics.append((best_acc, pre, rec, f1, mcc, sn, sp, auc, auprc))
    print(f"[Fold {fold}] 最佳准确率: {best_acc:.4f}")

# 汇总结果
all_metrics = np.array(all_metrics)
print("\n========== 10折交叉验证结果汇总 ==========")
print(f"平均准确率: {all_metrics[:,0].mean():.4f} ± {all_metrics[:,0].std():.4f}")
print(f"平均精确率: {all_metrics[:,1].mean():.4f} ± {all_metrics[:,1].std():.4f}")
print(f"平均召回率: {all_metrics[:,2].mean():.4f} ± {all_metrics[:,2].std():.4f}")
print(f"平均F1分数: {all_metrics[:,3].mean():.4f} ± {all_metrics[:,3].std():.4f}")
print(f"平均MCC: {all_metrics[:,4].mean():.4f} ± {all_metrics[:,4].std():.4f}")
print(f"平均敏感性(SN): {all_metrics[:,5].mean():.4f} ± {all_metrics[:,5].std():.4f}")
print(f"平均特异性(SP): {all_metrics[:,6].mean():.4f} ± {all_metrics[:,6].std():.4f}")
print(f"平均AUC: {all_metrics[:,7].mean():.4f} ± {all_metrics[:,7].std():.4f}")
print(f"平均AUPRC: {all_metrics[:,8].mean():.4f} ± {all_metrics[:,8].std():.4f}") 



[Fold 1] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.7336
验证准确率: 0.9176, 精确率: 0.8870, 召回率: 0.9573, F1: 0.9208, MCC: 0.8377
敏感性(SN): 0.9573, 特异性(SP): 0.8777, AUC: 0.9727, AUPRC: 0.9660
Fold 1 第 1 轮的最佳模型已保存

[Fold 1] Epoch 2/10


  with autocast():


训练损失: 1.3743
验证准确率: 0.9206, 精确率: 0.9207, 召回率: 0.9207, F1: 0.9207, MCC: 0.8412
敏感性(SN): 0.9207, 特异性(SP): 0.9205, AUC: 0.9791, AUPRC: 0.9777
Fold 1 第 2 轮的最佳模型已保存

[Fold 1] Epoch 3/10


  with autocast():


训练损失: 1.3609
验证准确率: 0.9130, 精确率: 0.9472, 召回率: 0.8750, F1: 0.9097, MCC: 0.8284
敏感性(SN): 0.8750, 特异性(SP): 0.9511, AUC: 0.9787, AUPRC: 0.9764

[Fold 1] Epoch 4/10


  with autocast():


训练损失: 1.3552
验证准确率: 0.9282, 精确率: 0.8871, 召回率: 0.9817, F1: 0.9320, MCC: 0.8614
敏感性(SN): 0.9817, 特异性(SP): 0.8746, AUC: 0.9811, AUPRC: 0.9749
Fold 1 第 4 轮的最佳模型已保存

[Fold 1] Epoch 5/10


  with autocast():


训练损失: 1.3404
验证准确率: 0.9206, 精确率: 0.8943, 召回率: 0.9543, F1: 0.9233, MCC: 0.8431
敏感性(SN): 0.9543, 特异性(SP): 0.8869, AUC: 0.9757, AUPRC: 0.9688

[Fold 1] Epoch 6/10


  with autocast():


训练损失: 1.3362
验证准确率: 0.9313, 精确率: 0.9101, 召回率: 0.9573, F1: 0.9331, MCC: 0.8637
敏感性(SN): 0.9573, 特异性(SP): 0.9052, AUC: 0.9782, AUPRC: 0.9701
Fold 1 第 6 轮的最佳模型已保存

[Fold 1] Epoch 7/10


  with autocast():


训练损失: 1.3225
验证准确率: 0.9328, 精确率: 0.9034, 召回率: 0.9695, F1: 0.9353, MCC: 0.8680
敏感性(SN): 0.9695, 特异性(SP): 0.8960, AUC: 0.9756, AUPRC: 0.9600
Fold 1 第 7 轮的最佳模型已保存

[Fold 1] Epoch 8/10


  with autocast():


训练损失: 1.3172
验证准确率: 0.9176, 精确率: 0.9335, 召回率: 0.8994, F1: 0.9161, MCC: 0.8357
敏感性(SN): 0.8994, 特异性(SP): 0.9358, AUC: 0.9776, AUPRC: 0.9726

[Fold 1] Epoch 9/10


  with autocast():


训练损失: 1.3054
验证准确率: 0.9252, 精确率: 0.9043, 召回率: 0.9512, F1: 0.9272, MCC: 0.8515
敏感性(SN): 0.9512, 特异性(SP): 0.8991, AUC: 0.9787, AUPRC: 0.9751

[Fold 1] Epoch 10/10


  with autocast():


训练损失: 1.2861
验证准确率: 0.9252, 精确率: 0.9240, 召回率: 0.9268, F1: 0.9254, MCC: 0.8504
敏感性(SN): 0.9268, 特异性(SP): 0.9235, AUC: 0.9772, AUPRC: 0.9721
[Fold 1] 最佳准确率: 0.9328


[Fold 2] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.6946
验证准确率: 0.9420, 精确率: 0.9530, 召回率: 0.9297, F1: 0.9412, MCC: 0.8842
敏感性(SN): 0.9297, 特异性(SP): 0.9543, AUC: 0.9855, AUPRC: 0.9776
Fold 2 第 1 轮的最佳模型已保存

[Fold 2] Epoch 2/10


  with autocast():


训练损失: 1.3795
验证准确率: 0.9115, 精确率: 0.8530, 召回率: 0.9939, F1: 0.9181, MCC: 0.8344
敏感性(SN): 0.9939, 特异性(SP): 0.8293, AUC: 0.9870, AUPRC: 0.9821

[Fold 2] Epoch 3/10


  with autocast():


训练损失: 1.3732
验证准确率: 0.9420, 精确率: 0.9140, 召回率: 0.9755, F1: 0.9438, MCC: 0.8860
敏感性(SN): 0.9755, 特异性(SP): 0.9085, AUC: 0.9869, AUPRC: 0.9838

[Fold 2] Epoch 4/10


  with autocast():


训练损失: 1.3498
验证准确率: 0.9527, 精确率: 0.9405, 召回率: 0.9664, F1: 0.9532, MCC: 0.9057
敏感性(SN): 0.9664, 特异性(SP): 0.9390, AUC: 0.9869, AUPRC: 0.9841
Fold 2 第 4 轮的最佳模型已保存

[Fold 2] Epoch 5/10


  with autocast():


训练损失: 1.3453
验证准确率: 0.9496, 精确率: 0.9298, 召回率: 0.9725, F1: 0.9507, MCC: 0.9002
敏感性(SN): 0.9725, 特异性(SP): 0.9268, AUC: 0.9849, AUPRC: 0.9817

[Fold 2] Epoch 6/10


  with autocast():


训练损失: 1.3256
验证准确率: 0.8992, 精确率: 0.9336, 召回率: 0.8593, F1: 0.8949, MCC: 0.8010
敏感性(SN): 0.8593, 特异性(SP): 0.9390, AUC: 0.9763, AUPRC: 0.9739

[Fold 2] Epoch 7/10


  with autocast():


训练损失: 1.3231
验证准确率: 0.9191, 精确率: 0.9597, 召回率: 0.8746, F1: 0.9152, MCC: 0.8415
敏感性(SN): 0.8746, 特异性(SP): 0.9634, AUC: 0.9836, AUPRC: 0.9813

[Fold 2] Epoch 8/10


  with autocast():


训练损失: 1.3168
验证准确率: 0.9115, 精确率: 0.9622, 召回率: 0.8563, F1: 0.9061, MCC: 0.8279
敏感性(SN): 0.8563, 特异性(SP): 0.9665, AUC: 0.9810, AUPRC: 0.9767

[Fold 2] Epoch 9/10


  with autocast():


训练损失: 1.3100
验证准确率: 0.9344, 精确率: 0.9610, 召回率: 0.9052, F1: 0.9323, MCC: 0.8702
敏感性(SN): 0.9052, 特异性(SP): 0.9634, AUC: 0.9816, AUPRC: 0.9771

[Fold 2] Epoch 10/10


  with autocast():


训练损失: 1.2960
验证准确率: 0.9176, 精确率: 0.9596, 召回率: 0.8716, F1: 0.9135, MCC: 0.8386
敏感性(SN): 0.8716, 特异性(SP): 0.9634, AUC: 0.9817, AUPRC: 0.9795
[Fold 2] 最佳准确率: 0.9527


[Fold 3] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.5004
验证准确率: 0.9205, 精确率: 0.9568, 召回率: 0.8807, F1: 0.9172, MCC: 0.8436
敏感性(SN): 0.8807, 特异性(SP): 0.9602, AUC: 0.9789, AUPRC: 0.9753
Fold 3 第 1 轮的最佳模型已保存

[Fold 3] Epoch 2/10


  with autocast():


训练损失: 1.3890
验证准确率: 0.9281, 精确率: 0.9430, 召回率: 0.9113, F1: 0.9269, MCC: 0.8568
敏感性(SN): 0.9113, 特异性(SP): 0.9450, AUC: 0.9823, AUPRC: 0.9807
Fold 3 第 2 轮的最佳模型已保存

[Fold 3] Epoch 3/10


  with autocast():


训练损失: 1.3592
验证准确率: 0.9266, 精确率: 0.9515, 召回率: 0.8991, F1: 0.9245, MCC: 0.8545
敏感性(SN): 0.8991, 特异性(SP): 0.9541, AUC: 0.9839, AUPRC: 0.9826

[Fold 3] Epoch 4/10


  with autocast():


训练损失: 1.3496
验证准确率: 0.9266, 精确率: 0.9067, 召回率: 0.9511, F1: 0.9284, MCC: 0.8542
敏感性(SN): 0.9511, 特异性(SP): 0.9021, AUC: 0.9811, AUPRC: 0.9791

[Fold 3] Epoch 5/10


  with autocast():


训练损失: 1.3535
验证准确率: 0.9373, 精确率: 0.9441, 召回率: 0.9297, F1: 0.9368, MCC: 0.8747
敏感性(SN): 0.9297, 特异性(SP): 0.9450, AUC: 0.9849, AUPRC: 0.9851
Fold 3 第 5 轮的最佳模型已保存

[Fold 3] Epoch 6/10


  with autocast():


训练损失: 1.3345
验证准确率: 0.9251, 精确率: 0.9572, 召回率: 0.8899, F1: 0.9223, MCC: 0.8523
敏感性(SN): 0.8899, 特异性(SP): 0.9602, AUC: 0.9855, AUPRC: 0.9853

[Fold 3] Epoch 7/10


  with autocast():


训练损失: 1.3381
验证准确率: 0.9159, 精确率: 0.9595, 召回率: 0.8685, F1: 0.9117, MCC: 0.8356
敏感性(SN): 0.8685, 特异性(SP): 0.9633, AUC: 0.9798, AUPRC: 0.9792

[Fold 3] Epoch 8/10


  with autocast():


训练损失: 1.3212
验证准确率: 0.9343, 精确率: 0.9494, 召回率: 0.9174, F1: 0.9331, MCC: 0.8690
敏感性(SN): 0.9174, 特异性(SP): 0.9511, AUC: 0.9838, AUPRC: 0.9848

[Fold 3] Epoch 9/10


  with autocast():


训练损失: 1.3075
验证准确率: 0.9297, 精确率: 0.9323, 召回率: 0.9266, F1: 0.9294, MCC: 0.8593
敏感性(SN): 0.9266, 特异性(SP): 0.9327, AUC: 0.9821, AUPRC: 0.9821

[Fold 3] Epoch 10/10


  with autocast():


训练损失: 1.3073
验证准确率: 0.9343, 精确率: 0.9465, 召回率: 0.9205, F1: 0.9333, MCC: 0.8688
敏感性(SN): 0.9205, 特异性(SP): 0.9480, AUC: 0.9830, AUPRC: 0.9816
[Fold 3] 最佳准确率: 0.9373


[Fold 4] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.5812
验证准确率: 0.9419, 精确率: 0.9164, 召回率: 0.9725, F1: 0.9436, MCC: 0.8854
敏感性(SN): 0.9725, 特异性(SP): 0.9113, AUC: 0.9821, AUPRC: 0.9787
Fold 4 第 1 轮的最佳模型已保存

[Fold 4] Epoch 2/10


  with autocast():


训练损失: 1.3806
验证准确率: 0.9205, 精确率: 0.8747, 召回率: 0.9817, F1: 0.9251, MCC: 0.8473
敏感性(SN): 0.9817, 特异性(SP): 0.8593, AUC: 0.9844, AUPRC: 0.9801

[Fold 4] Epoch 3/10


  with autocast():


训练损失: 1.3636
验证准确率: 0.9404, 精确率: 0.9500, 召回率: 0.9297, F1: 0.9397, MCC: 0.8809
敏感性(SN): 0.9297, 特异性(SP): 0.9511, AUC: 0.9842, AUPRC: 0.9793

[Fold 4] Epoch 4/10


  with autocast():


训练损失: 1.3449
验证准确率: 0.9404, 精确率: 0.9286, 召回率: 0.9541, F1: 0.9412, MCC: 0.8811
敏感性(SN): 0.9541, 特异性(SP): 0.9266, AUC: 0.9844, AUPRC: 0.9814

[Fold 4] Epoch 5/10


  with autocast():


训练损失: 1.3446
验证准确率: 0.9434, 精确率: 0.9265, 召回率: 0.9633, F1: 0.9445, MCC: 0.8876
敏感性(SN): 0.9633, 特异性(SP): 0.9235, AUC: 0.9832, AUPRC: 0.9793
Fold 4 第 5 轮的最佳模型已保存

[Fold 4] Epoch 6/10


  with autocast():


训练损失: 1.3329
验证准确率: 0.9404, 精确率: 0.9311, 召回率: 0.9511, F1: 0.9410, MCC: 0.8809
敏感性(SN): 0.9511, 特异性(SP): 0.9297, AUC: 0.9837, AUPRC: 0.9814

[Fold 4] Epoch 7/10


  with autocast():


训练损失: 1.3304
验证准确率: 0.9266, 精确率: 0.9401, 召回率: 0.9113, F1: 0.9255, MCC: 0.8536
敏感性(SN): 0.9113, 特异性(SP): 0.9419, AUC: 0.9778, AUPRC: 0.9736

[Fold 4] Epoch 8/10


  with autocast():


训练损失: 1.3171
验证准确率: 0.9312, 精确率: 0.9462, 召回率: 0.9144, F1: 0.9300, MCC: 0.8629
敏感性(SN): 0.9144, 特异性(SP): 0.9480, AUC: 0.9778, AUPRC: 0.9725

[Fold 4] Epoch 9/10


  with autocast():


训练损失: 1.3050
验证准确率: 0.9266, 精确率: 0.9401, 召回率: 0.9113, F1: 0.9255, MCC: 0.8536
敏感性(SN): 0.9113, 特异性(SP): 0.9419, AUC: 0.9766, AUPRC: 0.9697

[Fold 4] Epoch 10/10


  with autocast():


训练损失: 1.3006
验证准确率: 0.9128, 精确率: 0.8668, 召回率: 0.9755, F1: 0.9180, MCC: 0.8323
敏感性(SN): 0.9755, 特异性(SP): 0.8502, AUC: 0.9759, AUPRC: 0.9704
[Fold 4] 最佳准确率: 0.9434


[Fold 5] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.5659
验证准确率: 0.9251, 精确率: 0.9162, 召回率: 0.9358, F1: 0.9259, MCC: 0.8503
敏感性(SN): 0.9358, 特异性(SP): 0.9144, AUC: 0.9800, AUPRC: 0.9751
Fold 5 第 1 轮的最佳模型已保存

[Fold 5] Epoch 2/10


  with autocast():


训练损失: 1.3716
验证准确率: 0.9266, 精确率: 0.8997, 召回率: 0.9602, F1: 0.9290, MCC: 0.8551
敏感性(SN): 0.9602, 特异性(SP): 0.8930, AUC: 0.9813, AUPRC: 0.9779
Fold 5 第 2 轮的最佳模型已保存

[Fold 5] Epoch 3/10


  with autocast():


训练损失: 1.3626
验证准确率: 0.9205, 精确率: 0.8873, 召回率: 0.9633, F1: 0.9238, MCC: 0.8441
敏感性(SN): 0.9633, 特异性(SP): 0.8777, AUC: 0.9802, AUPRC: 0.9761

[Fold 5] Epoch 4/10


  with autocast():


训练损失: 1.3556
验证准确率: 0.9343, 精确率: 0.9034, 召回率: 0.9725, F1: 0.9367, MCC: 0.8711
敏感性(SN): 0.9725, 特异性(SP): 0.8960, AUC: 0.9822, AUPRC: 0.9772
Fold 5 第 4 轮的最佳模型已保存

[Fold 5] Epoch 5/10


  with autocast():


训练损失: 1.3380
验证准确率: 0.9220, 精确率: 0.8898, 召回率: 0.9633, F1: 0.9251, MCC: 0.8469
敏感性(SN): 0.9633, 特异性(SP): 0.8807, AUC: 0.9745, AUPRC: 0.9660

[Fold 5] Epoch 6/10


  with autocast():


训练损失: 1.3381
验证准确率: 0.9404, 精确率: 0.9337, 召回率: 0.9480, F1: 0.9408, MCC: 0.8808
敏感性(SN): 0.9480, 特异性(SP): 0.9327, AUC: 0.9838, AUPRC: 0.9816
Fold 5 第 6 轮的最佳模型已保存

[Fold 5] Epoch 7/10


  with autocast():


训练损失: 1.3227
验证准确率: 0.9144, 精确率: 0.8774, 召回率: 0.9633, F1: 0.9184, MCC: 0.8327
敏感性(SN): 0.9633, 特异性(SP): 0.8654, AUC: 0.9790, AUPRC: 0.9707

[Fold 5] Epoch 8/10


  with autocast():


训练损失: 1.3131
验证准确率: 0.9358, 精确率: 0.9204, 召回率: 0.9541, F1: 0.9369, MCC: 0.8721
敏感性(SN): 0.9541, 特异性(SP): 0.9174, AUC: 0.9810, AUPRC: 0.9768

[Fold 5] Epoch 9/10


  with autocast():


训练损失: 1.3076
验证准确率: 0.9358, 精确率: 0.9204, 召回率: 0.9541, F1: 0.9369, MCC: 0.8721
敏感性(SN): 0.9541, 特异性(SP): 0.9174, AUC: 0.9842, AUPRC: 0.9814

[Fold 5] Epoch 10/10


  with autocast():


训练损失: 1.2993
验证准确率: 0.9281, 精确率: 0.9142, 召回率: 0.9450, F1: 0.9293, MCC: 0.8568
敏感性(SN): 0.9450, 特异性(SP): 0.9113, AUC: 0.9815, AUPRC: 0.9791
[Fold 5] 最佳准确率: 0.9404


[Fold 6] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.6475
验证准确率: 0.9404, 精确率: 0.9586, 召回率: 0.9205, F1: 0.9392, MCC: 0.8814
敏感性(SN): 0.9205, 特异性(SP): 0.9602, AUC: 0.9854, AUPRC: 0.9848
Fold 6 第 1 轮的最佳模型已保存

[Fold 6] Epoch 2/10


  with autocast():


训练损失: 1.3829
验证准确率: 0.9343, 精确率: 0.9201, 召回率: 0.9511, F1: 0.9353, MCC: 0.8690
敏感性(SN): 0.9511, 特异性(SP): 0.9174, AUC: 0.9868, AUPRC: 0.9867

[Fold 6] Epoch 3/10


  with autocast():


训练损失: 1.3668
验证准确率: 0.9373, 精确率: 0.9256, 召回率: 0.9511, F1: 0.9382, MCC: 0.8749
敏感性(SN): 0.9511, 特异性(SP): 0.9235, AUC: 0.9868, AUPRC: 0.9863

[Fold 6] Epoch 4/10


  with autocast():


训练损失: 1.3562
验证准确率: 0.9358, 精确率: 0.9495, 召回率: 0.9205, F1: 0.9348, MCC: 0.8720
敏感性(SN): 0.9205, 特异性(SP): 0.9511, AUC: 0.9861, AUPRC: 0.9853

[Fold 6] Epoch 5/10


  with autocast():


训练损失: 1.3476
验证准确率: 0.9450, 精确率: 0.9533, 召回率: 0.9358, F1: 0.9444, MCC: 0.8901
敏感性(SN): 0.9358, 特异性(SP): 0.9541, AUC: 0.9885, AUPRC: 0.9884
Fold 6 第 5 轮的最佳模型已保存

[Fold 6] Epoch 6/10


  with autocast():


训练损失: 1.3394
验证准确率: 0.9312, 精确率: 0.9462, 召回率: 0.9144, F1: 0.9300, MCC: 0.8629
敏感性(SN): 0.9144, 特异性(SP): 0.9480, AUC: 0.9861, AUPRC: 0.9857

[Fold 6] Epoch 7/10


  with autocast():


训练损失: 1.3310
验证准确率: 0.9312, 精确率: 0.9548, 召回率: 0.9052, F1: 0.9294, MCC: 0.8636
敏感性(SN): 0.9052, 特异性(SP): 0.9572, AUC: 0.9854, AUPRC: 0.9857

[Fold 6] Epoch 8/10


  with autocast():


训练损失: 1.3275
验证准确率: 0.9434, 精确率: 0.9448, 召回率: 0.9419, F1: 0.9433, MCC: 0.8869
敏感性(SN): 0.9419, 特异性(SP): 0.9450, AUC: 0.9827, AUPRC: 0.9823

[Fold 6] Epoch 9/10


  with autocast():


训练损失: 1.3171
验证准确率: 0.9358, 精确率: 0.9582, 召回率: 0.9113, F1: 0.9342, MCC: 0.8726
敏感性(SN): 0.9113, 特异性(SP): 0.9602, AUC: 0.9840, AUPRC: 0.9831

[Fold 6] Epoch 10/10


  with autocast():


训练损失: 1.3175
验证准确率: 0.9388, 精确率: 0.9498, 召回率: 0.9266, F1: 0.9381, MCC: 0.8779
敏感性(SN): 0.9266, 特异性(SP): 0.9511, AUC: 0.9849, AUPRC: 0.9856
[Fold 6] 最佳准确率: 0.9450


[Fold 7] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.5406
验证准确率: 0.9312, 精确率: 0.9029, 召回率: 0.9664, F1: 0.9335, MCC: 0.8645
敏感性(SN): 0.9664, 特异性(SP): 0.8960, AUC: 0.9790, AUPRC: 0.9749
Fold 7 第 1 轮的最佳模型已保存

[Fold 7] Epoch 2/10


  with autocast():


训练损失: 1.3863
验证准确率: 0.9281, 精确率: 0.9046, 召回率: 0.9572, F1: 0.9302, MCC: 0.8577
敏感性(SN): 0.9572, 特异性(SP): 0.8991, AUC: 0.9806, AUPRC: 0.9796

[Fold 7] Epoch 3/10


  with autocast():


训练损失: 1.3604
验证准确率: 0.9220, 精确率: 0.9233, 召回率: 0.9205, F1: 0.9219, MCC: 0.8440
敏感性(SN): 0.9205, 特异性(SP): 0.9235, AUC: 0.9798, AUPRC: 0.9764

[Fold 7] Epoch 4/10


  with autocast():


训练损失: 1.3554
验证准确率: 0.9281, 精确率: 0.9242, 召回率: 0.9327, F1: 0.9285, MCC: 0.8563
敏感性(SN): 0.9327, 特异性(SP): 0.9235, AUC: 0.9811, AUPRC: 0.9787

[Fold 7] Epoch 5/10


  with autocast():


训练损失: 1.3493
验证准确率: 0.9388, 精确率: 0.9335, 召回率: 0.9450, F1: 0.9392, MCC: 0.8777
敏感性(SN): 0.9450, 特异性(SP): 0.9327, AUC: 0.9796, AUPRC: 0.9749
Fold 7 第 5 轮的最佳模型已保存

[Fold 7] Epoch 6/10


  with autocast():


训练损失: 1.3330
验证准确率: 0.9281, 精确率: 0.9000, 召回率: 0.9633, F1: 0.9306, MCC: 0.8584
敏感性(SN): 0.9633, 特异性(SP): 0.8930, AUC: 0.9805, AUPRC: 0.9783

[Fold 7] Epoch 7/10


  with autocast():


训练损失: 1.3320
验证准确率: 0.9358, 精确率: 0.9412, 召回率: 0.9297, F1: 0.9354, MCC: 0.8716
敏感性(SN): 0.9297, 特异性(SP): 0.9419, AUC: 0.9801, AUPRC: 0.9770

[Fold 7] Epoch 8/10


  with autocast():


训练损失: 1.3171
验证准确率: 0.9144, 精确率: 0.9625, 召回率: 0.8624, F1: 0.9097, MCC: 0.8333
敏感性(SN): 0.8624, 特异性(SP): 0.9664, AUC: 0.9808, AUPRC: 0.9729

[Fold 7] Epoch 9/10


  with autocast():


训练损失: 1.3141
验证准确率: 0.9358, 精确率: 0.9130, 召回率: 0.9633, F1: 0.9375, MCC: 0.8729
敏感性(SN): 0.9633, 特异性(SP): 0.9083, AUC: 0.9795, AUPRC: 0.9716

[Fold 7] Epoch 10/10


  with autocast():


训练损失: 1.2964
验证准确率: 0.9312, 精确率: 0.9147, 召回率: 0.9511, F1: 0.9325, MCC: 0.8631
敏感性(SN): 0.9511, 特异性(SP): 0.9113, AUC: 0.9731, AUPRC: 0.9595
[Fold 7] 最佳准确率: 0.9388


[Fold 8] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.5265
验证准确率: 0.9281, 精确率: 0.8977, 召回率: 0.9664, F1: 0.9308, MCC: 0.8588
敏感性(SN): 0.9664, 特异性(SP): 0.8899, AUC: 0.9798, AUPRC: 0.9784
Fold 8 第 1 轮的最佳模型已保存

[Fold 8] Epoch 2/10


  with autocast():


训练损失: 1.3893
验证准确率: 0.9297, 精确率: 0.9194, 召回率: 0.9419, F1: 0.9305, MCC: 0.8596
敏感性(SN): 0.9419, 特异性(SP): 0.9174, AUC: 0.9829, AUPRC: 0.9817
Fold 8 第 2 轮的最佳模型已保存

[Fold 8] Epoch 3/10


  with autocast():


训练损失: 1.3647
验证准确率: 0.9358, 精确率: 0.9467, 召回率: 0.9235, F1: 0.9350, MCC: 0.8718
敏感性(SN): 0.9235, 特异性(SP): 0.9480, AUC: 0.9826, AUPRC: 0.9817
Fold 8 第 3 轮的最佳模型已保存

[Fold 8] Epoch 4/10


  with autocast():


训练损失: 1.3612
验证准确率: 0.9312, 精确率: 0.8939, 召回率: 0.9786, F1: 0.9343, MCC: 0.8663
敏感性(SN): 0.9786, 特异性(SP): 0.8838, AUC: 0.9815, AUPRC: 0.9764

[Fold 8] Epoch 5/10


  with autocast():


训练损失: 1.3441
验证准确率: 0.9388, 精确率: 0.9159, 召回率: 0.9664, F1: 0.9405, MCC: 0.8790
敏感性(SN): 0.9664, 特异性(SP): 0.9113, AUC: 0.9823, AUPRC: 0.9763
Fold 8 第 5 轮的最佳模型已保存

[Fold 8] Epoch 6/10


  with autocast():


训练损失: 1.3541
验证准确率: 0.9404, 精确率: 0.9186, 召回率: 0.9664, F1: 0.9419, MCC: 0.8819
敏感性(SN): 0.9664, 特异性(SP): 0.9144, AUC: 0.9854, AUPRC: 0.9837
Fold 8 第 6 轮的最佳模型已保存

[Fold 8] Epoch 7/10


  with autocast():


训练损失: 1.3265
验证准确率: 0.9358, 精确率: 0.8969, 召回率: 0.9847, F1: 0.9388, MCC: 0.8758
敏感性(SN): 0.9847, 特异性(SP): 0.8869, AUC: 0.9859, AUPRC: 0.9848

[Fold 8] Epoch 8/10


  with autocast():


训练损失: 1.3273
验证准确率: 0.8930, 精确率: 0.9707, 召回率: 0.8104, F1: 0.8833, MCC: 0.7969
敏感性(SN): 0.8104, 特异性(SP): 0.9755, AUC: 0.9847, AUPRC: 0.9838

[Fold 8] Epoch 9/10


  with autocast():


训练损失: 1.3130
验证准确率: 0.9235, 精确率: 0.8946, 召回率: 0.9602, F1: 0.9263, MCC: 0.8494
敏感性(SN): 0.9602, 特异性(SP): 0.8869, AUC: 0.9767, AUPRC: 0.9745

[Fold 8] Epoch 10/10


  with autocast():


训练损失: 1.3110
验证准确率: 0.9220, 精确率: 0.8988, 召回率: 0.9511, F1: 0.9242, MCC: 0.8455
敏感性(SN): 0.9511, 特异性(SP): 0.8930, AUC: 0.9763, AUPRC: 0.9706
[Fold 8] 最佳准确率: 0.9404


[Fold 9] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.6587
验证准确率: 0.9159, 精确率: 0.8820, 召回率: 0.9602, F1: 0.9195, MCC: 0.8351
敏感性(SN): 0.9602, 特异性(SP): 0.8716, AUC: 0.9715, AUPRC: 0.9647
Fold 9 第 1 轮的最佳模型已保存

[Fold 9] Epoch 2/10


  with autocast():


训练损失: 1.3730
验证准确率: 0.9052, 精确率: 0.8591, 召回率: 0.9694, F1: 0.9109, MCC: 0.8172
敏感性(SN): 0.9694, 特异性(SP): 0.8410, AUC: 0.9748, AUPRC: 0.9657

[Fold 9] Epoch 3/10


  with autocast():


训练损失: 1.3776
验证准确率: 0.9327, 精确率: 0.9224, 召回率: 0.9450, F1: 0.9335, MCC: 0.8657
敏感性(SN): 0.9450, 特异性(SP): 0.9205, AUC: 0.9776, AUPRC: 0.9671
Fold 9 第 3 轮的最佳模型已保存

[Fold 9] Epoch 4/10


  with autocast():


训练损失: 1.3428
验证准确率: 0.9388, 精确率: 0.9470, 召回率: 0.9297, F1: 0.9383, MCC: 0.8778
敏感性(SN): 0.9297, 特异性(SP): 0.9480, AUC: 0.9754, AUPRC: 0.9630
Fold 9 第 4 轮的最佳模型已保存

[Fold 9] Epoch 5/10


  with autocast():


训练损失: 1.3419
验证准确率: 0.9251, 精确率: 0.9399, 召回率: 0.9083, F1: 0.9238, MCC: 0.8506
敏感性(SN): 0.9083, 特异性(SP): 0.9419, AUC: 0.9759, AUPRC: 0.9640

[Fold 9] Epoch 6/10


  with autocast():


训练损失: 1.3293
验证准确率: 0.9343, 精确率: 0.9303, 召回率: 0.9388, F1: 0.9346, MCC: 0.8685
敏感性(SN): 0.9388, 特异性(SP): 0.9297, AUC: 0.9760, AUPRC: 0.9720

[Fold 9] Epoch 7/10


  with autocast():


训练损失: 1.3313
验证准确率: 0.9343, 精确率: 0.9522, 召回率: 0.9144, F1: 0.9329, MCC: 0.8692
敏感性(SN): 0.9144, 特异性(SP): 0.9541, AUC: 0.9764, AUPRC: 0.9611

[Fold 9] Epoch 8/10


  with autocast():


训练损失: 1.3145
验证准确率: 0.9343, 精确率: 0.9551, 召回率: 0.9113, F1: 0.9327, MCC: 0.8694
敏感性(SN): 0.9113, 特异性(SP): 0.9572, AUC: 0.9770, AUPRC: 0.9701

[Fold 9] Epoch 9/10


  with autocast():


训练损失: 1.3149
验证准确率: 0.9174, 精确率: 0.8845, 召回率: 0.9602, F1: 0.9208, MCC: 0.8379
敏感性(SN): 0.9602, 特异性(SP): 0.8746, AUC: 0.9751, AUPRC: 0.9644

[Fold 9] Epoch 10/10


  with autocast():


训练损失: 1.3095
验证准确率: 0.9128, 精确率: 0.9623, 召回率: 0.8593, F1: 0.9079, MCC: 0.8305
敏感性(SN): 0.8593, 特异性(SP): 0.9664, AUC: 0.9755, AUPRC: 0.9671
[Fold 9] 最佳准确率: 0.9388


[Fold 10] Epoch 1/10


  scaler = GradScaler()
  with autocast():


训练损失: 1.5349
验证准确率: 0.9251, 精确率: 0.9427, 召回率: 0.9052, F1: 0.9236, MCC: 0.8508
敏感性(SN): 0.9052, 特异性(SP): 0.9450, AUC: 0.9797, AUPRC: 0.9790
Fold 10 第 1 轮的最佳模型已保存

[Fold 10] Epoch 2/10


  with autocast():


训练损失: 1.3829
验证准确率: 0.9297, 精确率: 0.9297, 召回率: 0.9297, F1: 0.9297, MCC: 0.8593
敏感性(SN): 0.9297, 特异性(SP): 0.9297, AUC: 0.9822, AUPRC: 0.9825
Fold 10 第 2 轮的最佳模型已保存

[Fold 10] Epoch 3/10


  with autocast():


训练损失: 1.3651
验证准确率: 0.9235, 精确率: 0.9086, 召回率: 0.9419, F1: 0.9249, MCC: 0.8477
敏感性(SN): 0.9419, 特异性(SP): 0.9052, AUC: 0.9818, AUPRC: 0.9822

[Fold 10] Epoch 4/10


  with autocast():


训练损失: 1.3534
验证准确率: 0.9205, 精确率: 0.9393, 召回率: 0.8991, F1: 0.9187, MCC: 0.8418
敏感性(SN): 0.8991, 特异性(SP): 0.9419, AUC: 0.9823, AUPRC: 0.9826

[Fold 10] Epoch 5/10


  with autocast():


训练损失: 1.3394
验证准确率: 0.9312, 精确率: 0.9075, 召回率: 0.9602, F1: 0.9331, MCC: 0.8638
敏感性(SN): 0.9602, 特异性(SP): 0.9021, AUC: 0.9817, AUPRC: 0.9816
Fold 10 第 5 轮的最佳模型已保存

[Fold 10] Epoch 6/10


  with autocast():


训练损失: 1.3361
验证准确率: 0.9190, 精确率: 0.8870, 召回率: 0.9602, F1: 0.9222, MCC: 0.8408
敏感性(SN): 0.9602, 特异性(SP): 0.8777, AUC: 0.9823, AUPRC: 0.9829

[Fold 10] Epoch 7/10


  with autocast():


训练损失: 1.3184
验证准确率: 0.9281, 精确率: 0.9321, 召回率: 0.9235, F1: 0.9278, MCC: 0.8563
敏感性(SN): 0.9235, 特异性(SP): 0.9327, AUC: 0.9810, AUPRC: 0.9815

[Fold 10] Epoch 8/10


  with autocast():


训练损失: 1.3160
验证准确率: 0.9174, 精确率: 0.9003, 召回率: 0.9388, F1: 0.9192, MCC: 0.8356
敏感性(SN): 0.9388, 特异性(SP): 0.8960, AUC: 0.9783, AUPRC: 0.9768

[Fold 10] Epoch 9/10


  with autocast():


训练损失: 1.3104
验证准确率: 0.9251, 精确率: 0.9137, 召回率: 0.9388, F1: 0.9261, MCC: 0.8505
敏感性(SN): 0.9388, 特异性(SP): 0.9113, AUC: 0.9810, AUPRC: 0.9800

[Fold 10] Epoch 10/10


  with autocast():


训练损失: 1.3063
验证准确率: 0.9297, 精确率: 0.9271, 召回率: 0.9327, F1: 0.9299, MCC: 0.8593
敏感性(SN): 0.9327, 特异性(SP): 0.9266, AUC: 0.9765, AUPRC: 0.9724
[Fold 10] 最佳准确率: 0.9312

平均准确率: 0.9401 ± 0.0058
平均精确率: 0.9264 ± 0.0282
平均召回率: 0.9260 ± 0.0340
平均F1分数: 0.9252 ± 0.0090
平均MCC: 0.8523 ± 0.0149
平均敏感性(SN): 0.9260 ± 0.0340
平均特异性(SP): 0.9245 ± 0.0338
平均AUC: 0.9786 ± 0.0037
平均AUPRC: 0.9738 ± 0.0073


### 十折测试结果

In [6]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# 测试集
test_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_test_embedding.npy'
test_neg = '/exp_data/sjx/star/first_data/ESM-embedding/negative_test_embedding.npy'
test_dataset = ProteinNPYDataset(test_pos, test_neg)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def eval_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    pre = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    mcc = matthews_corrcoef(all_labels, all_preds)
    return acc, pre, rec, f1, mcc

# 评估每一折
all_metrics = []
for fold in range(1, 11):
    print(f"\n========== Test Fold {fold}/10 ==========")
    model = TransformerMoE(
        d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2
    ).to(device)
    model.load_state_dict(torch.load(f"/exp_data/sjx/star/main_transformer_moe_weight/cv_point/best_fold{fold}.pth", map_location=device))
    acc, pre, rec, f1, mcc = eval_model(model, test_loader, device)
    print(f"Test ACC: {acc:.4f}, PRE: {pre:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
    all_metrics.append((acc, pre, rec, f1, mcc))

# 汇总平均
all_metrics = np.array(all_metrics)
print("\n========== 10-Fold Test Results ==========")
print(f"Mean ACC: {all_metrics[:,0].mean():.4f} ± {all_metrics[:,0].std():.4f}")
print(f"Mean PRE: {all_metrics[:,1].mean():.4f}")
print(f"Mean REC: {all_metrics[:,2].mean():.4f}")
print(f"Mean F1:  {all_metrics[:,3].mean():.4f}")
print(f"Mean MCC: {all_metrics[:,4].mean():.4f}")

### 更多指标得测试

In [26]:
# 假设模型结构和ProteinNPYDataset已定义，device已设置
import torch

# 1. 加载模型
model = TransformerMoE(
    d_model=1152, nhead=8, d_ff=2048, num_layers=4, num_experts=30, k=3, dropout=0.1, noisy_std=1.0, num_classes=2
).to(device)
model.load_state_dict(torch.load('/exp_data/sjx/star/main_transformer_moe_weight/best_transformer_moe_last.pth', map_location=device))
model.eval()

# 2. 加载测试集
test_pos = '/exp_data/sjx/star/first_data/ESM-embedding/positive_test_embedding.npy'
test_neg = '/exp_data/sjx/star/first_data/ESM-embedding/negative_test_embedding.npy'
test_dataset = ProteinNPYDataset(test_pos, test_neg)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

# 3. 定义评估函数
def eval_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    all_probs = []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            probs = torch.softmax(logits, dim=1)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
            all_probs.extend(probs[:, 1].cpu().numpy())  # 正类概率
    
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef,
        confusion_matrix, roc_auc_score, average_precision_score
    )
    
    # 计算混淆矩阵
    cm = confusion_matrix(all_labels, all_preds)
    tn, fp, fn, tp = cm.ravel()
    
    # 计算所有指标
    acc = accuracy_score(all_labels, all_preds)
    pre = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    mcc = matthews_corrcoef(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_probs)
    auprc = average_precision_score(all_labels, all_probs)
    sn = tp / (tp + fn) if (tp + fn) > 0 else 0  # 敏感性
    sp = tn / (tn + fp) if (tn + fp) > 0 else 0  # 特异性
    
    print(f"Test ACC: {acc:.4f}, PRE: {pre:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, MCC: {mcc:.4f}")
    print(f"Test AUC: {auc:.4f}, AUPRC: {auprc:.4f}, SN: {sn:.4f}, SP: {sp:.4f}")
    return acc, pre, rec, f1, mcc, auc, auprc, sn, sp

# 4. 测试
eval_model(model, test_loader, device)

  model.load_state_dict(torch.load('/exp_data/sjx/star/main_transformer_moe_weight/best_transformer_moe_last.pth', map_location=device))


Test ACC: 0.9225, PRE: 0.9483, REC: 0.9425, F1: 0.9454, MCC: 0.8120
Test AUC: 0.9685, AUPRC: 0.9869, SN: 0.9425, SP: 0.8731


(0.9225413402959095,
 0.948339483394834,
 0.9425427872860636,
 0.9454322501532803,
 0.8120485793877618,
 0.9685438657398856,
 0.9869455888266454,
 0.9425427872860636,
 0.8731117824773413)