# --- 0. CONFIGURATION ---

In [2]:
import os

# --- 1. ROOT CONFIGURATION ---
# The project root can be overridden with the CAFA_ROOT environment variable so the
# notebook is runnable on machines that do not have the original E: drive layout.
# When the variable is unset, the original Windows location is used unchanged.
# The default is a raw string (r'') so the backslashes are not read as escapes.
ROOT = os.environ.get('CAFA_ROOT', r'E:\CAFA-6-Protein-Function-Prediction')

# --- 2. SUB-DIRECTORIES (derived from ROOT) ---
# os.path.join builds the paths with the separator of the current platform.
INPUT_DIR  = os.path.join(ROOT, 'input')
OUTPUT_DIR = os.path.join(ROOT, 'output')
MODEL_DIR  = os.path.join(ROOT, 'models')

# Create the output and model directories up front so later saves cannot fail
# because the folder is missing. INPUT_DIR is not created: it must already hold data.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print(f"‚úÖ ƒê√£ c·∫•u h√¨nh ROOT: {ROOT}")
print(f"üìÇ Input: {INPUT_DIR}")
print(f"üìÇ Output: {OUTPUT_DIR}")
print(f"üìÇ Models: {MODEL_DIR}")

‚úÖ ƒê√£ c·∫•u h√¨nh ROOT: E:\CAFA-6-Protein-Function-Prediction
üìÇ Input: E:\CAFA-6-Protein-Function-Prediction\input
üìÇ Output: E:\CAFA-6-Protein-Function-Prediction\output
üìÇ Models: E:\CAFA-6-Protein-Function-Prediction\models


In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import os
from pathlib import Path

# TORCH MODULES
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score

# --- 1. SETUP & CONFIGURATION ---

In [4]:
# --- SYSTEM & MODEL CONFIGURATION (UPDATED FOR 1D-CNN) ---

# Namespace-style class: every setting is a class attribute and is read as
# `config.<name>`; the class is never instantiated in the cells visible here.
class config:
    # 1. Paths (built from ROOT, which is defined in the first cell)
    MAIN_DIR = os.path.join(ROOT, "input", "cafa-6-protein-function-prediction")
    
    # Explicit paths to the FASTA / label files (convenient for the Dataset)
    TRAIN_FASTA = os.path.join(MAIN_DIR, "Train", "train_sequences.fasta")
    TRAIN_LABELS = os.path.join(MAIN_DIR, "Train", "train_terms.tsv")
    TEST_FASTA = os.path.join(MAIN_DIR, "Test", "testsuperset.fasta")
    
    # 2. Model parameters (specific to sequence processing)
    # NOTE(review): in the cells visible here, train_model() passes vocab_size=21 and
    # embed_dim=128 as literals and encode_seq() defaults to max_len=1024, so
    # vocab_size / embedding_dim / max_len below are not read by that code
    # (max_len is only printed) - confirm before relying on them.
    num_labels = 500      # Number of GO terms to predict
    vocab_size = 25       # Number of amino-acid symbols (20 standard + a few unusual/padding symbols)
    embedding_dim = 128   # Size of the CNN's internal embedding vector
    max_len = 1024        # Maximum sequence length (sequences are truncated or padded to this)
    
    # 3. Training parameters
    n_epochs = 8
    
    # Batch size = 16 (chosen for an MX550 card with 2GB/4GB VRAM)
    # If a trial run is smooth it can be raised to 32; on an OOM error lower it to 8.
    batch_size = 16       
    
    # Slightly reduced learning rate so training stays stable with a small batch size
    lr = 0.0005           
    
    # Use the GPU when one is available; evaluated once, when this class body runs
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Echo the key settings so the run log records what was used.
print(f"‚úÖ ƒê√£ t·∫£i c·∫•u h√¨nh Config.")
print(f"üîπ Device: {config.device}")
print(f"üîπ Batch Size: {config.batch_size}")
print(f"üîπ Max Sequence Length: {config.max_len}")

‚úÖ ƒê√£ t·∫£i c·∫•u h√¨nh Config.
üîπ Device: cuda
üîπ Batch Size: 16
üîπ Max Sequence Length: 1024


# --- 2. DATA LOADING: PROTEIN SEQUENCE DATASET ---

In [5]:
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
import torch
# C·∫ßn c√†i ƒë·∫∑t th∆∞ vi·ªán biopython n·∫øu ch∆∞a c√≥: !pip install biopython
from Bio import SeqIO

# 1. Amino-acid vocabulary: the 20 standard residues get codes 1..20.
#    Code 0 is shared by padding and by any symbol that is not in AA_VOCAB.
AA_VOCAB = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = dict(zip(AA_VOCAB, range(1, len(AA_VOCAB) + 1)))

def encode_seq(seq, max_len=1024):
    """Turn a protein sequence into a list of integer codes.

    Args:
        seq: Sequence to encode; it is converted with ``str()`` first, so any
            object whose string form is the residue string is accepted.
        max_len: Length the result is truncated to or zero-padded up to.

    Returns:
        A list of ints: each residue's code from ``aa_to_int`` (0 for symbols
        not in the vocabulary), cut to ``max_len`` when longer and right-padded
        with 0 when shorter.
    """
    residues = str(seq)
    codes = list(map(lambda residue: aa_to_int.get(residue, 0), residues))
    overflow = len(codes) - max_len
    if overflow > 0:
        # Too long: keep only the leading max_len residues.
        return codes[:max_len]
    # Short or exact fit: right-pad with the padding code.
    return codes + [0] * (-overflow)

class ProteinSequenceDataset(Dataset):
    """Dataset of integer-encoded protein sequences read from a FASTA file.

    Each item is ``(sequence_tensor, target_tensor)`` when ``datatype`` is
    ``"train"`` and ``(sequence_tensor, record_id)`` otherwise.

    Args:
        datatype: ``"train"`` to attach label rows; any other value yields
            ``(sequence, id)`` pairs.
        fasta_file: Path of the FASTA file, passed to ``SeqIO.parse``.
        targets_file: Path of a ``.npy`` label array; required for ``"train"``.
        max_len: Length every sequence is truncated/padded to by ``encode_seq``.

    Raises:
        ValueError: if ``datatype == "train"`` and no ``targets_file`` is given,
            or if the label array has fewer rows than the FASTA has records.

    Attributes:
        data: list of ``{"id": ..., "seq": [...]}`` records in file order.
        df: the same records as a DataFrame (kept for callers that use it).
        labels: label rows aligned to ``data`` by position, or ``None``.
    """

    def __init__(self, datatype, fasta_file, targets_file=None, max_len=1024):
        # Fail fast: a training set without labels used to be constructed silently
        # and then crash with AttributeError on the first __getitem__ call.
        if datatype == "train" and not targets_file:
            raise ValueError("targets_file is required when datatype == 'train'")

        self.datatype = datatype
        self.data = []
        self.labels = None

        # Read the raw sequences and encode them immediately.
        print(f"Loading sequences from {fasta_file}...")
        for record in SeqIO.parse(fasta_file, "fasta"):
            self.data.append({
                "id": record.id,
                "seq": encode_seq(record.seq, max_len)
            })

        self.df = pd.DataFrame(self.data)

        if datatype == "train":
            print("Loading targets...")
            labels = np.load(targets_file)
            # Labels are matched to sequences purely by position.
            # NOTE(review): this assumes the .npy rows are in FASTA order - confirm
            # against the script that produced the label file.
            if len(labels) < len(self.data):
                raise ValueError(
                    f"targets file has {len(labels)} rows but the FASTA file has "
                    f"{len(self.data)} sequences"
                )
            self.labels = labels[:len(self.data)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Index the record list directly; going through df.iloc[index] builds a
        # pandas Series for every sample, which is needless per-item overhead.
        record = self.data[index]
        seq_tensor = torch.tensor(record["seq"], dtype=torch.long)

        if self.datatype == "train":
            target = torch.tensor(self.labels[index], dtype=torch.float)
            return seq_tensor, target
        return seq_tensor, record["id"]

# --- 3. MODEL ARCHITECTURE: 1D CONVOLUTIONAL NEURAL NETWORK (CNN) ---

In [6]:
class CNN1D(nn.Module):
    """Two-stage 1D CNN over learned amino-acid embeddings.

    Args:
        num_classes: Width of the output layer (one logit per label).
        vocab_size: Number of rows in the embedding table; index 0 is padding.
        embed_dim: Size of each embedding vector.

    ``forward`` takes integer token ids shaped ``[batch, seq_len]`` and returns
    raw logits shaped ``[batch, num_classes]`` (no sigmoid is applied).
    """

    def __init__(self, num_classes, vocab_size=21, embed_dim=128):
        super().__init__()

        def conv_stage(channels_in, channels_out, width):
            # Conv -> BatchNorm -> ReLU -> halve the length; padding=width//2
            # keeps the length unchanged through the convolution itself.
            return nn.Sequential(
                nn.Conv1d(in_channels=channels_in, out_channels=channels_out,
                          kernel_size=width, padding=width // 2),
                nn.BatchNorm1d(channels_out),
                nn.ReLU(),
                nn.MaxPool1d(2),
            )

        # Learned lookup table from residue code to vector.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )

        # Convolutional feature extractor applied to the embedded sequence.
        self.conv_block1 = conv_stage(embed_dim, 32, 9)
        self.conv_block2 = conv_stage(32, 64, 7)

        # Classification head: global max over positions, then a small MLP.
        head_layers = [
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
            nn.Linear(64, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        # [batch, seq_len] -> [batch, seq_len, embed_dim]
        embedded = self.embedding(x)
        # Conv1d expects channels first: [batch, embed_dim, seq_len]
        features = embedded.transpose(1, 2)
        for stage in (self.conv_block1, self.conv_block2):
            features = stage(features)
        return self.fc(features)

# --- 4. MODEL TRAINING (SỬA ĐỔI CHO SEQUENCE CNN) ---

In [None]:
# --- 4. MODEL TRAINING (ADAPTED FOR THE SEQUENCE CNN) ---

def train_model(train_size=0.9, seed=42):
    """Train the 1D-CNN on the training FASTA and return the best model.

    After every epoch the validation macro-F1 is evaluated at several decision
    thresholds; the weights of the epoch with the highest F1 are checkpointed
    and reloaded before returning.

    Args:
        train_size: Fraction of the dataset used for training (strictly between
            0 and 1); the remainder is the validation split.
        seed: Seed for the train/validation split so runs are comparable.
            Pass ``None`` to split with the global RNG (the previous behaviour).

    Returns:
        ``(model, best_threshold)``: the model loaded with the best checkpoint
        and the decision threshold that produced the best validation F1.

    Raises:
        ValueError: if ``train_size`` is out of range or either split is empty.
    """
    if not 0.0 < train_size < 1.0:
        raise ValueError(f"train_size must be strictly between 0 and 1, got {train_size!r}")

    # 1. File locations (the train_targets_top500 dataset must be present under input/)
    targets_path = os.path.join(ROOT, "input", "train_targets_top500", "train_targets_top500.npy")
    checkpoint_path = os.path.join(ROOT, "models", "cnn_best.pth")
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    # 2. Build the dataset straight from the FASTA file
    print("Initializing Dataset...")
    train_dataset = ProteinSequenceDataset(
        datatype="train",
        fasta_file=Path(config.TRAIN_FASTA),
        targets_file=targets_path
    )

    # 3. Train/validation split
    train_len = int(len(train_dataset) * train_size)
    val_len = len(train_dataset) - train_len
    if train_len == 0 or val_len == 0:
        # An empty validation split would otherwise crash later in torch.cat([]).
        raise ValueError(
            f"train_size={train_size!r} leaves an empty split "
            f"(train={train_len}, val={val_len}) for {len(train_dataset)} samples"
        )
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_set, val_set = random_split(train_dataset, [train_len, val_len], **split_kwargs)

    train_dataloader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    val_dataloader = DataLoader(val_set, batch_size=config.batch_size, shuffle=False)

    # 4. Model: vocab_size=21 is the 20 amino acids plus the padding index
    model = CNN1D(num_classes=config.num_labels, vocab_size=21, embed_dim=128).to(config.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    # mode='max' because the monitored quantity is F1 (higher is better)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

    loss_fn = torch.nn.BCEWithLogitsLoss()

    # The metric stays on the CPU: validation outputs are gathered there so the
    # threshold sweep does not consume GPU memory.
    f1_metric = MultilabelF1Score(num_labels=config.num_labels, average='macro')
    thresholds = np.arange(0.1, 0.51, 0.05)

    print(f"STARTING TRAINING on {config.device}...")

    # Start below any reachable F1 so the first epoch always writes a checkpoint.
    # Starting at 0.0 meant a run whose F1 never rose above 0 saved nothing and
    # then loaded a missing, or stale, cnn_best.pth at the end.
    best_val_f1 = -1.0
    best_threshold = 0.2  # fallback, only returned when no epoch runs
    checkpoint_saved = False

    for epoch in range(config.n_epochs):
        print(f"\nEPOCH {epoch+1}/{config.n_epochs}")

        # --- TRAIN PHASE ---
        model.train()
        total_train_loss = 0
        for seqs, targets in tqdm(train_dataloader, desc="Training"):
            seqs, targets = seqs.to(config.device), targets.to(config.device)

            optimizer.zero_grad()
            preds_logits = model(seqs)
            loss = loss_fn(preds_logits, targets)

            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"  Average Training Loss: {avg_train_loss:.4f}")

        # --- VALIDATION PHASE ---
        model.eval()
        val_probs = []
        val_targets = []
        with torch.no_grad():
            for seqs, targets in tqdm(val_dataloader, desc="Validation"):
                seqs = seqs.to(config.device)
                val_probs.append(torch.sigmoid(model(seqs)).cpu())
                val_targets.append(targets)

        all_val_preds = torch.cat(val_probs)
        all_val_targets = torch.cat(val_targets).int()

        # Sweep the decision threshold and keep the best F1 of this epoch.
        # Starting at -1.0 guarantees a candidate threshold is always selected
        # (the first one on ties), never the placeholder 0.
        best_f1_epoch = -1.0
        best_thresh_epoch = float(thresholds[0])
        for thresh in thresholds:
            f1_metric.threshold = float(thresh)
            f1 = f1_metric(all_val_preds, all_val_targets).item()
            # forward() also accumulates into the metric's running state; clear it
            # so results from different thresholds are never mixed together.
            f1_metric.reset()
            if f1 > best_f1_epoch:
                best_f1_epoch = f1
                best_thresh_epoch = float(thresh)

        print(f"  Val F1: {best_f1_epoch:.4f} (Threshold: {best_thresh_epoch:.2f})")

        scheduler.step(best_f1_epoch)

        if best_f1_epoch > best_val_f1:
            best_val_f1 = best_f1_epoch
            best_threshold = best_thresh_epoch
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            print(f"  ‚≠ê New best model saved!")

    print("\nTRAINING FINISHED")
    print(f"Highest Validation F1-Score: {max(best_val_f1, 0.0):.4f}")

    # Only reload a checkpoint this run actually wrote.
    if checkpoint_saved:
        model.load_state_dict(
            torch.load(checkpoint_path, map_location=config.device, weights_only=True)
        )
    return model, best_threshold

# Run training (no ESM2 arguments are needed any more)
cnn_model, best_threshold = train_model()

Initializing Dataset...
Loading sequences from E:\CAFA-6-Protein-Function-Prediction\input\cafa-6-protein-function-prediction\Train\train_sequences.fasta...
Loading targets...
STARTING TRAINING on cuda...

EPOCH 1/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [00:49<00:00, 93.35it/s] 


  Average Training Loss: 0.1875


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:02<00:00, 242.94it/s]


  Val F1: 0.0459 (Threshold: 0.10)
  ‚≠ê New best model saved!

EPOCH 2/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [03:50<00:00, 20.13it/s] 


  Average Training Loss: 0.1856


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:12<00:00, 40.00it/s]


  Val F1: 0.0498 (Threshold: 0.10)
  ‚≠ê New best model saved!

EPOCH 3/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [06:59<00:00, 11.04it/s]


  Average Training Loss: 0.1853


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:12<00:00, 40.16it/s]


  Val F1: 0.0534 (Threshold: 0.10)
  ‚≠ê New best model saved!

EPOCH 4/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [07:03<00:00, 10.95it/s]


  Average Training Loss: 0.1853


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:12<00:00, 40.14it/s]


  Val F1: 0.0494 (Threshold: 0.10)

EPOCH 5/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [06:57<00:00, 11.10it/s]


  Average Training Loss: 0.1850


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:12<00:00, 40.20it/s]


  Val F1: 0.0515 (Threshold: 0.10)

EPOCH 6/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [06:56<00:00, 11.12it/s]


  Average Training Loss: 0.1848


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:12<00:00, 40.26it/s]


  Val F1: 0.0456 (Threshold: 0.10)

EPOCH 7/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [07:00<00:00, 11.03it/s]


  Average Training Loss: 0.1844


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:12<00:00, 40.19it/s]


  Val F1: 0.0492 (Threshold: 0.10)

EPOCH 8/8


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4636/4636 [06:58<00:00, 11.09it/s]


  Average Training Loss: 0.1841


Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 516/516 [00:12<00:00, 40.31it/s]


  Val F1: 0.0532 (Threshold: 0.10)

TRAINING FINISHED
Highest Validation F1-Score: 0.0534


  torch.load(f"{ROOT}/models/cnn_best.pth", map_location=config.device)


# --- 5. GENERATING PREDICTIONS ---

In [8]:
# --- 5. GENERATING PREDICTIONS (UPDATED FOR THE SEQUENCE CNN) ---

def predict(model, threshold):
    """Run the model on the test FASTA and collect predictions above a threshold.

    Args:
        model: Trained network returning one logit per label for a batch of
            encoded sequences; it is expected to already be on ``config.device``.
        threshold: Only (protein, term) pairs whose sigmoid probability is
            strictly greater than this value are kept.

    Returns:
        DataFrame with columns ``Id``, ``GO term`` and ``Confidence``, one row
        per kept prediction, in test-file order. The columns are present even
        when no prediction passes the threshold.

    Raises:
        ValueError: if the model emits more outputs than there are label names.
    """
    columns = ["Id", "GO term", "Confidence"]

    # 1. Test sequences: use the path already defined in config
    test_fasta_path = config.TEST_FASTA

    print(f"Loading Test Data from: {test_fasta_path}")

    # 2. Dataset / loader (no shuffling, so output follows file order)
    test_dataset = ProteinSequenceDataset(datatype="test", fasta_file=test_fasta_path)
    test_dataloader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)

    model.eval()

    # 3. Label names: the most frequent GO terms in the training annotations.
    # NOTE(review): this assumes the model's output column i corresponds to the
    # i-th most frequent term, i.e. that the training target array was built with
    # the same ordering (including tie order) - confirm against its generator.
    labels_df = pd.read_csv(config.TRAIN_LABELS, sep="\t")
    top_terms = labels_df.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = top_terms.head(config.num_labels).index.values

    print("\nGENERATING PREDICTIONS FOR THE TEST SET...")

    # Per-batch arrays are collected and concatenated once at the end; building
    # one Python dict per prediction does not scale to millions of rows.
    id_chunks, term_chunks, conf_chunks = [], [], []
    with torch.no_grad():
        for seqs, ids in tqdm(test_dataloader, desc="Predicting"):
            seqs = seqs.to(config.device)

            preds_logits = model(seqs)
            preds_probs = torch.sigmoid(preds_logits).cpu().numpy()

            if preds_probs.shape[1] > len(labels_names):
                raise ValueError(
                    f"model produced {preds_probs.shape[1]} outputs but only "
                    f"{len(labels_names)} label names are available"
                )

            # np.nonzero walks the mask row by row, so rows come out grouped by
            # protein and, within a protein, by ascending label index.
            row_idx, term_idx = np.nonzero(preds_probs > threshold)
            if row_idx.size == 0:
                continue

            batch_ids = np.asarray(ids, dtype=object)
            id_chunks.append(batch_ids[row_idx])
            term_chunks.append(labels_names[term_idx])
            # Widen to float64 (exact for float32 inputs) so the column has the
            # same dtype as scores read from TSV files it is later merged with.
            conf_chunks.append(preds_probs[row_idx, term_idx].astype(np.float64))

    if id_chunks:
        submission_df = pd.DataFrame(
            {
                "Id": np.concatenate(id_chunks),
                "GO term": np.concatenate(term_chunks),
                "Confidence": np.concatenate(conf_chunks),
            },
            columns=columns,
        )
    else:
        # Keep the schema so downstream rename/merge steps still find their keys.
        submission_df = pd.DataFrame(columns=columns)

    print("PREDICTIONS COMPLETE.")
    return submission_df

# Run prediction with the CNN that was just trained
# (this replaces the earlier ESM2-based model variable)
submission_df = predict(cnn_model, best_threshold)

Loading Test Data from: E:\CAFA-6-Protein-Function-Prediction\input\cafa-6-protein-function-prediction/Test/testsuperset.fasta
Loading sequences from E:\CAFA-6-Protein-Function-Prediction\input\cafa-6-protein-function-prediction/Test/testsuperset.fasta...

GENERATING PREDICTIONS FOR THE TEST SET...


Predicting: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14020/14020 [07:43<00:00, 30.27it/s]


PREDICTIONS COMPLETE.


# --- 6. SUBMISSION FILE GENERATION ---

In [9]:
import os
import pandas as pd

# --- 1. PATHS ---
# Both paths are built from ROOT with os.path.join so the separators are
# consistent with the directories configured in the first cell.
blast_path = os.path.join(ROOT, "input", "blast-quick-sprof-zero-pred", "submission.tsv")
output_path = os.path.join(ROOT, "output", "submission_2.tsv")

print("\n--- B·∫ÆT ƒê·∫¶U T·∫†O FILE SUBMISSION (CNN + BLAST FILL) ---")

# CNN predictions come from the `submission_df` variable produced by the cell above.
# If it has no columns at all (nothing was predicted), substitute an empty frame
# with the expected schema so the rename/merge below does not raise KeyError.
if len(submission_df.columns) == 0:
    cnn_predictions = pd.DataFrame({
        'Id': pd.Series(dtype='object'),
        'GO term': pd.Series(dtype='object'),
        'Confidence': pd.Series(dtype='float64'),
    })
else:
    cnn_predictions = submission_df

# --- 2. LOAD DATA ---
if os.path.exists(blast_path):
    print(f"‚úÖ ƒê√£ t√¨m th·∫•y file BLAST t·∫°i: {blast_path}")

    # The BLAST file is read with header=None, i.e. it is assumed to have no header row.
    submission_blast = pd.read_csv(blast_path, sep='\t', header=None, names=['Id', 'GO term', 'Confidence_Blast'])

    # Rename the CNN score column so the two sources can be told apart after the merge.
    submission_cnn = cnn_predictions.rename(columns={'Confidence': 'Confidence_CNN'})

    # --- 3. MERGE ---
    print("‚è≥ ƒêang tr·ªôn k·∫øt qu·∫£ (Merge)...")
    # Outer join keeps every (protein, GO term) pair that appears in either source.
    subs = pd.merge(submission_cnn, submission_blast, on=['Id', 'GO term'], how='outer')

    # Rule: prefer the CNN score and fall back to BLAST only where the CNN has none,
    # so this file keeps the CNN's characteristics for later ensembling.
    subs['Confidence_Final'] = subs['Confidence_CNN'].fillna(subs['Confidence_Blast'])

    # Keep only the three submission columns.
    final_df = subs[['Id', 'GO term', 'Confidence_Final']]

else:
    print("‚ö†Ô∏è C·∫¢NH B√ÅO: Kh√¥ng t√¨m th·∫•y file BLAST! Ch·ªâ l∆∞u k·∫øt qu·∫£ thu·∫ßn c·ªßa CNN.")
    final_df = cnn_predictions.rename(columns={'Confidence': 'Confidence_Final'})

# --- 4. SAVE ---
# Make sure the destination directory exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Written without index and without header row.
final_df.to_csv(output_path, sep='\t', header=False, index=False)

print("="*40)
print(f"üéâ TH√ÄNH C√îNG! File ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {output_path}")
print(f"üìä T·ªïng s·ªë d·ª± ƒëo√°n: {len(final_df)}")
print("="*40)


--- B·∫ÆT ƒê·∫¶U T·∫†O FILE SUBMISSION (CNN + BLAST FILL) ---
‚úÖ ƒê√£ t√¨m th·∫•y file BLAST t·∫°i: E:\CAFA-6-Protein-Function-Prediction/input/blast-quick-sprof-zero-pred/submission.tsv
‚è≥ ƒêang tr·ªôn k·∫øt qu·∫£ (Merge)...
üéâ TH√ÄNH C√îNG! File ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: E:\CAFA-6-Protein-Function-Prediction/output/submission_2.tsv
üìä T·ªïng s·ªë d·ª± ƒëo√°n: 28968486
