Đây là kiến trúc CNN chúng ta sẽ xây dựng (dựa trên mô tả của bài báo):

Input: Sequence dạng One-Hot Encoding (Ma trận 21 x Độ dài chuỗi).

Conv Layers: Nhiều bộ lọc với kích thước khác nhau (ví dụ: quét 8, 16, 24, 32 axit amin cùng lúc) để bắt các motif dài ngắn khác nhau.

Pooling: Max Pooling để chỉ giữ lại tín hiệu mạnh nhất của motif.

Output: Dự đoán nhãn.

## One-hot encoding

In [1]:
import torch
import torch.nn as nn
import torch.utils.data as data
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
import gc

# --- CONFIGURATION ---
# Input data locations and the path the trained weights are saved to.
TRAIN_FASTA = "/workspace/data/Train/train_sequences.fasta"
TRAIN_TERMS = "/workspace/data/Train/train_terms.tsv"
TEST_FASTA  = "/workspace/data/Test/testsuperset.fasta"
MODEL_PATH  = "deepgoplus_model.pth"

# Every sequence is truncated / padded to this fixed length. DeepGOPlus
# typically uses a longer window (e.g. 2000); 1000 keeps things lighter.
MAX_LEN = 1000
# A CNN is much lighter than a Transformer, so a larger batch is affordable.
BATCH_SIZE = 64
# The 20 standard amino acids, in the order that defines their integer ids.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

# Residue letter -> integer id in 1..20; id 0 is reserved for padding.
aa_to_id = {residue: idx for idx, residue in enumerate(AMINO_ACIDS, start=1)}

def get_one_hot_data(fasta_file, ids_filter=None):
    """Read a FASTA file and integer-encode each sequence to a fixed length.

    Despite the name, no one-hot matrix is produced: every residue is mapped
    to its integer id through ``aa_to_id`` and the model's embedding layer
    consumes those ids directly.

    Args:
        fasta_file: Path (or handle) passed to ``SeqIO.parse`` in "fasta" mode.
        ids_filter: Optional container of protein ids; when given, records
            whose id is not in it are skipped.

    Returns:
        A tuple ``(encoded, ids)`` where ``encoded`` is an ``int32`` array of
        shape ``(n_records, MAX_LEN)`` and ``ids`` is the list of protein ids
        in the same row order. Residues not present in ``aa_to_id`` and the
        padding positions are both encoded as 0.
    """
    sequences = []
    ids = []
    print(f"‚è≥ Encoding {fasta_file}...")

    for record in tqdm(SeqIO.parse(fasta_file, "fasta")):
        # Headers of the form "db|ACCESSION|name" carry the id in field 1;
        # otherwise the whole record id is used.
        raw_id = str(record.id)
        pid = raw_id.split("|")[1] if "|" in raw_id else raw_id

        if ids_filter is not None and pid not in ids_filter:
            continue

        # Upper-case before the lookup so lower-case residues are not
        # silently encoded as padding, then truncate to the fixed window.
        seq = str(record.seq).upper()[:MAX_LEN]
        encoded_seq = [aa_to_id.get(aa, 0) for aa in seq]
        # Right-pad with zeros up to MAX_LEN (no-op when already full length).
        encoded_seq += [0] * (MAX_LEN - len(encoded_seq))

        sequences.append(encoded_seq)
        ids.append(pid)

    # The explicit reshape keeps the result 2-D even when no record was kept,
    # where a bare np.array([]) would have shape (0,).
    encoded = np.array(sequences, dtype=np.int32).reshape(len(sequences), MAX_LEN)
    return encoded, ids

# --- LOAD DATA ---
# 1. Encoded training sequences and their protein ids (row order of X).
X_train_seq, train_ids = get_one_hot_data(TRAIN_FASTA)

# 2. Labels (Y): restricted to the TOP_N most frequent terms.
print("‚è≥ Processing Labels...")
train_terms = pd.read_csv(TRAIN_TERMS, sep="\t", usecols=["EntryID", "term"])
TOP_N = 1500
term_counts = train_terms["term"].value_counts()
top_terms = term_counts.head(TOP_N).index.tolist()
# Column index of every kept term / row index of every training protein.
term_to_idx = dict(zip(top_terms, range(len(top_terms))))
id_map = dict(zip(train_ids, range(len(train_ids))))

# Multi-hot target matrix: one row per protein, one column per kept term.
Y_train = np.zeros((len(train_ids), TOP_N), dtype=np.float32)

# Group the kept annotations per protein, then switch on that protein's columns.
is_top_term = train_terms["term"].isin(top_terms)
grouped = train_terms[is_top_term].groupby("EntryID")["term"].apply(list)
for pid, terms in tqdm(grouped.items()):
    row = id_map.get(pid)
    if row is None:
        # Annotated protein that has no sequence in the FASTA file.
        continue
    Y_train[row, [term_to_idx[t] for t in terms]] = 1.0

print(f"‚úÖ Data Ready: X={X_train_seq.shape}, Y={Y_train.shape}")

‚è≥ Encoding /workspace/data/Train/train_sequences.fasta...


82404it [00:01, 49812.91it/s]


‚è≥ Processing Labels...


76297it [00:00, 304256.58it/s]

‚úÖ Data Ready: X=(82404, 1000), Y=(82404, 1500)





## Build model

In [2]:
class DeepGOPlus(nn.Module):
    """Multi-kernel 1-D CNN over embedded, integer-encoded sequences.

    The input ids are embedded, passed through several parallel ``Conv1d``
    layers with different kernel sizes, globally max-pooled over the sequence
    axis, concatenated, and mapped to ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
        vocab_size: Number of distinct input ids (id 0 is the padding id).
        embedding_dim: Size of each embedding vector.
        num_filters: Output channels of every convolution.
        kernel_sizes: Iterable of convolution kernel widths; one parallel
            convolution is created per entry.
    """

    def __init__(self, num_classes, vocab_size=21, embedding_dim=128, num_filters=512, kernel_sizes=(8, 16, 24, 32)):
        super().__init__()
        # Materialise once so any iterable (list, tuple, generator) works and
        # the default is not a shared mutable object.
        kernel_sizes = tuple(kernel_sizes)

        # Embedding: turns integer ids into dense vectors; id 0 is padding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Parallel convolutions (multi-kernel): each kernel size picks up
        # motifs of a different length.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])

        # Final classification layer over the concatenated pooled features.
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        self.dropout = nn.Dropout(0.5)  # regularisation against overfitting

    def forward(self, x):
        """Return raw logits of shape ``[batch, num_classes]``.

        Args:
            x: Integer tensor of shape ``[batch, seq_len]``. The convolutions
                use no padding, so ``seq_len`` must be at least the largest
                kernel size.

        Returns:
            Logits (no sigmoid applied).
        """
        x = self.embedding(x)   # -> [batch, seq_len, emb_dim]
        x = x.permute(0, 2, 1)  # -> [batch, emb_dim, seq_len]; Conv1d wants channels second

        # Each branch: convolution -> ReLU -> global max pooling over the
        # sequence axis, keeping the strongest response of every filter.
        pooled = [torch.relu(conv(x)).max(dim=2).values for conv in self.convs]

        # Concatenate the per-kernel features, regularise, classify.
        out = torch.cat(pooled, dim=1)
        out = self.dropout(out)
        return self.fc(out)

# Initialisation: pick the device, then build the model, loss and optimiser.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = DeepGOPlus(num_classes=TOP_N)
model = model.to(device)
# The model returns raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

print("‚úÖ Model CNN (DeepGOPlus style) ƒë√£ s·∫µn s√†ng chi·∫øn ƒë·∫•u!")

‚úÖ Model CNN (DeepGOPlus style) ƒë√£ s·∫µn s√†ng chi·∫øn ƒë·∫•u!


## Train

In [3]:
# Split into train / validation sets (90% / 10%, fixed seed).
from sklearn.model_selection import train_test_split
X_tr, X_val, Y_tr, Y_val = train_test_split(X_train_seq, Y_train, test_size=0.1, random_state=42)

# Datasets and loaders: ids as int64 for the embedding, targets as float32.
train_ds = data.TensorDataset(torch.from_numpy(X_tr).long(), torch.from_numpy(Y_tr).float())
val_ds = data.TensorDataset(torch.from_numpy(X_val).long(), torch.from_numpy(Y_val).float())
train_loader = data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = data.DataLoader(val_ds, batch_size=BATCH_SIZE)

print("üöÄ B·∫Øt ƒë·∫ßu Train CNN...")
# Track the weights with the lowest validation loss: validation loss can
# start rising again while training loss keeps falling, so the last epoch
# is not necessarily the best model.
best_val_loss = float("inf")
best_state = None
for epoch in range(10):  # quick run: 10 epochs
    model.train()
    total_loss = 0
    for bx, by in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
        bx, by = bx.to(device), by.to(device)
        optimizer.zero_grad()
        outputs = model(bx)
        loss = criterion(outputs, by)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation pass: dropout disabled, no gradients.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for bx, by in val_loader:
            bx, by = bx.to(device), by.to(device)
            val_loss += criterion(model(bx), by).item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}: Train Loss = {total_loss/len(train_loader):.4f} | Val Loss = {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Clone to CPU so later optimiser steps cannot modify the snapshot.
        best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

# Restore the best-validation weights so both the saved file and the
# in-memory model used afterwards hold the best checkpoint.
if best_state is not None:
    model.load_state_dict(best_state)

# Save the model weights.
torch.save(model.state_dict(), MODEL_PATH)
print("‚úÖ Train xong & ƒê√£ l∆∞u model!")

üöÄ B·∫Øt ƒë·∫ßu Train CNN...


                                                            

Epoch 1: Train Loss = 0.0173 | Val Loss = 0.0153


                                                            

Epoch 2: Train Loss = 0.0156 | Val Loss = 0.0149


                                                            

Epoch 3: Train Loss = 0.0153 | Val Loss = 0.0146


                                                            

Epoch 4: Train Loss = 0.0150 | Val Loss = 0.0144


                                                            

Epoch 5: Train Loss = 0.0147 | Val Loss = 0.0145


                                                            

Epoch 6: Train Loss = 0.0144 | Val Loss = 0.0144


                                                            

Epoch 7: Train Loss = 0.0141 | Val Loss = 0.0143


                                                            

Epoch 8: Train Loss = 0.0138 | Val Loss = 0.0144


                                                            

Epoch 9: Train Loss = 0.0135 | Val Loss = 0.0147


                                                             

Epoch 10: Train Loss = 0.0132 | Val Loss = 0.0149
‚úÖ Train xong & ƒê√£ l∆∞u model!


In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from tqdm import tqdm

# --- CONFIGURATION ---
TEST_FASTA = "/workspace/data/Test/testsuperset.fasta"
OUTPUT_CNN_RAW = "submission_cnn_raw.tsv"
BATCH_SIZE = 128

# 1. Encode the test set with get_one_hot_data (defined in an earlier cell).
print("‚è≥ ƒêang m√£ h√≥a One-Hot cho t·∫≠p Test (s·∫Ω m·∫•t 1-2 ph√∫t)...")
X_test_seq, test_ids = get_one_hot_data(TEST_FASTA)

# Wrap as tensors; shuffle stays off so rows keep the order of test_ids.
test_dataset = TensorDataset(torch.from_numpy(X_test_seq).long())
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"‚úÖ ƒê√£ load {len(test_ids)} protein test.")

# 2. Inference: sigmoid over the logits gives one probability per term.
#    The label list (top_terms) comes from an earlier cell.
print("üöÄ ƒêang ch·∫°y model CNN tr√™n t·∫≠p Test...")
model.eval()
output_lines = []

with torch.no_grad():
    all_probs = [
        torch.sigmoid(model(batch[0].to(device))).cpu().numpy()
        for batch in tqdm(test_loader)
    ]

# Stack the per-batch arrays into one (n_proteins, n_terms) matrix.
all_probs = np.vstack(all_probs)

# 3. Write the file, keeping at most the 50 best terms per protein.
print(f"üíæ ƒêang ghi k·∫øt qu·∫£ xu·ªëng {OUTPUT_CNN_RAW}...")
for pid, prob_row in zip(tqdm(test_ids), all_probs):
    # Indices of the 50 highest-probability terms, in ascending score order.
    top_indices = np.argsort(prob_row)[-50:]

    for idx in top_indices:
        score = prob_row[idx]
        # Only scores above 0.01 are written.
        if not score > 0.01:
            continue
        term = top_terms[idx]  # GO term name for this column
        output_lines.append(f"{pid}\t{term}\t{score:.3f}")

with open(OUTPUT_CNN_RAW, "w") as f:
    f.write("\n".join(output_lines))

print("‚úÖ ƒê√£ c√≥ file d·ª± ƒëo√°n th√¥ c·ªßa CNN!")

‚è≥ ƒêang m√£ h√≥a One-Hot cho t·∫≠p Test (s·∫Ω m·∫•t 1-2 ph√∫t)...
‚è≥ Encoding /workspace/data/Test/testsuperset.fasta...


224309it [00:04, 48922.21it/s]


‚úÖ ƒê√£ load 224309 protein test.
üöÄ ƒêang ch·∫°y model CNN tr√™n t·∫≠p Test...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1753/1753 [00:18<00:00, 93.12it/s]


üíæ ƒêang ghi k·∫øt qu·∫£ xu·ªëng submission_cnn_raw.tsv...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:07<00:00, 28310.53it/s]


‚úÖ ƒê√£ c√≥ file d·ª± ƒëo√°n th√¥ c·ªßa CNN!


In [5]:
# --- CONFIGURATION ---
INPUT_FILE = "submission_cnn_raw.tsv"
OUTPUT_FILE = "submission_cnn_propagated.tsv"

# (Requires obonet: pip install obonet networkx)
import obonet
import networkx
import os

# Load the ontology, falling back to the copy under Train/ when the
# top-level file is absent.
OBO_PATH = "/workspace/data/go-basic.obo"
if not os.path.exists(OBO_PATH):
    OBO_PATH = "/workspace/data/Train/go-basic.obo"

print(f"üìñ Reading OBO: {OBO_PATH}")
graph = obonet.read_obo(OBO_PATH)
# For every term, the set of nodes reachable from it in the graph.
# NOTE(review): treating these as ancestors relies on obonet's edge direction
# (child -> parent); the graph holds every relationship type present in the
# OBO file - confirm that is the intended propagation rule.
ancestors_map = {}
for node in graph.nodes():
    try:
        ancestors_map[node] = networkx.descendants(graph, node)
    except networkx.NetworkXError:
        # Only the "node not in graph" error is tolerated; anything else
        # should surface instead of being silently swallowed.
        continue

# Read the raw predictions into {protein: {term: score}}.
print("üöÄ Propagating scores...")
preds = {}
with open(INPUT_FILE) as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        p, t, s = line.split('\t')
        preds.setdefault(p, {})[t] = float(s)

final_lines = []
for pid, scores in tqdm(preds.items()):
    # Each reachable term receives the maximum score among the predicted
    # terms that reach it (and its own score, if it had one).
    new_scores = scores.copy()
    for term, score in scores.items():
        if term in ancestors_map:
            for parent in ancestors_map[term]:
                new_scores[parent] = max(new_scores.get(parent, 0.0), score)

    # Keep the 70 highest-scoring terms per protein, dropping scores <= 0.01.
    sorted_terms = sorted(new_scores.items(), key=lambda x: x[1], reverse=True)[:70]
    for term, score in sorted_terms:
        if score > 0.01:
            final_lines.append(f"{pid}\t{term}\t{score:.3f}")

with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(final_lines))
print(f"‚úÖ ƒê√£ xong file CNN chu·∫©n: {OUTPUT_FILE}")

üìñ Reading OBO: /workspace/data/Train/go-basic.obo
üöÄ Propagating scores...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224305/224305 [00:12<00:00, 17964.95it/s]


‚úÖ ƒê√£ xong file CNN chu·∫©n: submission_cnn_propagated.tsv


In [6]:
# --- CONFIGURATION ---
# 1. Your best KNN submission file (the one that scored 0.256).
#    Replace this with the name of the best file you currently have;
#    if it was saved under another name (e.g. submission_v11_B_bold.tsv),
#    change it here.
FILE_KNN = "submission_level11_IA_official.tsv"

# 2. The CNN file produced by the propagation step.
FILE_CNN = "submission_cnn_propagated.tsv"

# 3. Final submission file.
# NOTE(review): the file name mentions 0.35 while W_CNN below is 0.40 -
# confirm which value is intended.
OUTPUT_ENSEMBLE = "submission_FINAL_FUSION_0.35.tsv"

# Ensemble weights.
W_KNN = 0.60
W_CNN = 0.40


def _load_predictions(path, desc):
    """Parse a 3-column TSV (protein, term, score) into {(protein, term): score}."""
    scores = {}
    with open(path) as f:
        for line in tqdm(f, desc=desc):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            p, t, s = line.split('\t')
            scores[(p, t)] = float(s)
    return scores


print("‚è≥ ƒêang load 2 file ƒë·ªÉ h·ª£p th·ªÉ...")
preds_knn = _load_predictions(FILE_KNN, "Loading KNN")
preds_cnn = _load_predictions(FILE_CNN, "Loading CNN")

# Blend the two prediction sets.
print("üöÄ FUSION: Mixing KNN (Sequence Sim) & CNN (Motif)...")
# Union of the (protein, term) pairs in a reproducible order: KNN pairs in
# file order, then the CNN-only pairs. Iterating a set of string tuples
# instead would give a different line order on every run.
all_keys = list(preds_knn)
all_keys.extend(key for key in preds_cnn if key not in preds_knn)
output_lines = []

for key in tqdm(all_keys):
    pid, term = key
    # A pair predicted by only one model counts as 0.0 for the other.
    s1 = preds_knn.get(key, 0.0)
    s2 = preds_cnn.get(key, 0.0)

    # Weighted average of the two scores.
    final_score = (s1 * W_KNN) + (s2 * W_CNN)

    if final_score > 0.01:
        output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

with open(OUTPUT_ENSEMBLE, "w") as f:
    f.write("\n".join(output_lines))

print(f"üéâ XONG! File '{OUTPUT_ENSEMBLE}' l√† ni·ªÅm hy v·ªçng cu·ªëi c√πng.")
print("üëâ H√£y n·ªôp file n√†y. S·ª± k·∫øt h·ª£p gi·ªØa Global (ESM2) v√† Local (CNN) th∆∞·ªùng ƒë·∫©y ƒëi·ªÉm l√™n r·∫•t cao!")

‚è≥ ƒêang load 2 file ƒë·ªÉ h·ª£p th·ªÉ...


Loading KNN: 14386016it [00:08, 1712368.83it/s]
Loading CNN: 14009920it [00:08, 1708294.71it/s]


üöÄ FUSION: Mixing KNN (Sequence Sim) & CNN (Motif)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23460696/23460696 [00:27<00:00, 859221.14it/s]


üéâ XONG! File 'submission_FINAL_FUSION_0.35.tsv' l√† ni·ªÅm hy v·ªçng cu·ªëi c√πng.
üëâ H√£y n·ªôp file n√†y. S·ª± k·∫øt h·ª£p gi·ªØa Global (ESM2) v√† Local (CNN) th∆∞·ªùng ƒë·∫©y ƒëi·ªÉm l√™n r·∫•t cao!
