In [1]:
# ============================================================================

# Section 1: C·∫§U H√åNH & IMPORT

# ============================================================================

import numpy as np

import pandas as pd

import os

import torch

import torch.nn as nn

import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import gc

import warnings

!pip install goatools

warnings.filterwarnings('ignore')

EMBED_DIR = '/kaggle/input/cafa-6-t5-embeddings' 

TRAIN_TERMS_PATH = '/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv'

OBO_PATH = '/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo'



# --- HYPERPARAMETERS ---

BATCH_SIZE = 512       

EPOCHS = 25           

LEARNING_RATE = 1e-3   

NUM_LABELS = 1500   

SEEDS = [42, 2024, 123, 777, 888] # Ensemble 5 m√¥ h√¨nh



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"üîß Device: {device}")

print("üöÄ Strategy: T5 Embeddings Only + MLP Multi-label")

Collecting goatools
  Downloading goatools-1.5.2-py3-none-any.whl.metadata (14 kB)
Collecting docopt-ng (from goatools)
  Downloading docopt_ng-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting ftpretty (from goatools)
  Downloading ftpretty-0.4.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting xlsxwriter (from goatools)
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading goatools-1.5.2-py3-none-any.whl (15.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m15.8/15.8 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading docopt_ng-0.9.0-py3-none-any.whl (16 kB)
Downloading ftpretty-0.4.0-py2.py3-none-any.whl (8.2 kB)
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m175.3/175.3 kB[0m [31

In [2]:
# ============================================================================

# Section 2: LOAD D·ªÆ LI·ªÜU & CHU·∫®N H√ìA (STANDARD SCALER)

# ============================================================================

print("\n" + "="*60)

print("üìÇ B∆Ø·ªöC 1: LOAD D·ªÆ LI·ªÜU T5 & CHU·∫®N H√ìA")

print("="*60)



# H√†m l√†m s·∫°ch ID ƒë·ªÉ kh·ªõp gi·ªØa c√°c file

def clean_id(pid):

    if isinstance(pid, bytes): pid = pid.decode('utf-8')

    pid = str(pid).strip()

    if '|' in pid: 

        parts = pid.split('|')

        return parts[1] if len(parts) >= 2 else pid

    return pid

clean_func = np.vectorize(clean_id)



# 1. Load T5 Embeddings (Train)

print("1Ô∏è‚É£ Loading T5 Train Embeddings...")

try:

    X_raw = np.load(os.path.join(EMBED_DIR, 'train_embeds.npy'))

    ids = clean_func(np.load(os.path.join(EMBED_DIR, 'train_ids.npy'))) 

    print(f"   ‚úì Loaded Shape: {X_raw.shape}")

except Exception as e:

    print(f"‚ùå Error loading T5: {e}"); exit()



# 2. Load Labels

print("2Ô∏è‚É£ Loading Labels...")

df_terms = pd.read_csv(TRAIN_TERMS_PATH, sep='\t')

if 'EntryID' not in df_terms.columns:

    df_terms = pd.read_csv(TRAIN_TERMS_PATH, sep='\t', header=None, names=['EntryID', 'term', 'aspect'])

df_terms['EntryID'] = df_terms['EntryID'].apply(clean_id)



# 3. Align Data (T√¨m ID chung)

print("3Ô∏è‚É£ Aligning Data...")

common_ids = sorted(list(set(ids) & set(df_terms['EntryID'])))

print(f"   ‚úì S·ªë l∆∞·ª£ng protein d√πng ƒë·ªÉ train: {len(common_ids)}")



if len(common_ids) == 0: raise ValueError("‚ùå Kh√¥ng t√¨m th·∫•y ID chung!")



# Map ID -> Index

id_map = {pid: i for i, pid in enumerate(ids)}

final_map = {pid: i for i, pid in enumerate(common_ids)}



# T·∫°o ma tr·∫≠n X, y

INPUT_DIM = X_raw.shape[1]

X_all = np.zeros((len(common_ids), INPUT_DIM), dtype=np.float32)

y_all = np.zeros((len(common_ids), NUM_LABELS), dtype=np.float32)



print("   -> Building Feature Matrix X...")

for i, pid in enumerate(common_ids):

    X_all[i] = X_raw[id_map[pid]]



print("   -> Building Label Matrix y (Multi-label)...")

top_terms = df_terms['term'].value_counts().head(NUM_LABELS).index.tolist()

term2idx = {t: i for i, t in enumerate(top_terms)}
idx2term = {i: t for i, t in enumerate(top_terms)}



relevant = df_terms[df_terms['EntryID'].isin(common_ids) & df_terms['term'].isin(top_terms)]

for row in relevant.itertuples(index=False):

    pid = getattr(row, 'EntryID', row[0])

    term = getattr(row, 'term', row[1])

    if pid in final_map and term in term2idx:

        y_all[final_map[pid], term2idx[term]] = 1.0



# 4. CHU·∫®N H√ìA D·ªÆ LI·ªÜU

print("4Ô∏è‚É£ Applying StandardScaler...")

scaler = StandardScaler()

X_all = scaler.fit_transform(X_all) # ƒê∆∞a v·ªÅ Mean=0, Std=1

print("   ‚úì Done! D·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c chu·∫©n h√≥a.")



# D·ªçn d·∫πp RAM

del X_raw, ids, df_terms, relevant

gc.collect()



print(f"‚úÖ DATA READY: X={X_all.shape}, y={y_all.shape}")


üìÇ B∆Ø·ªöC 1: LOAD D·ªÆ LI·ªÜU T5 & CHU·∫®N H√ìA
1Ô∏è‚É£ Loading T5 Train Embeddings...
   ‚úì Loaded Shape: (82404, 1024)
2Ô∏è‚É£ Loading Labels...
3Ô∏è‚É£ Aligning Data...
   ‚úì S·ªë l∆∞·ª£ng protein d√πng ƒë·ªÉ train: 82404
   -> Building Feature Matrix X...
   -> Building Label Matrix y (Multi-label)...
4Ô∏è‚É£ Applying StandardScaler...
   ‚úì Done! D·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c chu·∫©n h√≥a.
‚úÖ DATA READY: X=(82404, 1024), y=(82404, 1500)


In [3]:
# ============================================================================

# Section 3: MODEL (RESIDUAL MLP) & DATASET

# ============================================================================



class TensorDataset(Dataset):

    def __init__(self, X, y=None):

        self.X = torch.tensor(X, dtype=torch.float32)

        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None

    def __len__(self): return len(self.X)

    def __getitem__(self, idx):

        if self.y is not None: return self.X[idx], self.y[idx]

        return self.X[idx]



class ResidualBlock(nn.Module):

    def __init__(self, dim, dropout=0.4): 

        super().__init__()

        self.net = nn.Sequential(

            nn.Linear(dim, dim),

            nn.BatchNorm1d(dim), 

            nn.ReLU(),

            nn.Dropout(dropout)

        )

    def forward(self, x): return x + self.net(x)



class ResMLP(nn.Module):

    def __init__(self, inp_dim, out_dim):

        super().__init__()

        self.entry = nn.Sequential(

            nn.Linear(inp_dim, 1024),

            nn.BatchNorm1d(1024),

            nn.ReLU(),

            nn.Dropout(0.35)

        )

        self.blocks = nn.Sequential(

            ResidualBlock(1024),

            ResidualBlock(1024)

        )

        self.head = nn.Linear(1024, out_dim)

    

    def forward(self, x):

        x = self.entry(x)

        x = self.blocks(x)

        return self.head(x)

In [4]:
# ============================================================================

# Section 4: TRAINING FUNCTION (COSINE ANNEALING SCHEDULER)

# ============================================================================



def train_single_seed(seed, X_tr, y_tr, X_va, y_va, save_path):

    print(f"\nüé≤ Training Seed: {seed}")

    torch.manual_seed(seed)

    np.random.seed(seed)

    

    train_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True)

    val_loader = DataLoader(TensorDataset(X_va, y_va), batch_size=BATCH_SIZE, shuffle=False)

    

    model = ResMLP(INPUT_DIM, NUM_LABELS).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4) # Weight decay ch·ªëng overfit

    criterion = nn.BCEWithLogitsLoss()

    

    # CosineAnnealingLR: Gi√∫p model "nh·∫£y" ra kh·ªèi local minima v√† h·ªôi t·ª• t·ªët h∆°n

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-5)

    

    best_loss = float('inf')

    

    for ep in range(EPOCHS):

        model.train()

        t_loss = 0

        for fx, fy in train_loader:

            fx, fy = fx.to(device), fy.to(device)

            optimizer.zero_grad()

            out = model(fx)

            loss = criterion(out, fy)

            loss.backward()

            optimizer.step()

            t_loss += loss.item()

            

        model.eval()

        v_loss = 0

        with torch.no_grad():

            for fx, fy in val_loader:

                v_loss += criterion(model(fx.to(device)), fy.to(device)).item()

        

        avg_v = v_loss/len(val_loader)

        

        if avg_v < best_loss:

            best_loss = avg_v

            torch.save(model.state_dict(), save_path)

            

        scheduler.step() # C·∫≠p nh·∫≠t learning rate

        

        if (ep+1) % 5 == 0:

            lr_curr = optimizer.param_groups[0]['lr']

            print(f"   Ep {ep+1}/{EPOCHS}: Train={t_loss/len(train_loader):.4f} | Val={avg_v:.4f} | LR={lr_curr:.6f}")

            

    return best_loss

In [5]:
# ============================================================================

# Section 5: CH·∫†Y TRAINING (ENSEMBLE 5 SEEDS)

# ============================================================================

print("\n" + "="*60)

print("üöÄ B∆Ø·ªöC 2: TRAIN ENSEMBLE MODELS")

print("="*60)



os.makedirs('models_t5_final', exist_ok=True)

model_paths = []



for seed in SEEDS:

    X_tr, X_va, y_tr, y_va = train_test_split(X_all, y_all, test_size=0.1, random_state=seed)

    path = f'models_t5_final/model_{seed}.pth'

    train_single_seed(seed, X_tr, y_tr, X_va, y_va, path)

    model_paths.append(path)

    

    del X_tr, X_va, y_tr, y_va

    gc.collect()

    torch.cuda.empty_cache()


üöÄ B∆Ø·ªöC 2: TRAIN ENSEMBLE MODELS

üé≤ Training Seed: 42
   Ep 5/25: Train=0.0112 | Val=0.0112 | LR=0.000905
   Ep 10/25: Train=0.0102 | Val=0.0109 | LR=0.000658
   Ep 15/25: Train=0.0095 | Val=0.0107 | LR=0.000352
   Ep 20/25: Train=0.0090 | Val=0.0107 | LR=0.000105
   Ep 25/25: Train=0.0088 | Val=0.0107 | LR=0.000010

üé≤ Training Seed: 2024
   Ep 5/25: Train=0.0112 | Val=0.0113 | LR=0.000905
   Ep 10/25: Train=0.0102 | Val=0.0110 | LR=0.000658
   Ep 15/25: Train=0.0095 | Val=0.0108 | LR=0.000352
   Ep 20/25: Train=0.0090 | Val=0.0108 | LR=0.000105
   Ep 25/25: Train=0.0088 | Val=0.0107 | LR=0.000010

üé≤ Training Seed: 123
   Ep 5/25: Train=0.0112 | Val=0.0110 | LR=0.000905
   Ep 10/25: Train=0.0103 | Val=0.0107 | LR=0.000658
   Ep 15/25: Train=0.0096 | Val=0.0105 | LR=0.000352
   Ep 20/25: Train=0.0090 | Val=0.0104 | LR=0.000105
   Ep 25/25: Train=0.0088 | Val=0.0104 | LR=0.000010

üé≤ Training Seed: 777
   Ep 5/25: Train=0.0112 | Val=0.0111 | LR=0.000905
   Ep 10/25: Trai

In [6]:
# ============================================================================

# Section 6: D·ª∞ ƒêO√ÅN & H·∫¨U X·ª¨ L√ù

# ============================================================================

print("\n" + "="*60)

print("üîÆ B∆Ø·ªöC 3: D·ª∞ ƒêO√ÅN & GO PROPAGATION")

print("="*60)



# 1. Load Test Data

print("Loading Test T5 Embeddings...")

test_emb = np.load(os.path.join(EMBED_DIR, 'test_embeds.npy'))

test_ids = clean_func(np.load(os.path.join(EMBED_DIR, 'test_ids.npy')))



# 2. Normalize Test Data (D√πng scaler t·ª´ t·∫≠p train)

print("Applying StandardScaler to Test Data...")

test_X = scaler.transform(test_emb)



# 3. Predict Ensemble

print("Predicting with 5 models...")

models = [ResMLP(INPUT_DIM, NUM_LABELS).to(device) for _ in model_paths]

for i, m in enumerate(models):

    m.load_state_dict(torch.load(model_paths[i]))

    m.eval()



TEMP_FILE = 'submission_raw.tsv'

loader = DataLoader(TensorDataset(test_X), batch_size=BATCH_SIZE*2, shuffle=False)



with open(TEMP_FILE, 'w') as f:

    start_idx = 0

    with torch.no_grad():

        for fx in loader:

            fx = fx.to(device)

            bs = fx.size(0)

            

            # T√≠nh trung b√¨nh c·ªông x√°c su·∫•t c·ªßa 5 models

            avg_prob = torch.zeros((bs, NUM_LABELS)).to(device)

            for m in models:

                avg_prob += torch.sigmoid(m(fx))

            avg_prob /= len(models)

            avg_prob = avg_prob.cpu().numpy()

            

            for k in range(bs):

                pid = test_ids[start_idx + k]

                probs = avg_prob[k]

                # L·∫•y Top 60 ƒë·ªÉ d·ª± ph√≤ng cho b∆∞·ªõc propagation

                top_k = np.argsort(probs)[::-1][:60]

                

                for idx in top_k:

                    sc = probs[idx]

                    if sc > 0.001: # Threshold th·∫•p ƒë·ªÉ gi·ªØ th√¥ng tin

                        f.write(f"{pid}\t{idx2term[idx]}\t{sc:.3f}\n")

            start_idx += bs



print(f"‚úÖ Raw Predictions Saved: {TEMP_FILE}")

del test_X, models

gc.collect()



# 4. GO Hierarchy Propagation

print("Applying GO Propagation (Goatools)...")

FINAL_OUTPUT = 'submission.tsv'



try:

    from goatools.obo_parser import GODag

    if os.path.exists(OBO_PATH):

        godag = GODag(OBO_PATH)

        

        def propagate(pid, df_grp):

            scores = dict(zip(df_grp['Term'], df_grp['Score']))

            new_scores = scores.copy()

            for term, score in scores.items():

                if term in godag:

                    for parent in godag[term].get_all_parents():

                        new_scores[parent] = max(new_scores.get(parent, 0), score)

            return [[pid, t, s] for t, s in new_scores.items() if s >= 0.01]



        df = pd.read_csv(TEMP_FILE, sep='\t', names=['ProteinID', 'Term', 'Score'], dtype={'ProteinID': str})

        final_rows = []

        

        # Iterator ƒë∆°n gi·∫£n

        for pid, grp in df.groupby('ProteinID'):

            final_rows.extend(propagate(pid, grp))

            

        res_df = pd.DataFrame(final_rows, columns=['ProteinID', 'Term', 'Score'])

        res_df['Score'] = res_df['Score'].apply(lambda x: f"{x:.3f}")

        res_df.to_csv(FINAL_OUTPUT, sep='\t', index=False, header=False)

        print(f"‚úÖ DONE! Final Submission: {FINAL_OUTPUT}")

    else:

        print("‚ö†Ô∏è No OBO found. Using raw submission.")

        os.rename(TEMP_FILE, FINAL_OUTPUT)

except Exception as e:

    print(f"‚ö†Ô∏è Error in propagation: {e}. Using raw submission.")

    if os.path.exists(TEMP_FILE): os.rename(TEMP_FILE, FINAL_OUTPUT)


üîÆ B∆Ø·ªöC 3: D·ª∞ ƒêO√ÅN & GO PROPAGATION
Loading Test T5 Embeddings...
Applying StandardScaler to Test Data...
Predicting with 5 models...
‚úÖ Raw Predictions Saved: submission_raw.tsv
Applying GO Propagation (Goatools)...
/kaggle/input/cafa-6-protein-function-prediction/Train/go-basic.obo: fmt(1.2) rel(2025-06-01) 43,448 Terms
‚úÖ DONE! Final Submission: submission.tsv
