### Drawback of traditional ML approach: Kh√¥ng gi·ªØ ƒë∆∞·ª£c order meaning c·ªßa chu·ªói axit amin

#### Sol: s·ª≠ d·ª•ng m√¥ h√¨nh esm2 650M tham s·ªë ƒë·ªÉ embedding c√°c chu·ªói acid amin

## Load model

In [None]:
%pip install transformers

In [3]:
import torch
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on device: {device}")

Running on device: cuda


In [7]:
model_name = "facebook/esm2_t33_650M_UR50D"

print(f"Loading model: {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval() #read-only to save VRAM

Loading model: facebook/esm2_t33_650M_UR50D...


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-32): 33 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
          (dense): Linear(in_features=1280, out_features=5120, bias=True)
        )
        (output): EsmOut

## Feature extraction

In [8]:
def extract_embeddings(fasta_path, save_name, batch_size=8, limit=None):
    """
    Read fasta file -> Run through ESM2 -> output: .npy
    save_name: output file
    """
    ids = []
    sequences = []

    print(f"Reading file: {fasta_path}")
    for i, record in enumerate(SeqIO.parse(fasta_path, "fasta")):
        if limit and i >= limit: break

        #clean id
        pid = str(record.id)
        if "|" in pid:
            pid = pid.split("|")[1]

        ids.append(pid)
        #esm2 limit 1024 token
        sequences.append(str(record.seq)[:1022])

    print(f"{len(sequences)} Proteins")

    #batching
    embeddings = []
    print("Creating embeddings...")

    for i in tqdm(range(0, len(sequences), batch_size)):
        batch_seqs = sequences[i : i + batch_size]

        #tokenize
        inputs = tokenizer(batch_seqs, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        #mean pooling
        last_hidden_state = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * mask, 1)
        sum_mask = torch.clamp(mask.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask

        embeddings.append(mean_embeddings.cpu().numpy())

    final_embeddings = np.vstack(embeddings)

    np.save(f"/workspace/data/Embeddings/{save_name}.npy", final_embeddings)
    np.save(f"/workspace/data/Embeddings/{save_name}_ids.npy", ids)

    return ids, final_embeddings

In [None]:
train_fasta = "/workspace/data/Train/train_sequences.fasta"

train_ids, X_train = extract_embeddings(
    train_fasta, 
    save_name="train_650M", 
    batch_size=8
)

print(f"Shape X_train: {X_train.shape}")

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

#prepare labels
train_terms = pd.read_csv("/workspace/data/Train/train_terms.tsv", sep="\t", usecols=["EntryID", "term"])

train_ids_set = set(train_ids)
train_terms_filtered = train_terms[train_terms["EntryID"].isin(train_ids_set)]

top_n = 1500
top_terms = train_terms_filtered["term"].value_counts().head(top_n).index.tolist()

Y_matrix = train_terms_filtered[train_terms_filtered["term"].isin(top_terms)] \
            .pivot_table(index="EntryID", columns="term", aggfunc="size", fill_value=0)
Y_train = Y_matrix.reindex(train_ids).fillna(0).astype(int)

In [None]:
X_tr, X_val, Y_tr, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

In [None]:
clf = RidgeClassifier(alpha=1.0)
clf.fit(X_tr, Y_tr)

In [None]:
Y_pred_val = clf.predict(X_val)
score = f1_score(Y_val, Y_pred_val, average='micro')
print(f"Local F1-Score: {score:.4f}")

In [None]:
# --- Cell ch·∫©n ƒëo√°n l·ªói ---
import numpy as np
from sklearn.metrics import precision_score, recall_score

# 1. Ki·ªÉm tra xem m√¥ h√¨nh c√≥ d·ª± ƒëo√°n ra c√°i g√¨ kh√¥ng?
print(f"T·ªïng s·ªë m·∫´u trong t·∫≠p Val: {Y_val.shape[0]}")
print(f"T·ªïng s·ªë nh√£n c·∫ßn d·ª± ƒëo√°n: {Y_val.shape[0] * Y_val.shape[1]}")
print(f"S·ªë l∆∞·ª£ng nh√£n 1 (Th·ª±c t·∫ø): {Y_val.sum()}")
print(f"S·ªë l∆∞·ª£ng nh√£n 1 (M√¥ h√¨nh d·ª± ƒëo√°n): {Y_pred_val.sum()}")

# 2. N·∫øu s·ªë d·ª± ƒëo√°n qu√° th·∫•p (g·∫ßn b·∫±ng 0), ta c·∫ßn h·∫° ng∆∞·ª°ng (Threshold)
print("\n--- Th·ª≠ ch·ªânh ng∆∞·ª°ng th·ªß c√¥ng ---")
# L·∫•y ƒëi·ªÉm s·ªë th√¥ thay v√¨ nh√£n c·ª©ng 0/1
decision_scores = clf.decision_function(X_val) 

# Th·ª≠ c√°c ng∆∞·ª°ng kh√°c nhau
for thr in [0, -0.5, -1.0]: # Ridge score c√≥ th·ªÉ √¢m
    y_pred_new = (decision_scores > thr).astype(int)
    new_f1 = f1_score(Y_val, y_pred_new, average='micro')
    print(f"Ng∆∞·ª°ng {thr}: F1-Score = {new_f1:.4f}")

In [11]:
import os
import gc
import torch
import numpy as np
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

FASTA_PATH = "/workspace/data/Test/testsuperset.fasta" 
SAVE_DIR = "/workspace/data/Embeddings/embeddings_chunks"
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"

CHUNK_SIZE = 5000  
BATCH_SIZE = 8    

# --- SETUP ---
os.makedirs(SAVE_DIR, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

def process_and_save(seqs, ids, part_idx):
    embeddings = []
    for i in range(0, len(seqs), BATCH_SIZE):
        batch_seqs = seqs[i : i + BATCH_SIZE]
        inputs = tokenizer(batch_seqs, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * mask, 1)
            sum_mask = torch.clamp(mask.sum(1), min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            
        embeddings.append(mean_embeddings.cpu().numpy())
    
    final_emb = np.vstack(embeddings)
    np.save(f"{SAVE_DIR}/test_part_{part_idx}.npy", final_emb)
    np.save(f"{SAVE_DIR}/test_ids_{part_idx}.npy", ids)

sequences = []
ids = []
part_counter = 0

pbar = tqdm(total=224309) 

for record in SeqIO.parse(FASTA_PATH, "fasta"):
    save_path_check = f"{SAVE_DIR}/test_part_{part_counter}.npy"
    
    if os.path.exists(save_path_check):
        sequences.append(1) 
        if len(sequences) >= CHUNK_SIZE:
            sequences = [] 
            ids = []
            part_counter += 1
            pbar.update(CHUNK_SIZE)
        continue

    pid = str(record.id).split("|")[1] if "|" in str(record.id) else str(record.id)
    ids.append(pid)
    sequences.append(str(record.seq)[:1022])
    pbar.update(1)
    
    if len(sequences) >= CHUNK_SIZE: 
        process_and_save(sequences, ids, part_counter)
        part_counter += 1
        
        sequences = []
        ids = []
        gc.collect()

if len(sequences) > 0:
    process_and_save(sequences, ids, part_counter)

print("Finished")

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [1:46:48<00:00, 47.25it/s]

Finished


In [None]:
import numpy as np
import pandas as pd
import glob
import gc
from tqdm import tqdm
from sklearn.linear_model import RidgeClassifier

TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy" 
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"
TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"
TEST_CHUNKS_DIR = "/workspace/data/Embeddings/embeddings_chunks"
OUTPUT_FILE = "submission_level4_FINAL_fixed.tsv"

# Re-training
print("Loading data and training model...")
X_train = np.load(TRAIN_EMB_PATH)
train_ids = np.load(TRAIN_IDS_PATH, allow_pickle=True)

train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])
top_n = 1500
top_terms = train_terms["term"].value_counts().head(top_n).index.tolist()
train_ids_set = set(train_ids)
train_terms_filtered = train_terms[train_terms["EntryID"].isin(train_ids_set) & train_terms["term"].isin(top_terms)]

Y_matrix = train_terms_filtered.pivot_table(index="EntryID", columns="term", aggfunc="size", fill_value=0)
Y_train = Y_matrix.reindex(train_ids).fillna(0).astype(int)
terms_columns = Y_train.columns 

clf = RidgeClassifier(alpha=1.0)
clf.fit(X_train, Y_train)

del X_train, Y_train, train_terms, train_terms_filtered, Y_matrix
gc.collect()

print("Outputting (Top K)...")

chunk_files = sorted(glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"), 
                     key=lambda x: int(x.split('_')[-1].replace('.npy','')))

output_lines = []
TOP_K = 50   
THRESHOLD = 0.01 #increased threshold

for f_path in tqdm(chunk_files):
    X_chunk = np.load(f_path)
    id_path = f_path.replace("test_part_", "test_ids_")
    ids_chunk = np.load(id_path, allow_pickle=True)
    
    #predicting
    decision_scores = clf.decision_function(X_chunk)
    probs = 1 / (1 + np.exp(-decision_scores))
    
    for i, pid in enumerate(ids_chunk):
        prob_row = probs[i]
        
        # 1. thresholding
        mask = prob_row > THRESHOLD
        if not np.any(mask):
            indices = np.argsort(prob_row)[-5:]
        else:
            candidates = np.where(mask)[0]
            
            #2: only select the top 50
            if len(candidates) > TOP_K:
                # get candidate scores
                cand_probs = prob_row[candidates]
                # sort for top k
                top_k_local_idx = np.argsort(cand_probs)[-TOP_K:]
                indices = candidates[top_k_local_idx]
            else:
                indices = candidates
            
        for idx in indices:
            term = terms_columns[idx]
            score = prob_row[idx]
            output_lines.append(f"{pid}\t{term}\t{score:.3f}")
            
    del X_chunk, ids_chunk, decision_scores, probs
    gc.collect()

# Output file
print(f"üíæ Saving {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("Finished")

#### Score: 0.192

## Improvement

### GO Hierarchy: Ridge classifier ƒëang h·ªçc c√°c nh√£n 1 c√°ch ƒë·ªôc l·∫≠p, nh√£n con c√≥ th·ªÉ c√≥ score cao, nh∆∞ng nh·ªØng nh√£n cha chung chung th√¨ score l·∫°i th·∫•p

### Sol: Ensemble: Mix v·ªõi naive approach 

In [1]:
import pandas as pd
from tqdm import tqdm

LEVEL4_FILE = "/workspace/notebooks/submission_level4_FINAL_fixed.tsv"
NAIVE_FILE = "/workspace/notebooks/submission_naive.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_ensemble_boosted.tsv"

print("Reading ESM2 output file....")
preds_l4 = {}

try:
    with open(LEVEL4_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            # Key : (ProteinID, GO_Term)
            key = (parts[0], parts[1])
            preds_l4[key] = float(parts[2])
except FileNotFoundError:
    print(f"File not found: {LEVEL4_FILE}.")
    raise

print("Reading file Naive...")
preds_naive = {}
try:
    with open(NAIVE_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            key = (parts[0], parts[1])
            preds_naive[key] = float(parts[2])
except FileNotFoundError:
    print(f"File not found {NAIVE_FILE}")
    raise

print("Ensembling...")

#Select all id-term pairs in 2 files
all_keys = set(preds_l4.keys()) | set(preds_naive.keys())
output_lines = []

W_L4 = 0.6
W_NAIVE = 0.4

for key in tqdm(all_keys):
    pid, term = key
    
    # L·∫•y ƒëi·ªÉm s·ªë, n·∫øu file n√†o kh√¥ng c√≥ th√¨ coi l√† 0
    score_l4 = preds_l4.get(key, 0.0)
    score_naive = preds_naive.get(key, 0.0)
    
    # C√¥ng th·ª©c c·ªông g·ªôp
    final_score = (score_l4 * W_L4) + (score_naive * W_NAIVE)
    
    # Ch·ªâ ghi nh·ªØng d√≤ng c√≥ ƒëi·ªÉm s·ªë > 0.001 ƒë·ªÉ file ƒë·ª° n·∫∑ng
    if final_score > 0.001:
        output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

# Ghi ra file
print(f"Saving: {OUTPUT_FILE}")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print(f"Finisehd.")

Reading ESM2 output file....


11215450it [00:07, 1548396.54it/s]


Reading file Naive...


10093905it [00:05, 1690414.90it/s]


Ensembling...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17589919/17589919 [00:20<00:00, 844235.35it/s]


Saving: /workspace/notebooks/submission_ensemble_boosted.tsv
Finisehd.
