### Drawback of traditional ML approach: Không giữ được order meaning của chuỗi axit amin

#### Sol: sử dụng mô hình esm2 650M tham số để embedding các chuỗi acid amin

## Load model

In [None]:
%pip install transformers

In [3]:
import torch
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Use the GPU whenever PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on device: {device}")

Running on device: cuda


In [7]:
# Hugging Face checkpoint id of the ESM-2 model used to embed the sequences.
model_name = "facebook/esm2_t33_650M_UR50D"

print(f"Loading model: {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
# Put the module in evaluation mode. NOTE: eval() on its own does not reduce
# GPU memory; gradient tracking is switched off separately via torch.no_grad()
# around the forward passes.
model.eval()

Loading model: facebook/esm2_t33_650M_UR50D...


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-32): 33 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
          (dense): Linear(in_features=1280, out_features=5120, bias=True)
        )
        (output): EsmOut

## Feature extraction

In [8]:
def extract_embeddings(fasta_path, save_name, batch_size=8, limit=None,
                       out_dir="/workspace/data/Embeddings"):
    """Embed the proteins of a FASTA file with ESM-2 and cache the result as .npy.

    Every sequence is cut to 1022 residues, tokenized with the module-level
    ``tokenizer``, run through the module-level ``model`` and reduced to a
    single vector by averaging the last hidden state over all positions whose
    attention mask is 1.

    Args:
        fasta_path: Path of the FASTA file to read.
        save_name: Stem of the output files. ``{save_name}.npy`` receives the
            embedding matrix and ``{save_name}_ids.npy`` the matching ids.
        batch_size: Number of sequences per forward pass.
        limit: If truthy, stop after this many records.
        out_dir: Directory the two ``.npy`` files are written to. It is created
            when missing. Defaults to the location used previously.

    Returns:
        ``(ids, final_embeddings)``: the list of protein ids and a 2-D NumPy
        array whose row ``i`` is the embedding of ``ids[i]``.

    Raises:
        ValueError: If no sequence could be read from ``fasta_path``.
    """
    import os

    # Create the output directory before the expensive work so the run cannot
    # fail at the final np.save because the directory does not exist.
    os.makedirs(out_dir, exist_ok=True)

    ids = []
    sequences = []

    print(f"Reading file: {fasta_path}")
    for i, record in enumerate(SeqIO.parse(fasta_path, "fasta")):
        if limit and i >= limit:
            break

        # Ids containing "|" (e.g. "sp|...|...") are reduced to their second field.
        pid = str(record.id)
        if "|" in pid:
            pid = pid.split("|")[1]

        ids.append(pid)
        # Keep at most 1022 residues so the tokenized input fits max_length=1024
        # (presumably leaving room for the tokenizer's special tokens).
        sequences.append(str(record.seq)[:1022])

    print(f"{len(sequences)} Proteins")
    if not sequences:
        # np.vstack([]) would fail later with a much less helpful message.
        raise ValueError(f"No sequences found in {fasta_path}")

    embeddings = []
    print("Creating embeddings...")

    for start in tqdm(range(0, len(sequences), batch_size)):
        batch_seqs = sequences[start : start + batch_size]

        inputs = tokenizer(batch_seqs, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Masked mean pooling: padded positions get weight 0, and the clamp
        # guards against a division by zero for an all-padding row.
        last_hidden_state = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * mask, 1)
        sum_mask = torch.clamp(mask.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask

        embeddings.append(mean_embeddings.cpu().numpy())

    final_embeddings = np.vstack(embeddings)

    np.save(os.path.join(out_dir, f"{save_name}.npy"), final_embeddings)
    np.save(os.path.join(out_dir, f"{save_name}_ids.npy"), ids)

    return ids, final_embeddings

In [None]:
# Embed the whole training set; the arrays are also cached on disk under the
# "train_650M" stem by extract_embeddings.
train_fasta = "/workspace/data/Train/train_sequences.fasta"

train_ids, X_train = extract_embeddings(train_fasta, save_name="train_650M", batch_size=8)

print(f"Shape X_train: {X_train.shape}")

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Build the label matrix: one row per embedded protein, one column per GO term.
train_terms = pd.read_csv(
    "/workspace/data/Train/train_terms.tsv",
    sep="\t",
    usecols=["EntryID", "term"],
)

# Only annotations of proteins that were actually embedded are useful.
train_ids_set = set(train_ids)
has_embedding = train_terms["EntryID"].isin(train_ids_set)
train_terms_filtered = train_terms[has_embedding]

# Restrict the targets to the top_n most frequent terms.
top_n = 1500
term_frequency = train_terms_filtered["term"].value_counts()
top_terms = term_frequency.head(top_n).index.tolist()

# Count (protein, term) pairs into a wide table, then align its rows with the
# embedding order; proteins without any kept term become all-zero rows.
# NOTE(review): aggfunc="size" counts rows, so this is 0/1 only if each
# (EntryID, term) pair appears once in the file — confirm.
is_top_term = train_terms_filtered["term"].isin(top_terms)
Y_matrix = train_terms_filtered[is_top_term].pivot_table(
    index="EntryID", columns="term", aggfunc="size", fill_value=0
)
Y_train = Y_matrix.reindex(train_ids).fillna(0).astype(int)

In [None]:
# Hold out 10% of the proteins for local validation; the fixed random_state
# keeps the split reproducible between runs.
X_tr, X_val, Y_tr, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

In [None]:
# Ridge classifier fitted on the training split; Y_tr has one indicator column
# per GO term, so all terms are learned in a single fit call.
clf = RidgeClassifier(alpha=1.0)
clf.fit(X_tr, Y_tr)

In [None]:
# Hard predictions on the held-out split, scored with micro-averaged F1
# (all protein/term decisions are pooled before computing the score).
Y_pred_val = clf.predict(X_val)
score = f1_score(Y_val, Y_pred_val, average='micro')
print(f"Local F1-Score: {score:.4f}")

In [None]:
# --- Diagnostic cell: inspect why the validation score is low ---
import numpy as np
from sklearn.metrics import precision_score, recall_score

# 1. Check whether the model predicts any positive label at all.
# Y_val is a DataFrame, so Y_val.sum() would give one total per column and
# print a whole Series; convert to NumPy to get a single scalar for both counts.
n_true_positive = int(np.asarray(Y_val).sum())
n_pred_positive = int(np.asarray(Y_pred_val).sum())

print(f"T·ªïng s·ªë m·∫´u trong t·∫≠p Val: {Y_val.shape[0]}")
print(f"T·ªïng s·ªë nh√£n c·∫ßn d·ª± ƒëo√°n: {Y_val.shape[0] * Y_val.shape[1]}")
print(f"S·ªë l∆∞·ª£ng nh√£n 1 (Th·ª±c t·∫ø): {n_true_positive}")
print(f"S·ªë l∆∞·ª£ng nh√£n 1 (M√¥ h√¨nh d·ª± ƒëo√°n): {n_pred_positive}")

# 2. If almost nothing is predicted positive, the decision threshold must be lowered.
print("\n--- Th·ª≠ ch·ªânh ng∆∞·ª°ng th·ªß c√¥ng ---")
# Use the raw decision scores instead of the hard 0/1 labels.
decision_scores = clf.decision_function(X_val)

# Try several thresholds (ridge scores can be negative).
for thr in [0, -0.5, -1.0]:
    y_pred_new = (decision_scores > thr).astype(int)
    new_f1 = f1_score(Y_val, y_pred_new, average='micro')
    print(f"Ng∆∞·ª°ng {thr}: F1-Score = {new_f1:.4f}")

In [11]:
import os
import gc
import torch
import numpy as np
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Input FASTA, directory for the per-chunk .npy files, and checkpoint id.
FASTA_PATH = "/workspace/data/Test/testsuperset.fasta"  # sequences to embed
SAVE_DIR = "/workspace/data/Embeddings/embeddings_chunks"
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"

CHUNK_SIZE = 5000  # number of sequences stored in one chunk file
BATCH_SIZE = 8  # number of sequences per forward pass

# --- SETUP ---
os.makedirs(SAVE_DIR, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

# The tokenizer and model are (re)loaded here so this cell does not depend on
# the loading cell further up having been run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

def process_and_save(seqs, ids, part_idx):
    """Embed one chunk of sequences and write it to SAVE_DIR.

    Sequences are processed BATCH_SIZE at a time; each one is reduced to the
    attention-mask-weighted mean of the model's last hidden state. Two files
    are written: ``test_part_{part_idx}.npy`` (embeddings) and
    ``test_ids_{part_idx}.npy`` (the ``ids`` passed in).
    """
    pooled_batches = []
    for start in range(0, len(seqs), BATCH_SIZE):
        chunk = seqs[start : start + BATCH_SIZE]
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
            # Broadcast the attention mask over the hidden dimension so padded
            # positions contribute nothing to the sum.
            weights = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
            token_sum = (hidden * weights).sum(1)
            token_count = weights.sum(1).clamp(min=1e-9)
            pooled = token_sum / token_count

        pooled_batches.append(pooled.cpu().numpy())

    np.save(f"{SAVE_DIR}/test_part_{part_idx}.npy", np.vstack(pooled_batches))
    np.save(f"{SAVE_DIR}/test_ids_{part_idx}.npy", ids)

# Stream the FASTA file and embed it chunk by chunk. Chunks whose output files
# are already on disk are skipped, so an interrupted run can be resumed.
sequences = []
ids = []
part_counter = 0
records_in_part = 0   # records seen so far in the current chunk
skip_part = False     # True while walking over a chunk that is already saved

# NOTE(review): the total is hard-coded; it only affects the progress display.
pbar = tqdm(total=224309)

for record in SeqIO.parse(FASTA_PATH, "fasta"):
    if records_in_part == 0:
        # Decide once per chunk. Both files must exist: the ids file is written
        # after the embeddings, so a crash in between leaves a half-saved chunk.
        skip_part = (
            os.path.exists(f"{SAVE_DIR}/test_part_{part_counter}.npy")
            and os.path.exists(f"{SAVE_DIR}/test_ids_{part_counter}.npy")
        )

    records_in_part += 1
    pbar.update(1)

    if not skip_part:
        pid = str(record.id).split("|")[1] if "|" in str(record.id) else str(record.id)
        ids.append(pid)
        sequences.append(str(record.seq)[:1022])

    if records_in_part >= CHUNK_SIZE:
        if not skip_part:
            process_and_save(sequences, ids, part_counter)
            sequences = []
            ids = []
            gc.collect()
        part_counter += 1
        records_in_part = 0

# Trailing partial chunk. Only real sequences are ever buffered, so an
# already-saved final chunk is neither re-embedded nor overwritten.
if sequences:
    process_and_save(sequences, ids, part_counter)

pbar.close()
print("Finished")

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [1:46:48<00:00, 47.25it/s]

Finished


In [None]:
import numpy as np
import pandas as pd
import glob
import gc
from tqdm import tqdm
from sklearn.linear_model import RidgeClassifier

# Cached training embeddings/ids, the label table, the directory holding the
# test embedding chunks, and the submission file written by this cell.
TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy"
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"
TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"
TEST_CHUNKS_DIR = "/workspace/data/Embeddings/embeddings_chunks"
OUTPUT_FILE = "submission_level4_FINAL_fixed.tsv"

# Refit on the full training set (no validation split this time).
print("Loading data and training model...")
X_train = np.load(TRAIN_EMB_PATH)
train_ids = np.load(TRAIN_IDS_PATH, allow_pickle=True)

train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])

# The most frequent terms are picked from the whole label table here.
top_n = 1500
top_terms = train_terms["term"].value_counts().head(top_n).index.tolist()

train_ids_set = set(train_ids)
row_is_kept = (
    train_terms["EntryID"].isin(train_ids_set)
    & train_terms["term"].isin(top_terms)
)
train_terms_filtered = train_terms[row_is_kept]

# Wide indicator table aligned to the embedding row order.
Y_matrix = train_terms_filtered.pivot_table(
    index="EntryID", columns="term", aggfunc="size", fill_value=0
)
Y_train = Y_matrix.reindex(train_ids).fillna(0).astype(int)
# Column order defines which GO term each classifier output refers to.
terms_columns = Y_train.columns

clf = RidgeClassifier(alpha=1.0)
clf.fit(X_train, Y_train)

# Release the large training arrays before scoring the test chunks.
del X_train, Y_train, train_terms, train_terms_filtered, Y_matrix, row_is_kept
gc.collect()

print("Outputting (Top K)...")

# Chunk files are ordered by the numeric suffix of their file name.
chunk_files = sorted(
    glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"),
    key=lambda path: int(path.split('_')[-1].replace('.npy', '')),
)

output_lines = []
TOP_K = 50
THRESHOLD = 0.01 #increased threshold


def _pick_term_indices(prob_row):
    """Choose which term columns to report for one protein.

    Terms scoring above THRESHOLD are candidates; if there are more than TOP_K
    only the TOP_K highest are kept, and if there are none the 5 highest-scoring
    terms are reported instead.
    """
    candidates = np.flatnonzero(prob_row > THRESHOLD)
    if candidates.size == 0:
        return np.argsort(prob_row)[-5:]
    if candidates.size > TOP_K:
        best_local = np.argsort(prob_row[candidates])[-TOP_K:]
        return candidates[best_local]
    return candidates


for f_path in tqdm(chunk_files):
    X_chunk = np.load(f_path)
    id_path = f_path.replace("test_part_", "test_ids_")
    ids_chunk = np.load(id_path, allow_pickle=True)

    # Squash the raw ridge scores into (0, 1) with a logistic function.
    # NOTE(review): sigmoid(s) > 0.01 holds for every s > about -4.6, so the
    # threshold may never exclude anything — confirm against real score ranges.
    decision_scores = clf.decision_function(X_chunk)
    probs = 1 / (1 + np.exp(-decision_scores))

    for row_no, pid in enumerate(ids_chunk):
        prob_row = probs[row_no]
        for idx in _pick_term_indices(prob_row):
            output_lines.append(f"{pid}\t{terms_columns[idx]}\t{prob_row[idx]:.3f}")

    del X_chunk, ids_chunk, decision_scores, probs
    gc.collect()

# Write all prediction rows in one go.
print(f"üíæ Saving {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as out_handle:
    out_handle.write("\n".join(output_lines))

print("Finished")

#### Score: 0.192

## Improvement

### GO Hierarchy: Ridge classifier đang học các nhãn 1 cách độc lập, nhãn con có thể có score cao, nhưng những nhãn cha chung chung thì score lại thấp

### Sol: Ensemble: Mix với naive approach

In [1]:
import pandas as pd
from tqdm import tqdm

LEVEL4_FILE = "/workspace/notebooks/submission_level4_FINAL_fixed.tsv"
NAIVE_FILE = "/workspace/notebooks/submission_naive.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_ensemble_boosted.tsv"


def _load_scores(path, missing_msg):
    """Read a tab-separated prediction file into {(protein_id, term): score}.

    Lines with fewer than three fields are ignored. If the file does not exist,
    ``missing_msg`` is printed and the FileNotFoundError is re-raised.
    """
    table = {}
    try:
        with open(path) as handle:
            for raw in tqdm(handle):
                fields = raw.strip().split('\t')
                if len(fields) < 3:
                    continue
                # Key : (ProteinID, GO_Term)
                table[(fields[0], fields[1])] = float(fields[2])
    except FileNotFoundError:
        print(missing_msg)
        raise
    return table


print("Reading ESM2 output file....")
preds_l4 = _load_scores(LEVEL4_FILE, f"File not found: {LEVEL4_FILE}.")

print("Reading file Naive...")
preds_naive = _load_scores(NAIVE_FILE, f"File not found {NAIVE_FILE}")

print("Ensembling...")

# Every (protein, term) pair that occurs in either file.
all_keys = set(preds_l4) | set(preds_naive)
output_lines = []

W_L4 = 0.6
W_NAIVE = 0.4

for pid, term in tqdm(all_keys):
    # A pair missing from one file contributes a score of 0 for that file.
    score_l4 = preds_l4.get((pid, term), 0.0)
    score_naive = preds_naive.get((pid, term), 0.0)

    # Weighted sum of the two sources.
    final_score = (score_l4 * W_L4) + (score_naive * W_NAIVE)

    # Rows at or below 0.001 are dropped to keep the file small.
    if final_score > 0.001:
        output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

# Write the merged predictions.
print(f"Saving: {OUTPUT_FILE}")
with open(OUTPUT_FILE, "w") as out_handle:
    out_handle.write("\n".join(output_lines))

print(f"Finisehd.")

Reading ESM2 output file....


11215450it [00:07, 1548396.54it/s]


Reading file Naive...


10093905it [00:05, 1690414.90it/s]


Ensembling...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17589919/17589919 [00:20<00:00, 844235.35it/s]


Saving: /workspace/notebooks/submission_ensemble_boosted.tsv
Finisehd.


### Ensemble ESM2 + BLAST/Diamond

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# ESM-2/ridge predictions to be merged with the homology-based ones.
LEVEL4_FILE = "/workspace/notebooks/submission_level4_FINAL_fixed.tsv"

# DIAMOND executable, its inputs, the database file and the tabular hit file.
DIAMOND_BIN = "/usr/bin/diamond"
TRAIN_FASTA = "/workspace/data/Train/train_sequences.fasta"
TEST_FASTA = "/workspace/data/Test/testsuperset.fasta"
# NOTE(review): there is no "/" between "Train" and "train_data.dmnd", so the
# database lands directly in /workspace/data — confirm this is intended.
DB_PATH = "/workspace/data/Traintrain_data.dmnd"
BLAST_RESULT = "/workspace/notebooks/diamond_results.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_hybrid_blast_esm2.tsv"

import subprocess

print("Runnign BLAST...")

# Build the DIAMOND database from the training sequences if it is missing.
# subprocess.run with an argument list avoids shell quoting problems, and
# check=True stops the cell when DIAMOND fails instead of silently continuing
# with a missing or stale result file.
if not os.path.exists(DB_PATH):
    print("ƒêang t·∫°o database...")
    subprocess.run(
        [DIAMOND_BIN, "makedb", "--in", TRAIN_FASTA, "-d", DB_PATH, "--quiet"],
        check=True,
    )

# Align every test sequence against the training database; the tabular output
# has three columns: query id, subject id, percent identity.
subprocess.run(
    [
        DIAMOND_BIN, "blastp",
        "-d", DB_PATH,
        "-q", TEST_FASTA,
        "-o", BLAST_RESULT,
        "--sensitive",
        "--top", "1",
        "-f", "6", "qseqid", "sseqid", "pident",
    ],
    check=True,
)
print("Finished blasting")

print("Handling output of BLAST...")
blast_preds = {}

# Training labels grouped per protein, used to transfer terms from each hit.
train_terms = pd.read_csv("/workspace/data/Train/train_terms.tsv", sep="\t", usecols=["EntryID", "term"])
train_terms_grouped = train_terms.groupby("EntryID")["term"].apply(list).to_dict()

# Read the DIAMOND hits.
df_blast = pd.read_csv(BLAST_RESULT, sep="\t", names=["test_id", "train_id", "pident"])


def _clean_id(raw_id):
    """Return the second '|'-separated field of an id, or the id itself."""
    text = str(raw_id)
    return text.split("|")[1] if "|" in text else text


# itertuples is much faster than iterrows and yields the same values.
for hit in tqdm(df_blast.itertuples(index=False), total=len(df_blast)):
    score = hit.pident / 100.0

    # Ignore hits with less than 30% identity.
    if score < 0.3:
        continue

    test_id = _clean_id(hit.test_id)
    train_id = _clean_id(hit.train_id)

    for term in train_terms_grouped.get(train_id, ()):
        key = (test_id, term)
        # A query can have several hits carrying the same term; keep the best
        # score rather than whichever hit happens to come last in the file.
        if score > blast_preds.get(key, 0.0):
            blast_preds[key] = score

print("Reading file Level 4...")
esm_preds = {}
try:
    with open(LEVEL4_FILE) as handle:
        for raw in tqdm(handle):
            fields = raw.strip().split('\t')
            # Skip malformed lines (fewer than id, term, score).
            if len(fields) >= 3:
                esm_preds[(fields[0], fields[1])] = float(fields[2])
except FileNotFoundError:
    print(f"File not found {LEVEL4_FILE}.")
    raise

print("Mixing...")

all_keys = set(blast_preds.keys()) | set(esm_preds.keys())
output_lines = []

for pid, term in tqdm(all_keys):
    # Strategy: a pair missing from one source scores 0 there, and the final
    # score is the larger of the two — a BLAST hit wins when it is stronger,
    # otherwise the ESM prediction is used.
    s_blast = blast_preds.get((pid, term), 0.0)
    s_esm = esm_preds.get((pid, term), 0.0)
    final_score = max(s_blast, s_esm)

    output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

# Write the merged predictions.
print(f"Saving {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as out_handle:
    out_handle.write("\n".join(output_lines))

print(f"Finished")

Runnign BLAST...


diamond v2.1.8.162 (C) Max Planck Society for the Advancement of Science, Benjamin Buchfink, University of Tuebingen
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 32
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Temporary directory: /workspace/notebooks
Percentage range of top alignment score to report hits: 1
Opening the database...  [0.06s]
Database: /workspace/data/Traintrain_data.dmnd (type: Diamond database, sequences: 82404, letters: 43327058)
Block size = 2000000000
Opening the input file...  [0.035s]
Opening the output file...  [0s]
Loading query sequences...  [0.215s]
Masking queries...  [0.141s]
Algorithm: Double-indexed
Building query histograms...  [0.513s]
Seeking in database...  [0s]
Loading reference sequences...  [0.051s]
Masking reference...  [0.061s]
Initializing temporary storage...  [0s]
Building reference histogra

Finished blasting
Handling output of BLAST...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 262530/262530 [00:03<00:00, 72338.79it/s]


Reading file Level 4...


11215450it [00:07, 1560616.37it/s]


Mixing...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11968960/11968960 [00:12<00:00, 950665.04it/s]


Saving /workspace/notebooks/submission_hybrid_blast_esm2.tsv...
Finished


### Propagation

In [3]:
%pip install obonet networkx

Collecting obonet
  Downloading obonet-1.1.1-py3-none-any.whl.metadata (6.7 kB)
Downloading obonet-1.1.1-py3-none-any.whl (9.2 kB)
Installing collected packages: obonet
Successfully installed obonet-1.1.1
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import networkx
import obonet
import pandas as pd
import numpy as np
from tqdm import tqdm

# Predictions to propagate, where the propagated copy is written, and the
# Gene Ontology definition file used to find each term's ancestors.
INPUT_FILE = "/workspace/notebooks/submission_level4_FINAL_fixed.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_level4_propagated.tsv"
OBO_PATH = "/workspace/data/Train/go-basic.obo"

In [5]:
print("ƒê·ªçc c√¢y ph·∫£ h·ªá...")
# Parse the ontology file into a networkx graph.
graph = obonet.read_obo(OBO_PATH)

ƒê·ªçc c√¢y ph·∫£ h·ªá...


In [6]:
print("X√¢y map quan h·ªá cha-con...")

# Only these relationship types carry an annotation from a term up to its
# parents. Other edge types in the ontology (such as "regulates") must not be
# followed, otherwise scores leak to unrelated terms.
PROPAGATING_RELATIONS = {"is_a", "part_of"}

# obonet stores the relationship type as the edge key of the multigraph.
hierarchy = graph.edge_subgraph(
    [
        (child, parent, relation)
        for child, parent, relation in graph.edges(keys=True)
        if relation in PROPAGATING_RELATIONS
    ]
).copy()

ancestors_map = {}
for node in tqdm(graph.nodes()):
    # Edges point child -> parent, so graph "descendants" are the GO ancestors.
    # Terms without any is_a/part_of edge are absent from the subgraph and
    # simply have no ancestors.
    if node in hierarchy:
        ancestors_map[node] = networkx.descendants(hierarchy, node)
    else:
        ancestors_map[node] = set()

X√¢y map quan h·ªá cha-con...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 132343.17it/s]


In [7]:
print(f"ƒêang ƒë·ªçc {INPUT_FILE}...")
# Nested mapping: {ProteinID: {Term: Score}}
preds = {}
with open(INPUT_FILE) as handle:
    for raw in tqdm(handle):
        fields = raw.strip().split('\t')
        if len(fields) < 3:
            continue
        pid, term, score = fields[0], fields[1], float(fields[2])
        # Create the per-protein dict on first sight, then record the score.
        preds.setdefault(pid, {})[term] = score

ƒêang ƒë·ªçc /workspace/notebooks/submission_level4_FINAL_fixed.tsv...


11215450it [00:04, 2284548.44it/s]


In [8]:
print("Lan truy·ªÅn ng∆∞·ª£c...")
final_lines = []

for pid, term_scores in tqdm(preds.items()):
    # Start from the original scores, then add or raise ancestor scores.
    new_scores = dict(term_scores)

    for term, score in term_scores.items():
        for parent in ancestors_map.get(term, ()):
            # Rule: an ancestor scores at least as high as any of its
            # predicted descendants.
            new_scores[parent] = max(new_scores.get(parent, 0.0), score)

    # Propagation enlarges the term set, so rank by score (highest first) and
    # keep the 70 best rows per protein.
    ranked = sorted(new_scores.items(), key=lambda item: item[1], reverse=True)
    final_lines.extend(
        f"{pid}\t{term}\t{score:.3f}" for term, score in ranked[:70]
    )

Lan truy·ªÅn ng∆∞·ª£c...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:22<00:00, 9829.63it/s] 


In [9]:
# Write the propagated predictions, one "protein<TAB>term<TAB>score" row per line.
print(f"Saving {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as out_handle:
    out_handle.write("\n".join(final_lines))

print("Finished")

Saving /workspace/notebooks/submission_level4_propagated.tsv...
Finished


### Sử dụng KNN clf trên embeddings

In [21]:
import torch
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
import glob

# Cached training embeddings/ids, the label table, the test embedding chunks
# and the file the KNN predictions are written to.
TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy"
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"
TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"
TEST_CHUNKS_DIR = "/workspace/data/Embeddings/embeddings_chunks"
OUTPUT_FILE = "submission_knn_esm2.tsv"
TOP_K = 5  # number of most similar training proteins used per test protein
device = "cuda" if torch.cuda.is_available() else "cpu"

In [22]:
print("Load Train Embeddings...")
# Load the training vectors straight onto the compute device.
X_train = torch.from_numpy(np.load(TRAIN_EMB_PATH)).to(device)

# Scale every row to unit length (v = v / |v|) so that a plain matrix product
# between two such matrices yields cosine similarities.
norm = X_train.norm(p=2, dim=1, keepdim=True)
X_train = X_train / norm

# Ids and labels of the training proteins.
print("Loading labels...")
train_ids = np.load(TRAIN_IDS_PATH, allow_pickle=True)
train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])

# TrainID -> set of its GO terms.
train_labels_map = train_terms.groupby("EntryID")["term"].apply(set).to_dict()

# Row index -> TrainID, for translating the neighbour indices returned by topk.
idx_to_trainid = dict(enumerate(train_ids))

print("Train xong tren GPU!")

Load Train Embeddings...
Loading labels...
Train xong tren GPU!


In [23]:
print("B·∫Øt ƒë·∫ßu ch·∫°y KNN (T√¨m h√†ng x√≥m)...")

# Chunk files are ordered by the numeric suffix of their file name.
chunk_files = sorted(
    glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"),
    key=lambda path: int(path.split('_')[-1].replace('.npy', '')),
)

output_lines = []

for f_path in tqdm(chunk_files):
    # 1. Load one chunk of test embeddings and normalise it like the train set.
    X_test = torch.from_numpy(np.load(f_path)).to(device)
    X_test = X_test / X_test.norm(p=2, dim=1, keepdim=True)

    # Ids belonging to the rows of this chunk.
    ids_test = np.load(f_path.replace("test_part_", "test_ids_"), allow_pickle=True)

    # 2. Cosine similarity of every test row against every training row:
    #    [Batch, dim] x [dim, All_Train] = [Batch, All_Train]
    sim_matrix = torch.mm(X_test, X_train.t())

    # 3. The TOP_K most similar training rows per test row
    #    (values = similarity, indices = training row numbers).
    topk_values, topk_indices = torch.topk(sim_matrix, k=TOP_K, dim=1)

    # Move to NumPy on the CPU for the dictionary-based label voting below.
    topk_indices = topk_indices.cpu().numpy()
    topk_values = topk_values.cpu().numpy()

    # 4. Similarity-weighted vote over the neighbours' GO terms.
    for row_no, test_pid in enumerate(ids_test):
        term_scores = {}  # {Term: summed similarity}

        for neighbor_idx, similarity in zip(topk_indices[row_no], topk_values[row_no]):
            neighbor_id = idx_to_trainid[neighbor_idx]
            # Neighbours without an entry in the label table contribute nothing.
            for term in train_labels_map.get(neighbor_id, ()):
                term_scores[term] = term_scores.get(term, 0.0) + similarity

        # Keep the 50 highest-scoring terms for this protein.
        ranked = sorted(term_scores.items(), key=lambda item: item[1], reverse=True)[:50]

        for term, total_score in ranked:
            # Heuristic normalisation: average similarity over the K neighbours.
            final_score = total_score / TOP_K
            # Very small scores are not written.
            if final_score > 0.01:
                output_lines.append(f"{test_pid}\t{term}\t{final_score:.3f}")

    # Free the per-chunk tensors before loading the next chunk.
    del X_test, sim_matrix, topk_values, topk_indices
    torch.cuda.empty_cache()

print("Finished")

B·∫Øt ƒë·∫ßu ch·∫°y KNN (T√¨m h√†ng x√≥m)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45/45 [00:05<00:00,  8.25it/s]

Finished





In [24]:
# Write the KNN predictions, one "protein<TAB>term<TAB>score" row per line.
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as out_handle:
    out_handle.write("\n".join(output_lines))

ƒêang l∆∞u submission_knn_esm2.tsv...


#### + Propagation

In [25]:
import networkx
import obonet
import pandas as pd
import numpy as np
from tqdm import tqdm

# KNN predictions to propagate, the output path, and the ontology file.
# NOTE(review): the KNN cell writes "submission_knn_esm2.tsv" relative to the
# working directory; this absolute path assumes that directory is
# /workspace/notebooks — confirm.
INPUT_FILE = "/workspace/notebooks/submission_knn_esm2.tsv" 
OUTPUT_FILE = "/workspace/notebooks/submission_knn_propagated.tsv"
OBO_PATH = "/workspace/data/Train/go-basic.obo"

In [26]:
# 1. Load the Gene Ontology graph.
print("ƒê·ªçc file go-basic.obo...")
graph = obonet.read_obo(OBO_PATH)

# 2. Build the term -> ancestors map.
print("ƒêang x√¢y d·ª±ng quan h·ªá t·ªï ti√™n...")

# Only 'is_a' and 'part_of' relationships carry an annotation up to the parent.
# The graph also holds other edge types (such as "regulates"), so the traversal
# is restricted to a subgraph made of the two propagating types.
PROPAGATING_RELATIONS = {"is_a", "part_of"}

# obonet stores the relationship type as the edge key of the multigraph.
hierarchy = graph.edge_subgraph(
    [
        (child, parent, relation)
        for child, parent, relation in graph.edges(keys=True)
        if relation in PROPAGATING_RELATIONS
    ]
).copy()

ancestors_map = {}
for node in tqdm(graph.nodes()):
    # Edges point child -> parent, so graph "descendants" are the GO ancestors.
    # Terms without any is_a/part_of edge are absent from the subgraph and
    # simply have no ancestors.
    if node in hierarchy:
        ancestors_map[node] = networkx.descendants(hierarchy, node)
    else:
        ancestors_map[node] = set()

# 3. Read the KNN predictions into {ProteinID: {Term: Score}}.
print(f"ƒêang ƒë·ªçc {INPUT_FILE}...")
preds = {}

with open(INPUT_FILE) as handle:
    for raw in tqdm(handle):
        fields = raw.strip().split('\t')
        if len(fields) < 3:
            continue
        pid, term, score = fields[0], fields[1], float(fields[2])
        preds.setdefault(pid, {})[term] = score

# 4. Propagate each score to the term's ancestors.
print("ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Fill Parents)...")
output_lines = []

for pid, term_scores in tqdm(preds.items()):
    # Work on a copy so the original scores stay untouched while iterating.
    new_scores = dict(term_scores)

    for term, score in term_scores.items():
        for parent in ancestors_map.get(term, ()):
            # Core rule: a parent's score is always >= its child's score.
            new_scores[parent] = max(new_scores.get(parent, 0.0), score)

    # Propagation enlarges the term set; keep only the 70 highest-scoring
    # terms per protein so the file stays manageable.
    ranked = sorted(new_scores.items(), key=lambda item: item[1], reverse=True)[:70]

    # Drop near-zero rows (score must exceed 0.01).
    output_lines.extend(
        f"{pid}\t{term}\t{score:.3f}" for term, score in ranked if score > 0.01
    )

# 5. Save the result.
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as out_handle:
    out_handle.write("\n".join(output_lines))

print("XONG!")

ƒê·ªçc file go-basic.obo...
ƒêang x√¢y d·ª±ng quan h·ªá t·ªï ti√™n...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 158229.79it/s]


ƒêang ƒë·ªçc /workspace/notebooks/submission_knn_esm2.tsv...


4549325it [00:01, 2344697.58it/s]


ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Fill Parents)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:12<00:00, 18368.44it/s]


ƒêang l∆∞u /workspace/notebooks/submission_knn_propagated.tsv...
XONG!
