### Drawback of traditional ML approach: Kh√¥ng gi·ªØ ƒë∆∞·ª£c order meaning c·ªßa chu·ªói axit amin

#### Sol: s·ª≠ d·ª•ng m√¥ h√¨nh esm2 650M tham s·ªë ƒë·ªÉ embedding c√°c chu·ªói acid amin

## Load model

In [None]:
%pip install transformers

In [3]:
import torch
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on device: {device}")

Running on device: cuda


In [7]:
model_name = "facebook/esm2_t33_650M_UR50D"

print(f"Loading model: {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval() #read-only to save VRAM

Loading model: facebook/esm2_t33_650M_UR50D...


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EsmModel(
  (embeddings): EsmEmbeddings(
    (word_embeddings): Embedding(33, 1280, padding_idx=1)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): EsmEncoder(
    (layer): ModuleList(
      (0-32): 33 x EsmLayer(
        (attention): EsmAttention(
          (self): EsmSelfAttention(
            (query): Linear(in_features=1280, out_features=1280, bias=True)
            (key): Linear(in_features=1280, out_features=1280, bias=True)
            (value): Linear(in_features=1280, out_features=1280, bias=True)
            (rotary_embeddings): RotaryEmbedding()
          )
          (output): EsmSelfOutput(
            (dense): Linear(in_features=1280, out_features=1280, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        )
        (intermediate): EsmIntermediate(
          (dense): Linear(in_features=1280, out_features=5120, bias=True)
        )
        (output): EsmOut

## Feature extraction

In [8]:
def extract_embeddings(fasta_path, save_name, batch_size=8, limit=None):
    """
    Read fasta file -> Run through ESM2 -> output: .npy
    save_name: output file
    """
    ids = []
    sequences = []

    print(f"Reading file: {fasta_path}")
    for i, record in enumerate(SeqIO.parse(fasta_path, "fasta")):
        if limit and i >= limit: break

        #clean id
        pid = str(record.id)
        if "|" in pid:
            pid = pid.split("|")[1]

        ids.append(pid)
        #esm2 limit 1024 token
        sequences.append(str(record.seq)[:1022])

    print(f"{len(sequences)} Proteins")

    #batching
    embeddings = []
    print("Creating embeddings...")

    for i in tqdm(range(0, len(sequences), batch_size)):
        batch_seqs = sequences[i : i + batch_size]

        #tokenize
        inputs = tokenizer(batch_seqs, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        #mean pooling
        last_hidden_state = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * mask, 1)
        sum_mask = torch.clamp(mask.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask

        embeddings.append(mean_embeddings.cpu().numpy())

    final_embeddings = np.vstack(embeddings)

    np.save(f"/workspace/data/Embeddings/{save_name}.npy", final_embeddings)
    np.save(f"/workspace/data/Embeddings/{save_name}_ids.npy", ids)

    return ids, final_embeddings

In [None]:
train_fasta = "/workspace/data/Train/train_sequences.fasta"

train_ids, X_train = extract_embeddings(
    train_fasta, 
    save_name="train_650M", 
    batch_size=8
)

print(f"Shape X_train: {X_train.shape}")

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

#prepare labels
train_terms = pd.read_csv("/workspace/data/Train/train_terms.tsv", sep="\t", usecols=["EntryID", "term"])

train_ids_set = set(train_ids)
train_terms_filtered = train_terms[train_terms["EntryID"].isin(train_ids_set)]

top_n = 1500
top_terms = train_terms_filtered["term"].value_counts().head(top_n).index.tolist()

Y_matrix = train_terms_filtered[train_terms_filtered["term"].isin(top_terms)] \
            .pivot_table(index="EntryID", columns="term", aggfunc="size", fill_value=0)
Y_train = Y_matrix.reindex(train_ids).fillna(0).astype(int)

In [None]:
X_tr, X_val, Y_tr, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

In [None]:
clf = RidgeClassifier(alpha=1.0)
clf.fit(X_tr, Y_tr)

In [None]:
Y_pred_val = clf.predict(X_val)
score = f1_score(Y_val, Y_pred_val, average='micro')
print(f"Local F1-Score: {score:.4f}")

In [None]:
# --- Cell ch·∫©n ƒëo√°n l·ªói ---
import numpy as np
from sklearn.metrics import precision_score, recall_score

# 1. Ki·ªÉm tra xem m√¥ h√¨nh c√≥ d·ª± ƒëo√°n ra c√°i g√¨ kh√¥ng?
print(f"T·ªïng s·ªë m·∫´u trong t·∫≠p Val: {Y_val.shape[0]}")
print(f"T·ªïng s·ªë nh√£n c·∫ßn d·ª± ƒëo√°n: {Y_val.shape[0] * Y_val.shape[1]}")
print(f"S·ªë l∆∞·ª£ng nh√£n 1 (Th·ª±c t·∫ø): {Y_val.sum()}")
print(f"S·ªë l∆∞·ª£ng nh√£n 1 (M√¥ h√¨nh d·ª± ƒëo√°n): {Y_pred_val.sum()}")

# 2. N·∫øu s·ªë d·ª± ƒëo√°n qu√° th·∫•p (g·∫ßn b·∫±ng 0), ta c·∫ßn h·∫° ng∆∞·ª°ng (Threshold)
print("\n--- Th·ª≠ ch·ªânh ng∆∞·ª°ng th·ªß c√¥ng ---")
# L·∫•y ƒëi·ªÉm s·ªë th√¥ thay v√¨ nh√£n c·ª©ng 0/1
decision_scores = clf.decision_function(X_val) 

# Th·ª≠ c√°c ng∆∞·ª°ng kh√°c nhau
for thr in [0, -0.5, -1.0]: # Ridge score c√≥ th·ªÉ √¢m
    y_pred_new = (decision_scores > thr).astype(int)
    new_f1 = f1_score(Y_val, y_pred_new, average='micro')
    print(f"Ng∆∞·ª°ng {thr}: F1-Score = {new_f1:.4f}")

In [11]:
import os
import gc
import torch
import numpy as np
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

FASTA_PATH = "/workspace/data/Test/testsuperset.fasta" 
SAVE_DIR = "/workspace/data/Embeddings/embeddings_chunks"
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"

CHUNK_SIZE = 5000  
BATCH_SIZE = 8    

# --- SETUP ---
os.makedirs(SAVE_DIR, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

def process_and_save(seqs, ids, part_idx):
    embeddings = []
    for i in range(0, len(seqs), BATCH_SIZE):
        batch_seqs = seqs[i : i + BATCH_SIZE]
        inputs = tokenizer(batch_seqs, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * mask, 1)
            sum_mask = torch.clamp(mask.sum(1), min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            
        embeddings.append(mean_embeddings.cpu().numpy())
    
    final_emb = np.vstack(embeddings)
    np.save(f"{SAVE_DIR}/test_part_{part_idx}.npy", final_emb)
    np.save(f"{SAVE_DIR}/test_ids_{part_idx}.npy", ids)

sequences = []
ids = []
part_counter = 0

pbar = tqdm(total=224309) 

for record in SeqIO.parse(FASTA_PATH, "fasta"):
    save_path_check = f"{SAVE_DIR}/test_part_{part_counter}.npy"
    
    if os.path.exists(save_path_check):
        sequences.append(1) 
        if len(sequences) >= CHUNK_SIZE:
            sequences = [] 
            ids = []
            part_counter += 1
            pbar.update(CHUNK_SIZE)
        continue

    pid = str(record.id).split("|")[1] if "|" in str(record.id) else str(record.id)
    ids.append(pid)
    sequences.append(str(record.seq)[:1022])
    pbar.update(1)
    
    if len(sequences) >= CHUNK_SIZE: 
        process_and_save(sequences, ids, part_counter)
        part_counter += 1
        
        sequences = []
        ids = []
        gc.collect()

if len(sequences) > 0:
    process_and_save(sequences, ids, part_counter)

print("Finished")

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [1:46:48<00:00, 47.25it/s]

Finished


In [None]:
import numpy as np
import pandas as pd
import glob
import gc
from tqdm import tqdm
from sklearn.linear_model import RidgeClassifier

TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy" 
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"
TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"
TEST_CHUNKS_DIR = "/workspace/data/Embeddings/embeddings_chunks"
OUTPUT_FILE = "submission_level4_FINAL_fixed.tsv"

# Re-training
print("Loading data and training model...")
X_train = np.load(TRAIN_EMB_PATH)
train_ids = np.load(TRAIN_IDS_PATH, allow_pickle=True)

train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])
top_n = 1500
top_terms = train_terms["term"].value_counts().head(top_n).index.tolist()
train_ids_set = set(train_ids)
train_terms_filtered = train_terms[train_terms["EntryID"].isin(train_ids_set) & train_terms["term"].isin(top_terms)]

Y_matrix = train_terms_filtered.pivot_table(index="EntryID", columns="term", aggfunc="size", fill_value=0)
Y_train = Y_matrix.reindex(train_ids).fillna(0).astype(int)
terms_columns = Y_train.columns 

clf = RidgeClassifier(alpha=1.0)
clf.fit(X_train, Y_train)

del X_train, Y_train, train_terms, train_terms_filtered, Y_matrix
gc.collect()

print("Outputting (Top K)...")

chunk_files = sorted(glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"), 
                     key=lambda x: int(x.split('_')[-1].replace('.npy','')))

output_lines = []
TOP_K = 50   
THRESHOLD = 0.01 #increased threshold

for f_path in tqdm(chunk_files):
    X_chunk = np.load(f_path)
    id_path = f_path.replace("test_part_", "test_ids_")
    ids_chunk = np.load(id_path, allow_pickle=True)
    
    #predicting
    decision_scores = clf.decision_function(X_chunk)
    probs = 1 / (1 + np.exp(-decision_scores))
    
    for i, pid in enumerate(ids_chunk):
        prob_row = probs[i]
        
        # 1. thresholding
        mask = prob_row > THRESHOLD
        if not np.any(mask):
            indices = np.argsort(prob_row)[-5:]
        else:
            candidates = np.where(mask)[0]
            
            #2: only select the top 50
            if len(candidates) > TOP_K:
                # get candidate scores
                cand_probs = prob_row[candidates]
                # sort for top k
                top_k_local_idx = np.argsort(cand_probs)[-TOP_K:]
                indices = candidates[top_k_local_idx]
            else:
                indices = candidates
            
        for idx in indices:
            term = terms_columns[idx]
            score = prob_row[idx]
            output_lines.append(f"{pid}\t{term}\t{score:.3f}")
            
    del X_chunk, ids_chunk, decision_scores, probs
    gc.collect()

# Output file
print(f"üíæ Saving {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("Finished")

#### Score: 0.192

## Improvement

### GO Hierarchy: Ridge classifier ƒëang h·ªçc c√°c nh√£n 1 c√°ch ƒë·ªôc l·∫≠p, nh√£n con c√≥ th·ªÉ c√≥ score cao, nh∆∞ng nh·ªØng nh√£n cha chung chung th√¨ score l·∫°i th·∫•p

### Sol: Ensemble: Mix v·ªõi naive approach 

In [1]:
import pandas as pd
from tqdm import tqdm

LEVEL4_FILE = "/workspace/notebooks/submission_level4_FINAL_fixed.tsv"
NAIVE_FILE = "/workspace/notebooks/submission_naive.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_ensemble_boosted.tsv"

print("Reading ESM2 output file....")
preds_l4 = {}

try:
    with open(LEVEL4_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            # Key : (ProteinID, GO_Term)
            key = (parts[0], parts[1])
            preds_l4[key] = float(parts[2])
except FileNotFoundError:
    print(f"File not found: {LEVEL4_FILE}.")
    raise

print("Reading file Naive...")
preds_naive = {}
try:
    with open(NAIVE_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            key = (parts[0], parts[1])
            preds_naive[key] = float(parts[2])
except FileNotFoundError:
    print(f"File not found {NAIVE_FILE}")
    raise

print("Ensembling...")

#Select all id-term pairs in 2 files
all_keys = set(preds_l4.keys()) | set(preds_naive.keys())
output_lines = []

W_L4 = 0.6
W_NAIVE = 0.4

for key in tqdm(all_keys):
    pid, term = key
    
    # L·∫•y ƒëi·ªÉm s·ªë, n·∫øu file n√†o kh√¥ng c√≥ th√¨ coi l√† 0
    score_l4 = preds_l4.get(key, 0.0)
    score_naive = preds_naive.get(key, 0.0)
    
    # C√¥ng th·ª©c c·ªông g·ªôp
    final_score = (score_l4 * W_L4) + (score_naive * W_NAIVE)
    
    # Ch·ªâ ghi nh·ªØng d√≤ng c√≥ ƒëi·ªÉm s·ªë > 0.001 ƒë·ªÉ file ƒë·ª° n·∫∑ng
    if final_score > 0.001:
        output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

# Ghi ra file
print(f"Saving: {OUTPUT_FILE}")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print(f"Finisehd.")

Reading ESM2 output file....


11215450it [00:07, 1548396.54it/s]


Reading file Naive...


10093905it [00:05, 1690414.90it/s]


Ensembling...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17589919/17589919 [00:20<00:00, 844235.35it/s]


Saving: /workspace/notebooks/submission_ensemble_boosted.tsv
Finisehd.


### Ensemble ESM2 + BLAST/Diamond

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

LEVEL4_FILE = "/workspace/notebooks/submission_level4_FINAL_fixed.tsv"

DIAMOND_BIN = "/usr/bin/diamond"
TRAIN_FASTA = "/workspace/data/Train/train_sequences.fasta"
TEST_FASTA = "/workspace/data/Test/testsuperset.fasta"
DB_PATH = "/workspace/data/Traintrain_data.dmnd"
BLAST_RESULT = "/workspace/notebooks/diamond_results.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_hybrid_blast_esm2.tsv"

print("Runnign BLAST...")

#create db
if not os.path.exists(DB_PATH):
    print("ƒêang t·∫°o database...")
    os.system(f"{DIAMOND_BIN} makedb --in {TRAIN_FASTA} -d {DB_PATH} --quiet")

#alignment
cmd = f"{DIAMOND_BIN} blastp -d {DB_PATH} -q {TEST_FASTA} -o {BLAST_RESULT} --sensitive --top 1 -f 6 qseqid sseqid pident"
os.system(cmd)
print("Finished blasting")

print("Handling output of BLAST...")
blast_preds = {}

# Load train terms ƒë·ªÉ map nh√£n
train_terms = pd.read_csv("/workspace/data/Train/train_terms.tsv", sep="\t", usecols=["EntryID", "term"])
train_terms_grouped = train_terms.groupby("EntryID")["term"].apply(list).to_dict()

# ƒê·ªçc k·∫øt qu·∫£ BLAST
df_blast = pd.read_csv(BLAST_RESULT, sep="\t", names=["test_id", "train_id", "pident"])

for _, row in tqdm(df_blast.iterrows(), total=len(df_blast)):
    # Clean ID (S·ª≠a l·ªói ID b·ªã d√≠nh sp|...)
    test_id = str(row['test_id']).split("|")[1] if "|" in str(row['test_id']) else str(row['test_id'])
    train_id = str(row['train_id']).split("|")[1] if "|" in str(row['train_id']) else str(row['train_id'])
    
    score = row['pident'] / 100.0
    
    # Ch·ªâ l·∫•y nh·ªØng th·∫±ng gi·ªëng nhau > 30% (Ng∆∞·ª°ng an to√†n)
    if score < 0.3: continue
    
    if train_id in train_terms_grouped:
        for term in train_terms_grouped[train_id]:
            key = (test_id, term)
            # BLAST r·∫•t uy t√≠n, n√™n gi·ªØ nguy√™n score cao
            blast_preds[key] = score

print("Reading file Level 4...")
esm_preds = {}
try:
    with open(LEVEL4_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            key = (parts[0], parts[1])
            esm_preds[key] = float(parts[2])
except FileNotFoundError:
    print(f"File not found {LEVEL4_FILE}.")
    raise

print("Mixing...")

all_keys = set(blast_preds.keys()) | set(esm_preds.keys())
output_lines = []

for key in tqdm(all_keys):
    pid, term = key
    
    s_blast = blast_preds.get(key, 0.0)
    s_esm = esm_preds.get(key, 0.0)
    
    # CHI·∫æN THU·∫¨T QUAN TR·ªåNG:
    # N·∫øu BLAST t√¨m th·∫•y -> Tin BLAST (v√¨ n√≥ so kh·ªõp ch√≠nh x√°c)
    # N·∫øu BLAST kh√¥ng th·∫•y -> Tin ESM (v√¨ n√≥ suy lu·∫≠n t·ªët)
    # => L·∫•y MAX
    final_score = max(s_blast, s_esm)
    
    output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

#GHI FILE
print(f"Saving {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print(f"Finished")

Runnign BLAST...


diamond v2.1.8.162 (C) Max Planck Society for the Advancement of Science, Benjamin Buchfink, University of Tuebingen
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 32
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Temporary directory: /workspace/notebooks
Percentage range of top alignment score to report hits: 1
Opening the database...  [0.06s]
Database: /workspace/data/Traintrain_data.dmnd (type: Diamond database, sequences: 82404, letters: 43327058)
Block size = 2000000000
Opening the input file...  [0.035s]
Opening the output file...  [0s]
Loading query sequences...  [0.215s]
Masking queries...  [0.141s]
Algorithm: Double-indexed
Building query histograms...  [0.513s]
Seeking in database...  [0s]
Loading reference sequences...  [0.051s]
Masking reference...  [0.061s]
Initializing temporary storage...  [0s]
Building reference histogra

Finished blasting
Handling output of BLAST...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 262530/262530 [00:03<00:00, 72338.79it/s]


Reading file Level 4...


11215450it [00:07, 1560616.37it/s]


Mixing...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11968960/11968960 [00:12<00:00, 950665.04it/s]


Saving /workspace/notebooks/submission_hybrid_blast_esm2.tsv...
Finished


### Propagation

In [3]:
%pip install obonet networkx

Collecting obonet
  Downloading obonet-1.1.1-py3-none-any.whl.metadata (6.7 kB)
Downloading obonet-1.1.1-py3-none-any.whl (9.2 kB)
Installing collected packages: obonet
Successfully installed obonet-1.1.1
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import networkx
import obonet
import pandas as pd
import numpy as np
from tqdm import tqdm

INPUT_FILE = "/workspace/notebooks/submission_level4_FINAL_fixed.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_level4_propagated.tsv"
OBO_PATH = "/workspace/data/Train/go-basic.obo"

In [5]:
print("ƒê·ªçc c√¢y ph·∫£ h·ªá...")
graph = obonet.read_obo(OBO_PATH)

ƒê·ªçc c√¢y ph·∫£ h·ªá...


In [6]:
print("X√¢y map quan h·ªá cha-con...")
ancestors_map = {}
for node in tqdm(graph.nodes()):
    # networkx tr·∫£ v·ªÅ danh s√°ch t·ªï ti√™n
    try:
        ancestors = networkx.descendants(graph, node) # Trong obonet, chi·ªÅu m≈©i t√™n ng∆∞·ª£c (Con -> Cha)
        ancestors_map[node] = ancestors
    except:
        pass

X√¢y map quan h·ªá cha-con...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 132343.17it/s]


In [7]:
print(f"ƒêang ƒë·ªçc {INPUT_FILE}...")
# ƒê·ªçc v√†o Dict: {ProteinID: {Term: Score}}
preds = {}
with open(INPUT_FILE) as f:
    for line in tqdm(f):
        parts = line.strip().split('\t')
        if len(parts) < 3: continue
        pid, term, score = parts[0], parts[1], float(parts[2])
        
        if pid not in preds: preds[pid] = {}
        preds[pid][term] = score

ƒêang ƒë·ªçc /workspace/notebooks/submission_level4_FINAL_fixed.tsv...


11215450it [00:04, 2284548.44it/s]


In [8]:
print("Lan truy·ªÅn ng∆∞·ª£c...")
final_lines = []

for pid, term_scores in tqdm(preds.items()):
    # term_scores l√† dict {Term: Score g·ªëc}
    # new_scores s·∫Ω ch·ª©a c·∫£ ƒëi·ªÉm c·ªßa cha √¥ng
    new_scores = term_scores.copy()
    
    for term, score in term_scores.items():
        # L·∫•y danh s√°ch cha √¥ng c·ªßa term n√†y
        if term in ancestors_map:
            parents = ancestors_map[term]
            for parent in parents:
                # Quy t·∫Øc: ƒêi·ªÉm c·ªßa cha = MAX(ƒêi·ªÉm cha c≈©, ƒêi·ªÉm c·ªßa con)
                current_parent_score = new_scores.get(parent, 0.0)
                new_scores[parent] = max(current_parent_score, score)
    
    # Ghi ra k·∫øt qu·∫£ (L·∫°i ph·∫£i l·ªçc Top K v√¨ gi·ªù n√≥ ph√¨nh to ra)
    # S·∫Øp x·∫øp gi·∫£m d·∫ßn theo ƒëi·ªÉm
    sorted_terms = sorted(new_scores.items(), key=lambda x: x[1], reverse=True)
    
    # L·∫•y Top 70 (tƒÉng l√™n ch√∫t v√¨ gi·ªù c√≥ c·∫£ cha √¥ng)
    for term, score in sorted_terms[:70]:
        final_lines.append(f"{pid}\t{term}\t{score:.3f}")

Lan truy·ªÅn ng∆∞·ª£c...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:22<00:00, 9829.63it/s] 


In [9]:
print(f"Saving {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(final_lines))

print("Finished")

Saving /workspace/notebooks/submission_level4_propagated.tsv...
Finished


### S·ª≠ d·ª•ng KNN clf tr√™n embeddings

In [21]:
import torch
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
import glob

TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy"
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"
TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"
TEST_CHUNKS_DIR = "/workspace/data/Embeddings/embeddings_chunks"
OUTPUT_FILE = "submission_knn_esm2.tsv"
TOP_K = 5  # L·∫•y 5 ng∆∞·ªùi h√†ng x√≥m gi·ªëng nh·∫•t
device = "cuda" if torch.cuda.is_available() else "cpu"

In [22]:
print("Load Train Embeddings...")
# Load vector
X_train = np.load(TRAIN_EMB_PATH)
X_train = torch.from_numpy(X_train).to(device)

# Chu·∫©n h√≥a vector v·ªÅ ƒë∆°n v·ªã (ƒë·ªÉ t√≠nh Cosine Similarity nhanh b·∫±ng ph√©p nh√¢n ma tr·∫≠n)
# C√¥ng th·ª©c: v = v / |v|
norm = X_train.norm(p=2, dim=1, keepdim=True)
X_train = X_train.div(norm)

# Load ID v√† Map Nh√£n
print("Loading labels...")
train_ids = np.load(TRAIN_IDS_PATH, allow_pickle=True)
train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])

# Gom nh√≥m: TrainID -> Set(Terms)
train_labels_map = train_terms.groupby("EntryID")["term"].apply(set).to_dict()

# Map Index -> ID (ƒë·ªÉ truy xu·∫•t nhanh t·ª´ k·∫øt qu·∫£ KNN)
idx_to_trainid = {i: pid for i, pid in enumerate(train_ids)}

print("Train xong tren GPU!")

Load Train Embeddings...
Loading labels...
Train xong tren GPU!


In [23]:
print("B·∫Øt ƒë·∫ßu ch·∫°y KNN (T√¨m h√†ng x√≥m)...")

chunk_files = sorted(glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"), 
                     key=lambda x: int(x.split('_')[-1].replace('.npy','')))

output_lines = []

for f_path in tqdm(chunk_files):
    # 1. Load 1 c·ª•c Test l√™n GPU
    X_test_np = np.load(f_path)
    X_test = torch.from_numpy(X_test_np).to(device)
    
    # Chu·∫©n h√≥a Test
    norm_test = X_test.norm(p=2, dim=1, keepdim=True)
    X_test = X_test.div(norm_test)
    
    # Load ID Test t∆∞∆°ng ·ª©ng
    id_path = f_path.replace("test_part_", "test_ids_")
    ids_test = np.load(id_path, allow_pickle=True)
    
    # 2. T√≠nh ƒë·ªô t∆∞∆°ng ƒë·ªìng (Matrix Multiplication)
    # [Batch, 1280] x [1280, All_Train] = [Batch, All_Train]
    # ƒê√¢y l√† b∆∞·ªõc n·∫∑ng nh·∫•t, nh∆∞ng GPU x·ª≠ l√Ω t·ªët
    sim_matrix = torch.mm(X_test, X_train.t())
    
    # 3. L·∫•y Top K h√†ng x√≥m
    # values: ƒë·ªô t∆∞∆°ng ƒë·ªìng (score), indices: v·ªã tr√≠ c·ªßa h√†ng x√≥m
    topk_values, topk_indices = torch.topk(sim_matrix, k=TOP_K, dim=1)
    
    # Chuy·ªÉn v·ªÅ CPU ƒë·ªÉ x·ª≠ l√Ω logic g√°n nh√£n (Python x·ª≠ l√Ω dict nhanh h∆°n)
    topk_indices = topk_indices.cpu().numpy()
    topk_values = topk_values.cpu().numpy()
    
    # 4. T·ªïng h·ª£p nh√£n t·ª´ h√†ng x√≥m (Weighted Voting)
    for i, test_pid in enumerate(ids_test):
        # Dict l∆∞u ƒëi·ªÉm s·ªë cho t·ª´ng nh√£n: {Term: Score}
        term_scores = {}
        
        for k in range(TOP_K):
            neighbor_idx = topk_indices[i, k]
            score = topk_values[i, k] # ƒê·ªô gi·ªëng nhau (v√≠ d·ª• 0.95)
            
            neighbor_id = idx_to_trainid[neighbor_idx]
            
            # N·∫øu h√†ng x√≥m n√†y c√≥ nh√£n (c√≥ trong file train_terms)
            if neighbor_id in train_labels_map:
                neighbor_terms = train_labels_map[neighbor_id]
                for term in neighbor_terms:
                    # C·ªông d·ªìn ƒëi·ªÉm (Weighted Sum)
                    if term not in term_scores:
                        term_scores[term] = 0.0
                    term_scores[term] += score
        
        # Chu·∫©n h√≥a ƒëi·ªÉm s·ªë (Chia cho t·ªïng tr·ªçng s·ªë ho·∫∑c K)
        # ·ªû ƒë√¢y ta chia cho K ƒë·ªÉ score n·∫±m trong kho·∫£ng 0-1
        # Ho·∫∑c ƒë∆°n gi·∫£n l√† gi·ªØ nguy√™n v√¨ CAFA ch·∫•m rank
        
        # L·∫•y Top 50 nh√£n ƒëi·ªÉm cao nh·∫•t ƒë·ªÉ ghi file
        sorted_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)[:50]
        
        for term, total_score in sorted_terms:
            # Normalize heuristic: Score trung b√¨nh
            final_score = total_score / TOP_K 
            # Ch·ªâ ghi n·∫øu score ƒë·ªß l·ªõn
            if final_score > 0.01: 
                output_lines.append(f"{test_pid}\t{term}\t{final_score:.3f}")

    # D·ªçn d·∫πp b·ªô nh·ªõ GPU
    del X_test, sim_matrix, topk_values, topk_indices
    torch.cuda.empty_cache()

print("Finished")

B·∫Øt ƒë·∫ßu ch·∫°y KNN (T√¨m h√†ng x√≥m)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45/45 [00:05<00:00,  8.25it/s]

Finished





In [24]:
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

ƒêang l∆∞u submission_knn_esm2.tsv...


#### + Propagation

In [25]:
import networkx
import obonet
import pandas as pd
import numpy as np
from tqdm import tqdm

INPUT_FILE = "/workspace/notebooks/submission_knn_esm2.tsv" 
OUTPUT_FILE = "/workspace/notebooks/submission_knn_propagated.tsv"
OBO_PATH = "/workspace/data/Train/go-basic.obo"

In [26]:
# 1. Load c√¢y Gene Ontology
print("ƒê·ªçc file go-basic.obo...")
graph = obonet.read_obo(OBO_PATH)

# 2. X√¢y d·ª±ng b·∫£n ƒë·ªì Cha-Con
print("ƒêang x√¢y d·ª±ng quan h·ªá t·ªï ti√™n...")
ancestors_map = {}
# Ch·ªâ quan t√¢m ƒë·∫øn c√°c node c√≥ quan h·ªá 'is_a' v√† 'part_of'
for node in tqdm(graph.nodes()):
    try:
        # L·∫•y t·∫•t c·∫£ t·ªï ti√™n c·ªßa node hi·ªán t·∫°i
        ancestors = networkx.descendants(graph, node)
        ancestors_map[node] = ancestors
    except:
        pass

# 3. ƒê·ªçc file KNN
print(f"ƒêang ƒë·ªçc {INPUT_FILE}...")
preds = {} # {ProteinID: {Term: Score}}

with open(INPUT_FILE) as f:
    for line in tqdm(f):
        parts = line.strip().split('\t')
        if len(parts) < 3: continue
        pid, term, score = parts[0], parts[1], float(parts[2])
        
        if pid not in preds: preds[pid] = {}
        preds[pid][term] = score

# 4. Lan truy·ªÅn ƒëi·ªÉm s·ªë (Propagation)
print("ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Fill Parents)...")
output_lines = []

for pid, term_scores in tqdm(preds.items()):
    # Copy dict ƒëi·ªÉm c≈©
    new_scores = term_scores.copy()
    
    # Duy·ªát qua t·ª´ng term ƒëang c√≥
    for term, score in term_scores.items():
        # N·∫øu term n√†y c√≥ t·ªï ti√™n
        if term in ancestors_map:
            parents = ancestors_map[term]
            for parent in parents:
                # Quy t·∫Øc c·ªët l√µi: ƒêi·ªÉm c·ªßa Cha lu√¥n >= ƒêi·ªÉm c·ªßa Con
                current_p_score = new_scores.get(parent, 0.0)
                new_scores[parent] = max(current_p_score, score)
    
    # L·ªçc v√† Ghi file
    # Sau khi lan truy·ªÅn, s·ªë l∆∞·ª£ng nh√£n s·∫Ω ph√¨nh to ra (v√¨ th√™m cha √¥ng)
    # Ta ch·ªâ l·∫•y Top 70 nh√£n ƒëi·ªÉm cao nh·∫•t ƒë·ªÉ file kh√¥ng qu√° n·∫∑ng
    sorted_terms = sorted(new_scores.items(), key=lambda x: x[1], reverse=True)[:70]
    
    for term, score in sorted_terms:
        # L·ªçc b·ªõt r√°c: Ch·ªâ l·∫•y > 0.01
        if score > 0.01:
            output_lines.append(f"{pid}\t{term}\t{score:.3f}")

# 5. L∆∞u file
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("XONG!")

ƒê·ªçc file go-basic.obo...
ƒêang x√¢y d·ª±ng quan h·ªá t·ªï ti√™n...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 158229.79it/s]


ƒêang ƒë·ªçc /workspace/notebooks/submission_knn_esm2.tsv...


4549325it [00:01, 2344697.58it/s]


ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Fill Parents)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:12<00:00, 18368.44it/s]


ƒêang l∆∞u /workspace/notebooks/submission_knn_propagated.tsv...
XONG!


#### Score: 0.211

### Taking Taxonomy into consideration
M·ªôt protein c·ªßa Ng∆∞·ªùi (Human) v√† c·ªßa Vi khu·∫©n (E. coli) c√≥ th·ªÉ nh√¨n h∆°i gi·ªëng nhau (tr√¨nh t·ª± t∆∞∆°ng ƒë·ªìng), nh∆∞ng ch·ª©c nƒÉng th√¨ kh√°c m·ªôt tr·ªùi m·ªôt v·ª±c.

N·∫øu KNN t√¨m th·∫•y "h√†ng x√≥m" l√† vi khu·∫©n ƒë·ªÉ g√°n nh√£n cho ng∆∞·ªùi, ta s·∫Ω sai.

### Taxonomy-aware KNN

#### Strategy: ∆∞u ti√™n c√πng lo√†i: Trong c√°c neighbours c·ªßa ƒëi·ªÉm c·∫ßn label, 
N·∫øu Test Protein l√† Ng∆∞·ªùi (9606) v√† h√†ng x√≥m t√¨m ƒë∆∞·ª£c c≈©ng l√† Ng∆∞·ªùi (9606) $\rightarrow$ Nh√¢n ƒëi·ªÉm s·ªë l√™n 1.5 l·∫ßn (∆Øu ti√™n c·ª±c m·∫°nh).N·∫øu kh√°c lo√†i $\rightarrow$ Gi·ªØ nguy√™n ho·∫∑c gi·∫£m nh·∫π.

In [1]:
import torch
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
import glob
import os

TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy"
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"

TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"    
TRAIN_TAX_PATH   = "/workspace/data/Train/train_taxonomy.tsv"  
TEST_TAX_PATH    = "/workspace/data/Test/testsuperset-taxon-list.tsv" 

TEST_CHUNKS_DIR  = "/workspace/data/Embeddings/embeddings_chunks"
OUTPUT_FILE      = "/workspace/notebooks/submission_knn_tax_v2.tsv"

TOP_K = 10
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
print("ƒêang load Taxonomy Map...")

# 1. Load Train Taxonomy
# File n√†y format: EntryID \t TaxID
try:
    df_train_tax = pd.read_csv(TRAIN_TAX_PATH, sep="\t")
    # C·∫ßn ƒë·∫£m b·∫£o t√™n c·ªôt ƒë√∫ng. Th∆∞·ªùng l√† 'EntryID' v√† 'TaxonomyID' (ho·∫∑c 'taxon_id')
    # Ta convert sang dict cho nhanh: {PID: TaxID}
    cols = df_train_tax.columns
    train_tax_map = dict(zip(df_train_tax[cols[0]], df_train_tax[cols[1]]))
    print(f"ƒê√£ load {len(train_tax_map)} train TaxIDs.")
except FileNotFoundError:
    # Fallback n·∫øu ƒë∆∞·ªùng d·∫´n sai
    print(f"Kh√¥ng t√¨m th·∫•y {TRAIN_TAX_PATH}, th·ª≠ t√¨m ·ªü th∆∞ m·ª•c g·ªëc...")
    TRAIN_TAX_PATH = "/workspace/data/Train/train_taxonomy.tsv"
    df_train_tax = pd.read_csv(TRAIN_TAX_PATH, sep="\t")
    cols = df_train_tax.columns
    train_tax_map = dict(zip(df_train_tax[cols[0]], df_train_tax[cols[1]]))

# 2. Load Test Taxonomy
# File n√†y th∆∞·ªùng kh√¥ng c√≥ header ho·∫∑c header l·∫°, ta c·ª© ƒë·ªçc th·ª≠
print(f"ƒêang ƒë·ªçc {TEST_TAX_PATH}...")
try:
    # Th·ª≠ ƒë·ªçc header=None tr∆∞·ªõc xem sao
    df_test_tax = pd.read_csv(TEST_TAX_PATH, sep="\t", header=None, names=["EntryID", "TaxID"])
    # N·∫øu d√≤ng ƒë·∫ßu ti√™n l√† ch·ªØ (header), ta drop n√≥ ƒëi
    if not str(df_test_tax.iloc[0, 1]).isdigit():
        df_test_tax = pd.read_csv(TEST_TAX_PATH, sep="\t")
        # ƒê·ªïi t√™n c·ªôt cho chu·∫©n
        df_test_tax.columns = ["EntryID", "TaxID"]
    
    test_tax_map = dict(zip(df_test_tax["EntryID"], df_test_tax["TaxID"]))
    print(f"ƒê√£ load {len(test_tax_map)} test TaxIDs.")
except Exception as e:
    print(f"L·ªói ƒë·ªçc file Test Tax: {e}")
    raise

ƒêang load Taxonomy Map...
ƒê√£ load 82403 train TaxIDs.
ƒêang ƒë·ªçc /workspace/data/Test/testsuperset-taxon-list.tsv...
ƒê√£ load 8453 test TaxIDs.


In [3]:
print("ƒêang load Embeddings & Labels...")
# Load Train Embeddings
X_train = np.load(TRAIN_EMB_PATH)
X_train = torch.from_numpy(X_train).to(device)
norm = X_train.norm(p=2, dim=1, keepdim=True)
X_train = X_train.div(norm)

# Load Train IDs & Terms
train_ids = np.load(TRAIN_IDS_PATH, allow_pickle=True)
train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])
train_labels_map = train_terms.groupby("EntryID")["term"].apply(set).to_dict()
idx_to_trainid = {i: pid for i, pid in enumerate(train_ids)}

ƒêang load Embeddings & Labels...


In [4]:
print("B·∫Øt ƒë·∫ßu ch·∫°y Taxonomy-Aware KNN...")
chunk_files = sorted(glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"), 
                     key=lambda x: int(x.split('_')[-1].replace('.npy','')))

output_lines = []

for f_path in tqdm(chunk_files):
    # Load 1 c·ª•c Test
    X_test_np = np.load(f_path)
    X_test = torch.from_numpy(X_test_np).to(device)
    norm_test = X_test.norm(p=2, dim=1, keepdim=True)
    X_test = X_test.div(norm_test)
    
    # Load ID
    id_path = f_path.replace("test_part_", "test_ids_")
    ids_test = np.load(id_path, allow_pickle=True)
    
    # T√≠nh Cosine Sim
    sim_matrix = torch.mm(X_test, X_train.t())
    topk_values, topk_indices = torch.topk(sim_matrix, k=TOP_K, dim=1)
    
    topk_indices = topk_indices.cpu().numpy()
    topk_values = topk_values.cpu().numpy()
    
    # X·ª≠ l√Ω Logic (Ph·∫ßn quan tr·ªçng nh·∫•t)
    for i, test_pid in enumerate(ids_test):
        # L·∫•y TaxID c·ªßa con test n√†y (M·∫∑c ƒë·ªãnh -1 n·∫øu kh√¥ng t√¨m th·∫•y)
        test_taxon = test_tax_map.get(test_pid, -1)
        
        term_scores = {}
        
        for k in range(TOP_K):
            neighbor_idx = topk_indices[i, k]
            raw_score = topk_values[i, k]
            
            neighbor_pid = idx_to_trainid[neighbor_idx]
            neighbor_taxon = train_tax_map.get(neighbor_pid, -2)
            
            # --- LOGIC TH∆Ø·ªûNG/PH·∫†T ---
            weight = 1.0
            
            # N·∫øu c·∫£ 2 ƒë·ªÅu c√≥ th√¥ng tin lo√†i
            if test_taxon != -1 and neighbor_taxon != -2:
                if test_taxon == neighbor_taxon:
                    # C√ôNG LO√ÄI: TƒÉng ƒë·ªô tin c·∫≠y l√™n 1.3 l·∫ßn
                    weight = 1.3 
                else:
                    # KH√ÅC LO√ÄI: Gi·∫£m nh·∫π (v√¨ v·∫´n c√≥ th·ªÉ b·∫£o t·ªìn ch·ª©c nƒÉng)
                    weight = 0.9
            
            final_score = raw_score * weight
            
            # C·ªông d·ªìn ƒëi·ªÉm cho c√°c ch·ª©c nƒÉng
            if neighbor_pid in train_labels_map:
                for term in train_labels_map[neighbor_pid]:
                    if term not in term_scores: term_scores[term] = 0.0
                    term_scores[term] += final_score
        
        # L·∫•y k·∫øt qu·∫£ (Top 60)
        sorted_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)[:60]
        
        for term, total_score in sorted_terms:
            # Normalize ƒëi·ªÉm s·ªë (chia cho s·ªë h√†ng x√≥m ho·∫∑c tr·ªçng s·ªë)
            # ·ªû ƒë√¢y chia cho TOP_K ƒë·ªÉ ƒë∆∞a v·ªÅ range 0-1 (t∆∞∆°ng ƒë·ªëi)
            norm_score = total_score / TOP_K
            
            # N·∫øu c√≥ boost th√¨ norm_score c√≥ th·ªÉ > 1.0 -> Clip v·ªÅ 1.0
            norm_score = min(norm_score, 1.0)
            
            if norm_score > 0.01:
                output_lines.append(f"{test_pid}\t{term}\t{norm_score:.3f}")

    del X_test, sim_matrix
    torch.cuda.empty_cache()

B·∫Øt ƒë·∫ßu ch·∫°y Taxonomy-Aware KNN...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45/45 [00:11<00:00,  3.99it/s]


In [5]:
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("XONG!")

ƒêang l∆∞u /workspace/notebooks/submission_knn_tax_v2.tsv...
XONG!


#### + Propagation

In [6]:
%pip install -q obonet networkx

import networkx
import obonet
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

INPUT_FILE = "/workspace/notebooks/submission_knn_tax_v2.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_knn_tax_propagated.tsv"

OBO_PATH = "/workspace/data/Train/go-basic.obo"

# --- B∆Ø·ªöC 1: X√ÇY D·ª∞NG B·∫¢N ƒê·ªí T·ªî TI√äN ---
print("ƒêang ƒë·ªçc c√¢y Gene Ontology...")
graph = obonet.read_obo(OBO_PATH)

print("ƒêang map quan h·ªá Con -> Cha √îng...")
ancestors_map = {}
# Ch·ªâ quan t√¢m c√°c node c√≥ trong graph
for node in tqdm(graph.nodes()):
    try:
        # networkx.descendants trong obonet tr·∫£ v·ªÅ c√°c node "cha/√¥ng" (superclasses)
        ancestors = networkx.descendants(graph, node)
        ancestors_map[node] = ancestors
    except:
        pass

# --- B∆Ø·ªöC 2: ƒê·ªåC FILE D·ª∞ ƒêO√ÅN ---
print(f"ƒêang ƒë·ªçc {INPUT_FILE}...")
preds = {} # {PID: {Term: Score}}

try:
    with open(INPUT_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            pid, term, score = parts[0], parts[1], float(parts[2])
            
            if pid not in preds: preds[pid] = {}
            preds[pid][term] = score
except FileNotFoundError:
    print(f"L·ªñI: Kh√¥ng t√¨m th·∫•y file {INPUT_FILE}. B·∫°n ƒë√£ ch·∫°y xong b∆∞·ªõc KNN Taxonomy ch∆∞a?")
    raise

# --- B∆Ø·ªöC 3: LAN TRUY·ªÄN (PROPAGATION) ---
print("ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Fill Parents)...")
output_lines = []

for pid, term_scores in tqdm(preds.items()):
    # Copy dict ƒëi·ªÉm c≈© sang dict m·ªõi ƒë·ªÉ c·∫≠p nh·∫≠t
    new_scores = term_scores.copy()
    
    # Duy·ªát qua t·ª´ng term g·ªëc
    for term, score in term_scores.items():
        # N·∫øu term n√†y c√≥ t·ªï ti√™n trong c√¢y
        if term in ancestors_map:
            parents = ancestors_map[term]
            for parent in parents:
                # QUY T·∫ÆC V√ÄNG: ƒêi·ªÉm Cha = Max(ƒêi·ªÉm Cha c≈©, ƒêi·ªÉm Con)
                current_p_score = new_scores.get(parent, 0.0)
                if score > current_p_score:
                    new_scores[parent] = score
    
    # --- B∆Ø·ªöC 4: L·ªåC & GHI ---
    # Sau khi lan truy·ªÅn, s·ªë l∆∞·ª£ng nh√£n s·∫Ω tƒÉng l√™n r·∫•t nhi·ªÅu.
    # Ta c·∫ßn l·ªçc l·∫•y Top K ƒëi·ªÉm cao nh·∫•t ƒë·ªÉ file kh√¥ng b·ªã qu√° n·∫∑ng (g√¢y l·ªói submit).
    
    # S·∫Øp x·∫øp gi·∫£m d·∫ßn theo ƒëi·ªÉm
    sorted_terms = sorted(new_scores.items(), key=lambda x: x[1], reverse=True)
    
    # L·∫•y Top 75 (ƒê·ªß ƒë·ªÉ bao ph·ªß c·∫£ cha l·∫´n con)
    # L·∫•y nhi·ªÅu h∆°n 75 file s·∫Ω > 150MB, d·ªÖ b·ªã l·ªói timeout khi n·ªôp
    for term, score in sorted_terms[:75]:
        # Ch·ªâ l·∫•y ƒëi·ªÉm > 0.01 (b·ªè r√°c)
        if score > 0.01:
            output_lines.append(f"{pid}\t{term}\t{score:.3f}")

# L∆∞u file
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("-" * 30)
print(f"XONG!")

[0mNote: you may need to restart the kernel to use updated packages.
ƒêang ƒë·ªçc c√¢y Gene Ontology...
ƒêang map quan h·ªá Con -> Cha √îng...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 153223.66it/s]


ƒêang ƒë·ªçc /workspace/notebooks/submission_knn_tax_v2.tsv...


7872336it [00:03, 2271903.00it/s]


ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Fill Parents)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:13<00:00, 16100.11it/s]


ƒêang l∆∞u /workspace/notebooks/submission_knn_tax_propagated.tsv...
------------------------------
XONG!


#### Score: 0.234

#### Drawbacks of KNN: "Hai protein g·∫ßn nhau th√¨ ch·ª©c nƒÉng g·∫ßn nhau", th·ª±c t·∫ø c√≥ nh·ª©ng sequences ch·ªâ c√≥ 1-2 acid amin gi·ªëng nhau nh∆∞ng l·∫°i c√≥ ch·ª©c nƒÉng t∆∞∆°ng t·ª± and vice versa. Plus, KNN coi 1280 chi·ªÅu l√† nh∆∞ nhau, major vote c≈©ng ch∆∞a c√≥ tr·ªçng s·ªë.

### MLP

#### X√¢y d·ª±ng 1 m·∫°ng neuron ƒë·∫∑t l√™n c√°c vector embeddings. ƒê·∫ßu ra: 1500 nh√£n (sigmoid)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import gc

TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy"
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"
TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"
MODEL_SAVE_PATH = "/workspace/models/mlp_model.pth"
BATCH_SIZE = 128 # TƒÉng batch size l√™n v√¨ embedding nh·∫π
EPOCHS = 15      # Train k·ªπ m·ªôt ch√∫t
LEARNING_RATE = 1e-3
TOP_LABELS = 1500 
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
print("ƒêang load d·ªØ li·ªáu l√™n RAM...")

# 1. Load Embeddings (X)
X_all = np.load(TRAIN_EMB_PATH)
ids_all = np.load(TRAIN_IDS_PATH, allow_pickle=True)

# 2. Load Labels (Y)
print("ƒêang x·ª≠ l√Ω nh√£n...")
train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])

# L·ªçc IDs
ids_set = set(ids_all)
train_terms = train_terms[train_terms["EntryID"].isin(ids_set)]

# L·∫•y Top 1500 terms
top_term_list = train_terms["term"].value_counts().head(TOP_LABELS).index.tolist()
term_to_idx = {t: i for i, t in enumerate(top_term_list)}

# T·∫°o Y matrix (Numpy)
# C√°ch t·∫°o nhanh h∆°n pivot_table:
Y_all = np.zeros((len(ids_all), TOP_LABELS), dtype=np.float32)
id_to_idx = {pid: i for i, pid in enumerate(ids_all)}

# Duy·ªát qua file terms ƒë·ªÉ ƒëi·ªÅn s·ªë 1 v√†o ma tr·∫≠n
# (C√°ch n√†y ti·∫øt ki·ªám RAM h∆°n pivot table)
grouped = train_terms[train_terms["term"].isin(top_term_list)].groupby("EntryID")["term"].apply(list)

for pid, terms in tqdm(grouped.items()):
    if pid in id_to_idx:
        row_idx = id_to_idx[pid]
        col_idxs = [term_to_idx[t] for t in terms]
        Y_all[row_idx, col_idxs] = 1.0

print(f"X shape: {X_all.shape}, Y shape: {Y_all.shape}")

# 3. Chia Train/Val
X_train, X_val, Y_train, Y_val = train_test_split(X_all, Y_all, test_size=0.1, random_state=42)

# Chuy·ªÉn sang Tensor
train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(Y_train).float())
val_dataset = TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(Y_val).float())

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# D·ªçn RAM
del X_all, Y_all, train_terms, grouped
gc.collect()

ƒêang load d·ªØ li·ªáu l√™n RAM...
ƒêang x·ª≠ l√Ω nh√£n...


76297it [00:00, 278218.30it/s]


X shape: (82404, 1280), Y shape: (82404, 1500)


0

In [9]:
#X√ÇY D·ª∞NG MODEL (MLP)
class ProteinClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(ProteinClassifier, self).__init__()
        
        self.network = nn.Sequential(
            # Layer 1
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512), # Gi√∫p h·ªôi t·ª• nhanh
            nn.ReLU(),
            nn.Dropout(0.3),     # Ch·ªëng h·ªçc v·∫πt (Overfitting)
            
            # Layer 2
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # Output Layer
            nn.Linear(256, num_classes)
            # Kh√¥ng d√πng Sigmoid ·ªü ƒë√¢y v√¨ BCEWithLogitsLoss ƒë√£ t√≠ch h·ª£p s·∫µn
        )
        
    def forward(self, x):
        return self.network(x)

model = ProteinClassifier(input_dim=1280, num_classes=TOP_LABELS).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss() # Loss chu·∫©n cho Multi-label

In [10]:
print("B·∫Øt ƒë·∫ßu Train Neural Network...")

best_val_loss = float('inf')

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    avg_loss = total_loss / len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            
    avg_val_loss = val_loss / len(val_loader)
    
    print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f} | Val Loss = {avg_val_loss:.4f}")
    
    # Save Best Model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print("  --> Saved Best Model")

print("Train xong!")

B·∫Øt ƒë·∫ßu Train Neural Network...


                                                              

Epoch 1: Train Loss = 0.0319 | Val Loss = 0.0140
  --> Saved Best Model


                                                              

Epoch 2: Train Loss = 0.0139 | Val Loss = 0.0130
  --> Saved Best Model


                                                              

Epoch 3: Train Loss = 0.0131 | Val Loss = 0.0125
  --> Saved Best Model


                                                              

Epoch 4: Train Loss = 0.0126 | Val Loss = 0.0121
  --> Saved Best Model


                                                              

Epoch 5: Train Loss = 0.0123 | Val Loss = 0.0119
  --> Saved Best Model


                                                              

Epoch 6: Train Loss = 0.0121 | Val Loss = 0.0117
  --> Saved Best Model


                                                              

Epoch 7: Train Loss = 0.0119 | Val Loss = 0.0116
  --> Saved Best Model


                                                              

Epoch 8: Train Loss = 0.0118 | Val Loss = 0.0115
  --> Saved Best Model


                                                              

Epoch 9: Train Loss = 0.0117 | Val Loss = 0.0114
  --> Saved Best Model


                                                               

Epoch 10: Train Loss = 0.0116 | Val Loss = 0.0114
  --> Saved Best Model


                                                               

Epoch 11: Train Loss = 0.0114 | Val Loss = 0.0113
  --> Saved Best Model


                                                               

Epoch 12: Train Loss = 0.0113 | Val Loss = 0.0113


                                                               

Epoch 13: Train Loss = 0.0113 | Val Loss = 0.0112
  --> Saved Best Model


                                                               

Epoch 14: Train Loss = 0.0112 | Val Loss = 0.0112
  --> Saved Best Model


                                                               

Epoch 15: Train Loss = 0.0111 | Val Loss = 0.0111
  --> Saved Best Model
Train xong!




In [11]:
print("ƒêang d·ª± ƒëo√°n t·∫≠p Test...")
import glob

# Load l·∫°i best model
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
model.eval()

chunk_files = sorted(glob.glob("/workspace/data/Embeddings/embeddings_chunks/test_part_*.npy"), 
                     key=lambda x: int(x.split('_')[-1].replace('.npy','')))

output_lines = []
# Danh s√°ch t√™n c√°c nh√£n
terms_columns = top_term_list

for f_path in tqdm(chunk_files):
    X_chunk = np.load(f_path)
    # Chuy·ªÉn sang Tensor
    X_tensor = torch.from_numpy(X_chunk).float().to(device)
    
    id_path = f_path.replace("test_part_", "test_ids_")
    ids_chunk = np.load(id_path, allow_pickle=True)
    
    with torch.no_grad():
        logits = model(X_tensor)
        probs = torch.sigmoid(logits).cpu().numpy() # Chuy·ªÉn v·ªÅ x√°c su·∫•t 0-1
        
    # Ghi k·∫øt qu·∫£
    for i, pid in enumerate(ids_chunk):
        prob_row = probs[i]
        # L·∫•y Top 50 cao nh·∫•t
        top_indices = np.argsort(prob_row)[-50:]
        
        for idx in top_indices:
            score = prob_row[idx]
            term = terms_columns[idx]
            if score > 0.01:
                output_lines.append(f"{pid}\t{term}\t{score:.3f}")
                
    del X_chunk, X_tensor
    gc.collect()

ƒêang d·ª± ƒëo√°n t·∫≠p Test...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45/45 [00:21<00:00,  2.05it/s]


In [14]:
OUTPUT_FILE = "/workspace/notebooks/submission_mlp_esm2.tsv"
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("HO√ÄN T·∫§T")

üíæ ƒêang l∆∞u /workspace/notebooks/submission_mlp_esm2.tsv...
HO√ÄN T·∫§T


#### + Propagation

In [15]:
!pip install -q obonet networkx

import networkx
import obonet
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

INPUT_FILE = "/workspace/notebooks/submission_mlp_esm2.tsv"
OUTPUT_FILE = "/workspace/notebooks/submission_mlp_propagated.tsv"

OBO_PATH = "/workspace/data/Train/go-basic.obo"

# --- B∆Ø·ªöC 1: X√ÇY D·ª∞NG B·∫¢N ƒê·ªí T·ªî TI√äN ---
print("ƒêang ƒë·ªçc c·∫•u tr√∫c Gene Ontology...")
graph = obonet.read_obo(OBO_PATH)

print("ƒêang map quan h·ªá (Con -> Cha √îng)...")
ancestors_map = {}
# Duy·ªát qua t·∫•t c·∫£ c√°c node ƒë·ªÉ t√¨m t·ªï ti√™n tr∆∞·ªõc (Pre-compute)
for node in tqdm(graph.nodes()):
    try:
        # networkx.descendants tr·∫£ v·ªÅ t·∫≠p h·ª£p c√°c node cha/√¥ng
        ancestors = networkx.descendants(graph, node)
        ancestors_map[node] = ancestors
    except:
        pass

# --- B∆Ø·ªöC 2: ƒê·ªåC FILE D·ª∞ ƒêO√ÅN ---
print(f"ƒêang ƒë·ªçc file {INPUT_FILE}...")
preds = {} # C·∫•u tr√∫c: {ProteinID: {Term: Score}}

try:
    with open(INPUT_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            pid, term, score = parts[0], parts[1], float(parts[2])
            
            if pid not in preds: preds[pid] = {}
            preds[pid][term] = score
except FileNotFoundError:
    print(f"L·ªói: Kh√¥ng t√¨m th·∫•y file {INPUT_FILE}. B·∫°n ƒë√£ ch·∫°y xong b∆∞·ªõc Train MLP ch∆∞a?")
    raise

# --- B∆Ø·ªöC 3: LAN TRUY·ªÄN (PROPAGATION) ---
print("ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Logic: ƒêi·ªÉm Cha = Max(Cha c≈©, Con))...")
output_lines = []

for pid, term_scores in tqdm(preds.items()):
    # T·∫°o b·∫£n sao ƒë·ªÉ c·∫≠p nh·∫≠t ƒëi·ªÉm
    new_scores = term_scores.copy()
    
    # Duy·ªát qua t·ª´ng term con ƒëang c√≥ ƒëi·ªÉm
    for term, score in term_scores.items():
        # N·∫øu term n√†y c√≥ cha √¥ng trong b·∫£n ƒë·ªì
        if term in ancestors_map:
            parents = ancestors_map[term]
            for parent in parents:
                # C·∫≠p nh·∫≠t ƒëi·ªÉm cho cha
                current_p_score = new_scores.get(parent, 0.0)
                # Ch·ªâ c·∫≠p nh·∫≠t n·∫øu ƒëi·ªÉm m·ªõi cao h∆°n ƒëi·ªÉm c≈©
                if score > current_p_score:
                    new_scores[parent] = score
    
    # --- B∆Ø·ªöC 4: L·ªåC TOP K ƒê·ªÇ FILE KH√îNG B·ªä N·∫∂NG ---
    # Sau khi lan truy·ªÅn, m·ªôt protein c√≥ th·ªÉ c√≥ h√†ng trƒÉm nh√£n cha.
    # Ta c·∫ßn l·ªçc l·∫•y Top 75 ƒëi·ªÉm cao nh·∫•t.
    
    # S·∫Øp x·∫øp gi·∫£m d·∫ßn theo ƒëi·ªÉm
    sorted_terms = sorted(new_scores.items(), key=lambda x: x[1], reverse=True)
    
    # L·∫•y Top 75
    for term, score in sorted_terms[:75]:
        # Ch·ªâ l·∫•y ƒëi·ªÉm > 0.01 ƒë·ªÉ b·ªè r√°c
        if score > 0.01:
            output_lines.append(f"{pid}\t{term}\t{score:.3f}")

# --- B∆Ø·ªöC 5: L∆ØU FILE ---
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("-" * 30)
print(f"HO√ÄN TH√ÄNH!")

[0mƒêang ƒë·ªçc c·∫•u tr√∫c Gene Ontology...
ƒêang map quan h·ªá (Con -> Cha √îng)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 160248.07it/s]


ƒêang ƒë·ªçc file /workspace/notebooks/submission_mlp_esm2.tsv...


7982025it [00:03, 2331406.83it/s]


ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Logic: ƒêi·ªÉm Cha = Max(Cha c≈©, Con))...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:12<00:00, 18075.72it/s]


ƒêang l∆∞u /workspace/notebooks/submission_mlp_propagated.tsv...
------------------------------
HO√ÄN TH√ÄNH!


#### Score: 0.204 -> H·ªçc v·∫πt ?

### Ensemble: KNN + BLAST

In [17]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# --- C·∫§U H√åNH ---
# 1. File KNN t·ªët nh·∫•t c·ªßa b·∫°n (File ƒë·∫°t 0.234)
# L∆∞u √Ω: Ch·ªçn ƒë√∫ng file output c·ªßa b∆∞·ªõc KNN c√≥ Taxonomy + Propagation
KNN_FILE = "/workspace/notebooks/submission_knn_tax_propagated.tsv" 

# 2. File k·∫øt qu·∫£ BLAST (Level 2)
# N·∫øu b·∫°n ch∆∞a ch·∫°y BLAST ho·∫∑c ƒë√£ x√≥a, ƒëo·∫°n code d∆∞·ªõi s·∫Ω t·ª± ch·∫°y l·∫°i (m·∫•t 3 ph√∫t)
BLAST_RESULT = "/workspace/data/diamond_results.tsv"
TRAIN_FASTA = "/workspace/data/Train/train_sequences.fasta"
TEST_FASTA = "/workspace/data/Test/testsuperset.fasta"
DB_PATH = "/workspace/data/Train/train_data.dmnd"
DIAMOND_BIN = "/usr/bin/diamond" # Ho·∫∑c "diamond" n·∫øu ƒë√£ add path

OUTPUT_FILE = "/workspace/notebooks/submission_hybrid_best_v1.tsv"

# --- B∆Ø·ªöC 1: KI·ªÇM TRA V√Ä CH·∫†Y BLAST N·∫æU C·∫¶N ---
if not os.path.exists(BLAST_RESULT):
    print("Kh√¥ng th·∫•y file BLAST c≈©, ƒëang ch·∫°y l·∫°i...")
    if not os.path.exists(DB_PATH):
        os.system(f"{DIAMOND_BIN} makedb --in {TRAIN_FASTA} -d {DB_PATH} --quiet")
    
    # Ch·∫°y so kh·ªõp
    cmd = f"{DIAMOND_BIN} blastp -d {DB_PATH} -q {TEST_FASTA} -o {BLAST_RESULT} --sensitive --top 1 -f 6 qseqid sseqid pident"
    os.system(cmd)
    print("BLAST xong!")
else:
    print("ƒê√£ c√≥ s·∫µn file BLAST.")

# --- B∆Ø·ªöC 2: ƒê·ªåC D·ªÆ LI·ªÜU ---
print("ƒêang load d·ªØ li·ªáu...")

# 1. Load BLAST Predictions
# Logic: Ch·ªâ tin t∆∞·ªüng BLAST n·∫øu ƒë·ªô gi·ªëng nhau > 30%
# Map TrainID sang Terms
train_terms = pd.read_csv("/workspace/data/Train/train_terms.tsv", sep="\t", usecols=["EntryID", "term"])
train_terms_grouped = train_terms.groupby("EntryID")["term"].apply(list).to_dict()

blast_preds = {}
df_blast = pd.read_csv(BLAST_RESULT, sep="\t", names=["test_id", "train_id", "pident"])

print("ƒêang x·ª≠ l√Ω BLAST...")
for _, row in df_blast.iterrows():
    pident = row['pident'] / 100.0
    if pident < 0.3: continue # B·ªè qua n·∫øu gi·ªëng nhau qu√° √≠t
    
    test_id = str(row['test_id']).split("|")[1] if "|" in str(row['test_id']) else str(row['test_id'])
    train_id = str(row['train_id']).split("|")[1] if "|" in str(row['train_id']) else str(row['train_id'])
    
    if train_id in train_terms_grouped:
        for term in train_terms_grouped[train_id]:
            # BLAST score r·∫•t uy t√≠n
            blast_preds[(test_id, term)] = pident

# 2. Load KNN Predictions (File 0.234)
print(f"ƒêang ƒë·ªçc file KNN: {KNN_FILE}...")
knn_preds = {}
try:
    with open(KNN_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            pid, term, score = parts[0], parts[1], float(parts[2])
            knn_preds[(pid, term)] = score
except FileNotFoundError:
    print(f"L·ªñI: Kh√¥ng t√¨m th·∫•y file {KNN_FILE}. B·∫°n h√£y ch·∫Øc ch·∫Øn ƒë√£ ch·∫°y xong b∆∞·ªõc KNN Tax Propagation.")
    raise

# --- B∆Ø·ªöC 3: TR·ªòN (MAX STRATEGY) ---
print("ƒêang tr·ªôn BLAST + KNN (L·∫•y ƒëi·ªÉm cao nh·∫•t)...")
# Logic: Final_Score = Max(BLAST, KNN)
# N·∫øu BLAST t√¨m th·∫•y (ƒë·ªô tin c·∫≠y cao), n√≥ s·∫Ω ghi ƒë√® l√™n KNN.
# N·∫øu BLAST b√≥ tay, ta d√πng KNN.

all_keys = set(blast_preds.keys()) | set(knn_preds.keys())
output_lines = []

for key in tqdm(all_keys):
    pid, term = key
    
    s_blast = blast_preds.get(key, 0.0)
    s_knn = knn_preds.get(key, 0.0)
    
    final_score = max(s_blast, s_knn)
    
    # L·ªçc r√°c l·∫ßn cu·ªëi
    if final_score > 0.01:
        output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

# --- B∆Ø·ªöC 4: L∆ØU FILE ---
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("XONG! H√£y n·ªôp file n√†y.")

ƒê√£ c√≥ s·∫µn file BLAST.
ƒêang load d·ªØ li·ªáu...
ƒêang x·ª≠ l√Ω BLAST...
ƒêang ƒë·ªçc file KNN: /workspace/notebooks/submission_knn_tax_propagated.tsv...


16226847it [00:10, 1523809.83it/s]


ƒêang tr·ªôn BLAST + KNN (L·∫•y ƒëi·ªÉm cao nh·∫•t)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16710313/16710313 [00:19<00:00, 849071.23it/s]


ƒêang l∆∞u /workspace/notebooks/submission_hybrid_best_v1.tsv...
XONG! H√£y n·ªôp file n√†y.


#### Score: 0.225

### MLP with early stopping

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gc
import os

# --- C·∫§U H√åNH ---
TRAIN_EMB_PATH = "/workspace/data/Embeddings/train_650M.npy"
TRAIN_IDS_PATH = "/workspace/data/Embeddings/train_650M_ids.npy"
TRAIN_TERMS_PATH = "/workspace/data/Train/train_terms.tsv"
MODEL_SAVE_PATH = "/workspace/models/best_mlp_model.pth"
OUTPUT_FILE = "/workspace/notebooks/submission_mlp_tuned.tsv"

BATCH_SIZE = 256 # TƒÉng batch size ƒë·ªÉ gradient ·ªïn ƒë·ªãnh h∆°n
EPOCHS = 50      # Cho max 50, nh∆∞ng Early Stop s·∫Ω d·ª´ng s·ªõm
LEARNING_RATE = 5e-4 # Gi·∫£m LR xu·ªëng ƒë·ªÉ h·ªçc ch·∫≠m m√† ch·∫Øc (tr√°nh v·ªçt x√†)
TOP_LABELS = 1500 
PATIENCE = 5     # Cho ph√©p kh√¥ng ti·∫øn b·ªô 5 l·∫ßn, qu√° 5 l·∫ßn l√† c·∫Øt

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- B∆Ø·ªöC 1: LOAD D·ªÆ LI·ªÜU (Gi·ªëng h·ªát c≈©) ---
print("ƒêang load d·ªØ li·ªáu...")
X_all = np.load(TRAIN_EMB_PATH)
ids_all = np.load(TRAIN_IDS_PATH, allow_pickle=True)

train_terms = pd.read_csv(TRAIN_TERMS_PATH, sep="\t", usecols=["EntryID", "term"])
ids_set = set(ids_all)
train_terms = train_terms[train_terms["EntryID"].isin(ids_set)]
top_term_list = train_terms["term"].value_counts().head(TOP_LABELS).index.tolist()
term_to_idx = {t: i for i, t in enumerate(top_term_list)}

Y_all = np.zeros((len(ids_all), TOP_LABELS), dtype=np.float32)
id_to_idx = {pid: i for i, pid in enumerate(ids_all)}
grouped = train_terms[train_terms["term"].isin(top_term_list)].groupby("EntryID")["term"].apply(list)
for pid, terms in grouped.items():
    if pid in id_to_idx:
        Y_all[id_to_idx[pid], [term_to_idx[t] for t in terms]] = 1.0

# Chia t·∫≠p (Quan tr·ªçng: Stratify ƒë·ªÉ chia ƒë·ªÅu nh√£n kh√≥/d·ªÖ n·∫øu c√≥ th·ªÉ, nh∆∞ng multilabel kh√≥ stratify chu·∫©n n√™n random)
X_train, X_val, Y_train, Y_val = train_test_split(X_all, Y_all, test_size=0.15, random_state=42) # TƒÉng Val l√™n 15%

train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(Y_train).float())
val_dataset = TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(Y_val).float())
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

del X_all, Y_all, train_terms
gc.collect()

# --- B∆Ø·ªöC 2: MODEL C·∫¢I TI·∫æN ---
class ProteinClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(ProteinClassifier, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 1024), # TƒÉng layer ƒë·∫ßu
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.5),     # Dropout m·∫°nh tay (50%) ƒë·ªÉ ph·∫°t h·ªçc v·∫πt
            
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            
            nn.Linear(512, num_classes)
        )
    def forward(self, x):
        return self.network(x)

model = ProteinClassifier(1280, TOP_LABELS).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4) # AdamW ch·ªëng overfitting t·ªët h∆°n
criterion = nn.BCEWithLogitsLoss()

# --- B∆Ø·ªöC 3: TRAIN V·ªöI EARLY STOPPING ---
print("B·∫Øt ƒë·∫ßu Train (C√≥ Early Stopping)...")
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    for bx, by in train_loader:
        bx, by = bx.to(device), by.to(device)
        optimizer.zero_grad()
        loss = criterion(model(bx), by)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for bx, by in val_loader:
            bx, by = bx.to(device), by.to(device)
            val_loss += criterion(model(bx), by).item()
            
    avg_train = train_loss / len(train_loader)
    avg_val = val_loss / len(val_loader)
    
    print(f"Epoch {epoch+1}: Train={avg_train:.4f} | Val={avg_val:.4f}", end="")
    
    # LOGIC EARLY STOPPING
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        patience_counter = 0
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(" --> Saved Best Model")
    else:
        patience_counter += 1
        print(f" | Patience {patience_counter}/{PATIENCE}")
        
    if patience_counter >= PATIENCE:
        print(f"D·ª™NG S·ªöM! Model b·∫Øt ƒë·∫ßu h·ªçc v·∫πt t·∫°i Epoch {epoch+1}")
        break

# --- B∆Ø·ªöC 4: D·ª∞ ƒêO√ÅN & GHI FILE ---
# (Ph·∫ßn n√†y gi·ªØ nguy√™n logic c≈©, ch·ªâ load l·∫°i best model)
print("ƒêang d·ª± ƒëo√°n t·∫≠p Test v·ªõi Best Model...")
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
model.eval()

import glob
from tqdm import tqdm
chunk_files = sorted(glob.glob("/workspace/data/Embeddings/embeddings_chunks/test_part_*.npy"), 
                     key=lambda x: int(x.split('_')[-1].replace('.npy','')))

output_lines = []
for f_path in tqdm(chunk_files):
    X_chunk = np.load(f_path)
    X_tensor = torch.from_numpy(X_chunk).float().to(device)
    ids_chunk = np.load(f_path.replace("test_part_", "test_ids_"), allow_pickle=True)
    
    with torch.no_grad():
        probs = torch.sigmoid(model(X_tensor)).cpu().numpy()
        
    for i, pid in enumerate(ids_chunk):
        # L·∫•y Top 50, ng∆∞·ª°ng th·∫•p h∆°n 1 ch√∫t
        top_idxs = np.argsort(probs[i])[-50:]
        for idx in top_idxs:
            s = probs[i][idx]
            if s > 0.005: # H·∫° threshold xu·ªëng ch√∫t
                output_lines.append(f"{pid}\t{top_term_list[idx]}\t{s:.3f}")
                
    del X_chunk, X_tensor
    gc.collect()

with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))
print("Done! H√£y ch·∫°y ti·∫øp Propagation cho file 'submission_mlp_tuned.tsv'")

ƒêang load d·ªØ li·ªáu...
B·∫Øt ƒë·∫ßu Train (C√≥ Early Stopping)...
Epoch 1: Train=0.0540 | Val=0.0158 --> Saved Best Model
Epoch 2: Train=0.0149 | Val=0.0142 --> Saved Best Model
Epoch 3: Train=0.0141 | Val=0.0136 --> Saved Best Model
Epoch 4: Train=0.0136 | Val=0.0132 --> Saved Best Model
Epoch 5: Train=0.0132 | Val=0.0129 --> Saved Best Model
Epoch 6: Train=0.0129 | Val=0.0126 --> Saved Best Model
Epoch 7: Train=0.0126 | Val=0.0124 --> Saved Best Model
Epoch 8: Train=0.0124 | Val=0.0123 --> Saved Best Model
Epoch 9: Train=0.0122 | Val=0.0121 --> Saved Best Model
Epoch 10: Train=0.0121 | Val=0.0119 --> Saved Best Model
Epoch 11: Train=0.0119 | Val=0.0119 | Patience 1/5
Epoch 12: Train=0.0118 | Val=0.0118 --> Saved Best Model
Epoch 13: Train=0.0117 | Val=0.0118 | Patience 1/5
Epoch 14: Train=0.0116 | Val=0.0116 --> Saved Best Model
Epoch 15: Train=0.0115 | Val=0.0115 --> Saved Best Model
Epoch 16: Train=0.0114 | Val=0.0115 --> Saved Best Model
Epoch 17: Train=0.0114 | Val=0.0115 | Pa

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45/45 [00:23<00:00,  1.95it/s]


Done! H√£y ch·∫°y ti·∫øp Propagation cho file 'submission_mlp_tuned.tsv'


In [5]:
%pip install -q obonet networkx

import networkx
import obonet
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# --- C·∫§U H√åNH ---
# 1. File ƒë·∫ßu v√†o: K·∫øt qu·∫£ c·ªßa MLP Tuned v·ª´a ch·∫°y
INPUT_FILE = "/workspace/notebooks/submission_mlp_tuned.tsv"

# 2. File ƒë·∫ßu ra: File n·ªôp cu·ªëi c√πng
OUTPUT_FILE = "/workspace/notebooks/submission_mlp_tuned_propagated.tsv"

# 3. T√¨m file OBO (C√¢y ph·∫£ h·ªá)
if os.path.exists("/workspace/data/Train/go-basic.obo"):
    OBO_PATH = "/workspace/data/Train/go-basic.obo"
else:
    OBO_PATH = "/workspace/data/go-basic.obo"

print(f"ƒêang d√πng file OBO t·∫°i: {OBO_PATH}")

# --- B∆Ø·ªöC 1: X√ÇY D·ª∞NG B·∫¢N ƒê·ªí T·ªî TI√äN ---
print("ƒêang ƒë·ªçc c·∫•u tr√∫c Gene Ontology...")
graph = obonet.read_obo(OBO_PATH)

print("ƒêang map quan h·ªá (Con -> Cha √îng)...")
ancestors_map = {}
# Duy·ªát qua t·∫•t c·∫£ c√°c node ƒë·ªÉ t√¨m t·ªï ti√™n (Pre-compute cho nhanh)
for node in tqdm(graph.nodes()):
    try:
        # networkx.descendants tr·∫£ v·ªÅ t·∫≠p h·ª£p c√°c node cha/√¥ng
        ancestors = networkx.descendants(graph, node)
        ancestors_map[node] = ancestors
    except:
        pass

# --- B∆Ø·ªöC 2: ƒê·ªåC FILE D·ª∞ ƒêO√ÅN ---
print(f"ƒêang ƒë·ªçc file {INPUT_FILE}...")
preds = {} # C·∫•u tr√∫c: {ProteinID: {Term: Score}}

try:
    with open(INPUT_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            pid, term, score = parts[0], parts[1], float(parts[2])
            
            if pid not in preds: preds[pid] = {}
            preds[pid][term] = score
except FileNotFoundError:
    print(f"L·ªói: Kh√¥ng t√¨m th·∫•y file {INPUT_FILE}. B·∫°n ƒë√£ ch·∫°y xong b∆∞·ªõc Train MLP Tuned ch∆∞a?")
    raise

# --- B∆Ø·ªöC 3: LAN TRUY·ªÄN (PROPAGATION) ---
print("ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Logic: ƒêi·ªÉm Cha = Max(Cha c≈©, Con))...")
output_lines = []

for pid, term_scores in tqdm(preds.items()):
    # T·∫°o b·∫£n sao ƒë·ªÉ c·∫≠p nh·∫≠t ƒëi·ªÉm
    new_scores = term_scores.copy()
    
    # Duy·ªát qua t·ª´ng term con ƒëang c√≥ ƒëi·ªÉm
    for term, score in term_scores.items():
        # N·∫øu term n√†y c√≥ cha √¥ng trong b·∫£n ƒë·ªì
        if term in ancestors_map:
            parents = ancestors_map[term]
            for parent in parents:
                # C·∫≠p nh·∫≠t ƒëi·ªÉm cho cha
                current_p_score = new_scores.get(parent, 0.0)
                # Ch·ªâ c·∫≠p nh·∫≠t n·∫øu ƒëi·ªÉm m·ªõi cao h∆°n ƒëi·ªÉm c≈©
                if score > current_p_score:
                    new_scores[parent] = score
    
    # --- B∆Ø·ªöC 4: L·ªåC & GHI FILE ---
    # S·∫Øp x·∫øp gi·∫£m d·∫ßn theo ƒëi·ªÉm
    sorted_terms = sorted(new_scores.items(), key=lambda x: x[1], reverse=True)
    
    # L·∫•y Top 75 ƒëi·ªÉm cao nh·∫•t (ƒë·ªÉ file kh√¥ng qu√° n·∫∑ng)
    for term, score in sorted_terms[:75]:
        # Ch·ªâ l·∫•y ƒëi·ªÉm > 0.005
        if score > 0.005:
            output_lines.append(f"{pid}\t{term}\t{score:.3f}")

# --- B∆Ø·ªöC 5: L∆ØU FILE ---
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("-" * 30)
print(f"HO√ÄN TH√ÄNH! File '{OUTPUT_FILE}' ƒë√£ s·∫µn s√†ng.")
print("H√£y n·ªôp file n√†y l√™n Kaggle v√† xem s·ª± kh√°c bi·ªát!")

[0mNote: you may need to restart the kernel to use updated packages.
ƒêang d√πng file OBO t·∫°i: /workspace/data/Train/go-basic.obo
ƒêang ƒë·ªçc c·∫•u tr√∫c Gene Ontology...
ƒêang map quan h·ªá (Con -> Cha √îng)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 154454.33it/s]


ƒêang ƒë·ªçc file /workspace/notebooks/submission_mlp_tuned.tsv...


9935666it [00:04, 2323627.72it/s]


ƒêang lan truy·ªÅn ƒëi·ªÉm s·ªë (Logic: ƒêi·ªÉm Cha = Max(Cha c≈©, Con))...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 224309/224309 [00:14<00:00, 14988.68it/s]


ƒêang l∆∞u /workspace/notebooks/submission_mlp_tuned_propagated.tsv...
------------------------------
HO√ÄN TH√ÄNH! File '/workspace/notebooks/submission_mlp_tuned_propagated.tsv' ƒë√£ s·∫µn s√†ng.
H√£y n·ªôp file n√†y l√™n Kaggle v√† xem s·ª± kh√°c bi·ªát!


#### Score: 0.217

### Taxonomy-aware KNN + BLAST

In [6]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# --- C·∫§U H√åNH ---
# 1. File KNN x·ªãn nh·∫•t (File ƒë·∫°t 0.234)
# (ƒê·∫£m b·∫£o b·∫°n d√πng ƒë√∫ng file ƒë√£ ch·∫°y Propagation)
KNN_FILE = "/workspace/notebooks/submission_knn_tax_propagated.tsv"

# 2. C·∫•u h√¨nh ƒë·ªÉ ch·∫°y l·∫°i BLAST (n·∫øu c·∫ßn)
BLAST_RESULT = "/workspace/data/diamond_results.tsv"
TRAIN_FASTA = "/workspace/data/Train/train_sequences.fasta"
TEST_FASTA = "/workspace/data/Test/testsuperset.fasta"
DB_PATH = "/workspace/data/Train/train_data.dmnd"
DIAMOND_BIN = "/usr/bin/diamond" # Ho·∫∑c ƒë∆∞·ªùng d·∫´n t·ªõi file diamond

OUTPUT_FILE = "/workspace/notebooks/submission_FINAL_ENSEMBLE.tsv"

# --- B∆Ø·ªöC 1: ƒê·∫¢M B·∫¢O C√ì K·∫æT QU·∫¢ BLAST ---
if not os.path.exists(BLAST_RESULT):
    print("ƒêang ch·∫°y l·∫°i BLAST l·∫ßn cu·ªëi (cho ch·∫Øc ƒÉn)...")
    if not os.path.exists(DB_PATH):
        os.system(f"{DIAMOND_BIN} makedb --in {TRAIN_FASTA} -d {DB_PATH} --quiet")
    
    # Ch·∫°y Diamond (Sensitive mode)
    cmd = f"{DIAMOND_BIN} blastp -d {DB_PATH} -q {TEST_FASTA} -o {BLAST_RESULT} --sensitive --top 1 -f 6 qseqid sseqid pident"
    os.system(cmd)
else:
    print("ƒê√£ c√≥ s·∫µn k·∫øt qu·∫£ BLAST.")

# --- B∆Ø·ªöC 2: LOAD D·ªÆ LI·ªÜU BLAST ---
print("ƒêang x·ª≠ l√Ω BLAST...")
# Load mapping ID -> Terms
train_terms = pd.read_csv("/workspace/data/Train/train_terms.tsv", sep="\t", usecols=["EntryID", "term"])
train_terms_grouped = train_terms.groupby("EntryID")["term"].apply(list).to_dict()

blast_preds = {}
df_blast = pd.read_csv(BLAST_RESULT, sep="\t", names=["test_id", "train_id", "pident"])

for _, row in tqdm(df_blast.iterrows(), total=len(df_blast)):
    pident = row['pident'] / 100.0
    # Ch·ªâ tin t∆∞·ªüng BLAST n·∫øu ƒë·ªô gi·ªëng > 35%
    if pident < 0.35: continue
    
    # Clean ID
    tid = str(row['test_id']).split("|")[1] if "|" in str(row['test_id']) else str(row['test_id'])
    trid = str(row['train_id']).split("|")[1] if "|" in str(row['train_id']) else str(row['train_id'])
    
    if trid in train_terms_grouped:
        for term in train_terms_grouped[trid]:
            # L∆∞u ƒëi·ªÉm BLAST
            blast_preds[(tid, term)] = pident

# --- B∆Ø·ªöC 3: LOAD D·ªÆ LI·ªÜU KNN (0.234) ---
print(f"ƒêang ƒë·ªçc file KNN t·ªët nh·∫•t ({KNN_FILE})...")
knn_preds = {}
try:
    with open(KNN_FILE) as f:
        for line in tqdm(f):
            parts = line.strip().split('\t')
            if len(parts) < 3: continue
            knn_preds[(parts[0], parts[1])] = float(parts[2])
except FileNotFoundError:
    print(f"L·ªói: Kh√¥ng t√¨m th·∫•y file {KNN_FILE}. B·∫°n check l·∫°i t√™n file KNN 0.234 nh√©.")
    raise

# --- B∆Ø·ªöC 4: H·ª¢P TH·ªÇ (MAX STRATEGY) ---
print("ƒêang tr·ªôn (Max Strategy)...")
all_keys = set(blast_preds.keys()) | set(knn_preds.keys())
output_lines = []

for key in tqdm(all_keys):
    pid, term = key
    
    s_blast = blast_preds.get(key, 0.0)
    s_knn = knn_preds.get(key, 0.0)
    
    # L·∫•y ƒëi·ªÉm cao nh·∫•t gi·ªØa 2 thu·∫≠t to√°n
    final_score = max(s_blast, s_knn)
    
    if final_score > 0.01:
        output_lines.append(f"{pid}\t{term}\t{final_score:.3f}")

# --- B∆Ø·ªöC 5: L∆ØU FILE ---
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("DONE!")

ƒê√£ c√≥ s·∫µn k·∫øt qu·∫£ BLAST.
ƒêang x·ª≠ l√Ω BLAST...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 262530/262530 [00:03<00:00, 74232.65it/s]


ƒêang ƒë·ªçc file KNN t·ªët nh·∫•t (/workspace/notebooks/submission_knn_tax_propagated.tsv)...


16226847it [00:09, 1634421.68it/s]


ƒêang tr·ªôn (Max Strategy)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16693341/16693341 [00:18<00:00, 904218.13it/s]


ƒêang l∆∞u /workspace/notebooks/submission_FINAL_ENSEMBLE.tsv...
DONE!


#### Score: 0.224

### Weighted F1: Nh√£n hi·∫øm -> ƒëi·ªÉm cao. Nh∆∞ng KNN ƒëang x·ª≠ l√≠ c√°c nh√£n l√† nh∆∞ nhau

### Sol: IC weighting (Information Content weighting): Nh√£n n√†o hi·∫øm -> b∆°m ƒëi·ªÉm

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import math

# --- C·∫§U H√åNH ---
# D√πng l·∫°i file t·ªët nh·∫•t c·ªßa b·∫°n (File ƒë·∫°t 0.234)
# (ƒê·∫£m b·∫£o file n√†y ƒëang n·∫±m trong th∆∞ m·ª•c hi·ªán t·∫°i)
INPUT_FILE = "submission_knn_tax_propagated.tsv"
OUTPUT_FILE = "submission_knn_tax_IC_weighted.tsv"
TRAIN_TERMS = "/workspace/data/Train/train_terms.tsv"

# --- B∆Ø·ªöC 1: T√çNH ƒê·ªò HI·∫æM (INFORMATION CONTENT) ---
print("ƒêang t√≠nh to√°n ƒë·ªô hi·∫øm c·ªßa t·ª´ng nh√£n (IC)...")
df_terms = pd.read_csv(TRAIN_TERMS, sep="\t", usecols=['term'])
term_counts = df_terms['term'].value_counts().to_dict()
total_proteins = len(df_terms)

# H√†m t√≠nh IC: C√†ng hi·∫øm c√†ng cao
# IC = -log2(Count / Total)
term_weights = {}
for term, count in term_counts.items():
    prob = count / total_proteins
    ic = -math.log2(prob)
    # Chu·∫©n h√≥a nh·∫π ƒë·ªÉ kh√¥ng b∆°m qu√° l·ªë (Scale v·ªÅ kho·∫£ng 1.0 - 2.0)
    # Nh√£n c·ª±c hi·∫øm (IC cao) s·∫Ω ƒë∆∞·ª£c nh√¢n h·ªá s·ªë l·ªõn
    term_weights[term] = ic

print(f"‚úÖ ƒê√£ t√≠nh tr·ªçng s·ªë cho {len(term_weights)} nh√£n.")

# --- B∆Ø·ªöC 2: B∆†M ƒêI·ªÇM CHO FILE C≈® ---
print(f"ƒêang ƒë·ªçc v√† boost ƒëi·ªÉm cho {INPUT_FILE}...")
output_lines = []

with open(INPUT_FILE) as f:
    for line in tqdm(f):
        parts = line.strip().split('\t')
        if len(parts) < 3: continue
        pid, term, score = parts[0], parts[1], float(parts[2])
        
        # L·∫•y tr·ªçng s·ªë IC (N·∫øu kh√¥ng c√≥ trong train th√¨ m·∫∑c ƒë·ªãnh l√† hi·∫øm -> weight cao)
        # Weight trung b√¨nh th∆∞·ªùng t·∫ßm 5-10. Ta scale xu·ªëng ch√∫t cho an to√†n.
        weight = term_weights.get(term, 10.0) 
        
        # LOGIC BOOST:
        # Score m·ªõi = Score c≈© * (1 + Factor * IC)
        # Factor 0.05 nghƒ©a l√† nh√£n hi·∫øm ƒë∆∞·ª£c th∆∞·ªüng th√™m t·∫ßm 20-50% ƒëi·ªÉm
        boost_factor = 0.05 
        new_score = score * (1 + boost_factor * weight)
        
        # Clip v·ªÅ 1.0
        new_score = min(new_score, 1.0)
        
        # Ch·ªâ gi·ªØ l·∫°i n·∫øu ƒëi·ªÉm s·ªë ƒë·ªß cao (L·ªçc r√°c k·ªπ h∆°n)
        if new_score > 0.015:
            output_lines.append(f"{pid}\t{term}\t{new_score:.3f}")

# --- B∆Ø·ªöC 3: L∆ØU FILE ---
print(f"ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("XONG!")

ƒêang t√≠nh to√°n ƒë·ªô hi·∫øm c·ªßa t·ª´ng nh√£n (IC)...
‚úÖ ƒê√£ t√≠nh tr·ªçng s·ªë cho 26125 nh√£n.
ƒêang ƒë·ªçc v√† boost ƒëi·ªÉm cho submission_knn_tax_propagated.tsv...


16226847it [00:11, 1391612.00it/s]


ƒêang l∆∞u submission_knn_tax_IC_weighted.tsv...
XONG!


#### Score: 0.24

#### V√©t

In [2]:
import pandas as pd
from tqdm import tqdm
import math

# --- C·∫§U H√åNH ---
# D√πng file KNN g·ªëc (File 0.234 - tr∆∞·ªõc khi IC weight)
# L∆∞u √Ω: Ph·∫£i l√† file "submission_knn_tax_propagated.tsv"
INPUT_FILE = "submission_knn_tax_propagated.tsv"
TRAIN_TERMS = "/workspace/data/Train/train_terms.tsv"

# --- B∆Ø·ªöC 1: T√çNH L·∫†I IC (Gi·ªØ nguy√™n) ---
print("ƒêang t√≠nh IC...")
df_terms = pd.read_csv(TRAIN_TERMS, sep="\t", usecols=['term'])
term_counts = df_terms['term'].value_counts().to_dict()
total_proteins = len(df_terms)
term_weights = {}
for term, count in term_counts.items():
    term_weights[term] = -math.log2(count / total_proteins)

# --- B∆Ø·ªöC 2: T·∫†O 3 BI·∫æN TH·ªÇ ---
# C·∫•u h√¨nh c√°c chi·∫øn thu·∫≠t
strategies = [
    {"name": "v1_conservative", "boost": 0.02, "thr": 0.015}, # B∆°m nh·∫π
    {"name": "v2_aggressive",   "boost": 0.10, "thr": 0.015}, # B∆°m m·∫°nh tay
    {"name": "v3_loose_thr",    "boost": 0.05, "thr": 0.010}, # B∆°m v·ª´a, nh∆∞ng l·∫•y nhi·ªÅu nh√£n h∆°n
]

print("ƒêang t·∫°o 3 file bi·∫øn th·ªÉ...")

# ƒê·ªçc file g·ªëc v√†o b·ªô nh·ªõ cho nhanh
with open(INPUT_FILE) as f:
    lines = f.readlines()

for strat in strategies:
    out_name = f"submission_IC_{strat['name']}.tsv"
    boost = strat['boost']
    thr = strat['thr']
    
    print(f"-> T·∫°o {out_name} (Boost={boost}, Thr={thr})...")
    out_lines = []
    
    for line in tqdm(lines, leave=False):
        parts = line.strip().split('\t')
        if len(parts) < 3: continue
        pid, term, score = parts[0], parts[1], float(parts[2])
        
        weight = term_weights.get(term, 10.0)
        
        # C√¥ng th·ª©c boost
        new_score = score * (1 + boost * weight)
        new_score = min(new_score, 1.0)
        
        if new_score > thr:
            out_lines.append(f"{pid}\t{term}\t{new_score:.3f}")
            
    with open(out_name, "w") as f:
        f.write("\n".join(out_lines))

print("ƒê√£ t·∫°o xong 3 file! H√£y n·ªôp l·∫ßn l∆∞·ª£t ƒë·ªÉ t√¨m ra c·∫•u h√¨nh t·ªët nh·∫•t.")

ƒêang t√≠nh IC...
ƒêang t·∫°o 3 file bi·∫øn th·ªÉ...
-> T·∫°o submission_IC_v1_conservative.tsv (Boost=0.02, Thr=0.015)...


                                                                

-> T·∫°o submission_IC_v2_aggressive.tsv (Boost=0.1, Thr=0.015)...


                                                                

-> T·∫°o submission_IC_v3_loose_thr.tsv (Boost=0.05, Thr=0.01)...


                                                                

ƒê√£ t·∫°o xong 3 file! H√£y n·ªôp l·∫ßn l∆∞·ª£t ƒë·ªÉ t√¨m ra c·∫•u h√¨nh t·ªët nh·∫•t.
