## Get and load data

In [1]:
import os
import gzip
import shutil
from Bio import SeqIO
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- CONFIGURATION ---
DATA_DIR = "/workspace/data/External"
os.makedirs(DATA_DIR, exist_ok=True)

# Swiss-Prot download link (gzipped FASTA file containing the sequences)
SPROT_URL = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
SPROT_GZ = f"{DATA_DIR}/uniprot_sprot.fasta.gz"
SPROT_FASTA = f"{DATA_DIR}/uniprot_sprot.fasta"

# 1. Download the file (roughly 90MB compressed -> 300MB decompressed).
# The existence of SPROT_FASTA is the "already done" marker, so it must only
# ever appear once the download AND the decompression have fully succeeded.
if not os.path.exists(SPROT_FASTA):
    print("ƒêang t·∫£i Swiss-Prot t·ª´ UniProt FTP...")
    # os.system returns the command's exit status; a non-zero value means wget
    # failed and SPROT_GZ is missing or truncated.
    wget_status = os.system(f"wget -q {SPROT_URL} -O {SPROT_GZ}")
    if wget_status != 0:
        if os.path.exists(SPROT_GZ):
            os.remove(SPROT_GZ)
        raise RuntimeError(f"wget failed with status {wget_status} for {SPROT_URL}")

    print("ƒêang gi·∫£i n√©n...")
    # Decompress to a temporary name and rename at the end, so an interrupted
    # run never leaves a partial FASTA that a later run would treat as complete.
    sprot_fasta_tmp = f"{SPROT_FASTA}.part"
    with gzip.open(SPROT_GZ, 'rb') as f_in:
        with open(sprot_fasta_tmp, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.replace(sprot_fasta_tmp, SPROT_FASTA)
    print("ƒê√£ t·∫£i xong file Fasta!")
else:
    print("File Swiss-Prot ƒë√£ c√≥ s·∫µn.")

# --- DE-DUPLICATION (IMPORTANT) ---
# We do not want to re-run embedding for the 142k proteins that are already in
# Kaggle Train, so they are filtered out here.

# Load the Kaggle Train IDs
print("ƒêang load danh s√°ch ID c·ªßa Kaggle Train...")
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only use on trusted files.
train_ids = np.load("/workspace/data/Embeddings/train_650M_ids.npy", allow_pickle=True)
kaggle_ids_set = set(train_ids)

# New FASTA file that holds only the proteins NOT present in Kaggle Train
NEW_FASTA = f"{DATA_DIR}/new_sprot_sequences.fasta"
new_ids = []

print(f"ƒêang l·ªçc protein m·ªõi v√† ghi v√†o {NEW_FASTA}...")
with open(NEW_FASTA, "w") as f_out:
    for record in tqdm(SeqIO.parse(SPROT_FASTA, "fasta")):
        # UniProt header looks like >sp|Q8K9I1|...; the accession is the second
        # "|"-separated field. IDs without "|" are used as-is.
        header_fields = str(record.id).split("|")
        pid = header_fields[1] if len(header_fields) > 1 else header_fields[0]

        # Already part of Kaggle Train -> nothing to do for this record
        if pid in kaggle_ids_set:
            continue

        SeqIO.write(record, f_out, "fasta")
        new_ids.append(pid)

# Every kept record contributed exactly one ID
count = len(new_ids)

print(f"ƒê√£ l·ªçc xong! T√¨m th·∫•y {count} protein m·ªõi (Ngo√†i 142k c√°i c≈©).")
print(f"File n√†y s·∫Ω ƒë∆∞·ª£c d√πng ƒë·ªÉ ch·∫°y Embedding.")

ƒêang t·∫£i Swiss-Prot t·ª´ UniProt FTP...
ƒêang gi·∫£i n√©n...
ƒê√£ t·∫£i xong file Fasta!
ƒêang load danh s√°ch ID c·ªßa Kaggle Train...
ƒêang l·ªçc protein m·ªõi v√† ghi v√†o /workspace/data/External/new_sprot_sequences.fasta...


573661it [00:02, 245402.44it/s]

ƒê√£ l·ªçc xong! T√¨m th·∫•y 491257 protein m·ªõi (Ngo√†i 142k c√°i c≈©).
File n√†y s·∫Ω ƒë∆∞·ª£c d√πng ƒë·ªÉ ch·∫°y Embedding.





## Embed the new Swiss-Prot sequences in chunks (ESM-2 650M)

In [2]:
import os
import gc
import torch
import numpy as np
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# --- CONFIGURATION ---
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"
FASTA_PATH = "/workspace/data/External/new_sprot_sequences.fasta"  # the freshly created FASTA file
SAVE_DIR = "/workspace/data/External/embeddings_chunks"            # kept in its own folder

CHUNK_SIZE = 5000   # number of sequences per saved chunk
BATCH_SIZE = 8      # number of sequences per forward pass

device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(SAVE_DIR, exist_ok=True)

print(f"ƒêang load model {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the weights, move them to the selected device and switch to eval mode
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

# Processing function (same embedding logic as the previous version)
def process_and_save(seqs, ids, part_idx):
    """Embed one chunk of sequences and persist the embeddings and their IDs.

    Sequences are tokenized in batches of ``BATCH_SIZE`` (padded, truncated to
    at most 1024 tokens), passed through ``model`` without gradients, and the
    last hidden state is averaged over the sequence axis, weighted by the
    attention mask.

    The results are written to ``{SAVE_DIR}/sprot_ids_{part_idx}.npy`` and
    ``{SAVE_DIR}/sprot_part_{part_idx}.npy``. Both files are written under a
    temporary name and renamed afterwards, and the embeddings file is renamed
    last, so its presence implies the whole chunk was saved completely.

    Args:
        seqs: list of sequence strings to embed.
        ids: identifiers, one per entry of ``seqs``, in the same order.
        part_idx: chunk number used in the output file names.

    Raises:
        ValueError: if ``seqs`` and ``ids`` have different lengths.
    """
    if len(seqs) != len(ids):
        raise ValueError(
            f"chunk {part_idx}: got {len(seqs)} sequences but {len(ids)} ids"
        )
    if len(seqs) == 0:
        # Nothing to embed; np.vstack([]) would raise on an empty list.
        return

    embeddings = []
    for i in range(0, len(seqs), BATCH_SIZE):
        batch_seqs = seqs[i : i + BATCH_SIZE]
        inputs = tokenizer(batch_seqs, return_tensors="pt", padding=True, truncation=True, max_length=1024)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            # Broadcast the attention mask over the hidden dimension so masked
            # positions contribute nothing to the sum.
            mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * mask, 1)
            # Clamp avoids a division by zero for an all-masked row.
            sum_mask = torch.clamp(mask.sum(1), min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask

        embeddings.append(mean_embeddings.cpu().numpy())

    final_emb = np.vstack(embeddings)

    emb_path = f"{SAVE_DIR}/sprot_part_{part_idx}.npy"
    ids_path = f"{SAVE_DIR}/sprot_ids_{part_idx}.npy"
    # Temporary names end in ".npy" because np.save appends that suffix when
    # it is missing; the "tmp_" prefix keeps them out of "sprot_part_*" globs.
    emb_tmp = f"{SAVE_DIR}/tmp_sprot_part_{part_idx}.npy"
    ids_tmp = f"{SAVE_DIR}/tmp_sprot_ids_{part_idx}.npy"

    # IDs first, embeddings last: the embeddings file is the resume marker.
    np.save(ids_tmp, ids)
    os.replace(ids_tmp, ids_path)
    np.save(emb_tmp, final_emb)
    os.replace(emb_tmp, emb_path)

# --- MAIN LOOP ---
def _part_exists(idx):
    """Return True when the embedding file of chunk ``idx`` is already on disk."""
    return os.path.exists(f"{SAVE_DIR}/sprot_part_{idx}.npy")


sequences = []
ids = []
part_counter = 0
# Records consumed from the current chunk while it is being skipped (resume).
# Kept separate from `sequences` so skipped records can never reach the model.
skipped_in_part = 0

print(f"B·∫Øt ƒë·∫ßu Embedding Swiss-Prot (ƒê√¢y l√† b∆∞·ªõc t·ªën th·ªùi gian nh·∫•t)...")
# Count the FASTA headers so the progress bar has the real total instead of a
# hard-coded estimate.
with open(FASTA_PATH) as fasta_handle:
    total_seqs = sum(1 for line in fasta_handle if line.startswith(">"))
pbar = tqdm(total=total_seqs)

# Resume logic: a chunk whose embedding file exists is skipped entirely. The
# check is done once per chunk rather than once per record.
part_done = _part_exists(part_counter)

for record in SeqIO.parse(FASTA_PATH, "fasta"):
    if part_done:
        skipped_in_part += 1
        pbar.update(1)
        if skipped_in_part >= CHUNK_SIZE:
            skipped_in_part = 0
            part_counter += 1
            part_done = _part_exists(part_counter)
        continue

    record_id = str(record.id)
    pid = record_id.split("|")[1] if "|" in record_id else record_id
    ids.append(pid)
    # Keep at most the first 1022 residues
    sequences.append(str(record.seq)[:1022])
    pbar.update(1)

    if len(sequences) >= CHUNK_SIZE:
        process_and_save(sequences, ids, part_counter)
        part_counter += 1
        part_done = _part_exists(part_counter)
        sequences = []
        ids = []
        gc.collect()

# Final, possibly shorter chunk. When that chunk was skipped because it already
# exists, `sequences` is empty and nothing is recomputed.
if len(sequences) > 0:
    process_and_save(sequences, ids, part_counter)

pbar.close()

# "\S" was an invalid escape sequence; a leading newline was intended.
print("\nSI√äU D·ªÆ LI·ªÜU ƒê√É S·∫¥N S√ÄNG!")

  print("\SI√äU D·ªÆ LI·ªÜU ƒê√É S·∫¥N S√ÄNG!")


ƒêang load model facebook/esm2_t33_650M_UR50D...


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


B·∫Øt ƒë·∫ßu Embedding Swiss-Prot (ƒê√¢y l√† b∆∞·ªõc t·ªën th·ªùi gian nh·∫•t)...


491257it [2:11:48, 58.31it/s]                              

\SI√äU D·ªÆ LI·ªÜU ƒê√É S·∫¥N S√ÄNG!


In [13]:
# --- STEP 3: RUN THE ROLLING (STREAMING) KNN ---
# This cell defines none of glob, tqdm, np, torch, gc, device, TEST_CHUNKS_DIR,
# train_chunks, TOP_K or labels_map itself; in this notebook they are defined
# in the cell labelled In [17], so that cell must have been executed first.
# Test chunk files are ordered by the integer after the last "_" in their name.
test_files = sorted(glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"), 
                    key=lambda x: int(x.split('_')[-1].replace('.npy','')))

# Accumulates the "<test id>\t<term>\t<score>" lines for every test protein
output_lines = []

# Iterate over each test chunk
print("B·∫Øt ƒë·∫ßu qu√©t...")
for test_f in tqdm(test_files, desc="Test Chunks"):
    # 1. Load one test chunk onto the device
    X_test_np = np.load(test_f)
    ids_test = np.load(test_f.replace("test_part_", "test_ids_"), allow_pickle=True)

    # Rows are divided by their L2 norm so the matrix product below yields
    # cosine similarities.
    X_test = torch.from_numpy(X_test_np).to(device)
    norm_test = X_test.norm(p=2, dim=1, keepdim=True)
    X_test = X_test.div(norm_test)

    # Temporary store of candidate neighbours for this test chunk.
    # List of lists: candidates[i] = [(score, pid), (score, pid)...]
    candidates = [[] for _ in range(len(ids_test))]

    # 2. Sweep over every train chunk (streaming)
    for train_chunk in train_chunks:
        # Load this train chunk into RAM, then move it to the device
        X_train_np = np.load(train_chunk["emb"])
        ids_train = np.load(train_chunk["ids"], allow_pickle=True)

        X_train = torch.from_numpy(X_train_np).to(device)
        norm_train = X_train.norm(p=2, dim=1, keepdim=True)
        X_train = X_train.div(norm_train)

        # Similarity of every test row against every train row
        sim_matrix = torch.mm(X_test, X_train.t())

        # Top K within this chunk only; capped at the chunk's row count.
        # (Original note: take a few more in case of duplicates, e.g. K=20.)
        curr_k = min(TOP_K, X_train.size(0))
        topk_vals, topk_idxs = torch.topk(sim_matrix, k=curr_k, dim=1)

        topk_vals = topk_vals.cpu().numpy()
        topk_idxs = topk_idxs.cpu().numpy()

        # Append this chunk's best matches to each test row's candidate list
        for i in range(len(ids_test)):
            for k in range(curr_k):
                score = topk_vals[i, k]
                train_idx = topk_idxs[i, k]
                train_pid = ids_train[train_idx]

                candidates[i].append((score, train_pid))

        # Release device memory as soon as this train chunk is finished
        del X_train, sim_matrix, topk_vals, topk_idxs, X_train_np, ids_train
        torch.cuda.empty_cache()

    # 3. Aggregate the results for this test chunk.
    # Once every train file has been scanned, re-sort and keep the overall Top K.
    for i, test_pid in enumerate(ids_test):
        # Sort by score, descending
        my_candidates = sorted(candidates[i], key=lambda x: x[0], reverse=True)

        # Best TOP_K neighbours across all train chunks
        best_neighbors = my_candidates[:TOP_K]

        # Voting: each labelled neighbour adds its similarity to all of its terms
        term_scores = {}
        for score, neighbor_pid in best_neighbors:
            if neighbor_pid in labels_map:
                for term in labels_map[neighbor_pid]:
                    if term not in term_scores: term_scores[term] = 0.0
                    term_scores[term] += score

        # Emit results: at most 60 highest-scoring terms, score divided by
        # TOP_K, and only those strictly above 0.01
        sorted_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)[:60]
        for term, total_score in sorted_terms:
            final_score = total_score / TOP_K
            if final_score > 0.01:
                output_lines.append(f"{test_pid}\t{term}\t{final_score:.3f}")

    # Drop the test chunk before loading the next one
    del X_test, X_test_np, ids_test, candidates
    gc.collect()

B·∫Øt ƒë·∫ßu qu√©t...



Test Chunks:   0%|          | 0/45 [00:00<?, ?it/s][A
Test Chunks:   2%|‚ñè         | 1/45 [00:05<03:57,  5.40s/it][A
Test Chunks:   4%|‚ñç         | 2/45 [00:10<03:50,  5.35s/it][A
Test Chunks:   7%|‚ñã         | 3/45 [00:16<03:43,  5.32s/it][A
Test Chunks:   9%|‚ñâ         | 4/45 [00:21<03:37,  5.30s/it][A
Test Chunks:  11%|‚ñà         | 5/45 [00:26<03:31,  5.28s/it][A
Test Chunks:  13%|‚ñà‚ñé        | 6/45 [00:31<03:25,  5.26s/it][A
Test Chunks:  16%|‚ñà‚ñå        | 7/45 [00:36<03:19,  5.24s/it][A
Test Chunks:  18%|‚ñà‚ñä        | 8/45 [00:42<03:14,  5.26s/it][A
Test Chunks:  20%|‚ñà‚ñà        | 9/45 [00:47<03:08,  5.23s/it][A
Test Chunks:  22%|‚ñà‚ñà‚ñè       | 10/45 [00:52<03:02,  5.21s/it][A
Test Chunks:  24%|‚ñà‚ñà‚ñç       | 11/45 [00:57<02:56,  5.18s/it][A
Test Chunks:  27%|‚ñà‚ñà‚ñã       | 12/45 [01:02<02:49,  5.14s/it][A
Test Chunks:  29%|‚ñà‚ñà‚ñâ       | 13/45 [01:07<02:43,  5.10s/it][A
Test Chunks:  31%|‚ñà‚ñà‚ñà       | 14/45 [01:12<02:37,  5.07s/it][A
T

In [17]:
import torch
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
import glob
import os
import networkx
import obonet

# ================= PATH CONFIGURATION =================
# 1. Train data (Kaggle + External)
KAGGLE_TRAIN_EMB = "/workspace/data/Embeddings/train_650M.npy"
KAGGLE_TRAIN_IDS = "/workspace/data/Embeddings/train_650M_ids.npy"
KAGGLE_TERMS     = "/workspace/data/Train/train_terms.tsv"

EXT_CHUNKS_DIR   = "/workspace/data/External/embeddings_chunks"
SPROT_TERMS_FILE = "/workspace/data/External/uniprot_sprot_terms.tsv"

# 2. Test data
TEST_CHUNKS_DIR  = "/workspace/data/Embeddings/embeddings_chunks"

# 3. Ontology file (OBO): prefer the copy in the Train folder when it exists,
#    otherwise fall back to the data root.
_OBO_IN_TRAIN_DIR = "/workspace/data/Train/go-basic.obo"
OBO_PATH = _OBO_IN_TRAIN_DIR if os.path.exists(_OBO_IN_TRAIN_DIR) else "/workspace/data/go-basic.obo"

# 4. Final output file
FINAL_OUTPUT = "submission_level10_complete.tsv"

# 5. Parameters
TOP_K = 15
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"‚öôÔ∏è Thi·∫øt b·ªã: {device}")
print(f"üìÇ Output s·∫Ω l∆∞u t·∫°i: {FINAL_OUTPUT}")

# ================= PART 1: LOAD LABELS =================
print("\n[1/4] ‚è≥ ƒêang load b·∫£n ƒë·ªì nh√£n (Label Map)...")
# labels_map: protein ID -> set of GO term strings
labels_map = {}

# A. Load Kaggle labels
print("   -> Loading Kaggle Labels...")
df_k = pd.read_csv(KAGGLE_TERMS, sep="\t", usecols=["EntryID", "term"])
# Walk the two columns directly instead of materialising one DataFrame per
# protein through groupby; the resulting sets are the same.
for pid, term in zip(df_k["EntryID"], df_k["term"]):
    labels_map.setdefault(pid, set()).add(term)
del df_k
gc.collect()

# B. Load Swiss-Prot labels (streamed in chunks to bound memory use)
print("   -> Parsing Swiss-Prot Labels...")
if os.path.exists(SPROT_TERMS_FILE):
    chunksize = 200000
    for chunk in pd.read_csv(SPROT_TERMS_FILE, sep="\t", chunksize=chunksize):
        # First column: protein ID; second column: ";"-separated term list.
        # Column-wise iteration replaces DataFrame.iterrows(), which builds a
        # Series per row.
        for pid, go_value in zip(chunk.iloc[:, 0], chunk.iloc[:, 1]):
            go_string = str(go_value)
            if go_string == "nan":
                continue  # missing annotation
            terms = {t.strip() for t in go_string.split(";") if t.strip().startswith("GO:")}
            if terms:
                # NOTE(review): this replaces any Kaggle labels already stored
                # for the same ID instead of merging with them - confirm intended.
                labels_map[pid] = terms
else:
    print("‚ö†Ô∏è C·∫£nh b√°o: Kh√¥ng th·∫•y file nh√£n Swiss-Prot!")

print(f"‚úÖ ƒê√£ load nh√£n cho {len(labels_map)} protein.")

# ================= PART 2: PREPARE THE FILE LISTS =================
print("\n[2/4] ‚è≥ Chu·∫©n b·ªã danh s√°ch file Train/Test...")


def _test_chunk_index(path):
    """Return the integer that follows the last "_" in a chunk file name."""
    return int(path.split('_')[-1].replace('.npy',''))


# Each entry pairs an embedding file with the file holding its IDs
train_chunks = []

# Original Kaggle Train embeddings go first (when present)
if os.path.exists(KAGGLE_TRAIN_EMB):
    train_chunks.append({"emb": KAGGLE_TRAIN_EMB, "ids": KAGGLE_TRAIN_IDS})

# External (Swiss-Prot) chunks follow, in sorted file-name order
sprot_files = sorted(glob.glob(f"{EXT_CHUNKS_DIR}/sprot_part_*.npy"))
train_chunks.extend(
    {"emb": f, "ids": f.replace("sprot_part_", "sprot_ids_")}
    for f in sprot_files
)

print(f"   -> T·ªïng c·ªông: {len(train_chunks)} m·∫£nh d·ªØ li·ªáu Train.")

# Test chunk files, ordered by their numeric index
test_files = sorted(
    glob.glob(f"{TEST_CHUNKS_DIR}/test_part_*.npy"),
    key=_test_chunk_index,
)
if not test_files:
    raise FileNotFoundError(f"Kh√¥ng th·∫•y file Test n√†o trong {TEST_CHUNKS_DIR}")

# ================= PART 3: STREAMING KNN =================
print("\n[3/4] üöÄ B·∫Øt ƒë·∫ßu qu√©t KNN (Qu√° tr√¨nh n√†y m·∫•t kho·∫£ng 30-45 ph√∫t)...")

# Raw results: {test protein ID: {GO term: score}}
raw_predictions = {}

for test_f in tqdm(test_files, desc="Processing Test Chunks"):
    # 1. Load one test chunk; rows are divided by their L2 norm so the matrix
    #    product below yields cosine similarities.
    X_test_np = np.load(test_f)
    ids_test = np.load(test_f.replace("test_part_", "test_ids_"), allow_pickle=True)

    X_test = torch.from_numpy(X_test_np).to(device)
    norm_test = X_test.norm(p=2, dim=1, keepdim=True)
    X_test = X_test.div(norm_test)

    # Candidate neighbours per test row: candidates[i] = [(score, pid), ...]
    candidates = [[] for _ in range(len(ids_test))]

    # 2. Sweep over ALL train chunks
    for train_chunk in train_chunks:
        try:
            X_train_np = np.load(train_chunk["emb"])
            ids_train = np.load(train_chunk["ids"], allow_pickle=True)
        except Exception as err:
            # Still best-effort (an unreadable chunk is skipped), but the skip
            # is reported, and KeyboardInterrupt/SystemExit are no longer
            # swallowed the way the previous bare `except:` did.
            tqdm.write(f"WARNING: skipping train chunk {train_chunk['emb']}: {err!r}")
            continue

        if len(ids_train) != X_train_np.shape[0]:
            # Misaligned files would attach labels to the wrong neighbours.
            tqdm.write(
                f"WARNING: skipping train chunk {train_chunk['emb']}: "
                f"{X_train_np.shape[0]} embeddings vs {len(ids_train)} ids"
            )
            continue

        X_train = torch.from_numpy(X_train_np).to(device)
        norm_train = X_train.norm(p=2, dim=1, keepdim=True)
        X_train = X_train.div(norm_train)

        # Similarity of every test row against every train row
        sim_matrix = torch.mm(X_test, X_train.t())

        # Local Top K for this chunk, capped at the chunk's row count
        curr_k = min(TOP_K, X_train.size(0))
        topk_vals, topk_idxs = torch.topk(sim_matrix, k=curr_k, dim=1)

        topk_vals = topk_vals.cpu().numpy()
        topk_idxs = topk_idxs.cpu().numpy()

        # Look up all neighbour IDs with one fancy-index, then extend each
        # row's list; element order matches the former element-wise loops.
        neighbor_ids = ids_train[topk_idxs]
        for i in range(len(ids_test)):
            candidates[i].extend(zip(topk_vals[i], neighbor_ids[i]))

        # Release device memory as soon as this train chunk is finished
        del X_train, sim_matrix, topk_vals, topk_idxs, neighbor_ids, X_train_np, ids_train
        torch.cuda.empty_cache()

    # 3. Aggregate and vote for this test chunk
    for i, test_pid in enumerate(ids_test):
        # Sort to obtain the overall Top K across all train chunks
        best_neighbors = sorted(candidates[i], key=lambda x: x[0], reverse=True)[:TOP_K]

        # Each labelled neighbour adds its similarity to all of its terms
        term_scores = {}
        for score, neighbor_pid in best_neighbors:
            if neighbor_pid in labels_map:
                for term in labels_map[neighbor_pid]:
                    if term not in term_scores: term_scores[term] = 0.0
                    term_scores[term] += score

        # Normalise the scores (divide by K)
        final_term_scores = {t: s/TOP_K for t, s in term_scores.items()}
        raw_predictions[test_pid] = final_term_scores

    del X_test, X_test_np, ids_test, candidates
    gc.collect()

print(f"‚úÖ KNN Xong! ƒê√£ d·ª± ƒëo√°n cho {len(raw_predictions)} protein.")

# ================= PART 4: PROPAGATION & WRITING THE FILE =================
print("\n[4/4] üöÄ ƒêang Lan truy·ªÅn ƒëi·ªÉm (Propagation) & Ghi File...")

# Load the OBO file, downloading it into the working directory when missing
if not os.path.exists(OBO_PATH):
    print("‚ö†Ô∏è ƒêang t·∫£i OBO file...")
    OBO_PATH = "go-basic.obo"
    # os.system returns the command's exit status; without this check a failed
    # download would only surface later as a confusing parse error.
    obo_status = os.system("wget -q http://purl.obolibrary.org/obo/go/go-basic.obo -O go-basic.obo")
    if obo_status != 0:
        if os.path.exists(OBO_PATH):
            os.remove(OBO_PATH)  # do not leave a truncated file behind
        raise RuntimeError(f"wget failed with status {obo_status} while fetching go-basic.obo")

graph = obonet.read_obo(OBO_PATH)

# ancestors_map: term -> set of nodes reachable from it in the graph.
# obonet builds edges from a term to the terms it refers to (child -> parent),
# so networkx.descendants() yields the term's ancestors in the ontology.
ancestors_map = {}
for node in tqdm(graph.nodes(), desc="Mapping Ancestors"):
    try:
        ancestors_map[node] = networkx.descendants(graph, node)
    except networkx.NetworkXError:
        # Raised when the node is not in the graph; such a term is simply left
        # out of the map. Any other error now propagates instead of being hidden.
        continue

output_lines = []

# Walk over every protein that received predictions
for pid, term_scores in tqdm(raw_predictions.items(), desc="Propagating"):
    new_scores = dict(term_scores)

    # Propagation: every ancestor of a predicted term receives at least that
    # term's score (the maximum over all of its predicted descendants).
    for term, score in term_scores.items():
        for parent in ancestors_map.get(term, ()):
            if score > new_scores.get(parent, 0.0):
                new_scores[parent] = score

    # Keep the file small: terms ranked by score, only those strictly above
    # 0.01, and at most 75 per protein.
    ranked_terms = sorted(new_scores.items(), key=lambda x: x[1], reverse=True)
    kept_terms = [(term, score) for term, score in ranked_terms if score > 0.01][:75]
    output_lines.extend(f"{pid}\t{term}\t{score:.3f}" for term, score in kept_terms)

# Write the file (lines joined by "\n", no trailing newline)
print(f"üíæ ƒêang l∆∞u xu·ªëng {FINAL_OUTPUT}...")
with open(FINAL_OUTPUT, "w") as f:
    f.write("\n".join(output_lines))

print("\n" + "="*40)
print(f"üéâ HO√ÄN T·∫§T! File '{FINAL_OUTPUT}' ƒë√£ s·∫µn s√†ng.")
print("üëâ H√£y n·ªôp file n√†y. ƒê√¢y l√† k·∫øt qu·∫£ t·ªëi ∆∞u nh·∫•t!")

‚öôÔ∏è Thi·∫øt b·ªã: cuda
üìÇ Output s·∫Ω l∆∞u t·∫°i: submission_level10_complete.tsv

[1/4] ‚è≥ ƒêang load b·∫£n ƒë·ªì nh√£n (Label Map)...
   -> Loading Kaggle Labels...
   -> Parsing Swiss-Prot Labels...
‚úÖ ƒê√£ load nh√£n cho 553381 protein.

[2/4] ‚è≥ Chu·∫©n b·ªã danh s√°ch file Train/Test...
   -> T·ªïng c·ªông: 100 m·∫£nh d·ªØ li·ªáu Train.

[3/4] üöÄ B·∫Øt ƒë·∫ßu qu√©t KNN (Qu√° tr√¨nh n√†y m·∫•t kho·∫£ng 30-45 ph√∫t)...



Processing Test Chunks:   0%|          | 0/45 [00:00<?, ?it/s][A
Processing Test Chunks:   2%|‚ñè         | 1/45 [00:04<03:39,  5.00s/it][A
Processing Test Chunks:   4%|‚ñç         | 2/45 [00:09<03:31,  4.92s/it][A
Processing Test Chunks:   7%|‚ñã         | 3/45 [00:14<03:25,  4.89s/it][A
Processing Test Chunks:   9%|‚ñâ         | 4/45 [00:19<03:20,  4.89s/it][A
Processing Test Chunks:  11%|‚ñà         | 5/45 [00:24<03:14,  4.86s/it][A
Processing Test Chunks:  13%|‚ñà‚ñé        | 6/45 [00:29<03:09,  4.85s/it][A
Processing Test Chunks:  16%|‚ñà‚ñå        | 7/45 [00:34<03:04,  4.84s/it][A
Processing Test Chunks:  18%|‚ñà‚ñä        | 8/45 [00:38<02:58,  4.84s/it][A
Processing Test Chunks:  20%|‚ñà‚ñà        | 9/45 [00:43<02:54,  4.84s/it][A
Processing Test Chunks:  22%|‚ñà‚ñà‚ñè       | 10/45 [00:48<02:48,  4.83s/it][A
Processing Test Chunks:  24%|‚ñà‚ñà‚ñç       | 11/45 [00:53<02:43,  4.82s/it][A
Processing Test Chunks:  27%|‚ñà‚ñà‚ñã       | 12/45 [00:58<02:38,  4.81s/it][

‚úÖ KNN Xong! ƒê√£ d·ª± ƒëo√°n cho 224309 protein.

[4/4] üöÄ ƒêang Lan truy·ªÅn ƒëi·ªÉm (Propagation) & Ghi File...



Mapping Ancestors:   0%|          | 0/40122 [00:00<?, ?it/s][A
Mapping Ancestors:  28%|‚ñà‚ñà‚ñä       | 11368/40122 [00:00<00:01, 20333.28it/s][A
Mapping Ancestors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40122/40122 [00:00<00:00, 53198.80it/s][A

Propagating:   0%|          | 0/224309 [00:00<?, ?it/s][A
Propagating:   0%|          | 559/224309 [00:00<00:40, 5588.47it/s][A
Propagating:   1%|          | 1158/224309 [00:00<00:38, 5822.55it/s][A
Propagating:   1%|          | 1763/224309 [00:00<00:37, 5920.15it/s][A
Propagating:   1%|          | 2378/224309 [00:00<00:36, 6010.01it/s][A
Propagating:   1%|‚ñè         | 2989/224309 [00:00<00:36, 6044.65it/s][A
Propagating:   2%|‚ñè         | 3594/224309 [00:00<00:36, 5993.30it/s][A
Propagating:   2%|‚ñè         | 4194/224309 [00:00<00:37, 5921.83it/s][A
Propagating:   2%|‚ñè         | 4827/224309 [00:00<00:36, 6049.77it/s][A
Propagating:   2%|‚ñè         | 5461/224309 [00:00<00:35, 6135.53it/s][A
Propagating:   3%|‚ñé         | 60

üíæ ƒêang l∆∞u xu·ªëng submission_level10_complete.tsv...

üéâ HO√ÄN T·∫§T! File 'submission_level10_complete.tsv' ƒë√£ s·∫µn s√†ng.
üëâ H√£y n·ªôp file n√†y. ƒê√¢y l√† k·∫øt qu·∫£ t·ªëi ∆∞u nh·∫•t!


In [18]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# --- CONFIGURATION ---
# 1. Best prediction file available so far (the Level 10 file with
#    propagation; the original note records 0.249 for it)
INPUT_FILE = "submission_level10_complete.tsv"

# 2. Official weight file from the competition organisers
IA_FILE = "/workspace/data/IA.tsv"

# 3. Output file
OUTPUT_FILE = "submission_level11_IA_official.tsv"

# --- STEP 1: LOAD IA.TSV ---
print(f"üìñ ƒêang ƒë·ªçc b·∫£ng tr·ªçng s·ªë chu·∫©n t·ª´ {IA_FILE}...")

# IA.tsv is read without a header row: column 1 is the term, column 2 its IA
# score. When the file is not at the data root, fall back to the Train folder.
if not os.path.exists(IA_FILE):
    IA_FILE = "/workspace/data/Train/IA.tsv"

try:
    df_ia = pd.read_csv(IA_FILE, sep="\t", names=["term", "ia_score"])
except FileNotFoundError:
    print("‚ùå L·ªñI: Kh√¥ng t√¨m th·∫•y file IA.tsv! B·∫°n check l·∫°i xem n√≥ n·∫±m ·ªü folder data g·ªëc hay data/Train?")
    raise

# Plain dict for fast lookups, e.g. {GO:0001 : 5.23}
ia_weights = dict(zip(df_ia["term"].tolist(), df_ia["ia_score"].tolist()))
print(f"‚úÖ ƒê√£ load tr·ªçng s·ªë cho {len(ia_weights)} nh√£n.")

# --- STEP 2: BOOST THE SCORES USING THE OFFICIAL WEIGHTS ---
print(f"üöÄ ƒêang boost ƒëi·ªÉm cho {INPUT_FILE}...")
output_lines = []

BOOST_FACTOR = 0.02   # boost coefficient (a gentle value to start with)
THRESHOLD = 0.015     # boosted scores must be strictly above this to be kept

with open(INPUT_FILE) as f:
    for line in tqdm(f):
        fields = line.strip().split('\t')
        if len(fields) >= 3:
            pid, term = fields[0], fields[1]
            score = float(fields[2])

            # Terms missing from the IA table get weight 0, i.e. no boost
            weight = ia_weights.get(term, 0.0)

            # Boost formula: score * (1 + factor * IA weight), capped at 1.0.
            # The higher a term's IA weight, the larger the multiplier.
            new_score = min(score * (1 + BOOST_FACTOR * weight), 1.0)

            # Threshold filter
            if new_score > THRESHOLD:
                output_lines.append(f"{pid}\t{term}\t{new_score:.3f}")

# --- STEP 3: SAVE THE FILE ---
print(f"üíæ ƒêang l∆∞u {OUTPUT_FILE}...")
with open(OUTPUT_FILE, "w") as f:
    f.write("\n".join(output_lines))

print("üéâ XONG! ƒê√¢y m·ªõi l√† c√°ch d√πng tr·ªçng s·ªë 'chu·∫©n b√†i' nh·∫•t.")

üìñ ƒêang ƒë·ªçc b·∫£ng tr·ªçng s·ªë chu·∫©n t·ª´ /workspace/data/IA.tsv...
‚úÖ ƒê√£ load tr·ªçng s·ªë cho 40122 nh√£n.
üöÄ ƒêang boost ƒëi·ªÉm cho submission_level10_complete.tsv...



0it [00:00, ?it/s][A
112935it [00:00, 1129332.23it/s][A
230673it [00:00, 1157587.97it/s][A
346733it [00:00, 1158939.85it/s][A
462627it [00:00, 1153247.02it/s][A
583442it [00:00, 1172976.30it/s][A
700749it [00:00, 1151758.40it/s][A
828186it [00:00, 1191292.92it/s][A
967163it [00:00, 1253884.94it/s][A
1099450it [00:00, 1275307.24it/s][A
1236092it [00:01, 1303306.42it/s][A
1367548it [00:01, 1306737.42it/s][A
1498446it [00:01, 1307414.75it/s][A
1629229it [00:01, 1307453.99it/s][A
1767649it [00:01, 1330606.01it/s][A
1903456it [00:01, 1338874.65it/s][A
2037360it [00:01, 1329407.69it/s][A
2170328it [00:01, 1325677.06it/s][A
2303024it [00:01, 1326055.06it/s][A
2436644it [00:01, 1329086.30it/s][A
2570810it [00:02, 1332847.31it/s][A
2704104it [00:02, 1328665.78it/s][A
2836979it [00:02, 1327720.12it/s][A
2969757it [00:02, 1327566.07it/s][A
3104234it [00:02, 1332710.52it/s][A
3243809it [00:02, 1351590.08it/s][A
3383844it [00:02, 1366201.89it/s][A
3520470it [00:02, 13538

üíæ ƒêang l∆∞u submission_level11_IA_official.tsv...
üéâ XONG! ƒê√¢y m·ªõi l√† c√°ch d√πng tr·ªçng s·ªë 'chu·∫©n b√†i' nh·∫•t.
