In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer
import gc
import time
import zipfile
from tqdm.auto import tqdm # Thêm thư viện này để đếm thời gian

# Announce the run, then start the wall-clock timer read by the final report.
print("--- KHỞI CHẠY: ML (K-MER) + NAIVE ENSEMBLE (CÓ BỘ ĐẾM THỜI GIAN) ---", flush=True)
start_time = time.time()

# --- 1. CONFIGURATION ---
# All competition inputs live under one Kaggle dataset directory.
_CAFA_ROOT = '/kaggle/input/cafa-6-protein-function-prediction'
TRAIN_SEQ = _CAFA_ROOT + '/Train/train_sequences.fasta'
TRAIN_TERMS = _CAFA_ROOT + '/Train/train_terms.tsv'
TEST_SEQ = _CAFA_ROOT + '/Test/testsuperset.fasta'

# Upper bound on how many of the most frequent terms are kept as labels.
NUM_TERMS = 500
# Amino-acid alphabet; not referenced by the rest of the visible script.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

# --- 2. HÀM XỬ LÝ ID & K-MERS ---
def clean_id(header):
    """Extract a sequence identifier from a FASTA header line.

    Every '>' character is removed and surrounding whitespace is trimmed.
    If the remaining text contains a pipe, the field between the first and
    second '|' is returned (``sp|P12345|NAME`` -> ``P12345``); otherwise the
    first whitespace-delimited token is returned.
    """
    text = header.replace('>', '').strip()
    _, pipe, after_pipe = text.partition('|')
    if pipe:
        # Second pipe-delimited field: everything up to the next '|', if any.
        return after_pipe.split('|', 1)[0]
    return text.split()[0]

def get_kmers(sequence, k=2):
    """Return every overlapping window of length *k* from *sequence*, in order.

    A sequence shorter than *k* yields an empty list.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

def _append_kmer_record(ids, seqs, record_id, chunks):
    """Store one FASTA record as (id, space-joined k-mer string)."""
    ids.append(record_id)
    seqs.append(" ".join(get_kmers("".join(chunks))))


def _find_fallback_fasta(fasta_path, search_root='/kaggle/input'):
    """Locate a stand-in for a missing FASTA file under *search_root*.

    A file with the same base name as *fasta_path* always wins. The looser
    heuristic (first '.fa'/'.fasta' file whose name contains 'test') is only
    used when the requested base name itself contains 'test', so a missing
    training file is never silently replaced by test sequences.

    Returns the replacement path, or None when nothing suitable exists.
    """
    wanted = os.path.basename(fasta_path)
    allow_test_guess = 'test' in wanted.lower()
    test_guess = None
    for root, _, files in os.walk(search_root):
        for name in files:
            is_exact = name == wanted
            is_test_like = (
                allow_test_guess
                and test_guess is None
                and 'test' in name.lower()
                and name.endswith(('.fa', '.fasta'))
            )
            if not (is_exact or is_test_like):
                continue
            candidate = os.path.join(root, name)
            # Skip dangling links etc. so the retry cannot loop on a path
            # that still cannot be opened.
            if not os.path.isfile(candidate):
                continue
            if is_exact:
                return candidate
            test_guess = candidate
    return test_guess


def load_data_kmer(fasta_path):
    """Read a FASTA file into ids and space-separated k-mer "documents".

    Each record's sequence lines are concatenated, split into overlapping
    k-mers with ``get_kmers`` (its default k), and joined with spaces. Record
    ids come from ``clean_id`` applied to the header line.

    If *fasta_path* does not exist, a replacement is searched for under
    ``/kaggle/input`` (see ``_find_fallback_fasta``) and loaded instead.

    Returns:
        (ids, seqs): two parallel lists; both empty when neither the file
        nor a replacement can be found.
    """
    print(f"-> Đang đọc file: {os.path.basename(fasta_path)}...", flush=True)
    ids = []
    seqs = []

    # Keep the try body to the one call that can raise FileNotFoundError.
    try:
        handle = open(fasta_path, 'r')
    except FileNotFoundError:
        fallback = _find_fallback_fasta(fasta_path)
        if fallback is None:
            return [], []
        return load_data_kmer(fallback)

    with handle:
        cid, cseq = "", []
        # tqdm shows a running line count (the total is unknown up front).
        for line in tqdm(handle, desc="Reading Fasta", unit="lines"):
            line = line.strip()
            if line.startswith(">"):
                # A new header closes the previous record, if there was one.
                if cid:
                    _append_kmer_record(ids, seqs, cid, cseq)
                cid = clean_id(line)
                cseq = []
            else:
                cseq.append(line)
        # Flush the last record, which has no following header.
        if cid:
            _append_kmer_record(ids, seqs, cid, cseq)
    return ids, seqs

# --- 3. PREPARE TRAINING DATA ---
print("\n[1/5] Chuẩn bị dữ liệu Train...", flush=True)
train_ids, train_texts = load_data_kmer(TRAIN_SEQ)
print(f"   Đã load {len(train_ids)} mẫu train.")

# Bag-of-k-mers features: every sequence is a space-separated k-mer string.
print("-> Vector hóa dữ liệu (K-mers)...", flush=True)
vectorizer = CountVectorizer(max_features=800)
X_train = vectorizer.fit_transform(train_texts)

# Only the first two columns are read; the file's first line is skipped.
print("-> Đọc và khớp nhãn...", flush=True)
train_terms = pd.read_csv(
    TRAIN_TERMS,
    sep='\t',
    header=None,
    skiprows=1,
    usecols=[0, 1],
)
train_terms.columns = ["EntryID", "term"]

train_ids_set = set(train_ids)
has_sequence = train_terms['EntryID'].isin(train_ids_set)
# If not a single annotation row matches a loaded id, retry once after
# converting the ids to stripped strings.
if not has_sequence.any():
    print("   Fixing ID in Terms file...")
    train_terms['EntryID'] = train_terms['EntryID'].apply(lambda value: str(value).strip())
    has_sequence = train_terms['EntryID'].isin(train_ids_set)

# Keep only annotations whose protein was loaded from the FASTA file.
train_terms = train_terms[has_sequence]

if train_terms.empty:
    raise ValueError("LỖI: Không khớp ID!")

# The NUM_TERMS most frequent terms become the label set, ordered by frequency.
naive_counts = train_terms['term'].value_counts().head(NUM_TERMS)
top_terms = naive_counts.index.tolist()
term_to_idx = dict(zip(top_terms, range(len(top_terms))))

# Naive baseline: annotation count of each term divided by the number of
# loaded training sequences.
total_train_ids = len(train_ids)
naive_probs = (naive_counts / total_train_ids).to_dict()

print("-> Tạo ma trận Y...", flush=True)
# Rows follow train_ids order, columns follow top_terms order.
Y_train = np.zeros((len(train_ids), NUM_TERMS), dtype=np.int8)
id_map = dict(zip(train_ids, range(len(train_ids))))

grouped = train_terms.groupby('EntryID')['term']
# One pass per protein, with a progress bar.
for pid, terms in tqdm(grouped, desc="Creating Y Matrix"):
    idx_row = id_map.get(pid)
    if idx_row is None:
        continue
    # Terms outside the selected label set are ignored.
    indices = [term_to_idx[t] for t in terms if t in term_to_idx]
    Y_train[idx_row, indices] = 1

# Drop label columns that ended up with no positive example and keep
# top_terms / NUM_TERMS aligned with the remaining columns.
col_sums = Y_train.sum(axis=0)
valid_cols = np.where(col_sums > 0)[0]
Y_train = Y_train[:, valid_cols]
top_terms = [top_terms[col] for col in valid_cols]
NUM_TERMS = len(top_terms)

print(f"-> Train shape: {X_train.shape}, Y shape: {Y_train.shape}")

# --- 4. TRAIN MODEL ---
print("\n[2/5] Huấn luyện Logistic Regression...", flush=True)
# One independent binary logistic-regression model (SGD with log loss) is
# fitted per label column of Y_train.
base_estimator = SGDClassifier(
    loss='log_loss',
    alpha=0.0001,
    random_state=42,
    class_weight='balanced',
)
# n_jobs goes on the wrapper: MultiOutputClassifier parallelises fitting
# across label columns, whereas SGDClassifier's own n_jobs only affects
# one-vs-all multi-class training and does nothing for binary targets.
clf = MultiOutputClassifier(base_estimator, n_jobs=-1)
clf.fit(X_train, Y_train)
print("-> ML đã học xong!")

# --- 5. PREDICT ON TEST ---
print("\n[3/5] Dự đoán Test...", flush=True)
test_ids, test_texts = load_data_kmer(TEST_SEQ)
X_test = vectorizer.transform(test_texts)

batch_size = 5000
# Rows follow test_ids order, columns follow the fitted label columns.
ml_preds_matrix = np.zeros((len(test_ids), NUM_TERMS), dtype=np.float32)

# For each per-label estimator, find which predict_proba column holds the
# positive class (label 1). None means the estimator never saw label 1, in
# which case the prediction stays at 0.
positive_cols = []
for estimator in clf.estimators_:
    hits = np.where(estimator.classes_ == 1)[0]
    positive_cols.append(int(hits[0]) if hits.size else None)

# Predict in batches, with a progress bar.
for i in tqdm(range(0, len(test_ids), batch_size), desc="Predicting Batches"):
    end = min(i + batch_size, len(test_ids))
    # predict_proba returns one (n_samples, n_classes) array per label column.
    p = clf.predict_proba(X_test[i:end])
    for j, prob in enumerate(p):
        col = positive_cols[j]
        if col is not None:
            ml_preds_matrix[i:end, j] = prob[:, col]

# --- 6. ENSEMBLE ---
print("\n[4/5] Kết hợp (Ensemble) và Ghi file...", flush=True)
submission_data = []
TOP_K = 50      # maximum number of terms written per protein
W_ML = 0.4      # weight of the model probability
W_NAIVE = 0.6   # weight of the naive (training-frequency) score

# Loop-invariant pieces, computed once instead of once per protein:
# the weighted naive contribution of every label column ...
naive_parts = [naive_probs.get(term, 0.0) * W_NAIVE for term in top_terms]
# ... and the first ten naive terms with their weighted scores, which are
# always offered as candidates even if the hybrid score missed the threshold.
fallback_terms = [
    (term, n_score * W_NAIVE)
    for term, n_score in list(naive_probs.items())[:10]
]

# Blend the scores per protein, with a progress bar.
for i, pid in tqdm(enumerate(test_ids), total=len(test_ids), desc="Ensembling"):
    final_scores = {}

    for term, score, naive_part in zip(top_terms, ml_preds_matrix[i], naive_parts):
        hybrid = (score * W_ML) + naive_part
        # Scores at or below 0.01 are not kept.
        if hybrid > 0.01:
            final_scores[term] = hybrid

    for term, fallback_score in fallback_terms:
        if term not in final_scores:
            final_scores[term] = fallback_score

    # Keep the TOP_K highest-scoring terms; one TSV line per (protein, term).
    sorted_items = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:TOP_K]
    for term, score in sorted_items:
        submission_data.append(f"{pid}\t{term}\t{score:.4f}\n")

# --- 7. SAVE FILE ---
print("\n[5/5] Lưu file...", flush=True)
tsv_name, zip_name = 'submission.tsv', 'submission.zip'

# Write the already-formatted TSV lines as they are.
with open(tsv_name, 'w') as tsv_file:
    tsv_file.writelines(submission_data)

# Also ship a deflate-compressed archive containing the TSV.
with zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(tsv_name)

# Report elapsed wall-clock time in minutes since start_time.
elapsed_seconds = time.time() - start_time
total_time = elapsed_seconds / 60
print(f"✅ HOÀN TẤT! Tổng thời gian: {total_time:.1f} phút.")

--- KHỞI CHẠY: ML (K-MER) + NAIVE ENSEMBLE (CÓ BỘ ĐẾM THỜI GIAN) ---

[1/5] Chuẩn bị dữ liệu Train...
-> Đang đọc file: train_sequences.fasta...


Reading Fasta: 0lines [00:00, ?lines/s]

   Đã load 82404 mẫu train.
-> Vector hóa dữ liệu (K-mers)...
-> Đọc và khớp nhãn...
-> Tạo ma trận Y...


Creating Y Matrix:   0%|          | 0/82404 [00:00<?, ?it/s]

-> Train shape: (82404, 480), Y shape: (82404, 500)

[2/5] Huấn luyện Logistic Regression...
-> ML đã học xong!

[3/5] Dự đoán Test...
-> Đang đọc file: testsuperset.fasta...


Reading Fasta: 0lines [00:00, ?lines/s]

Predicting Batches:   0%|          | 0/45 [00:00<?, ?it/s]


[4/5] Kết hợp (Ensemble) và Ghi file...


Ensembling:   0%|          | 0/224309 [00:00<?, ?it/s]


[5/5] Lưu file...
✅ HOÀN TẤT! Tổng thời gian: 34.9 phút.
