In [2]:
# Install the CPU build of FAISS into the notebook environment (IPython shell escape).
!pip install faiss-cpu

# Import FAISS and print its version to confirm the install is usable.
import faiss
print(faiss.__version__)

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.7/23.7 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1
1.13.1


In [3]:
import numpy as np
import pandas as pd
import faiss
from tqdm import tqdm
import os
import gc

# ============================================================================
# TECHNICAL CONFIGURATION (STRICT MODE)
# ============================================================================
CONFIG = {
    'EMBED_DIR': "/kaggle/input/cafa6-embeds",  # directory holding train_embeds.npy / test_embeds.npy
    'TRAIN_TERMS': "/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",  # TSV read for its EntryID and term columns
    'TRAIN_IDS': "/kaggle/input/cafa6-embeds/train_ids.txt",  # one ID per line, indexed by FAISS neighbour row
    'TEST_IDS': "/kaggle/input/cafa6-embeds/test_ids.txt",  # one ID per line, indexed by test embedding row
    
    'K_NEIGHBORS': 10,       # small K: number of neighbours requested per query
    'HOMOLOGY_THRESHOLD': 0.85,  # minimum similarity for a neighbour to cast votes
    'MIN_TERM_VOTES': 2,  # a term needs at least this many voting neighbours
    
    'BATCH_SIZE': 1000,  # number of test rows sent to index.search at a time
}

def _read_id_list(path, expected_rows, what):
    """Read one ID per line from ``path`` and verify it lines up with the embeddings.

    Args:
        path: text file with one identifier per line.
        expected_rows: number of rows in the matching embedding matrix.
        what: short label ("train"/"test") used in the error message.

    Returns:
        A numpy array of stripped ID strings with exactly ``expected_rows`` entries.

    Raises:
        ValueError: if the number of IDs differs from ``expected_rows``; row ``i`` of the
            embeddings is looked up as ``ids[i]``, so a mismatch would silently mislabel.
    """
    with open(path) as f:
        ids = [line.strip() for line in f]
    # Tolerate blank lines at the very end of the file only.
    while ids and not ids[-1]:
        ids.pop()
    if len(ids) != expected_rows:
        raise ValueError(
            f"{what} IDs ({len(ids)} in {path}) do not match "
            f"{what} embeddings ({expected_rows} rows)"
        )
    return np.array(ids)


def _consensus_terms(sims, neighbor_rows, train_ids, labels_dict,
                     threshold, min_votes, max_terms):
    """Score the GO terms voted for by one query's sufficiently similar neighbours.

    Args:
        sims: similarities of the neighbours, in the order returned by the index
            (the loop stops at the first value below ``threshold``).
        neighbor_rows: row indices of those neighbours in the training set.
        train_ids: array mapping training row -> protein ID.
        labels_dict: mapping protein ID -> list of terms.
        threshold: minimum similarity for a neighbour to vote, and minimum
            average similarity for a term to be kept.
        min_votes: minimum number of voting neighbours per kept term.
        max_terms: maximum number of terms returned.

    Returns:
        List of ``(term, average_similarity)`` sorted by score descending,
        truncated to ``max_terms``. Empty when nothing qualifies.
    """
    term_scores = {}  # sum of similarities per term
    term_votes = {}   # number of voting neighbours per term

    for sim, row in zip(sims, neighbor_rows):
        sim = float(sim)
        if sim < threshold:
            break
        if row < 0:
            # FAISS pads missing results with -1; never index train_ids with it.
            continue
        for t in labels_dict.get(train_ids[row], []):
            term_scores[t] = term_scores.get(t, 0.0) + sim
            term_votes[t] = term_votes.get(t, 0) + 1

    valid_terms = []
    for t, raw_sum in term_scores.items():
        votes = term_votes[t]
        # Keep only terms backed by enough neighbours.
        if votes < min_votes:
            continue
        # Average similarity of the voters, e.g. (0.90, 0.88) -> 0.89.
        avg_score = raw_sum / votes
        # Final floor, tied to the configured threshold rather than a literal.
        if avg_score >= threshold:
            valid_terms.append((t, avg_score))

    # Stable sort: ties keep first-seen order.
    valid_terms.sort(key=lambda x: x[1], reverse=True)
    return valid_terms[:max_terms]


def run_consensus_knn_miner():
    """Mine high-confidence GO term candidates for test proteins via consensus KNN.

    Reads the paths and parameters from the module-level ``CONFIG``:
    embeddings are L2-normalised and searched with an inner-product FAISS index,
    and for every test protein the terms shared by at least ``MIN_TERM_VOTES``
    neighbours with similarity >= ``HOMOLOGY_THRESHOLD`` are written to
    ``knn_homology_candidates.tsv`` as ``pid<TAB>term<TAB>score`` lines
    (score = average similarity of the voters, 4 decimals). At most
    ``CONFIG.get('MAX_TERMS_PER_PROTEIN', 50)`` terms are written per protein.

    Raises:
        ValueError: if the configuration can never produce a result
            (``K_NEIGHBORS``/``BATCH_SIZE`` < 1, ``MIN_TERM_VOTES`` > ``K_NEIGHBORS``)
            or if an ID file does not match its embedding matrix.
    """
    print("üöÄ STARTING CONSENSUS HOMOLOGY MINER (FAISS CPU)...")

    k_neighbors = CONFIG['K_NEIGHBORS']
    threshold = CONFIG['HOMOLOGY_THRESHOLD']
    min_votes = CONFIG['MIN_TERM_VOTES']
    batch_size = CONFIG['BATCH_SIZE']
    max_terms = CONFIG.get('MAX_TERMS_PER_PROTEIN', 50)

    if k_neighbors < 1 or batch_size < 1:
        raise ValueError("K_NEIGHBORS and BATCH_SIZE must both be >= 1")
    if min_votes > k_neighbors:
        raise ValueError(
            f"MIN_TERM_VOTES ({min_votes}) can never be reached with "
            f"K_NEIGHBORS ({k_neighbors})"
        )

    # 1. Load data & embeddings
    print("   1. Loading Resources...")
    df = pd.read_csv(CONFIG['TRAIN_TERMS'], sep='\t', usecols=['EntryID', 'term'])
    labels_dict = df.groupby('EntryID')['term'].apply(list).to_dict()
    del df; gc.collect()

    X_train = np.load(os.path.join(CONFIG['EMBED_DIR'], "train_embeds.npy")).astype('float32')
    X_test = np.load(os.path.join(CONFIG['EMBED_DIR'], "test_embeds.npy")).astype('float32')

    train_ids = _read_id_list(CONFIG['TRAIN_IDS'], X_train.shape[0], "train")
    test_ids = _read_id_list(CONFIG['TEST_IDS'], X_test.shape[0], "test")

    # Unit-length vectors make the inner product equal to cosine similarity.
    faiss.normalize_L2(X_train)
    faiss.normalize_L2(X_test)

    # 2. Build index
    print("   2. Building FAISS Index...")
    index = faiss.IndexFlatIP(X_train.shape[1])
    index.add(X_train)
    del X_train; gc.collect()

    # 3. Search & filter
    print("   3. Mining with Consensus Check...")
    output_file = "knn_homology_candidates.tsv"

    # Column of the min_votes-th neighbour: if it is below the threshold,
    # no term can collect enough votes, so the row is skipped cheaply.
    vote_col = max(min_votes, 1) - 1

    with open(output_file, "w") as f_out:
        for start in tqdm(range(0, X_test.shape[0], batch_size)):
            batch_test = X_test[start : start + batch_size]
            D, I = index.search(batch_test, k_neighbors)

            batch_lines = []
            for j in range(len(batch_test)):
                if D[j, vote_col] < threshold:
                    continue

                pid = test_ids[start + j]
                scored = _consensus_terms(
                    D[j], I[j], train_ids, labels_dict,
                    threshold, min_votes, max_terms,
                )
                for term, score in scored:
                    batch_lines.append(f"{pid}\t{term}\t{score:.4f}\n")

            f_out.write("".join(batch_lines))

    print(f"‚úÖ DONE! Consensus Candidates saved to {output_file}")

# Run the miner when this cell/script is executed directly.
if __name__ == "__main__":
    run_consensus_knn_miner()

üöÄ STARTING CONSENSUS HOMOLOGY MINER (FAISS CPU)...
   1. Loading Resources...
   2. Building FAISS Index...
   3. Mining with Consensus Check...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 225/225 [07:34<00:00,  2.02s/it]

‚úÖ DONE! Consensus Candidates saved to knn_homology_candidates.tsv





In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# CONFIGURATION
KNN_FILE = "knn_homology_candidates.tsv"  # same file name that run_consensus_knn_miner() writes
IA_FILE = "/kaggle/input/cafa-6-protein-function-prediction/IA.tsv"  # parsed below as tab-separated term / IA value lines

def analyze_knn_candidates():
    """Print a quality report for the KNN candidate file ``KNN_FILE``.

    The file is read as a header-less TSV with columns pid, term, score. Each
    term is joined with its IA weight from ``IA_FILE`` (terms without an IA
    entry get 0), then the function prints: overall counts, the score
    distribution, the distribution of rows over IA bins, and a short verdict
    on score quality, rarity and coverage.

    Returns:
        None. Returns early (after printing a message) when the candidate file
        is missing or contains no rows.
    """
    print(f"üìä ANALYZING {KNN_FILE}...\n")

    empty_warning = "‚ö†Ô∏è Warning: File is EMPTY! (Threshold qu√° g·∫Øt, kh√¥ng t√¨m th·∫•y h√†ng x√≥m n√†o)."
    # Lowest score the miner is expected to emit; used by the verdict below so
    # the check agrees with the "(< 0.85)" text it prints.
    min_expected_score = 0.85

    # 1. Load data
    try:
        df = pd.read_csv(KNN_FILE, sep='\t', names=['pid', 'term', 'score'])
    except FileNotFoundError:
        print("‚ùå Error: File not found!")
        return
    except pd.errors.EmptyDataError:
        print(empty_warning)
        return
    if df.empty or df['pid'].nunique() == 0:
        # Guards the per-protein average below against division by zero.
        print(empty_warning)
        return

    # 2. Load IA
    print("   Loading IA weights...")
    ia_map = {}
    try:
        with open(IA_FILE, 'r') as f:
            for line in f:
                p = line.strip().split('\t')
                if len(p) < 2:
                    continue
                try:
                    ia_map[p[0]] = float(p[1])
                except ValueError:
                    # Skip only the unparsable line (e.g. a header) instead of
                    # silently abandoning the rest of the file.
                    continue
    except (OSError, UnicodeDecodeError) as exc:
        # Best effort: keep going with IA = 0 everywhere, but say so.
        print(f"   WARNING: could not read IA file {IA_FILE}: {exc}")

    # Attach IA to every row; terms without an IA entry count as IA = 0.
    df['ia'] = df['term'].map(ia_map).fillna(0)

    # =========================================================================
    # 3. STATISTICS REPORT (IA broken down by detailed ranges)
    # =========================================================================

    n_prots = df['pid'].nunique()
    n_terms = df['term'].nunique()
    n_rows = len(df)

    print("-" * 40)
    print(f"üîπ T·ªîNG QUAN:")
    print(f"   - T·ªïng s·ªë d√≤ng (Predictions): {n_rows:,}")
    print(f"   - S·ªë Protein ƒë∆∞·ª£c 'C·ª©u' (Covered): {n_prots:,}")
    print(f"   - S·ªë Nh√£n GO xu·∫•t hi·ªán: {n_terms:,}")
    print(f"   - Trung b√¨nh s·ªë nh√£n/protein: {n_rows / n_prots:.1f}")

    print("-" * 40)
    print(f"üîπ PH√ÇN PH·ªêI ƒêI·ªÇM S·ªê (SCORE):")
    print(df['score'].describe().to_string())
    print(f"\n   -> Min Score check: {df['score'].min():.4f} (Ph·∫£i >= 0.85 n·∫øu code ƒë√∫ng)")

    print("-" * 40)
    print("üîπ PH√ÇN B·ªê ƒê·ªò HI·∫æM (IA) THEO KHO·∫¢NG:")

    # IA bins. pd.cut uses right-closed intervals by default, so e.g. an IA of
    # exactly 1 is counted in the "0 -> 1" row and IA == 0 gets its own row.
    bins = [-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1e9]
    labels = [
        "IA == 0",
        "0 ‚Üí 1",
        "1 ‚Üí 2",
        "2 ‚Üí 3",
        "3 ‚Üí 4",
        "4 ‚Üí 5",
        "5 ‚Üí 6",
        "6 ‚Üí 7",
        "7 ‚Üí 8",
        "8 ‚Üí 9",
        "9 ‚Üí 10",
        "> 10"
    ]

    df['ia_bin'] = pd.cut(df['ia'], bins=bins, labels=labels)

    ia_dist = df['ia_bin'].value_counts().sort_index()

    for k in labels:
        count = ia_dist.get(k, 0)
        ratio = count / n_rows * 100
        print(f"   {k:8s} : {count:10,}  |  {ratio:6.2f} %")

    # =========================================================================
    # 4. VERDICT
    # =========================================================================

    print("-" * 40)
    print("üöÄ ƒê√ÅNH GI√Å KH·∫¢ NƒÇNG ENSEMBLE:")

    # Criterion 1: minimum score
    if df['score'].min() < min_expected_score:
        print("‚ùå C·∫¢NH B√ÅO: C√≥ ƒëi·ªÉm s·ªë th·∫•p (< 0.85). Code l·ªçc ch∆∞a chu·∫©n!")
    else:
        print("‚úÖ Score Quality: T·ªët (To√†n b·ªô l√† Homology m·∫°nh).")

    # Criterion 2: share of rows in the tail (IA >= 5) and super-rare (IA >= 10)
    tail = df[df['ia'] >= 5]
    super_rare = df[df['ia'] >= 10]

    tail_ratio = len(tail) / n_rows
    super_ratio = len(super_rare) / n_rows

    print(f"   - TAIL (IA ‚â• 5)      : {len(tail):,}  |  {tail_ratio:.2%}")
    print(f"   - SUPER RARE (IA ‚â•10): {len(super_rare):,}  |  {super_ratio:.2%}")

    if tail_ratio > 0.2:
        print("‚úÖ Rarity: T·ªët. KNN t·∫≠p trung ƒë√∫ng v√πng nh√£n hi·∫øm.")
    else:
        print("‚ö†Ô∏è Rarity: Th·∫•p. KNN v·∫´n thi√™n nhi·ªÅu v·ªÅ head.")

    # Criterion 3: coverage (number of distinct proteins with candidates)
    if n_prots < 500:
        print("‚ö†Ô∏è Coverage: R·∫•t th·∫•p (< 500 proteins). Ng∆∞·ª°ng ƒëang qu√° g·∫Øt.")
    elif n_prots > 50000:
        print("‚ö†Ô∏è Coverage: R·∫•t cao. KNN ƒëang ph·ªß qu√° r·ªông.")
    else:
        print(f"‚úÖ Coverage: H·ª£p l√Ω ({n_prots:,} proteins c√≥ h√†ng x√≥m x·ªãn).")

# Run the candidate-file report when this cell/script is executed directly.
if __name__ == "__main__":
    analyze_knn_candidates()

üìä ANALYZING knn_homology_candidates.tsv...

   Loading IA weights...
----------------------------------------
üîπ T·ªîNG QUAN:
   - T·ªïng s·ªë d√≤ng (Predictions): 2,006,071
   - S·ªë Protein ƒë∆∞·ª£c 'C·ª©u' (Covered): 223,968
   - S·ªë Nh√£n GO xu·∫•t hi·ªán: 14,504
   - Trung b√¨nh s·ªë nh√£n/protein: 9.0
----------------------------------------
üîπ PH√ÇN PH·ªêI ƒêI·ªÇM S·ªê (SCORE):
count    2.006071e+06
mean     9.845593e-01
std      1.455886e-02
min      8.505000e-01
25%      9.797000e-01
50%      9.884000e-01
75%      9.942000e-01
max      1.000000e+00

   -> Min Score check: 0.8505 (Ph·∫£i >= 0.85 n·∫øu code ƒë√∫ng)
----------------------------------------
üîπ PH√ÇN B·ªê ƒê·ªò HI·∫æM (IA) THEO KHO·∫¢NG:
   IA == 0  :    198,370  |    9.89 %
   0 ‚Üí 1    :  1,014,524  |   50.57 %
   1 ‚Üí 2    :    341,029  |   17.00 %
   2 ‚Üí 3    :    167,992  |    8.37 %
   3 ‚Üí 4    :    117,108  |    5.84 %
   4 ‚Üí 5    :     54,426  |    2.71 %
   5 ‚Üí 6    :     45,386  |    2

In [5]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict
from tqdm import tqdm

# =============================================================================
# 1. PATH & PARAMETER CONFIGURATION
# =============================================================================
# NOTE: this rebinds the global CONFIG that the mining cell defined with different
# keys; re-run that cell's CONFIG before calling run_consensus_knn_miner() again.
CONFIG = {
    # Your KNN homology candidate file
    'KNN_FILE': "knn_homology_candidates.tsv",
    
    # Vocabulary files of the two main models
    'VOCAB_C95_PATH': "/kaggle/input/c95-cafa6/vocab_C95_remove.csv", # Assumption: C95 uses the full label set
    'VOCAB_C99_PATH': "/kaggle/input/c99-cafa6/vocab_C99_remove.csv", # Your C99 vocabulary
    
    # IA file
    'IA_FILE_PATH': "/kaggle/input/cafa-6-protein-function-prediction/IA.tsv", 
}

# =============================================================================
# 2. DATA-PROCESSING FUNCTIONS
# =============================================================================

def load_terms_set(path, term_col='term'):
    """Load a vocabulary / term-list file and return its GO terms as a set.

    Args:
        path: file to read. ``.tsv`` and ``.txt`` are parsed as tab-separated,
            ``.csv`` as comma-separated; any other extension is rejected.
        term_col: name of the column holding the terms. It is honoured for
            every supported file type.

    Returns:
        Set of the distinct non-null values of ``term_col``. An empty set is
        returned (after printing a message) when the extension is unknown, the
        file is missing, or reading fails for any other reason.
    """
    try:
        if path.endswith(('.tsv', '.txt')):
            # Tab-separated input, e.g. the original train_terms file.
            # Select term_col (not a hard-coded 'term') so the lookup below
            # cannot fail for callers that pass a different column name.
            df = pd.read_csv(path, sep='\t', usecols=[term_col])
        elif path.endswith('.csv'):
            df = pd.read_csv(path, usecols=[term_col])
        else:
            print(f"Warning: Unknown file type for {path}")
            return set()

        # Drop NaN and duplicates
        return set(df[term_col].dropna().unique())

    except FileNotFoundError:
        print(f"‚ùå ERROR: Kh√¥ng t√¨m th·∫•y file Vocabulary t·∫°i {path}")
        return set()
    except Exception as e:
        print(f"‚ùå ERROR: L·ªói khi t·∫£i file {path}: {e}")
        return set()

def load_ia_map(path):
    """Read an IA file and return a ``{term: IA_value}`` dictionary.

    Every line is stripped and split on tabs. Only lines with exactly two
    fields whose second field parses as a float are kept; everything else
    (headers, malformed or blank lines) is skipped. A missing file prints an
    error and yields whatever was collected so far (an empty dict).
    """
    weights = {}
    try:
        # The IA file is expected to be TSV: GO_ID <tab> IA_value
        with open(path, 'r') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split('\t')
                if len(fields) != 2:
                    continue
                term, value = fields
                try:
                    weights[term] = float(value)
                except ValueError:
                    # Header or otherwise non-numeric line: ignore it.
                    continue
    except FileNotFoundError:
        print(f"‚ùå ERROR: Kh√¥ng t√¨m th·∫•y file IA t·∫°i {path}")
    return weights

def get_ia_range(ia_value):
    """Return the label of the predefined IA range that ``ia_value`` falls into.

    Ranges are half-open ``[lower, upper)``; anything that is not below 10
    gets the ``'>10_SUPER_TAIL'`` label.
    """
    # (exclusive upper bound, label), in ascending order.
    upper_bounds = (
        (4, '0_to_4_HEAD'),   # head terms: not of interest
        (5, '4_to_5'),
        (6, '5_to_6'),
        (7, '6_to_7'),
        (8, '7_to_8'),
        (9, '8_to_9'),
        (10, '9_to_10'),
    )
    for upper, label in upper_bounds:
        if ia_value < upper:
            return label
    return '>10_SUPER_TAIL'

# =============================================================================
# 3. ANALYSIS OF KNN'S EXCLUSIVE CONTRIBUTION
# =============================================================================

def _print_missing_report(section_title, model_name, missing_by_range):
    """Print one markdown table of KNN terms missing from a model's vocabulary.

    Args:
        section_title: heading printed above the table.
        model_name: model label interpolated into the summary line.
        missing_by_range: mapping IA-range label -> list of missing terms.
    """
    tail_ranges = ['4_to_5', '5_to_6', '6_to_7', '7_to_8', '8_to_9', '9_to_10', '>10_SUPER_TAIL']
    headers = ["Kho·∫£ng IA", "S·ªë nh√£n ƒê·ªòC QUY·ªÄN", "T·ª∑ l·ªá (%)"]

    print(section_title)

    # Head terms (IA < 4) are excluded from the total and from the table.
    total_missing = sum(len(v) for k, v in missing_by_range.items() if k != '0_to_4_HEAD')

    rows = []
    for ia_range in tail_ranges:
        count = len(missing_by_range.get(ia_range, ()))
        percent = (count / total_missing) * 100 if total_missing else 0
        rows.append([ia_range, f"{count:,}", f"{percent:.2f}%"])

    print(pd.DataFrame(rows, columns=headers).to_markdown(index=False))
    print(f"\n   -> T·ªïng s·ªë nh√£n hi·∫øm (IA >= 4) m√† {model_name} b·ªè qua: {total_missing:,}\n")


def analyze_unique_contribution(min_score=0.90):
    """Report which GO terms found by KNN are absent from the C95 / C99 vocabularies.

    Loads both vocabularies, the IA map and the KNN candidate file named in
    ``CONFIG``, keeps candidate rows with ``score >= min_score``, and for each
    distinct remaining term records whether it is missing from each vocabulary,
    grouped by IA range (terms without an IA entry are treated as IA = 0).
    Two markdown tables (one per vocabulary) covering the ranges with IA >= 4
    are printed.

    Args:
        min_score: minimum candidate score to keep. Defaults to 0.90, the value
            previously hard-coded.

    Returns:
        None. Returns early (after printing a message) when the candidate file
        is missing or empty.
    """
    print("üöÄ B·∫ÆT ƒê·∫¶U PH√ÇN T√çCH ƒê√ìNG G√ìP ƒê·ªòC QUY·ªÄN C·ª¶A KNN...")

    # --- STEP 1: load data ---
    set_c95 = load_terms_set(CONFIG['VOCAB_C95_PATH'], term_col='term')
    set_c99 = load_terms_set(CONFIG['VOCAB_C99_PATH'], term_col='term')
    ia_map = load_ia_map(CONFIG['IA_FILE_PATH'])

    # An empty vocabulary makes every KNN term look "missing"; flag it so the
    # tables below are not misread.
    for model_name, vocab in (("C95", set_c95), ("C99", set_c99)):
        if not vocab:
            print(f"   WARNING: {model_name} vocabulary is empty; every KNN term will be reported as missing.")

    try:
        # Load the KNN file, keeping the score column for the threshold filter.
        knn_df = pd.read_csv(
            CONFIG['KNN_FILE'],
            sep='\t',
            names=['pid', 'term', 'score']
        )
    except FileNotFoundError:
        print(f"‚ùå ERROR: Kh√¥ng t√¨m th·∫•y file KNN Candidates t·∫°i {CONFIG['KNN_FILE']}")
        return
    except pd.errors.EmptyDataError:
        # Same situation the candidate-report cell handles: nothing was mined.
        print(f"WARNING: KNN candidates file {CONFIG['KNN_FILE']} is empty; nothing to analyse.")
        return

    # ======================= FILTER BY SCORE THRESHOLD =======================
    knn_df = knn_df[knn_df['score'] >= min_score]

    print(f"   -> Sau khi l·ªçc score >= {min_score:.2f}:")
    print(f"      - S·ªë d√≤ng c√≤n l·∫°i: {len(knn_df):,}")
    print(f"      - S·ªë protein c√≤n l·∫°i: {knn_df['pid'].nunique():,}")

    # Distinct terms KNN still proposes after the filter.
    set_knn = set(knn_df['term'].unique())
    print(f"      - S·ªë nh√£n GO duy nh·∫•t c√≤n l·∫°i: {len(set_knn):,}")

    # --- STEP 2: counters ---
    # Counts distinct terms (independent of how often each is predicted).
    # Key: IA range label (e.g. '5_to_6'); value: list of terms in that range.
    unique_counts = {
        'C95_Missing': defaultdict(list),  # terms absent from C95
        'C99_Missing': defaultdict(list),  # terms absent from C99
    }

    # --- STEP 3: classify every term KNN found ---
    for term in tqdm(set_knn, desc="Ph√¢n lo·∫°i nh√£n"):
        ia_value = ia_map.get(term, 0.0)  # no IA entry -> 0.0
        ia_range = get_ia_range(ia_value)

        if term not in set_c95:
            unique_counts['C95_Missing'][ia_range].append(term)

        if term not in set_c99:
            unique_counts['C99_Missing'][ia_range].append(term)

    # --- STEP 4: report ---
    print("\n" + "=" * 60)
    print("  üèÜ ƒê√ìNG G√ìP ƒê·ªòC QUY·ªÄN C·ª¶A KNN (THEO ƒê·ªò HI·∫æM IA)")
    print("=" * 60)

    _print_missing_report(
        "\n--- A. NH√ÉN KH√îNG C√ì TRONG C95 (C95 Missing) ---",
        "C95",
        unique_counts['C95_Missing'],
    )
    _print_missing_report(
        "--- B. NH√ÉN KH√îNG C√ì TRONG C99 (C99 Missing) ---",
        "C99",
        unique_counts['C99_Missing'],
    )

# Run the exclusive-contribution analysis when this cell/script is executed directly.
if __name__ == "__main__":
    analyze_unique_contribution()

üöÄ B·∫ÆT ƒê·∫¶U PH√ÇN T√çCH ƒê√ìNG G√ìP ƒê·ªòC QUY·ªÄN C·ª¶A KNN...
   -> Sau khi l·ªçc score >= 0.90:
      - S·ªë d√≤ng c√≤n l·∫°i: 2,001,441
      - S·ªë protein c√≤n l·∫°i: 223,835
      - S·ªë nh√£n GO duy nh·∫•t c√≤n l·∫°i: 14,504


Ph√¢n lo·∫°i nh√£n: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14504/14504 [00:00<00:00, 847780.50it/s]


  üèÜ ƒê√ìNG G√ìP ƒê·ªòC QUY·ªÄN C·ª¶A KNN (THEO ƒê·ªò HI·∫æM IA)

--- A. NH√ÉN KH√îNG C√ì TRONG C95 (C95 Missing) ---
| Kho·∫£ng IA      |   S·ªë nh√£n ƒê·ªòC QUY·ªÄN | T·ª∑ l·ªá (%)   |
|:---------------|--------------------:|:------------|
| 4_to_5         |                 671 | 26.82%      |
| 5_to_6         |                 536 | 21.42%      |
| 6_to_7         |                 447 | 17.87%      |
| 7_to_8         |                 346 | 13.83%      |
| 8_to_9         |                 221 | 8.83%       |
| 9_to_10        |                 112 | 4.48%       |
| >10_SUPER_TAIL |                 169 | 6.75%       |

   -> T·ªïng s·ªë nh√£n hi·∫øm (IA >= 4) m√† C95 b·ªè qua: 2,502

--- B. NH√ÉN KH√îNG C√ì TRONG C99 (C99 Missing) ---
| Kho·∫£ng IA      |   S·ªë nh√£n ƒê·ªòC QUY·ªÄN | T·ª∑ l·ªá (%)   |
|:---------------|--------------------:|:------------|
| 4_to_5         |                 369 | 26.51%      |
| 5_to_6         |                 299 | 21.48%      |
| 6_to_7         


