# Evaluation Stage 1: Candidate Retrieval Performance

## Mục tiêu:
- Evaluate retrieval performance trên tập train
- Metrics: P@k, R@k với k = `TOP_K` (xem phần Configuration)
- 3 methods: BM25, Dense retrieval, Hybrid search (Ensemble)
- So sánh kết quả giữa các methods

In [1]:
!pip install chromadb sentence_transformers pyvi

Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.0-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31

In [2]:
import os
import json
import pickle
import numpy as np
from typing import List, Dict, Tuple, Any
from tqdm import tqdm
import torch
from collections import defaultdict
import shutil

# ChromaDB
import chromadb
from chromadb.config import Settings

# Sentence Transformers
from sentence_transformers import SentenceTransformer

2025-12-04 15:21:07.168470: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764861667.365133      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764861667.420482      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

## Configuration

In [3]:
# Paths (Kaggle input datasets)
TRAIN_FILE = "/kaggle/input/vlqa-dataset/train.json"
BM25_INDEX_PATH = "/kaggle/input/bm25-index/bm25_index.pkl"
CHROMA_DB_PATH = "/kaggle/input/chroma-db-retriever/chroma_db_retriever"
COLLECTION_NAME = "retriever_legal_articles"

# Embedding model used to encode queries for dense retrieval
EMBEDDING_MODEL = "AITeamVN/Vietnamese_Embedding"

# Evaluation parameters
TOP_K = 300  # cutoff k for P@k / R@k; also passed as top_k to every retriever
MAX_SAMPLES = None  # None evaluates every sample; an integer keeps only the first N

# Ensemble weights applied to the min-max normalised BM25 and dense scores
BM25_WEIGHT = 0.3
DENSE_WEIGHT = 0.7

# Device for the embedding model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


## Helper Classes and Functions

In [4]:
# ==========================================
# ChromaIndex Class
# ==========================================
class ChromaIndex:
    """Handle on a persistent ChromaDB collection used for embedding lookups.

    Attributes set by the constructor:
        persist_directory: directory actually opened (after path resolution)
        collection_name: name of the opened collection
        client: the ``chromadb.PersistentClient``
        collection: the collection object used for ``get`` / ``query`` calls
    """

    def __init__(self, persist_directory: str, collection_name: str = "legal_articles_chunks"):
        """Open (or create) ``collection_name`` inside ``persist_directory``.

        Args:
            persist_directory: Location of the ChromaDB files. Paths under
                ``/kaggle/input/`` are mirrored to ``/kaggle/working/`` first
                (see ``_resolve_db_path``).
            collection_name: Collection to open; it is created with cosine
                distance if it does not exist yet.
        """
        actual_path = self._resolve_db_path(persist_directory)

        self.persist_directory = actual_path
        self.collection_name = collection_name
        print(f"Initialize ChromaDB at {actual_path}")

        self.client = chromadb.PersistentClient(path=actual_path)
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        count = self.collection.count()
        print(f"Collection: {collection_name} is ready. Current count: {count}")
        if count == 0:
            # get_or_create_collection silently creates a missing collection, so a
            # mistyped name would otherwise go unnoticed until retrieval returns nothing.
            print(f"⚠️ Collection '{collection_name}' is empty - check the collection name and database path")

    def _resolve_db_path(self, path: str) -> str:
        """Return a usable database path, mirroring Kaggle input data if needed.

        Paths that do not contain ``/kaggle/input/`` are returned unchanged.
        Otherwise the directory is copied once to the matching location under
        ``/kaggle/working/`` and that location is returned; an existing copy
        is reused.

        Raises:
            RuntimeError: If the copy fails. Any partially written target is
                removed first so a later call does not mistake it for a
                complete copy.
        """
        if '/kaggle/input/' not in path:
            return path

        working_path = path.replace('/kaggle/input/', '/kaggle/working/')

        if os.path.exists(working_path):
            print(f"Using existing copy at: {working_path}")
            return working_path

        print("Copying ChromaDB from read-only to writable location...")
        print(f"  From: {path}")
        print(f"  To:   {working_path}")
        try:
            os.makedirs(os.path.dirname(working_path), exist_ok=True)
            # The target is known not to exist here, so a plain copytree suffices.
            shutil.copytree(path, working_path)
        except Exception as e:
            print(f"Failed to copy database: {e}")
            # Remove whatever was written so the "existing copy" shortcut above
            # cannot pick up an incomplete database on the next attempt.
            shutil.rmtree(working_path, ignore_errors=True)
            raise RuntimeError(f"Cannot copy ChromaDB from {path} to {working_path}: {e}") from e

        print("Database copied successfully")
        return working_path

# ==========================================
# WeightedEnsemble Class
# ==========================================
class WeightedEnsemble:
    """Weighted-sum fusion of two score dictionaries (pure logic, no retrieval)."""

    def __init__(self, weights: tuple = (0.5, 0.5)):
        """Store the ``(bm25_weight, dense_weight)`` pair used by ``rank``."""
        self.weights = weights

    def _normalize_scores(self, scores: Dict[str, float]) -> Dict[str, float]:
        """Min-max scale scores to [0, 1].

        An empty input yields an empty dict; when every score is identical
        each document receives 1.0.
        """
        if not scores:
            return {}
        values = list(scores.values())
        min_score = min(values)
        max_score = max(values)
        if max_score == min_score:
            return {k: 1.0 for k in scores}
        return {k: (v - min_score) / (max_score - min_score) for k, v in scores.items()}

    def rank(self, bm25_scores: Dict[str, float], dense_scores: Dict[str, float], top_k: int = 100) -> List[Tuple[str, float]]:
        """Fuse both score sets and return the ``top_k`` best documents.

        Each score set is min-max normalised independently; a document missing
        from one set contributes 0.0 for that set.

        Returns:
            ``(doc_id, fused_score)`` pairs sorted by descending score. Ties
            are broken by document id so the ranking is reproducible.
        """
        w1, w2 = self.weights
        norm_bm25 = self._normalize_scores(bm25_scores)
        norm_dense = self._normalize_scores(dense_scores)
        all_ids = set(norm_bm25.keys()) | set(norm_dense.keys())

        final_scores = {}
        for doc_id in all_ids:
            s1 = norm_bm25.get(doc_id, 0.0)
            s2 = norm_dense.get(doc_id, 0.0)
            final_scores[doc_id] = (w1 * s1) + (w2 * s2)

        # Set iteration order is not stable across runs, so sorting on the score
        # alone would rank tied documents differently each time; the id tie-break
        # keeps the top_k cut deterministic.
        sorted_docs = sorted(final_scores.items(), key=lambda item: (-item[1], str(item[0])))
        return sorted_docs[:top_k]

In [5]:
def calculate_precision_at_k(predicted_ids: List[str], relevant_ids: List[str], k: int = 100) -> float:
    """
    Calculate Precision@k

    IDs are compared as strings, so ``1`` and ``"1"`` match. The denominator
    is the number of items actually inspected, i.e. ``min(k, len(predicted_ids))``,
    so a retriever that returns fewer than k items is not penalised for the
    missing ones.

    Args:
        predicted_ids: List of predicted article IDs (ranked)
        relevant_ids: List of relevant article IDs
        k: Top-k to consider

    Returns:
        Precision@k score; 0.0 when there are no predictions or k <= 0
    """
    # A non-positive cutoff inspects nothing; without this guard k == 0 divides
    # by zero and a negative k yields a negative "precision".
    if k <= 0:
        return 0.0

    predicted_ids = [str(pid) for pid in predicted_ids]
    relevant_set = set(str(rid) for rid in relevant_ids)

    if len(predicted_ids) == 0:
        return 0.0

    # Count relevant items in top-k
    relevant_in_topk = sum(1 for doc_id in predicted_ids[:k] if doc_id in relevant_set)

    # Precision = relevant_retrieved / total_retrieved
    precision = relevant_in_topk / min(k, len(predicted_ids))
    return precision

def calculate_recall_at_k(predicted_ids: List[str], relevant_ids: List[str], k: int = 100) -> float:
    """
    Calculate Recall@k

    IDs are compared as strings, so ``1`` and ``"1"`` match. Each relevant
    article is counted at most once, so the result always lies in [0, 1]
    even if ``predicted_ids`` contains duplicates.

    Args:
        predicted_ids: List of predicted article IDs (ranked)
        relevant_ids: List of relevant article IDs
        k: Top-k to consider

    Returns:
        Recall@k score; 0.0 when there are no relevant IDs or k <= 0
    """
    # A negative k would otherwise slice from the end of the ranking.
    if k <= 0:
        return 0.0

    relevant_set = set(str(rid) for rid in relevant_ids)

    if len(relevant_set) == 0:
        return 0.0

    # Distinct relevant items found in top-k (a duplicated hit counts once)
    topk_set = set(str(pid) for pid in predicted_ids[:k])
    relevant_in_topk = len(topk_set & relevant_set)

    # Recall = relevant_retrieved / total_relevant
    recall = relevant_in_topk / len(relevant_set)
    return recall

In [6]:
# Load train data (JSON file; each item is read later via 'qid', 'question', 'relevant_laws')
print(f"Loading train data from {TRAIN_FILE}...")
with open(TRAIN_FILE, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Optional cap for quick runs: keep only the first MAX_SAMPLES items (skipped when None/0)
if MAX_SAMPLES:
    train_data = train_data[:MAX_SAMPLES]
    print(f"Limited to {MAX_SAMPLES} samples")

# Load BM25 index
# NOTE(review): pickle.load can execute arbitrary code - only load index files you built or trust.
print(f"\nLoading BM25 index from {BM25_INDEX_PATH}...")
with open(BM25_INDEX_PATH, 'rb') as f:
    bm25_data = pickle.load(f)

# Load ChromaDB (ChromaIndex copies the database out of /kaggle/input/ before opening it)
print(f"\nLoading ChromaDB from {CHROMA_DB_PATH}...")
chroma_index = ChromaIndex(
    persist_directory=CHROMA_DB_PATH,
    collection_name=COLLECTION_NAME
)

# Load embedding model onto the configured device
print(f"\nLoading embedding model: {EMBEDDING_MODEL}...")
embedder_model = SentenceTransformer(EMBEDDING_MODEL, device=DEVICE)

Loading train data from /kaggle/input/vlqa-dataset/train.json...

Loading BM25 index from /kaggle/input/bm25-index/bm25_index.pkl...

Loading ChromaDB from /kaggle/input/chroma-db-retriever/chroma_db_retriever...
Copying ChromaDB from read-only to writable location...
  From: /kaggle/input/chroma-db-retriever/chroma_db_retriever
  To:   /kaggle/working/chroma-db-retriever/chroma_db_retriever
Database copied successfully
Initialize ChromaDB at /kaggle/working/chroma-db-retriever/chroma_db_retriever
Collection: retriever_legal_articles is ready. Current count: 74352

Loading embedding model: AITeamVN/Vietnamese_Embedding...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/708 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

## Initialize Retrievers

In [7]:
# BM25 Retriever 
# BM25 Retriever (FIXED: Aggregation by AID + Better Tokenizer)
class BM25Retriever:
    """BM25 scorer over a pre-built inverted index that returns article-level results.

    Scores are computed per indexed document (chunk) and then collapsed to one
    score per article id (aid) by keeping the best-scoring chunk of each article.
    """

    def __init__(self, bm25_data, chroma_index=None):
        """
        Initialize BM25 Retriever.

        Args:
            bm25_data: Loaded pickle data from bm25_index.pkl. Keys read here:
                'corpus_size', 'avgdl', 'doc_lengths', 'doc_ids',
                'inverted_index', 'idf' and, optionally, 'index_to_id'.
            chroma_index: ChromaIndex instance (optional, for mapping chunk_id -> aid)
        """
        self.corpus_size = bm25_data['corpus_size']
        self.avgdl = bm25_data['avgdl']
        self.doc_lengths = bm25_data['doc_lengths']
        self.doc_ids = bm25_data['doc_ids']
        self.index_to_id = bm25_data.get('index_to_id', {})
        self.inverted_index = bm25_data['inverted_index']
        self.idf = bm25_data['idf']
        # BM25 term-frequency saturation (k1) and length-normalisation (b) parameters
        self.k1 = 1.5
        self.b = 0.75

        # ChromaDB for mapping chunk_id -> aid (if needed)
        self.chroma_index = chroma_index

        # Cache for doc_id -> aid mapping (to avoid repeated queries)
        self._doc_id_to_aid_cache = {}

        # Set once the first aid lookup failure has been reported
        self._aid_lookup_warned = False

        # Tokenizer preference: pyvi, then underthesea, then whitespace split.
        # NOTE(review): the pyvi branch does not lowercase while the other two do -
        # confirm this matches the tokenisation used when the index was built.
        try:
            from pyvi import ViTokenizer
            self.tokenizer = lambda x: ViTokenizer.tokenize(x).split()
            print("✅ BM25: Using Pyvi tokenizer for Vietnamese")
        except ImportError:
            try:
                import underthesea
                self.tokenizer = lambda x: underthesea.word_tokenize(x.lower())
                print("✅ BM25: Using Underthesea tokenizer for Vietnamese")
            except ImportError:
                # Fallback: simple split (worse for Vietnamese)
                print("⚠️ BM25: Using simple split (install 'pyvi' or 'underthesea' for better Vietnamese tokenization)")
                self.tokenizer = lambda x: x.lower().split()

    def _get_aid_from_doc_id(self, doc_id: str) -> str:
        """
        Get article ID (aid) from document ID (chunk_id).

        The 'aid' field of the document's ChromaDB metadata is used when a
        chroma_index is available and the lookup succeeds; otherwise doc_id
        itself is treated as the aid. Results are cached per doc_id.
        """
        if doc_id in self._doc_id_to_aid_cache:
            return self._doc_id_to_aid_cache[doc_id]

        # Default: assume doc_id already is the aid
        aid = doc_id

        if self.chroma_index:
            try:
                results = self.chroma_index.collection.get(
                    ids=[doc_id],
                    include=["metadatas"]
                )
                if results and results.get('metadatas') and len(results['metadatas']) > 0:
                    metadata = results['metadatas'][0]
                    if metadata and 'aid' in metadata:
                        aid = str(metadata['aid'])
            except Exception as exc:
                # Best effort: keep doc_id as the aid, but report the first failure so a
                # broken ChromaDB connection does not silently turn article-level
                # metrics into chunk-level ones.
                if not getattr(self, '_aid_lookup_warned', False):
                    print(f"⚠️ BM25: aid lookup failed for '{doc_id}' ({exc}); using the document id as aid (further failures are not reported)")
                    self._aid_lookup_warned = True

        # Cache the result
        self._doc_id_to_aid_cache[doc_id] = aid
        return aid

    def retrieve(self, query: str, top_k: int = 100) -> List[Tuple[str, float]]:
        """
        Retrieve the top-k articles for a query using BM25.

        Returns:
            (aid, score) pairs sorted by descending score, where each
            article's score is the maximum over its chunks. Empty list when
            the corpus is empty.
        """
        if self.corpus_size == 0:
            return []

        # Tokenize query
        query_tokens = self.tokenizer(query)
        scores = defaultdict(float)

        # Accumulate BM25 scores for every indexed document containing a query token
        for token in query_tokens:
            if token not in self.inverted_index:
                continue

            idf_score = self.idf[token]

            for doc_idx, term_freq in self.inverted_index[token].items():
                doc_len = self.doc_lengths[doc_idx]

                numerator = term_freq * (self.k1 + 1)
                denominator = term_freq + self.k1 * (1 - self.b + self.b * (doc_len / self.avgdl))
                score = idf_score * (numerator / denominator)

                scores[doc_idx] += score

        # Collapse chunk scores to one score per article, keeping the maximum.
        # A plain dict with an explicit membership test is used so that articles
        # whose best score is negative (possible when an idf value is negative)
        # keep that score instead of being reported as 0.0.
        article_scores = {}

        for doc_idx, score in scores.items():
            doc_id = str(self.index_to_id.get(doc_idx, doc_idx))
            aid = self._get_aid_from_doc_id(doc_id)

            if aid not in article_scores or score > article_scores[aid]:
                article_scores[aid] = score

        # Sort by score and return top-k
        sorted_scores = sorted(article_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

        return [(str(aid), float(score)) for aid, score in sorted_scores]

# Dense Retriever
class DenseRetriever:
    """Embedding-based retriever backed by a ChromaDB collection."""

    def __init__(self, chroma_index, embedder_model):
        """Keep references to the Chroma index and the query embedding model."""
        self.chroma_index = chroma_index
        self.embedder_model = embedder_model

    def retrieve(self, query: str, top_k: int = 100) -> List[Tuple[str, float]]:
        """Return (aid, score) pairs for the nearest neighbours of ``query``.

        ``top_k`` is the number of entries requested from ChromaDB; entries
        sharing an article id are merged (best score kept), so fewer than
        ``top_k`` pairs may come back. Score is ``1 - distance`` clipped to
        [0, 1].
        """
        embedding = self.embedder_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
        # encode() may hand back a 2-D array; use its first row in that case
        flat = embedding[0] if len(embedding.shape) > 1 else embedding
        query_vec = flat.tolist()

        response = self.chroma_index.collection.query(
            query_embeddings=[query_vec],
            n_results=top_k,
            include=["metadatas", "distances"]
        )

        if not response or not response.get('ids') or not response['ids'][0]:
            return []

        chunk_ids = response['ids'][0]
        distances = response['distances'][0]
        metadatas = response['metadatas'][0] if response.get('metadatas') else []

        # aid -> best score seen so far; insertion order is the order returned
        best_by_aid = {}
        for pos, chunk_id in enumerate(chunk_ids):
            similarity = max(0.0, min(1.0, 1.0 - distances[pos]))

            meta = metadatas[pos] if (metadatas and pos < len(metadatas)) else None
            # Fall back to the entry's own id when no usable metadata is attached
            aid = str(meta.get('aid', chunk_id)) if meta else str(chunk_id)

            if aid not in best_by_aid or similarity > best_by_aid[aid]:
                best_by_aid[aid] = similarity

        return list(best_by_aid.items())

# Ensemble Retriever (FIXED: Increase search_k for better overlap)
class EnsembleRetriever:
    """Hybrid retriever that fuses BM25 and dense results with a weighted sum."""

    def __init__(self, bm25_retriever, dense_retriever, weights: tuple = (0.5, 0.5)):
        """Store both retrievers and build the fusion helper from ``weights``."""
        self.bm25 = bm25_retriever
        self.dense = dense_retriever
        self.fusion = WeightedEnsemble(weights=weights)

    def _scores_or_empty(self, retriever, label: str, query: str, k: int) -> dict:
        """Run one retriever and return its results as an id -> score dict.

        A failure is printed and treated as "no results" so the other
        retriever can still contribute.
        """
        try:
            hits = retriever.retrieve(query, top_k=k)
        except Exception as e:
            print(f"{label} Retrieval failed: {e}")
            hits = []
        return dict(hits)

    def retrieve(self, query: str, top_k: int = 100) -> List[Tuple[str, float]]:
        """
        Retrieve using ensemble (hybrid search).

        Each retriever is asked for twice ``top_k`` candidates (never fewer
        than 100) so the two candidate lists overlap more before fusion; the
        fused ranking is then cut to ``top_k``.
        """
        search_k = max(top_k * 2, 100)

        bm25_scores = self._scores_or_empty(self.bm25, "BM25", query, search_k)
        dense_scores = self._scores_or_empty(self.dense, "Dense", query, search_k)

        return self.fusion.rank(bm25_scores, dense_scores, top_k=top_k)

# Build the three retrievers compared in this notebook
print("Initializing retrievers...")
# BM25 receives the Chroma index so it can map chunk ids to article ids
bm25_retriever = BM25Retriever(bm25_data=bm25_data, chroma_index=chroma_index)
dense_retriever = DenseRetriever(chroma_index=chroma_index, embedder_model=embedder_model)
ensemble_retriever = EnsembleRetriever(
    bm25_retriever=bm25_retriever,
    dense_retriever=dense_retriever,
    weights=(BM25_WEIGHT, DENSE_WEIGHT),
)
print(
    "✅ Retrievers initialized with fixes:\n"
    "   - BM25: Aggregation by AID + Better tokenizer\n"
    "   - Ensemble: Increased search_k for better overlap"
)

Initializing retrievers...
✅ BM25: Using Pyvi tokenizer for Vietnamese
✅ Retrievers initialized with fixes:
   - BM25: Aggregation by AID + Better tokenizer
   - Ensemble: Increased search_k for better overlap


## Run Evaluation

In [8]:
# Per-sample metric storage: one precision list and one recall list per method
results = {
    method: {'precision': [], 'recall': []}
    for method in ('bm25', 'dense', 'ensemble')
}

# result key -> (label used in error messages, retriever); evaluated in this order
retrievers_by_method = {
    'bm25': ('BM25', bm25_retriever),
    'dense': ('Dense', dense_retriever),
    'ensemble': ('Ensemble', ensemble_retriever),
}

print(f"Evaluating on {len(train_data)} samples...")
print("=" * 80)

for sample in tqdm(train_data, desc="Evaluating"):
    qid = sample['qid']
    question = sample['question']
    relevant_laws = sample['relevant_laws']  # Ground truth

    # Run every retriever first; a failing retriever counts as "nothing retrieved"
    predictions = {}
    for method, (label, retriever) in retrievers_by_method.items():
        try:
            hits = retriever.retrieve(question, top_k=TOP_K)
            predictions[method] = [str(aid) for aid, _ in hits]
        except Exception as e:
            print(f"Error in {label} retrieval for sample {qid}: {e}")
            predictions[method] = []

    # Then score each method's ranking against the ground truth
    for method, predicted in predictions.items():
        results[method]['precision'].append(
            calculate_precision_at_k(predicted, relevant_laws, TOP_K)
        )
        results[method]['recall'].append(
            calculate_recall_at_k(predicted, relevant_laws, TOP_K)
        )

Evaluating on 2190 samples...


Evaluating: 100%|██████████| 2190/2190 [15:30<00:00,  2.35it/s]


In [12]:
# Average the per-sample metrics for each method
summary = {}
for method in ['bm25', 'dense', 'ensemble']:
    summary[method] = {
        'precision': np.mean(results[method]['precision']),
        'recall': np.mean(results[method]['recall'])
    }

# Column labels are derived from TOP_K so the table always names the cutoff
# that was actually evaluated (the header used to be hard-coded as @500).
precision_label = f"P@{TOP_K}"
recall_label = f"R@{TOP_K}"

# Print results in table format
print("=" * 80)
print("Table: Candidate retrieval performance on the VLSP 2025 DRiLL training set")
print("=" * 80)
print(f"\n{'Method':<20} {precision_label:<15} {recall_label:<15}")
print("-" * 80)

# Display names for the result keys
method_names = {
    'bm25': 'BM25',
    'dense': 'Dense retrieval',
    'ensemble': 'Hybrid search'
}

for method in ['bm25', 'dense', 'ensemble']:
    method_name = method_names[method]
    mean_precision = summary[method]['precision']
    mean_recall = summary[method]['recall']
    print(f"{method_name:<20} {mean_precision:<15.4f} {mean_recall:<15.4f}")

print("=" * 80)
print(f"\nTotal samples evaluated: {len(train_data)}")
print(f"Top-k: {TOP_K}")
print(f"Ensemble weights: BM25={BM25_WEIGHT}, Dense={DENSE_WEIGHT}")

Table: Candidate retrieval performance on the VLSP 2025 DRiLL training set

Method               P@500           R@500          
--------------------------------------------------------------------------------
BM25                 0.0039          0.8927         
Dense retrieval      0.0050          0.9701         
Hybrid search        0.0043          0.9764         

Total samples evaluated: 2190
Top-k: 300
Ensemble weights: BM25=0.3, Dense=0.7


## Save Results

In [13]:
# Persist the run configuration, the averaged metrics and the per-sample scores as JSON
output_file = "eval_stage1_results.json"
output_data = {
    'config': {
        'top_k': TOP_K,
        'num_samples': len(train_data),
        'ensemble_weights': {'bm25': BM25_WEIGHT, 'dense': DENSE_WEIGHT},
    },
    'summary': summary,
    'detailed_results': {
        method: {
            # cast every per-sample value to a plain float before serialising
            metric: list(map(float, results[method][metric]))
            for metric in ('precision', 'recall')
        }
        for method in ('bm25', 'dense', 'ensemble')
    },
}

with open(output_file, mode='w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)