# RAG Evaluation with Metrics

In [None]:
import json
import os
import pickle
import sys
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

sys.path.append(os.getcwd())
from src.rag_system import RAGSystem


## Стъпка 2: Зареждане на RAG System

In [None]:
# Initialize the RAG system that every later cell queries through `rag`.
print("Initializing RAG System...\n")

rag = RAGSystem(
    vector_db_path="vector_db",
    model_name="llama3.2:3b"  # Change this if you use a different model
)

print("\n‚úì RAG System ready for evaluation!")

## Стъпка 3: Създаване на Test Dataset

Ще създадем тест въпроси с известни правилни отговори (ground truth)

In [None]:
# Test cases: each one has a query plus a description of which documents
# should be retrieved for it.
# Relevance is expressed as product + category combinations
# ('relevant_products' / 'relevant_categories'); there are no per-document ids.
# NOTE: 'expected_answer_contains' is not read by any visible cell of this
# notebook; the retrieval metrics only use the product/category fields.

test_cases = [
    {
        'id': 'test_001',
        'query': 'How do I install CloudSync Pro on Windows?',
        'relevant_products': ['CloudSync Pro'],
        'relevant_categories': ['installation'],
        'expected_answer_contains': ['download', 'install', 'windows']
    },
    {
        'id': 'test_002',
        'query': 'My files are not syncing in CloudSync Pro',
        'relevant_products': ['CloudSync Pro'],
        'relevant_categories': ['troubleshooting'],
        'expected_answer_contains': ['sync', 'internet', 'connection']
    },
    {
        'id': 'test_003',
        'query': 'What is selective sync feature?',
        'relevant_products': ['CloudSync Pro'],
        'relevant_categories': ['features'],
        'expected_answer_contains': ['selective', 'sync', 'folders']
    },
    {
        'id': 'test_004',
        'query': 'How to backup files with DataVault?',
        'relevant_products': ['DataVault'],
        'relevant_categories': ['features', 'installation'],
        'expected_answer_contains': ['backup', 'files']
    },
    {
        'id': 'test_005',
        'query': 'DataVault backup is failing',
        'relevant_products': ['DataVault'],
        'relevant_categories': ['troubleshooting'],
        'expected_answer_contains': ['backup', 'disk space', 'destination']
    },
    {
        'id': 'test_006',
        'query': 'TeamChat video call quality is bad',
        'relevant_products': ['TeamChat'],
        'relevant_categories': ['troubleshooting'],
        'expected_answer_contains': ['video', 'internet', 'connection']
    },
    {
        'id': 'test_007',
        'query': 'How to send messages in TeamChat?',
        'relevant_products': ['TeamChat'],
        'relevant_categories': ['features'],
        'expected_answer_contains': ['message', 'channel']
    },
    {
        'id': 'test_008',
        'query': 'Cannot create new project in ProjectHub',
        'relevant_products': ['ProjectHub'],
        'relevant_categories': ['troubleshooting'],
        'expected_answer_contains': ['project', 'limit', 'permission']
    },
    {
        'id': 'test_009',
        'query': 'What is Kanban board in ProjectHub?',
        'relevant_products': ['ProjectHub'],
        'relevant_categories': ['features'],
        'expected_answer_contains': ['kanban', 'board', 'task']
    },
    {
        'id': 'test_010',
        'query': 'How much storage do I get with CloudSync Pro?',
        'relevant_products': ['CloudSync Pro'],
        'relevant_categories': ['faq'],
        'expected_answer_contains': ['storage', 'gb', 'pro']
    },
    {
        'id': 'test_011',
        'query': 'Is TeamChat available on mobile?',
        'relevant_products': ['TeamChat'],
        'relevant_categories': ['faq'],
        'expected_answer_contains': ['mobile', 'ios', 'android']
    },
    {
        'id': 'test_012',
        'query': 'How secure are DataVault backups?',
        'relevant_products': ['DataVault'],
        'relevant_categories': ['faq'],
        'expected_answer_contains': ['secure', 'encrypt', 'aes']
    },
]

# Quick sanity output: dataset size and the first case's expectations.
print(f"‚úì Created {len(test_cases)} test cases")
print("\nSample test case:")
print(f"Query: {test_cases[0]['query']}")
print(f"Expected product: {test_cases[0]['relevant_products']}")
print(f"Expected category: {test_cases[0]['relevant_categories']}")

## Стъпка 4: Evaluation Функции

Функции за изчисляване на метрики

In [None]:
def is_relevant_result(retrieved_metadata: Dict, test_case: Dict) -> bool:
    """
    Decide whether a retrieved document counts as relevant for a test case.

    A document is relevant only when its product is listed in the test
    case's 'relevant_products' AND its category is listed in
    'relevant_categories'.

    Args:
        retrieved_metadata: Metadata of the retrieved document; must provide
            the 'product' and 'category' keys.
        test_case: Test case providing 'relevant_products' and
            'relevant_categories'.

    Returns:
        True if both the product and the category match, otherwise False.

    Raises:
        KeyError: If any of the four keys above is missing.
    """
    field_to_allowed = (
        ('product', 'relevant_products'),
        ('category', 'relevant_categories'),
    )
    # Built eagerly (a list, not a generator) so that both fields are always
    # looked up, even when the first one already fails to match.
    verdicts = [
        retrieved_metadata[field] in test_case[allowed]
        for field, allowed in field_to_allowed
    ]
    return all(verdicts)


def calculate_precision_at_k(retrieved_metadatas: List[Dict], test_case: Dict, k: int) -> float:
    """
    Precision@K: what fraction of the top K documents are relevant?

    Formula: (# relevant docs in top K) / K

    The denominator is always K, even when fewer than K documents were
    retrieved, so missing results count against precision.

    Args:
        retrieved_metadatas: Metadata dicts of the retrieved documents, in
            rank order (best first).
        test_case: Test case used to judge relevance.
        k: Cut-off rank.

    Returns:
        Precision in [0.0, 1.0]; 0.0 when k is not positive.
    """
    # Guard first: a non-positive k has no meaningful precision, and a
    # negative k would otherwise slice from the end of the list.
    if k <= 0:
        return 0.0

    relevant_count = sum(
        1 for meta in retrieved_metadatas[:k] if is_relevant_result(meta, test_case)
    )
    return relevant_count / k


def calculate_recall_at_k(retrieved_metadatas: List[Dict], test_case: Dict, k: int, total_relevant: int) -> float:
    """
    Recall@K: what fraction of all relevant documents did we retrieve?

    Formula: (# relevant docs in top K) / (total # relevant docs)

    Args:
        retrieved_metadatas: Metadata dicts of the retrieved documents, in
            rank order (best first).
        test_case: Test case used to judge relevance.
        k: Cut-off rank.
        total_relevant: Number of relevant documents that exist for the
            query. The result is only as accurate as this number; if it
            underestimates the truth the value can exceed 1.0.

    Returns:
        Recall as a float; 0.0 when total_relevant is not positive.
    """
    # Without a positive denominator recall is undefined; report 0.0.
    if total_relevant <= 0:
        return 0.0

    found_count = sum(
        1 for meta in retrieved_metadatas[:k] if is_relevant_result(meta, test_case)
    )
    return found_count / total_relevant


def calculate_hit_rate(retrieved_metadatas: List[Dict], test_case: Dict, k: int) -> float:
    """
    Hit Rate@K: did at least one relevant document make it into the top K?

    Args:
        retrieved_metadatas: Metadata dicts of the retrieved documents, in
            rank order (best first).
        test_case: Test case used to judge relevance.
        k: Cut-off rank.

    Returns:
        1.0 if any of the first k documents is relevant, otherwise 0.0.
    """
    # any() stops at the first relevant document, like an early return.
    hit = any(
        is_relevant_result(candidate, test_case)
        for candidate in retrieved_metadatas[:k]
    )
    return 1.0 if hit else 0.0


def calculate_mrr(retrieved_metadatas: List[Dict], test_case: Dict) -> float:
    """
    Reciprocal rank of the first relevant document for a single query.

    Averaging this value over all queries gives the Mean Reciprocal Rank.

    Returns 1/rank of the first relevant document, so a relevant document
    nearer the top gives a higher score.
    Example: first relevant document at position 2 -> 1/2 = 0.5

    Args:
        retrieved_metadatas: Metadata dicts of the retrieved documents, in
            rank order (best first).
        test_case: Test case used to judge relevance.

    Returns:
        1.0 / rank (1-based) of the first relevant document, or 0.0 when no
        retrieved document is relevant.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, candidate in enumerate(retrieved_metadatas, start=1)
        if is_relevant_result(candidate, test_case)
    )
    # Only the first relevant hit matters; fall back to 0.0 if there is none.
    return next(reciprocal_ranks, 0.0)


# Cell-level confirmation that the metric helpers above are now defined.
print("‚úì Evaluation functions created!")

## Стъпка 5: Run Evaluation

**Това може да отнеме 2-3 минути**, защото трябва да пуснем всички test queries!

In [None]:
# Evaluation settings
K_VALUES = [1, 3, 5]  # Cut-offs K at which the metrics are computed
# Assumed (~5) number of relevant docs per query. This is a guess, not a count
# taken from the corpus, and recall@K is divided by it for every query.
TOTAL_RELEVANT_DOCS = 5

# Collects one result dict per test case
evaluation_results = []

print("Running evaluation...\n")
print("="*80)

for test_case in tqdm(test_cases, desc="Evaluating test cases"):
    query = test_case['query']

    # Retrieve documents once, using the largest K, so that every smaller K
    # can be evaluated by slicing the same ranked list.
    search_results = rag.search(query, n_results=max(K_VALUES))
    # NOTE(review): assumes 'metadatas' and 'distances' are flat, rank-ordered
    # lists of equal length (one entry per hit) - confirm against RAGSystem.search.
    retrieved_metadatas = search_results['metadatas']
    distances = search_results['distances']

    # Compute the metrics for every K
    result = {
        'test_id': test_case['id'],
        'query': query,
        'relevant_products': test_case['relevant_products'],
        'relevant_categories': test_case['relevant_categories'],
    }

    for k in K_VALUES:
        precision = calculate_precision_at_k(retrieved_metadatas, test_case, k)
        recall = calculate_recall_at_k(retrieved_metadatas, test_case, k, TOTAL_RELEVANT_DOCS)
        hit_rate = calculate_hit_rate(retrieved_metadatas, test_case, k)

        result[f'precision@{k}'] = precision
        result[f'recall@{k}'] = recall
        result[f'hit_rate@{k}'] = hit_rate

    # MRR takes no K argument; it scans the whole retrieved list, which holds
    # at most max(K_VALUES) documents.
    result['mrr'] = calculate_mrr(retrieved_metadatas, test_case)

    # Keep the top 3 retrieved results for debugging
    result['top_3_results'] = [
        {
            'product': meta['product'],
            'category': meta['category'],
            'title': meta['title'],
            'distance': dist
        }
        for meta, dist in zip(retrieved_metadatas[:3], distances[:3])
    ]

    evaluation_results.append(result)

print("\n‚úì Evaluation complete!")

## Стъпка 6: Преглед на индивидуални резултати

In [None]:
# Show the first 3 test cases in detail
print("Sample Evaluation Results:\n")
print("="*80)

for i, result in enumerate(evaluation_results[:3], 1):
    print(f"\nTest {i}: {result['test_id']}")
    print(f"Query: {result['query']}")
    print(f"Expected: {result['relevant_products']} - {result['relevant_categories']}")
    print(f"\nMetrics:")
    # The K=3 keys are hard-coded here, so K_VALUES must contain 3.
    print(f"  Precision@3: {result['precision@3']:.2f}")
    print(f"  Recall@3: {result['recall@3']:.2f}")
    print(f"  Hit Rate@3: {result['hit_rate@3']:.2f}")
    print(f"  MRR: {result['mrr']:.3f}")

    print(f"\nTop 3 Retrieved:")
    for j, doc in enumerate(result['top_3_results'], 1):
        # Same product-AND-category rule as is_relevant_result, re-stated
        # inline because the stored entries are plain dicts.
        relevant = "‚úì" if (doc['product'] in result['relevant_products'] and 
                          doc['category'] in result['relevant_categories']) else "‚úó"
        print(f"  {j}. {relevant} {doc['product']} - {doc['category']} (dist: {doc['distance']:.4f})")
    print("-"*80)

## Стъпка 7: Aggregate Metrics

Изчисляваме средните стойности за всички метрики

In [None]:
# Compute the average values
def calculate_average_metrics(results: List[Dict], k_values: Optional[List[int]] = None) -> Dict:
    """Average every per-query metric across ``results``.

    Args:
        results: Per-query result dicts. Each must hold ``precision@k``,
            ``recall@k`` and ``hit_rate@k`` for every requested k, plus
            ``mrr``.
        k_values: Cut-offs to aggregate. Defaults to the notebook-level
            ``K_VALUES``, so existing one-argument calls behave as before.

    Returns:
        Dict mapping each metric name to its mean over ``results``, keyed in
        the order precision/recall/hit_rate per k, followed by ``mrr``.
        For an empty ``results`` list np.mean produces NaN (and warns).
    """
    # Resolve the default lazily so the function does not capture a stale
    # copy of K_VALUES if the notebook cell defining it is re-run.
    if k_values is None:
        k_values = K_VALUES

    avg_metrics = {}

    # For each K, average the three cut-off dependent metrics
    for k in k_values:
        for metric in ('precision', 'recall', 'hit_rate'):
            key = f'{metric}@{k}'
            avg_metrics[key] = np.mean([r[key] for r in results])

    # MRR is stored once per query, independent of k
    avg_metrics['mrr'] = np.mean([r['mrr'] for r in results])

    return avg_metrics

# Aggregate over all test cases; later cells read avg_metrics.
avg_metrics = calculate_average_metrics(evaluation_results)

print("\n" + "="*80)
print("OVERALL EVALUATION METRICS")
print("="*80)
print()

# One section per cut-off K, then the K-independent MRR.
for k in K_VALUES:
    print(f"K = {k}:")
    print(f"  Precision@{k}: {avg_metrics[f'precision@{k}']:.3f}")
    print(f"  Recall@{k}:    {avg_metrics[f'recall@{k}']:.3f}")
    print(f"  Hit Rate@{k}:  {avg_metrics[f'hit_rate@{k}']:.3f}")
    print()

print(f"MRR (Mean Reciprocal Rank): {avg_metrics['mrr']:.3f}")
print("\n" + "="*80)

## Стъпка 8: Визуализация на метриките

Графики за по-добро разбиране!

In [None]:
# Chart 1: metrics as a function of K (three line charts + one bar chart)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Averaged metric series, one value per entry of K_VALUES
precision_values = [avg_metrics[f'precision@{k}'] for k in K_VALUES]
recall_values = [avg_metrics[f'recall@{k}'] for k in K_VALUES]
hit_rate_values = [avg_metrics[f'hit_rate@{k}'] for k in K_VALUES]

# The three line charts share identical formatting; only the data, the
# label and the colour differ, so they are drawn from one table.
line_panels = [
    (axes[0, 0], precision_values, 'Precision', '#3498db'),
    (axes[0, 1], recall_values, 'Recall', '#2ecc71'),
    (axes[1, 0], hit_rate_values, 'Hit Rate', '#f39c12'),
]
for _ax, _values, _label, _color in line_panels:
    _ax.plot(K_VALUES, _values, marker='o', linewidth=2, markersize=8, color=_color)
    _ax.set_xlabel('K')
    _ax.set_ylabel(_label)
    _ax.set_title(f'{_label}@K')
    _ax.set_ylim(0, 1.1)  # metrics live in [0, 1]; leave headroom above 1.0
    _ax.grid(True, alpha=0.3)
    _ax.set_xticks(K_VALUES)

# Comparison bar chart for K=3 (requires 3 to be present in K_VALUES)
metrics_k3 = ['Precision@3', 'Recall@3', 'Hit Rate@3']
values_k3 = [
    avg_metrics['precision@3'],
    avg_metrics['recall@3'],
    avg_metrics['hit_rate@3']
]
colors = ['#3498db', '#2ecc71', '#f39c12']
bars = axes[1, 1].bar(metrics_k3, values_k3, color=colors)
axes[1, 1].set_ylabel('Score')
axes[1, 1].set_title('Metrics Comparison (K=3)')
axes[1, 1].set_ylim(0, 1.1)
axes[1, 1].grid(axis='y', alpha=0.3)

# Add value labels on top of the bars
for bar, value in zip(bars, values_k3):
    height = bar.get_height()
    axes[1, 1].text(bar.get_x() + bar.get_width()/2., height,
                    f'{value:.3f}',
                    ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
# savefig does not create missing directories, and this cell runs before the
# later cell that creates data/processed, so make sure the folder exists.
os.makedirs('data/processed', exist_ok=True)
plt.savefig('data/processed/evaluation_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

print("‚úì Visualization saved to data/processed/evaluation_metrics.png")

## Стъпка 9: Детайлен анализ по категории

In [None]:
# Analysis by product: average the metrics over the test cases that list
# each product in 'relevant_products'.
product_metrics = {}

# The product names are hard-coded; they match the ones used in test_cases.
for product in ['CloudSync Pro', 'DataVault', 'TeamChat', 'ProjectHub']:
    product_results = [
        r for r in evaluation_results 
        if product in r['relevant_products']
    ]

    # Skip products without any test case so no mean is taken over an empty list.
    if product_results:
        product_metrics[product] = calculate_average_metrics(product_results)

# Show the results
print("\nPerformance by Product (K=3):\n")
print("="*80)

for product, metrics in product_metrics.items():
    print(f"\n{product}:")
    print(f"  Precision@3: {metrics['precision@3']:.3f}")
    print(f"  Recall@3:    {metrics['recall@3']:.3f}")
    print(f"  Hit Rate@3:  {metrics['hit_rate@3']:.3f}")
    print(f"  MRR:         {metrics['mrr']:.3f}")

print("\n" + "="*80)

## Стъпка 10: Запазване на резултати

In [None]:
# Create the output directory if it does not exist
os.makedirs('data/processed', exist_ok=True)

# Save the detailed per-query results
# NOTE(review): each entry stores the raw 'distance' values returned by
# rag.search; json.dump needs them to be JSON-serializable - confirm their type.
with open('data/processed/evaluation_results.json', 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, indent=2, ensure_ascii=False)

print("‚úì Detailed results saved to data/processed/evaluation_results.json")

# Save the aggregated metrics (overall and per product)
summary = {
    'overall_metrics': avg_metrics,
    'product_metrics': product_metrics,
    'total_test_cases': len(test_cases),
    'k_values': K_VALUES
}

with open('data/processed/evaluation_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print("‚úì Summary saved to data/processed/evaluation_summary.json")

## Стъпка 11: Създай evaluation функция за по-късна употреба

In [None]:
# Save the evaluation functions as a standalone Python module.
# The string below is written verbatim to src/evaluator.py. Compared with the
# original template, the generated module:
#   - no longer uses a mutable list as the default for k_values,
#   - exposes the recall denominator as total_relevant (default 5, the value
#     that used to be hard-coded),
#   - returns float 0.0 (not int 0) from the precision/recall guards.
evaluator_code = '''"""RAG Evaluation Functions"""

import numpy as np
from typing import List, Dict, Optional

def is_relevant_result(retrieved_metadata: Dict, test_case: Dict) -> bool:
    """Check if retrieved document is relevant"""
    product_match = retrieved_metadata['product'] in test_case['relevant_products']
    category_match = retrieved_metadata['category'] in test_case['relevant_categories']
    return product_match and category_match

def calculate_precision_at_k(retrieved_metadatas: List[Dict], test_case: Dict, k: int) -> float:
    """Precision@K: (# relevant in top K) / K"""
    top_k = retrieved_metadatas[:k]
    relevant_count = sum(1 for meta in top_k if is_relevant_result(meta, test_case))
    return relevant_count / k if k > 0 else 0.0

def calculate_recall_at_k(retrieved_metadatas: List[Dict], test_case: Dict, k: int, total_relevant: int) -> float:
    """Recall@K: (# relevant retrieved) / (total relevant)"""
    top_k = retrieved_metadatas[:k]
    found_count = sum(1 for meta in top_k if is_relevant_result(meta, test_case))
    return found_count / total_relevant if total_relevant > 0 else 0.0

def calculate_hit_rate(retrieved_metadatas: List[Dict], test_case: Dict, k: int) -> float:
    """Hit Rate: Found at least one relevant doc?"""
    top_k = retrieved_metadatas[:k]
    for meta in top_k:
        if is_relevant_result(meta, test_case):
            return 1.0
    return 0.0

def calculate_mrr(retrieved_metadatas: List[Dict], test_case: Dict) -> float:
    """MRR: 1 / (rank of first relevant doc)"""
    for rank, meta in enumerate(retrieved_metadatas, start=1):
        if is_relevant_result(meta, test_case):
            return 1.0 / rank
    return 0.0

def evaluate_rag_system(rag_system, test_cases: List[Dict], k_values: Optional[List[int]] = None, total_relevant: int = 5) -> Dict:
    """Run full evaluation

    k_values defaults to [1, 3, 5]. total_relevant is the assumed number of
    relevant documents per query and is used as the recall denominator.
    """
    # A None sentinel avoids sharing one mutable default list across calls
    if k_values is None:
        k_values = [1, 3, 5]

    results = []
    
    for test_case in test_cases:
        query = test_case['query']
        search_results = rag_system.search(query, n_results=max(k_values))
        retrieved_metadatas = search_results['metadatas']
        
        result = {
            'test_id': test_case['id'],
            'query': query,
        }
        
        for k in k_values:
            result[f'precision@{k}'] = calculate_precision_at_k(retrieved_metadatas, test_case, k)
            result[f'recall@{k}'] = calculate_recall_at_k(retrieved_metadatas, test_case, k, total_relevant)
            result[f'hit_rate@{k}'] = calculate_hit_rate(retrieved_metadatas, test_case, k)
        
        result['mrr'] = calculate_mrr(retrieved_metadatas, test_case)
        results.append(result)
    
    # Calculate averages
    avg_metrics = {}
    for k in k_values:
        avg_metrics[f'precision@{k}'] = np.mean([r[f'precision@{k}'] for r in results])
        avg_metrics[f'recall@{k}'] = np.mean([r[f'recall@{k}'] for r in results])
        avg_metrics[f'hit_rate@{k}'] = np.mean([r[f'hit_rate@{k}'] for r in results])
    avg_metrics['mrr'] = np.mean([r['mrr'] for r in results])
    
    return {
        'detailed_results': results,
        'average_metrics': avg_metrics
    }
'''

# Overwrites any existing src/evaluator.py with the template above.
with open('src/evaluator.py', 'w', encoding='utf-8') as f:
    f.write(evaluator_code)

print("‚úì Evaluator functions saved to src/evaluator.py")

## ✅ Summary

**Какво направихме:**
- ✓ Създадохме 12 test cases с ground truth
- ✓ Изчислихме 4 ключови метрики:
  - Precision@K
  - Recall@K
  - Hit Rate@K
  - MRR
- ✓ Тествахме с K=1, 3, 5
- ✓ Визуализирахме резултатите
- ✓ Анализирахме по продукт
- ✓ Запазихме evaluation functions за re-use

**Интерпретация на резултатите:**
- **Precision**: Колко точни са нашите retrieval резултати
- **Recall**: Колко comprehensive е coverage-а
- **Hit Rate**: Минималната ефективност на системата
- **MRR**: Колко добър е ranking-ът на резултатите

**Следваща стъпка:**
Отвори `04_gradio_dashboard.ipynb` за интерактивен UI! 🎨