# 04 - RAG System Evaluation

This notebook covers:
- Retrieval performance metrics (Recall@K, Precision@K, MRR, NDCG)
- LLM response quality evaluation (BLEU, ROUGE)
- End-to-end system testing
- Embedding quality visualization
- A/B testing different retrieval strategies (planned; not yet implemented in the cells below)

In [None]:
import os
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import pickle

# NLP evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Plot styling: seaborn "whitegrid" theme and a 12x8 default size for every figure
# that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# Paths, relative to the notebook's working directory.
# PROCESSED_DIR is where the cells below read the embeddings, metadata CSV and pickled
# pipeline from, and where they write the generated figures and the summary report.
PROCESSED_DIR = Path('../data/processed')
# INDEX_DIR is defined here but not referenced by any cell shown in this notebook.
INDEX_DIR = Path('../data/processed/indexes')

## 1. Load Test Data and Models

In [None]:
# Load the precomputed embedding matrices and the product metadata table
text_embeddings = np.load(PROCESSED_DIR / 'text_embeddings.npy')
clip_embeddings = np.load(PROCESSED_DIR / 'clip_text_embeddings.npy')
metadata_df = pd.read_csv(PROCESSED_DIR / 'metadata_processed.csv')

# Fail fast if the artifacts are out of sync. Later cells read `.shape[1]` from both
# matrices and label row i of each matrix with row i of metadata_df (t-SNE colours,
# heatmap tick labels), so a mismatch would otherwise surface as a confusing error
# or a silently mislabelled plot much further down.
assert text_embeddings.ndim == 2, f"text_embeddings must be 2-D, got shape {text_embeddings.shape}"
assert clip_embeddings.ndim == 2, f"clip_embeddings must be 2-D, got shape {clip_embeddings.shape}"
assert len(text_embeddings) == len(metadata_df), (
    f"text_embeddings has {len(text_embeddings)} rows but metadata has {len(metadata_df)}"
)
assert len(clip_embeddings) == len(metadata_df), (
    f"clip_embeddings has {len(clip_embeddings)} rows but metadata has {len(metadata_df)}"
)

print(f"Loaded {len(metadata_df)} items")
print(f"Text embeddings: {text_embeddings.shape}")
print(f"CLIP embeddings: {clip_embeddings.shape}")

## 2. Create Test Queries with Ground Truth

In [None]:
# Hand-labelled evaluation set: (query text, ground-truth relevant ids, category tag).
# NOTE(review): the ids are presumably the same identifiers the retriever returns in
# each document's 'id' field - confirm against the indexed data.
_labelled_queries = [
    ('red shirt for casual wear', ['prod_001'], 'clothing'),
    ('comfortable jeans', ['prod_002'], 'clothing'),
    ('white shoes', ['prod_003'], 'footwear'),
]

# Expand the compact tuples into the dict records the evaluation loop consumes
test_queries = [
    {'query': text, 'relevant_ids': expected_ids, 'category': category}
    for text, expected_ids, category in _labelled_queries
]

print(f"Created {len(test_queries)} test queries")
for position, entry in enumerate(test_queries, start=1):
    print(f"{position}. {entry['query']} -> Expected: {entry['relevant_ids']}")

## 3. Retrieval Metrics

In [None]:
def recall_at_k(retrieved_ids: List[str], relevant_ids: List[str], k: int) -> float:
    """
    Recall@K: share of the ground-truth items that show up in the first K results.

    Both sides are treated as sets, so repeated ids count once.
    Returns 0.0 when there are no relevant items to find.
    """
    top_k_pool = set(retrieved_ids[:k])
    ground_truth = set(relevant_ids)

    if not ground_truth:
        return 0.0

    found = top_k_pool & ground_truth
    return len(found) / len(ground_truth)

def precision_at_k(retrieved_ids: List[str], relevant_ids: List[str], k: int) -> float:
    """
    Precision@K: fraction of the first K retrieved items that are relevant.

    The denominator is the number of items actually present in the top-K slice,
    so a result list shorter than K is divided by its own length, not by K.
    Returns 0.0 when that slice is empty.
    """
    top_k = retrieved_ids[:k]
    relevant_lookup = set(relevant_ids)

    if len(top_k) == 0:
        return 0.0

    hit_count = sum(1 for doc_id in top_k if doc_id in relevant_lookup)
    return hit_count / len(top_k)

def mean_reciprocal_rank(retrieved_ids: List[str], relevant_ids: List[str]) -> float:
    """
    Reciprocal rank for a single query: 1 / (1-based rank of the first relevant item).

    Returns 0.0 when no relevant item appears in the retrieved list.
    """
    targets = set(relevant_ids)

    # Rank of the earliest hit, or None when the list contains no relevant item
    first_hit_rank = next(
        (rank for rank, doc_id in enumerate(retrieved_ids, start=1) if doc_id in targets),
        None,
    )

    return 0.0 if first_hit_rank is None else 1.0 / first_hit_rank

def dcg_at_k(retrieved_ids: List[str], relevant_ids: List[str], k: int) -> float:
    """
    Calculate Discounted Cumulative Gain at K with binary relevance.

    A relevant document at 1-based rank r within the top K contributes
    1 / log2(r + 1). If the same relevant id occurs more than once in
    `retrieved_ids`, only its first occurrence is credited, so a duplicated
    result cannot earn gain twice.

    Args:
        retrieved_ids: Document ids in ranked order, best first.
        relevant_ids: Ground-truth relevant ids (any iterable of hashable ids).
        k: Cutoff; only the first k retrieved ids are scored.

    Returns:
        The DCG value as a plain float (0.0 when nothing relevant is in the top K).
    """
    relevant_set = set(relevant_ids)
    credited = set()
    dcg = 0.0

    for i, doc_id in enumerate(retrieved_ids[:k]):
        if doc_id in relevant_set and doc_id not in credited:
            credited.add(doc_id)
            dcg += 1.0 / np.log2(i + 2)  # i is 0-based: rank = i + 1, discount = log2(rank + 1)

    return float(dcg)

def ndcg_at_k(retrieved_ids: List[str], relevant_ids: List[str], k: int) -> float:
    """
    Calculate Normalized Discounted Cumulative Gain at K.

    NDCG = DCG / IDCG, where IDCG is the DCG of the ideal ranking that places
    every distinct relevant item at the top. The result lies in [0, 1].

    Args:
        retrieved_ids: Document ids in ranked order, best first.
        relevant_ids: Ground-truth relevant ids (any iterable of hashable ids).
        k: Cutoff; only the first k positions are scored.

    Returns:
        NDCG@K, or 0.0 when there are no relevant items (IDCG is zero).
    """
    dcg = dcg_at_k(retrieved_ids, relevant_ids, k)

    # Ideal ranking = each distinct relevant id once, in order. De-duplicating stops
    # repeated ground-truth ids from inflating IDCG, and building a fresh list means
    # tuples/sets work as input as well as lists.
    ideal_ranking = list(dict.fromkeys(relevant_ids))
    idcg = dcg_at_k(ideal_ranking, ideal_ranking, k)

    if idcg == 0:
        return 0.0

    return dcg / idcg

print("Retrieval metrics functions defined")

## 4. Evaluate Retrieval Performance

In [None]:
# Load RAG pipeline
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising. Only load
# a rag_pipeline.pkl that this project produced; never a file from an untrusted source.
with open(PROCESSED_DIR / 'rag_pipeline.pkl', 'rb') as f:
    rag_components = pickle.load(f)

retriever = rag_components['retriever']

# Evaluate on test queries
k_values = [1, 3, 5, 10]
max_k = max(k_values)  # retrieval depth follows the largest cutoff instead of a separate literal
results = []
mrr_scores = []  # one reciprocal rank per query; averaged after the loop

print("Evaluating retrieval performance...\n")

for test_query in test_queries:
    query = test_query['query']
    relevant_ids = test_query['relevant_ids']

    # Retrieve once per query, deep enough to score every cutoff in k_values
    retrieved_docs = retriever.retrieve(query, k=max_k)
    retrieved_ids = [doc['id'] for doc in retrieved_docs]

    # Calculate metrics for different K values (one result row per query/K pair)
    for k in k_values:
        results.append({
            'query': query,
            'k': k,
            'recall': recall_at_k(retrieved_ids, relevant_ids, k),
            'precision': precision_at_k(retrieved_ids, relevant_ids, k),
            'ndcg': ndcg_at_k(retrieved_ids, relevant_ids, k)
        })

    # Reciprocal rank of the first relevant hit (independent of K)
    mrr = mean_reciprocal_rank(retrieved_ids, relevant_ids)
    mrr_scores.append(mrr)

    print(f"Query: {query}")
    print(f"  MRR: {mrr:.4f}")
    print(f"  Recall@5: {recall_at_k(retrieved_ids, relevant_ids, 5):.4f}")
    print(f"  NDCG@5: {ndcg_at_k(retrieved_ids, relevant_ids, 5):.4f}")
    print()

# Create results dataframe
results_df = pd.DataFrame(results)

# Calculate average metrics: mean over queries for each cutoff K
avg_metrics = results_df.groupby('k')[['recall', 'precision', 'ndcg']].mean()
# MRR proper is the mean of the per-query reciprocal ranks
mean_mrr = float(np.mean(mrr_scores)) if mrr_scores else 0.0

print("\nAverage Metrics Across All Queries:")
print(avg_metrics)
print(f"\nMean Reciprocal Rank (MRR): {mean_mrr:.4f}")

In [None]:
# Bar charts of the averaged retrieval metrics: one panel per metric, one bar per K
metrics = ['recall', 'precision', 'ndcg']
titles = ['Recall@K', 'Precision@K', 'NDCG@K']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel_idx, metric in enumerate(metrics):
    ax = axes[panel_idx]
    scores = avg_metrics[metric]

    scores.plot(kind='bar', ax=ax, color='steelblue')
    ax.set_title(titles[panel_idx], fontsize=14, fontweight='bold')
    ax.set_xlabel('K', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    ax.set_ylim(0, 1.1)  # headroom above 1.0 so the value labels fit
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above its top edge
    for bar_pos, score in enumerate(scores):
        ax.text(bar_pos, score + 0.02, f'{score:.3f}', ha='center', fontsize=10)

fig.tight_layout()
fig.savefig(PROCESSED_DIR / 'retrieval_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nMetrics visualization saved")

## 5. LLM Response Quality Evaluation

In [None]:
# Reference ("gold") answers the generated responses are scored against:
# (question, reference answer) pairs, expanded into dict records below.
_reference_qa = [
    (
        'What red clothing items do you have?',
        'We have a comfortable red cotton t-shirt with a round neck that is perfect for casual wear.',
    ),
    (
        'Do you have any jeans?',
        'Yes, we have classic blue denim jeans with a regular fit.',
    ),
]
test_qa_pairs = [
    {'question': question_text, 'reference_answer': answer_text}
    for question_text, answer_text in _reference_qa
]

# Scorers shared by evaluate_response: ROUGE-1/2/L with stemming, and NLTK's
# method1 smoothing for sentence-level BLEU.
_rouge_variants = ['rouge1', 'rouge2', 'rougeL']
rouge = rouge_scorer.RougeScorer(_rouge_variants, use_stemmer=True)
smoothing = SmoothingFunction().method1

def evaluate_response(generated: str, reference: str) -> Dict[str, float]:
    """
    Score a generated answer against a single reference answer.

    Returns a dict with sentence-level BLEU ('bleu') and the F-measures of
    ROUGE-1, ROUGE-2 and ROUGE-L ('rouge1_f', 'rouge2_f', 'rougeL_f').
    Uses the module-level `rouge` scorer and `smoothing` function.
    """
    # ROUGE: the scorer is called as score(reference, generated)
    rouge_result = rouge.score(reference, generated)

    # BLEU: lower-cased, whitespace-split tokens, one reference, smoothed
    candidate_tokens = generated.lower().split()
    reference_token_lists = [reference.lower().split()]
    bleu_value = sentence_bleu(
        reference_token_lists,
        candidate_tokens,
        smoothing_function=smoothing,
    )

    scores: Dict[str, float] = {}
    scores['bleu'] = bleu_value
    scores['rouge1_f'] = rouge_result['rouge1'].fmeasure
    scores['rouge2_f'] = rouge_result['rouge2'].fmeasure
    scores['rougeL_f'] = rouge_result['rougeL'].fmeasure
    return scores

print("Response evaluation functions defined")

In [None]:
# Optional: score generated answers end to end (disabled by default).
# Note: This requires running the RAG pipeline which needs API keys
# Uncomment and run if you have the API configured
# NOTE(review): the import path below is unverified from this notebook - confirm that
# `rag_query` is importable this way (or define it in this notebook) before enabling.
# When enabled, each question in test_qa_pairs is answered with rag_query, scored with
# evaluate_response, and the per-metric means are printed.

# from notebooks.notebook_03_rag_pipeline import rag_query

# response_scores = []

# for qa_pair in test_qa_pairs:
#     question = qa_pair['question']
#     reference = qa_pair['reference_answer']
    
#     # Generate response
#     result = rag_query(question, k=3)
#     generated = result['answer']
    
#     # Evaluate
#     scores = evaluate_response(generated, reference)
#     scores['question'] = question
#     response_scores.append(scores)
    
#     print(f"Question: {question}")
#     print(f"Generated: {generated[:100]}...")
#     print(f"Scores: {scores}")
#     print()

# # Create summary
# scores_df = pd.DataFrame(response_scores)
# print("Average Response Quality Scores:")
# print(scores_df[['bleu', 'rouge1_f', 'rouge2_f', 'rougeL_f']].mean())

# With the block above commented out, this cell only prints the status line below.
print("Response evaluation setup complete (uncomment to run with API)")

## 6. Embedding Quality Visualization

In [None]:
# Visualize embedding space with t-SNE
def visualize_embedding_space(embeddings, labels, title, save_path=None):
    """
    Project high-dimensional embeddings to 2-D with t-SNE and plot them coloured by label.

    Args:
        embeddings: 2-D array-like, one row per item; at least 2 rows are required.
        labels: One label per embedding row, in the same order. Missing labels
            (NaN/None) are grouped under 'unknown'.
        title: Plot title.
        save_path: Optional path; when given, the figure is also written there.

    Raises:
        ValueError: If fewer than 2 embeddings are given, or if `labels` and
            `embeddings` differ in length.
    """
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError(f"t-SNE needs at least 2 embeddings, got {n_samples}")

    # NaN never compares equal to itself, so unlabeled rows would match no mask and
    # vanish from the plot; give them an explicit bucket instead.
    label_series = pd.Series(list(labels), dtype=object)
    label_series = label_series.where(label_series.notna(), 'unknown')
    if len(label_series) != n_samples:
        raise ValueError(
            f"Got {len(label_series)} labels for {n_samples} embeddings; they must align row by row"
        )

    print(f"Running t-SNE on {n_samples} embeddings...")

    # Run t-SNE (scikit-learn requires perplexity < n_samples, hence the cap)
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_samples - 1))
    embeddings_2d = tsne.fit_transform(embeddings)

    # Create plot
    fig, ax = plt.subplots(figsize=(12, 8))

    # Order labels by first appearance: unlike iterating a set of strings, this is
    # identical on every run, so each label keeps the same colour and legend slot.
    unique_labels = list(label_series.unique())
    # NOTE: tab10 has 10 distinct colours; with more labels some colours repeat.
    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))
    label_to_color = dict(zip(unique_labels, colors))

    for label in unique_labels:
        mask = (label_series == label).to_numpy()
        ax.scatter(
            embeddings_2d[mask, 0],
            embeddings_2d[mask, 1],
            c=[label_to_color[label]],
            label=label,
            s=100,
            alpha=0.7,
            edgecolors='black',
            linewidth=0.5
        )

    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xlabel('t-SNE Dimension 1', fontsize=12)
    ax.set_ylabel('t-SNE Dimension 2', fontsize=12)
    ax.legend(fontsize=10, loc='best')
    ax.grid(alpha=0.3)
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Visualize text embeddings (skipped when there are fewer than two of them)
if len(text_embeddings) > 1:
    visualize_embedding_space(
        text_embeddings,
        metadata_df['category'].tolist(),
        'Text Embeddings Space (OpenAI)',
        save_path=PROCESSED_DIR / 'text_embeddings_tsne.png'
    )

In [None]:
# Visualize CLIP embeddings
# Same t-SNE view as for the text embeddings, coloured by metadata_df['category'];
# the figure is saved as clip_embeddings_tsne.png under PROCESSED_DIR.
# Skipped when there are fewer than two embeddings.
if len(clip_embeddings) > 1:
    visualize_embedding_space(
        clip_embeddings,
        metadata_df['category'].tolist(),
        'CLIP Embeddings Space (Multimodal)',
        save_path=PROCESSED_DIR / 'clip_embeddings_tsne.png'
    )

## 7. Similarity Matrix Heatmap

In [None]:
# Pairwise cosine similarity between all text embeddings (one row/column per item)
similarity_matrix = cosine_similarity(text_embeddings)
product_titles = metadata_df['title'].tolist()

# Annotated heatmap: colour scale pinned to [0, 1] and centred on 0.5,
# with the product titles as tick labels on both axes.
fig, ax = plt.subplots(figsize=(10, 8))
heatmap_options = dict(
    xticklabels=product_titles,
    yticklabels=product_titles,
    cmap='coolwarm',
    center=0.5,
    vmin=0,
    vmax=1,
    annot=True,
    fmt='.2f',
    square=True,
    cbar_kws={'label': 'Cosine Similarity'},
)
sns.heatmap(similarity_matrix, ax=ax, **heatmap_options)

ax.set_title('Product Embedding Similarity Matrix', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Products', fontsize=12)
ax.set_ylabel('Products', fontsize=12)
# Slant the x labels so long titles do not overlap; keep the y labels horizontal
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
fig.savefig(PROCESSED_DIR / 'similarity_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Similarity matrix saved")

## 8. Performance Summary Report

In [None]:
# Generate evaluation summary
# The report reads the K=5 and K=10 rows of avg_metrics, so both cutoffs must be in
# k_values (avg_metrics.loc raises KeyError otherwise).
# The "Key Findings" state only what was actually computed: earlier wording called the
# ranking "good" and the clustering "clear" no matter what the numbers or plots showed.
summary = f"""
=================================================================
MULTIMODAL RAG SYSTEM - EVALUATION SUMMARY
=================================================================

Dataset Statistics:
-------------------
Total items: {len(metadata_df)}
Text embedding dimension: {text_embeddings.shape[1]}
CLIP embedding dimension: {clip_embeddings.shape[1]}

Retrieval Performance:
----------------------
Recall@5:    {avg_metrics.loc[5, 'recall']:.4f}
Precision@5: {avg_metrics.loc[5, 'precision']:.4f}
NDCG@5:      {avg_metrics.loc[5, 'ndcg']:.4f}

Recall@10:   {avg_metrics.loc[10, 'recall']:.4f}
Precision@10: {avg_metrics.loc[10, 'precision']:.4f}
NDCG@10:     {avg_metrics.loc[10, 'ndcg']:.4f}

Key Findings:
-------------
1. The system achieves {avg_metrics.loc[5, 'recall']:.1%} recall at top-5 results
2. NDCG@5 is {avg_metrics.loc[5, 'ndcg']:.4f} on a 0-1 scale (1.0 = ideal ranking)
3. Embedding clustering was inspected visually (t-SNE plots); it is not scored quantitatively

Recommendations:
----------------
- Consider implementing reranking for improved precision
- Test with larger, more diverse dataset
- Experiment with hybrid text+image retrieval
- Add user feedback loop for continuous improvement

=================================================================
"""

print(summary)

# Save summary (explicit encoding so the file does not depend on the platform default)
with open(PROCESSED_DIR / 'evaluation_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print("Evaluation summary saved to evaluation_summary.txt")

## Summary

In this notebook, we:
1. Implemented comprehensive retrieval metrics (Recall@K, Precision@K, NDCG, MRR)
2. Set up LLM response quality evaluation using BLEU and ROUGE scores (the scoring run itself is commented out until API access is configured)
3. Visualized embedding space quality using t-SNE
4. Created similarity matrices to understand item relationships
5. Generated a comprehensive evaluation report

Key Metrics:
- Recall@5: Measures coverage of relevant results
- NDCG@5: Measures ranking quality
- BLEU/ROUGE: Measures response quality

Next steps: Backend API implementation and Frontend development