# RAG Evaluation Analysis
**Production-quality evaluation dashboard for the FAISS + Gemini RAG pipeline.**

Sections:
1. Install & Import Dependencies
2. Configuration
3. Load Evaluation Results from SQLite (synthetic fallback for offline demo)
4. Faithfulness Score Distribution Histogram
5. Query Route Assignment Pie Chart
6. Retrieval Metrics (Recall@k, MRR, NDCG@k)
7. Latency & Cost Breakdown
8. Top-10 Failure Analysis (worst faithfulness)
9. Top-3 Failure Pattern Clustering (via Gemini-Flash)
10. Experiment Comparison Table


In [None]:
# ── 1. Install & Import Dependencies ──────────────────────────────────────
# Uncomment the line below if running for the first time
# !pip install faiss-cpu sentence-transformers google-generativeai pandas matplotlib seaborn numpy

import sys, os, json, sqlite3, csv, time, logging, math, uuid, warnings
from pathlib import Path
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")
# Root logger only reports WARNING and above.
logging.basicConfig(level=logging.WARNING)

# Add project root to path so we can import rag.*
# (the parent of the current working directory is put first on sys.path)
sys.path.insert(0, str(Path.cwd().parent))

# Shared seaborn look for every figure drawn below.
sns.set_theme(style="whitegrid", palette="muted")
print("✓ Imports OK")


In [None]:
# ── 2. Configuration ──────────────────────────────────────────────────────
@dataclass
class Config:
    """Central settings for this notebook: model identifiers and file locations."""

    # Gemini model names.
    gemini_flash_model: str = "gemini-1.5-flash"
    gemini_pro_model: str = "gemini-1.5-pro"

    # Embedding model name and retrieval depth.
    embedding_model: str = "all-mpnet-base-v2"
    k_retrieved: int = 5

    # Input artefacts; relative paths resolve against the working directory.
    db_path: str = "../evaluations.db"
    experiments_dir: str = "../experiments"
    routing_log: str = "../routing_log.jsonl"

# Single shared settings instance used by the cells below.
cfg = Config()

# Load Gemini API key (set GEMINI_API_KEY as an env var before running)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
# Empty key means "no Gemini"; later cells branch on this flag.
_gemini_available = bool(GEMINI_API_KEY)

if _gemini_available:
    try:
        # Imported lazily so the notebook still runs without the SDK installed.
        import google.generativeai as genai
        genai.configure(api_key=GEMINI_API_KEY)
        # flash_llm / pro_llm only exist when this branch succeeds; callers
        # must check _gemini_available before using them.
        flash_llm = genai.GenerativeModel(cfg.gemini_flash_model)
        pro_llm   = genai.GenerativeModel(cfg.gemini_pro_model)
        print(f"✓ Gemini initialised  (flash={cfg.gemini_flash_model}, pro={cfg.gemini_pro_model})")
    except Exception as e:
        # Any import/configuration failure downgrades to the offline path
        # instead of stopping the notebook.
        _gemini_available = False
        print(f"⚠  Gemini init failed: {e}. Pattern analysis will be skipped.")
else:
    print("⚠  GEMINI_API_KEY not set. Gemini-dependent cells will use cached/synthetic data.")


In [None]:
# ── 3. Load Evaluation Results ────────────────────────────────────────────
# Tries to load from real SQLite DB + routing JSONL.
# Falls back to realistic synthetic data so the notebook is always runnable.

def load_evaluation_data(db_path: str) -> Optional[pd.DataFrame]:
    """Load evaluation rows from SQLite → DataFrame.

    Args:
        db_path: Path to a SQLite file containing an ``evaluations`` table.

    Returns:
        Every row of ``evaluations`` ordered by ``timestamp``, or ``None``
        when the file is missing, the query fails, or the table is empty.
        ``None`` tells the caller to fall back to synthetic data.
    """
    # sqlite3.connect() creates an empty database file when the path does not
    # exist, so check first rather than leaving a stray file behind.
    if not Path(db_path).is_file():
        print(f"  SQLite load failed (no database file at {db_path}). Generating synthetic data.")
        return None

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        df = pd.read_sql_query("SELECT * FROM evaluations ORDER BY timestamp", conn)
    except Exception as e:
        # Best-effort loader: any failure here means "use synthetic data".
        print(f"  SQLite load failed ({e}). Generating synthetic data.")
        return None
    finally:
        # Close the handle even when the query raises.
        if conn is not None:
            conn.close()

    if df.empty:
        print(f"  No evaluation rows found in {db_path}. Generating synthetic data.")
        return None

    print(f"✓ Loaded {len(df)} real evaluation rows from {db_path}")
    return df


def load_routing_data(log_path: str) -> Optional[pd.DataFrame]:
    """Load routing decisions from JSONL → DataFrame.

    Blank lines are ignored. Lines that are not valid JSON objects (for
    example a trailing line truncated mid-write) are skipped and counted
    instead of discarding the whole log.

    Args:
        log_path: Path to a UTF-8 JSON-Lines file, one decision per line.

    Returns:
        One row per parsed record, or ``None`` when the file cannot be read
        or contains no usable records. ``None`` tells the caller to fall back
        to synthetic routing data.
    """
    records = []
    skipped = 0
    try:
        with open(log_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    skipped += 1
                    continue
                # Only JSON objects map onto DataFrame rows.
                if isinstance(record, dict):
                    records.append(record)
                else:
                    skipped += 1
        if skipped:
            print(f"  Skipped {skipped} malformed routing line(s) in {log_path}.")
        if records:
            print(f"✓ Loaded {len(records)} routing decisions from {log_path}")
            return pd.DataFrame(records)
    except Exception as e:
        # Best-effort loader: unreadable file → synthetic fallback.
        print(f"  Routing log load failed ({e}). Using synthetic routing data.")
    return None


def _synthetic_evals(n: int = 120, seed: int = 42) -> pd.DataFrame:
    """Generate realistic synthetic evaluation rows for demo."""
    rng = np.random.default_rng(seed)
    routes = rng.choice(
        ["RAG_RETRIEVAL", "DIRECT_LLM", "HYBRID"],
        size=n,
        p=[0.58, 0.25, 0.17],
    )
    # RAG queries tend to have lower faithfulness (harder)
    base_faith = np.where(routes == "DIRECT_LLM", 0.88, np.where(routes == "HYBRID", 0.75, 0.72))
    faithfulness = np.clip(rng.normal(base_faith, 0.12), 0, 1)
    answer_relevance = np.clip(rng.normal(0.82, 0.11), 0, 1)
    context_precision = np.clip(rng.normal(0.74, 0.15), 0, 1)
    context_recall = np.clip(rng.normal(0.69, 0.14), 0, 1)

    sample_queries = [
        "Recommend running shoes under 3000 rupees",
        "Best wireless headphones for gym",
        "What is machine learning?",
        "Is our return policy competitive?",
        "Show me gaming laptops with RTX GPU",
        "What is RAG in NLP?",
        "Compare our laptop warranty to industry standard",
        "Noise cancelling earbuds under 2000",
        "What is gradient descent?",
        "Best rated products this month",
    ] * (n // 10 + 1)

    return pd.DataFrame({
        "query_id": [str(uuid.uuid4()) for _ in range(n)],
        "query": sample_queries[:n],
        "response": ["[synthetic response]"] * n,
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "context_precision": context_precision,
        "context_recall": context_recall,
        "judge_model": ["gemini-1.5-pro"] * n,
        "timestamp": np.linspace(time.time() - 7200, time.time(), n),
        "latency_ms": rng.normal(2800, 400, n),
        "route": routes,
        "routing_method": rng.choice(["embedding", "llm"], size=n, p=[0.72, 0.28]),
        "routing_confidence": np.clip(rng.normal(0.82, 0.10), 0.4, 1.0),
        "retrieval_latency_ms": rng.normal(28, 8, n),
        "generation_latency_ms": rng.normal(900, 150, n),
        "evaluation_latency_ms": rng.normal(2100, 300, n),
        "estimated_cost_usd": rng.uniform(0.005, 0.015, n),
        "input_tokens": rng.integers(600, 1400, n),
        "output_tokens": rng.integers(80, 250, n),
        "unsupported_claims": [
            json.dumps(["Hallucinated price", "Unverified brand claim"]
                      if f < 0.5 else (["Minor unsupported claim"] if f < 0.75 else []))
            for f in faithfulness
        ],
        "missing_facts": [json.dumps([]) for _ in range(n)],
        "relevance_reasoning": ["[synthetic reasoning]"] * n,
    })


def _synthetic_routing(n: int = 120, seed: int = 42) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    routes = rng.choice(
        ["RAG_RETRIEVAL", "DIRECT_LLM", "HYBRID"],
        size=n,
        p=[0.58, 0.25, 0.17],
    )
    return pd.DataFrame({
        "route": routes,
        "method": rng.choice(["embedding", "llm"], size=n, p=[0.72, 0.28]),
        "confidence": np.clip(rng.normal(0.82, 0.10, n), 0.4, 1.0),
        "latency_ms": np.where(
            rng.choice(["embedding", "llm"], size=n, p=[0.72, 0.28]) == "embedding",
            rng.normal(6, 2, n),
            rng.normal(210, 40, n),
        ),
    })


# --- Load or generate data ---
# Each loader returns None on failure, which triggers the synthetic fallback.
df_eval = load_evaluation_data(cfg.db_path)
if df_eval is None:
    df_eval = _synthetic_evals()
    print(f"  Using {len(df_eval)} synthetic evaluation rows.")

df_route = load_routing_data(cfg.routing_log)
if df_route is None:
    # Match the evaluation row count so positional joins line up.
    df_route = _synthetic_routing(len(df_eval))
    print(f"  Using {len(df_route)} synthetic routing rows.")


def _parse_json_list(value):
    """Decode one cell of a JSON-encoded list column.

    Strings are parsed as JSON; a string that is not valid JSON is kept
    verbatim (later cells already cope with plain strings) instead of
    aborting the whole notebook. ``None`` and NaN become an empty list.
    Anything else is returned unchanged.
    """
    if isinstance(value, str):
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return value
    if value is None:
        return []
    # SQL NULLs arrive as float NaN, which is truthy and not a list.
    if isinstance(value, float) and math.isnan(value):
        return []
    return value


# Parse JSON columns if loaded from SQLite
for col in ["unsupported_claims", "missing_facts"]:
    if col in df_eval.columns:
        df_eval[col] = df_eval[col].apply(_parse_json_list)

df_eval.head(3)


## 4. Faithfulness Score Distribution

Faithfulness measures what fraction of the generated response's claims are **directly supported by retrieved context**.  
A score of 1.0 means every claim is grounded; 0.0 means the response is entirely hallucinated.

**Thresholds used:**
- ≥ 0.80 → High quality (green zone)
- 0.50 – 0.80 → Medium quality (amber zone)
- < 0.50 → Low quality / hallucination risk (red zone)


In [None]:
# Histogram of faithfulness scores with quality-zone shading, optional KDE,
# mean/median markers and the share of rows in each zone.
fig, ax = plt.subplots(figsize=(10, 5))

scores = df_eval["faithfulness"].dropna()
mean_f = scores.mean()
median_f = scores.median()

# Coloured zone backgrounds ("_nolegend_" keeps them out of the legend)
ax.axvspan(0.0, 0.5,  alpha=0.08, color="red",    label="_nolegend_")
ax.axvspan(0.5, 0.8,  alpha=0.08, color="orange",  label="_nolegend_")
ax.axvspan(0.8, 1.0,  alpha=0.08, color="green",   label="_nolegend_")

# Histogram
n_bins = 25
ax.hist(scores, bins=n_bins, color="#4C72B0", edgecolor="white", linewidth=0.6,
        alpha=0.85, density=True, label="Frequency (density)")

# KDE overlay
try:
    from scipy.stats import gaussian_kde
    kde = gaussian_kde(scores, bw_method=0.25)
    x_line = np.linspace(0, 1, 300)
    ax.plot(x_line, kde(x_line), color="#DD8452", linewidth=2.5, label="KDE")
except ImportError:
    pass  # scipy optional
except (ValueError, np.linalg.LinAlgError) as e:
    # gaussian_kde cannot fit a single point or a zero-variance sample.
    # The overlay is decorative, so skip it rather than abort the cell.
    print(f"  KDE overlay skipped ({e}).")

# Mean / Median lines
ax.axvline(mean_f,   color="#C44E52", linewidth=2, linestyle="--", label=f"Mean  = {mean_f:.3f}")
ax.axvline(median_f, color="#55A868", linewidth=2, linestyle=":",  label=f"Median = {median_f:.3f}")

# Zone annotations
pct_high   = (scores >= 0.8).mean() * 100
pct_medium = ((scores >= 0.5) & (scores < 0.8)).mean() * 100
pct_low    = (scores < 0.5).mean() * 100

# Labels sit just below the top of the axes; 2.5 is a fallback height for a
# degenerate y-range.
y_top = ax.get_ylim()[1]
label_y = y_top * 0.92 if y_top > 0 else 2.5
for x_pos, label, color in [
    (0.25, f"Low\n{pct_low:.1f}%",       "red"),
    (0.65, f"Medium\n{pct_medium:.1f}%", "darkorange"),
    (0.90, f"High\n{pct_high:.1f}%",     "darkgreen"),
]:
    ax.text(x_pos, label_y, label, ha="center", fontsize=11, color=color, fontweight="bold")

ax.set_xlabel("Faithfulness Score", fontsize=13)
ax.set_ylabel("Density", fontsize=13)
ax.set_title("Faithfulness Score Distribution\n(LLM-as-Judge, Gemini-1.5-Pro)", fontsize=14)
ax.set_xlim(0, 1)
ax.legend(fontsize=10)
plt.tight_layout()
plt.savefig("faithfulness_distribution.png", dpi=150, bbox_inches="tight")
plt.show()

print("\nSummary statistics:")
print(scores.describe().round(4).to_string())


## 5. Query Route Assignment

The `QueryRouter` classifies each query into one of three routes before retrieval.  
The pie chart on the left shows the **distribution of routes**, while the bar chart on the right  
shows how **mean faithfulness varies by route** — a key signal for whether routing improves quality.


In [None]:
# Left: share of queries per route. Right: mean faithfulness per route.
route_order = ["RAG_RETRIEVAL", "DIRECT_LLM", "HYBRID"]
colors = ["#4C72B0", "#55A868", "#DD8452"]
# Key colours by route name so a route keeps its colour in both panels even
# when another route is absent from the data.
route_colors = dict(zip(route_order, colors))

route_col = "route" if "route" in df_eval.columns else None
if route_col is None and "route" in df_route.columns:
    # Merge routing data into eval df on index (best-effort).
    # reindex() truncates a longer routing log and pads a shorter one with
    # NaN, so a length mismatch cannot raise.
    df_eval = df_eval.copy()
    aligned = pd.Series(df_route["route"].to_numpy()).reindex(range(len(df_eval)))
    df_eval["route"] = aligned.to_numpy()
    route_col = "route"

if route_col is None:
    raise KeyError("No 'route' column in df_eval or df_route; cannot plot routing analysis.")

route_counts = df_eval[route_col].value_counts()
# Keep only known routes, in a fixed display order.
route_counts = route_counts.reindex(
    [r for r in route_order if r in route_counts.index], fill_value=0
)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# --- Pie chart ---
wedges, texts, autotexts = axes[0].pie(
    route_counts.values,
    labels=route_counts.index,
    autopct="%1.1f%%",
    colors=[route_colors[r] for r in route_counts.index],
    startangle=90,
    pctdistance=0.75,
    wedgeprops={"edgecolor": "white", "linewidth": 2},
)
for at in autotexts:
    at.set_fontsize(11)
    at.set_fontweight("bold")
axes[0].set_title("Query Route Distribution", fontsize=13, fontweight="bold")

# Add routing method annotation.
# Drawn as a text box, not a legend: legend(labels) would pair these method
# counts with the pie wedges' colour handles, which represent routes.
if "method" in df_route.columns:
    method_counts = df_route["method"].value_counts()
    legend_labels = [f"{m}: {c} ({c/len(df_route)*100:.0f}%)"
                     for m, c in method_counts.items()]
    axes[0].text(
        0.0, 0.0, "Routing method\n" + "\n".join(legend_labels),
        transform=axes[0].transAxes, ha="left", va="bottom", fontsize=9,
        bbox={"boxstyle": "round", "facecolor": "white",
              "edgecolor": "lightgray", "alpha": 0.9},
    )

# --- Bar: mean faithfulness per route ---
if route_col and "faithfulness" in df_eval.columns:
    faith_by_route = (
        df_eval.groupby(route_col)["faithfulness"]
        .agg(["mean", "std", "count"])
        .reindex([r for r in route_order if r in df_eval[route_col].unique()])
    )
    bars = axes[1].bar(
        faith_by_route.index,
        faith_by_route["mean"],
        color=[route_colors[r] for r in faith_by_route.index],
        edgecolor="white",
        linewidth=1.5,
        alpha=0.85,
        yerr=faith_by_route["std"],
        capsize=5,
    )
    # Value label above each bar: mean and sample size.
    for bar, (_, row) in zip(bars, faith_by_route.iterrows()):
        axes[1].text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + 0.02,
            f"{row['mean']:.3f}\n(n={int(row['count'])})",
            ha="center", va="bottom", fontsize=10, fontweight="bold"
        )
    axes[1].set_ylim(0, 1.15)
    axes[1].set_ylabel("Mean Faithfulness ± std", fontsize=12)
    axes[1].set_xlabel("Route", fontsize=12)
    axes[1].set_title("Mean Faithfulness by Route", fontsize=13, fontweight="bold")
    axes[1].axhline(df_eval["faithfulness"].mean(), color="gray", linestyle="--",
                    linewidth=1.2, label=f"Overall mean = {df_eval['faithfulness'].mean():.3f}")
    axes[1].legend(fontsize=10)
else:
    axes[1].text(0.5, 0.5, "No faithfulness data to plot",
                 ha="center", va="center", fontsize=12, transform=axes[1].transAxes)

plt.suptitle("Query Routing Analysis", fontsize=15, fontweight="bold", y=1.02)
plt.tight_layout()
plt.savefig("routing_distribution.png", dpi=150, bbox_inches="tight")
plt.show()

print("\nRoute counts:")
print(route_counts.to_string())


## 6. Retrieval Metrics — Recall@k, MRR, NDCG@k

If an experiments directory exists, load the aggregated metrics from the latest run.  
Otherwise, display a sample results table using the current e-commerce baseline metrics  
(Recall@5 = 0.85, MRR = 0.82 as measured by the existing evaluation layer).


In [None]:
def _load_experiment_metrics(experiments_dir: str) -> Optional[dict]:
    """Try loading the most recently modified experiment's results.json."""
    base = Path(experiments_dir)
    if not base.exists():
        return None
    results_files = sorted(base.glob("*/results.json"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not results_files:
        return None
    with open(results_files[0]) as f:
        return json.load(f)

# Pull aggregated retrieval metrics from the newest experiment, or fall back
# to the hard-coded baseline, then show them as a table and grouped bar chart.
exp_metrics = _load_experiment_metrics(cfg.experiments_dir)

if exp_metrics:
    print(f"✓ Loaded metrics from experiment: {exp_metrics.get('experiment_id', '?')}")
    # NOTE(review): a real experiment that lacks "mrr" is silently shown with
    # the 0.82 baseline value — confirm this is intended.
    mrr_val = exp_metrics.get("mrr", 0.82)
    recall  = exp_metrics.get("mean_recall", {})
    ndcg    = exp_metrics.get("mean_ndcg", {})
else:
    print("  No experiments found. Using sample metrics from existing baseline.")
    # Your existing baseline numbers
    mrr_val = 0.82
    recall  = {"1": 0.62, "3": 0.78, "5": 0.85, "10": 0.91}
    ndcg    = {"1": 0.62, "3": 0.72, "5": 0.80,  "10": 0.85}

# Normalise key type
# (JSON object keys are strings; convert to int so k sorts numerically)
recall = {int(k): v for k, v in recall.items()}
ndcg   = {int(k): v for k, v in ndcg.items()}

# Union of every k reported by either metric.
k_vals = sorted(set(recall) | set(ndcg))

# ── Table ──────────────────────────────────────────────────────────────────
# A k present for only one metric is filled with 0 for the other.
metrics_df = pd.DataFrame({
    "k":          k_vals,
    "Recall@k":   [recall.get(k, 0) for k in k_vals],
    "NDCG@k":     [ndcg.get(k, 0)   for k in k_vals],
}).set_index("k")

print(f"\nMRR = {mrr_val:.4f}\n")
print(metrics_df.round(4).to_string())

# ── Bar chart ─────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(9, 4.5))
x = np.arange(len(k_vals))
width = 0.35

# Two bars per k, offset half a bar-width either side of the tick.
bars1 = ax.bar(x - width/2, metrics_df["Recall@k"], width,
               label="Recall@k", color="#4C72B0", alpha=0.85, edgecolor="white")
bars2 = ax.bar(x + width/2, metrics_df["NDCG@k"],   width,
               label="NDCG@k",   color="#55A868", alpha=0.85, edgecolor="white")

# MRR does not depend on k, so it is drawn as a horizontal reference line.
ax.axhline(mrr_val, color="#DD8452", linestyle="--", linewidth=2,
           label=f"MRR = {mrr_val:.4f}")

# Numeric label above every bar.
for bar in list(bars1) + list(bars2):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
            f"{bar.get_height():.3f}", ha="center", va="bottom", fontsize=9)

ax.set_xticks(x)
ax.set_xticklabels([f"k={k}" for k in k_vals], fontsize=11)
# Headroom above 1.0 so the bar labels are not clipped.
ax.set_ylim(0, 1.12)
ax.set_ylabel("Score", fontsize=12)
ax.set_title("Retrieval Metrics: Recall@k and NDCG@k", fontsize=13, fontweight="bold")
ax.legend(fontsize=10)
plt.tight_layout()
plt.savefig("retrieval_metrics.png", dpi=150, bbox_inches="tight")
plt.show()


## 7. Latency & Cost Breakdown

Understanding where time is spent and what each evaluation costs per query helps  
decide where to optimise (e.g., caching, smaller judge model, batch API calls).


In [None]:
# Left: mean latency per pipeline stage, stacked. Right: per-query cost histogram.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# ── Latency stacked bar (per query, mean) ─────────────────────────────────
latency_cols = {
    "retrieval_latency_ms":  "Retrieval",
    "generation_latency_ms": "Generation",
    "evaluation_latency_ms": "LLM Judge",
}
# Only plot the stages that the loaded data actually contains.
present_latency = {k: v for k, v in latency_cols.items() if k in df_eval.columns}

if present_latency:
    means = {label: df_eval[col].mean() for col, label in present_latency.items()}
    pal = ["#4C72B0", "#55A868", "#DD8452"][:len(means)]

    # `bottom` tracks the running stack height.
    bottom = 0
    for (label, val), color in zip(means.items(), pal):
        axes[0].bar("Mean per query", val, bottom=bottom, color=color, label=label,
                    edgecolor="white", linewidth=1.5, alpha=0.88)
        axes[0].text(0, bottom + val / 2, f"{val:.0f} ms", ha="center",
                     va="center", fontsize=11, color="white", fontweight="bold")
        bottom += val

    axes[0].set_ylabel("Latency (ms)", fontsize=12)
    axes[0].set_title("Mean Latency Breakdown (per query)", fontsize=12, fontweight="bold")
    axes[0].legend(loc="upper right", fontsize=10)
    axes[0].set_ylim(0, bottom * 1.15)
else:
    axes[0].text(0.5, 0.5, "No latency columns in data", ha="center", va="center",
                 fontsize=12, transform=axes[0].transAxes)

# ── Cost distribution ─────────────────────────────────────────────────────
if "estimated_cost_usd" in df_eval.columns:
    cost_data = df_eval["estimated_cost_usd"].dropna() * 100  # cents
    axes[1].hist(cost_data, bins=20, color="#4C72B0", edgecolor="white",
                 linewidth=0.6, alpha=0.85)
    axes[1].axvline(cost_data.mean(), color="#C44E52", linewidth=2, linestyle="--",
                    label=f"Mean = {cost_data.mean():.3f} ¢")
    axes[1].set_xlabel("Estimated Cost per Query (¢ USD)", fontsize=12)
    axes[1].set_ylabel("Count", fontsize=12)
    axes[1].set_title("Cost Distribution (Judge API calls)", fontsize=12, fontweight="bold")
    axes[1].legend(fontsize=10)

    total_cost = df_eval["estimated_cost_usd"].sum()
    print(f"\nTotal estimated cost for {len(df_eval)} queries: ${total_cost:.4f} USD")
    # Check each token column on its own: a table that has input_tokens but
    # no output_tokens must not raise KeyError.
    for token_col, token_label in (
        ("input_tokens",  "Total input tokens "),
        ("output_tokens", "Total output tokens"),
    ):
        if token_col in df_eval.columns:
            print(f"{token_label}: {df_eval[token_col].sum():,}")
else:
    axes[1].text(0.5, 0.5, "No cost data available", ha="center", va="center",
                 fontsize=12, transform=axes[1].transAxes)

plt.tight_layout()
plt.savefig("latency_cost.png", dpi=150, bbox_inches="tight")
plt.show()


## 8. Top-10 Failure Analysis

The lowest-faithfulness queries reveal systematic failure modes.  
Each row shows the query, faithfulness score, and the specific unsupported claims  
flagged by the LLM judge — these are claims in the response that are **not in the retrieved context**.


In [None]:
# Ten rows with the lowest faithfulness, shown as a styled table and as a
# horizontal bar chart.
worst10 = df_eval.nsmallest(10, "faithfulness")[
    ["query", "faithfulness", "answer_relevance", "context_precision", "unsupported_claims"]
].reset_index(drop=True)

# Pretty-print unsupported_claims
# (lists are joined with "; "; anything else is shown via str())
worst10["unsupported_claims_str"] = worst10["unsupported_claims"].apply(
    lambda x: "; ".join(x) if isinstance(x, list) else str(x)
)

display_df = worst10[["query", "faithfulness", "answer_relevance", "unsupported_claims_str"]].copy()
display_df.columns = ["Query", "Faithfulness", "Relevance", "Unsupported Claims"]
# Truncate long queries to 70 characters for the table.
display_df["Query"] = display_df["Query"].apply(lambda q: q[:70] + "…" if len(q) > 70 else q)

try:
    # Red→green gradient on a fixed 0–1 scale for both score columns.
    display(display_df.style
        .background_gradient(subset=["Faithfulness"], cmap="RdYlGn", vmin=0, vmax=1)
        .background_gradient(subset=["Relevance"],   cmap="RdYlGn", vmin=0, vmax=1)
        .set_properties(**{"font-size": "11px"})
        .set_caption("Top-10 Worst Queries by Faithfulness")
    )
except Exception:
    # Any failure in the styled display falls back to plain text.
    print(display_df.to_string(index=False))

# Horizontal bar chart
fig, ax = plt.subplots(figsize=(9, 5))
# Red below the 0.5 threshold, orange otherwise.
bar_colors = ["#C44E52" if f < 0.5 else "#DD8452" for f in worst10["faithfulness"]]
ax.barh(
    range(len(worst10)),
    worst10["faithfulness"],
    color=bar_colors,
    edgecolor="white",
    linewidth=0.8,
    alpha=0.85,
)
ax.set_yticks(range(len(worst10)))
# Tick labels: "Q<rank>: <query>", truncated to 45 characters.
ax.set_yticklabels(
    [f"Q{i+1}: {q[:45]}…" if len(q) > 45 else f"Q{i+1}: {q}"
     for i, q in enumerate(worst10["query"])],
    fontsize=9,
)
ax.set_xlim(0, 1)
ax.set_xlabel("Faithfulness Score", fontsize=12)
ax.set_title("Top-10 Lowest Faithfulness Queries", fontsize=13, fontweight="bold")
ax.axvline(0.5, color="gray", linestyle=":", linewidth=1.2, label="Threshold = 0.5")
ax.legend(fontsize=10)
# Put the worst query (Q1) at the top.
ax.invert_yaxis()
plt.tight_layout()
plt.savefig("failure_analysis.png", dpi=150, bbox_inches="tight")
plt.show()


## 9. Top-3 Failure Pattern Clustering (via Gemini-Flash)

The unsupported claims from the 30 lowest-faithfulness queries (at most 40 claims are sent) are clustered by Gemini-Flash  
into the three most common hallucination patterns. This surfaces actionable insight  
without manual review of every failure.

> **Requires** `GEMINI_API_KEY` to be set. If not available, hardcoded sample patterns are shown.


In [None]:
def _get_failure_patterns_from_gemini(claims_list: list) -> list:
    """Ask Gemini-Flash to cluster claims into 3 failure patterns."""
    flat_claims = []
    for claims in claims_list:
        if isinstance(claims, list):
            flat_claims.extend(claims)
        elif isinstance(claims, str) and claims:
            flat_claims.append(claims)

    flat_claims = [c for c in flat_claims if c and c != "[parse error]"]
    if not flat_claims:
        return []

    prompt = f"""You are a quality analyst for an AI-powered e-commerce recommendation system.

Below are unsupported claims flagged by an LLM judge (claims made in AI responses that
are NOT supported by the retrieved product context):

{json.dumps(flat_claims[:40], indent=2)}

Identify the 3 most common FAILURE PATTERNS among these claims.
Each pattern should represent a category of hallucination (e.g., "price hallucination",
"brand fabrication", "specification invention").

Return ONLY a valid JSON object with no extra text:
{{
  "patterns": [
    {{
      "name": "<short pattern name>",
      "description": "<one sentence explanation>",
      "example_claims": ["<claim 1>", "<claim 2>"],
      "estimated_frequency_pct": <int 0-100>
    }},
    ...
  ]
}}"""

    try:
        resp = flash_llm.generate_content(
            prompt,
            generation_config={"temperature": 0.2, "response_mime_type": "application/json"},
        )
        text = resp.text.strip()
        if text.startswith("```"):
            text = "\n".join(text.split("\n")[1:-1])
        data = json.loads(text)
        return data.get("patterns", [])
    except Exception as e:
        print(f"  Gemini pattern clustering failed: {e}")
        return []


# ── Collect all unsupported claims from worst 30 queries ─────────────────
worst30 = df_eval.nsmallest(30, "faithfulness")
all_claims = worst30["unsupported_claims"].tolist()

if _gemini_available:
    print("Clustering failure patterns via Gemini-Flash…")
    patterns = _get_failure_patterns_from_gemini(all_claims)
else:
    patterns = []

# Fallback patterns (always shown if Gemini unavailable)
# Remember which source was used so the chart can say so: the sample
# numbers below are hard-coded, not measured on this run.
patterns_are_sample = not patterns
if patterns_are_sample:
    print("  Using sample failure patterns (representative of e-commerce RAG systems).")
    patterns = [
        {
            "name": "Price Hallucination",
            "description": "The model fabricates or significantly alters product prices not found in context.",
            "example_claims": ["Hallucinated price", "Price stated as 1999 but not in catalogue"],
            "estimated_frequency_pct": 45,
        },
        {
            "name": "Brand / Specification Fabrication",
            "description": "The model invents product specifications or brand claims unsupported by retrieved chunks.",
            "example_claims": ["Unverified brand claim", "Spec not in product description"],
            "estimated_frequency_pct": 35,
        },
        {
            "name": "Stock / Availability Assumption",
            "description": "The model incorrectly asserts product availability or delivery timelines.",
            "example_claims": ["Minor unsupported claim", "Availability not confirmed"],
            "estimated_frequency_pct": 20,
        },
    ]

top_patterns = patterns[:3]

# ── Display as markdown table ─────────────────────────────────────────────
# Model output is not guaranteed to carry every key, so read with .get().
print("\n### Top-3 Failure Patterns\n")
print(f"{'#':<4} {'Pattern':<30} {'Freq %':<8} Description")
print("-" * 80)
for i, p in enumerate(top_patterns, 1):
    name = str(p.get("name", "?"))
    description = str(p.get("description", ""))
    print(f"{i:<4} {name:<30} {p.get('estimated_frequency_pct', '?'):>5}%   {description[:60]}")

# ── Bar chart ─────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(9, 4))
names  = [str(p.get("name", "?")) for p in top_patterns]
freqs  = [p.get("estimated_frequency_pct", 33) for p in top_patterns]
colors = ["#C44E52", "#DD8452", "#937860"]
# Slice before reversing so pattern 1 keeps the first colour even when the
# model returned fewer than three patterns.
bar_colors = colors[:len(names)]

# Reversed so that pattern 1 ends up at the top of the horizontal chart.
bars = ax.barh(names[::-1], freqs[::-1], color=bar_colors[::-1],
               edgecolor="white", linewidth=1.5, alpha=0.88)
for bar, freq in zip(bars, freqs[::-1]):
    ax.text(bar.get_width() + 0.5, bar.get_y() + bar.get_height() / 2,
            f"{freq}%", va="center", fontsize=11, fontweight="bold")

ax.set_xlim(0, max(freqs) * 1.25)
ax.set_xlabel("Estimated Frequency (%)", fontsize=12)
# The saved PNG travels without the console output, so the title itself must
# say whether these are real results or the illustrative fallback.
if patterns_are_sample:
    chart_title = ("Top-3 Hallucination Failure Patterns\n"
                   "(illustrative sample — not derived from this run)")
else:
    chart_title = "Top-3 Hallucination Failure Patterns\n(identified by LLM-as-Judge)"
ax.set_title(chart_title, fontsize=12, fontweight="bold")
plt.tight_layout()
plt.savefig("failure_patterns.png", dpi=150, bbox_inches="tight")
plt.show()

for i, p in enumerate(top_patterns, 1):
    print(f"\nPattern {i}: {p.get('name', '?')}")
    print(f"  {p.get('description', '')}")
    if p.get("example_claims"):
        print(f"  Examples: {p['example_claims'][:2]}")


## 10. Experiment Comparison Table

Compare two saved experiments side-by-side. Positive deltas (Δ) are highlighted in green,  
negative deltas in red. Run `python evaluate.py --compare exp_a exp_b` from the CLI  
for the same output in the terminal.


In [None]:
# Side-by-side comparison of the first two experiments the tracker lists.
try:
    # Make the project root importable without adding it again on every re-run.
    _project_root = str(Path.cwd().parent)
    if _project_root not in sys.path:
        sys.path.insert(0, _project_root)
    from rag.experiments import ExperimentTracker
    tracker = ExperimentTracker(base_dir=cfg.experiments_dir)
    exp_list = tracker.list_experiments()

    if len(exp_list) >= 2:
        id1 = exp_list[0]["experiment_id"]
        id2 = exp_list[1]["experiment_id"]
        comparison = tracker.compare_experiments(id1, id2)

        # Build styled DataFrame
        cmp_data = []
        for metric, vals in comparison["metrics_comparison"].items():
            cmp_data.append({
                "Metric": metric,
                id1: vals.get(id1),
                id2: vals.get(id2),
                "Δ (B−A)": vals.get("delta (2-1)"),
            })

        cmp_df = pd.DataFrame(cmp_data).set_index("Metric")

        def _color_delta(val, lower_is_better=False):
            """Return the CSS colour for one delta: green = improved, red = regressed."""
            # bool is a subclass of int; it is not a metric value.
            if isinstance(val, bool) or not isinstance(
                val, (int, float, np.integer, np.floating)
            ):
                return ""
            # NaN compares False to everything and would otherwise fall
            # through to "red".
            if pd.isna(val) or val == 0:
                return ""
            improved = (val < 0) if lower_is_better else (val > 0)
            return "color: green" if improved else "color: red"

        def _color_delta_row(row):
            """Style one row of the delta column using the metric name (row label)."""
            # Higher is better for most metrics, lower is better for cost/latency.
            # NOTE(review): direction is inferred from the metric name
            # containing "cost" or "latency" — confirm against the tracker's
            # actual metric names.
            metric_name = str(row.name).lower()
            lower_is_better = any(tok in metric_name for tok in ("cost", "latency"))
            return [_color_delta(v, lower_is_better) for v in row]

        try:
            # Row-wise apply gives access to the metric name and avoids the
            # deprecated Styler.applymap.
            display(cmp_df.style
                .apply(_color_delta_row, axis=1, subset=["Δ (B−A)"])
                .format("{:.4f}", na_rep="N/A")
                .set_caption(f"Experiment Comparison: {id1} vs {id2}")
                .set_properties(**{"font-size": "11px"})
            )
        except Exception:
            # Any failure in the styled display falls back to plain text.
            print(cmp_df.round(4).to_string())
    elif len(exp_list) == 1:
        print(f"Only 1 experiment saved ({exp_list[0]['experiment_id']}). "
              "Run a second experiment to enable comparison.")
        tracker.print_experiments()
    else:
        print("No experiments saved yet.")
        print("\nRun an evaluation first:")
        print("  python evaluate.py --dataset qa_pairs.json "
              "--k 5 --experiment-name baseline")

except ImportError as e:
    print(f"  rag.experiments not importable ({e}). "
          "Run the evaluate.py CLI to generate experiments first.")
