# Meta Evaluation Analysis

This notebook analyzes evaluation results across different model configurations.

## Setup


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
import toml
import numpy as np

# Set up plotting style
plt.style.use('default')
%matplotlib inline


In [None]:
# Import shared calculation function
import sys
sys.path.append('..')
from evaluate_semantic_search import calculate_evaluation_metrics

## Load Data

Load the most recent evaluation file or specify a different one.


In [None]:
# Auto-detect the most recently modified evaluation file.
# The glob pattern is kept in one place so the error message below always
# names the location that was actually searched.
EVAL_GLOB = "evals/evaluation_*.csv"
eval_files = glob.glob(EVAL_GLOB)
if not eval_files:
    raise FileNotFoundError(f"No evaluation files found matching '{EVAL_GLOB}'.")

# "Most recent" is decided by file modification time
eval_path = max(eval_files, key=os.path.getmtime)
print(f"Loading: {eval_path}")
print(f"File size: {os.path.getsize(eval_path) / (1024*1024):.1f} MB")

# Load evaluation data
eval_df = pd.read_csv(eval_path)

# Replace missing values in the free-text columns with empty strings so the
# truthiness checks on db_tag below treat "missing" and "empty" the same way
eval_df["notes"] = eval_df["notes"].fillna("")
eval_df["db_tag"] = eval_df["db_tag"].fillna("")

print(f"\nTotal rows: {len(eval_df):,}")
print(f"Date range: {eval_df['date'].min()} to {eval_df['date'].max()}")

# List every distinct (model, tag, metadata flag) combination present in the file
unique_models = eval_df[["bi_encoder_model", "db_tag", "meta_data_included"]].drop_duplicates()
print(f"\nFound {len(unique_models)} model configurations:")
for _, row in unique_models.iterrows():
    meta_str = "with meta" if row['meta_data_included'] else "no meta"
    tag_str = f" ({row['db_tag']})" if row['db_tag'] else ""
    print(f"  - {row['bi_encoder_model']}{tag_str} - {meta_str}")


## Calculate Metrics

Calculate metrics per model configuration using the shared `calculate_evaluation_metrics` function.


In [None]:
# Read the k values used by the search pipeline from the shared project config
config = toml.load("../config.toml")
search_settings = config["SEARCH"]
initial_k, final_k = search_settings["initial_k"], search_settings["final_k"]

# The shared helper returns two objects: the aggregated results first,
# then the query-level metrics
aggregated, query_metrics_df = calculate_evaluation_metrics(eval_df, initial_k, final_k)

print(f"Calculated metrics for {len(aggregated)} model configurations")
print("\nSummary by model:")

# Last expression of the cell: shown as the cell's rich output
summary_columns = [
    "bi_encoder_model",
    "db_tag",
    "meta_data_included",
    "query_location",
    "recall_at_initial_k",
    "recall_at_final_k",
    "mrr",
]
aggregated[summary_columns]



In [None]:
# Create model labels for display
def create_model_label(row):
    """Build a two-line display label for one model configuration.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'bi_encoder_model', 'db_tag' and 'meta_data_included'.

    Returns:
        A string of the form "<short model name>[-<db_tag>]\\n(meta|no-meta)",
        where the short model name is the part of 'bi_encoder_model' after the
        last '/', and the tag suffix is omitted when 'db_tag' is falsy.
    """
    # Keep only the final path component of the model identifier
    short_name = row['bi_encoder_model'].rsplit('/', 1)[-1]

    tag_value = row['db_tag']
    tag_suffix = f"-{tag_value}" if tag_value else ""

    if row['meta_data_included']:
        meta_flag = "meta"
    else:
        meta_flag = "no-meta"

    return f"{short_name}{tag_suffix}\n({meta_flag})"

# Add a display label per row; the plotting cells below use this column for y-tick labels
aggregated['model_label'] = aggregated.apply(create_model_label, axis=1)


In [None]:
# Recall at initial_k per model configuration, one panel per query_location value
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, query_location in zip(axes, [False, True]):
    # Rows for this panel, highest recall first
    location_mask = aggregated['query_location'] == query_location
    loc_data = aggregated[location_mask].sort_values('recall_at_initial_k', ascending=False)
    bar_positions = range(len(loc_data))

    ax.barh(bar_positions, loc_data['recall_at_initial_k'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(loc_data['model_label'], fontsize=9)
    ax.set_xlabel('Recall@K (%)', fontsize=11)
    ax.set_title(f"Query with Location: {query_location}", fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Write each value just past the end of its bar
    for position, recall in enumerate(loc_data['recall_at_initial_k']):
        ax.text(recall + 1, position, f'{recall:.1f}%', va='center', fontsize=9)

plt.tight_layout()
plt.show()


In [None]:
# MRR per model configuration, one panel per query_location value
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, query_location in zip(axes, [False, True]):
    # Rows for this panel, highest MRR first
    location_mask = aggregated['query_location'] == query_location
    loc_data = aggregated[location_mask].sort_values('mrr', ascending=False)
    bar_positions = range(len(loc_data))

    ax.barh(bar_positions, loc_data['mrr'], color='forestgreen')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(loc_data['model_label'], fontsize=9)
    ax.set_xlabel('Mean Reciprocal Rank (MRR)', fontsize=11)
    ax.set_title(f"Query with Location: {query_location}", fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Write each value just past the end of its bar
    for position, mrr_value in enumerate(loc_data['mrr']):
        ax.text(mrr_value + 0.01, position, f'{mrr_value:.3f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()


In [None]:
# Average rank overall per model configuration (lower is better),
# one panel per query_location value
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, query_location in zip(axes, [False, True]):
    # Rows for this panel, lowest average rank first
    location_mask = aggregated['query_location'] == query_location
    loc_data = aggregated[location_mask].sort_values('avg_rank_overall', ascending=True)
    bar_positions = range(len(loc_data))

    ax.barh(bar_positions, loc_data['avg_rank_overall'], color='steelblue')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(loc_data['model_label'], fontsize=9)
    ax.set_xlabel('Average Rank Overall', fontsize=11)
    ax.set_title(f"Query with Location: {query_location}", fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    # Write each value just past the end of its bar
    for position, avg_rank in enumerate(loc_data['avg_rank_overall']):
        ax.text(avg_rank + 2, position, f'{avg_rank:.1f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()


In [None]:
# Summary table: one row per model configuration, with every metric split by query_location
config_keys = ['bi_encoder_model', 'db_tag', 'meta_data_included']
metric_names = ['recall_at_initial_k', 'recall_at_final_k', 'mrr', 'avg_rank_overall']

summary = aggregated.pivot_table(
    index=config_keys,
    columns='query_location',
    values=metric_names,
    aggfunc='mean'
)

# Collapse the (metric, query_location) column MultiIndex into flat "<metric>_<location>" names
summary.columns = ['_'.join(str(level) for level in col).strip() for col in summary.columns.values]
summary = summary.reset_index()


def _model_config_name(row):
    """Return a one-line identifier: short model name, optional -tag, and (meta|no-meta)."""
    short_name = row['bi_encoder_model'].split('/')[-1]
    tag_suffix = '-' + row['db_tag'] if row['db_tag'] else ''
    meta_flag = 'meta' if row['meta_data_included'] else 'no-meta'
    return f"{short_name}{tag_suffix} ({meta_flag})"


summary['model_config'] = summary.apply(_model_config_name, axis=1)

# Show the identifier first, followed by the metric columns only
hidden_cols = config_keys + ['model_config']
display_cols = ['model_config'] + [col for col in summary.columns if col not in hidden_cols]
summary_display = summary[display_cols].copy()

# Round float64 / int64 columns to two decimals; other columns are left untouched
numeric_cols = summary_display.select_dtypes(include=['float64', 'int64']).columns
summary_display = summary_display.round(dict.fromkeys(numeric_cols, 2))

# Display the cleaned DataFrame
print("Summary Table: Metrics by Model and Location")
print("=" * 100)
display(summary_display)

## Query-Level Analysis

Which queries are hardest? Which are easiest?


In [None]:
# Average recall per (query, query_location) across all model configurations.
# Note: recall_at_initial_k is already on a 0-100 scale from calculate_evaluation_metrics()
query_summary = (
    query_metrics_df
    .groupby(['query', 'query_location'])
    .agg(recall_at_initial_k=('recall_at_initial_k', 'mean'))
    .reset_index()
)


def _print_query_ranking(rows):
    """Print one line per row: query text cut/padded to 60 chars, location flag, recall."""
    for _, row in rows.iterrows():
        loc_str = "with location" if row['query_location'] else "no location"
        print(f"{row['query'][:60]:60s} ({loc_str}): {row['recall_at_initial_k']:.1f}%")


# Ten queries with the lowest mean recall
print("Hardest Queries (lowest success rate across all models):")
print("-" * 60)
hardest = query_summary.nsmallest(10, 'recall_at_initial_k')
_print_query_ranking(hardest)

# Ten queries with the highest mean recall
print("\n\nEasiest Queries (highest success rate across all models):")
print("-" * 60)
easiest = query_summary.nlargest(10, 'recall_at_initial_k')
_print_query_ranking(easiest)


## Location Impact Analysis

Does adding location information help?


In [None]:
# Compare location vs no-location for each model (Recall@K)
# Pivot so each model configuration becomes one row with one column per
# query_location value. The two columns are selected and renamed by value
# rather than overwritten positionally, so a run that contains only one
# query_location value produces a NaN column instead of a length-mismatch
# error. Assumes query_location holds booleans, as the False/True filters on
# aggregated['query_location'] in the plotting cells of this notebook do.
location_impact_recall = (
    aggregated.pivot_table(
        index=['bi_encoder_model', 'db_tag', 'meta_data_included'],
        columns='query_location',
        values='recall_at_initial_k'
    )
    .reindex(columns=[False, True])
    .rename(columns={False: 'no_location', True: 'with_location'})
    .rename_axis(None, axis='columns')
    .reset_index()
)
location_impact_recall['improvement'] = location_impact_recall['with_location'] - location_impact_recall['no_location']
location_impact_recall = location_impact_recall.sort_values('improvement', ascending=False)

location_impact_recall['model_label'] = location_impact_recall.apply(create_model_label, axis=1)

print("Location Impact on Recall@K (with_location - no_location):")
print("=" * 80)
print(location_impact_recall[['model_label', 'no_location', 'with_location', 'improvement']].to_string(index=False))

# Visualize Recall@K: two side-by-side bars per model configuration
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(location_impact_recall))
width = 0.35

# Shift each series by half a bar width so the pair is centred on its tick
ax.bar(x - width/2, location_impact_recall['no_location'], width, label='No Location', alpha=0.8)
ax.bar(x + width/2, location_impact_recall['with_location'], width, label='With Location', alpha=0.8)

ax.set_xlabel('Model Configuration', fontsize=11)
ax.set_ylabel('Recall@K (%)', fontsize=11)
ax.set_title('Impact of Adding Location Information (Recall@K)', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(location_impact_recall['model_label'], rotation=45, ha='right', fontsize=9)
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Compare location vs no-location for each model (MRR)
# Pivot so each model configuration becomes one row with one column per
# query_location value. The two columns are selected and renamed by value
# rather than overwritten positionally, so a run that contains only one
# query_location value produces a NaN column instead of a length-mismatch
# error. Assumes query_location holds booleans, as the False/True filters on
# aggregated['query_location'] in the plotting cells of this notebook do.
location_impact_mrr = (
    aggregated.pivot_table(
        index=['bi_encoder_model', 'db_tag', 'meta_data_included'],
        columns='query_location',
        values='mrr'
    )
    .reindex(columns=[False, True])
    .rename(columns={False: 'no_location', True: 'with_location'})
    .rename_axis(None, axis='columns')
    .reset_index()
)
location_impact_mrr['improvement'] = location_impact_mrr['with_location'] - location_impact_mrr['no_location']
location_impact_mrr = location_impact_mrr.sort_values('improvement', ascending=False)

location_impact_mrr['model_label'] = location_impact_mrr.apply(create_model_label, axis=1)

print("Location Impact on MRR (with_location - no_location):")
print("=" * 80)
print(location_impact_mrr[['model_label', 'no_location', 'with_location', 'improvement']].to_string(index=False))

# Visualize MRR: two side-by-side bars per model configuration
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(location_impact_mrr))
width = 0.35

# Shift each series by half a bar width so the pair is centred on its tick
ax.bar(x - width/2, location_impact_mrr['no_location'], width, label='No Location', alpha=0.8)
ax.bar(x + width/2, location_impact_mrr['with_location'], width, label='With Location', alpha=0.8)

ax.set_xlabel('Model Configuration', fontsize=11)
ax.set_ylabel('Mean Reciprocal Rank (MRR)', fontsize=11)
ax.set_title('Impact of Adding Location Information (MRR)', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(location_impact_mrr['model_label'], rotation=45, ha='right', fontsize=9)
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


## Average Rank vs MRR
Average Rank: includes all queries. If the answer isn’t found, it assigns a rank of `initial_k + 1` (e.g. 201 when `initial_k` is 200), so it heavily penalizes misses.

MRR: for misses, it contributes 0 (doesn’t penalize). For hits, it uses 1/rank, heavily favoring top ranks.

So a model can have both:

- A lower (better) average rank, because it finds answers more often, even if at mediocre ranks.
- A lower (worse) MRR, because when it does find answers, they are not at the top ranks.