In [1]:
from collections import defaultdict
from datetime import datetime
from typing import Any, Literal

import matplotlib.pyplot as plt
import pandas as pd

from benchmark.utils import Score, get_langfuse_scores, get_local_scores


In [None]:
# Selects which loader from benchmark.utils supplies the scores:
# "langfuse" -> get_langfuse_scores(), anything else -> get_local_scores().
scores_source: Literal["local", "langfuse"] = "langfuse"

scores = get_langfuse_scores() if scores_source == "langfuse" else get_local_scores()

# Quick sanity check on how many scores came back from the selected loader.
print(f"{len(scores)} Scores Loaded from the last month")

In [9]:
# Bucket every score under the name of the metric it belongs to.
scores_by_name: defaultdict[str, list[Score]] = defaultdict(list)
for score in scores:
    scores_by_name[score.name].append(score)

# Put each bucket in ascending updated_at order.
for name, bucket in scores_by_name.items():
    bucket.sort(key=lambda entry: entry.updated_at)

metrics = ["answer_correctness", "conciseness", "faithfulness", "completeness"]

# Per-metric views of the buckets above. Because scores_by_name is a
# defaultdict, a metric with no scores yields (and registers) an empty list.
(
    answer_correctness_scores,  # for numeric responses
    conciseness_scores,  # for text responses
    faithfulness_scores,  # for text responses
    completeness_scores,  # for text responses
) = (scores_by_name[metric_name] for metric_name in metrics)


In [None]:
def scores_to_dataframe(scores_dict: "dict[str, list[Score]]") -> pd.DataFrame:
    """Flatten scores grouped by metric into one long-format DataFrame.

    Every score becomes one row with the columns ``metric``, ``value``,
    ``string_value``, ``timestamp`` and ``benchmark_run``.

    Args:
        scores_dict: Mapping of metric name to the scores recorded for it.
            Each score must expose ``updated_at`` (a ``datetime`` or an
            ISO-8601 string), ``value`` and ``string_value``.

    Returns:
        A DataFrame with one row per score. The five columns are always
        present, even when ``scores_dict`` contains no scores, so column
        lookups on the result never raise ``KeyError``.
    """
    columns = ["metric", "value", "string_value", "timestamp", "benchmark_run"]

    rows: list[dict[str, Any]] = []
    for metric_name, scores_list in scores_dict.items():
        for score in scores_list:
            updated_at = score.updated_at
            if isinstance(updated_at, str):
                # Rewrite a trailing "Z" as an explicit UTC offset:
                # datetime.fromisoformat rejects "Z" before Python 3.11.
                updated_at = datetime.fromisoformat(updated_at.replace("Z", "+00:00"))

            rows.append(
                {
                    "metric": metric_name,
                    "value": score.value,
                    # An empty string carries no category; store it as missing.
                    "string_value": score.string_value if score.string_value != "" else None,
                    "timestamp": updated_at,
                    # Truncate to the hour so that scores updated within the
                    # same clock hour share one benchmark_run key.
                    "benchmark_run": updated_at.replace(minute=0, second=0, microsecond=0),
                }
            )

    # Passing `columns` keeps the schema stable when `rows` is empty; without
    # it pandas would return a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)


# Long-format frame (one row per score) that every plot below reads from.
scores_df = scores_to_dataframe(scores_by_name)

# Pull out the columns that the overview queries more than once.
timestamps = scores_df["timestamp"]  # type: ignore[index]
benchmark_runs = scores_df["benchmark_run"]
metric_names = scores_df["metric"]

# Text summary of what was loaded: size, time span, metrics and run buckets.
print("Dataset Overview:")
print(f"Total scores: {len(scores_df)}")
print(f"Date range: {timestamps.min()} to {timestamps.max()}")
print(f"Metrics: {metric_names.unique()}")
print(f"Unique benchmark runs: {benchmark_runs.nunique()}")
print("\nScore counts by metric:")
print(metric_names.value_counts())
print("\nBenchmark runs:")
print(benchmark_runs.value_counts().sort_index())


In [11]:
def plot_categorical_metrics(scores_df: pd.DataFrame, metrics: list[str]) -> None:
    """Plot categorical metrics as stacked bar charts, one chart per metric.

    For each requested metric the rows are grouped by ``benchmark_run`` and
    ``string_value``; the bar for a run is stacked from the number of scores
    in each category.

    Args:
        scores_df: Long-format scores with at least the columns ``metric``,
            ``string_value`` and ``benchmark_run``.
        metrics: Names of the metrics to plot. Metrics that have no rows with
            a non-missing ``string_value`` are skipped.

    Returns:
        None. The figure is shown with ``plt.show()``. Nothing is drawn when
        ``scores_df`` is empty or none of the metrics has categorical data.
    """
    # An empty frame may not even have the expected columns; nothing to plot.
    if scores_df.empty:
        return

    # Rows without a category label cannot contribute to the counts, and a
    # metric made up only of such rows would give pandas nothing to plot.
    has_category = scores_df["string_value"].notna()
    data_by_metric: dict[str, pd.DataFrame] = {}
    for metric in metrics:
        metric_data = scores_df[(scores_df["metric"] == metric) & has_category]
        if len(metric_data) > 0:
            data_by_metric[metric] = metric_data

    if not data_by_metric:
        return

    # One axes per metric so charts do not overwrite each other. With a single
    # metric this is the same 12x6 figure as before.
    n_metrics = len(data_by_metric)
    fig, axes = plt.subplots(n_metrics, 1, figsize=(12, 6 * n_metrics), squeeze=False)

    for ax, (metric, metric_data) in zip(axes[:, 0], data_by_metric.items()):
        # Rows = benchmark runs, columns = categories, cells = occurrence count.
        category_counts = (
            metric_data.sort_values("benchmark_run")
            .groupby(["benchmark_run", "string_value"])
            .size()
            .unstack(fill_value=0)
        )

        category_counts.plot(kind="bar", stacked=True, width=0.8, alpha=0.8, ax=ax)
        ax.set_title(f"{metric.replace('_', ' ').title()}")
        ax.set_xlabel("Benchmark Run")
        ax.set_ylabel("Count")
        ax.legend(title="Category")
        ax.tick_params(axis="x", rotation=0)
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()


def plot_numeric_metrics(scores_df: pd.DataFrame, metrics: list[str]) -> None:
    """Plot numeric metrics as mean lines with a shaded +/- 1 std band.

    For each requested metric the ``value`` column is aggregated per
    ``benchmark_run`` into mean and standard deviation, and drawn in its own
    panel. Panels are laid out three per row; the grid grows by whole rows, so
    any number of metrics can be plotted.

    Args:
        scores_df: Long-format scores with at least the columns ``metric``,
            ``value`` and ``benchmark_run``.
        metrics: Names of the metrics to plot. Metrics that have no rows with
            a non-missing ``value`` are skipped.

    Returns:
        None. The figure is shown with ``plt.show()``. Nothing is drawn when
        ``scores_df`` is empty or none of the metrics has numeric data.
    """
    # An empty frame may not even have the expected columns; nothing to plot.
    if scores_df.empty:
        return

    # Only rows that carry a numeric value can contribute to mean/std.
    has_value = scores_df["value"].notna()
    data_by_metric: dict[str, pd.DataFrame] = {}
    for metric in metrics:
        metric_data = scores_df[(scores_df["metric"] == metric) & has_value]
        if len(metric_data) > 0:
            data_by_metric[metric] = metric_data

    if not data_by_metric:
        return

    # Three panels per row; up to three metrics gives the same 1x3, 18x6
    # figure as before, and more metrics add rows instead of overflowing.
    n_cols = 3
    n_rows = (len(data_by_metric) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    fig.suptitle("Numeric Metrics Evolution Over Time", fontsize=16)
    panels = axes.ravel()

    for ax, (metric, metric_data) in zip(panels, data_by_metric.items()):
        # groupby returns the runs in ascending order, which is the x order.
        run_stats = metric_data.groupby("benchmark_run")["value"].agg(["mean", "std"]).reset_index()
        # std is NaN for a run with a single score; draw a zero-width band.
        run_stats["std"] = run_stats["std"].fillna(0)

        # Mean score per run.
        ax.plot(
            run_stats["benchmark_run"],
            run_stats["mean"],
            marker="o",
            linewidth=2,
            markersize=6,
            color="blue",
        )

        # Shaded band one standard deviation either side of the mean.
        ax.fill_between(
            run_stats["benchmark_run"],
            run_stats["mean"] - run_stats["std"],
            run_stats["mean"] + run_stats["std"],
            alpha=0.3,
            color="blue",
            label="± 1 std",
        )

        ax.set_title(f"{metric.replace('_', ' ').title()}")
        ax.set_xlabel("Benchmark Run")
        ax.set_ylabel("Score")
        ax.tick_params(axis="x", rotation=45)
        ax.grid(True, alpha=0.3)
        ax.legend()

    # Hide the panels of the last row that have no metric assigned.
    for ax in panels[len(data_by_metric):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Categorical and numeric metrics are plotted separately. This cell draws the
# stacked bar chart of answer_correctness categories per benchmark run; the
# numeric metrics follow in the next cell.
categorical_metrics = ["answer_correctness"]
plot_categorical_metrics(scores_df, categorical_metrics)

In [None]:
# Metrics plotted from their numeric `value`: one panel each, showing the mean
# per benchmark run with a shaded +/- 1 standard deviation band.
numeric_metrics = ["completeness", "faithfulness", "conciseness"]
plot_numeric_metrics(scores_df, numeric_metrics)