In [7]:
import litellm
import logging
from litellm._logging import _disable_debugging, verbose_logger

_disable_debugging()


# https://github.com/BerriAI/litellm/issues/9815
class CostCalculationFilter(logging.Filter):
    def filter(self, record):
        # Block messages containing this specific text
        if "selected model name for cost calculation" in record.getMessage():
            return False
        return True


# Apply the filter to the verbose logger
verbose_logger.addFilter(CostCalculationFilter())

In [8]:
from local_email_db import generate_database

generate_database()

2025-04-14 23:44:28,354 - INFO - Starting database generation for repo 'corbt/enron-emails' at './data/enron_emails.db'
2025-04-14 23:44:28,355 - INFO - Overwrite existing database: False
2025-04-14 23:44:28,355 - INFO - Overwrite existing database: False


In [10]:
import asyncio
import polars as pl
import pandas as pd
from tqdm.asyncio import tqdm
from dotenv import load_dotenv
from query_iterators import load_synthetic_queries
import rollout
import importlib
import logging

importlib.reload(rollout)

load_dotenv()

logging.basicConfig(level=logging.ERROR)

MODELS_TO_BENCHMARK = [
    "openai/gpt-4o",
    "openai/gpt-4.1",
    # "gemini/gemini-2.0-flash",
    # "gemini/gemini-2.5-pro-preview-03-25",
]

TEST_SET_ENTRIES = 100


async def benchmark_model(model: str, limit: int = 100) -> pl.DataFrame:
    """Benchmark a model on the test dataset"""
    scenarios = load_synthetic_queries(split="test", limit=limit)
    trajectories = await tqdm.gather(
        *[rollout.rollout(model, scenario, trainable=False) for scenario in scenarios],
        desc=f"Benchmarking {model}",
    )

    metrics = pl.DataFrame([{**t.metrics, "reward": t.reward} for t in trajectories])

    avg_metrics = metrics.select([pl.mean(c).alias(c) for c in metrics.columns])

    return avg_metrics


results = await asyncio.gather(
    *[benchmark_model(model, TEST_SET_ENTRIES) for model in MODELS_TO_BENCHMARK]
)


Benchmarking openai/gpt-4o:   0%|          | 0/100 [00:00<?, ?it/s]2025-04-14 23:45:26,766 - INFO - Search found 0 results.
2025-04-14 23:45:26,768 - INFO - Search found 1 results.
2025-04-14 23:45:26,779 - INFO - Search found 0 results.
2025-04-14 23:45:26,781 - INFO - Search found 0 results.
2025-04-14 23:45:26,783 - INFO - Search found 0 results.
2025-04-14 23:45:26,784 - INFO - Search found 0 results.
2025-04-14 23:45:26,786 - INFO - Search found 0 results.
2025-04-14 23:45:26,788 - INFO - Search found 0 results.
2025-04-14 23:45:26,791 - INFO - Search found 0 results.
2025-04-14 23:45:26,793 - INFO - Search found 0 results.
2025-04-14 23:45:26,796 - INFO - Search found 0 results.
2025-04-14 23:45:26,797 - INFO - Search found 0 results.
2025-04-14 23:45:26,799 - INFO - Search found 0 results.
2025-04-14 23:45:26,801 - INFO - Search found 0 results.
2025-04-14 23:45:26,803 - INFO - Search found 4 results.
2025-04-14 23:45:26,805 - INFO - Search found 0 results.
2025-04-14 23:45:26,8

In [11]:
from IPython.display import HTML

df: pl.DataFrame = pl.concat(results)
df = df.transpose(include_header=True)

col_names = {"column": "metric"}
for i, model in enumerate(MODELS_TO_BENCHMARK):
    col_names[f"column_{i}"] = model

df = df.rename(col_names)
HTML(df.to_pandas().to_html())
# df.rename(MODELS_TO_BENCHMARK)
# df.rename(df.tail(1).to_dicts()[0])


Unnamed: 0,metric,openai/gpt-4o,openai/gpt-4.1
0,cant_parse_tool_call,0.0,0.0
1,bad_tool_call_name,0.0,0.0
2,bad_tool_call_args,0.5,0.0
3,ran_out_of_turns,0.01,0.07
4,returned_i_dont_know,0.11,0.1
5,attempted_answer,0.38,0.83
6,answer_correct,0.3,0.74
7,sources_correct,0.28,0.52
8,num_sources,0.0,0.0
9,num_turns,2.97,5.04


In [5]:
# Create a comparison table with models as columns and metrics as rows
comparison_data = {}
all_metrics = set()

for result in results:
    model_name = result["model"]
    metrics_df = result["metrics"].to_pandas()
    comparison_data[model_name] = {
        col: metrics_df[col][0] for col in metrics_df.columns
    }
    all_metrics.update(metrics_df.columns)

# Create the comparison DataFrame
comparison_df = pd.DataFrame(
    {
        metric: {
            model: comparison_data[model].get(metric, None)
            for model in comparison_data.keys()
        }
        for metric in all_metrics
    }
)

# print("\n=== Model Comparison ===")
# print(comparison_df.to_markdown())

comparison_df.transpose()


# List of models to benchmark


ColumnNotFoundError: "model" not found