In [None]:
%load_ext autoreload
%autoreload 2

import os  # noqa
import sys  # noqa

from collections import Counter  # noqa
import pandas as pd  # noqa
import requests  # noqa
import seaborn as sns  # noqa
from tqdm.notebook import tqdm  # noqa
import plotly.express as px  # noqa
from plotly.subplots import make_subplots  # noqa
import plotly.graph_objects as go  # noqa

sys.path.append("..")

from ypl.backend.llm.routing.policy import RoutingPolicy, SelectionCriteria  # noqa
from ypl.backend.llm.routing.router import RankedRouter  # noqa
from ypl.backend.llm.ranking import ChoixRankerConfIntervals, PerCategoryRanker  # noqa

In [None]:
# This data is also found in https://drive.google.com/drive/u/0/folders/1660oK765zlYCNf8B-cF82T_CKnQvc9cK as backup.

local_file_name = "../tmp/clean_battle_20240814_public.json"

if not os.path.exists(local_file_name):
    print("Downloading battles")
    url = "https://storage.googleapis.com/arena_external_data/public/clean_battle_20240814_public.json"
    response = requests.get(url)

    with open(local_file_name, "wb") as file:
        file.write(response.content)

with open(local_file_name) as file:
    battles = pd.read_json(file).sort_values(ascending=True, by=["tstamp"])

battles = battles[battles["dedup_tag"].apply(lambda x: x.get("sampled", False))]

print(battles.shape)

battles.head(2)

In [4]:
models = list(set(battles.head()["model_a"]) | set(battles.head()["model_b"]))

routing_policy = RoutingPolicy(
    selection_criteria=SelectionCriteria.PROPORTIONAL,
    minimum_model_traffic_fraction={
        # "dolly-v2-12b": 0.2,
        # "gemini-advanced-0514": 0.2,
    },
    random_fraction=0.1,
)

ranker_kwargs = dict(
    models=models,
    num_bootstrap_iterations=3,
    choix_ranker_algorithm="lsr_pairwise",
)
categories = ("coding", "math")
ranker = PerCategoryRanker(categories=categories, ranker_cls=ChoixRankerConfIntervals, ranker_kwargs=ranker_kwargs)
router = RankedRouter(
    models=models,
    policy=routing_policy,
    ranker=ranker,
    seed=123,
)

routes = []
sample_fraction = 0.1
battles_sample = battles.sample(frac=sample_fraction).sort_values(by="tstamp", ascending=True)
for _, row in tqdm(battles_sample.iterrows(), total=len(battles_sample)):
    routes.append(router.select_models(2))
    model1, model2 = (row.model_a, row.model_b) if row.winner == "model_a" else (row.model_b, row.model_a)
    category = None
    if row.is_code:
        category = "coding"
    elif row.category_tag.get("math_v0.1", {}).get("math"):
        category = "math"
    router.update_ranker(model1, model2, 1.0, category=category)

ranker.update_ratings()

leaderboard = sorted(ranker.leaderboard(), key=lambda x: x.rating.value, reverse=True)
for rm in leaderboard[:5]:
    print(rm)
print("...")
for rm in leaderboard[-3:]:
    print(rm)




  0%|          | 0/167025 [00:00<?, ?it/s]

RatedModel(model='chatgpt-4o-latest', rating=AnnotatedFloat(value=1512.9990057037976, annotation='Wins: 902, Losses: 540, Ties: 0 (1473.8 to 1521.0)'))
RatedModel(model='gemini-1.5-pro-exp-0801', rating=AnnotatedFloat(value=1424.843661942927, annotation='Wins: 1194, Losses: 794, Ties: 0 (1405.7 to 1433.7)'))
RatedModel(model='gpt-4o-2024-05-13', rating=AnnotatedFloat(value=1400.6453699114654, annotation='Wins: 4606, Losses: 3049, Ties: 0 (1400.3 to 1403.8)'))
RatedModel(model='claude-3-5-sonnet-20240620', rating=AnnotatedFloat(value=1371.1675027896265, annotation='Wins: 2646, Losses: 2005, Ties: 0 (1360.9 to 1377.1)'))
RatedModel(model='gpt-4o-mini-2024-07-18', rating=AnnotatedFloat(value=1368.8755648709946, annotation='Wins: 1098, Losses: 814, Ties: 0 (1355.3 to 1374.1)'))
...
RatedModel(model='llama-13b', rating=AnnotatedFloat(value=342.5471870222201, annotation='Wins: 84, Losses: 159, Ties: 0 (293.4 to 433.7)'))
RatedModel(model='dolly-v2-12b', rating=AnnotatedFloat(value=300.010406

In [5]:
route_df = pd.DataFrame([r for r in routes if r], columns=["model_a", "model_b"])


def visualize_route_count(df, title, show_num_models=40):
    route_counts = pd.pivot_table(df, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
    ordering = route_counts.sum().sort_values(ascending=False).index
    ordering = ordering[:show_num_models]
    fig = px.imshow(route_counts.loc[ordering, ordering], title=title, text_auto=True)
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        height=800,
        width=800,
        title_y=0.07,
        title_x=0.5,
        font=dict(size=10),
    )
    fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig


visualize_route_count(route_df, title="Route Count of Each Combination of Models", show_num_models=30)

In [6]:
# Battle Count vs Route Count


fig = make_subplots(rows=2, cols=1, subplot_titles=("Battle Count for Each Model", "Route Count for Each Model"))

battle_counts = pd.concat([battles["model_a"], battles["model_b"]]).value_counts()
fig.add_trace(go.Bar(x=battle_counts.index, y=battle_counts.values, name="Battle Count"), row=1, col=1)

route_counts = pd.concat([route_df["model_a"], route_df["model_b"]]).value_counts()
fig.add_trace(go.Bar(x=route_counts.index, y=route_counts.values, name="Route Count"), row=2, col=1)

fig.update_layout(height=1000, showlegend=False, title_text="Battle Count vs Route Count")
fig.update_xaxes(title_text="Model", row=1, col=1)
fig.update_xaxes(title_text="Model", row=2, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

# Show the figure
fig.show()

In [7]:
# Showcase per-category leaderboards.

for category, leaderboard in ranker.leaderboard_all_categories().items():
    print("-" * 50)
    print(category)
    print("-" * 50)
    leaderboard = sorted(leaderboard, key=lambda x: x.rating.value, reverse=True)
    for rm in leaderboard[:5]:
        print(rm)
    print("...")

--------------------------------------------------
Overall
--------------------------------------------------
RatedModel(model='chatgpt-4o-latest', rating=AnnotatedFloat(value=1512.9990057037976, annotation='Wins: 902, Losses: 540, Ties: 0 (1473.8 to 1521.0)'))
RatedModel(model='gemini-1.5-pro-exp-0801', rating=AnnotatedFloat(value=1424.843661942927, annotation='Wins: 1194, Losses: 794, Ties: 0 (1405.7 to 1433.7)'))
RatedModel(model='gpt-4o-2024-05-13', rating=AnnotatedFloat(value=1400.6453699114654, annotation='Wins: 4606, Losses: 3049, Ties: 0 (1400.3 to 1403.8)'))
RatedModel(model='claude-3-5-sonnet-20240620', rating=AnnotatedFloat(value=1371.1675027896265, annotation='Wins: 2646, Losses: 2005, Ties: 0 (1360.9 to 1377.1)'))
RatedModel(model='gpt-4o-mini-2024-07-18', rating=AnnotatedFloat(value=1368.8755648709946, annotation='Wins: 1098, Losses: 814, Ties: 0 (1355.3 to 1374.1)'))
...
--------------------------------------------------
coding
-------------------------------------------