In [1]:
%load_ext autoreload
%autoreload 2

import os  # noqa
import sys  # noqa

from collections import Counter  # noqa
import pandas as pd  # noqa
import requests  # noqa
import seaborn as sns  # noqa
from tqdm.notebook import tqdm  # noqa
import plotly.express as px  # noqa
from plotly.subplots import make_subplots  # noqa
import plotly.graph_objects as go  # noqa

sys.path.append("..")

from backend.llm.routing.policy import RoutingPolicy, SelectionCriteria  # noqa
from backend.llm.routing.router import RankedRouter  # noqa
from backend.llm.ranking import ChoixRankerConfIntervals  # noqa

In [2]:
# This data is also found in https://drive.google.com/drive/u/0/folders/1660oK765zlYCNf8B-cF82T_CKnQvc9cK as backup.

local_file_name = "../tmp/clean_battle_20240814_public.json"

if not os.path.exists(local_file_name):
    print("Downloading battles")
    url = "https://storage.googleapis.com/arena_external_data/public/clean_battle_20240814_public.json"
    response = requests.get(url)

    with open(local_file_name, "wb") as file:
        file.write(response.content)

with open(local_file_name) as file:
    battles = pd.read_json(file).sort_values(ascending=True, by=["tstamp"])

battles.head()

Unnamed: 0,model_a,model_b,winner,judge,turn,anony,language,tstamp,conv_metadata,is_code,is_refusal,dedup_tag,category_tag
0,chatglm-6b,koala-13b,model_b,2e9c29aa140b8e50643235eab01dc9ea,1,True,English,1682352000.0,"{'sum_user_tokens': 10, 'sum_assistant_a_token...",True,False,"{'high_freq': False, 'sampled': True}","{'if_v0.1': {'if': False, 'score': 1}, 'math_v..."
1,oasst-pythia-12b,alpaca-13b,tie,2e9c29aa140b8e50643235eab01dc9ea,1,True,English,1682352000.0,"{'sum_user_tokens': 11, 'sum_assistant_a_token...",False,False,"{'high_freq': False, 'sampled': True}","{'if_v0.1': {'if': False, 'score': 0}, 'math_v..."
2,koala-13b,oasst-pythia-12b,model_b,2e9c29aa140b8e50643235eab01dc9ea,1,True,English,1682352000.0,"{'sum_user_tokens': 10, 'sum_assistant_a_token...",False,False,"{'high_freq': False, 'sampled': True}","{'if_v0.1': {'if': False, 'score': 0}, 'math_v..."
3,vicuna-13b,oasst-pythia-12b,model_b,2e9c29aa140b8e50643235eab01dc9ea,1,True,English,1682352000.0,"{'sum_user_tokens': 9, 'sum_assistant_a_tokens...",False,False,"{'high_freq': False, 'sampled': True}","{'if_v0.1': {'if': False, 'score': 1}, 'math_v..."
4,vicuna-13b,koala-13b,model_a,2e9c29aa140b8e50643235eab01dc9ea,1,True,English,1682352000.0,"{'sum_user_tokens': 5, 'sum_assistant_a_tokens...",False,True,"{'high_freq': False, 'sampled': True}","{'if_v0.1': {'if': False, 'score': 0}, 'math_v..."


In [3]:
models = list(set(battles.head()["model_a"]) | set(battles.head()["model_b"]))

ranker = ChoixRankerConfIntervals(
    models=models,
    num_bootstrap_iterations=8,
    choix_ranker_algorithm="lsr_pairwise",
)
router = RankedRouter(
    models=models,
    policy=RoutingPolicy(SelectionCriteria.PROPORTIONAL, random_fraction=0.1),
    ranker=ranker,
    seed=123,
)

routes = []
sample_fraction = 0.5
for i, row in tqdm(battles.iterrows(), total=len(battles)):
    if i % 10000 > sample_fraction * 10000:
        # sample the data, but keep the order
        continue
    routes.append(router.select_models(2))
    if row.winner == "model_a":
        router.update_ranker(row.model_a, row.model_b, 1.0)
    else:
        router.update_ranker(row.model_b, row.model_a, 1.0)

ranker.update_ratings()

leaderboard = sorted(ranker.leaderboard(), key=lambda x: x.rating.value, reverse=True)
for rm in leaderboard[:10]:
    print(rm)
print("...")
for rm in leaderboard[-5:]:
    print(rm)

print("\nThreshold Counter", ranker.update_ratings_counter)

  0%|          | 0/1799991 [00:00<?, ?it/s]

RatedModel(model='chatgpt-4o-latest', rating=AnnotatedFloat(value=1456.603119975846, annotation='Wins: 4730, Losses: 2926, Ties: 0 (1450.4 to 1469.7)'))
RatedModel(model='gemini-1.5-pro-exp-0801', rating=AnnotatedFloat(value=1414.5306465415001, annotation='Wins: 6275, Losses: 4176, Ties: 0 (1401.0 to 1429.5)'))
RatedModel(model='gpt-4o-2024-05-13', rating=AnnotatedFloat(value=1385.1264609358823, annotation='Wins: 24360, Losses: 16133, Ties: 0 (1380.3 to 1392.3)'))
RatedModel(model='gpt-4o-mini-2024-07-18', rating=AnnotatedFloat(value=1362.3383177203978, annotation='Wins: 5892, Losses: 4398, Ties: 0 (1355.0 to 1375.8)'))
RatedModel(model='claude-3-5-sonnet-20240620', rating=AnnotatedFloat(value=1354.3116041799403, annotation='Wins: 13801, Losses: 10507, Ties: 0 (1351.8 to 1360.5)'))
RatedModel(model='gemini-advanced-0514', rating=AnnotatedFloat(value=1346.6753755626228, annotation='Wins: 16107, Losses: 11580, Ties: 0 (1339.8 to 1355.3)'))
RatedModel(model='llama-3.1-405b-instruct', rati

In [4]:
route_df = pd.DataFrame([r for r in routes if r], columns=["model_a", "model_b"])


def visualize_route_count(df, title, show_num_models=40):
    route_counts = pd.pivot_table(df, index="model_a", columns="model_b", aggfunc="size", fill_value=0)
    ordering = route_counts.sum().sort_values(ascending=False).index
    ordering = ordering[:show_num_models]
    fig = px.imshow(route_counts.loc[ordering, ordering], title=title, text_auto=True)
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        height=800,
        width=800,
        title_y=0.07,
        title_x=0.5,
        font=dict(size=10),
    )
    fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig


visualize_route_count(route_df, title="Route Count of Each Combination of Models", show_num_models=30)

In [5]:
# Battle Count vs Route Count


fig = make_subplots(rows=2, cols=1, subplot_titles=("Battle Count for Each Model", "Route Count for Each Model"))

battle_counts = pd.concat([battles["model_a"], battles["model_b"]]).value_counts()
fig.add_trace(go.Bar(x=battle_counts.index, y=battle_counts.values, name="Battle Count"), row=1, col=1)

route_counts = pd.concat([route_df["model_a"], route_df["model_b"]]).value_counts()
fig.add_trace(go.Bar(x=route_counts.index, y=route_counts.values, name="Route Count"), row=2, col=1)

fig.update_layout(height=1000, showlegend=False, title_text="Battle Count vs Route Count")
fig.update_xaxes(title_text="Model", row=1, col=1)
fig.update_xaxes(title_text="Model", row=2, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

# Show the figure
fig.show()