In [1]:
# This notebook imports the model leaderboard and battle data from LS (the LMSYS Chatbot Arena) into the database.
# Note: it requires installing the `gradio_client` package, which is not a requirement of the repo in general.

%load_ext autoreload
%autoreload 2

from datetime import datetime  # noqa
import sys  # noqa
from time import time  # noqa: E402

import pandas as pd  # noqa
from gradio_client import Client  # noqa
from sqlmodel import Session, select  # noqa
from tqdm.notebook import tqdm  # noqa

sys.path.append("..")

from backend.db import get_engine  # noqa
from backend.llm.ranking import ChoixRankerConfIntervals  # noqa
from db.ratings import Category, OVERALL_CATEGORY_NAME, OVERALL_CATEGORY_DESCRIPTION  # noqa
from db.language_models import LanguageModel, LicenseEnum  # noqa
from db.ratings import Rating  # noqa




In [2]:
def get_hf_license(ls_license: str) -> LicenseEnum:
    """Translate a license label from the LS leaderboard into a `LicenseEnum` member.

    Args:
        ls_license: The license label exactly as it appears in the leaderboard (case-sensitive).

    Returns:
        The dedicated enum member when the label has one, `LicenseEnum.other` for the
        remaining recognized labels, and `LicenseEnum.unknown` for anything else.
    """
    # (accepted LS labels, resulting license). The label groups are disjoint, so the
    # order of the rows does not affect the result.
    label_table = (
        (("Apache-2.0", "Apache 2.0"), LicenseEnum.apache_2_0),
        (("Llama 2 Community",), LicenseEnum.llama2),
        (("MIT",), LicenseEnum.mit),
        (("Gemma license",), LicenseEnum.gemma),
        (("CC-BY-NC-4.0",), LicenseEnum.cc_by_nc_4_0),
        (("CC-BY-NC-SA-4.0",), LicenseEnum.cc_by_nc_sa_4_0),
        (("Llama 3.1 Community", "Llama 3 Community"), LicenseEnum.llama3),
        (
            (
                "Proprietary",
                "Qianwen LICENSE",
                "Non-commercial",
                "DeepSeek License",
                "DeepSeek",
                "Jamba Open",
                "NVIDIA Open Model",
                "Yi License",
                "AI2 ImpACT Low-risk",
                "DBRX LICENSE",
                "Mistral Research",
                "Falcon-180B TII License",
            ),
            LicenseEnum.other,
        ),
    )
    for labels, hf_license in label_table:
        if ls_license in labels:
            return hf_license
    return LicenseEnum.unknown

In [3]:
def internal_name(name: str) -> str:
    """Derive the internal model name from a leaderboard display name.

    The name is lower-cased, then a fixed sequence of substring rewrites is applied
    in order.

    Args:
        name: The model name as shown on the leaderboard.

    Returns:
        The lower-cased name with every rewrite applied.
    """
    # (substring to find, replacement), applied top to bottom on the lower-cased name.
    rewrites = (
        ("openassistant", "oasst"),
        ("wizardlm-13b-v1.2", "wizardlm-13b"),
        ("wizardlm-70b-v1.0", "wizardlm-70b"),
        ("palm-chat-bison-001", "palm-2"),
        ("mistral-7b-instruct-v0.1", "mistral-7b-instruct"),
        ("mistral-7b-instruct-v0.2", "mistral-7b-instruct"),
        ("nv-llama2-70b-steerlm-chat", "llama2-70b-steerlm-chat"),
    )
    normalized = name.lower()
    for old, new in rewrites:
        normalized = normalized.replace(old, new)
    return normalized


# Fetch the "Overall" LS leaderboard through the gradio client.
client = Client("lmsys/chatbot-arena-leaderboard")
result = client.predict(category="Overall", api_name="/update_leaderboard_and_plots")

# The first returned component carries the table; load it into a dataframe.
leaderboard_table = result[0]["value"]
df = pd.DataFrame(leaderboard_table["data"], columns=leaderboard_table["headers"])

# The model column holds an HTML anchor: pull the url out of `href` and the name out of the link text.
model_html = df["🤖 Model"]
df["model_url"] = model_html.str.extract(r'href="([^"]+)"')[0]
df["model_name"] = model_html.str.extract(r">([^<]+)<")[0]

# Derive the internal name and the license enum for each model.
df["model_internal_name"] = df["model_name"].apply(internal_name)
df["hf_license"] = df["License"].apply(get_hf_license)
print(f"Loaded {len(df)} models")
df.head(2)

Loaded as API: https://lmsys-chatbot-arena-leaderboard.hf.space ✔
Loaded 136 models


Unnamed: 0,Rank* (UB),🤖 Model,⭐ Arena Score,📊 95% CI,🗳️ Votes,Organization,License,Knowledge Cutoff,model_url,model_name,model_internal_name,hf_license
0,1,"<a target=""_blank"" href=""https://x.com/OpenAID...",1316,+4/-4,24023,OpenAI,Proprietary,2023/10,https://x.com/OpenAIDevs/status/18235103956190...,ChatGPT-4o-latest (2024-08-08),chatgpt-4o-latest (2024-08-08),LicenseEnum.other
1,2,"<a target=""_blank"" href=""https://aistudio.goog...",1301,+5/-5,19910,Google,Proprietary,2023/11,https://aistudio.google.com/app/prompts/new_ch...,Gemini-1.5-Pro-Exp-0827,gemini-1.5-pro-exp-0827,LicenseEnum.other


In [4]:
# Load the leaderboard models into the database if they don't exist yet, and make sure
# the overall category exists.
models_names_to_llms = {}

with Session(get_engine()) as session:
    # Count created objects by hand. Every `session.exec(select(...))` below autoflushes
    # pending objects, which removes them from `session.new`, so `len(session.new)` at the
    # end would under-report what this cell actually created.
    num_new_objects = 0

    for _, row in df.iterrows():
        llm = session.exec(
            select(LanguageModel).where(LanguageModel.internal_name == row["model_internal_name"])
        ).first()
        if not llm:
            print(f"Adding {row['model_internal_name']}")
            llm = LanguageModel(
                name=row["model_name"],
                internal_name=row["model_internal_name"],
                license=row["hf_license"],
                label=row["model_name"],
                family=row["Organization"],
                # TODO: Add actual avatar_url
                avatar_url=row["model_url"],
            )
            session.add(llm)
            num_new_objects += 1
        # Several leaderboard rows can share an internal name; the last one seen wins here.
        models_names_to_llms[row["model_internal_name"]] = llm

    category = session.exec(select(Category).where(Category.name == OVERALL_CATEGORY_NAME)).first()
    if not category:
        category = Category(name=OVERALL_CATEGORY_NAME, description=OVERALL_CATEGORY_DESCRIPTION)
        session.add(category)
        num_new_objects += 1

    print(f"Committing {num_new_objects} new objects")
    session.commit()

Committing 0 new objects


In [5]:
# This data is also found in https://drive.google.com/drive/u/0/folders/1660oK765zlYCNf8B-cF82T_CKnQvc9cK as backup.

local_file_name = "../tmp/clean_battle_20240814_public.json"
# JSON is UTF-8 by specification; be explicit so decoding does not depend on the platform's locale.
with open(local_file_name, encoding="utf-8") as file:
    # Sort chronologically by battle timestamp.
    battles = pd.read_json(file).sort_values(ascending=True, by=["tstamp"])
print(f"Loaded {len(battles)} raw battles")

Loaded 1799991 raw battles


In [6]:
# Internal names of every model on the leaderboard.
models = set(df["model_internal_name"].unique())

# Only include anonymous battles where both models are in the leaderboard.
both_models_known = battles.model_a.isin(models) & battles.model_b.isin(models)
is_anonymous = battles["anony"] == True  # noqa
battles = battles[both_models_known & is_anonymous]

# De-duplicate in the same way as LS: keep the rows whose `dedup_tag` is flagged as sampled.
is_sampled = battles["dedup_tag"].apply(lambda tag: tag.get("sampled", False))
battles = battles[is_sampled]

print(f"After filtering battles with unknown models and dupes: {len(battles)} battles")

After filtering battles with unknown models and dupes: 781641 battles


In [7]:
# Actually create some ratings: replay a sample of the battles in chronological order
# through the ranker, storing a snapshot each time the calendar month changes.

ranker = ChoixRankerConfIntervals(
    models=models,
    num_bootstrap_iterations=10,
    choix_ranker_algorithm="lsr_pairwise",
)

sample_fraction = 0.333
# Fixed seed so that re-running the notebook replays the same subset of battles.
sample_seed = 0
battles_sample = (
    battles.sample(frac=sample_fraction, random_state=sample_seed)
    .sort_values(by="tstamp", ascending=True)
)

# NOTE(review): `datetime.fromtimestamp` converts to the machine's local timezone, so the
# month boundaries below depend on where this runs — confirm whether UTC is intended.
prev_tstamp_datetime = datetime.fromtimestamp(battles_sample.iloc[0].tstamp)
for _, row in tqdm(battles_sample.iterrows(), total=len(battles_sample)):
    model_a = row.model_a.lower()
    model_b = row.model_b.lower()
    if row.winner == "model_a":
        ranker.update(model_a, model_b, 1.0)
    elif "tie" in row.winner:
        # A tie is recorded as half a result in each direction.
        ranker.update(model_a, model_b, 0.5)
        ranker.update(model_b, model_a, 0.5)
    elif row.winner == "model_b":
        ranker.update(model_b, model_a, 1.0)
    else:
        raise ValueError(f"Unknown winner: {row.winner}")
    tstamp_datetime = datetime.fromtimestamp(row.tstamp)
    # Store a snapshot every month, i.e. on the first battle whose (year, month) differs
    # from the previous battle's.
    should_store = (tstamp_datetime.year, tstamp_datetime.month) != (
        prev_tstamp_datetime.year,
        prev_tstamp_datetime.month,
    )
    if should_store:
        start = time()
        ranker.to_db(OVERALL_CATEGORY_NAME, snapshot_timestamp=tstamp_datetime)
        delta = time() - start
        print(f"Storing snapshot for {tstamp_datetime.strftime('%Y-%m-%d')} (took {delta:.1f}s)")
    prev_tstamp_datetime = tstamp_datetime

# NOTE(review): battles after the last month change are fed to the ranker but never stored
# as a snapshot — confirm whether a final `ranker.to_db(...)` call is wanted here.

  0%|          | 0/260286 [00:00<?, ?it/s]

Storing snapshot for 2023-05-01 (took 36.0s)
Storing snapshot for 2023-06-01 (took 36.0s)
Storing snapshot for 2023-07-01 (took 35.9s)
Storing snapshot for 2023-08-01 (took 35.9s)
Storing snapshot for 2023-09-01 (took 36.4s)
Storing snapshot for 2023-10-01 (took 35.9s)
Storing snapshot for 2023-11-01 (took 36.0s)
Storing snapshot for 2023-12-01 (took 36.4s)
Storing snapshot for 2024-01-01 (took 36.5s)
Storing snapshot for 2024-02-01 (took 36.6s)
Storing snapshot for 2024-03-01 (took 36.8s)
Storing snapshot for 2024-04-01 (took 37.1s)
Storing snapshot for 2024-05-01 (took 38.1s)
Storing snapshot for 2024-06-01 (took 38.2s)
Storing snapshot for 2024-07-01 (took 38.6s)
Storing snapshot for 2024-08-01 (took 38.8s)
