In [1]:
# This notebook is adapted from a notebook published by UC Berkeley, taken from
# https://colab.research.google.com/drive/1C2tQ-1j2Nm-NmtAx-Lo2wFTTod_s9jfe#scrollTo=mSizG3Pzglte

# Introduction

In this notebook, we present data analysis on Chatbot Arena data collected from https://arena.lmsys.org between April 24, 2023 and April 9, 2024.

We explain different Elo calculation methods (online Elo and MLE Elo, the latter also known as the Bradley-Terry model) for model ranking.

To view the latest leaderboard, see https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard.


In [None]:
from collections import defaultdict
import json, math, gdown, re
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
import requests
%pip install kaleido
pd.options.display.float_format = '{:.2f}'.format

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.9/79.9 MB 6.4 MB/s eta 0:00:00
Installing collected packages: kaleido
Successfully installed kaleido-0.2.1


# Obtaining and Cleaning the Tournament Data
The initial tournament results are hosted as a JSON file on Google Cloud Storage. We download the file with `requests` and load it with pandas. The data contains all the battles and voting results collected for ranking models.

In [None]:
url = "https://storage.googleapis.com/arena_external_data/public/clean_battle_20240410.json"

# Fail fast on network problems: the timeout bounds connection/read stalls, and
# raise_for_status() stops an HTTP error body from being saved to disk and
# later surfacing as a confusing JSON parse error.
response = requests.get(url, timeout=60)
response.raise_for_status()

with open('local_file_name.json', 'wb') as file:
    file.write(response.content)

# load the JSON data from the local file, ordered by ascending timestamp
with open('local_file_name.json', 'r') as file:
    battles = pd.read_json(file).sort_values(ascending=True, by=["tstamp"])

# keep only the rows whose `anony` flag is True
# (presumably the anonymous battles; confirm against the dataset description)
battles = battles[battles["anony"] == True]
print(len(battles))

# Elo Ratings

The [Elo rating system](https://en.wikipedia.org/wiki/Elo_rating_system) is a method for calculating the relative skill levels of players, which has been widely adopted in chess and other competitive games. The difference in the ratings between two players serves as a predictor of the outcome of a match. The Elo rating system works well for our case because we have multiple models and we run pairwise battles between them.
In this section, we present different methods for calculating Elo ratings.


### Maximum Likelihood Estimation for Elo Ratings (aka [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model))

In the context of LLM evaluation, models can be assumed to be static. In this case, we can fit the ratings directly by maximum likelihood estimation (the Bradley-Terry model), which produces significantly more stable ratings. Here we provide an implementation based on logistic regression.

In [None]:
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Elo-style ratings by maximum likelihood (Bradley-Terry model).

    Every battle is entered twice into a logistic regression fitted without
    intercept and without penalty. A decisive battle contributes the same
    outcome twice; a tie ("tie" or "tie (bothbad)") contributes one win for
    model A and one win for model B.

    Args:
        df: battles with the columns "model_a", "model_b" and "winner".
        SCALE: multiplier applied to the fitted coefficients.
        BASE: base whose natural log is used as the feature magnitude.
        INIT_RATING: offset added to the scaled coefficients.

    Returns:
        pd.Series of ratings indexed by model name, sorted from highest to
        lowest. If "mixtral-8x7b-instruct-v0.1" took part in any battle, all
        ratings are shifted so that this model is rated exactly 1114.
    """
    from sklearn.linear_model import LogisticRegression

    # Map each distinct model name to a column position in the design matrix.
    model_names = pd.concat([df["model_a"], df["model_b"]]).unique()
    column_of = pd.Series(np.arange(len(model_names)), index=model_names)

    # Stack the battles on top of themselves so that a tie can be split into
    # one A-win (first copy) and one B-win (second copy).
    doubled = pd.concat([df, df], ignore_index=True)
    n_rows = doubled.shape[0]
    n_models = len(column_of.index)
    row_ids = np.arange(n_rows)
    weight = math.log(BASE)

    # +log(BASE) in model A's column, -log(BASE) in model B's column.
    features = np.zeros([n_rows, n_models])
    features[row_ids, column_of[doubled["model_a"]]] = +weight
    features[row_ids, column_of[doubled["model_b"]]] = -weight

    # Label 1.0 means "model A won". Ties count as an A-win only in the first
    # copy of the data; in the second copy they keep label 0.0 (a B-win).
    a_won = (doubled["winner"] == "model_a").to_numpy()
    tied = doubled["winner"].isin(["tie", "tie (bothbad)"]).to_numpy()
    in_first_copy = row_ids < n_rows // 2
    labels = np.where(a_won | (tied & in_first_copy), 1.0, 0.0)

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    lr.fit(features, labels)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # Anchor the scale: shift every rating so that mixtral sits at 1114.
    anchor = "mixtral-8x7b-instruct-v0.1"
    if anchor in column_of.index:
        elo_scores += 1114 - elo_scores[column_of[anchor]]
    return pd.Series(elo_scores, index=column_of.index).sort_values(ascending=False)

# Fit the Bradley-Terry (MLE) Elo ratings on the filtered `battles` frame.
mle_elo_ratings = compute_mle_elo(battles)


In [None]:
def custom_bar_elo_rating(ratings):
    """Build a bar chart and table of the top 25 models by Elo rating.

    Before the top 25 are taken, the list is thinned out:
      * within each family in `models_to_filter_for_best`, only the
        highest-rated entry is kept;
      * every model whose name contains an entry of `models_custom_remove`
        is dropped.

    Model names are matched as literal substrings (not regular expressions),
    so the "." in names such as "gpt-3.5" only matches a real dot.

    Args:
        ratings: mapping from model name to rating that supports `.keys()`
            and item lookup (e.g. a pd.Series or a dict).

    Returns:
        (fig, df): the Plotly bar figure and the DataFrame it was drawn from.
        `df` has the columns "Model" and "Elo rating"; its index is the
        model's 1-based rank among all models before any filtering.
    """
    df = pd.DataFrame(
        [[n, ratings[n]] for n in ratings.keys()],
        columns=["Model", "Elo rating"],
    ).sort_values("Elo rating", ascending=False).reset_index(drop=True)
    df.index = df.index + 1

    # For each model family, drop every entry rated below the family's best.
    models_to_filter_for_best = ['gpt-4', 'gpt-3.5', 'gemini-pro', 'starling-lm-7b']
    for model in models_to_filter_for_best:
        # regex=False: treat the family name literally; na=False: a missing
        # model name never counts as a family member.
        in_family = df['Model'].str.contains(model, regex=False, na=False)
        if in_family.any():
            max_rating = df.loc[in_family, 'Elo rating'].max()
            df = df[~(in_family & (df['Elo rating'] < max_rating))]

    # MODIFIED
    # openchat is a version of mistral-7b and tulu-2-dpo-70b is a version of
    # llama-2-70b; both are removed outright rather than compared.
    # The list previously named 'openchat-3.5' twice; the duplicate was a
    # no-op and has been dropped.
    # NOTE(review): the earlier comment also mentioned wizard-LM-70b, but no
    # wizard entry was ever in this list. Confirm whether one was intended.
    models_custom_remove = ['openchat-3.5', 'tulu-2-dpo-70b']
    for model in models_custom_remove:
        df = df[~df['Model'].str.contains(model, regex=False, na=False)]
    print(df['Model'])

    # MODIFIED
    # only top 25 and make bar plot
    df = df.head(25)
    fig = px.bar(df, x='Model', y='Elo rating')
    fig.update_yaxes(range=[800, 1300])
    fig.update_layout(
        font_family="Calibri",
        title_font_family="Calibri",
        font_size=12,
        title_font_size=18
    )

    return fig, df

fig, df = custom_bar_elo_rating(mle_elo_ratings)
import matplotlib.pyplot as plt

# google.colab only exists inside a Colab runtime. Outside Colab the files are
# still written to the working directory; only the browser download is skipped.
try:
    from google.colab import files
except ImportError:
    files = None

# Static image export of the bar chart (scale=2 doubles the resolution).
fig.write_image('elo.png', scale=2)
if files is not None:
    files.download('elo.png')

# The table behind the chart, as a spreadsheet.
df.to_excel('elo.xlsx')
if files is not None:
    files.download('elo.xlsx')
