Predict Win Rates
Elo ratings let us predict the probability that one model beats another. By comparing these predicted win rates with the actual win rates, we can gain insight into the accuracy and quality of the Elo rating system.

code source: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=qsk4LxTWTxRx

In [1]:
from collections import defaultdict
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
from tqdm import tqdm
import numpy as np

In [2]:
# Forward slashes keep this relative path usable on Linux/macOS as well as
# Windows (a backslash-separated path only resolves on Windows).
battled_pairs = '../results/log_battle_arena_gpt4_as_judger.csv'
df = pd.read_csv(battled_pairs)
columns_to_inclusive = ['model_a', 'model_b', 'winner']
data = df[columns_to_inclusive]

# Battles that have a recorded verdict.
data_no_nan = data[data['winner'].notna()]

# Split on whether the verdict text contains 'tie'. With na=False a missing
# verdict counts as "not a tie", so data_no_ties still includes NaN winners.
is_tie = data['winner'].str.contains('tie', na=False)
data_no_ties = data[~is_tie]
data_ties_only = data[is_tie]

In [3]:
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    """Build the matrix of pairwise win probabilities implied by Elo ratings.

    For every ordered pair (a, b) the entry is the Elo expected score of ``a``
    against ``b``: ``1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))``.

    Args:
        elo_ratings: Mapping from model name to its Elo rating.
        SCALE: Rating difference scale used in the exponent.
        BASE: Base of the exponential in the expected-score formula.
        INIT_RATING: Not used by this function; kept so the signature stays
            unchanged for existing callers.

    Returns:
        A square ``pandas.DataFrame`` whose rows and columns are the sorted
        model names. ``result.loc[a, b]`` is the predicted probability that
        ``a`` beats ``b``; the diagonal (a model against itself) is NaN.
    """
    names = sorted(elo_ratings)

    def expected_score(a, b):
        # Standard Elo expectation of `a` when facing `b`.
        return 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))

    # One column per model `a`, listing its win probability against each `b`.
    # np.nan is used because the np.NAN alias no longer exists in NumPy 2.x.
    data = {
        a: [expected_score(a, b) if a != b else np.nan for b in names]
        for a in names
    }

    df = pd.DataFrame(data, index=names)
    df.index.name = "model_a"
    df.columns.name = "model_b"
    # NOTE(review): the axis names are assigned before the transpose, so the
    # returned frame's row index is named "model_b" and its columns "model_a",
    # even though rows hold the model whose win probability is reported.
    # Left unchanged for backward compatibility - confirm before swapping.
    return df.T

In [4]:
def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    """Run sequential (online) Elo updates over a table of battles.

    Args:
        battles: DataFrame with 'model_a', 'model_b' and 'winner' columns;
            rows are processed in their existing order.
        K: Update step size applied to each battle.
        SCALE: Rating difference scale used in the expected-score exponent.
        BASE: Base of the exponential in the expected-score formula.
        INIT_RATING: Rating assigned to a model the first time it is seen.

    Returns:
        A ``defaultdict`` mapping model name to its final rating; looking up
        an unseen model yields ``INIT_RATING``.

    Raises:
        Exception: If a 'winner' value is not one of 'model_a', 'model_b',
            'tie' or 'tie(all bad)'.
    """
    # Actual score earned by model_a for each recognised verdict.
    score_for_a = {
        "model_a": 1,
        "model_b": 0,
        "tie": 0.5,
        "tie(all bad)": 0.5,
    }
    rating = defaultdict(lambda: INIT_RATING)

    matches = battles[['model_a', 'model_b', 'winner']]
    for model_a, model_b, winner in matches.itertuples(index=False):
        ra = rating[model_a]
        rb = rating[model_b]

        # Expected scores of each side before the battle.
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))

        if winner not in score_for_a:
            raise Exception(f"unexpected vote {winner}")
        sa = score_for_a[winner]

        # Move each rating toward the observed outcome.
        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    return rating

In [5]:
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Recompute ratings on bootstrap resamples of the battles.

    Args:
        battles: DataFrame of battles; each round draws a same-size sample of
            its rows with replacement.
        func_compute_elo: Callable that takes a battles DataFrame and returns
            a mapping of ratings (presumably keyed by model name).
        num_round: Number of bootstrap rounds to run.

    Returns:
        A DataFrame with one row per round and one column per key returned by
        ``func_compute_elo``, with columns ordered by descending median.
    """
    ratings_per_round = [
        func_compute_elo(battles.sample(frac=1.0, replace=True))
        for _ in tqdm(range(num_round), desc="bootstrap")
    ]
    ratings = pd.DataFrame(ratings_per_round)
    columns_by_median = ratings.median().sort_values(ascending=False).index
    return ratings[columns_by_median]

In [6]:
BOOTSTRAP_ROUNDS = 1000

# Seed NumPy's global RNG before resampling; the sampling inside
# get_bootstrap_result passes no explicit random_state, so it presumably
# draws from this global state.
np.random.seed(42)
bootstrap_elo_lu = get_bootstrap_result(data_no_nan, compute_elo, BOOTSTRAP_ROUNDS)

# Median rating per model across rounds, as a two-column table.
bootstrap_lu_median = (
    bootstrap_elo_lu.median()
    .reset_index()
    .set_axis(["model", "Elo rating"], axis=1)
)
# Adding 0.5 before the integer cast rounds positive ratings to the nearest int.
bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
bootstrap_lu_median

bootstrap: 100%|██████████| 1000/1000 [00:01<00:00, 500.82it/s]


Unnamed: 0,model,Elo rating
0,vicuna-13b,1151
1,koala-13b,1053
2,alpaca-13b,1003
3,stablelm-tuned-alpha-7b,983
4,chatglm-6b,969
5,dolly-v2-12b,954
6,llama-13b,953
7,oasst-pythia-12b,935


In [7]:
# Predict pairwise win probabilities from each model's median bootstrap rating.
median_elo = dict(bootstrap_elo_lu.quantile(0.5))
win_rate = predict_win_rate(median_elo)

# Order models by their average predicted win rate, highest first.
ordered_models = win_rate.mean(axis=1).sort_values(ascending=False).index

fig = px.imshow(
    win_rate.loc[ordered_models, ordered_models],
    color_continuous_scale='RdBu',
    text_auto=".2f",
    title="Predicted Win Rate Using Elo Ratings for Model A in an A vs. B Battle",
)
fig.update_layout(
    xaxis_title="Model B",
    yaxis_title="Model A",
    xaxis_side="top",
    height=600,
    width=600,
    title_y=0.07,
    title_x=0.5,
)
fig.update_traces(
    hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Win Rate: %{z}<extra></extra>"
)
fig