In [24]:
import json, math
import os
from collections import defaultdict
from typing import List, TypedDict, Optional, Dict, Any

import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
pd.options.display.float_format = '{:.2f}'.format

In [25]:

class Elo(TypedDict):
    """Elo entry nested under ``Model.elo`` in the loaded battle records."""
    # Neither field is read by this notebook; ratings are recomputed from battle outcomes.
    id: str
    score: int

class Model(TypedDict):
    """A battle participant, embedded in each ``Battle`` as ``modelA`` / ``modelB``."""
    # Only `id` and `label` are read by this notebook: `id` keys the rating
    # computations, `label` is what plots and result tables display.
    id: str
    label: str
    provider: str
    llmConfig: Dict[str, Any]
    # Declared nullable; none of the fields below are read here.
    asrConfig: Optional[Dict[str, Any]]
    ttsConfig: Optional[Dict[str, Any]]
    extraConfig: Optional[Dict[str, Any]]
    elo: Optional[Elo]
    eloId: Optional[str]

class Convo(TypedDict):
    """Conversation record embedded in a ``Battle`` as ``convoA`` / ``convoB``."""
    # Declared for completeness of the schema; no field of Convo is read by this notebook.
    id: str
    prompt: Optional[Dict[str, Any]]
    convoTranscript: Optional[str]
    inputAudioFileId: Optional[str]
    outputAudioFileId: Optional[str]

class Battle(TypedDict):
    """One head-to-head comparison between two models, as stored in battles.json."""
    # The notebook only reads `outcome`, `modelA` and `modelB`; the other fields
    # are declared so the loaded JSON has a complete type.
    id: str
    convoLength: str
    convoType: str
    userUuid: str
    modelAId: str
    modelBId: str
    convoAId: Optional[str]
    convoBId: Optional[str]
    state: str
    # Values handled downstream: "WinA", "WinB", "Tie", "TieBothBad".
    # NOTE(review): None presumably means the battle was never judged — confirm.
    outcome: Optional[str]
    promptKey: str
    modelA: Model
    modelB: Model
    convoA: Optional[Convo]
    convoB: Optional[Convo]

def load_battles(file_path: str) -> List[Battle]:
    """Read the exported battle records from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, typed as a list of ``Battle`` records. The
        content is not validated against the ``Battle`` schema.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data: List[Battle] = json.load(file)
    return data

# Directory that holds battles.json; figures and the final scores are written here too.
# Set BENCH_AUDIO_DATA_DIR to run on another machine; when it is unset the
# original location is used, so existing behaviour is unchanged.
BASE_PATH = os.path.join(
    os.environ.get("BENCH_AUDIO_DATA_DIR", "/Users/cryogenic/tvc/bench.audio/data/src/data/.data/"),
    "",  # guarantees a trailing separator: later cells build paths with `BASE_PATH + name`
)
# Load the battles
battles = load_battles(BASE_PATH + 'battles.json')
print(f"Loaded {len(battles)} battles")

Loaded 177 battles


In [26]:
import pandas as pd

# Registry of every participating model keyed by id. Entries found on the B side
# replace same-id entries found on the A side.
ids_to_model: Dict[str, Model] = {
    **{battle['modelA']['id']: battle['modelA'] for battle in battles},
    **{battle['modelB']['id']: battle['modelB'] for battle in battles},
}
# Two-way id <-> label lookups derived from the registry
model_ids_to_labels: Dict[str, str] = {
    model_id: model['label'] for model_id, model in ids_to_model.items()
}
model_labels_to_ids: Dict[str, str] = dict(
    (label, model_id) for model_id, label in model_ids_to_labels.items()
)


# One flat row per battle: the outcome plus id and label of both participants
battle_rows = [
    dict(
        outcome=battle['outcome'],
        modelA_id=battle['modelA']['id'],
        modelB_id=battle['modelB']['id'],
        modelA_label=battle['modelA']['label'],
        modelB_label=battle['modelB']['label'],
    )
    for battle in battles
]
battles_df = pd.DataFrame(battle_rows)

# Preview of the first rows
print(battles_df.head())

# (rows, columns) of the table
print(f"DataFrame shape: {battles_df.shape}")


      outcome                             modelA_id  \
0  TieBothBad  8740fd77-5d04-4da0-94a8-df853961ff09   
1  TieBothBad  2c52061b-e368-4699-a2e2-3f581650878c   
2  TieBothBad  16b5800e-33e6-496b-aa22-1c59bdb28224   
3  TieBothBad  8740fd77-5d04-4da0-94a8-df853961ff09   
4         Tie  8740fd77-5d04-4da0-94a8-df853961ff09   

                              modelB_id  \
0  16b5800e-33e6-496b-aa22-1c59bdb28224   
1  8740fd77-5d04-4da0-94a8-df853961ff09   
2  f02e7ac1-5222-4e1a-ae9a-b1682d7ea673   
3  2c52061b-e368-4699-a2e2-3f581650878c   
4  16b5800e-33e6-496b-aa22-1c59bdb28224   

                                        modelA_label  \
0  vapi-gpt-3.5-turbo-playht-deepgram-company-req...   
1         vapi-gpt-4-turbo-11labs-deepgram-no-custom   
2           retell-gpt-4-turbo-openai-nova-no-custom   
3  vapi-gpt-3.5-turbo-playht-deepgram-company-req...   
4  vapi-gpt-3.5-turbo-playht-deepgram-company-req...   

                                        modelB_label  
0           retell

In [27]:
import plotly.express as px

# How often each outcome value occurs
outcome_counts = battles_df['outcome'].value_counts()

# Bar chart of those frequencies; update_layout returns the figure, so the calls chain
fig = px.bar(
    outcome_counts,
    title="Counts of Battle Outcomes",
    text_auto=True,
    height=400,
).update_layout(
    xaxis_title="Battle Outcome",
    yaxis_title="Count",
    showlegend=False,
)

# Render the chart
fig.show()



In [28]:
# Keep only battles whose outcome does not mention a tie. `na=True` marks rows with a
# missing outcome as "to be dropped", so the negated mask stays boolean instead of
# raising when an outcome is None/NaN.
battles_no_ties = battles_df[~battles_df['outcome'].str.contains("Tie", na=True)]

In [29]:
# Number of non-tied battles each model took part in, counting both sides
participants = pd.concat([battles_no_ties['modelA_label'], battles_no_ties['modelB_label']])
model_counts = participants.value_counts()

# Bar chart of the per-model counts; update_layout returns the figure, so the calls chain
fig = px.bar(
    model_counts,
    title="Battle Count for Each Model",
    text_auto=True,
).update_layout(
    xaxis_title="Model",
    yaxis_title="Battle Count",
    height=400,
    showlegend=False,
)

fig.show()

In [30]:
def visualize_battle_count(battles_df, title, show_num_models=30):
    """Heatmap of how many battles each pair of models has played.

    Args:
        battles_df: Battles with `modelA_label` and `modelB_label` columns.
        title: Figure title.
        show_num_models: Maximum number of models shown, most active first.

    Returns:
        A plotly figure; cell (row, column) is the number of battles between
        the two models regardless of which side each was on.
    """
    ptbl = pd.pivot_table(battles_df, index="modelA_label", columns="modelB_label", aggfunc="size",
                          fill_value=0)
    # A plain `ptbl + ptbl.T` aligns on labels and produces NaN for any model that only
    # ever appeared on one side. Treat a missing cell as zero battles instead.
    battle_counts = ptbl.add(ptbl.T, fill_value=0).fillna(0).astype(int)
    ordering = battle_counts.sum().sort_values(ascending=False).index
    ordering = ordering[:show_num_models]
    fig = px.imshow(battle_counts.loc[ordering, ordering],
                    title=title, text_auto=True)
    fig.update_layout(xaxis_title="Model B",
                      yaxis_title="Model A",
                      xaxis_side="top", height=800, width=800,
                      title_y=0.07, title_x=0.5,
                      font=dict(size=10))
    fig.update_traces(hovertemplate=
                      "Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig

# Pair counts over every battle in battles_df, ties included
fig = visualize_battle_count(battles_df, title="Battle Count of Each Combination of Models", show_num_models=30)
fig.show()

In [31]:
# Same heatmap restricted to battles_no_ties; the returned figure is the cell output
visualize_battle_count(battles_no_ties, "Battle Count for Each Combination of Models (without Ties)")

In [32]:
# Battles whose outcome mentions a tie ("Tie" and "TieBothBad" both match).
# `na=False` keeps the mask strictly boolean when an outcome is missing; without it
# a NaN in the mask makes the row selection raise.
tie_battles = battles_df[battles_df['outcome'].str.contains("Tie", na=False)]
visualize_battle_count(tie_battles, "Tie Count for Each Combination of Models")


In [33]:
def compute_pairwise_win_fraction(battles_df, max_num_models=30):
    """Fraction of battles each model won against each other model.

    Args:
        battles_df: Battles with `outcome`, `modelA_label` and `modelB_label`
            columns. Only "WinA" and "WinB" outcomes count as wins; every row
            counts as a played battle.
        max_num_models: Keep at most this many models, ordered by mean row value.

    Returns:
        A square DataFrame where cell (row, column) is wins of `row` over
        `column` divided by the battles between the two, on either side.
        Pairs that never met, and the diagonal, are 0.
    """
    winner = battles_df['outcome'].map({'WinA': 'model_a', 'WinB': 'model_b'})

    # Every label seen on either side. All count tables are expanded to this square
    # shape: adding tables of different shapes aligns on labels and yields NaN
    # (later turned into 0) even for pairs that do have recorded wins.
    labels = pd.Index(
        pd.concat([battles_df["modelA_label"], battles_df["modelB_label"]]).unique()
    ).sort_values()

    def pair_counts(rows):
        """Count rows per (modelA_label, modelB_label) on the full label grid."""
        if rows.empty:
            return pd.DataFrame(0, index=labels, columns=labels)
        counts = pd.pivot_table(
            rows, index="modelA_label", columns="modelB_label", aggfunc="size", fill_value=0)
        return counts.reindex(index=labels, columns=labels, fill_value=0)

    # Wins as Model A, wins as Model B, and all A-B pairings
    a_win_ptbl = pair_counts(battles_df[(winner == "model_a").to_numpy()])
    b_win_ptbl = pair_counts(battles_df[(winner == "model_b").to_numpy()])
    num_battles_ptbl = pair_counts(battles_df)

    # Wins of the row model over the column model from either side, divided by the
    # battles between them; 0/0 (pair never met, or the diagonal) becomes 0.
    row_beats_col_freq = (
        (a_win_ptbl + b_win_ptbl.T) /
        (num_battles_ptbl + num_battles_ptbl.T)
    ).fillna(0)

    # Arrange ordering according to proportion of wins
    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.head(max_num_models).index)
    return row_beats_col_freq.loc[model_names, model_names]

def visualize_pairwise_win_fraction(battles, title, max_num_models=30):
    """Draw the pairwise win-fraction table as a heatmap.

    Args:
        battles: Battles passed through to `compute_pairwise_win_fraction`.
        title: Figure title.
        max_num_models: Passed through; maximum number of models shown.

    Returns:
        A plotly figure with the winning model on the rows and the losing
        model on the columns.
    """
    win_fraction = compute_pairwise_win_fraction(battles, max_num_models)
    heatmap = px.imshow(
        win_fraction,
        title=title,
        text_auto=".2f",
        color_continuous_scale='RdBu',
    )
    heatmap.update_layout(
        xaxis_title="Model B: Loser",
        yaxis_title="Model A: Winner",
        xaxis_side="top",
        height=900,
        width=900,
        title_y=0.07,
        title_x=0.5,
    )
    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    heatmap.update_traces(hovertemplate=hover_text)

    return heatmap

In [34]:
# Win fractions over decisive battles only; the bare `fig` makes the figure the cell output
fig = visualize_pairwise_win_fraction(battles_no_ties,
      title = "Fraction of Model A Wins for All Non-tied A vs. B Battles")
fig

In [35]:
row_beats_col_freq = compute_pairwise_win_fraction(battles_no_ties)
# The diagonal (a model against itself) holds 0, which is not a result. Mask it so the
# mean really is over the *other* models; leaving it in scales every average down by
# (n - 1) / n.
is_self_pair = np.eye(len(row_beats_col_freq), dtype=bool)
average_win_rate = row_beats_col_freq.mask(is_self_pair).mean(axis=1)
fig = px.bar(average_win_rate.sort_values(ascending=False),
             title="Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
             text_auto=".2f")
fig.update_layout(yaxis_title="Average Win Rate", xaxis_title="Model",
                  showlegend=False)
fig.show()
# Save the figure
name = "average_win_rate_bar_chart.png"
fig.write_image(BASE_PATH + name)
print(f"Figure saved as {BASE_PATH + name}")


Figure saved as /Users/cryogenic/tvc/bench.audio/data/src/data/.data/average_win_rate_bar_chart.png


In [36]:
def compute_online_elo(battles_df, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
    """Compute Elo ratings by replaying the battles one at a time, in row order.

    Args:
        battles_df: Battles with `modelA_id`, `modelB_id` and `outcome` columns.
        K: Step size of each rating update.
        SCALE: Rating difference scale of the expected-score curve.
        BASE: Base of the exponent in the expected-score curve.
        INIT_RATING: Rating given to a model the first time it is seen.

    Returns:
        A defaultdict mapping model id to rating. If the calibration model took
        part, all ratings are shifted so that it sits at exactly 800.

    Raises:
        Exception: If an outcome is not one of "WinA", "WinB", "Tie", "TieBothBad".
    """
    # Score credited to model A for each recognised outcome
    score_for_a = {"WinA": 1, "WinB": 0, "Tie": 0.5, "TieBothBad": 0.5}
    rating = defaultdict(lambda: INIT_RATING)

    for _, battle in battles_df.iterrows():
        model_a = battle['modelA_id']
        model_b = battle['modelB_id']
        outcome = battle['outcome']

        ra = rating[model_a]
        rb = rating[model_b]
        # Expected scores under the current ratings
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))

        if outcome not in score_for_a:
            raise Exception(f"unexpected outcome {outcome}")
        sa = score_for_a[outcome]

        # Move each rating towards the observed result
        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    # calibrate vapi-gpt-3.5-turbo-playht-deepgram-company-requested-params to 800
    calibration_model = "vapi-gpt-3.5-turbo-playht-deepgram-company-requested-params"
    # .get() rather than [] so an unknown label reaches the warning below instead of
    # failing with a KeyError first.
    calibration_id = model_labels_to_ids.get(calibration_model)
    if calibration_id in rating:
        delta = 800 - rating[calibration_id]
        for model in rating:
            rating[model] += delta
    else:
        print(f"Warning: Calibration model '{calibration_model}' not found in battles.")

    return rating

In [37]:
def preety_print_model_ratings(ratings):
    """Build a leaderboard table from a mapping of model id to rating.

    Args:
        ratings: Mapping (dict or Series) from model id to Elo rating. Every id
            must be present in `model_ids_to_labels`.

    Returns:
        DataFrame with "Model" and "Elo rating" columns, best rating first,
        with a 1-based rank as the index.
    """
    model_ids = list(ratings.keys())
    table = pd.DataFrame(
        {
            "Model": [model_ids_to_labels[model_id] for model_id in model_ids],
            "Elo rating": [ratings[model_id] for model_id in model_ids],
        }
    )
    table = table.sort_values("Elo rating", ascending=False)
    table = table.reset_index(drop=True)
    # Rank starts at 1 rather than 0
    table.index = table.index + 1
    return table

# Online Elo over the battles in the row order of battles_df

online_elo_ratings = compute_online_elo(battles_df)
preety_print_model_ratings(online_elo_ratings)

Unnamed: 0,Model,Elo rating
1,vapi-gpt-4-turbo-11labs-deepgram-no-custom,839.05
2,retell-gpt-3.5-turbo-11labs-no-custom,827.98
3,hume-default,822.48
4,retell-gpt-4-turbo-openai-nova-no-custom,809.6
5,vapi-gpt-3.5-turbo-playht-deepgram-company-req...,800.0


In [38]:
def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    """Build a side-by-side leaderboard for two sets of ratings.

    Args:
        ratings_1: Mapping from model id to rating; defines which models are
            listed and the sort order.
        ratings_2: Second mapping; must contain every id of `ratings_1`.
        column_names: Two column titles, for `ratings_1` and `ratings_2`.

    Returns:
        DataFrame with "Model" and the two rating columns as integers, sorted
        by the first rating (descending), with a 1-based rank as the index.
    """
    first_col, second_col = column_names[0], column_names[1]

    rows = []
    for model_id in ratings_1.keys():
        rows.append([model_ids_to_labels[model_id], ratings_1[model_id], ratings_2[model_id]])

    table = pd.DataFrame(rows, columns=["Model", first_col, second_col])
    table = table.sort_values(first_col, ascending=False).reset_index(drop=True)
    for col in (first_col, second_col):
        # Adding 0.5 before truncating to int rounds positive ratings half-up
        table[col] = (table[col] + 0.5).astype(int)
    table.index = table.index + 1
    return table

# Replay the same battles in reverse row order to see how much the online estimate
# depends on ordering.
# NOTE(review): despite its name, this variable holds *online* Elo ratings, not MLE ones.
elo_mle_ratings_reverse = compute_online_elo(battles_df.iloc[::-1])
preety_print_two_ratings(online_elo_ratings,
                         elo_mle_ratings_reverse,
                         column_names=["Elo rating", "Elo rating with reverse order"])

Unnamed: 0,Model,Elo rating,Elo rating with reverse order
1,vapi-gpt-4-turbo-11labs-deepgram-no-custom,839,838
2,retell-gpt-3.5-turbo-11labs-no-custom,828,831
3,hume-default,822,822
4,retell-gpt-4-turbo-openai-nova-no-custom,810,812
5,vapi-gpt-3.5-turbo-playht-deepgram-company-req...,800,800


In [39]:
def compute_mle_elo(
    df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
):
    """Fit Elo ratings to all battles at once with a logistic regression.

    Each unordered result is encoded as weighted rows of a design matrix: a win
    counts with weight 2 for the winner, a tie with weight 1 for each side.
    The fitted coefficients are scaled by `SCALE` and offset by `INIT_RATING`.

    Args:
        df: Battles with `outcome`, `modelA_id` and `modelB_id` columns. The
            frame is not modified. Rows with an unrecognised or missing outcome
            are ignored.
        SCALE: Multiplier applied to the fitted coefficients.
        BASE: Base of the Elo exponent; its log scales the design matrix.
        INIT_RATING: Offset added to the scaled coefficients.
        sample_weight: Accepted for interface compatibility; not used.

    Returns:
        Series of ratings indexed by model id, sorted from best to worst. If
        the calibration model took part, ratings are shifted so it sits at 800.
    """
    from sklearn.linear_model import LogisticRegression

    # Keep the derived column local instead of writing 'winner' into the caller's frame.
    winner = df['outcome'].map(
        {'WinA': 'model_a', 'WinB': 'model_b', 'Tie': 'tie', 'TieBothBad': 'tie'}
    )

    # Every model seen on either side, in a fixed order. All count tables are expanded
    # to this square grid; tables of different shapes would align to NaN when added,
    # silently dropping pairs (or raising KeyError for a model missing from one axis).
    model_ids = pd.Index(
        pd.concat([df["modelA_id"], df["modelB_id"]]).unique()
    ).sort_values()

    def pair_counts(selector):
        """Count the selected rows per (modelA_id, modelB_id) on the full grid."""
        rows = df[selector.to_numpy()]
        if rows.empty:
            return pd.DataFrame(0, index=model_ids, columns=model_ids)
        counts = pd.pivot_table(
            rows,
            index="modelA_id",
            columns="modelB_id",
            aggfunc="size",
            fill_value=0,
        )
        return counts.reindex(index=model_ids, columns=model_ids, fill_value=0)

    ptbl_a_win = pair_counts(winner == "model_a")
    ptbl_b_win = pair_counts(winner == "model_b")

    # A tie counts for both participants, so make the tie table symmetric
    ptbl_tie = pair_counts(winner == "tie")
    ptbl_tie = ptbl_tie + ptbl_tie.T

    # Cell (a, b): weight of evidence that a beats b (win = 2, tie = 1)
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    models = pd.Series(np.arange(len(model_ids)), index=model_ids)

    p = len(models)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in model_ids:
        for m_b in model_ids:
            if m_a == m_b:
                continue
            # Two rows with identical features per ordered pair: label 1 weighted by
            # a's evidence over b, label 0 weighted by b's evidence over a.
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # Calibration: shift all ratings so the reference model sits at 800.
    calibration_model = "vapi-gpt-3.5-turbo-playht-deepgram-company-requested-params"
    # .get() so an unknown label skips calibration instead of raising KeyError
    calibration_id = model_labels_to_ids.get(calibration_model)
    if calibration_id in models.index:
        elo_scores += 800 - elo_scores[models[calibration_id]]

    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)



In [40]:
# MLE Elo fitted on all battles at once; the leaderboard table is the cell output
elo_mle_ratings = compute_mle_elo(battles_df)
preety_print_model_ratings(elo_mle_ratings)

Unnamed: 0,Model,Elo rating
1,vapi-gpt-4-turbo-11labs-deepgram-no-custom,887.74
2,retell-gpt-3.5-turbo-11labs-no-custom,871.1
3,hume-default,858.69
4,retell-gpt-4-turbo-openai-nova-no-custom,823.16
5,vapi-gpt-3.5-turbo-playht-deepgram-company-req...,800.0


In [41]:
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Re-estimate ratings on bootstrap resamples of the battles.

    Args:
        battles: DataFrame of battles; each round draws as many rows as it has,
            with replacement.
        func_compute_elo: Callable taking a battles DataFrame and returning a
            mapping from model id to rating.
        num_round: Number of resamples.

    Returns:
        DataFrame with one row per round and one column per model, columns
        ordered by descending median rating.
    """
    estimates = [
        func_compute_elo(battles.sample(frac=1.0, replace=True))
        for _ in tqdm(range(num_round), desc="bootstrap")
    ]
    ratings = pd.DataFrame(estimates)
    median_order = ratings.median().sort_values(ascending=False).index
    return ratings[median_order]

In [42]:
BOOTSTRAP_ROUNDS = 100

# Seed NumPy's global RNG before resampling — presumably so the bootstrap draws
# (DataFrame.sample without random_state) are repeatable; confirm.
np.random.seed(42)
bootstrap_elo_lu = get_bootstrap_result(battles_df, compute_mle_elo, BOOTSTRAP_ROUNDS)

bootstrap:   0%|          | 0/100 [00:00<?, ?it/s]

bootstrap: 100%|██████████| 100/100 [00:00<00:00, 142.83it/s]


In [43]:
def visualize_bootstrap_scores(df, title):
    """Plot the bootstrap median rating of each model with a 95% interval.

    Args:
        df: One row per bootstrap round, one column per model id.
        title: Figure title.

    Returns:
        A plotly scatter figure: median rating per model, with error bars from
        the 2.5% to the 97.5% quantile, models ordered by descending median.
    """
    bars = pd.DataFrame(dict(
        lower = df.quantile(.025),
        rating = df.quantile(.5),
        upper = df.quantile(.975))).reset_index(names="model_id").sort_values("rating", ascending=False)

    # Use the id -> label table directly. Re-inverting model_labels_to_ids loses every
    # id whose label is shared with another model, leaving NaN on the x axis.
    bars['model'] = bars['model_id'].map(model_ids_to_labels)

    # plotly takes the error bar as distances above and below the point
    bars['error_y'] = bars['upper'] - bars["rating"]
    bars['error_y_minus'] = bars['rating'] - bars["lower"]
    bars['rating_rounded'] = np.round(bars['rating'], 2)

    fig = px.scatter(bars, x="model", y="rating", error_y="error_y",
                     error_y_minus="error_y_minus", text="rating_rounded",
                     title=title)
    fig.update_layout(xaxis_title="Model", yaxis_title="Rating",
                      height=600)
    fig.update_xaxes(tickangle=45)
    return fig

fig = visualize_bootstrap_scores(bootstrap_elo_lu, "Bootstrap of MLE Elo Rating Estimates")
fig.show()

# Also write the chart as a PNG next to the input data
name = "bootstrap_mle_elo_ratings.png"
fig.write_image(BASE_PATH + name)
print(f"Figure saved as {BASE_PATH + name}")


Figure saved as /Users/cryogenic/tvc/bench.audio/data/src/data/.data/bootstrap_mle_elo_ratings.png


In [44]:
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    """Expected win probability of every model against every other model.

    Args:
        elo_ratings: Mapping from model name to Elo rating.
        SCALE: Rating difference scale of the expected-score curve.
        BASE: Base of the exponent in the expected-score curve.
        INIT_RATING: Unused; kept so existing callers keep working.

    Returns:
        Square DataFrame with models sorted by name on both axes. Cell
        (row a, column b) is the probability that a beats b; the diagonal is
        NaN. The row axis is named "model_a" and the column axis "model_b".
    """
    names = sorted(elo_ratings.keys())

    def expected_score(a, b):
        """Probability that `a` beats `b` under the Elo model."""
        return 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))

    # Build the table directly in its final orientation. Each cell is computed once,
    # and the axis names match the content (a transpose would swap them).
    win_prob = pd.DataFrame(
        [[np.nan if a == b else expected_score(a, b) for b in names] for a in names],
        index=names,
        columns=names,
        dtype=float,
    )
    win_prob.index.name = "model_a"
    win_prob.columns.name = "model_b"
    return win_prob

In [45]:
# Win probabilities implied by the bootstrap-median MLE Elo ratings
win_rate = predict_win_rate(dict(bootstrap_elo_lu.quantile(0.5)))
# Highest mean predicted win rate first, capped at 30 models
ordered_models = win_rate.mean(axis=1).sort_values(ascending=False).index[:30]

# Show labels instead of ids on both axes; an id without a label is kept as-is
win_rate_labeled = win_rate.rename(index=model_ids_to_labels, columns=model_ids_to_labels)
ordered_models_labeled = [
    model_ids_to_labels.get(model_id, model_id) for model_id in ordered_models
]

fig = px.imshow(
    win_rate_labeled.loc[ordered_models_labeled, ordered_models_labeled],
    title="Predicted Win Rate Using Elo Ratings for Model A in an A vs. B Battle",
    text_auto=".2f",
    color_continuous_scale='RdBu',
)
# Both update_* calls return the figure, so they chain
fig.update_layout(
    xaxis_title="Model B",
    yaxis_title="Model A",
    xaxis_side="top",
    height=900,
    width=900,
    title_y=0.07,
    title_x=0.5,
).update_traces(
    hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Win Rate: %{z}<extra></extra>"
)
fig.show()

# Write the chart as a PNG as well
name = "predicted_win_rate_elo_ratings.png"
fig.write_image(BASE_PATH + name)
print(f"Figure saved as {BASE_PATH + name}")


Figure saved as /Users/cryogenic/tvc/bench.audio/data/src/data/.data/predicted_win_rate_elo_ratings.png


In [46]:
# Same seed as the MLE bootstrap above, then the same procedure with the online estimator
np.random.seed(42)
bootstrap_online_elo = get_bootstrap_result(battles_df, compute_online_elo, BOOTSTRAP_ROUNDS)

bootstrap: 100%|██████████| 100/100 [00:00<00:00, 312.63it/s]


In [47]:
# Bootstrap medians of both estimators side by side, sorted by the MLE column
preety_print_two_ratings(bootstrap_elo_lu.quantile(.5),
                         bootstrap_online_elo.quantile(.5),
                         column_names=["Bootstrap Median of MLE Elo", "Bootstrap Median of Online Elo"])

Unnamed: 0,Model,Bootstrap Median of MLE Elo,Bootstrap Median of Online Elo
1,vapi-gpt-4-turbo-11labs-deepgram-no-custom,892,839
2,retell-gpt-3.5-turbo-11labs-no-custom,868,828
3,hume-default,854,823
4,retell-gpt-4-turbo-openai-nova-no-custom,823,810
5,vapi-gpt-3.5-turbo-playht-deepgram-company-req...,800,800


In [48]:
# Interval plot for the online estimator; the bare `fig` makes it the cell output
fig = visualize_bootstrap_scores(bootstrap_online_elo, "Bootstrap of Online Elo Rating Estimates")
fig

In [49]:
# NOTE(review): this cell redefines predict_win_rate, which an earlier cell already
# defines; the later definition is the one in effect. Consider deleting one of them.
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    """Expected win probability of every model against every other model.

    Args:
        elo_ratings: Mapping from model name to Elo rating.
        SCALE: Rating difference scale of the expected-score curve.
        BASE: Base of the exponent in the expected-score curve.
        INIT_RATING: Unused; kept so existing callers keep working.

    Returns:
        Square DataFrame with models sorted by name on both axes. Cell
        (row a, column b) is the probability that a beats b; the diagonal is
        NaN. The row axis is named "model_a" and the column axis "model_b".
    """
    names = sorted(elo_ratings.keys())

    def expected_score(a, b):
        """Probability that `a` beats `b` under the Elo model."""
        return 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))

    # Build the table directly in its final orientation. Each cell is computed once,
    # and the axis names match the content (a transpose would swap them).
    win_prob = pd.DataFrame(
        [[np.nan if a == b else expected_score(a, b) for b in names] for a in names],
        index=names,
        columns=names,
        dtype=float,
    )
    win_prob.index.name = "model_a"
    win_prob.columns.name = "model_b"
    return win_prob

In [50]:
# NOTE(review): this cell repeats the earlier predicted-win-rate heatmap with the same
# input (bootstrap_elo_lu, the MLE bootstrap). It follows the online-Elo plot, so
# bootstrap_online_elo may have been intended — confirm, or delete the duplicate.
win_rate = predict_win_rate(dict(bootstrap_elo_lu.quantile(0.5)))
ordered_models = win_rate.mean(axis=1).sort_values(ascending=False).index
ordered_models = ordered_models[:30]

# Replace model ids with labels on both axes

# ids without a label are kept unchanged by .get(model, model)
win_rate_labeled = win_rate.rename(index=model_ids_to_labels, columns=model_ids_to_labels)
ordered_models_labeled = [model_ids_to_labels.get(model, model) for model in ordered_models]

fig = px.imshow(win_rate_labeled.loc[ordered_models_labeled, ordered_models_labeled],
                color_continuous_scale='RdBu', text_auto=".2f",
                title="Predicted Win Rate Using Elo Ratings for Model A in an A vs. B Battle")
fig.update_layout(xaxis_title="Model B",
                  yaxis_title="Model A",
                  xaxis_side="top", height=900, width=900,
                  title_y=0.07, title_x=0.5)
fig.update_traces(hovertemplate=
                  "Model A: %{y}<br>Model B: %{x}<br>Win Rate: %{z}<extra></extra>")
fig

In [51]:
def sample_battle_even(battles, n_per_battle):
    """Draw the same number of battles for every ordered model pairing.

    Args:
        battles: Battles with `modelA_label` and `modelB_label` columns.
        n_per_battle: Rows to draw, with replacement, from each
            (modelA_label, modelB_label) group.

    Returns:
        DataFrame with `n_per_battle` rows per pairing that occurs in
        `battles`, all original columns, and a fresh 0..n-1 index.
    """
    groups = battles.groupby(["modelA_label", "modelB_label"])
    # GroupBy.sample performs the per-group draw natively and keeps the grouping
    # columns. Routing it through groupby.apply relies on apply operating on the
    # grouping columns, which pandas has deprecated.
    resampled = groups.sample(n=n_per_battle, replace=True)
    return resampled.reset_index(drop=True)

In [52]:
num_samples = 50
battles_even = sample_battle_even(battles_df, num_samples)
# Check the resample: count of rows per (modelA_label, modelB_label) pairing
pd.pivot_table(battles_even, index="modelA_label", columns="modelB_label", aggfunc="size", fill_value=0)

modelB_label,hume-default,retell-gpt-3.5-turbo-11labs-no-custom,retell-gpt-4-turbo-openai-nova-no-custom,vapi-gpt-3.5-turbo-playht-deepgram-company-requested-params,vapi-gpt-4-turbo-11labs-deepgram-no-custom
modelA_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hume-default,0,50,50,50,50
retell-gpt-3.5-turbo-11labs-no-custom,50,0,50,50,50
retell-gpt-4-turbo-openai-nova-no-custom,50,50,0,50,50
vapi-gpt-3.5-turbo-playht-deepgram-company-requested-params,50,50,50,0,50
vapi-gpt-4-turbo-11labs-deepgram-no-custom,50,50,50,50,0


In [53]:
# Sampling Battles Evenly
def get_bootstrap_even_sample(battles, n_per_battle, func_compute_elo, num_round=BOOTSTRAP_ROUNDS):
    """Re-estimate ratings on resamples that weight every model pairing equally.

    Args:
        battles: DataFrame of battles.
        n_per_battle: Rows drawn per pairing in each round (see
            `sample_battle_even`).
        func_compute_elo: Callable taking a battles DataFrame and returning a
            mapping from model id to rating.
        num_round: Number of resamples.

    Returns:
        DataFrame with one row per round and one column per model, columns
        ordered by descending median rating.
    """
    estimates = [
        func_compute_elo(sample_battle_even(battles, n_per_battle))
        for _ in tqdm(range(num_round), desc="sampling battles evenly")
    ]
    ratings = pd.DataFrame(estimates)
    median_order = ratings.median().sort_values(ascending=False).index
    return ratings[median_order]

In [54]:
print("number of samples per battle pair:", num_samples)
# NOTE(review): num_round is the literal 100 here rather than BOOTSTRAP_ROUNDS (also 100);
# confirm whether the two are meant to stay in sync.
bootstrap_even_lu = get_bootstrap_even_sample(battles_df, num_samples, compute_mle_elo, num_round=100)

number of samples per battle pair: 50


sampling battles evenly: 100%|██████████| 100/100 [00:00<00:00, 121.84it/s]


In [55]:
# Interval plot for the evenly-sampled MLE bootstrap; the bare `fig` is the cell output
fig = visualize_bootstrap_scores(bootstrap_even_lu, "Bootstrap of MLE Elo Estimates - Even sample")
fig

In [56]:
# Calculate final Elo scores for each model
final_elo_scores = bootstrap_even_lu.median().sort_values(ascending=False)

# Create a dictionary with model names and their Elo scores
final_elo_dict = {model: round(score, 2) for model, score in final_elo_scores.items()}

# Pretty print the final Elo scores
print("Final Elo Scores for Each Model:")
for model, score in final_elo_dict.items():
    print(f"{model}: {score}")

# # Output to JSON

with open(BASE_PATH + 'final_elo_scores.json', 'w') as f:
    json.dump(final_elo_dict, f, indent=2)

print("\nFinal Elo scores have been saved to 'final_elo_scores.json'")



Final Elo Scores for Each Model:
ef2b04de-9ad5-4c66-b448-cfc38d64b1ea: 919.52
f02e7ac1-5222-4e1a-ae9a-b1682d7ea673: 906.92
2c52061b-e368-4699-a2e2-3f581650878c: 900.03
16b5800e-33e6-496b-aa22-1c59bdb28224: 850.63
8740fd77-5d04-4da0-94a8-df853961ff09: 800.0

Final Elo scores have been saved to 'final_elo_scores.json'
