In [6]:
import os
import json
from collections import defaultdict
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
pd.options.display.float_format = '{:.2f}'.format

# Obtaining and Cleaning the Eval Data

LLM的评估结果保存在文件`pair_results_parallel_results.jsonl`, 首先将该文件处理为如下格式：
[
  {
    "model_a": "vicuna-13b",
    "model_b": "koala-13b",
    "win": "tie"
  },
  {
    "model_a": "vicuna-13b",
    "model_b": "llama-13b",
    "win": "model_b"
  }
]

In [None]:
# Parse the raw judge output into battle records and persist them as JSON lines.
# NOTE(review): the markdown above refers to `pair_results_parallel_results.jsonl`
# while this cell opens `pair_results_parallel.jsonl` -- confirm which name is correct.
with open('pair_results_parallel.jsonl', 'r', encoding='utf-8') as f:
    raw_data = [json.loads(line) for line in f]

battles = []
for idx, data in enumerate(raw_data):
    # Presumably data[0] is the request payload and data[1] the judge's response
    # (inferred from the 'messages' / 'choices' keys) -- verify against the producer.
    try:
        judgment = data[1]['choices'][0]['message']['content'].lower().strip().strip('.')
    except (KeyError, IndexError, TypeError, AttributeError):
        # Only a missing or malformed response is skipped; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        print(f"{idx} has no LLM response")
        continue
    # SECURITY NOTE(review): eval() executes whatever expression the file contains.
    # If the content is always a plain dict literal, ast.literal_eval would be a
    # safer replacement -- confirm the format before switching.
    user_content_keys = list(eval(data[0]['messages'][1]['content']).keys())
    # The third and fourth keys are expected to look like "<model>_output".
    model_A = user_content_keys[2].split('_output')[0]
    model_B = user_content_keys[3].split('_output')[0]

    battles.append({
        "model_a": model_A,
        "model_b": model_B,
        "win": judgment
    })

print(f"battles' len: {len(battles)}")
# Mode 'w' truncates any previous file, so no explicit removal is needed and the
# file is opened once instead of once per record.
with open('battles.jsonl', 'w', encoding='utf-8') as f:
    for battle in battles:
        f.write(json.dumps(battle, ensure_ascii=False) + '\n')


In [3]:
# Normalise the judge's free-text verdict to the labels used by Chatbot Arena:
# text containing "it's a tie" becomes "tie", and "model_a/b is better" becomes
# "model_a" / "model_b".
with open('battles.jsonl', 'r', encoding='utf-8') as f:
    battles = [json.loads(line) for line in f]

new_battles = []
for battle in battles:
    verdict = battle['win'].lower()
    if "it's a tie" in verdict:
        label = "tie"
    elif "model_a is better" in verdict:
        label = "model_a"
    elif "model_b is better" in verdict:
        label = "model_b"
    else:
        # Unrecognised verdicts are kept untouched so the validation cell
        # further down can report them for manual cleaning.
        new_battles.append(battle)
        continue
    new_battles.append({"model_a": battle['model_a'], "model_b": battle['model_b'], "win": label})

# Mode 'w' truncates the existing file; it is opened once rather than per record.
with open('battles.jsonl', 'w', encoding='utf-8') as f:
    for battle in new_battles:
        f.write(json.dumps(battle, ensure_ascii=False) + '\n')

In [9]:
# Check that every stored battle is well-formed; malformed records must be
# cleaned by hand.
options = ["model_a", "model_b",  "tie"]
models = ['llama-7b',
          'llama-13b',
          'vicuna-7b',
          'vicuna-13b',
          'llama-7b-lora-iio',
          'llama-13b-lora-iio',
          'vicuna-7b-lora-iio',
          'vicuna-13b-lora-iio',
          'llama-7b-lora-turn',
          'llama-13b-lora-turn',
          # To Add New Models
          ]

with open('battles.jsonl', 'r', encoding='utf-8') as f:
    battles = [json.loads(line) for line in f]

# Report at most one problem per record, using 1-based record numbers so they
# line up with line numbers in battles.jsonl.
for line_no, battle in enumerate(battles, start=1):
    if battle['model_a'] not in models:
        print(f"{line_no} has wrong model_a name")
    elif battle['model_b'] not in models:
        print(f"{line_no} has wrong model_b name")
    elif battle['win'] not in options:
        print(f"{line_no} has wrong win description")
# Fix the reported records by hand, then re-run this cell until nothing is printed.

# Exploratory Analysis

Before computing the Elo ratings, we first conduct some basic exploratory analysis to highlight a few key properties and caveats of this data.

In [58]:
# Read the cleaned battle records (one JSON object per line) into a DataFrame.
with open('battles.jsonl', 'r', encoding='utf-8') as f:
    battle_records = []
    for line in f:
        battle_records.append(json.loads(line))
battles = pd.DataFrame(battle_records)
battles

Unnamed: 0,model_a,model_b,win
0,vicuna-7b,llama-7b-lora-iio,model_a
1,llama-13b,llama-13b-lora-iio,model_b
2,llama-7b-lora-iio,llama-13b-lora-iio,model_a
3,vicuna-13b-lora-iio,llama-7b-lora-iio,model_a
4,llama-13b,llama-7b-lora-iio,model_b
...,...,...,...
6500,vicuna-7b,llama-7b-lora-turn,model_a
6501,llama-13b-lora-turn,vicuna-13b-lora-iio,model_b
6502,llama-7b-lora-turn,llama-7b-lora-iio,model_a
6503,llama-7b-lora-iio,llama-13b-lora-turn,model_b


## Significant Number of Ties

We allowed the user to declare a tie between the pairs of models.  To collect additional data, later in the tournament we also allowed the user to declare a tie in which both models were bad.  There were a significant number of tied outcomes. 

In [59]:
# Bar chart of how often each label occurs in the `win` column.
outcome_counts = battles["win"].value_counts()
fig = px.bar(outcome_counts, title="Counts of Battle Outcomes",
             text_auto=True, height=400)
fig.update_layout(
    xaxis_title="Battle Outcome",
    yaxis_title="Count",
    showlegend=False,
)
fig

In [60]:
# Keep only the battles whose `win` label does not contain "tie".
tie_mask = battles["win"].str.contains("tie")
battles_no_ties = battles[~tie_mask]

## Non-uniform Model Frequency

When we initially launched the tournament, we had prior information on the likely ranking based on our benchmarks and chose to pair models according to this ranking. We gave preference to what we believed would be strong pairings based on this ranking. However, we later switched to uniform sampling to get better overall coverage of the rankings.
Towards the end of the tournament, we also introduced a new model `fastchat-t5-3b`.
All of these result in non-uniform model frequency.

In [61]:
# Count appearances of each model on either side of a battle and plot them.
model_appearances = pd.concat([battles["model_a"], battles["model_b"]])
battle_count_per_model = model_appearances.value_counts()
fig = px.bar(battle_count_per_model,
             title="Battle Count for Each Model", text_auto=True)
fig.update_layout(
    xaxis_title="model",
    yaxis_title="Battle Count",
    showlegend=False,
)
fig

We examine the number of pairings for each combination of models.

In [62]:
def visualize_battle_count(battles, title):
    """Build a heatmap of how many battles each pair of models fought.

    Args:
        battles: DataFrame with at least `model_a` and `model_b` columns,
            one row per battle.
        title: Title shown on the figure.

    Returns:
        A plotly figure whose cell (row, col) holds the number of battles
        between the two models regardless of which side each was on. Models
        are ordered by their total battle count, descending.
    """
    ptbl = pd.pivot_table(battles, index="model_a", columns="model_b", aggfunc="size",
                          fill_value=0)
    # A model may appear on only one side (only as model_a or only as model_b),
    # which makes the pivot non-square; adding it to its transpose would then
    # produce NaN cells. Pad both axes to the full model set first.
    all_models = list(ptbl.index.union(ptbl.columns))
    ptbl = ptbl.reindex(index=all_models, columns=all_models, fill_value=0)
    # Fold (a, b) and (b, a) together so the matrix is symmetric.
    battle_counts = ptbl + ptbl.T
    ordering = battle_counts.sum().sort_values(ascending=False).index
    fig = px.imshow(battle_counts.loc[ordering, ordering],
                    title=title, text_auto=True, width=600)
    fig.update_layout(xaxis_title="Model B",
                      yaxis_title="Model A",
                      xaxis_side="top",
                      title_y=0.07, title_x=0.5)
    fig.update_traces(hovertemplate=
                      "Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig

# Static image export writes straight to the given path, so make sure the
# output directory exists before the first figure is saved.
os.makedirs("./figs", exist_ok=True)
fig = visualize_battle_count(battles, title="Battle Count of Each Combination of Models")
fig.write_image("./figs/battle_counts.svg")
fig

### Battles Excluding Ties

In [63]:
# Same pairwise heatmap, restricted to battles that had a winner.
fig = visualize_battle_count(
    battles_no_ties,
    title="Battle Count for Each Combination of Models (without Ties)",
)
fig.write_image("./figs/battle_counts_no_ties.svg")
fig

### Counting Ties

In [64]:
# Pairwise heatmap of tied battles only.
tie_battles = battles[battles['win'].str.contains("tie")]
fig = visualize_battle_count(tie_battles, title="Tie Count for Each Combination of Models")
fig.write_image("./figs/battle_counts_ties.svg")
fig

## Preliminary Ranking

Using just the average win rate against all other models we can already compute an estimated leaderboard.
However, this method may not be as scalable as the Elo rating system that we will use later because this method requires data from all model combinations.

# Elo Ratings

The [Elo rating system](https://en.wikipedia.org/wiki/Elo_rating_system) is a method for calculating the relative skill levels of players, which has been widely adopted in chess, sports, and MOBA games. The difference in the ratings between two players serves as a predictor of the outcome of a match. The Elo rating system works well for our case because we have multiple models and we run pairwise battles between them.
Next, we compute the Elo ratings of these models.

### Compute Ratings

In [65]:
def compute_elo(battles, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Run sequential Elo updates over the battles, in row order.

    Args:
        battles: DataFrame with `model_a`, `model_b` and `win` columns.
        K: Step size applied to each rating update.
        SCALE: Divisor of the rating difference in the expected-score formula.
        BASE: Base of the exponent in the expected-score formula.
        INIT_RATING: Rating given to a model the first time it is seen.

    Returns:
        A defaultdict mapping model name to its final rating.

    Raises:
        Exception: If `win` is not "model_a", "model_b", "tie" or
            "tie (bothbad)".
    """
    rating = defaultdict(lambda: INIT_RATING)

    matches = battles[['model_a', 'model_b', 'win']]
    for model_a, model_b, win in matches.itertuples(index=False):
        rating_a = rating[model_a]
        rating_b = rating[model_b]
        # Expected score of each side given the current rating gap.
        expected_a = 1 / (1 + BASE ** ((rating_b - rating_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((rating_a - rating_b) / SCALE))

        # Actual score from model_a's point of view.
        if win == "model_a":
            score_a = 1
        elif win == "model_b":
            score_a = 0
        elif win in ("tie", "tie (bothbad)"):
            score_a = 0.5
        else:
            raise Exception(f"unexpected vote {win}")

        rating[model_a] += K * (score_a - expected_a)
        rating[model_b] += K * (1 - score_a - expected_b)

    return rating

In [66]:
# Compute the ratings and present them as a leaderboard, best model first.
elo_ratings = compute_elo(battles)
df = (
    pd.DataFrame(list(elo_ratings.items()), columns=["Model", "Elo rating"])
    .sort_values("Elo rating", ascending=False)
    .reset_index(drop=True)
)
# Truncate to whole numbers for display.
df["Elo rating"] = df["Elo rating"].astype(int)
df

Unnamed: 0,Model,Elo rating
0,vicuna-13b,1220
1,vicuna-7b-lora-iio,1156
2,vicuna-13b-lora-iio,1134
3,vicuna-7b,1087
4,llama-13b-lora-iio,1040
5,llama-13b-lora-turn,991
6,llama-7b-lora-turn,949
7,llama-7b-lora-iio,920
8,llama-13b,816
9,llama-7b,682


In [67]:
def predict_win_rate(elo_ratings, K=32, SCALE=400, BASE=10, INIT_RATING=1000):
    """Predict pairwise win rates from Elo ratings.

    Args:
        elo_ratings: Mapping of model name to Elo rating.
        K: Unused; kept so the signature mirrors `compute_elo`.
        SCALE: Divisor of the rating difference in the expected-score formula.
        BASE: Base of the exponent in the expected-score formula.
        INIT_RATING: Unused; kept so the signature mirrors `compute_elo`.

    Returns:
        A square DataFrame whose rows (`model_a`) and columns (`model_b`) are
        the model names in sorted order. Cell (a, b) is the expected score of
        a against b; the diagonal is NaN.
    """
    names = sorted(elo_ratings.keys())

    # One entry per model a, listing its expected score against every model b.
    # `np.nan` is used because the `np.NAN` alias no longer exists in NumPy 2.
    data = {
        a: [
            np.nan if a == b
            else 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
            for b in names
        ]
        for a in names
    }

    # The dict keys become columns, so transpose to put model a on the rows,
    # and only then name the axes so the labels match their contents.
    df = pd.DataFrame(data, index=names).T
    df.index.name = "model_a"
    df.columns.name = "model_b"
    return df

In [68]:
# Heatmap of Elo-predicted win rates, with models ordered by their mean
# predicted win rate (best first).
win_rate = predict_win_rate(compute_elo(battles))
mean_win_rate = win_rate.mean(axis=1)
ordered_models = mean_win_rate.sort_values(ascending=False).index
win_rate_sorted = win_rate.loc[ordered_models, ordered_models]
fig = px.imshow(
    win_rate_sorted,
    color_continuous_scale='RdBu',
    text_auto=".2f",
    title="Predicted Win Rate Using Elo Ratings for Model A in an A vs. B Battle",
)
fig.update_layout(
    xaxis_title="Model B",
    yaxis_title="Model A",
    xaxis_side="top",
    title_y=0.07,
    title_x=0.5,
)
hover_text = "Model A: %{y}<br>Model B: %{x}<br>Win Rate: %{z}<extra></extra>"
fig.update_traces(hovertemplate=hover_text)
fig.write_image("./figs/elo_predicted_win_rate.svg")
fig

# Links



Some good resources to learn more about Elo rating systems:
- Wikipedia https://en.wikipedia.org/wiki/Elo_rating_system
- An introduction video https://www.youtube.com/watch?v=AsYfbmp0To0
- A FiveThirtyEight article https://fivethirtyeight.com/methodology/how-our-nfl-predictions-work/
