In [1]:
from collections import defaultdict
import json, math, gdown
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
import requests
pd.options.display.float_format = '{:.2f}'.format

# Obtaining and Cleaning the Tournament Data
We are hosting the initial tournament results as a JSON file on Google Drive. We use the `gdown` function to download the data. The data contains all the battles and voting results collected for ranking models.

In [2]:
# we use the latest data
# NOTE(review): path_dir is an absolute, machine-specific directory; point it at
# the folder that holds the battle JSON before running on another machine.
path_dir = "/mnt/hdd1/ljiahao/xianglin/llm-as-a-judge-attack/style_control"
path = f"{path_dir}/clean_battle_20240826_public.json"
# Parse the file with the json module; in the cells shown here the result is
# only used to display its first record.
with open(path, 'rb') as file:
    response = json.load(file)
response[0]

{'model_a': 'chatglm-6b',
 'model_b': 'koala-13b',
 'winner': 'model_b',
 'judge': '2e9c29aa140b8e50643235eab01dc9ea',
 'turn': 1,
 'anony': True,
 'language': 'English',
 'tstamp': 1682351591.1322,
 'conv_metadata': {'sum_user_tokens': 10,
  'sum_assistant_a_tokens': 171,
  'sum_assistant_b_tokens': 373,
  'context_a_tokens': 10,
  'context_b_tokens': 10,
  'turns': 1,
  'header_count_a': {'h1': 0, 'h2': 0, 'h3': 0, 'h4': 0, 'h5': 0, 'h6': 0},
  'list_count_a': {'ordered': 0, 'unordered': 0},
  'bold_count_a': {'**': 0, '__': 0},
  'header_count_b': {'h1': 0, 'h2': 0, 'h3': 0, 'h4': 0, 'h5': 0, 'h6': 0},
  'list_count_b': {'ordered': 5, 'unordered': 0},
  'bold_count_b': {'**': 0, '__': 0}},
 'is_code': True,
 'is_refusal': False,
 'dedup_tag': {'high_freq': False, 'sampled': True},
 'category_tag': {'if_v0.1': {'if': False, 'score': 1},
  'math_v0.1': {'math': False},
  'criteria_v0.1': {'specificity': True,
   'domain_knowledge': True,
   'complexity': False,
   'problem_solving': F

In [None]:
# Read the same JSON file into a DataFrame and order the rows by ascending
# "tstamp"; sort_values keeps each row's original index label.
with open(path, 'rb') as file:
    battles = pd.read_json(file).sort_values(ascending=True, by=["tstamp"])
battles.head()

In [None]:
# Show one conv_metadata entry. NOTE(review): [0] on an integer-labelled Series
# selects by index label, so after the sort above this is presumably the row
# that was first in the file, not necessarily the earliest battle - confirm.
battles.conv_metadata[0]

In [None]:
# Preview the first rows again (same call as at the end of the loading cell).
battles.head()

In [None]:
# we use anony battles only for leaderboard
battles = battles[battles["anony"] == True]

# we de-duplicate top 0.1% redundant prompts
# see https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication
# A row is kept when its dedup_tag dict has a truthy "sampled" entry; a missing
# key counts as False. The "Before" count is taken after the anony filter.
print("Before dedup: ", len(battles))
battles = battles[battles["dedup_tag"].apply(lambda x: x.get("sampled", False))]
print("After dedup: ", len(battles))


## [Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model)

In [None]:
def preety_print_model_ratings(ratings):
    """Turn a name -> rating mapping into a ranked two-column table.

    Args:
        ratings: mapping-like object exposing ``keys()`` and item lookup
            (for example a dict or a pandas Series).

    Returns:
        DataFrame with columns "Model" and "Elo rating", ordered from the
        highest to the lowest rating, whose index starts at 1 so that it
        reads as a rank.
    """
    rows = []
    for name in ratings.keys():
        rows.append([name, ratings[name]])

    table = pd.DataFrame(rows, columns=["Model", "Elo rating"])
    table = table.sort_values("Elo rating", ascending=False)
    table = table.reset_index(drop=True)
    # shift the 0-based positions so the best model is shown as rank 1
    table.index = table.index + 1
    return table

def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    """Show two sets of ratings side by side, ranked by the first one.

    Args:
        ratings_1: mapping-like object (``keys()`` plus item lookup) that
            determines which models are listed and the sort order.
        ratings_2: mapping-like object that must contain every key of
            ``ratings_1``.
        column_names: sequence whose first two entries label the two
            rating columns.

    Returns:
        DataFrame with a "Model" column and the two rating columns, sorted
        by the first rating in descending order, with a 1-based index.
        Ratings are converted to integers by adding 0.5 and truncating.
    """
    rows = [[name, ratings_1[name], ratings_2[name]] for name in ratings_1.keys()]
    first_col, second_col = column_names[0], column_names[1]

    table = pd.DataFrame(rows, columns=["Model", first_col, second_col])
    table = table.sort_values(first_col, ascending=False).reset_index(drop=True)
    for col in (first_col, second_col):
        table[col] = (table[col] + 0.5).astype(int)
    # shift the 0-based positions so the best model is shown as rank 1
    table.index = table.index + 1
    return table

In [None]:
def compute_bt(
    df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
):
    """Estimate Bradley-Terry ratings from pairwise battle outcomes.

    Battles are aggregated into per-pair counts, and a weighted logistic
    regression without intercept or penalty is fitted on one pair of rows per
    ordered (model, model) combination.

    Args:
        df: DataFrame with "model_a", "model_b" and "winner" columns. The
            winner values handled here are "model_a", "model_b", "tie" and
            "tie (bothbad)".
        SCALE: multiplier applied to the fitted coefficients.
        BASE: the feature values are +/- log(BASE).
        INIT_RATING: offset added to the scaled coefficients.
        sample_weight: accepted but never read in this function; the
            regression weights are derived from the battle counts instead.

    Returns:
        pandas Series of ratings indexed by model name, sorted descending.
        If "mixtral-8x7b-instruct-v0.1" is among the models, all ratings are
        shifted so that this model sits at 1114.
    """
    from sklearn.linear_model import LogisticRegression
    # count of battles won by the model in the "model_a" seat, per (model_a, model_b)
    ptbl_a_win = pd.pivot_table(
        df[df["winner"] == "model_a"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # if no tie, create a zero matrix
    if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["winner"].isin(["tie", "tie (bothbad)"])],
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        # symmetrise: a tie counts for the pair regardless of seat order
        ptbl_tie = ptbl_tie + ptbl_tie.T

    # count of battles won by the model in the "model_b" seat, per (model_a, model_b)
    ptbl_b_win = pd.pivot_table(
        df[df["winner"] == "model_b"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # entry [x, y] weighs "x beat y": every win counts twice, every tie once
    # for each side. Frames are added by label, so labels missing from one of
    # them produce NaN entries (skipped in the loop below).
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    # model name -> column position in the design matrix
    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    p = len(models)
    # two rows per ordered pair of distinct models; trimmed to cur_row later
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # if nan skip
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            # row 1: "m_a beats m_b" (label 1), weighted by m_a's wins over m_b
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            # row 2: same features with label 0, weighted by m_b's wins over m_a
            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    # drop the rows that were reserved but not filled because pairs were skipped
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    if "mixtral-8x7b-instruct-v0.1" in models.index:
        # anchor
        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

In [None]:
# Fit Bradley-Terry ratings on the filtered battles and show them as a ranked table.
bt_ratings = compute_bt(battles)
preety_print_model_ratings(bt_ratings)

### Compute Bootstrap Confidence Intervals for BT scores

We can further use bootstrap to estimate the confidence intervals as well.

In [None]:
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Re-estimate ratings on bootstrap resamples of the battles.

    Args:
        battles: DataFrame of battles; each round draws a same-size sample
            of its rows with replacement.
        func_compute_elo: callable that takes such a sample and returns the
            ratings for it (one row of the result).
        num_round: number of bootstrap rounds.

    Returns:
        DataFrame with one row per round, with columns ordered by
        descending median across rounds.
    """
    samples = [
        func_compute_elo(battles.sample(frac=1.0, replace=True))
        for _ in tqdm(range(num_round), desc="bootstrap")
    ]
    results = pd.DataFrame(samples)
    column_order = results.median().sort_values(ascending=False).index
    return results[column_order]

In [None]:
BOOTSTRAP_ROUNDS = 10 # 10 for demo purpose

# Seed NumPy's global RNG before resampling. NOTE(review): this makes the run
# repeatable only if DataFrame.sample falls back to that global state when no
# random_state is passed - confirm against the installed pandas version.
np.random.seed(42)
bootstrap_elo_lu = get_bootstrap_result(battles, compute_bt, BOOTSTRAP_ROUNDS)

In [None]:
def visualize_bootstrap_scores(df, title):
    """Plot the median rating of each model with a 95% bootstrap interval.

    Args:
        df: DataFrame with one row per bootstrap round and one column per
            model.
        title: title of the figure.

    Returns:
        Plotly scatter figure with models on the x axis (highest median
        first), the median rating on the y axis, and asymmetric error bars
        reaching to the 2.5% and 97.5% quantiles.
    """
    quantiles = {
        "lower": df.quantile(.025),
        "rating": df.quantile(.5),
        "upper": df.quantile(.975),
    }
    bars = pd.DataFrame(quantiles).reset_index(names="model")
    bars = bars.sort_values("rating", ascending=False)

    # plotly expects the error bars as distances from the plotted point
    bars["error_y"] = bars["upper"] - bars["rating"]
    bars["error_y_minus"] = bars["rating"] - bars["lower"]
    bars["rating_rounded"] = np.round(bars["rating"], 2)

    fig = px.scatter(
        bars,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
    )
    fig.update_layout(xaxis_title="Model", yaxis_title="Rating", height=600)
    return fig

# Plot the bootstrap distribution of the plain (not style-controlled) BT ratings.
fig = visualize_bootstrap_scores(bootstrap_elo_lu, "Bootstrap of BT Rating Estimates")
fig

# Style Control Arena
Check our blog: [Controlling for Style in Chatbot Arena]()

**Background**: We controlled for the effect of style by adding extra “style features” into our Bradley-Terry regression. This is a [standard technique](https://en.wikipedia.org/wiki/Controlling_for_a_variable) in statistics, and has been recently used in LLM evaluations [1](https://arxiv.org/abs/2404.04475). The idea is that, by including any confounding variables (e.g. response length) in the regression, we can attribute any increase in strength to the confounder, as opposed to the model. Then, the Bradley-Terry coefficient will be more reflective of the model’s intrinsic properties, as opposed to undesirable confounders. The definition of a confounder is to some extent up to our interpretation; as our style features, we use the (normalized) differences in response length and in the numbers of markdown headers, lists, and bold elements.

Formally, we define the length difference as a feature:
- Token length difference between answer A and answer B

$$\text{normalize }(\frac{\text{length}_A - \text{length}_B}{\text{length}_A + \text{length}_B})$$

Similarly, we define the same kind of feature for three markdown elements:
- Markdown header elements
- Markdown list elements
- Markdown bold elements

We normalize each feature before applying logistic regression.

In [None]:
# Keys of each battle's "conv_metadata" dict that are used as style features.
# The first half belongs to response A and the second half to response B, in
# matching order: construct_style_matrices pairs entry i with entry i + k,
# where k is half the length of this list.
STYLE_CONTROL_ELEMENTS_V1 = [
    "sum_assistant_a_tokens",
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "sum_assistant_b_tokens",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]

In [None]:
def fit_bt(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
    """Fit a logistic regression on (X, Y) and convert it into ratings.

    Args:
        X: design matrix whose first ``len(models)`` columns are the model
            features; any remaining columns are treated as extra (style)
            features.
        Y: binary outcome for each row of ``X``.
        models: pandas Series mapping model name to its column position in
            ``X``.
        indices: optional row selection (list or NumPy array) applied to both
            ``X`` and ``Y`` before fitting. ``None`` or an empty selection
            fits on all rows.
        SCALE: multiplier applied to the fitted coefficients.
        INIT_RATING: offset added to the scaled coefficients.

    Returns:
        Tuple ``(ratings, extra_coef)``: ``ratings`` is a Series of the first
        ``len(models)`` scaled coefficients indexed by model name and sorted
        descending; ``extra_coef`` holds the remaining raw (unscaled)
        regression coefficients.
    """
    from sklearn.linear_model import LogisticRegression

    p = len(models.index)

    lr = LogisticRegression(fit_intercept=False)
    # Test against None explicitly: the truth value of a NumPy index array
    # with more than one element is ambiguous and raises ValueError.
    if indices is not None and len(indices) > 0:
        lr.fit(X[indices], Y[indices])
    else:
        lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    # anchor mixtral-8x7b-instruct-v0.1 at 1114 if applicable
    if "mixtral-8x7b-instruct-v0.1" in models.index:
        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
    return (
        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
        lr.coef_[0][p:],
    )


def construct_style_matrices(
    df,
    BASE=10,
    apply_ratio=[1, 1, 1, 1],
    style_elements=STYLE_CONTROL_ELEMENTS_V1,
    add_one=True,
):
    """Build the design matrix and labels for the style-controlled BT fit.

    Every battle appears twice (the frame is concatenated with itself), so
    that a win counts as two wins and a tie as one win for each side.

    Args:
        df: DataFrame with "model_a", "model_b", "winner" and "conv_metadata"
            columns; each conv_metadata entry is a dict containing the keys
            in ``style_elements``, whose values are either ints or dicts of
            counts (which are summed).
        BASE: the model features are +/- log(BASE).
        apply_ratio: one flag per style feature; for non-zero flags the A-B
            difference is divided by the A+B sum. Not modified in place.
        style_elements: conv_metadata keys, first half for response A and
            second half for response B in matching order.
        add_one: add 1 to every A+B sum before it is used as a divisor.

    Returns:
        Tuple ``(X, Y, models)``: ``X`` has shape (2 * len(df), p + k) with p
        model columns followed by k standardized style columns; ``Y`` is 1.0
        where response A is credited with the win; ``models`` maps model name
        to its column position in ``X``.
    """
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]
    assert len(style_elements) % 2 == 0
    k = len(style_elements) // 2

    X = np.zeros([n, p + k])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # turn each of the specified keys in "conv_metadata" into a vector, one
    # row per style element; dict-valued entries are summed over their values
    style_vector = np.array(
        [
            df.conv_metadata.map(
                lambda x: x[element]
                if type(x[element]) is int
                else sum(x[element].values())
            ).tolist()
            for element in style_elements
        ]
    )

    # rows [:k] describe response A and rows [k:] response B
    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

    if add_one:
        style_sum = style_sum + np.ones(style_diff.shape)

    apply_ratio = np.flatnonzero(apply_ratio)

    style_diff[apply_ratio] /= style_sum[
        apply_ratio
    ]  # Apply ratio where necessary (length, etc)

    style_mean = np.mean(style_diff, axis=1)
    style_std = np.std(style_diff, axis=1)
    # A feature that is constant over all battles has zero spread; dividing by
    # it would fill its column with NaN and break the regression. Dividing by
    # 1 instead leaves that (already centred) column at zero.
    safe_std = np.where(style_std > 0, style_std, 1.0)

    X[:, p:] = ((style_diff - style_mean[:, np.newaxis]) / safe_std[:, np.newaxis]).T

    # one A win => two A win
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) index
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    # only the first copy of each tie is credited to A; the second stays at 0
    tie_idx[len(tie_idx) // 2 :] = False
    Y[tie_idx] = 1.0

    return X, Y, models


def get_bootstrap_result_style_control(X, Y, battles, models, func_compute_elo, num_round=1000):
    """Bootstrap the style-controlled ratings and style coefficients.

    ``X`` and ``Y`` contain every battle twice (rows ``i`` and ``i + k``), so
    only the first ``k`` rows are resampled and the duplicates are rebuilt:
    a non-tie battle contributes its first-half row twice, a tie contributes
    its first-half and its second-half row.

    Args:
        X: design matrix with ``len(models)`` model columns followed by the
            style columns.
        Y: labels aligned with the rows of ``X``.
        battles: DataFrame whose "winner" column has one entry per battle
            (``X.shape[0] / 2`` rows), in the row order of the first half of
            ``X``.
        models: pandas Series mapping model name to column position in ``X``.
            Assumed to be numbered 0..p-1 in column order, as produced by
            construct_style_matrices.
        func_compute_elo: callable ``(X, Y, models=...)`` returning
            ``(ratings, style_coefficients)``.
        num_round: number of bootstrap rounds.

    Returns:
        Tuple ``(df, coefs)``: ``df`` has one row of ratings per round with
        columns ordered by descending median; ``coefs`` is the list of style
        coefficients of each round.
    """
    elos = []
    coefs = []
    assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
    # Since we duplicate the battles when constructing X and Y, we don't want
    # to sample the duplicates
    k = X.shape[0] // 2
    num_models = len(models)
    num_extra = X.shape[1] - num_models

    # loop-invariant helpers: the sampling population and a per-battle tie flag
    population = np.arange(k)
    battles_tie_idx = (battles["winner"] == "tie") | (battles["winner"] == "tie (bothbad)")
    index2tie = np.zeros(k, dtype=bool)
    index2tie[battles_tie_idx] = True

    for _ in tqdm(range(num_round), desc="bootstrap"):
        indices = np.random.choice(population, size=(k), replace=True)

        sampled_is_tie = index2tie[indices]
        nontie_indices = indices[~sampled_is_tie]
        tie_first_half = indices[sampled_is_tie]
        tie_indices = np.concatenate([tie_first_half, tie_first_half + k])

        _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
        _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])

        assert _X.shape == X.shape and _Y.shape == Y.shape

        # a model that was not drawn in this round has an all-zero column
        present = _X[:, :num_models].any(axis=0)
        if present.all():
            sampled_models = models[present]
        else:
            # Drop the columns of absent models and renumber the remaining
            # ones, so that the fitted coefficients line up with the model
            # names and the trailing coefficients are only the style ones.
            keep = np.concatenate([present, np.ones(num_extra, dtype=bool)])
            _X = _X[:, keep]
            sampled_models = pd.Series(
                np.arange(int(present.sum())), index=models.index[present]
            )

        elo, coef = func_compute_elo(_X, _Y, models=sampled_models)
        elos.append(elo)
        coefs.append(coef)

    df = pd.DataFrame(elos)
    return df[df.median().sort_values(ascending=False).index], coefs


# NOTE: a function with this name is already defined in an earlier cell; this
# later definition is the one in effect from here on.
def visualize_bootstrap_scores(df, title):
    """Plot the median rating of each model with a 95% bootstrap interval.

    Args:
        df: DataFrame with one row per bootstrap round and one column per
            model.
        title: title of the figure.

    Returns:
        Plotly scatter figure with models on the x axis (highest median
        first), the median rating on the y axis, and asymmetric error bars
        reaching to the 2.5% and 97.5% quantiles.
    """
    quantiles = {
        "lower": df.quantile(.025),
        "rating": df.quantile(.5),
        "upper": df.quantile(.975),
    }
    bars = pd.DataFrame(quantiles).reset_index(names="model")
    bars = bars.sort_values("rating", ascending=False)

    # plotly expects the error bars as distances from the plotted point
    bars["error_y"] = bars["upper"] - bars["rating"]
    bars["error_y_minus"] = bars["rating"] - bars["lower"]
    bars["rating_rounded"] = np.round(bars["rating"], 2)

    fig = px.scatter(
        bars,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
    )
    fig.update_layout(xaxis_title="Model", yaxis_title="Rating", height=600)
    return fig

In [None]:
# Keep the last 200000 rows of the filtered frame (the most recent battles,
# given the ascending tstamp sort applied at load time).
recent_battles = battles[-200000:] # for demo purpose
len(recent_battles)

In [None]:
# Build the design matrix with style features, then fit ratings and style coefficients.
X, Y, models = construct_style_matrices(recent_battles)
elo_rating_style, style_coef = fit_bt(X, Y, models)

In [None]:
# The positions 0-3 follow the per-side order of STYLE_CONTROL_ELEMENTS_V1:
# token length, headers, lists, bold.
print(f"BT Coefficients for the style elements")
print(f"Length: {round(style_coef[0], 3)}, Markdown Header: {round(style_coef[1], 3)}, Markdown List: {round(style_coef[2], 3)}, Markdown Bold: {round(style_coef[3], 3)}")

The BT coefficient of each style element represents how much impact it has on the win probability. We can see that length has the strongest effect on the win rate. Now let's check the new ranking! Notice that it is quite different from the vanilla leaderboard, which does not control for style.

In [None]:
# Top 20 style-controlled ratings (fit_bt returns them sorted in descending order).
print(elo_rating_style[:20])

In [None]:
# Rebinds the BOOTSTRAP_ROUNDS constant that an earlier cell set to 10.
BOOTSTRAP_ROUNDS = 100

# No seed is set in this cell: the resampling draws from NumPy's global RNG in
# whatever state the earlier cells left it.
bootstrap_df, boostrap_coef = get_bootstrap_result_style_control(
    X, Y, recent_battles, models, fit_bt, num_round=BOOTSTRAP_ROUNDS
)

In [None]:
# Plot the bootstrap distribution of the style-controlled ratings.
fig = visualize_bootstrap_scores(bootstrap_df, f"Bootstrap of BT score Style Controlled")
fig