# Search Arena Leaderboard Analysis

### Setup and Utils

In [None]:
# Create the output directory that the figure-saving code writes into
# (all saved figures use paths of the form "plots/<prefix><name>.<ext>").
%mkdir plots

In [None]:
# kaleido: presumably required by plotly's fig.write_image for PNG export -- TODO confirm
%pip install -U kaleido
# datasets: provides load_dataset, used below to fetch the battle data
%pip install datasets

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-p

In [None]:
# Packages
import math
import numpy as np
import pandas as pd
from datasets import load_dataset
from functools import partial
from scipy.special import expit
from scipy.optimize import minimize
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Output format for saved figures: True -> fig.write_html (.html),
# False -> fig.write_image (.png). Only consulted when a caller passes save=True.
SAVE_HTML = False

In [None]:
########################################
# Battle Analysis Utils
########################################

def compute_pairwise_win_fraction(battles, model_order, limit_show_number=None):
    """Compute, for every ordered model pair, the fraction of battles won by the row model.

    Args:
        battles: DataFrame with "model_a", "model_b" and "winner" columns.
            A row counts as a win only when "winner" is "model_a" or "model_b";
            any other value still counts as a battle in the denominator.
        model_order: Labels used to order the rows and columns of the result.
            When None, models are ordered by their mean pairwise win fraction
            (descending).
        limit_show_number: If not None, keep only the first N models of the order.

    Returns:
        Square DataFrame whose (row, col) entry is
        wins(row over col) / battles(row vs col). Pairs that never met are NaN.
    """
    # Every count table is reindexed onto the same square set of models.
    # Without this, a model that never appears (or never wins) in one position
    # is missing from a pivot table, and pandas' label-aligned arithmetic turns
    # every affected cell into NaN instead of using a zero count.
    all_models = pd.Index(
        pd.unique(pd.concat([battles["model_a"], battles["model_b"]]))
    )

    def _pair_counts(frame):
        """Count rows of `frame` per (model_a, model_b) on the full model grid."""
        if len(frame) == 0:
            return pd.DataFrame(0, index=all_models, columns=all_models)
        counts = pd.pivot_table(
            frame,
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        return counts.reindex(index=all_models, columns=all_models, fill_value=0)

    # Times each model wins as Model A
    a_win_ptbl = _pair_counts(battles[battles["winner"] == "model_a"])
    # Times each model wins as Model B
    b_win_ptbl = _pair_counts(battles[battles["winner"] == "model_b"])
    # Number of battles for each (A, B) ordering
    num_battles_ptbl = _pair_counts(battles)

    # Proportion of wins of the row model over the column model, combining the
    # battles where the row model was A with those where it was B.
    row_beats_col_freq = (a_win_ptbl + b_win_ptbl.T) / (
        num_battles_ptbl + num_battles_ptbl.T
    )
    if model_order is None:
        prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
        model_order = list(prop_wins.keys())
    if limit_show_number is not None:
        model_order = model_order[:limit_show_number]
    # Arrange rows and columns according to the requested ordering
    row_beats_col = row_beats_col_freq.loc[model_order, model_order]
    return row_beats_col


def get_median_elo_from_bootstrap(bootstrap_df):
    """Return each column's bootstrap median as an integer, keyed by column label.

    Each median is converted with ``int(value + 0.5)``: add one half, then
    truncate toward zero.
    """
    column_medians = bootstrap_df.quantile(0.5)
    return {label: int(value + 0.5) for label, value in column_medians.items()}


def get_matchups_models(df):
    """Encode the model names of each battle as integer ids.

    Returns:
        matchups: (N, 2) integer array; column 0 is the id of model_a and
            column 1 the id of model_b for each row of `df`.
        models: list of model names where a model's id is its list position
            (ids are assigned in order of first appearance, scanning all of
            model_a first and then model_b).
    """
    n_battles = len(df)
    stacked_names = pd.concat([df["model_a"], df["model_b"]])
    codes, unique_names = pd.factorize(stacked_names)
    # first half of the codes belongs to model_a, second half to model_b
    ids_a, ids_b = codes[:n_battles], codes[n_battles:]
    return np.column_stack([ids_a, ids_b]), unique_names.to_list()


def preprocess_for_elo(df):
    """
    Build the numpy inputs used for Elo-style fitting.
      matchups: (N, 2) integer array with the model ids of the two competitors
      outcomes: float64 (N,) with 1.0 where winner == "model_a",
                0.0 where winner == "model_b", and 0.5 for any other winner value
      models:   list of model names; a model's id is its position in this list
    """
    matchups, models = get_matchups_models(df)
    a_won = (df["winner"] == "model_a").to_numpy()
    b_won = (df["winner"] == "model_b").to_numpy()
    # anything that is neither an A win nor a B win is scored as a draw (0.5)
    outcomes = np.where(a_won, 1.0, np.where(b_won, 0.0, 0.5))
    return matchups, outcomes, models


def preprocess_for_bt(df):
    """Collapse battles into unique (model_a, model_b, outcome) rows with counts.

    Returns:
        matchups: (U, 2) int32 array of model ids for each unique observed result
        outcomes: float64 (U,) labels: 1.0 = model_a win, 0.5 = tie, 0.0 = model_b win
        models:   list of model names; a model's id is its position in this list
        weights:  float64 (U,) number of times each unique result occurs in `df`
    """
    pairs, models = get_matchups_models(df)

    # Integer outcome code so it can share an array with the model ids:
    # model_a win -> 2, model_b win -> 0, everything else (tie) -> 1
    winner = df["winner"]
    outcome_code = np.ones(len(df), dtype=np.int32)
    outcome_code[(winner == "model_a").to_numpy()] = 2
    outcome_code[(winner == "model_b").to_numpy()] = 0

    # columns: model_a id, model_b id, outcome code
    schedule = np.column_stack([pairs.astype(np.int32), outcome_code])

    # deduplicate rows; the count of each row becomes its weight
    unique_rows, counts = np.unique(schedule, return_counts=True, axis=0)
    matchups = unique_rows[:, [0, 1]]
    # map 2 -> 1.0, 1 -> 0.5, 0 -> 0.0 for use as labels during optimization
    outcomes = unique_rows[:, 2].astype(np.float64) / 2.0
    weights = counts.astype(np.float64)
    return matchups, outcomes, models, weights


def preprocess_for_style(
    df,
    style_elements,
    add_one=True,
):
    """Build matchups, outcomes and standardized style-difference features.

    Args:
        df: Battles DataFrame. Besides the columns needed by
            `preprocess_for_elo`, it must have a `conv_metadata` column whose
            entries can be indexed by each name in `style_elements`. Each looked-up
            value is either a number or a mapping whose values are summed.
        style_elements: Feature names; the first half is differenced against the
            second half position by position (presumably the model A features
            followed by the matching model B features -- confirm with the caller).
        add_one: If True, add 1 to the (first + second) sum used as the
            normalizer, which avoids dividing by zero when both values are 0.

    Returns:
        (matchups, features, outcomes, models) where `features` has shape
        (n_battles, len(style_elements) // 2): the normalized difference
        (first - second) / (first + second [+ 1]), standardized per feature to
        zero mean and unit standard deviation.

    Notes:
        * Raw feature values are stored in an int32 array, so fractional values
          are truncated before differencing.
        * A feature that is constant over all battles has zero standard
          deviation, so its standardized column is NaN.
    """
    matchups, outcomes, models = preprocess_for_elo(
        df
    )  # this can use the same preprocessing as Elo

    n = matchups.shape[0]
    k = len(style_elements) // 2

    def extract_style_feature(x, feature):
        val = x[feature]
        # np.number covers NumPy scalars (e.g. np.int64), which are not
        # instances of the builtin int and would otherwise hit `.values()`.
        if isinstance(val, (int, float, np.number)):
            return val
        return sum(val.values())

    style_vector = np.zeros(shape=(2 * k, n), dtype=np.int32)
    for idx, element in enumerate(style_elements):
        style_vector[idx, :] = df.conv_metadata.map(
            partial(extract_style_feature, feature=element)
        ).values
    style_vector = np.ascontiguousarray(style_vector)

    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

    if add_one:
        style_sum = style_sum + 1.0

    # Every feature is expressed as a ratio of difference to sum.
    style_diff /= style_sum

    # Standardize each feature (row) across battles, then transpose to (n, k).
    style_mean = np.mean(style_diff, axis=1)
    style_std = np.std(style_diff, axis=1)
    features = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T

    return matchups, features, outcomes, models


def bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha=1.0):
    """Weighted Bradley-Terry negative log-likelihood and its gradient.

    Args:
        ratings: (M,) current rating of every model.
        matchups: (N, 2) integer model ids; column 0 is model_a, column 1 model_b.
        outcomes: (N,) labels in [0, 1] for model_a (a draw of 0.5 counts as
            half a win and half a loss).
        weights: (N,) weight of each row in the loss.
        alpha: Scale applied to the rating difference before the sigmoid.

    Returns:
        (loss, grad) with `grad` the same shape as `ratings`.
    """
    rating_a = ratings[matchups[:, 0]]
    rating_b = ratings[matchups[:, 1]]
    win_prob = expit(alpha * (rating_a - rating_b))

    log_lik = np.log(win_prob) * outcomes + np.log(1.0 - win_prob) * (1.0 - outcomes)
    loss = -(log_lik * weights).sum()

    # gradient w.r.t. model_a's rating; model_b receives the negated value
    per_match_grad = -alpha * (outcomes - win_prob) * weights
    signed_grads = np.stack([per_match_grad, -per_match_grad], axis=1)

    # scatter-add per-match gradients onto the models that took part
    model_grad = np.zeros_like(ratings)
    np.add.at(model_grad, matchups[:, [0, 1]], signed_grads)
    return loss, model_grad


def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6):
    """Fit Bradley-Terry ratings by minimizing `bt_loss_and_grad` with L-BFGS-B.

    Ratings start at zero; the optimizer runs for at most 100 iterations with
    `tol` as the gradient tolerance. Returns the (n_models,) array of ratings
    on the natural (unscaled) scale.
    """
    solver_options = {"disp": False, "maxiter": 100, "gtol": tol}
    start = np.zeros(n_models, dtype=np.float64)
    solution = minimize(
        bt_loss_and_grad,
        start,
        args=(matchups, outcomes, weights, alpha),
        method="L-BFGS-B",
        jac=True,  # the objective returns (loss, gradient)
        options=solver_options,
    )
    return solution.x


def scale_and_offset(
    ratings,
    models,
    scale,
    init_rating,
    anchor_model_and_rating=None,
):
    """Map natural-scale ratings to `ratings * scale + init_rating`, optionally anchored.

    When `anchor_model_and_rating` is an (anchor_model, anchor_rating) pair,
    the result is shifted along the last axis so that the anchor model's
    rating equals `anchor_rating` (for 2-D input, each row is shifted on its own).
    `models` is the list that maps a model name to its position on that axis.
    """
    scaled_ratings = init_rating + scale * ratings
    if anchor_model_and_rating is None:
        return scaled_ratings

    anchor_model, anchor_rating = anchor_model_and_rating
    anchor_col = models.index(anchor_model)
    # keep the indexed axis (list index) so the shift broadcasts per row
    shift = anchor_rating - scaled_ratings[..., [anchor_col]]
    scaled_ratings += shift
    return scaled_ratings


def compute_bt(
    df,
    base=10.0,
    scale=400.0,
    init_rating=1000,
    tol=1e-6,
    anchor_model_and_rating=None,
):
    """Fit Bradley-Terry ratings on `df` and return them on the Elo-like scale.

    The fit uses alpha = log(base); the result is rescaled with `scale` and
    `init_rating` (and optionally anchored) by `scale_and_offset`.

    Returns:
        Series of ratings indexed by model name, sorted from highest to lowest.
    """
    matchups, outcomes, models, weights = preprocess_for_bt(df)
    natural_ratings = fit_bt(
        matchups,
        outcomes,
        weights,
        n_models=len(models),
        alpha=math.log(base),
        tol=tol,
    )
    elo_ratings = scale_and_offset(
        natural_ratings, models, scale, init_rating, anchor_model_and_rating
    )
    leaderboard = pd.Series(elo_ratings, index=models)
    return leaderboard.sort_values(ascending=False)


def compute_bootstrap_bt(
    battles,
    num_round,
    base=10.0,
    scale=400.0,
    init_rating=1000.0,
    tol=1e-6,
    num_cpu=None,
    anchor_model_and_rating=None,
    offset=0.0,
):
    """Bootstrap Bradley-Terry ratings by resampling the counts of unique results.

    Rather than resampling rows, each round draws new occurrence counts for the
    unique (matchup, outcome) rows from a multinomial distribution (fixed seed 0)
    and refits the model with those counts as weights.

    Args:
        battles: Battles DataFrame accepted by `preprocess_for_bt`.
        num_round: Number of bootstrap rounds.
        base, scale, init_rating, tol, anchor_model_and_rating: Same meaning as
            in `compute_bt`.
        num_cpu: Not used by this implementation.
        offset: Constant added to `init_rating` before rescaling.

    Returns:
        DataFrame with one row per round and one column per model, columns
        ordered by descending median rating.
    """
    matchups, outcomes, models, weights = preprocess_for_bt(battles)
    n_battles = len(battles)

    # draw per-round occurrence counts for every unique result (counts may be 0)
    rng = np.random.default_rng(seed=0)
    resampled_counts = rng.multinomial(
        n=n_battles, pvals=weights / weights.sum(), size=(num_round)
    )
    boot_weights = resampled_counts.astype(np.float64) / n_battles

    # matchups and outcomes are shared; only the weights differ between rounds
    n_models = len(models)
    alpha = np.log(base)
    ratings = np.array(
        [
            fit_bt(
                matchups,
                outcomes,
                round_weights,
                n_models=n_models,
                alpha=alpha,
                tol=tol,
            )
            for round_weights in boot_weights
        ]
    )

    scaled_ratings = scale_and_offset(
        ratings, models, scale, init_rating + offset, anchor_model_and_rating
    )
    boot_df = pd.DataFrame(scaled_ratings, columns=models)
    column_order = boot_df.median().sort_values(ascending=False).index
    return boot_df[column_order]


# Sign pattern (+1 for model_a, -1 for model_b) applied to per-match gradients.
# Built once at module level so it is not re-created on every loss evaluation.
DIFF_MASK = np.array([1.0, -1.0], dtype=np.float64)


def contextual_bt_loss_and_grad(
    params,
    n_competitors,
    matchups,
    features,
    outcomes,
    alpha=1.0,
    reg=1.0,
    half_reg=0.5,
):
    """L2-regularized Bradley-Terry loss with additive feature logits, plus gradient.

    Args:
        params: 1-D array; the first `n_competitors` entries are model ratings,
            the remaining entries are the feature coefficients.
        n_competitors: Number of rating parameters at the front of `params`.
        matchups: (N, 2) integer model ids (column 0 = model_a, column 1 = model_b).
        features: (N, F) feature matrix whose dot product with the feature
            coefficients is added to the rating logit.
        outcomes: (N,) labels in [0, 1] for model_a.
        alpha: Scale applied to the rating difference.
        reg: Multiplier of `params` in the gradient of the penalty.
        half_reg: Multiplier of ``params . params`` in the loss.

    Returns:
        (loss, grad) with `grad` the same shape as `params`.
    """
    ratings, feature_params = params[:n_competitors], params[n_competitors:]

    pair_ratings = ratings[matchups]
    rating_logits = alpha * (pair_ratings[:, 0] - pair_ratings[:, 1])
    feature_logits = np.dot(features, feature_params)
    probs = expit(rating_logits + feature_logits)

    neg_log_lik = -(
        (np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes))
    ).sum()
    loss = neg_log_lik + half_reg * np.inner(params, params)

    residual = outcomes - probs
    grad = reg * params  # start from the gradient of the penalty term
    rating_grad = grad[:n_competitors]  # view: add.at below writes into `grad`
    np.add.at(
        rating_grad,
        matchups[:, [0, 1]],
        (-alpha * residual)[:, None] * DIFF_MASK,
    )
    grad[n_competitors:] -= np.dot(features.T, residual)
    return loss, grad


# note on regularization:
# the default reg is set to 0.5 because the LogisticRegression default is 1.0
# in the original implementation, matchups were duplicated,
# which made the ratio of log loss to reg loss "twice as high"
# in this non-duplicated version we halve the reg to keep parity with that setup
def fit_contextual_bt(
    matchups,
    features,
    outcomes,
    models,
    idxs=None,
    alpha=math.log(10.0),
    reg=0.5,
    tol=1e-6,
):
    """Fit the regularized contextual Bradley-Terry model with L-BFGS-B.

    Args:
        matchups, features, outcomes: Row-aligned arrays describing the battles.
        models: Sequence of model names; only its length is used here.
        idxs: Optional row indices; when given, the fit runs on those rows
            (used to fit on a bootstrap resample of the dataset).
        alpha: Scale applied to rating differences.
        reg: Regularization strength; the loss uses reg / 2 times the squared
            parameter norm.
        tol: Gradient tolerance passed to the optimizer.

    Returns:
        1-D array with `len(models)` ratings followed by one coefficient per
        feature column.
    """
    if idxs is not None:
        matchups = matchups[idxs]
        features = features[idxs]
        outcomes = outcomes[idxs]

    n_models = len(models)
    n_features = features.shape[1]
    start = np.zeros(n_models + n_features, dtype=np.float64)

    solution = minimize(
        contextual_bt_loss_and_grad,
        start,
        args=(n_models, matchups, features, outcomes, alpha, reg, reg / 2.0),
        method="L-BFGS-B",
        jac=True,  # the objective returns (loss, gradient)
        options={"disp": False, "maxiter": 100, "gtol": tol},
    )
    return solution.x


def compute_style_control(
    df,
    style_elements,
    alpha=math.log(10.0),
    reg=0.5,
    init_rating=1000.0,
    scale=400.0,
    tol=1e-6,
    anchor_model_and_rating=None,
):
    """Fit style-controlled ratings on the full dataset.

    Returns:
        (scaled_ratings, params): a Series of ratings on the Elo-like scale,
        indexed by model and sorted descending, and the array of fitted style
        coefficients (left on the natural scale).
    """
    matchups, features, outcomes, models = preprocess_for_style(
        df, style_elements=style_elements
    )
    fitted = fit_contextual_bt(
        matchups,
        features,
        outcomes,
        models=models,
        alpha=alpha,
        reg=reg,
        tol=tol,
    )

    # the parameter vector is [model ratings | style coefficients]
    n_models = len(models)
    ratings, params = fitted[:n_models], fitted[n_models:]

    elo_ratings = scale_and_offset(
        ratings, models, scale, init_rating, anchor_model_and_rating
    )
    scaled_ratings = pd.Series(elo_ratings, index=models)
    return scaled_ratings.sort_values(ascending=False), params


def compute_bootstrap_style_control(
    df,
    style_elements,
    num_round,
    alpha=math.log(10.0),
    reg=0.5,
    init_rating=1000.0,
    scale=400.0,
    tol=1e-6,
    num_cpu=None,
    offset=0.0,
    anchor_model_and_rating=None,
):
    """Bootstrap the style-controlled ratings by resampling battles with replacement.

    The global NumPy RNG is seeded with 0, then `num_round` index samples of the
    same size as the dataset are drawn and the model is refit on each.

    Args:
        num_cpu: Not used by this implementation.
        offset: Constant added to `init_rating` before rescaling.
        (other arguments as in `compute_style_control`)

    Returns:
        (ratings_df, params): a DataFrame with one row per round and one column
        per model (columns ordered by descending median rating), and a
        (num_round, n_features) array of style coefficients.
    """
    matchups, features, outcomes, models = preprocess_for_style(
        df, style_elements=style_elements
    )
    n_battles = matchups.shape[0]

    np.random.seed(0)
    boot_idxs = np.random.randint(low=0, high=n_battles, size=(num_round, n_battles))

    ratings_params = np.array(
        [
            fit_contextual_bt(
                matchups,
                features,
                outcomes,
                models,
                idxs=sample_idx,
                alpha=alpha,
                reg=reg,
                tol=tol,
            )
            for sample_idx in boot_idxs
        ]
    )

    # each fitted vector is [model ratings | style coefficients]
    n_models = len(models)
    ratings = ratings_params[:, :n_models]
    params = ratings_params[:, n_models:]

    scaled_ratings = scale_and_offset(
        ratings, models, scale, init_rating + offset, anchor_model_and_rating
    )
    boot_df = pd.DataFrame(scaled_ratings, columns=models)
    column_order = boot_df.median().sort_values(ascending=False).index
    return boot_df[column_order], params


def get_model_order(battles):
    """Return model names ordered by descending Bradley-Terry rating."""
    return compute_bt(battles).index.tolist()

In [None]:
########################################
# Visualization Utils
########################################

def visualize_vote_count(battles, scale=1):
    """Bar chart of the number of votes per value of the `winner` column.

    Args:
        battles: DataFrame with a "winner" column.
        scale: Multiplier applied to the default 700x500 figure size.

    Returns:
        The plotly figure.
    """
    fig = px.bar(
        battles['winner'].value_counts(),
        text_auto="auto",
        height=500 * scale,
        width=700 * scale,
    )
    fig.update_layout(
        showlegend=False,
        yaxis_title="Number of Votes",
        # the x axis holds the values of `winner` (the vote), not model names
        xaxis_title="Winner",
        margin=dict(l=40, r=40, t=20, b=40),
        xaxis_tickfont=dict(size=10),
        yaxis_tickfont=dict(size=10),
        font=dict(size=8),
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14)
    )
    fig.update_traces(
        hovertemplate="Winner: %{x}<br>Number of Votes: %{y}<extra></extra>",
        marker_line_width=1,
        marker_line_color="gray"
    )
    return fig


def visualize_battle_count_by_model(battles, scale=1):
    """Bar chart of how many battles each model took part in (as model_a or model_b)."""
    appearances = pd.concat([battles['model_a'], battles['model_b']])
    per_model_counts = appearances.value_counts()

    layout = dict(
        showlegend=False,
        yaxis_title="Number of Battles",
        xaxis_title="Model",
        margin=dict(l=40, r=40, t=20, b=40),
        xaxis_tickangle=-45,
        xaxis_tickfont=dict(size=12),
        yaxis_tickfont=dict(size=12),
        font=dict(size=12),
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14),
    )
    bar_style = dict(
        hovertemplate="Model: %{x}<br>Number of Battles: %{y}<extra></extra>",
        marker_line_width=1,
        marker_line_color="gray",
    )

    fig = px.bar(
        per_model_counts,
        text_auto="auto",
        height=400 * scale,
        width=700 * scale,
    )
    fig.update_layout(**layout)
    fig.update_traces(**bar_style)
    return fig


def visualize_pairwise_win_fraction(battles, model_order, scale=1):
    """Heatmap of the fraction of battles the row model won against the column model.

    Rows and columns follow `model_order`; see `compute_pairwise_win_fraction`
    for how the fractions are computed.
    """
    win_fraction = compute_pairwise_win_fraction(battles, model_order)

    colorbar = dict(len=0.7, thickness=20, title="", xpad=5)
    layout = dict(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.95,
        title_x=0.5,
        margin=dict(l=20, r=20, t=20, b=20),
        xaxis_tickfont=dict(size=12),
        yaxis_tickfont=dict(size=12),
        font=dict(size=12),
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14),
        coloraxis_colorbar=colorbar,
    )
    hover = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"

    fig = px.imshow(
        win_fraction,
        color_continuous_scale="RdBu",
        text_auto=".2f",
        height=500 * scale,
        width=700 * scale,
    )
    fig.update_layout(**layout)
    fig.update_traces(hovertemplate=hover)
    return fig


def visualize_battle_count(battles, model_order, scale=1):
    """Heatmap of the number of battles between every pair of models.

    Args:
        battles: DataFrame with "model_a" and "model_b" columns.
        model_order: Labels used to order the rows and columns of the heatmap.
        scale: Multiplier applied to the default 700x500 figure size.

    Returns:
        The plotly figure; cell (row, col) is the number of battles between the
        two models regardless of which side each was on.
    """
    ptbl = pd.pivot_table(
        battles, index="model_a", columns="model_b", aggfunc="size", fill_value=0
    )
    # Make the table square over every model seen on either side. Otherwise a
    # model that only ever appears as A (or only as B) is missing from one axis
    # and the label-aligned `ptbl + ptbl.T` yields NaN instead of a count.
    all_models = ptbl.index.union(ptbl.columns)
    ptbl = ptbl.reindex(index=all_models, columns=all_models, fill_value=0)
    battle_counts = ptbl + ptbl.T
    fig = px.imshow(
        battle_counts.loc[model_order, model_order],
        text_auto=True,
        height=500 * scale,
        width=700 * scale,
    )
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.95,
        title_x=0.5,
        margin=dict(l=20, r=20, t=20, b=20),
        xaxis_tickfont=dict(size=12),
        yaxis_tickfont=dict(size=12),
        font=dict(size=12),
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14),
        coloraxis_colorbar=dict(
            len=0.7,
            thickness=20,
            title="",
            xpad=5
        )
    )
    fig.update_traces(
        hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>"
    )
    return fig


def visualize_average_win_rate(battles, limit_show_number, scale=1):
    """Bar chart of each model's mean pairwise win fraction, highest first.

    The pairwise table comes from `compute_pairwise_win_fraction` with its
    default ordering, restricted to the first `limit_show_number` models
    (all models when it is None).
    """
    pairwise = compute_pairwise_win_fraction(
        battles, None, limit_show_number=limit_show_number
    )
    average_win_rate = pairwise.mean(axis=1).sort_values(ascending=False)

    layout = dict(
        yaxis_title="Average Win Rate",
        xaxis_title="Model",
        showlegend=False,
        margin=dict(l=20, r=20, t=20, b=20),
        xaxis_tickangle=-45,
        xaxis_tickfont=dict(size=12),
        yaxis_tickfont=dict(size=12),
        font=dict(size=12),
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14),
    )
    bar_style = dict(
        hovertemplate="Model: %{x}<br>Average Win Rate: %{y:.2f}<extra></extra>",
        marker_line_width=1,
        marker_line_color="gray",
    )

    fig = px.bar(
        average_win_rate,
        text_auto=".2f",
        height=400 * scale,
        width=700 * scale,
    )
    fig.update_layout(**layout)
    fig.update_traces(**bar_style)
    return fig


def visualize_bootstrap_style_coefs(style_coef_bootstrap, style_elements, scale=1):
    """Plot bootstrap style coefficients with 95% percentile intervals.

    Args:
        style_coef_bootstrap: Array with one row per bootstrap round and one
            column per style coefficient.
        style_elements: Style feature names; the first half, with the last two
            characters stripped, is used as the coefficient labels.
        scale: Multiplier applied to the default 700x400 figure size.

    Returns:
        Plotly scatter figure: the point is the bootstrap mean, the error bars
        span the 2.5th to 97.5th percentiles, sorted by descending mean.
    """
    n_coefs = len(style_elements) // 2
    coef_names = [element[:-2] for element in style_elements[:n_coefs]]

    ci_low = np.percentile(style_coef_bootstrap, 2.5, axis=0)
    ci_high = np.percentile(style_coef_bootstrap, 97.5, axis=0)
    point_estimate = np.mean(style_coef_bootstrap, axis=0)

    bars = pd.DataFrame(
        {
            "model": coef_names,
            "lower": ci_low,
            "upper": ci_high,
            "rating": point_estimate,
        }
    )
    # asymmetric error bar lengths measured from the point estimate
    bars = bars.assign(
        error_y=bars["upper"] - bars["rating"],
        error_y_minus=bars["rating"] - bars["lower"],
        rating_rounded=np.round(bars["rating"], 2),
    )
    bars = bars.sort_values("rating", ascending=False)

    fig = px.scatter(
        bars,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        height=400 * scale,
        width=700 * scale,
        template="plotly_white",
    )

    # dashed reference line at zero, i.e. "no effect"
    fig.add_hline(y=0, line_width=1, line_dash="dash", line_color="gray")

    hover = (
        "<b>Coefficient:</b> %{x}<br>"
        "<b>Estimate:</b> %{y:.3f}<br>"
        "<b>CI Lower:</b> %{customdata[0]:.3f}<br>"
        "<b>CI Upper:</b> %{customdata[1]:.3f}"
        "<extra></extra>"
    )
    fig.update_traces(
        marker=dict(size=10, color="royalblue"),
        line=dict(width=1, color="royalblue"),
        hovertemplate=hover,
        customdata=bars[["lower", "upper"]].values,
    )

    layout = dict(
        xaxis_title="Coefficient",
        yaxis_title="Estimate",
        showlegend=False,
        margin=dict(l=20, r=20, t=20, b=20),
        xaxis_tickfont=dict(size=12),
        yaxis_tickfont=dict(size=12),
        font=dict(size=12),
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14),
    )
    fig.update_layout(**layout)

    # slant the labels so long names do not overlap
    fig.update_xaxes(tickangle=-45)

    return fig


def visualize_bootstrap_elo_rating(df, df_final, limit_show_number, scale=1):
    """Plot final ratings with 95% bootstrap intervals as error bars.

    Args:
        df: Bootstrap ratings, one row per round and one column per model.
        df_final: Series of final ratings indexed by model (the plotted points).
        limit_show_number: Show only the top-N models by rating; None shows all.
        scale: Multiplier applied to the default 700x400 figure size.

    Returns:
        Plotly scatter figure whose error bars run from the 2.5% to the 97.5%
        bootstrap quantile of each model.
    """
    bars = (
        pd.DataFrame(
            dict(
                lower=df.quantile(0.025),
                rating=df_final,
                upper=df.quantile(0.975),
            )
        )
        .reset_index(names="model")
        .sort_values("rating", ascending=False)
    )
    # Work on an explicit copy: assigning new columns to a slice of `bars`
    # triggers pandas' chained-assignment (SettingWithCopy) warning.
    bars = bars.iloc[:limit_show_number].copy()
    bars["error_y"] = bars["upper"] - bars["rating"]
    bars["error_y_minus"] = bars["rating"] - bars["lower"]
    bars["rating_rounded"] = np.round(bars["rating"])
    fig = px.scatter(
        bars,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        height=400 * scale,
        width=700 * scale,
    )
    fig.update_layout(xaxis_title="Model",
                      yaxis_title="Rating",
                      margin=dict(l=20, r=20, t=20, b=20),
                      xaxis_tickfont=dict(size=12),
                      yaxis_tickfont=dict(size=12),
                      font=dict(size=12),
                      xaxis_title_font=dict(size=14),
                      yaxis_title_font=dict(size=14)
                      )
    return fig

In [None]:
########################################
# Search Arena Utils
########################################

# Names of the citation-domain categories.
# NOTE(review): presumably these are the {domain} values of the
# `num_cites_{domain}` / `cites_{domain}` fields of `conv_metadata` -- confirm
# against the dataset schema. Not referenced by the functions in this cell.
DOMAIN_CATEGORIES = [
    "youtube",
    "gov_edu",
    "wiki",
    "us_news",
    "foreign_news",
    "social_media",
    "community_blog",
    "tech_coding",
    "map",
    "academic_journal",
    "other"
]

def run_leaderboard(
        battle_data,
        anchor_model,
        anchor_rating,
        visualize=True,
        style_elements=None,
        num_bootstrap_samples=1000,
        save=False,
        file_prefix=""
        ):
    """Compute ratings, bootstrap intervals and a ranking table, and display them.

    Args:
        battle_data: Battles DataFrame ("model_a", "model_b", "winner", plus
            "conv_metadata" when `style_elements` is given).
        anchor_model: Model whose rating is pinned to `anchor_rating`; every
            other rating is shifted by the same offset.
        anchor_rating: Rating assigned to `anchor_model`.
        visualize: If True, show the rating plot (and, with style control, the
            style-coefficient plot).
        style_elements: Style feature names. None fits a plain Bradley-Terry
            model; otherwise the style-controlled model is used.
        num_bootstrap_samples: Number of bootstrap rounds, used by both the
            plain and the style-controlled path.
        save: If True, write the shown figures under "plots/".
        file_prefix: Prefix for the saved file names.

    Returns:
        Series of anchored ratings indexed by model. The leaderboard table
        itself is displayed, not returned.
    """
    def _save_figure(fig, stem):
        """Write `fig` to plots/<file_prefix><stem> as HTML or PNG when `save` is set."""
        if not save:
            return
        if SAVE_HTML:
            fig.write_html(f"plots/{file_prefix}{stem}.html")
        else:
            fig.write_image(f"plots/{file_prefix}{stem}.png")

    style_coef_bootstrap = None
    if style_elements is None:
        bt_ratings = compute_bt(battle_data)
        offset_score = (anchor_rating - bt_ratings[anchor_model])
        bt_ratings += offset_score
        # honour num_bootstrap_samples here too (previously a hard-coded 100)
        bt_ratings_bootstrap = compute_bootstrap_bt(
            battle_data, num_round=num_bootstrap_samples, offset=offset_score
        )
    else:
        bt_ratings, _ = compute_style_control(battle_data, style_elements=style_elements)
        offset_score = (anchor_rating - bt_ratings[anchor_model])
        bt_ratings += offset_score
        bt_ratings_bootstrap, style_coef_bootstrap = compute_bootstrap_style_control(
            battle_data,
            style_elements=style_elements,
            num_round=num_bootstrap_samples,
            offset=offset_score,
        )

    if visualize:
        print("#" * 50)
        print("BT Ratings")
        print("#" * 50)
        fig = visualize_bootstrap_elo_rating(bt_ratings_bootstrap, bt_ratings, limit_show_number=None)
        fig.show()
        _save_figure(fig, "bootstrap_elo_rating")
        if style_elements is not None:
            print("#" * 50)
            print("Style Coef")
            print("#" * 50)
            fig = visualize_bootstrap_style_coefs(style_coef_bootstrap, style_elements)
            fig.show()
            _save_figure(fig, "bootstrap_style_coefs")

    model_order = list(bt_ratings.keys())
    model_rating_q025 = bt_ratings_bootstrap.quantile(0.025)
    model_rating_q975 = bt_ratings_bootstrap.quantile(0.975)

    # A model's rank is 1 plus the number of other models whose lower bound
    # lies strictly above this model's upper bound.
    ranking = {}
    for model_a in model_order:
        ranking[model_a] = 1 + sum(
            1
            for model_b in model_order
            if model_b != model_a
            and model_rating_q025[model_b] > model_rating_q975[model_a]
        )

    leaderboard_table = pd.DataFrame(
        {
            "rating": bt_ratings,
            "variance": bt_ratings_bootstrap.var(),
            "rating_q975": model_rating_q975,
            "rating_q025": model_rating_q025,
            "num_battles": battle_data["model_a"].value_counts().add(battle_data["model_b"].value_counts(), fill_value=0),
            "final_ranking": pd.Series(ranking),
        }
    )
    leaderboard_table = leaderboard_table.sort_values(by='rating', ascending=False)
    display(leaderboard_table)
    return bt_ratings

def run_all(battle_data,
            style_elements=None,
            viz_battle_count=False,
            viz_win_rates=False,
            viz_leaderboard=False,
            anchor_model='api-gpt-4o-search',
            anchor_rating=1000,
            save=False,
            file_prefix="",
            ):
    """Run the optional battle-count and win-rate plots, then the leaderboard.

    Win-rate plots use only battles whose winner is not 'tie' or
    'tie (bothbad)'; battle counts and the leaderboard use all battles.
    With `save`, each shown figure is written to "plots/<file_prefix><name>"
    as .html when SAVE_HTML is set and as .png otherwise.

    Returns:
        The ratings Series returned by `run_leaderboard`.
    """
    def _banner(title):
        rule = "#" * 50
        print(rule)
        print(title)
        print(rule)

    def _show_and_save(fig, stem):
        fig.show()
        if not save:
            return
        if SAVE_HTML:
            fig.write_html(f"plots/{file_prefix}{stem}.html")
        else:
            fig.write_image(f"plots/{file_prefix}{stem}.png")

    is_tie = battle_data['winner'].isin(['tie', 'tie (bothbad)'])
    battles_no_ties = battle_data[~is_tie]

    if viz_battle_count:
        _banner("Battle Counts")
        _show_and_save(visualize_battle_count_by_model(battle_data), "battle_count")
        _show_and_save(
            visualize_battle_count(battle_data, get_model_order(battle_data)),
            "pairwise_battle_count",
        )

    if viz_win_rates:
        _banner("Win Rates")
        _show_and_save(
            visualize_average_win_rate(battles_no_ties, limit_show_number=None),
            "average_win_rate",
        )
        _show_and_save(
            visualize_pairwise_win_fraction(battles_no_ties, get_model_order(battle_data)),
            "pairwise_average_win_rate",
        )

    return run_leaderboard(
        battle_data,
        anchor_model,
        anchor_rating,
        style_elements=style_elements,
        visualize=viz_leaderboard,
        save=save,
        file_prefix=file_prefix,
    )

def viz_change_scores(bt_change_scores, initial_name, final_name):
    """Draw one two-point line per model connecting its initial and final score.

    Args:
        bt_change_scores: Mapping of model name -> scores, where scores[0] is
            plotted at `initial_name` and scores[1] at `final_name`.
        initial_name: x-axis label for the first score.
        final_name: x-axis label for the second score.

    Returns:
        The plotly figure (fixed size 700x400).
    """
    fig = go.Figure()
    for model, scores in bt_change_scores.items():
        fig.add_trace(go.Scatter(
            x=[initial_name, final_name],
            y=[scores[0], scores[1]],
            mode='lines+markers',
            name=model,
            # NOTE(review): `text`/`textposition` are set, but `mode` does not
            # include 'text' -- presumably the labels are not drawn; confirm intent.
            text=[f"{scores[0]:.1f}", f"{scores[1]:.1f}"],
            textposition="top center",
            marker=dict(size=10),
            line=dict(width=2),
            hovertemplate="<b>%{fullData.name}</b><br>Version: %{x}<br>Score: %{y:.1f}<extra></extra>"
        ))
    fig.update_layout(
        yaxis_title="Score",
        height=400,
        width=700,
        plot_bgcolor="white",
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(
            showgrid=True,
            gridwidth=1,
            gridcolor='lightgray',
            tickfont=dict(size=16),
        ),
        yaxis=dict(showgrid=True, gridwidth=1, gridcolor='lightgray', tickfont=dict(size=14)),
        # NOTE(review): tick fonts are also set above inside `xaxis`/`yaxis`
        # (sizes 16 and 14); the two settings conflict -- confirm which is intended.
        xaxis_tickfont=dict(size=12),
        yaxis_tickfont=dict(size=12),
        font=dict(size=12),
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14)
    )
    return fig

def get_model_group(name):
    """Map a model name to its provider group.

    The name is lower-cased and searched for the substrings ``'gpt'``,
    ``'gemini'`` and ``'sonar'`` in that order; the first one found decides
    the group. Names containing none of them are labelled ``'other'``.
    """
    lowered = name.lower()
    keyword_to_group = (
        ('gpt', 'openai'),
        ('gemini', 'google'),
        ('sonar', 'perplexity'),
    )
    for keyword, group in keyword_to_group:
        if keyword in lowered:
            return group
    return 'other'

### Loading Data

Load the cleaned and filtered version of the dataset.

`system_a_metadata` and `system_b_metadata` contain the following fields: `client_country`, `citation_format_standardized`, `llm_config`, `web_search_config`, `llm_trace`, and `web_search_trace`.
- `client_country`: country code extracted from user's IP address.
- `citation_format_standardized`: indicator of whether the inline citations were displayed in the original or the standardized format.
- `llm_config` and `web_search_config` include LLM's and web search pipeline's parameters (e.g., model name, search context). The parameters of `web_search_config` are set to `built-in` for commercial closed-source systems.
- `llm_trace`: same as `messages` + optional intermediate steps (e.g., reasoning model's thinking tokens).
- `web_search_trace`: web search queries, source urls, scraped content if returned by the API.

`conv_metadata` contains the following fields (`field_name_{a/b}`):
- `standardized_citations`: indicator of whether the inline citations in model A/B's responses were standardized.
- `response_length`: average assistant response length in the conversation.
- `num_citations`: average number of citations per turn.
- `num_cites_{domain}`: number of times a domain is cited in the conversation.
- `cites_{domain}`: whether a domain is cited in the conversation (0/1).

In [None]:
# Fetch the "test" split of the dataset and convert it straight to a pandas
# DataFrame, which is what the analysis below operates on.
battle_data = load_dataset("lmarena-ai/search-arena-v1-7k", split="test").to_pandas()
display(battle_data.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

search-arena-v1-preference-7k.parquet:   0%|          | 0.00/66.8M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/7000 [00:00<?, ? examples/s]

Unnamed: 0,model_a,model_b,winner,judge,turn,language,timestamp,messages_a,messages_b,system_a_metadata,system_b_metadata,conv_metadata,question_id
0,api-gpt-4o-mini-search,api-gpt-4o-search,tie,ab373ba670eade9a357b075a9625272238821caa415ee6...,1,English,2025-03-18 17:28:58.245,"[{'content': 'who is ion vlad-doru?', 'role': ...","[{'content': 'who is ion vlad-doru?', 'role': ...","{'citation_format_standardized': False, 'clien...","{'citation_format_standardized': False, 'clien...","{'cites_academic_journal_a': 0, 'cites_academi...",0
1,ppl-sonar-pro,api-gpt-4o-search,model_b,7c0a01dbf51923a007e1bed9db6755fd0b728b888d2e05...,1,English,2025-03-18 17:29:01.444,[{'content': 'What is the exact age difference...,[{'content': 'What is the exact age difference...,"{'citation_format_standardized': False, 'clien...","{'citation_format_standardized': False, 'clien...","{'cites_academic_journal_a': 0, 'cites_academi...",1
2,ppl-sonar-reasoning,ppl-sonar-pro,model_b,60c987c27eed1512988bec39497011f3a35e4bb1a00864...,1,English,2025-03-18 17:29:57.137,[{'content': 'Why is the Delta between o1 and ...,[{'content': 'Why is the Delta between o1 and ...,"{'citation_format_standardized': False, 'clien...","{'citation_format_standardized': False, 'clien...","{'cites_academic_journal_a': 0, 'cites_academi...",2
3,ppl-sonar-reasoning,ppl-sonar-pro,model_a,3162b94e8c57965281b33f7d8597cc04119dfea2e4bd61...,1,Russian,2025-03-18 17:30:02.763,"[{'content': 'Походження, еволюція та зміст по...","[{'content': 'Походження, еволюція та зміст по...","{'citation_format_standardized': False, 'clien...","{'citation_format_standardized': False, 'clien...","{'cites_academic_journal_a': 0, 'cites_academi...",3
4,api-gpt-4o-mini-search,api-gpt-4o-search,model_b,a3b79e6ff635348d8a05a7a8aaca2ba71f96f67f48a832...,1,English,2025-03-18 17:30:02.917,"[{'content': 'Potatoes', 'role': 'user'}, {'co...","[{'content': 'Potatoes', 'role': 'user'}, {'co...","{'citation_format_standardized': False, 'clien...","{'citation_format_standardized': False, 'clien...","{'cites_academic_journal_a': 0, 'cites_academi...",4


### Main Leaderboard

In [None]:
# Leaderboard over every battle, with all three visualizations enabled and
# figures saved using the "main_" file prefix.
print("Full Leaderboard")
full_ratings = run_all(
    battle_data,
    viz_battle_count=True,
    viz_win_rates=True,
    viz_leaderboard=True,
    save=True,
    file_prefix="main_",
)

Full Leaderboard
##################################################
Battle Counts
##################################################


##################################################
Win Rates
##################################################


##################################################
BT Ratings
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
gemini-2.5-pro-grounding,1141.906904,76.933577,1156.016517,1125.373536,1215,1
ppl-sonar-reasoning-pro-high,1136.208409,111.303447,1154.871252,1116.311956,861,1
ppl-sonar-reasoning,1096.713539,54.395226,1108.314306,1079.757069,1644,3
ppl-sonar,1072.420492,60.144391,1088.646868,1057.769097,1208,3
ppl-sonar-pro-high,1071.157787,50.084132,1082.976655,1055.426219,1364,3
ppl-sonar-pro,1066.26989,42.633508,1079.525461,1054.280359,1214,4
gemini-2.0-flash-grounding,1028.184544,73.932628,1047.314843,1012.595045,1193,7
api-gpt-4o-search,1000.0,65.54425,1015.40672,983.157832,1196,7
api-gpt-4o-search-high,998.85543,49.434264,1013.592907,984.296763,1707,7
api-gpt-4o-search-high-loc,994.046644,50.641697,1007.249173,980.037248,1226,8


### Citation style analysis

Apply style control using an indicator feature for whether the inline citations in each response were standardized.

In [None]:
# One control feature per side: the `standardized_citations_{a/b}` indicator.
STYLE_CONTROL_ELEMENTS = [f"standardized_citations_{side}" for side in ("a", "b")]

print("\n\n\n")
print("#" * 100)
print("Leaderboard with Style Control (citation style)")
full_ratings_num_citations_style = run_all(
    battle_data,
    style_elements=STYLE_CONTROL_ELEMENTS,
    viz_leaderboard=True,
)





####################################################################################################
Leaderboard with Style Control (citation style)
##################################################
BT Ratings
##################################################


##################################################
Style Coef
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
gemini-2.5-pro-grounding,1141.780556,70.88086,1158.105079,1125.190533,1215,1
ppl-sonar-reasoning-pro-high,1136.167774,99.832067,1154.823682,1115.616366,861,1
ppl-sonar-reasoning,1096.079799,49.427238,1109.700851,1083.22931,1644,3
ppl-sonar,1071.620478,63.055676,1087.256557,1057.316174,1208,3
ppl-sonar-pro-high,1070.955524,53.419421,1086.099224,1057.389894,1364,3
ppl-sonar-pro,1065.664695,59.459294,1081.637843,1051.347882,1214,4
gemini-2.0-flash-grounding,1028.620931,74.682473,1044.669031,1010.862368,1193,7
api-gpt-4o-search,1000.0,56.347155,1014.252592,985.22321,1196,7
api-gpt-4o-search-high,999.24356,40.558751,1012.123516,987.397127,1707,7
api-gpt-4o-search-high-loc,994.69822,61.44642,1010.663072,979.044347,1226,8


Treat each (model, citation style) pair as a separate model.

In [None]:
def _tag_citation_style(row, side):
    """Return the side's model name suffixed with '-og' (original citations) or '-st' (standardized)."""
    model_name = row[f'model_{side}']
    suffix = 'og' if row['conv_metadata'][f'standardized_citations_{side}'] == 0 else 'st'
    return f"{model_name}-{suffix}"


# Rename every model to "<model>-og" or "<model>-st" so that each
# (model, citation style) pair is rated as its own competitor.
battle_data_ablation = battle_data.copy()
battle_data_ablation['model_a'] = battle_data_ablation.apply(lambda row: _tag_citation_style(row, 'a'), axis=1)
battle_data_ablation['model_b'] = battle_data_ablation.apply(lambda row: _tag_citation_style(row, 'b'), axis=1)


In [None]:
# Fit ratings on the (model, citation style) variants, then shift them so the
# `api-gpt-4o-search-og` variant sits exactly at `anchor_rating`.
anchor_model = 'api-gpt-4o-search-og'
anchor_rating = 1000
bt_ratings = compute_bt(battle_data_ablation)
offset_score = anchor_rating - bt_ratings[anchor_model]
bt_ratings += offset_score

# For each base model, pair the rating of its original-citation variant with
# the rating of its standardized-citation variant.
bt_change_scores = {}
for model in get_model_order(battle_data):
    og_rating = bt_ratings[f'{model}-og']
    st_rating = bt_ratings[f'{model}-st']
    bt_change_scores[model] = (og_rating, st_rating)

fig = viz_change_scores(bt_change_scores, "original", "standardized")
fig.show()
if not SAVE_HTML:
    fig.write_image("plots/og_vs_st.png")
else:
    fig.write_html("plots/og_vs_st.html")

Calculate leaderboard using only votes between models with original citation styles.

In [None]:
# Keep only battles in which neither side's citations were standardized.
_both_original = battle_data['conv_metadata'].apply(
    lambda meta: meta['standardized_citations_a'] == 0 and meta['standardized_citations_b'] == 0
)
battle_data_ablation = battle_data.copy()[_both_original]

bt_ratings_ablation = run_all(
    battle_data_ablation,
    viz_battle_count=False,
    viz_win_rates=False,
    viz_leaderboard=True,
)

##################################################
BT Ratings
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
gemini-2.5-pro-grounding,1165.957036,312.17752,1201.273007,1132.769405,254,1
ppl-sonar-reasoning-pro-high,1139.598509,292.29615,1171.376004,1104.011777,210,1
ppl-sonar-reasoning,1091.92235,79.249728,1108.923992,1075.904301,879,2
ppl-sonar,1059.697813,95.522365,1078.845964,1041.744291,753,3
ppl-sonar-pro,1054.789465,89.876466,1074.189928,1038.486821,812,4
gemini-2.0-flash-grounding,1051.035023,281.663532,1087.468415,1025.300187,217,3
ppl-sonar-pro-high,1049.675044,106.691429,1070.133307,1033.145898,626,4
api-gpt-4o-search-high,1012.892077,75.737086,1027.368816,995.040006,809,7
api-gpt-4o-search,1000.0,119.718835,1020.683137,982.386663,679,8
api-gpt-4o-mini-search,977.522727,126.51302,999.421753,952.465672,697,8


Calculate leaderboard using only votes between models with standardized citation styles.

In [None]:
# Keep only battles in which both sides' citations were standardized.
_both_standardized = battle_data['conv_metadata'].apply(
    lambda meta: meta['standardized_citations_a'] == 1 and meta['standardized_citations_b'] == 1
)
battle_data_ablation = battle_data.copy()[_both_standardized]

bt_ratings_ablation = run_all(
    battle_data_ablation,
    viz_battle_count=False,
    viz_win_rates=False,
    viz_leaderboard=True,
)

##################################################
BT Ratings
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
gemini-2.5-pro-grounding,1197.643596,265.56904,1229.760725,1170.699175,285,1
ppl-sonar-reasoning-pro-high,1178.100604,414.914045,1217.919476,1135.810976,184,1
ppl-sonar-pro-high,1131.718841,252.451368,1162.394259,1101.139263,276,2
ppl-sonar-reasoning,1130.221696,266.420768,1160.760876,1099.143686,274,2
ppl-sonar-pro,1121.589165,409.186466,1155.500579,1082.660874,147,2
ppl-sonar,1076.727403,379.531574,1112.931441,1043.014962,193,3
gemini-2.0-flash-grounding,1047.156253,222.82553,1079.508656,1022.484247,302,6
api-gpt-4o-search-high-loc,1029.945342,260.869888,1058.73511,997.016645,308,6
api-gpt-4o-search,1000.0,455.429408,1041.013019,959.948969,174,7
api-gpt-4o-search-high,994.391641,227.292913,1024.526015,963.247702,321,7


### Model profiling

Extract and process features: `response_length`, `num_citations`, `num_cites_<domain>` (how often the system cites a URL from a given domain)

In [None]:
def _mentions_trump(messages):
    """Return True if any message's ``content`` contains "trump" (case-insensitive)."""
    return any("trump" in msg["content"].lower() for msg in messages)


# Work on a copy so `battle_data` itself is not modified.
model_features = battle_data.copy()

# Per-side scalar features: provider group derived from the model name, plus
# response length and citation count read out of `conv_metadata`.
model_features['model_group_a'] = model_features['model_a'].apply(get_model_group)
model_features['model_group_b'] = model_features['model_b'].apply(get_model_group)
model_features['response_length_a'] = model_features['conv_metadata'].apply(lambda x: x['response_length_a'])
model_features['response_length_b'] = model_features['conv_metadata'].apply(lambda x: x['response_length_b'])
model_features['num_citations_a'] = model_features['conv_metadata'].apply(lambda x: x['num_citations_a'])
model_features['num_citations_b'] = model_features['conv_metadata'].apply(lambda x: x['num_citations_b'])

# `mapping_a` / `mapping_b` translate side-specific column names
# (`cites_<domain>_a`) into side-agnostic ones (`cites_<domain>`); they are
# used below both to select the columns and to rename them.
mapping_a = {}
mapping_b = {}
for domain_group in DOMAIN_CATEGORIES:
    mapping_a[f"cites_{domain_group}_a"] = "cites_" + domain_group
    mapping_b[f"cites_{domain_group}_b"] = "cites_" + domain_group
    mapping_a[f"num_cites_{domain_group}_a"] = "num_cites_" + domain_group
    mapping_b[f"num_cites_{domain_group}_b"] = "num_cites_" + domain_group
    # The lambdas run immediately inside `.apply`, so reading the loop
    # variable `domain_group` from the enclosing scope is safe here.
    model_features[f'cites_{domain_group}_a'] = model_features['conv_metadata'].apply(lambda x: x[f'cites_{domain_group}_a'])
    model_features[f'cites_{domain_group}_b'] = model_features['conv_metadata'].apply(lambda x: x[f'cites_{domain_group}_b'])
    model_features[f'num_cites_{domain_group}_a'] = model_features['conv_metadata'].apply(lambda x: x[f'num_cites_{domain_group}_a'])
    model_features[f'num_cites_{domain_group}_b'] = model_features['conv_metadata'].apply(lambda x: x[f'num_cites_{domain_group}_b'])

# Useful for our num_citation case study
# Flag conversations in which any message on that side mentions "trump".
# NOTE(review): every message in the list is scanned, not only user turns --
# confirm that is the intended definition of a Trump-related conversation.
model_features["w_trump_a"] = battle_data["messages_a"].apply(_mentions_trump)
model_features["w_trump_b"] = battle_data["messages_b"].apply(_mentions_trump)

# Stack side A and side B into one long table (one row per battle side) with
# side-agnostic column names. The original row index is kept, so every battle
# index appears twice.
model_features = pd.concat([
    model_features[['model_a', 'model_group_a', 'response_length_a', 'num_citations_a', 'w_trump_a'] + list(mapping_a.keys())]
        .rename(columns={'model_a': 'model', 'model_group_a': 'model_group', 'response_length_a': 'response_length',
                         'num_citations_a': 'num_citations', 'w_trump_a': 'w_trump', **mapping_a}),
    model_features[['model_b', 'model_group_b', 'response_length_b', 'num_citations_b', 'w_trump_b'] + list(mapping_b.keys())]
        .rename(columns={'model_b': 'model', 'model_group_b': 'model_group', 'response_length_b': 'response_length',
                         'num_citations_b': 'num_citations', 'w_trump_b': 'w_trump', **mapping_b}),
])
model_features

Unnamed: 0,model,model_group,response_length,num_citations,w_trump,cites_youtube,num_cites_youtube,cites_gov_edu,num_cites_gov_edu,cites_wiki,...,cites_community_blog,num_cites_community_blog,cites_tech_coding,num_cites_tech_coding,cites_map,num_cites_map,cites_academic_journal,num_cites_academic_journal,cites_other,num_cites_other
0,api-gpt-4o-mini-search,openai,628.0,1.0,False,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,ppl-sonar-pro,perplexity,622.0,15.0,False,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,12
2,ppl-sonar-reasoning,perplexity,2950.0,5.0,False,0,0,0,0,0,...,1,1,0,0,0,0,0,0,1,4
3,ppl-sonar-reasoning,perplexity,1990.0,5.0,False,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,4
4,api-gpt-4o-mini-search,openai,3724.0,6.0,False,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,gemini-2.0-flash-grounding,google,855.0,4.0,False,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,4
6996,api-gpt-4o-search-high-loc,openai,1448.0,3.0,False,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3
6997,api-gpt-4o-search-high-loc,openai,1920.0,0.0,False,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6998,gemini-2.0-flash-grounding,google,2127.0,6.0,False,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,6


In [None]:
# Fixed color per provider group, taken from the first three entries of
# Plotly's default qualitative palette.
plotly_colors = px.colors.qualitative.Plotly
color_map = dict(zip(("perplexity", "google", "openai"), plotly_colors[:3]))

# Mean response length per model, longest first.
df1 = model_features[['model', 'model_group', 'response_length']]
avg_response_length = (
    df1.groupby(["model", "model_group"], as_index=False)["response_length"]
    .mean()
    .sort_values("response_length", ascending=False)
)

# Mean number of citations per model, most first.
df2 = model_features[['model', 'model_group', 'num_citations']]
avg_num_citations = (
    df2.groupby(["model", "model_group"], as_index=False)["num_citations"]
    .mean()
    .sort_values("num_citations", ascending=False)
)

# Two side-by-side panels, one per feature.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=["Avg Response Length", "Avg Number of Citations"],
    horizontal_spacing=0.1,
)

def add_bars(fig, df, y_col, col_idx):
    """Add one bar trace per model group to a subplot in row 1.

    Args:
        fig: Figure that receives the traces.
        df: Frame with ``model``, ``model_group`` and ``y_col`` columns.
        y_col: Column plotted on the y-axis; ``"response_length"`` selects the
            "Avg Response Length" hover label, anything else selects
            "Avg Number of Citations".
        col_idx: Subplot column to draw into. Legend entries are only shown
            for traces added to column 1.
    """
    if y_col == "response_length":
        y_label = "Avg Response Length"
    else:
        y_label = "Avg Number of Citations"

    for group in df["model_group"].unique():
        rows = df.loc[df["model_group"] == group]
        hover = (
            f"<b>Model:</b> %{{x}}<br>"
            f"<b>{y_label}:</b> %{{y:.2f}}<br>"
            f"<b>Model Group:</b> {group}"
            f"<extra></extra>"
        )
        bars = go.Bar(
            x=rows["model"],
            y=rows[y_col],
            name=group,
            # Groups missing from `color_map` get None as their color.
            marker_color=color_map.get(group),
            showlegend=(col_idx == 1),
            hovertemplate=hover,
        )
        fig.add_trace(bars, row=1, col=col_idx)

# Left panel: average response length; right panel: average citation count.
_panels = [(avg_response_length, "response_length"), (avg_num_citations, "num_citations")]
for _panel_col, (_panel_df, _panel_metric) in enumerate(_panels, start=1):
    add_bars(fig, _panel_df, _panel_metric, _panel_col)

# Horizontal legend centered above the panels.
_legend_style = dict(
    font=dict(size=10),
    orientation="h",
    y=1.25,
    x=0.5,
    xanchor="center",
)
fig.update_layout(
    height=400,
    width=700,
    template="plotly_white",
    legend=_legend_style,
    margin=dict(l=20, r=20, t=20, b=20),
)

# Same axis styling for both panels: slanted model names, no y-axis title.
for i in (1, 2):
    fig.update_xaxes(tickangle=-45, row=1, col=i, tickfont=dict(size=8))
    fig.update_yaxes(title_text="", row=1, col=i, tickfont=dict(size=8))

# Shrink the subplot titles (they are stored as layout annotations).
fig.update_annotations(font_size=12)

fig.show()
if not SAVE_HTML:
    fig.write_image("plots/lenght_cit_features.png")
else:
    fig.write_html("plots/lenght_cit_features.html")

In [None]:
groups_to_plot = DOMAIN_CATEGORIES

# Keep only what the domain histogram needs: the provider group, the Trump
# flag, and one citation-count column per domain category.
_num_cites_columns = [f"num_cites_{domain_group}" for domain_group in groups_to_plot]
plot_df = model_features[['model_group', 'w_trump'] + _num_cites_columns]

def plot_domain_histogram(category_data, trump_category_data, fname):
    """Draw grouped horizontal bars of citation-domain proportions per model family.

    Two sets of bars share one figure: one built from ``category_data``
    (visible initially) and one built from ``trump_category_data`` (hidden
    initially). A dropdown with "Overall" and "Trump Case" entries switches
    which set is visible. The figure is shown and then written to
    ``{fname}.html`` when ``SAVE_HTML`` is true, otherwise to ``{fname}.png``.

    Args:
        category_data: Mapping ``category -> {model family -> proportion}``.
        trump_category_data: Same structure as ``category_data``, used for
            the "Trump Case" view.
        fname: Output path without the file extension.
    """
    model_names = ["openai", "google", "perplexity"]
    categories = ["us_news", "foreign_news", "youtube", "wiki", "gov_edu", "community_blog", "social_media", "tech_coding", "academic_journal", "map"]

    # One trace per model family in each of the two views.
    num_overall_traces = len(model_names)
    num_case_traces = len(model_names)

    overall_hover = (
        "<b>Model Family:</b> %{fullData.name}<br>"
        "<b>Domain Category:</b> %{y}<br>"
        "<b>Proportion:</b> %{text}"
        "<extra></extra>"
    )
    trump_hover = (
        "<b>Model Family:</b> %{fullData.name}<br>"
        "<b>Domain Category:</b> %{y}<br>"
        "<b>Proportion:</b> %{text}<br>"
        "<i>Trump-related queries only</i>"
        "<extra></extra>"
    )

    def bar_inputs(data, family):
        """Return (bar lengths, percent labels) for one family across all categories."""
        shares = [data[cat].get(family, 0) for cat in categories]
        # Shares that are not > 0 are drawn with length 0.0001; the labels
        # still report the actual share.
        lengths = [share if share > 0 else 0.0001 for share in shares]
        labels = [f"{share:.1%}" for share in shares]
        return lengths, labels

    overall_inputs = {family: bar_inputs(category_data, family) for family in model_names}
    trump_inputs = {family: bar_inputs(trump_category_data, family) for family in model_names}

    fig = go.Figure()

    # Overall traces are added first, Trump-only traces second; the dropdown's
    # visibility lists below rely on that order.
    views = (
        (overall_inputs, overall_hover, True),
        (trump_inputs, trump_hover, False),
    )
    for inputs, hover, initially_visible in views:
        for family in model_names:
            lengths, labels = inputs[family]
            fig.add_trace(go.Bar(
                x=lengths,
                y=categories,
                name=family.capitalize(),
                orientation="h",
                text=labels,
                textposition="none",
                marker_color=color_map[family],
                hovertemplate=hover,
                visible=initially_visible,
                legendgroup=family,
                showlegend=True
            ))

    show_overall = [True] * num_overall_traces + [False] * num_case_traces
    show_trump = [False] * num_overall_traces + [True] * num_case_traces

    fig.update_layout(
        barmode='group',
        xaxis_title="Proportion",
        yaxis_title="",
        width=700,
        height=400,
        margin=dict(l=80, r=10, t=40, b=30),  # More top margin
        font=dict(size=12),
        legend=dict(
            font=dict(size=12),
            y=1.15,  # Adjust this to line up with dropdown
            x=0.7,
            xanchor="center",
            yanchor="top",
            orientation="h"
        ),
        updatemenus=[
            dict(
                type="dropdown",
                direction="down",
                showactive=True,
                buttons=[
                    dict(label="Overall",
                        method="update",
                        args=[{"visible": show_overall},
                              {"title": ""}]),
                    dict(label="Trump Case",
                        method="update",
                        args=[{"visible": show_trump},
                              {"title": ""}]),
                ],
                x=0,
                xanchor="left",
                y=1.15,  # Same as legend y
                yanchor="top",
                font=dict(size=12)
            ),
        ]
    )

    # Reversed y-axis so the first category is drawn at the top.
    fig.update_yaxes(autorange="reversed", tickfont=dict(size=9))
    fig.update_xaxes(tickfont=dict(size=9))

    fig.show()

    if not SAVE_HTML:
        fig.write_image(f"{fname}.png")
    else:
        fig.write_html(f"{fname}.html")

# Process the data
def _to_domain_shares(per_group):
    """Strip the ``num_cites_`` prefix, drop ``w_trump``/``other``, and scale each row to sum to 1."""
    renamed = per_group.rename(
        columns={domain: domain.replace('num_cites_', '') for domain in plot_df.columns}
    )
    kept = renamed.drop(columns=['w_trump', 'other'])
    return kept.div(kept.sum(axis=1), axis=0)


# Process the data
# Overall view: per-group citation totals turned into within-group shares.
plot_df_general = _to_domain_shares(plot_df.groupby('model_group').sum())
category_data = plot_df_general.T.to_dict(orient="index")

# Trump view: same shares, restricted to rows flagged `w_trump` and built
# from per-group means.
plot_df_trump = _to_domain_shares(plot_df[plot_df["w_trump"] > 0].groupby('model_group').mean())
trump_category_data = plot_df_trump.T.to_dict(orient="index")

plot_domain_histogram(category_data, trump_category_data, "plots/domain_citations")

### Control Experiments

In [None]:
# response length
# Control features: the response length of each side.
CONTROL_ELEMENTS = [f"response_length_{side}" for side in ("a", "b")]

print("\n\n\n")
print("#" * 100)
print("Leaderboard with Response Length Control")
full_ratings_num_citations_style = run_all(
    battle_data,
    style_elements=CONTROL_ELEMENTS,
    viz_leaderboard=True,
)





####################################################################################################
Leaderboard with Response Length Control
##################################################
BT Ratings
##################################################


##################################################
Style Coef
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
ppl-sonar-reasoning-pro-high,1111.734344,102.175279,1130.841717,1090.659609,861,1
gemini-2.5-pro-grounding,1107.671486,78.72444,1125.421023,1089.843159,1215,1
ppl-sonar-reasoning,1086.361687,49.877173,1101.289352,1073.590884,1644,1
ppl-sonar-pro-high,1072.23864,55.48431,1087.33135,1058.394621,1364,3
ppl-sonar,1069.001967,64.720267,1084.357642,1054.377032,1208,3
ppl-sonar-pro,1068.025691,59.727737,1083.604962,1053.309888,1214,3
gemini-2.0-flash-grounding,1030.513913,74.25073,1047.355002,1013.111849,1193,7
api-gpt-4o-search,1000.0,56.855323,1014.629128,985.17784,1196,7
api-gpt-4o-search-high,996.58947,40.281862,1009.815342,984.881036,1707,8
api-gpt-4o-search-high-loc,996.526374,60.904395,1012.35074,980.829812,1226,8


In [None]:
# num citations
# Control features: the number of citations of each side.
STYLE_CONTROL_ELEMENTS = [
    "num_citations_a",
    "num_citations_b",
]
print("\n\n\n")
print("#" * 100)
# The heading names the feature actually controlled for in this run; it was
# previously a copy of the citation-style heading used earlier.
print("Leaderboard with Number of Citations Control")
full_ratings_num_citations_style = run_all(battle_data, style_elements=STYLE_CONTROL_ELEMENTS, viz_leaderboard=True)





####################################################################################################
Leaderboard with Style Control (citation style)
##################################################
BT Ratings
##################################################


##################################################
Style Coef
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
gemini-2.5-pro-grounding,1120.756205,72.554824,1137.184759,1103.343412,1215,1
ppl-sonar-reasoning-pro-high,1089.785722,110.294772,1109.705374,1069.735228,861,1
ppl-sonar-reasoning,1053.8332,56.009246,1069.092223,1039.530938,1644,3
ppl-sonar,1026.9961,73.131989,1043.61034,1011.410905,1208,3
ppl-sonar-pro-high,1019.867239,67.630131,1036.892198,1004.525565,1364,4
ppl-sonar-pro,1014.426373,72.981771,1032.182737,998.153925,1214,4
gemini-2.0-flash-grounding,1014.134529,79.033535,1031.181519,996.176554,1193,4
api-gpt-4o-search,1000.0,68.858626,1016.089588,983.348533,1196,4
api-gpt-4o-search-high,998.288606,49.757884,1013.102633,985.102292,1707,4
api-gpt-4o-search-high-loc,993.021834,72.48864,1010.108753,976.967016,1226,5


In [None]:
# Control features: one `cites_<domain>` indicator per side for every domain
# category except "other".
CONTROL_ELEMENTS = [f"cites_{domain}_a" for domain in DOMAIN_CATEGORIES if domain != "other"]
CONTROL_ELEMENTS += [f"cites_{domain}_b" for domain in DOMAIN_CATEGORIES if domain != "other"]
print("\n\n\n")
print("#" * 100)
# The heading names the per-domain indicators controlled for here (it used to
# say "#Citations", which is a different feature).
print("Leaderboard with Cited Domain Control")
full_ratings_num_citations_style = run_all(battle_data, style_elements=CONTROL_ELEMENTS, viz_leaderboard=True, save=True, file_prefix="domain_citations_style_control_")





####################################################################################################
Leaderboard with #Citations Control
##################################################
BT Ratings
##################################################


##################################################
Style Coef
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
gemini-2.5-pro-grounding,1139.36221,74.140951,1156.132907,1122.363827,1215,1
ppl-sonar-reasoning-pro-high,1119.531249,104.567577,1139.009242,1098.903982,861,1
ppl-sonar-reasoning,1083.543443,54.385694,1098.878254,1069.77785,1644,3
ppl-sonar,1058.508231,69.075898,1074.254667,1043.305203,1208,3
ppl-sonar-pro-high,1054.104368,61.134639,1069.639913,1039.074501,1364,4
ppl-sonar-pro,1051.380178,68.194206,1068.59936,1035.757609,1214,4
gemini-2.0-flash-grounding,1026.486229,75.469615,1043.24005,1008.168681,1193,5
api-gpt-4o-search,1000.0,64.191984,1015.161876,984.315835,1196,7
api-gpt-4o-search-high,999.353953,47.556059,1012.954235,986.033339,1707,7
api-gpt-4o-search-high-loc,996.508028,69.116102,1012.776646,980.31565,1226,7


In [None]:
# Control features for both sides: response length, number of citations, and
# one `cites_<domain>` indicator for every domain category except "other".
CONTROL_ELEMENTS = ["response_length_a", "num_citations_a"] + [f"cites_{domain}_a" for domain in DOMAIN_CATEGORIES if domain != "other"]
CONTROL_ELEMENTS += ["response_length_b", "num_citations_b"] + [f"cites_{domain}_b" for domain in DOMAIN_CATEGORIES if domain != "other"]

print("\n\n\n")
print("#" * 100)
# The heading lists every feature group in CONTROL_ELEMENTS (it used to
# mention domain citations only).
print("Leaderboard with Response Length, #Citations and Cited Domain Control")
full_ratings_num_citations_style = run_all(battle_data, style_elements=CONTROL_ELEMENTS, viz_leaderboard=True)





####################################################################################################
Leaderboard with Domain Citations Control
##################################################
BT Ratings
##################################################


##################################################
Style Coef
##################################################


Unnamed: 0,rating,variance,rating_q975,rating_q025,num_battles,final_ranking
gemini-2.5-pro-grounding,1098.588666,83.835015,1116.87305,1080.054522,1215,1
ppl-sonar-reasoning-pro-high,1081.843326,111.701734,1102.076877,1061.027411,861,1
ppl-sonar-reasoning,1058.228835,59.370849,1074.154723,1043.90559,1644,2
ppl-sonar,1038.182678,80.669256,1055.714317,1022.178576,1208,3
ppl-sonar-pro-high,1036.827832,73.783139,1053.956593,1020.946696,1364,3
ppl-sonar-pro,1033.020924,80.204076,1050.661655,1015.296455,1214,3
gemini-2.0-flash-grounding,1021.38011,78.083717,1038.222848,1003.4852,1193,4
api-gpt-4o-search,1000.0,73.619337,1016.269557,983.454414,1196,6
api-gpt-4o-search-high,997.115456,53.913875,1011.888355,983.441946,1707,7
api-gpt-4o-search-high-loc,997.042812,75.124238,1013.819352,980.459886,1226,7


In [None]:
import random

# Control features for both sides: response length, number of citations, and
# one `cites_<domain>` indicator for every domain category except "other".
CONTROL_ELEMENTS = ["response_length_a", "num_citations_a"] + [f"cites_{domain}_a" for domain in DOMAIN_CATEGORIES if domain != "other"]
CONTROL_ELEMENTS += ["response_length_b", "num_citations_b"] + [f"cites_{domain}_b" for domain in DOMAIN_CATEGORIES if domain != "other"]

anchor_model = 'api-gpt-4o-search'
# Set explicitly so this cell does not depend on a value left behind by an
# earlier cell.
anchor_rating = 1000

# Uncontrolled ratings, shifted so the anchor model sits at `anchor_rating`.
bt_ratings = compute_bt(battle_data)
offset_score = (anchor_rating - bt_ratings[anchor_model])
bt_ratings += offset_score

# Ratings fitted with the control features, shifted the same way.
bt_ratings_style, _ = compute_style_control(battle_data, style_elements=CONTROL_ELEMENTS)
offset_score = (anchor_rating - bt_ratings_style[anchor_model])
bt_ratings_style += offset_score

# A uniform offset in [-SCORE_JITTER, +SCORE_JITTER] is added to every plotted
# score (presumably to keep near-identical lines apart), so the plotted and
# hovered values differ slightly from the fitted ratings. A dedicated, seeded
# generator makes the figure identical on every run.
SCORE_JITTER = 3.0
jitter_rng = random.Random(0)

bt_change_scores = {}
for model in get_model_order(battle_data):
    bt_change_scores[model] = (
        bt_ratings[model] + jitter_rng.uniform(-SCORE_JITTER, SCORE_JITTER),
        bt_ratings_style[model] + jitter_rng.uniform(-SCORE_JITTER, SCORE_JITTER),
    )

fig = viz_change_scores(bt_change_scores, "original", "controlled")
fig.show()
if SAVE_HTML:
    fig.write_html("plots/style_control.html")
else:
    fig.write_image("plots/style_control.png")