In [None]:
%load_ext autoreload
%autoreload 2
from lewidi_lib import enable_logging
import pandas as pd
import duckdb
import seaborn as sns

# Switch seaborn to its "talk" plotting context for every figure in this notebook.
sns.set_context("talk")
# Project helper from lewidi_lib; presumably configures logging output — TODO confirm.
enable_logging()

# Math Datasets

In [None]:
from lewidi_lib import compact_model_name


# Read every parquet file under ./tables/bon_samples_vs_perf/ into one frame,
# drop the "__index_level_0__" column (presumably a leftover pandas index from
# writing the parquet files), and add the display columns used by the plots:
#   Judge   - judge name passed through compact_model_name
#   Dataset - upper-cased dataset name
df = (
    duckdb.sql("SELECT * FROM read_parquet('./tables/bon_samples_vs_perf/*')")
    .df()
    .drop(columns=["__index_level_0__"])
    .assign(
        Judge=lambda frame: frame["judge"].apply(compact_model_name),
        Dataset=lambda frame: frame["dataset"].apply(lambda name: name.upper()),
    )
)

In [None]:
import numpy as np


def quantiles(xs):
    """Return the 25th and 75th percentiles of ``xs`` as a ``(low, high)`` tuple.

    Used in this notebook as the ``errorbar`` callable of ``sns.relplot``, so
    the band drawn around each line spans the interquartile range.
    """
    lower, upper = np.quantile(xs, [0.25, 0.75])
    return lower, upper

In [None]:
from lewidi_lib import plot_horizontal_lines

# Math benchmarks: "is_correct" against the number of LLM samples, one panel
# per dataset and one line (with its own marker) per judge. The band around
# each line is the interquartile range returned by `quantiles`.
fgrid = sns.relplot(
    data=df,
    kind="line",
    x="n_samples",
    y="is_correct",
    hue="Judge",
    style="Judge",
    markers=["o", "s", "D", "P"],
    errorbar=quantiles,
    col="Dataset",
    col_order=["PRM800K", "AIME"],
    facet_kws=dict(sharey=False),
    height=4,
    aspect=1.2,
)
for ax in fgrid.axes.flat:
    ax.grid(alpha=0.5)
fgrid.set_axis_labels("LLM samples $N$", "Correct answers")
# Place the legend above the panels, laid out in three columns.
sns.move_legend(fgrid, loc="lower left", bbox_to_anchor=(0.1, 0.95), ncol=3)

# Hard-coded "Simple Sampling" reference value per dataset, drawn as a
# horizontal line in the matching panel.
data = pd.DataFrame(
    [("PRM800K", 0.721), ("AIME", 0.639)],
    columns=["Dataset", "is_correct"],
)
plot_horizontal_lines(
    fgrid,
    data,
    data_col="is_correct",
    label="Simple Sampling",
    color="blue",
    hpos="right",
    fontsize=16,
)

fgrid.savefig("./imgs/bon-eval/bon_samples_vs_perf_math.pdf", bbox_inches="tight")

# NLP Datasets

In [None]:
from lewidi_lib import compact_model_name, rename_dataset


# Load the table under ./tables/bon_samples_vs_perf_nlp_t60/, leave out the
# 'gemini-2.5-flash' judge, drop the "__index_level_0__" column, and add the
# display columns (Judge, Dataset) that the plots facet and colour by.
sldf = (
    duckdb.sql("SELECT * FROM read_parquet('./tables/bon_samples_vs_perf_nlp_t60/*')")
    .df()
    .query("judge != 'gemini-2.5-flash'")
    .drop(columns=["__index_level_0__"])
    .assign(Judge=lambda frame: frame["judge"].apply(compact_model_name))
    .pipe(rename_dataset, col="dataset")
    .assign(Dataset=lambda frame: frame["dataset"])
)
sldf.head(2)

In [None]:
from lewidi_lib import rename_dataset

# Reference values drawn as horizontal lines in the soft-label figure.

# (1) The "Model Averaging" row of the baseline table, melted from one column
#     per dataset into one row per dataset.
sl_baselines_wide = pd.read_csv("./tables/soft-label/32b_ws_loss.csv")
sl_model_averaging_df = sl_baselines_wide.query("Baseline == 'Model Averaging'").melt(
    id_vars="Baseline", var_name="Dataset", value_name="ws_loss"
)

# (2) "Simple Sampling": mean ws_loss per dataset over the rows with n_samples == 1.
sl_single_sample_df = sldf.query("n_samples == 1")
sl_simple_sampling_df = (
    sl_single_sample_df.groupby("Dataset", as_index=False)["ws_loss"].mean()
).assign(Baseline="Simple Sampling")

# Stack both baselines and pass the dataset names through rename_dataset.
sl_hlines_df = rename_dataset(
    pd.concat([sl_model_averaging_df, sl_simple_sampling_df]), col="Dataset"
)
sl_hlines_df

In [None]:
def plot_bon_scaling(
    df: "pd.DataFrame",
    hlines_df: "pd.DataFrame",
    perf_col: str,
    scaling_method: str,
    dist1_label: str,
    dist2_label: str,
    sharey: bool,
) -> "sns.FacetGrid":
    """Plot ``perf_col`` against the number of LLM samples on a 2x2 dataset grid.

    One panel is drawn per dataset (CSC, PAR, MP, VEN) with one line per judge;
    the band around each line comes from ``quantiles``. Two horizontal
    reference lines are added per panel from ``hlines_df``: "Simple Sampling"
    (blue) and ``scaling_method`` (red).

    Args:
        df: Long-format results with columns ``n_samples``, ``Dataset``,
            ``Judge`` and ``perf_col``.
        hlines_df: Reference values with a ``Baseline`` column and a
            ``perf_col`` column; it is passed to ``plot_horizontal_lines``
            after being filtered by baseline name.
        perf_col: Name of the metric column plotted on the y axis.
        scaling_method: ``Baseline`` value of the second reference line; also
            used as that line's label.
        dist1_label: Y-axis label applied through ``set_axis_labels``.
        dist2_label: Y-axis label that overrides ``dist1_label`` on the first
            panel of the second row.
        sharey: Whether the panels share their y axis.

    Returns:
        The seaborn ``FacetGrid`` holding the figure.
    """
    datasets = ["CSC", "PAR", "MP", "VEN"]
    fgrid = sns.relplot(
        df,
        x="n_samples",
        y=perf_col,
        col="Dataset",
        col_wrap=2,
        col_order=datasets,
        hue="Judge",
        style="Judge",
        markers=["o", "s", "D", "P"],
        kind="line",
        facet_kws={"sharey": sharey},
        height=3.5,
        aspect=1.5,
        errorbar=quantiles,
    )
    fgrid.set_axis_labels("LLM samples $N$", dist1_label)
    sns.move_legend(fgrid, loc="lower left", bbox_to_anchor=(0.2, 1.0), ncol=2)
    for ax in fgrid.axes.flat:
        ax.grid(alpha=0.5)
    # With col_wrap the axes array is flat, so index 2 is the first panel of
    # the second row ("MP"); that row gets its own y-axis label.
    fgrid.axes[2].set_ylabel(dist2_label)

    def baseline_rows(name: str) -> "pd.DataFrame":
        # Boolean mask instead of an interpolated query string, so a baseline
        # name containing quotes or backticks cannot break the expression.
        return hlines_df[hlines_df["Baseline"] == name]

    hlines_fontsize = 16
    # Same drawing order as before: Simple Sampling first, then the scaling method.
    for baseline, color in (("Simple Sampling", "blue"), (scaling_method, "red")):
        plot_horizontal_lines(
            fgrid,
            baseline_rows(baseline),
            label=baseline,
            color=color,
            data_col=perf_col,
            fontsize=hlines_fontsize,
        )

    return fgrid


# Soft-label figure: ws_loss versus number of samples, with "Model Averaging"
# as the second reference line.
# NOTE(review): sldf was already passed through rename_dataset(col="dataset")
# when it was loaded; this second call is kept unchanged — confirm it is needed.
sl_plot_df = rename_dataset(sldf)
fgrid = plot_bon_scaling(
    sl_plot_df,
    sl_hlines_df,
    perf_col="ws_loss",
    scaling_method="Model Averaging",
    sharey=False,
    dist1_label="Wasserstein Distance",
    dist2_label="Manhattan Distance",
)
fgrid.savefig("./imgs/bon-eval/bon_samples_vs_perf_nlp_t60.pdf", bbox_inches="tight")

# Perspectivist

In [None]:
# Load the table under ./tables/bon_samples_vs_perf_nlp_t63/, leave out the
# 'gemini-2.5-flash' judge, drop the "__index_level_0__" column, and add the
# display columns (Judge, Dataset) that the plots facet and colour by.
pedf = (
    duckdb.sql("SELECT * FROM read_parquet('./tables/bon_samples_vs_perf_nlp_t63/*')")
    .df()
    .query("judge != 'gemini-2.5-flash'")
    .drop(columns=["__index_level_0__"])
    .assign(Judge=lambda frame: frame["judge"].apply(compact_model_name))
    .pipe(rename_dataset, col="dataset")
    .assign(Dataset=lambda frame: frame["dataset"])
)

In [None]:
# Baseline drawn as the second reference line in the perspectivist figure.
pe_reference = "Majority Vote"

# (1) Row of the baseline table selected by pe_reference, melted to one row
#     per dataset. Despite its name, pe_model_averaging_df holds whichever
#     baseline pe_reference names.
pe_baselines_wide = pd.read_csv("./tables/perspectivist/32b_avg_abs_dist.csv")
pe_model_averaging_df = pe_baselines_wide.query(f"Baseline == '{pe_reference}'").melt(
    id_vars="Baseline", var_name="Dataset", value_name="avg_abs_dist"
)

# (2) "Simple Sampling": mean avg_abs_dist per dataset over rows with n_samples == 1.
pe_single_sample_df = pedf.query("n_samples == 1")
pe_simple_sampling_df = (
    pe_single_sample_df.groupby("Dataset", as_index=False)["avg_abs_dist"].mean()
).assign(Baseline="Simple Sampling")

# Stack both baselines and pass the dataset names through rename_dataset.
pe_hlines_df = rename_dataset(
    pd.concat([pe_model_averaging_df, pe_simple_sampling_df]), col="Dataset"
)
pe_hlines_df

In [None]:
# Perspectivist figure: avg_abs_dist versus number of samples, with the
# pe_reference baseline as the second reference line.
fgrid = plot_bon_scaling(
    df=pedf,
    hlines_df=pe_hlines_df,
    perf_col="avg_abs_dist",
    scaling_method=pe_reference,
    sharey=False,
    dist1_label="Absolute Distance",
    dist2_label="Error Rate",
)
fgrid.savefig("./imgs/bon-eval/bon_samples_vs_perf_nlp_t63.pdf", bbox_inches="tight")

# Performance tables (N = 10 samples)

In [None]:
# Mean ws_loss per (judge, dataset) over the rows with n_samples == 10,
# rounded to three decimals and written out as CSV.
sl_bon_table = (
    sldf.query("n_samples == 10")
    .groupby(["judge", "dataset"])[["ws_loss"]]
    .mean()
    .reset_index()
    .round(3)
)
sl_bon_table.to_csv("./tables/soft-label/32b_bon_ws_loss.csv", index=False)

In [None]:
# Mean avg_abs_dist per (judge, dataset) over the rows with n_samples == 10,
# rounded to three decimals and written out as CSV.
pe_bon_table = (
    pedf.query("n_samples == 10")
    .groupby(["judge", "dataset"])[["avg_abs_dist"]]
    .mean()
    .reset_index()
    .round(3)
)
pe_bon_table.to_csv("./tables/perspectivist/32b_bon_avg_abs_dist.csv", index=False)