In [None]:
%load_ext autoreload
%autoreload 2
from logging import getLogger
from lewidi_lib import enable_logging
import pandas as pd
import seaborn as sns

# Let pandas display up to 100 characters per cell before truncating, so long
# text columns stay readable in rendered frames.
pd.set_option("display.max_colwidth", 100)
# Project helper from lewidi_lib; presumably installs the logging
# handlers/levels used by the `logger.info` calls below — TODO confirm.
enable_logging()
logger = getLogger(__name__)
# Seaborn "talk" plotting context: enlarged fonts and line widths.
sns.set_context("talk")

In [None]:
from lewidi_lib import assert_path_exists, preds_file

# Experiment selection: the dataset and task analysed by every cell below.
dataset, task = "CSC", "soft-label"

# Resolve the predictions file for this dataset/model/template/run.
# `file` is kept as a module-level name because the ratings path is later
# derived from it.
file = preds_file(
    run_name="1000ex_10loops",
    model_id="Qwen/Qwen3-14B",
    template="62",
    split="train",
    dataset=dataset,
)

# `assert_path_exists` presumably raises when the file is missing and returns
# the path otherwise — verify against lewidi_lib.
rdf = pd.read_parquet(assert_path_exists(file))

In [None]:
from lewidi_lib import (
    assign_cols_perf_metrics_softlabel,
    compute_diversity_by_problem,
    join_dataset,
    keep_only_highest_diversity_problems,
    process_rdf,
)

# Post-process the raw predictions. `rdf` is rebound to the processed frame,
# so re-running this cell without re-running the loading cell feeds
# already-processed rows back into `process_rdf`.
rdf = process_rdf(
    rdf, task=task, response_contains_steps=True, discard_invalid_pred=True
)
# Diversity is only computed for the soft-label task; for any other task the
# name `answer_diversity` is never defined.
if task == "soft-label":
    answer_diversity = compute_diversity_by_problem(rdf)
    # `merge` defaults to an inner join: rows whose `dataset_idx` has no entry
    # in `answer_diversity` are dropped.
    rdf = rdf.merge(answer_diversity, on="dataset_idx")
# Cell output: number of prediction rows left after processing.
len(rdf)

In [None]:
from lewidi_lib import assign_cols_perf_metrics


# Join the predictions with the dataset and add the per-row performance
# metric columns (presumably the reference annotations are what is joined in —
# verify against lewidi_lib.join_dataset).
joint_df = join_dataset(rdf, task=task)
joint_df = assign_cols_perf_metrics(joint_df, task=task)

# Named switch instead of a bare `if False:` so the intent of the toggle is
# visible: set to True to restrict the analysis to the highest-diversity
# problems. The default (False) keeps every problem, as before.
ONLY_HIGHEST_DIVERSITY = False

if ONLY_HIGHEST_DIVERSITY:
    joint_df_subset = keep_only_highest_diversity_problems(joint_df)
else:
    joint_df_subset = joint_df

In [None]:
# answer_diversity.query("diversity == 'Q5'")[["dataset_idx", "diversity"]].to_parquet("high_diversity_ids.parquet")

# """
# copy (
#     select rdf.*
#     from (select * from '../../1000ex_10loops/preds/responses.parquet') as rdf
#     join 'high_diversity_ids.parquet' as ids on rdf.dataset_idx = ids.dataset_idx
#     )
# to 'responses.parquet';
# """

# Load Ratings

In [None]:
from lewidi_lib import (
    assign_col_response_parsed,
    discard_na_response_rows,
    process_ratings,
)
import numpy as np
from prm800k import mapping

# Judge model whose ratings are analysed. Exactly one assignment should be
# active; the commented lines are the alternatives available in `judge_file`.
# judge = "gemini-2.5-flash"
# judge = "Qwen/Qwen3-32B"
judge = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
# judge = "qwen/qwen3-235b-a22b-2507"
# judge = "Qwen/Qwen3-235B-A22B-Thinking-2507"

# Location of each judge's responses, relative to the run's `judge/` directory.
judge_file = {
    "gemini-2.5-flash": "gemini-2.5-flash/t24/responses.parquet",
    "Qwen/Qwen3-32B": "Qwen/Qwen3-32B/set2/t24/1000ex_10loops_q5div/responses.parquet",
    "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B/set2/t24/1000ex_10loops_q5div/responses.parquet",
    "qwen/qwen3-235b-a22b-2507": "qwen/qwen3-235b-a22b-2507/set2/t24/1000ex_10loops_q5div/responses.parquet",
    "Qwen/Qwen3-235B-A22B-Thinking-2507": "Qwen/Qwen3-235B-A22B-Thinking-2507/set2/t24/1000ex_10loops_q5div/responses.parquet",
}
# The judge outputs live two directory levels above the predictions file.
ratings_file = file.parent.parent / "judge" / judge_file[judge]

ratings = pd.read_parquet(ratings_file)
logger.info(
    "Loaded %d ratings for %d different dataset_idxs",
    len(ratings),
    ratings["dataset_idx"].nunique(),
)
ratings = discard_na_response_rows(ratings)
ratings = assign_col_response_parsed(ratings)
# Step ratings are aggregated with the mean into a score.
# NOTE(review): both `ok` and `bad` are mapped to zero (0.0 and 0) — confirm
# this is the intended category mapping.
ratings = process_ratings(
    ratings, operation=np.mean, cat_mapping=mapping(ok=0.0, bad=0)
)
# Keep only the columns needed downstream and rename the judge's reasoning so
# it cannot be confused with the model's own reasoning after the merge.
# The rename is chained (not `inplace=True`): mutating a column-subset frame in
# place is discouraged by pandas and can emit SettingWithCopyWarning on some
# versions.
ratings = ratings[
    ["dataset_idx", "run_idx", "step_ratings", "score", "reasoning", "response_parsed"]
].rename(columns={"reasoning": "judge_reasoning"})

In [None]:
# Left-join the judge ratings onto the predictions by (problem, run), then
# filter on the `score` column (presumably dropping rows where it is NA, i.e.
# predictions the judge did not rate — confirm in lewidi_lib).
joint_df_subset = discard_na_response_rows(
    joint_df_subset.merge(ratings, how="left", on=["dataset_idx", "run_idx"]),
    col="score",
)

n_scored_rows = len(joint_df_subset)
n_scored_problems = joint_df_subset["dataset_idx"].nunique()
logger.info(
    "We have score for %d examples of %d different dataset_idxs",
    n_scored_rows,
    n_scored_problems,
)

In [None]:
from lewidi_lib import compute_n_steps_equality

# Recompute the parsed-response column on the merged frame.
# NOTE(review): `ratings` also carried a `response_parsed` column into the
# merge; presumably this call restores the column for the model's own
# response — confirm.
joint_df_subset = assign_col_response_parsed(joint_df_subset)
# The result is only displayed as the cell output; it is not stored.
compute_n_steps_equality(joint_df_subset, step_source="response_parsed")

# BoN Loss Stats

In [None]:
# Cell output: number of (problem, run) rows that enter the statistics below.
len(joint_df_subset)

In [None]:
from lewidi_lib import bootstrap_avg

# Performance column used for the rest of the notebook, chosen by task.
perf_col = "ws_loss" if task == "soft-label" else "avg_abs_dist"

# Random-pick reference: draw one row per problem uniformly at random, average
# the metric over problems, and repeat 100 times. The global NumPy seed makes
# the draws reproducible.
np.random.seed(0)
random_pick_means = []
for draw_idx in range(100):
    one_row_per_problem = joint_df_subset.groupby("dataset_idx").sample(n=1)
    random_pick_means.append(one_row_per_problem[perf_col].mean())
bootstrap_avg(random_pick_means)

In [None]:
from lewidi_lib import select_max_score_df

# Judge-guided selection: shuffle the rows, let `select_max_score_df` pick the
# rows with the maximal score, and average the metric; repeated 20 times.
# The shuffle presumably randomises tie-breaking between equally scored runs —
# TODO confirm against select_max_score_df.
np.random.seed(0)
bon_means = []
for shuffle_idx in range(20):
    shuffled_rows = joint_df_subset.sample(frac=1.0)
    bon_means.append(select_max_score_df(shuffled_rows)[perf_col].mean())
bootstrap_avg(bon_means)

In [None]:
# Oracle selection: for every problem take the row with the smallest value of
# the performance column.
best_row_labels = joint_df_subset.groupby("dataset_idx")[perf_col].idxmin()
oracle = joint_df_subset.loc[best_row_labels]
# Summarise both the metric and the judge score of the oracle rows.
oracle[[perf_col, "score"]].apply(bootstrap_avg)

# Problem-Level Correlation

In [None]:
def corr(df, perf_col: str):
    """Return the Pearson correlation between ``df["score"]`` and ``df[perf_col]``.

    Args:
        df: Frame holding a ``score`` column and the column named by ``perf_col``.
        perf_col: Name of the performance column to correlate with the score.

    Returns:
        The off-diagonal entry of the 2x2 matrix produced by ``np.corrcoef``.
        The value is NaN when the correlation is undefined (for example when
        one of the two columns is constant).
    """
    scores = df["score"]
    performance = df[perf_col]
    correlation_matrix = np.corrcoef(scores, performance)
    return correlation_matrix[0, 1]


# One correlation coefficient per problem: `corr` is applied to each
# `dataset_idx` group, restricted to the score and performance columns.
corrs = (
    joint_df_subset.groupby("dataset_idx")[["score", perf_col]]
    .apply(corr, perf_col=perf_col)
    # Undefined correlations (NaN) are counted as 0, which pulls the average
    # towards zero rather than excluding those problems.
    .fillna(0)
)
bootstrap_avg(corrs)

# Individual Examples

In [None]:
# max_loss = joint_df_subset.loc[joint_df_subset.groupby("dataset_idx")["ws_loss"].idxmax()]
# row = max_loss.iloc[0]
# print(row["response"])

# Aggregate Stats

In [None]:
# Distribution of judge scores over all rated rows.
ax = sns.histplot(joint_df_subset, x="score")
# Horizontal grid lines only, to make the bar heights easier to read.
ax.grid(alpha=0.5, axis="y")
# Compact 6x3 inch figure.
ax.figure.set_size_inches(6, 3)

# How does performance change with the number of samples?

In [None]:
from lewidi_lib import (
    assign_col_avg_abs_dist,
    compute_average_baseline_and_assing_perf_metrics,
    compute_maj_vote_baseline,
    compute_oracle_baseline,
)


# Aggregation baseline: averaging for the soft-label task, majority vote (plus
# the average-absolute-distance column) for every other task.
if task != "soft-label":
    baseline_df = assign_col_avg_abs_dist(compute_maj_vote_baseline(joint_df_subset))
else:
    baseline_df = compute_average_baseline_and_assing_perf_metrics(joint_df_subset)

baseline_perf = baseline_df[perf_col].mean()

# Oracle rows according to the performance column; this rebinds `oracle`.
oracle = compute_oracle_baseline(joint_df_subset, perf_col=perf_col)
oracle_summary_cols = [perf_col, "score"]
oracle[oracle_summary_cols].apply(bootstrap_avg)

In [None]:
from lewidi_lib import compute_oracle_baseline

# Mean oracle performance, drawn as a horizontal reference line in the plot.
# Despite its name, the value is in whatever metric `perf_col` selects.
oracle_rows = compute_oracle_baseline(joint_df_subset, perf_col=perf_col)
oracle_ws_loss = oracle_rows[perf_col].mean()

In [None]:
from lewidi_lib import draw_bon_k_times

np.random.seed(0)

# One row per candidate-pool size, from 1 up to the number of distinct runs.
n_runs = joint_df_subset["run_idx"].nunique()
perf_vs_samples = pd.DataFrame({"n_samples": range(1, n_runs + 1)})

# For every pool size call `draw_bon_k_times` with k=100; each cell presumably
# holds the collection of the k resulting performances — TODO confirm.
perf_vs_samples[perf_col] = perf_vs_samples["n_samples"].apply(
    draw_bon_k_times,
    joint_df=joint_df_subset,
    k=100,
    performance_col=perf_col,
)

# Flatten to one row per (n_samples, draw) for plotting.
perf_vs_samples = perf_vs_samples.explode(perf_col)

In [None]:
# Labels follow the active task instead of being hard-coded for soft-label:
# `perf_col` is "ws_loss" only for the soft-label task, and the baseline is a
# majority vote (not model averaging) for every other task.
metric_labels = {
    "ws_loss": "Wasserstein Distance",
    "avg_abs_dist": "Average Absolute Distance",
}
baseline_label = "Model Averaging" if task == "soft-label" else "Majority Vote"

# Mean performance per number of samples, with a standard-deviation band.
axs = sns.lineplot(data=perf_vs_samples, x="n_samples", y=perf_col, errorbar="sd")
axs.grid(alpha=0.5)
axs.set_xlabel("Number of samples")
# Fall back to the raw column name for a metric without a display label.
axs.set_ylabel(metric_labels.get(perf_col, perf_col))
# Horizontal reference lines: the aggregation baseline and the oracle.
axs.axhline(y=baseline_perf, color="orange", linestyle="--", label=baseline_label)
axs.axhline(y=oracle_ws_loss, color="red", linestyle="--", label="BoN Oracle")
axs.legend()

In [None]:
# judge_alias = judge.replace("/", "_")
# perf_vs_samples.assign(judge=judge, dataset=dataset).to_parquet(
#     f"../notebook/tables/bon_samples_vs_perf_nlp/{judge_alias}_{dataset}.parquet"
# )