In [None]:
%load_ext autoreload
%autoreload 
import pandas as pd

pd.set_option("display.max_colwidth", 100)

In [None]:
def is_response_valid(response: dict) -> bool:
    """Tell whether a parsed response can be used.

    A response is valid only when it is a ``dict`` that contains the
    ``"final_response"`` key; every other value yields ``False``.
    """
    if not isinstance(response, dict):
        return False
    return "final_response" in response


def extract_final_response(d: dict) -> "str | None":
    """Return the value stored under ``"final_response"``, or ``None``.

    Args:
        d: A parsed response. A parsed JSON payload is not necessarily an
            object, so lists, strings, numbers and ``None`` are tolerated.

    Returns:
        ``d["final_response"]`` when ``d`` is a dict holding that key,
        otherwise ``None``.
    """
    # Non-dict payloads would otherwise fail on the membership test or on
    # the string-keyed lookup; treat them as "no final response".
    if not isinstance(d, dict):
        return None
    return d.get("final_response")

In [None]:
from logging import getLogger
from lewidi_lib import (
    discard_failed_rows,
    discard_na_response_rows,
    drop_duplicates_in_ds_idx_run_idx,
    enable_logging,
    join_dataset,
    preds_file,
    recompute_success,
)
import pandas as pd
import json_repair

# Switch on the project's logging and get a logger for this notebook.
enable_logging()
logger = getLogger(__name__)

# Locate the stored predictions of the run under analysis.
dataset = "prm800k"
run_spec = dict(
    split="train",
    template="60",
    model_id="Qwen/Qwen3-32B",
    run_name="allex_10loops",
)
file = preds_file(dataset=dataset, **run_spec)

rdf = pd.read_parquet(file)
logger.info("rdf has %d rows", len(rdf))

# Run the lewidi_lib cleaning helpers one after another.
rdf = (
    rdf.pipe(recompute_success)
    .pipe(discard_failed_rows)
    .pipe(discard_na_response_rows)
)

# Parse each raw response (repairing malformed JSON) and pull out the prediction.
rdf["response_parsed"] = rdf["response"].apply(json_repair.loads)
rdf["pred"] = rdf["response_parsed"].apply(extract_final_response)

# Rows without a prediction are dropped before de-duplicating.
rdf = rdf.pipe(discard_na_response_rows, col="pred").pipe(
    drop_duplicates_in_ds_idx_run_idx
)
logger.info(
    "rdf has %d rows for %d different dataset_idx",
    len(rdf),
    rdf["dataset_idx"].nunique(),
)

In [None]:
# The judge's correctness verdicts live next to the predictions directory.
judge_dir = (
    file.parent.parent / "judge" / "Qwen/Qwen3-14B" / "set2" / "t60" / "allex_10loops"
)
is_correct_file = judge_dir / "responses.parquet"

is_correct = pd.read_parquet(is_correct_file)
logger.info(
    "Loaded %d is_correct rows for %d different dataset_idxs",
    len(is_correct),
    is_correct["dataset_idx"].nunique(),
)

# Keep the key columns plus the verdict and its reasoning, rename the latter
# two, cast the verdict to int, and de-duplicate.
kept_cols = ["dataset", "dataset_idx", "run_idx", "response", "reasoning"]
new_names = {"response": "is_correct", "reasoning": "is_correct_reasoning"}
is_correct = (
    discard_na_response_rows(is_correct)[kept_cols]
    .rename(columns=new_names)
    .astype({"is_correct": "int"})
    .pipe(drop_duplicates_in_ds_idx_run_idx)
)

In [None]:
from lewidi_lib import compute_is_correct_crosstab

# Cross-tabulate the judge verdicts (requested with long=True) and count how
# many rows fall under each `correct_level` value.
perf_ct = compute_is_correct_crosstab(is_correct, long=True)
perf_ct["correct_level"].value_counts()

In [None]:
# import duckdb


# mixed_perf = perf_ct.query("correct_level == 'mixed'")
# tgt_mp_file = file.parent.parent.parent / "allex_10loops_mixed_perf_subset" / "preds" / "ids.parquet"
# tgt_responses_file = tgt_mp_file.parent / "responses.parquet"
# tgt_mp_file.parent.mkdir(parents=True, exist_ok=True)
# mixed_perf.to_parquet(tgt_mp_file, index=False)

# sql = f"""
# COPY (
#     SELECT rdf.*
#     FROM '{str(file)}' as rdf
#     JOIN (
#         SELECT *
#         FROM '{str(tgt_mp_file)}'
#         WHERE success = true
#     ) as ids ON rdf.dataset_idx = ids.dataset_idx
# ) TO '{str(tgt_responses_file)}';
# """
# duckdb.sql(sql)

In [None]:
from lewidi_lib import bootstrap_avg

# Only examples whose correct_level is 'mixed' are kept (inner merge below).
mixed_examples = perf_ct.query("correct_level == 'mixed'")

# Predictions joined with the dataset, then with the correctness verdicts per
# (dataset, dataset_idx, run_idx), then restricted to the mixed examples.
joint_df = (
    join_dataset(rdf, parse_tgt=False)
    .merge(is_correct, on=["dataset", "dataset_idx", "run_idx"])
    .merge(mixed_examples, on="dataset_idx")
)
assert len(joint_df) > 0
logger.info("joint_df has %d different dataset_idx", joint_df["dataset_idx"].nunique())

In [None]:
# ratings_file = "/Users/tomasruiz/datasets/dss_home/lewidi-data/sbatch/di38bec/Qwen_Qwen3-32B/set2/t60/prm800k/train/1000ex_10loops_mixed_perf_subset/judge/gemini-2.5-flash/responses.parquet"
# ratings_file = "/Users/tomasruiz/datasets/dss_home/lewidi-data/sbatch/di38bec/Qwen_Qwen3-32B/set2/t60/prm800k/train/1000ex_10loops_mixed_perf_subset/judge/Qwen/Qwen3-32B/set2/t23/1000ex_10loops/responses.parquet"
# ratings_file = "/Users/tomasruiz/datasets/dss_home/lewidi-data/sbatch/di38bec/Qwen_Qwen3-32B/set2/t60/prm800k/train/1000ex_10loops_mixed_perf_subset/judge/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B/set2/t23/1000ex_10loops/responses.parquet"

In [None]:
from lewidi_lib import assign_col_response_parsed, process_ratings
import numpy as np
from prm800k import mapping
import pandas as pd

# Judge whose ratings are analysed; switch by changing which line is active.
judge = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
# judge = "gemini-2.5-flash"
# judge = "Qwen/Qwen3-32B"

# Location of each judge's ratings, relative to three levels above the predictions file.
judge_to_file = {
    "gemini-2.5-flash": "allex_10loops_mixed_perf_subset/judge/gemini-2.5-flash/t24/allex_10loops_mp/responses.parquet",
    "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B": "allex_10loops_mixed_perf_subset/judge/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B/set2/t24/allex_10loops_mp/responses.parquet",
    "Qwen/Qwen3-32B": "allex_10loops_mixed_perf_subset/judge/Qwen/Qwen3-32B/set2/t24/allex_10loops_mp/responses.parquet",
}
ratings_file = file.parent.parent.parent / judge_to_file[judge]

ratings = (
    pd.read_parquet(ratings_file)
    .pipe(recompute_success)
    .pipe(discard_failed_rows)
)
logger.info(
    "Loaded %d ratings for %d different dataset_idx",
    len(ratings),
    len(ratings["dataset_idx"].unique()),
)

# Drop empty responses, parse them, reduce the ratings with np.mean under the
# given category mapping, and rename the judge's reasoning column.
ratings = (
    ratings.pipe(discard_na_response_rows)
    .pipe(assign_col_response_parsed)
    .pipe(process_ratings, operation=np.mean, cat_mapping=mapping(ok=0.0, bad=0))
    .rename(columns={"reasoning": "judge_reasoning"})
)

In [None]:
# NOTE(review): this cell rebinds joint_df; re-running it without first
# re-running the cell that builds joint_df merges the rating columns again.
merge_keys = ["dataset", "dataset_idx", "run_idx"]
ratings_cols = [*merge_keys, "step_ratings", "score", "judge_reasoning"]

# Left-merge the rating columns, then drop the rows that ended up without a score.
joint_df = joint_df.merge(ratings[ratings_cols], on=merge_keys, how="left")
joint_df = discard_na_response_rows(joint_df, col="score")
assert len(joint_df) != 0

# Keep a single row per (dataset_idx, run_idx).
joint_df = joint_df.drop_duplicates(subset=["dataset_idx", "run_idx"])

In [None]:
from lewidi_lib import compute_n_steps_equality


# NOTE(review): presumably checks whether the number of rated steps matches the
# number of steps found in `response_parsed` — confirm against lewidi_lib.
compute_n_steps_equality(joint_df, step_source="response_parsed")

In [None]:
import seaborn as sns

# Plot correctness against the judge score with a fitted logistic regression curve.
sns.set_context("talk")
sns.lmplot(joint_df, x="score", y="is_correct", logistic=True)

In [None]:
# Number of rows in joint_df at this point of the notebook.
len(joint_df)

In [None]:
# Baseline: pick one random row per dataset_idx and average `is_correct`,
# repeated 100 times with a fixed seed.
np.random.seed(0)
by_example = joint_df.groupby("dataset_idx")
random_pick_accuracies = [
    by_example.sample(n=1)["is_correct"].mean() for _ in range(100)
]
bootstrap_avg(random_pick_accuracies)

In [None]:
# There is randomness due to multiple preds being "the best", so we choose one randomly
from lewidi_lib import select_max_score_df

# Shuffle the rows before each selection so the outcome varies between the
# 20 repetitions; the seed keeps the whole cell reproducible.
np.random.seed(0)
max_score_accuracies = [
    select_max_score_df(joint_df.sample(frac=1.0))["is_correct"].mean()
    for _ in range(20)
]
bootstrap_avg(max_score_accuracies)

In [None]:
def corr(df):
    """Return the correlation coefficient between ``score`` and ``is_correct``.

    The value is the off-diagonal entry of the 2x2 matrix produced by
    ``np.corrcoef`` for the two columns of ``df``.
    """
    matrix = np.corrcoef(df["score"], df["is_correct"])
    return matrix[0, 1]


# One score/correctness correlation per dataset_idx; undefined (NaN) values count as 0.
per_example_corr = (
    joint_df.groupby("dataset_idx")[["score", "is_correct"]].apply(corr).fillna(0)
)
bootstrap_avg(per_example_corr)

# Does the Reduction Operation Matter?
Using `ok=0` rather than `ok=1` in the `cat_mapping` passed to `process_ratings` helps.

In [None]:
from lewidi_lib import create_rating_matrix


# Attach the correctness verdicts to the ratings (inner merge on the
# dataset / dataset_idx / run_idx keys) and build the rating matrix with
# `is_correct` as the performance column.
perf_data = ratings.merge(is_correct, on=["dataset", "dataset_idx", "run_idx"])
create_rating_matrix(perf_data, performance_col="is_correct")

# Aggregate Stats

In [None]:
# Histogram of the `score` column, with horizontal grid lines on a 6x3 inch figure.
ax = sns.histplot(joint_df, x="score")
ax.grid(alpha=0.5, axis="y")
ax.figure.set_size_inches(6, 3)

# How Does BoN improve performance with more samples?

In [None]:
from lewidi_lib import draw_bon_k_times

# For every sample count from 1 to 10, call draw_bon_k_times with k=100 and
# store one row per returned value (via explode).
np.random.seed(0)
sample_sizes = range(1, 11)
perf_vs_samples = pd.DataFrame({"n_samples": sample_sizes})
bon_draws = perf_vs_samples["n_samples"].apply(
    draw_bon_k_times, joint_df=joint_df, k=100
)
perf_vs_samples["is_correct"] = bon_draws
perf_vs_samples = perf_vs_samples.explode("is_correct")

In [None]:
# Line plot of `is_correct` against the number of samples; seaborn's lineplot
# aggregates the repeated values at each x position.
axs = sns.lineplot(data=perf_vs_samples, x="n_samples", y="is_correct")
axs.grid(alpha=0.5)
axs.set_xlabel("Number of samples")
axs.set_ylabel("Correct Answers")


In [None]:
from pathlib import Path

# "/" in the judge id would create nested directories, so flatten it for the file name.
judge_alias = judge.replace("/", "_")
out_file = Path(
    f"../notebook/tables/bon_samples_vs_perf/{judge_alias}_{dataset}.parquet"
)
# to_parquet does not create missing directories; make sure the target exists
# so the cell also works on a fresh checkout.
out_file.parent.mkdir(parents=True, exist_ok=True)
perf_vs_samples.assign(judge=judge, dataset=dataset).to_parquet(out_file)