In [None]:
%load_ext autoreload
%autoreload 
import pandas as pd

pd.set_option("display.max_colwidth", 100)

In [None]:
def is_response_valid(response: dict) -> bool:
    """Tell whether a parsed response carries a final answer.

    Args:
        response: Parsed response object. Anything that is not a ``dict``
            is rejected.

    Returns:
        ``True`` only when ``response`` is a ``dict`` containing the key
        ``"final_response"``; ``False`` otherwise.
    """
    if not isinstance(response, dict):
        return False
    return "final_response" in response


def extract_final_response(d: dict) -> "str | None":
    """Return the ``"final_response"`` entry of a parsed response.

    The argument is not guaranteed to be a ``dict``: this notebook applies
    the function to the output of ``json_repair.loads``, which is whatever
    object the parser produced. Non-dict values are therefore treated as
    "no final response" instead of raising ``TypeError``.

    Args:
        d: Parsed response object, normally a ``dict``.

    Returns:
        The value stored under ``"final_response"``, or ``None`` when ``d``
        is not a ``dict`` or the key is absent.
    """
    if not isinstance(d, dict):
        return None
    return d.get("final_response")

In [None]:
from logging import getLogger
from lewidi_lib import (
    discard_failed_rows,
    discard_na_response_rows,
    drop_duplicates_in_ds_idx_run_idx,
    enable_logging,
    join_dataset,
    preds_file,
    recompute_success,
)
import pandas as pd
import json_repair

# Activate the project's logging helper and get a logger for this notebook.
enable_logging()
logger = getLogger(__name__)


# Path of the predictions parquet for the run under analysis.
file = preds_file(
    run_name="1000ex_10loops",
    model_id="Qwen/Qwen3-32B",
    template="60",
    split="train",
    dataset="prm800k",
)
rdf = pd.read_parquet(file)
logger.info("rdf has %d rows", len(rdf))

# Run the lewidi_lib clean-up helpers in sequence on the raw predictions.
rdf = (
    rdf.pipe(recompute_success)
    .pipe(discard_failed_rows)
    .pipe(discard_na_response_rows)
)

# Parse every raw response with json_repair and pull out its final answer.
rdf["response_parsed"] = rdf["response"].apply(json_repair.loads)
rdf["pred"] = rdf["response_parsed"].apply(extract_final_response)

# Apply the NA filter to the extracted prediction column, then de-duplicate.
rdf = rdf.pipe(discard_na_response_rows, col="pred").pipe(
    drop_duplicates_in_ds_idx_run_idx
)

In [None]:
# The judge output is addressed relative to the predictions file: two
# directory levels up, then into the judge run's sub-folders.
is_correct_file = file.parent.parent.joinpath(
    "judge",
    "Qwen/Qwen3-14B",
    "set2",
    "t60",
    "1000ex_10loops",
    "responses.parquet",
)
is_correct = pd.read_parquet(is_correct_file)
logger.info("Loaded %d is_correct rows", len(is_correct))

# Keep the join keys plus the judge's verdict and reasoning, rename them so
# they cannot clash with the prediction columns, and cast the verdict to int.
is_correct = (
    discard_na_response_rows(is_correct)[
        ["dataset_idx", "run_idx", "response", "reasoning"]
    ]
    .rename(columns={"response": "is_correct", "reasoning": "is_correct_reasoning"})
    .astype({"is_correct": "int"})
    .pipe(drop_duplicates_in_ds_idx_run_idx)
)

In [None]:
from lewidi_lib import compute_is_correct_crosstab

# Build the correctness crosstab from the judge verdicts with long=True
# (the exact layout is defined in lewidi_lib and is not visible here).
perf_ct = compute_is_correct_crosstab(is_correct, long=True)
# Show how many rows fall into each `correct_level` value.
perf_ct["correct_level"].value_counts()

In [None]:
# mixed_perf = perf_ct.query("correct_level == 'mixed'")
# tgt_mp_file = file.parent.parent.parent / "1000ex_10loops_mixed_perf_subset" / "preds" / "ids.parquet"
# tgt_mp_file.parent.mkdir(parents=True, exist_ok=True)
# mixed_perf.to_parquet(tgt_mp_file, index=False)

# """
# COPY (
#     SELECT rdf.*
#     FROM '../../1000ex_10loops/preds/responses.parquet' as rdf
#     JOIN (
#         SELECT *
#         FROM 'ids.parquet'
#     ) as ids ON rdf.dataset_idx = ids.dataset_idx
# ) TO 'responses.parquet' (FORMAT 'parquet');
# """

In [None]:
from lewidi_lib import bootstrap_avg

# Pass the predictions through join_dataset, then attach the judge verdicts
# on the (dataset_idx, run_idx) key.
joint_df = join_dataset(rdf, parse_tgt=False).merge(
    is_correct, on=["dataset_idx", "run_idx"]
)
# Mean `is_correct` per dataset_idx, handed to bootstrap_avg.
joint_df.groupby("dataset_idx")["is_correct"].mean().pipe(bootstrap_avg)

In [None]:
from lewidi_lib import assign_col_response_parsed, process_ratings
import numpy as np
from prm800k import mapping
import pandas as pd

# Alternative judge run (gemini-2.5-flash), kept for reference:
#ratings_file = "/Users/tomasruiz/datasets/dss_home/lewidi-data/sbatch/di38bec/Qwen_Qwen3-32B/set2/t60/prm800k/train/1000ex_10loops_mixed_perf_subset/judge/gemini-2.5-flash/responses.parquet"
# NOTE(review): absolute, machine-specific path. It presumably could be
# derived from `file` like `is_correct_file` is; confirm the directory layout
# before changing it.
ratings_file = "/Users/tomasruiz/datasets/dss_home/lewidi-data/sbatch/di38bec/Qwen_Qwen3-32B/set2/t60/prm800k/train/1000ex_10loops_mixed_perf_subset/judge/Qwen/Qwen3-32B/set2/t23/1000ex_10loops/responses.parquet"
ratings = pd.read_parquet(ratings_file)
logger.info("Loaded %d ratings", len(ratings))
ratings = discard_na_response_rows(ratings)
ratings = assign_col_response_parsed(ratings)
# Reduce the ratings with np.mean, using the category mapping built by
# prm800k.mapping(ok=0.0, bad=0).
ratings = process_ratings(
    ratings, operation=np.mean, cat_mapping=mapping(ok=0.0, bad=0)
)
# Select the columns needed downstream and rename the judge's reasoning in
# the same chained expression (no in-place mutation of the sliced frame),
# matching how the `is_correct` frame is built.
ratings = ratings[
    ["dataset_idx", "run_idx", "step_ratings", "score", "reasoning", "response_parsed"]
].rename(columns={"reasoning": "judge_reasoning"})

In [None]:
# Left-join the ratings onto the joined frame, then pass the result through
# discard_na_response_rows for the `score` column.
# NOTE: this rebinds `joint_df`, so running the cell twice merges twice.
joint_df = joint_df.merge(
    ratings, how="left", on=["dataset_idx", "run_idx"]
).pipe(discard_na_response_rows, col="score")

In [None]:
import seaborn as sns

# Logistic regression plot of `is_correct` against the rating `score`.
sns.lmplot(
    data=joint_df,
    x="score",
    y="is_correct",
    logistic=True,
)

In [None]:
# Number of distinct (dataset_idx, run_idx) pairs in the joined frame.
len(joint_df.drop_duplicates(subset=["dataset_idx", "run_idx"]))

In [None]:
# Group the joined rows by dataset_idx.
g_ = joint_df.groupby("dataset_idx")
# Mean `score` and mean `is_correct` per dataset_idx; bootstrap_avg is then
# applied to each of the two resulting columns.
g_[["score", "is_correct"]].mean().apply(bootstrap_avg)

In [None]:
# For every dataset_idx keep the row with the highest `score` (idxmax yields
# the index label of the first maximal value in each group).
# NOTE(review): the .loc lookup assumes joint_df's index labels are unique —
# confirm, otherwise duplicated labels would return extra rows.
max_score_df = joint_df.loc[joint_df.groupby("dataset_idx")["score"].idxmax()]
# bootstrap_avg over the selected rows' score and is_correct columns.
max_score_df[["score", "is_correct"]].apply(bootstrap_avg)

In [None]:
def corr(df):
    """Correlation coefficient between the ``score`` and ``is_correct`` columns.

    Args:
        df: Frame exposing ``"score"`` and ``"is_correct"`` columns.

    Returns:
        The off-diagonal entry of ``np.corrcoef`` for the two columns.
    """
    pairwise = np.corrcoef(df["score"], df["is_correct"])
    return pairwise[0, 1]


# Correlation between score and is_correct within each dataset_idx group,
# summarised by bootstrap_avg. Groups whose coefficient is NaN are counted as
# 0 through fillna(0).
# NOTE(review): NaN presumably arises when score or is_correct is constant
# within a group — confirm that treating those as 0 is intended.
bootstrap_avg(
    joint_df.groupby("dataset_idx")[["score", "is_correct"]].apply(corr).fillna(0)
)

# Does the Reduction Operation Matter?
It does not seem so. However, a product reduction with `bad=-1` could give misleading results: when several `-1` ratings are multiplied, an even number of them yields a positive score.

In [None]:
from lewidi_lib import bootstrap_avg, create_rating_matrix

# Join the rating matrix with the correctness labels, then apply
# bootstrap_avg to `is_correct` for every (rating_type, reduction) pair.
pd.merge(
    create_rating_matrix(ratings),
    joint_df[["dataset_idx", "run_idx", "is_correct"]],
    on=["dataset_idx", "run_idx"],
).groupby(["rating_type", "reduction"]).agg(
    is_correct=("is_correct", bootstrap_avg)
)