In [None]:
%load_ext autoreload
%autoreload 2
from lewidi_lib import enable_logging

# Turn on logging through the lewidi_lib helper (its configuration lives in lewidi_lib).
enable_logging()

In [None]:
from pathlib import Path
import duckdb
from lewidi_lib import preds_file
import pandas as pd

# Datasets covered by this analysis. "prm800k" and "aime" are classified as math
# by is_math(); every other entry is treated as subjective NLP.
datasets = ["MP", "CSC", "Paraphrase", "VariErrNLI", "prm800k", "aime"]


def qwen32b_preds_file(dataset: str) -> Path:
    """Return the path of the Qwen3-32B train-split predictions for ``dataset``.

    Math datasets (as decided by ``is_math``) are looked up under the
    ``allex_10loops`` run, all others under ``1000ex_10loops``. The path
    itself is resolved by ``preds_file``.
    """
    run = "allex_10loops" if is_math(dataset) else "1000ex_10loops"
    lookup = dict(
        dataset=dataset,
        split="train",
        template="60",
        model_id="Qwen/Qwen3-32B",
        run_name=run,
    )
    return preds_file(**lookup)


# Judge model id (a key of both lookup tables below).
judge = "Qwen/Qwen3-32B"

# Judge model id -> relative path of that judge's responses for the
# subjective-NLP datasets. The paths are joined onto an ancestor directory of
# the predictions file when the files are collected.
judge_file_nlp = {
    "gemini-2.5-flash": "1000ex_10loops/judge/gemini-2.5-flash/t24/responses.parquet",
    "Qwen/Qwen3-32B": "1000ex_10loops/judge/Qwen/Qwen3-32B/set2/t24/1000ex_10loops_q5div/responses.parquet",
    "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B": "1000ex_10loops/judge/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B/set2/t24/1000ex_10loops_q5div/responses.parquet",
}
# Same lookup for the math datasets (selected by judge_to_file via is_math).
judge_to_file_math = {
    "gemini-2.5-flash": "allex_10loops_mixed_perf_subset/judge/gemini-2.5-flash/t24/allex_10loops_mp/responses.parquet",
    "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B": "allex_10loops_mixed_perf_subset/judge/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B/set2/t24/allex_10loops_mp/responses.parquet",
    "Qwen/Qwen3-32B": "allex_10loops_mixed_perf_subset/judge/Qwen/Qwen3-32B/set2/t24/allex_10loops_mp/responses.parquet",
}


def judge_to_file(judge, dataset):
    """Look up the responses path recorded for ``judge`` on ``dataset``.

    Math datasets are served from ``judge_to_file_math``, all others from
    ``judge_file_nlp``. An unknown judge raises ``KeyError``.
    """
    table = judge_to_file_math if is_math(dataset) else judge_file_nlp
    return table[judge]


def is_math(dataset: str) -> bool:
    """Tell whether ``dataset`` names one of the math datasets (case-insensitive)."""
    normalized = dataset.lower()
    return normalized == "prm800k" or normalized == "aime"


def assign_col_domain(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame equal to ``df`` plus a ``domain`` column.

    Rows whose ``dataset`` value satisfies ``is_math`` are labelled "Math",
    the rest "Subjective NLP". The input frame is not modified.
    """

    def _domain_of(name):
        if is_math(name):
            return "Math"
        return "Subjective NLP"

    return df.assign(domain=df["dataset"].apply(_domain_of))


# Collect the predictions file of every dataset and, for each of them, the
# response file of every judge. A missing file aborts the cell immediately.
preds_files = []
judge_files = []
for dataset in datasets:
    pfile = qwen32b_preds_file(dataset)
    # Validate before recording so a bad path never ends up in the list.
    assert pfile.exists(), f"{pfile} does not exist"
    preds_files.append(pfile)
    # Judge paths are relative to the third ancestor directory of the preds file.
    judge_root = pfile.parent.parent.parent
    # Use a dedicated loop name: reusing `judge` here would overwrite the
    # module-level `judge` setting with the last key of the table.
    for judge_id in judge_file_nlp:
        judge_file = judge_root / judge_to_file(judge_id, dataset)
        assert judge_file.exists(), f"{judge_file} does not exist"
        judge_files.append(judge_file)

In [None]:
import json_repair
from lewidi_lib import assign_col_response_parsed, process_ratings
import numpy as np
from prm800k import extract_rating, mapping

# Load every successful judge response from the collected parquet files.
con = duckdb.connect()
judge_paths = [str(path) for path in judge_files]
# The list is interpolated via its Python repr (['a', 'b', ...]) as the file
# list handed to read_parquet.
judge_query = f"""
SELECT dataset, judge_model_id, dataset_idx, run_idx, response, reasoning
FROM read_parquet({judge_paths}, union_by_name=True)
WHERE success = true
"""
df = con.sql(judge_query).df()


def extract(x):
    """Parse ``x`` with ``json_repair`` and collect the ``"rating"`` of each entry.

    Returns ``None`` when parsing or the rating lookup raises any ``Exception``.
    """
    try:
        entries = json_repair.loads(x)
        ratings = []
        for entry in entries:
            ratings.append(entry["rating"])
    except Exception:
        return None
    return ratings


# Parse the raw judge responses with the lewidi_lib helper.
df = assign_col_response_parsed(df)
# NOTE(review): both "ok" and "bad" are given 0 in this category mapping —
# confirm that this is intended.
rating_map = mapping(ok=0.0, bad=0)
# process_ratings receives np.mean as its operation (presumably the
# aggregation over the step ratings — see lewidi_lib).
df = process_ratings(df, operation=np.mean, cat_mapping=rating_map)
# Lengths of the judge's reasoning and of its response text.
length_cols = {
    "reasoning_len": df["reasoning"].str.len(),
    "response_len": df["response"].str.len(),
}
df = df.assign(**length_cols)
df.head(2)

In [None]:
# One row per element of the "step_ratings" list column, exposed as "rating".
step_rating_data = df.explode("step_ratings").rename(columns={"step_ratings": "rating"})

In [None]:
# Percentage of step ratings equal to 1, per judge (rows) and dataset (columns).
(
    step_rating_data.groupby(["judge_model_id", "dataset"])["rating"]
    # relative frequency of each rating value within a (judge, dataset) group
    .value_counts(normalize=True)
    # after reset_index the frequencies live in a column named "proportion"
    # (the name pandas >= 2.0 gives to normalized value counts)
    .reset_index()
    .query("rating == 1")
    .pivot(index="judge_model_id", columns="dataset", values="proportion")
    * 100
).round(1)

In [None]:
# Mean reasoning length per judge (rows) and dataset (columns);
# rows of gemini-2.5-flash are left out.
cot_len_df = df.query("judge_model_id != 'gemini-2.5-flash'")
(
    cot_len_df.groupby(["judge_model_id", "dataset"], as_index=False)["reasoning_len"]
    .mean()
    .pivot(index="judge_model_id", columns="dataset", values="reasoning_len")
    .round(0)
)

In [None]:
# Peek at the first two rows of the processed judge frame.
df.head(2)

In [None]:
# Lookup tables shared with the prediction statistics further down.
block_names = {"response_len": "response", "reasoning_len": "reasoning"}
model_map = {
    "Qwen/Qwen3-32B": "Qwen3-32B",
    "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B": "DeepSeek-R1-8B",
}
ds_map = {"prm800k": "PRM800K", "aime": "AIME"}

# Long-format length table for the judges: one row per (judge response, block).
# Only the reasoning block of the non-Gemini judges is kept.
len_data_judge = (
    df.melt(
        id_vars=["judge_model_id", "dataset"],
        value_vars=["response_len", "reasoning_len"],
        value_name="length",
        var_name="block",
    )
    .assign(model="Judge")
    .assign(block=lambda d: d["block"].map(block_names))
    .pipe(assign_col_domain)
    .query("judge_model_id != 'gemini-2.5-flash' and block == 'reasoning'")
    .assign(
        model_id=lambda d: d["judge_model_id"].map(model_map),
        # dataset names without an entry in ds_map are kept unchanged
        dataset=lambda d: d["dataset"].map(lambda s: ds_map.get(s, s)),
    )
    .assign(title=lambda d: d["model"] + ": " + d["model_id"])
)
len_data_judge.head(2)

# Preds Stats

In [None]:
# Load every successful prediction from the collected Qwen3-32B parquet files.
long = con.sql(
    f"""
SELECT dataset, model_id, dataset_idx, run_idx, response, reasoning
FROM read_parquet({[str(f) for f in preds_files]}, union_by_name=True)
WHERE success = true
"""
).df()

# Replace the full model id by its short display name.
long = long.assign(model_id=long["model_id"].map(model_map))
# Use the vectorised .str.len() (as done for the judge frame): unlike
# apply(len) it yields NaN for a missing response/reasoning instead of
# raising TypeError.
long = long.assign(
    response_len=long["response"].str.len(),
    reasoning_len=long["reasoning"].str.len(),
)

# Long-format length table for the predictions: one row per (prediction, block).
len_data_preds = (
    long.melt(
        id_vars=["model_id", "dataset"],
        value_vars=["response_len", "reasoning_len"],
        value_name="length",
        var_name="block",
    )
    .assign(model="LLM")
    .assign(
        block=lambda d: d["block"].map(block_names),
        # dataset names without an entry in ds_map are kept unchanged
        dataset=lambda d: d["dataset"].map(lambda x: ds_map.get(x, x)),
    )
    .assign(title=lambda d: d["model"] + ": " + d["model_id"])
    # is_math lower-cases the name, so the renamed PRM800K/AIME still count as math
    .pipe(assign_col_domain)
)
len_data_preds.head(2)

In [None]:
# Mean response and reasoning length per dataset, rounded to whole numbers.
# The melt/pivot pair reshapes the result to index "dataset" with one column
# per length type ("reasoning", "response").
long.groupby(["dataset"], as_index=False).agg(
    response=("response_len", "mean"),
    reasoning=("reasoning_len", "mean"),
).round(0).melt(id_vars=["dataset"], var_name="type", value_name="length").pivot(
    index="dataset", columns="type", values="length"
)

In [None]:
# Stack the reasoning-length rows of the predictions on top of the judge rows.
preds_data = len_data_preds.query("block == 'reasoning'")
# Select the judge columns in the same order as the prediction columns.
shared_cols = list(preds_data.columns)
judge_data = len_data_judge[shared_cols]
joint_data = pd.concat([preds_data, judge_data], ignore_index=True)
joint_data.head(2)

In [None]:
import seaborn as sns

# Use seaborn's "talk" plotting context for the figures below.
sns.set_context("talk")


def plot_num_chars(len_data):
    """Draw horizontal bar charts of text length per dataset, one panel per ``title``.

    ``len_data`` must provide the columns ``dataset``, ``length``, ``domain``
    and ``title``. Bars are coloured by domain; the error bars span the 25th
    to 75th percentile of ``length``. Each panel gets its own x axis.

    Returns the seaborn grid so the caller can save or adjust it.
    """

    def _interquartile_range(values):
        return (np.quantile(values, 0.25), np.quantile(values, 0.75))

    grid = sns.catplot(
        len_data,
        kind="bar",
        x="length",
        y="dataset",
        hue="domain",
        col="title",
        margin_titles=True,
        errorbar=_interquartile_range,
        sharex=False,
        aspect=0.8,
    )
    grid.set_titles(col_template="{col_name}")
    grid.set_axis_labels("Number of Chars", "Dataset")

    # Put the legend above the panels, laid out in two columns.
    legend_opts = dict(
        loc="lower left", bbox_to_anchor=(0.3, 1.0), ncol=2, title="Domain"
    )
    sns.move_legend(grid, **legend_opts)

    for panel in grid.axes.flat:
        panel.grid(alpha=0.5, axis="x")
    return grid


fgrid = plot_num_chars(joint_data)

# savefig does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails here with FileNotFoundError.
out_path = Path("imgs/domain_comp/lens-of-responses-and-reasonings.pdf")
out_path.parent.mkdir(parents=True, exist_ok=True)
fgrid.savefig(out_path, bbox_inches="tight")

In [None]:
# fgrid = sns.catplot(
#     len_data_judge,
#     y="dataset",
#     x="length",
#     kind="box",
#     hue="domain",
#     col="model_id",
#     showfliers=False,
#     errorbar=lambda x: (np.quantile(x, 0.25), np.quantile(x, 0.75)),
#     sharex=False,
#     margin_titles=True,
# )
# fgrid.set_axis_labels("Number of Chars", "Dataset")
# sns.move_legend(
#     fgrid, loc="lower left", bbox_to_anchor=(0.3, 1.0), ncol=2, title="Domain"
# )
# fgrid.set_titles(col_template="{col_name}")
# for ax in fgrid.axes.flat:
#     ax.grid(alpha=0.5, axis="x")
# fgrid.savefig("imgs/domain_comp/lens-of-reasonings-judge.pdf", bbox_inches="tight")