In [1]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from lewidi_lib import (
    assign_cols_perf_metrics,
    enable_logging,
    join_correct_responses,
    load_preds,
    make_query_from_dict,
    process_rdf,
)
import pandas as pd
import logging

logger = logging.getLogger(__name__)
enable_logging()

# Judge responses are stored as JSON Lines (one record per line).
# Alternative input on the shared disk:
# "/mnt/disk16tb/globus_shared/from-lrz-ai-systems/tasks_0_cscfull_t31_Qwen_Qwen3-32B_set2/judge/Qwen3-32B.jsonl"
ratings = pd.read_json(
    "../parquets/reasoning-ratings/template-2-reasoning-judge-responses.jsonl",
    lines=True,
)
# Files without a "split" column are labelled "train" throughout.
ratings = ratings if "split" in ratings.columns else ratings.assign(split="train")
print("len(ratings)=", len(ratings))


# Alternative source on the shared disk:
# rdf = load_preds(
#     parquets_dir="/mnt/disk16tb/globus_shared/from-lrz-ai-systems/tasks_0_cscfull_t31_Qwen_Qwen3-32B_set2/preds"
# )
# Load the predictions and discard exact duplicate rows in one expression.
rdf = load_preds(parquets_dir="../parquets").drop_duplicates()


def preprocess(rdf: pd.DataFrame, model_id="Qwen/Qwen3-32B") -> pd.DataFrame:
    """Restrict predictions to one fixed configuration and run the processing steps.

    The frame is filtered with a query built by ``make_query_from_dict`` from
    the settings below (only ``model_id`` is configurable), then passed through
    ``process_rdf``, ``join_correct_responses`` and ``assign_cols_perf_metrics``
    in that order.

    Args:
        rdf: Predictions frame to filter and process.
        model_id: Value used for the ``model_id`` entry of the filter.

    Returns:
        The output of the last processing step.
    """
    filters = {
        "template_id": 31,
        "model_id": model_id,
        "gen_kwargs": "set2",
        "dataset": "CSC",
        "judge_model_id": "gemini-2.5-pro",
    }
    # NOTE(review): the frame's columns are handed to the query builder,
    # presumably so filter keys missing from the frame are skipped — confirm.
    selected = rdf.query(make_query_from_dict(filters, rdf.columns))
    for step in (process_rdf, join_correct_responses, assign_cols_perf_metrics):
        selected = step(selected)
    return selected


# Rebinds `rdf` to its preprocessed version; re-running this cell on its own
# feeds the already-processed frame back into `preprocess`.
rdf = rdf.pipe(preprocess)

In [None]:
from itertools import product
import json_repair
from lewidi_lib import (
    create_rating_matrix,
    drop_failed_rows,
    drop_na_response_rows,
    process_ratings,
)
from prm800k import mapping

# Drop failed rows first, then rows with a missing response.
ratings = drop_na_response_rows(drop_failed_rows(ratings))
# Parse each raw judge response with json_repair.loads.
ratings = ratings.assign(response_parsed=ratings["response"].apply(json_repair.loads))
# NOTE(review): both `ok` and `bad` are mapped to 0 here — confirm this is intended.
ratings = process_ratings(ratings, cat_mapping=mapping(ok=0, bad=0))

In [4]:
# Key columns shared by ratings and predictions; extend if more columns are
# needed to identify a row.
join_cols = ["dataset", "dataset_idx", "run_idx"]
# Columns taken from the ratings frame before joining.
ratings_cols = [
    "response",
    "step_ratings",
    "score",
    "reasoning",
    "judge_model_id",
    "dataset",
    "split",
    "dataset_idx",
    "run_idx",
]


def join_ratings(rdf: pd.DataFrame, ratings: pd.DataFrame, ratings_cols=ratings_cols):
    """Inner-join selected rating columns onto the predictions frame.

    Args:
        rdf: Predictions frame; must contain every column in ``join_cols``.
        ratings: Ratings frame; must contain every column in ``ratings_cols``.
        ratings_cols: Columns of ``ratings`` to keep (must include ``join_cols``).

    Returns:
        One row per key combination present in both frames. Non-key columns
        that exist in both inputs keep their name for the ``rdf`` side and get
        a ``_judge`` suffix for the ``ratings`` side.
    """
    judge_side = ratings.loc[:, ratings_cols]
    return pd.merge(
        judge_side,
        rdf,
        how="inner",
        on=join_cols,
        suffixes=("_judge", ""),
    )


# Ratings joined with the preprocessed predictions on the shared key columns.
joint = join_ratings(rdf=rdf, ratings=ratings)
# Disabled sanity check that the join neither dropped nor duplicated rating rows:
# assert len(joint) == len(ratings), (len(joint), len(ratings))

In [None]:
all_best_rows = create_rating_matrix(ratings)
# Mean `score` and `ws_loss` per (reduction, rating_type) group, rounded to
# two decimals.
(
    join_ratings(
        rdf,
        all_best_rows,
        ratings_cols=ratings_cols + ["rating_type", "reduction"],
    )
    .groupby(["reduction", "rating_type"])
    .agg(
        score=pd.NamedAgg(column="score", aggfunc="mean"),
        ws_loss=pd.NamedAgg(column="ws_loss", aggfunc="mean"),
        # pred_entropy=pd.NamedAgg(column="pred_entropy", aggfunc="mean"),
    )
    .round(2)
)

In [None]:
# Pairwise correlation matrix (pandas default method) of score and ws_loss.
joint.loc[:, ["score", "ws_loss"]].corr()

In [None]:
import seaborn as sns

# JointGrid is built by hand so two layers can share the joint axes:
# a scatter of the raw points plus a regression line without its own scatter.
fgrid = (
    sns.JointGrid(data=joint, x="score", y="ws_loss")
    .plot_joint(sns.scatterplot, alpha=0.5)
    .plot_joint(sns.regplot, scatter=False)
)
# Histograms of each variable on the marginal axes.
fgrid.plot_marginals(sns.histplot)

In [None]:
# True only if every dataset_idx has exactly 10 rows in `joint`
# (presumably the number of runs per item — TODO confirm).
joint.groupby("dataset_idx").size().eq(10).all()

In [None]:
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# shared disk is mounted.
# Load, drop exact duplicate rows, then apply the same preprocessing as `rdf`.
rdflarge = preprocess(
    load_preds(
        "/mnt/disk16tb/globus_shared/from-lrz-ai-systems/tasks_0_cscfull_t31_Qwen_Qwen3-32B_set2/preds"
    ).drop_duplicates()
)

In [None]:
# Observation: there is almost no performance difference between the normal
# outputs and those selected for top trace ratings.
# Per-item means first, then the mean over items. The final .mean() also
# averages the dataset_idx column itself.
avg_ws_loss = rdflarge.groupby("dataset_idx", as_index=False)[
    ["ws_loss", "pred_entropy"]
].mean()
avg_ws_loss.mean()

In [11]:
# For every dataset_idx keep the single row with the highest `score`.
# idxmax returns index labels, so this assumes `joint` has unique index labels.
best_by_judge = joint.loc[
    joint.groupby("dataset_idx")["score"].idxmax(),
    [
        "dataset_idx",
        "score",
        "tgt_has_holes",
        "ws_loss",
        "pred_entropy",
        "target_entropy",
    ],
]

In [12]:
# Length of each reasoning value, measured with len().
rdflarge = rdflarge.assign(
    reasoning_len_chars=lambda df: df["reasoning"].map(len)
)
# Per dataset_idx, the row with the longest reasoning.
# NOTE(review): selection is by index label — assumes labels are unique; confirm.
most_cot_chars = rdflarge.loc[
    rdflarge.groupby("dataset_idx")["reasoning_len_chars"].idxmax(), :
]

In [13]:
import nltk

# Only the first 100 distinct dataset_idx values (in order of appearance) are
# sentence-tokenized.
fst_n_dataset_idxs = rdflarge["dataset_idx"].unique()[:100]
rdflarge_n_ixs = rdflarge[rdflarge["dataset_idx"].isin(fst_n_dataset_idxs)].assign(
    # Number of sentences nltk finds in each reasoning text.
    reasoning_len_steps=lambda df: df["reasoning"].map(
        lambda text: len(nltk.sent_tokenize(text))
    )
)
# Per dataset_idx, the row whose reasoning has the most sentences.
most_cot_steps = rdflarge_n_ixs.loc[
    rdflarge_n_ixs.groupby("dataset_idx")["reasoning_len_steps"].idxmax(), :
]

In [None]:
from lewidi_lib import (
    agg_perf_metrics,
    compute_average_baseline,
    process_rdf_and_add_perf_metrics,
)

# Average baseline computed from the large prediction set.
model_avg_baseline = compute_average_baseline(rdflarge)

# Baseline predictions restricted to template 31, processed and given
# performance metrics by the library helper.
gemini_raw = process_rdf_and_add_perf_metrics(
    load_preds("../parquets/baseline").query("template_id == 31")
)
# Aggregated metrics of the raw baseline rows and of their average baseline.
gemini_agg = agg_perf_metrics(gemini_raw)
gemini_model_avg = agg_perf_metrics(compute_average_baseline(gemini_raw))

In [None]:
# Rank rows inside each dataset_idx by pred_entropy (1 = lowest). With
# method="first", ties are broken by order of appearance, so ranks are distinct.
rdflarge = rdflarge.assign(
    entropy_rank=lambda df: df.groupby("dataset_idx")["pred_entropy"]
    .rank(method="first")
    .astype(int)
)
# Mean loss and entropy across items for each rank position, all labelled
# with the same type.
# Alternative per-rank label: "entropy r" + (by_entropy["entropy_rank"] - 1).astype(str)
by_entropy = (
    rdflarge.groupby("entropy_rank", as_index=False)[["ws_loss", "pred_entropy"]]
    .mean()
    .assign(type="entropy")
)
by_entropy.head(2)

In [16]:
# Metrics compared across the selection strategies.
cols = ["pred_entropy", "ws_loss"]
# One row per strategy: the column means of `cols` for each source. The
# strategy name becomes the "type" column after transposing.
loss_vs_entropy = pd.DataFrame(
    {
        "best_by_judge": best_by_judge[cols].mean(),
        "simple": avg_ws_loss[cols].mean(),
        "model_avg_baseline": model_avg_baseline[cols].mean(),
        "gemini-2.5-pro": gemini_agg[cols].mean(),
        "gemini-2.5-pro-model-avg": gemini_model_avg[cols].mean(),
        "most_cot_chars": most_cot_chars[cols].mean(),
        "most_cot_steps": most_cot_steps[cols].mean(),
    }
).T.reset_index(names="type")
# Append the per-entropy-rank rows. Both inputs carry a 0..n-1 index, so
# ignore_index=True is needed to avoid duplicate labels; the plotting code
# passes loss_vs_entropy["type"] as a vector, which is aligned on the index.
loss_vs_entropy = pd.concat(
    [loss_vs_entropy, by_entropy.drop(columns="entropy_rank")],
    ignore_index=True,
)

In [None]:
sns.set_context("talk")
# Scatter of mean loss against mean entropy; each strategy type gets its own
# colour and marker, with stacked histograms on the margins.
grid = (
    sns.JointGrid(data=loss_vs_entropy, x="pred_entropy", y="ws_loss")
    .plot_joint(
        sns.scatterplot,
        hue=loss_vs_entropy["type"],
        style=loss_vs_entropy["type"],
    )
    .plot_marginals(sns.histplot, multiple="stack")
)
# Anchor the legend to the right of the joint axes.
grid.ax_joint.legend(loc="upper left", bbox_to_anchor=(1.2, 1))
grid.ax_joint.grid(alpha=0.5)

In [None]:
# Stack the three result sets, tagging each row with the strategy it came from.
hist_data = pd.concat(
    [
        frame.assign(type=label)
        for label, frame in (
            ("simple", rdflarge),
            ("model_avg_baseline", model_avg_baseline),
            ("best_by_judge", best_by_judge),
        )
    ]
)

# ws_loss distribution per strategy.
sns.violinplot(data=hist_data, x="type", y="ws_loss", common_norm=False)

In [None]:
# Linear fit of ws_loss against pred_entropy for the judge-selected rows.
sns.lmplot(data=best_by_judge, x="pred_entropy", y="ws_loss")

In [None]:
# All rows of the first dataset_idx (in order of appearance in `joint`),
# coloured by score.
ax = sns.scatterplot(
    data=joint[joint["dataset_idx"] == joint["dataset_idx"].unique()[0]],
    x="pred_entropy",
    y="ws_loss",
    hue="score",
)
# Place the legend outside the axes, to the right.
ax.legend(loc="upper left", bbox_to_anchor=(1.05, 1))