In [None]:
# The cudf.pandas extension is left disabled; uncomment the next line to load it.
# %load_ext cudf.pandas
# Load autoreload in mode 2 so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import logging

# Logger for this notebook; used further down to report dropped predictions.
logger = logging.getLogger(__name__)

In [None]:
from lewidi_lib import load_dataset, enable_logging

enable_logging()

# --- Run configuration -------------------------------------------------------
datasets = ["CSC", "Paraphrase", "MP", "VariErrNLI"]
splits = ["train"]  # , "dev"]

# Change `task` to "perspectivist" to analyse the other task; the matching
# prompt template id is looked up from the task name.
task = "soft-label"
template_ids = {
    "perspectivist": ["63"],
    "soft-label": ["60"],
}[task]
run_name = "1000ex_10loops"

# One frame holding every requested (dataset, split) combination.
ddf = pd.concat(
    [
        load_dataset(name, split=split, task=task)
        for name in datasets
        for split in splits
    ]
)

In [None]:
import duckdb
from lewidi_lib import process_rdf, list_preds

# Only prediction files produced by this model are analysed in this notebook.
pred_model_id = "Qwen/Qwen3-32B"

# Select the prediction files for the configured run.
# Every Python value is bound with `@name` instead of being pasted into the
# expression text, so values containing quotes cannot break the query and all
# filters are written the same way. `exists` is used as a boolean column.
# The split is deliberately fixed to 'train', independent of `splits`.
preds_files_df = list_preds().query(
    "split == 'train'"
    " and run_name == @run_name"
    " and template_id in @template_ids"
    " and dataset in @datasets"
    " and exists"
    " and model_id == @pred_model_id"
)
preds_files_df


In [None]:
from lewidi_lib import load_listof_parquets

# Paths of the selected prediction files, then the predictions themselves.
files = preds_files_df["preds_file"].to_list()
rdf = load_listof_parquets(files)

In [None]:
# Post-process the loaded predictions for the selected task.
# NOTE(review): `rdf` is overwritten with its own processed version, so
# re-running this cell alone feeds already-processed rows back into
# process_rdf — re-run the loading cell first unless process_rdf is known to
# be idempotent.
rdf = process_rdf(rdf, response_contains_steps=True, task=task)

In [None]:
# Sanity check: which datasets are present in the loaded predictions.
rdf["dataset"].unique()

In [None]:
# Row count per (dataset, model size), before invalid predictions are dropped.
rdf.groupby(["dataset", "model_size"]).size()

In [None]:
import seaborn as sns

# Use seaborn's "talk" plotting context for every figure in this notebook.
sns.set_context("talk")

In [None]:
from pathlib import Path

# Share of valid predictions per dataset, as a percentage with one decimal.
valid_pct_by_dataset = (
    (rdf.groupby("dataset")["is_valid_pred"].mean() * 100).reset_index().round(1)
)

# to_csv does not create missing directories, so make sure the per-task table
# folder exists before writing (the figure cells do the same for ./imgs).
valid_preds_csv = Path(f"./tables/{task}/32b_valid_preds_by_dataset.csv")
valid_preds_csv.parent.mkdir(parents=True, exist_ok=True)
valid_pct_by_dataset.to_csv(valid_preds_csv, index=False)

In [None]:
# Keep only valid predictions. The filter is evaluated once and the result is
# bound to `rdf`, rather than mutating the frame with inplace=True; the number
# of dropped rows is the difference in length.
rdf_valid = rdf.query("is_valid_pred")
logger.info(
    "Dropping %d predictions that are not valid", len(rdf) - len(rdf_valid)
)
rdf = rdf_valid
# Row count per (dataset, model size) after dropping invalid predictions.
rdf.groupby(["dataset", "model_size"]).size()

In [None]:
from lewidi_lib import (
    assign_col_avg_abs_dist,
    assign_cols_perf_metrics,
    join_dataset_and_preds,
)

# Join the dataset rows with the predictions, then attach the performance
# metric columns for the selected task.
joint_df = assign_cols_perf_metrics(
    join_dataset_and_preds(ddf, rdf),
    task=task,
)

# Baselines

In [None]:
from lewidi_lib import (
    compute_average_baseline_and_assing_perf_metrics,
    compute_baseline_entropy,
    compute_maj_vote_baseline,
    compute_majority_baseline,
    compute_most_frequent_baseline,
    compute_pe_rand_baseline,
    compute_target_entropy,
    compute_unif_baseline_perf_metrics,
    agg_perf_metrics,
    compute_smoothed_baseline,
    compute_oracle_baseline,
)

# Pick the performance column / axis label for the task and compute the
# baselines that belong to it.
if task == "soft-label":
    perf_col, perf_col_label = "ws_loss", "Wasserstein Distance"

    majority_baseline = compute_majority_baseline(ddf)
    agg_majority_baseline = agg_perf_metrics(majority_baseline)
    average_baseline = compute_average_baseline_and_assing_perf_metrics(rdf)
    smoothed_baseline = compute_smoothed_baseline(rdf)
    unif_baseline_perf_metrics = compute_unif_baseline_perf_metrics(ddf)
    unif_baseline_entropy = compute_baseline_entropy(datasets)
    target_entropy = compute_target_entropy(ddf)
else:
    perf_col, perf_col_label = "avg_abs_dist", "Absolute Dist."

    rand_baseline = agg_perf_metrics(
        compute_pe_rand_baseline(ddf),
        cols=["avg_abs_dist"],
    )
    most_frequent_baseline = agg_perf_metrics(
        compute_most_frequent_baseline(ddf),
        cols=["avg_abs_dist"],
    )
    maj_vote_baseline = compute_maj_vote_baseline(joint_df)

# The oracle baseline is computed for both tasks, on the task's metric column.
best_perf_baseline = compute_oracle_baseline(joint_df, perf_col=perf_col)

# Is Performance Correlated With Size?

In [None]:
from pathlib import Path
from lewidi_lib import plot_horizontal_lines
import seaborn as sns

# Columns every frame must provide for the comparison plot.
cols_ = [
    # "model_id",
    # "model_size",
    "dataset",
    perf_col,
]
data_ = joint_df[cols_].assign(Baseline="Simple Sampling")

# Stack the raw predictions with the task-specific baselines; the "Baseline"
# column records which method each row belongs to.
if task == "soft-label":
    data_ = pd.concat(
        [
            # unif_baseline_perf_metrics[cols_].assign(Baseline="Uniform"),
            agg_majority_baseline[cols_].assign(Baseline="Most Frequent"),
            data_,
            average_baseline[cols_].assign(Baseline="Model Averaging"),
            smoothed_baseline[cols_].assign(Baseline="Smoothing"),
            best_perf_baseline[cols_].assign(Baseline="BoN Oracle"),
        ]
    )
else:
    data_ = pd.concat(
        [
            most_frequent_baseline[cols_].assign(Baseline="Most Frequent"),
            data_,
            maj_vote_baseline[cols_].assign(Baseline="Majority Vote"),
            best_perf_baseline[cols_].assign(Baseline="BoN Oracle"),
        ]
    )

# Facet layout; `col_wrap` is passed to catplot below instead of a literal 2.
col_wrap = 2
facet_order = ["CSC", "MP", "Paraphrase", "VariErrNLI"]

# Methods shown in the figure; rows with other labels (e.g. "Smoothing") stay
# in `data_` but are filtered out of the plot.
desired = [
    "Most Frequent",
    "Simple Sampling",
    "Model Averaging",
    "BoN Oracle",
    "Majority Vote",
]
g = sns.catplot(
    data_.query("Baseline in @desired").reset_index(drop=True),
    x=perf_col,
    y="Baseline",
    hue="Baseline",
    col="dataset",
    col_wrap=col_wrap,
    col_order=facet_order,
    kind="bar",
    sharex=False,
    height=2.5,
    aspect=1.8,
)
# g.set(ylim=(0, None))
g.set_axis_labels(perf_col_label, "")
# sns.move_legend(g, loc="lower left", bbox_to_anchor=(0.1, 1.0), ncol=3)
for ax in g.axes.flat:
    ax.grid(alpha=0.5, axis="x")

# The VariErrNLI facet gets its own x-axis label. It is looked up by name so
# the override does not depend on the facet's position in `facet_order`.
# NOTE(review): presumably this dataset is scored with a different metric than
# the other three — confirm.
varierrnli_ax = g.axes_dict["VariErrNLI"]
if task == "soft-label":
    varierrnli_ax.set_xlabel("Manhattan Distance")
else:
    varierrnli_ax.set_xlabel("Error Rate")

# Save the figure, creating the output folder on first use.
tgt_path = Path(f"./imgs/{task}/baselines/{perf_col}_32b.pdf")
tgt_path.parent.mkdir(parents=True, exist_ok=True)
g.figure.savefig(tgt_path, bbox_inches="tight")

In [None]:
# `data_32b` must keep this name: the duckdb statement below refers to it.
data_32b = data_  # .query("model_id.str.contains('32B')")
if len(data_32b) == 0:
    logger.warning("No 32B data found!")
else:
    # Row order of the exported table; the third method depends on the task.
    if task == "soft-label":
        method_order = [
            "Simple Sampling",
            "Most Frequent",
            "Model Averaging",
            "BoN Oracle",
        ]
    else:
        method_order = [
            "Simple Sampling",
            "Most Frequent",
            "Majority Vote",
            "BoN Oracle",
        ]

    # One row per method, one column per dataset, cells = mean of the metric.
    perf_32b = duckdb.sql(
        f"PIVOT data_32b ON dataset using mean({perf_col}) GROUP BY Baseline"
    ).df()
    perf_32b = perf_32b.set_index("Baseline").loc[method_order].reset_index()

    # to_csv does not create missing directories, so make sure the per-task
    # table folder exists before writing.
    perf_csv = Path(f"tables/{task}/32b_{perf_col}.csv")
    perf_csv.parent.mkdir(parents=True, exist_ok=True)
    perf_32b.round(3).to_csv(perf_csv, index=False)

# Is performance correlated with avg entropy?

In [None]:
if task == "soft-label":
    # Label each source frame with its method name and stack them.
    # (The BoN Oracle frame is intentionally not part of this comparison.)
    ent_data_ = pd.concat(
        [
            frame.assign(Baseline=label)
            for label, frame in (
                ("Simple Sampling", joint_df),
                ("Model Averaging", average_baseline),
                ("Smoothing", smoothed_baseline),
            )
        ]
    )

    # One bar panel per dataset showing the prediction entropy of each method.
    g = sns.catplot(
        data=ent_data_.reset_index(drop=True),
        kind="bar",
        x="pred_entropy",
        y="Baseline",
        hue="Baseline",
        col="dataset",
        col_order=datasets,
        col_wrap=col_wrap,
        sharex=False,
        height=2.5,
        aspect=1.6,
    )
    g.set_axis_labels("Entropy", "")
    for ax in g.axes.flat:
        ax.grid(alpha=0.5)
    # Reference lines for the uniform and human entropy (plot_horizontal_lines
    # with unif_baseline_entropy / target_entropy) are currently not drawn.

    # Save the figure, creating the output folder on first use.
    tgt_path = Path(f"./imgs/{task}/baselines/entropy_32b.pdf")
    tgt_path.parent.mkdir(parents=True, exist_ok=True)
    g.figure.savefig(tgt_path, bbox_inches="tight")

In [None]:
if task == "soft-label":
    # Mean prediction entropy and mean ws_loss per method, scaled by 100 and
    # rounded to one decimal, ordered by entropy.
    entropy_vs_perf = (
        ent_data_.groupby("Baseline")
        .agg(
            pred_entropy=("pred_entropy", "mean"),
            ws_loss=("ws_loss", "mean"),
        )
        .sort_values("pred_entropy")
        * 100
    ).round(1)

    # to_csv does not create missing directories, so make sure the per-task
    # table folder exists before writing.
    entropy_csv = Path(f"./tables/{task}/32b_entropy_vs_perf.csv")
    entropy_csv.parent.mkdir(parents=True, exist_ok=True)
    entropy_vs_perf.to_csv(entropy_csv)