In [1]:
# Auto-reload imported modules before each cell executes, so edits to local
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import logging

# Logger for this notebook's own messages, named after the current module.
logger = logging.getLogger(__name__)

In [None]:
from lewidi_lib import load_dataset, enable_logging

# Call the library's logging setup before any data is loaded.
enable_logging()

datasets = ["CSC", "MP"]
splits = ["train", "dev"]

# One frame per (dataset, split) pair, iterated dataset-major, then stacked
# into a single frame. The original row indices of each frame are kept.
frames = [load_dataset(name, split=part) for name in datasets for part in splits]
ddf = pd.concat(frames)

print(len(ddf))
ddf.head(2)

In [None]:
import duckdb
from lewidi_lib import load_preds, process_rdf

# DuckDB connection opened with default arguments.
# NOTE(review): `con` is not referenced by any cell visible here; confirm it is still needed.
con = duckdb.connect()

# Read the stored predictions from the parquet directory and post-process
# them in a single expression.
rdf = process_rdf(load_preds(parquets_dir="../parquets"))

In [None]:
import seaborn as sns

# Validity flag vs. model size: one panel per (split, template), one line per
# generation setting. Seaborn's line plot aggregates repeated x values
# (mean by default), so the y axis reads as the share of valid predictions.
g = sns.relplot(
    data=rdf,
    kind="line",
    # what is plotted
    x="model_size",
    y="is_valid_pred",
    # how lines are distinguished
    hue="gen_kwargs",
    style="gen_kwargs",
    marker="o",
    # facet layout
    row="split",
    col="template_id",
    height=3,
    aspect=1.2,
)
g.set_axis_labels("Model Params [B]", "Valid Preds")

# Fix the y range on every panel (slightly above 1 so points at 1.0 are not
# clipped) and add a light grid.
g.set(ylim=(0, 1.05))
for panel in g.axes.ravel():
    panel.grid(alpha=0.5)



In [None]:
# Count the rows flagged invalid with a boolean sum instead of materialising
# a filtered frame just to take its length.
n_invalid = int((~rdf["is_valid_pred"]).sum())
logger.info("Dropping %d predictions that don't sum to 1", n_invalid)

# Keep only the valid predictions. Rebinding (rather than `inplace=True`)
# leaves the previously loaded frame object untouched, so anything still
# holding a reference to it is not silently changed.
rdf = rdf.query("is_valid_pred")

In [7]:
from lewidi_lib import assign_cols_perf_metrics, join_dataset_and_preds

# Join the dataset rows with the predictions, then add the performance-metric
# columns to the joined frame.
joint_df = assign_cols_perf_metrics(
    join_dataset_and_preds(ddf, rdf)
)

# Baselines

In [None]:
# Number of prediction rows for each combination of the key columns below.
pred_key_cols = [
    "model_id",
    "gen_kwargs",
    "dataset",
    "split",
    "template_id",
    "dataset_idx",
]
rdf.groupby(pred_key_cols).size()

In [None]:
from lewidi_lib import (
    compute_baseline_entropy,
    compute_unif_baseline_perf_metrics,
    compute_strong_baselines_perf_metrics,
)

strong_baselines = compute_strong_baselines_perf_metrics()

# The uniform baseline is computed from the dataset frame only. Cross-joining
# it with the distinct template ids found in the predictions repeats every
# baseline row once per template.
template_ids = pd.Series(rdf["template_id"].unique(), name="template_id")
unif_baseline_perf_metrics = compute_unif_baseline_perf_metrics(ddf).merge(
    template_ids, how="cross"
)

unif_baseline_entropy = compute_baseline_entropy(datasets)

# Is Performance Correlated With Size?

In [None]:
from lewidi_lib import plot_baseline_losses
import seaborn as sns

# The baselines drawn below are for a single split. The panels here are not
# faceted by split, so without filtering each line would pool every split
# present in `joint_df` and be compared against a baseline for one split only.
# The same variable drives both the data filter and the baseline calls.
baseline_split = "train"

g = sns.relplot(
    data=joint_df.query("split == @baseline_split"),
    x="model_size",
    y="ws_loss",
    col="template_id",
    row="dataset",
    kind="line",
    marker="o",
    hue="gen_kwargs",
    style="gen_kwargs",
    height=3,
    aspect=1.2,
    facet_kws={"sharey": True},
)
g.set_axis_labels("Model Params [B]", "Wasserstein Distance")

# Overlay the reference levels for the same split as the plotted data.
plot_baseline_losses(
    g,
    unif_baseline_perf_metrics,
    split=baseline_split,
    label="Uniform Baseline",
    color="blue",
)
plot_baseline_losses(
    g, strong_baselines, split=baseline_split, label="Gemini 2.5 Pro", color="red"
)

# Is Performance Correlated With Average Entropy?

In [None]:
from lewidi_lib import plot_baseline_entropy


# Prediction entropy vs. model size: rows are datasets, columns are splits,
# one line per generation setting. The x axis is shared across panels; each
# panel gets its own y range.
g = sns.relplot(
    data=joint_df,
    kind="line",
    x="model_size",
    y="pred_entropy",
    hue="gen_kwargs",
    marker="o",
    row="dataset",
    col="split",
    height=2.5,
    aspect=1.2,
    facet_kws={"sharey": False, "sharex": True},
)

# Light grid on every panel.
for panel in g.axes.ravel():
    panel.grid(alpha=0.5)

# Overlay the uniform-baseline entropy on the grid.
plot_baseline_entropy(g, unif_baseline_entropy)

In [None]:
# Collapse the joined frame to one row per (model size, generation settings,
# dataset, split), holding the mean prediction entropy and mean
# Wasserstein loss of that group.
ent_group_keys = ["model_size", "gen_kwargs", "dataset", "split"]
ent_df = joint_df.groupby(ent_group_keys, as_index=False).agg(
    avg_entropy=pd.NamedAgg(column="pred_entropy", aggfunc="mean"),
    avg_ws_loss=pd.NamedAgg(column="ws_loss", aggfunc="mean"),
)



In [None]:
# Mean loss against mean entropy, one point per aggregated group: colour
# encodes model size, marker style encodes the generation settings. Panels
# (dataset x split) have independent axes.
g = sns.relplot(
    data=ent_df,
    kind="scatter",
    x="avg_entropy",
    y="avg_ws_loss",
    hue="model_size",
    palette="viridis",
    style="gen_kwargs",
    row="dataset",
    col="split",
    height=2.5,
    aspect=1.2,
    facet_kws={"sharey": False, "sharex": False},
)

# Light grid on every panel.
for panel in g.axes.ravel():
    panel.grid(alpha=0.5)

# Overlay both reference baselines; no split is passed here because the grid
# itself is faceted by split.
plot_baseline_losses(
    g, unif_baseline_perf_metrics, label="Uniform Baseline", color="blue"
)
plot_baseline_losses(g, strong_baselines, label="Gemini 2.5 Pro", color="red")
