# When does Model Averaging improve?

In [1]:
from lewidi_lib import (
    load_preds,
    make_query_from_dict,
    process_rdf_and_add_perf_metrics,
)


# Directory holding the predictions of this run (Qwen3-32B, template 31, set2).
preds_dir = "/mnt/disk16tb/globus_shared/from-lrz-ai-systems/tasks_0_cscfull_t31_Qwen_Qwen3-32B_set2/preds"

rdf = load_preds(preds_dir)
# Remove exact duplicate rows in place before filtering.
rdf.drop_duplicates(inplace=True)

# Experimental configuration to keep; turned into a query string over the
# columns of `rdf` by make_query_from_dict.
metadata = dict(
    template_id=31,
    model_id="Qwen/Qwen3-32B",
    gen_kwargs="set2",
    dataset="CSC",
    judge_model_id="gemini-2.5-pro",
)
query = make_query_from_dict(metadata, rdf.columns)

# Filter to the selected configuration, then post-process.
# NOTE(review): presumably this is where the ws_loss / pred_entropy columns
# used by the later cells are added -- confirm in lewidi_lib.
rdf = process_rdf_and_add_perf_metrics(rdf.query(query))

In [2]:
from lewidi_lib import compute_average_baseline


# Baseline computed from the filtered predictions in `rdf`.
# NOTE(review): presumably the "model averaging" result from the notebook
# title; the exact aggregation lives in lewidi_lib -- confirm there.
model_avg_baseline = compute_average_baseline(rdf)

In [None]:
# Compare, for every (dataset_idx, tgt_has_holes) group of `rdf`, the mean
# metrics of the group's rows against the baseline row with the same dataset_idx.
#   improvement       = mean ws_loss of the group - baseline ws_loss
#                       (positive when the baseline's ws_loss is the smaller one)
#   pred_entropy_diff = baseline pred_entropy - mean pred_entropy of the group
improvements = (
    rdf.groupby(["dataset_idx", "tgt_has_holes"], as_index=False)
    .agg(
        avg_ws_loss=("ws_loss", "mean"),
        avg_pred_entropy=("pred_entropy", "mean"),
    )
    # Name the join key explicitly: without `on`, pandas joins on every column
    # the two frames happen to share, so an extra shared column appearing on
    # either side would silently change the join.
    .merge(
        model_avg_baseline[["dataset_idx", "ws_loss", "pred_entropy"]],
        on="dataset_idx",
        how="inner",
    )
    .assign(
        improvement=lambda df: df["avg_ws_loss"] - df["ws_loss"],
        pred_entropy_diff=lambda df: df["pred_entropy"] - df["avg_pred_entropy"],
    )
)

# Display the three groups with the largest improvement.
improvements.sort_values("improvement", ascending=False).head(3)

In [None]:
import seaborn as sns

# One point per row of `improvements`: loss improvement on the x-axis against
# the entropy difference (baseline minus group mean) on the y-axis.
sns.scatterplot(data=improvements, x="improvement", y="pred_entropy_diff")

In [None]:
# Histogram of the improvement values, coloured by the tgt_has_holes flag.
sns.histplot(
    data=improvements,
    x="improvement",
    hue="tgt_has_holes",
)