In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd

# Allow up to 100 characters per column before pandas truncates cell text in
# displayed frames.
pd.set_option("display.max_colwidth", 100)

In [None]:
from lewidi_lib import enable_logging, load_preds

# Switch on logging via the project helper before any data is loaded.
enable_logging()

# Path is relative to the notebook's working directory; presumably holds the
# prediction parquet files for the perspectivist task — TODO confirm.
folder = "../parquets/perspectivist/"
# Raw predictions. NOTE: later cells rebind `rdf` to processed/filtered versions.
rdf = load_preds(folder)

In [None]:
from lewidi_lib import process_rdf
from functools import lru_cache


@lru_cache
def process_rdf_cached() -> pd.DataFrame:
    """Run `process_rdf` on the notebook-level `rdf` and memoize the result.

    The function takes no arguments, so `lru_cache` holds a single entry: the
    first call does the work and every later call returns that same object.
    `process_rdf` is invoked for the "perspectivist" task with
    `discard_invalid_pred=False`.

    NOTE(review): `rdf` is looked up as a global at call time, and the
    notebook later rebinds `rdf` to this function's own output. If the cache
    is reset (e.g. by re-running this cell) and the function is called again
    without reloading the raw predictions, already-processed data is fed back
    into `process_rdf`.
    """
    processed = process_rdf(rdf, task="perspectivist", discard_invalid_pred=False)
    return processed

In [None]:
# Rebind `rdf` from the raw predictions to the processed (memoized) frame.
rdf = process_rdf_cached()

# Model Compliance
Only the 0.6B model fails to comply with the output format.

In [None]:
# Mean of `is_valid_pred` for every (dataset, model_size) group; observed=True
# limits the result to group combinations that actually occur in the data.
(
    rdf.groupby(["dataset", "model_size"], observed=True)["is_valid_pred"]
    .mean()
)

In [None]:
from lewidi_lib import discard_invalid_preds

# Rebinds `rdf` again: everything below works on the output of
# `discard_invalid_preds` (presumably the rows with a valid prediction only —
# verify against the helper), so the compliance table above must be computed
# before this cell runs.
rdf = discard_invalid_preds(rdf)

# Baselines

In [None]:
from lewidi_lib import compute_pe_rand_baseline, load_dataset

datasets = ["CSC", "MP", "Paraphrase", "VariErrNLI"]

# Load the perspectivist train split of each dataset and stack the frames in
# the order listed above.
train_frames = []
for dataset_name in datasets:
    train_frames.append(
        load_dataset(dataset_name, split="train", task="perspectivist")
    )
ddf = pd.concat(train_frames)

rand_baseline = compute_pe_rand_baseline(ddf)
rand_baseline  # there is a bit of noise in the random baseline

In [None]:
from lewidi_lib import compute_most_frequent_baseline


# Second reference point, computed from the same concatenated train-split
# frame (`ddf`) that the random baseline uses.
most_frequent_baseline = compute_most_frequent_baseline(ddf)
most_frequent_baseline

# Qwen3 Performance

In [None]:
from lewidi_lib import (
    discard_rows_with_distinct_n_annotators,
    assign_col_avg_abs_diff,
    join_correct_responses,
)

# Build the evaluation frame step by step from the filtered predictions in
# `rdf`. Judging by the helper names: attach the reference responses, drop rows
# where the number of annotators differs, then add the `avg_abs_diff` column —
# TODO confirm against lewidi_lib.
joint_df = join_correct_responses(rdf, task="perspectivist")
joint_df = discard_rows_with_distinct_n_annotators(joint_df)
joint_df = assign_col_avg_abs_diff(joint_df)

# Mean error per (dataset, split, model). observed=True mirrors the compliance
# groupby earlier in the notebook: if any key is categorical, unobserved
# combinations would otherwise show up as NaN rows (and pandas warns about the
# changing default). It is a no-op for non-categorical keys.
joint_df.groupby(["dataset", "split", "model_id"], observed=True)[
    "avg_abs_diff"
].mean()

In [None]:
from lewidi_lib import plot_horizontal_lines
import seaborn as sns

# One facet per dataset: `avg_abs_diff` against model size, drawn as a line
# with point markers. Facets do not share a y-axis (sharey=False).
fgrid = sns.relplot(
    data=joint_df,
    kind="line",
    x="model_size",
    y="avg_abs_diff",
    col="dataset",
    marker="o",
    height=3,
    aspect=1.2,
    facet_kws={"sharey": False},
)

# Light grid on every facet.
for ax in fgrid.axes.flat:
    ax.grid(alpha=0.5)

# Overlay both baselines as horizontal reference lines, random first and
# most-frequent second (same order as two separate calls would give).
baseline_overlays = (
    (rand_baseline, "Random baseline", "red"),
    (most_frequent_baseline, "Most frequent baseline", "blue"),
)
for overlay_data, overlay_label, overlay_color in baseline_overlays:
    plot_horizontal_lines(
        fgrid,
        data=overlay_data,
        label=overlay_label,
        color=overlay_color,
        data_col="avg_abs_diff",
    )