In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from lewidi_lib import configure_pandas_display, enable_logging


# Notebook-wide setup through project helpers (presumably logging output and
# pandas display options -- confirm in lewidi_lib).
enable_logging()
configure_pandas_display()

In [None]:
from lewidi_lib import (
    discard_rows_with_different_pred_and_tgt_lengths,
    join_dataset,
    load_listof_parquets,
    preds_file,
    process_rdf,
)


# Run under analysis: a single predictions file, identified by the dataset,
# split, prompt template, model and run name passed below.
task = "perspectivist"
file = preds_file(
    dataset="MP", split="train", template="63",
    model_id="Qwen/Qwen3-32B", run_name="1000ex_10loops",
)
assert file.exists()

# Load the predictions and post-process them with the project helpers.
rdf = process_rdf(
    load_listof_parquets([file]),
    discard_invalid_pred=True,
    response_contains_steps=True,
    task=task,
)

# Join with the dataset, then drop rows whose prediction and target lengths
# differ (as the helper's name states) so later per-element explodes line up.
joint_df = discard_rows_with_different_pred_and_tgt_lengths(
    join_dataset(rdf, task=task)
)

In [None]:
import pandas as pd

# Long format: one row per exploded (annotator_metadata, pred, target) element
# of the rows belonging to run_idx == 1.
cols = ["annotator_metadata", "pred", "target"]
long = (
    joint_df.query("run_idx == 1")[cols]
    .explode(cols)
    .reset_index(drop=True)  # fresh RangeIndex so the concat below aligns row-wise
    .astype({"target": "int", "pred": "int"})
    .assign(is_correct=lambda frame: frame["pred"] == frame["target"])
)

# Flatten the per-row metadata records into columns and attach them side by side.
json_df = pd.json_normalize(long["annotator_metadata"])
long = pd.concat([long, json_df], axis=1)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Pooled binary-classification metrics over all rows of `long`.
# Only the first three return values are kept: the fourth (support) is unused
# here, and unpacking it into `_` would rebind IPython's last-output variable
# in the notebook namespace, after which IPython stops updating `_`/`__`/`___`.
precision, recall, f1 = precision_recall_fscore_support(
    long["target"], long["pred"], average="binary"
)[:3]

# Fraction of rows where the prediction equals the target.
avg_correct = long["is_correct"].mean()

print(f"{precision=:.2f}")
print(f"{recall=:.2f}")
print(f"{f1=:.2f}")
print(f"{avg_correct=:.2f}")

In [None]:
# Share of correct predictions within each "Gender" group.
long.groupby("Gender")["is_correct"].agg("mean")

In [None]:
from lewidi_lib import bootstrap_avg

# Correctness per "Nationality" group, aggregated with the project's
# bootstrap_avg helper.
# Author's observation on the output: India has lower correctness levels,
# almost chance level.
long.groupby("Nationality")["is_correct"].aggregate(bootstrap_avg)