In [None]:
import pandas as pd

# Names of the rating dimensions; used as column names throughout the notebook.
DIMS = [
    "correctness_topical",
    "coherence_logical",
    "coherence_stylistic",
    "coverage_broad",
    "coverage_deep",
    "consistency_internal",
    "quality_overall",
]

In [None]:
import krippendorff

def get_agreement(df):
    """Compute Krippendorff's alpha (ordinal) for every dimension in ``DIMS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format ratings with an ``item_id`` column, a ``worker_id`` column
        and one column per entry of ``DIMS``. Each (item_id, worker_id)
        combination must occur at most once, because the frame is pivoted on
        those two columns.

    Returns
    -------
    pandas.DataFrame
        One row per dimension with the columns ``dimension`` and ``alpha``
        (alpha rounded to three decimals).
    """
    def _alpha_for(dim):
        # Wide table: one row per item, one column per worker.
        wide = df.pivot(index="item_id", columns="worker_id", values=dim)
        # Cast to unicode and transpose so that rows are workers and columns are items.
        # Missing cells (NaN) become the string "nan" here; the krippendorff package is
        # expected to treat that as a missing rating -- verify against the installed version.
        labels = wide.values.astype("U").T
        score = krippendorff.alpha(
            reliability_data=labels,
            level_of_measurement="ordinal",
            value_domain=["a", "n", "b"],
        )
        return round(float(score), 3)

    return pd.DataFrame([{"dimension": dim, "alpha": _alpha_for(dim)} for dim in DIMS])

In [None]:
# Load the LLM ratings and bring every response pair into a canonical (sorted) order,
# so that ratings of (x, y) and (y, x) end up under the same key.
df_llm = (
    pd.read_json("../data/artifacts/llm_ratings.jsonl.gz", lines=True)
    # Per row, swap the positional labels "A"/"B" for the row's own response_a / response_b
    # values (presumably the rating columns hold "A"/"N"/"B" -- confirm against the data).
    .apply(lambda row: row.replace({"A": row["response_a"], "B": row["response_b"]}), axis=1)
    .assign(
        # Canonical order: the lexicographically smaller response id becomes response_a.
        response_a_new = lambda df: df.apply(lambda row: sorted([row["response_a"], row["response_b"]])[0], axis=1),
        response_b_new = lambda df: df.apply(lambda row: sorted([row["response_a"], row["response_b"]])[1], axis=1)
    )
    .drop(columns=["response_a", "response_b"])
    .rename(columns={"response_a_new": "response_a", "response_b_new": "response_b"})
    .set_index(["response_a", "response_b"])
    # Translate response ids back to positional labels relative to the canonical order
    # (row.name is the (response_a, response_b) index tuple).
    .apply(lambda row: row.replace({row.name[0]: "A", row.name[1]: "B"}), axis=1)
    .reset_index()
    .sort_values(["inference", "query_id", "response_a", "response_b"])
    # Number the repeated ratings inside each (inference, query, pair) group: 0, 1, 2, ...
    .assign(worker_id=lambda df: df.groupby(["inference", "query_id", "response_a", "response_b"]).cumcount())
    .groupby(["inference", "query_id", "response_a", "response_b"])
    # Keep at most the first two ratings per group.
    .head(2)
    # Lower-case the labels; note this replace is applied to every column of the frame.
    .replace({"A": "a", "N": "n", "B": "b"})
)

In [None]:
# Load the human ratings and build a string key for each response pair in both orders.
df_human = (
    pd.read_json("../data/artifacts/ratings.jsonl.gz", lines=True)
    .assign(
        pair_1=lambda df: df["response_a"] + "_" + df["response_b"],
        pair_2=lambda df: df["response_b"] + "_" + df["response_a"]
    )
)
# Keep only rows whose (a, b) key also occurs as a reversed (b, a) key somewhere in the data.
df_human = (
    df_human
    .loc[df_human["pair_1"].isin(df_human["pair_2"]), :]
)

# Canonicalise the pair order and keep at most two ratings per pair.
# NOTE(review): unlike the df_llm cell, the labels are neither mapped to response ids before
# the pair is sorted nor lower-cased afterwards -- confirm the "*_gold" columns already hold
# values that are valid for get_agreement's value_domain and need no flipping.
df_human = (
    df_human
    # Use the "<dimension>_gold" columns as the rating columns.
    .rename(columns={k+"_gold": k for k in DIMS})
    # Needed below as a sort / group key.
    .assign(inference="human")
    .assign(
        # Canonical order: the lexicographically smaller response id becomes response_a.
        response_a_new = lambda df: df.apply(lambda row: sorted([row["response_a"], row["response_b"]])[0], axis=1),
        response_b_new = lambda df: df.apply(lambda row: sorted([row["response_a"], row["response_b"]])[1], axis=1)
    )
    .drop(columns=["response_a", "response_b"])
    .rename(columns={"response_a_new": "response_a", "response_b_new": "response_b"})
    .set_index(["response_a", "response_b"])
    # Replace response ids by positional labels relative to the canonical order
    # (row.name is the (response_a, response_b) index tuple).
    .apply(lambda row: row.replace({row.name[0]: "A", row.name[1]: "B"}), axis=1)
    .reset_index()
    .sort_values(["inference", "query_id", "response_a", "response_b"])
    .assign(worker_id=lambda df: df.groupby(["inference", "query_id", "response_a", "response_b"]).cumcount())
    .groupby(["inference", "query_id", "response_a", "response_b"])
    # Keep at most the first two ratings per group.
    .head(2)
    # Re-assert the label: the row-wise replace above touches every column, so an
    # "inference" cell equal to a response id would have been rewritten.
    .assign(inference="human")
    .loc[:, ["response_a", "response_b", "inference"] + DIMS]
)

In [None]:
# Agreement among the repeated LLM ratings produced with the "combined" inference mode.
get_agreement(
    df_llm
    .query("inference == 'combined'")
    # One item per canonical response pair. (The former `axis=1` keyword was dropped:
    # DataFrame.assign treats every keyword as a new column, so it only created a
    # throw-away column named "axis".)
    .assign(item_id=lambda df: df["response_a"] + "_" + df["response_b"])
    # Raters are the repeated ratings of the same item, numbered in row order.
    .assign(worker_id=lambda df: df.groupby("item_id").cumcount())
    .loc[:, ["item_id", "worker_id"] + DIMS]
).round(2)

In [None]:
# Agreement among the repeated LLM ratings produced with the "individual" inference mode.
get_agreement(
    df_llm
    .query("inference == 'individual'")
    # One item per canonical response pair. (The former `axis=1` keyword was dropped:
    # DataFrame.assign treats every keyword as a new column, so it only created a
    # throw-away column named "axis".)
    .assign(item_id=lambda df: df["response_a"] + "_" + df["response_b"])
    # Raters are the repeated ratings of the same item, numbered in row order.
    .assign(worker_id=lambda df: df.groupby("item_id").cumcount())
    .loc[:, ["item_id", "worker_id"] + DIMS]
).round(2)

In [None]:
# Agreement over the pooled "combined" LLM ratings and the human ratings.
get_agreement(
    pd.concat([
        df_llm.query("inference == 'combined'"),
        df_human
    ])
    # One item per canonical response pair. (The former `axis=1` keyword was dropped:
    # DataFrame.assign treats every keyword as a new column, so it only created a
    # throw-away column named "axis".)
    .assign(item_id=lambda df: df["response_a"] + "_" + df["response_b"])
    # Raters are numbered per item in concat order: LLM rows first, then human rows.
    .assign(worker_id=lambda df: df.groupby("item_id").cumcount())
    .loc[:, ["item_id", "worker_id"] + DIMS]
).round(2)

In [None]:
# Agreement over the pooled "individual" LLM ratings and the human ratings.
get_agreement(
    pd.concat([
        df_llm.query("inference == 'individual'"),
        df_human
    ])
    # One item per canonical response pair. (The former `axis=1` keyword was dropped:
    # DataFrame.assign treats every keyword as a new column, so it only created a
    # throw-away column named "axis".)
    .assign(item_id=lambda df: df["response_a"] + "_" + df["response_b"])
    # Raters are numbered per item in concat order: LLM rows first, then human rows.
    .assign(worker_id=lambda df: df.groupby("item_id").cumcount())
    .loc[:, ["item_id", "worker_id"] + DIMS]
).round(2)

In [None]:
# Display the prepared LLM ratings frame for inspection.
df_llm

In [None]:
# Agreement between the "individual" and the "combined" LLM inference modes.
get_agreement(
    pd.concat([
        df_llm.query("inference == 'individual'"),
        df_llm.query("inference == 'combined'")
    ])
    # The item key includes the existing worker_id, so the i-th rating of a pair in one
    # mode is matched with the i-th rating of the same pair in the other mode.
    # (The former `axis=1` keyword was dropped: DataFrame.assign treats every keyword as
    # a new column, so it only created a throw-away column named "axis".)
    .assign(item_id=lambda df: df["response_a"] + "_" + df["response_b"] + df["worker_id"].astype(str))
    # Renumber raters per item in concat order: "individual" rows first, then "combined".
    .assign(worker_id=lambda df: df.groupby("item_id").cumcount())
    .loc[:, ["item_id", "worker_id"] + DIMS]
).round(2)

In [None]:
# Spearman rank correlation between the rating dimensions for the "combined" LLM ratings,
# summarised per dimension as the minimum and mean of its row in the correlation matrix.
# NOTE(review): each row includes the dimension's correlation with itself, which therefore
# contributes to the mean -- confirm this is intended.
(
    df_llm[df_llm["inference"] == "combined"][DIMS]
    # Encode the labels numerically: a -> -1, n -> 0, b -> 1.
    .replace({"a": -1, "n": 0, "b": 1})
    .corr(method="spearman")
    .agg(["min", "mean"], axis=1)
    .round(2)
)

In [None]:
# Spearman rank correlation between the rating dimensions for the "individual" LLM ratings,
# summarised per dimension as the minimum and mean of its row in the correlation matrix.
# NOTE(review): each row includes the dimension's correlation with itself, which therefore
# contributes to the mean -- confirm this is intended.
(
    df_llm[df_llm["inference"] == "individual"][DIMS]
    # Encode the labels numerically: a -> -1, n -> 0, b -> 1.
    .replace({"a": -1, "n": 0, "b": 1})
    .corr(method="spearman")
    .agg(["min", "mean"], axis=1)
    .round(2)
)

In [None]:
# Display the prepared LLM ratings frame for inspection.
df_llm