In [None]:
from typing import List
import krippendorff
import pandas as pd
from hashlib import sha256

def invert_votes(votes: List[str], invert: bool):
    """Swap the "A" and "B" labels in a list of votes on request.

    Args:
        votes: Vote labels. When ``invert`` is truthy every label must be
            one of "A", "B" or "N".
        invert: If truthy, return a new list in which "A" and "B" are
            exchanged and "N" is left as is.

    Returns:
        The relabelled list when ``invert`` is truthy; otherwise ``votes``
        itself (the same object, not a copy).

    Raises:
        KeyError: If ``invert`` is truthy and a label is not "A", "B" or "N".
    """
    if not invert:
        return votes

    flipped = {"A": "B", "B": "A", "N": "N"}
    return [flipped[label] for label in votes]
    
def get_agreement(
    df,
    item_column_name: str = "item",
    worker_column_name: str = "worker",
    value_column_name: str = "value",
    value_domain: List[str] = None
):
    """Compute ordinal Krippendorff's alpha for long-format ratings.

    The frame is pivoted into a matrix with one row per worker and one
    column per item, cast to unicode strings, and passed to
    ``krippendorff.alpha`` with ``level_of_measurement="ordinal"``.

    Args:
        df: Long-format frame with at most one row per (worker, item)
            combination; ``DataFrame.pivot`` raises on duplicates.
        item_column_name: Column identifying the rated item.
        worker_column_name: Column identifying the rater.
        value_column_name: Column holding the rating.
        value_domain: Ordered list of admissible ratings. Defaults to
            ``["A", "N", "B"]`` when ``None``.

    Returns:
        The alpha coefficient as a float rounded to three decimals.
    """
    domain = ["A", "N", "B"] if value_domain is None else value_domain

    # One row per worker, one column per item.
    worker_by_item = df.pivot(
        index=worker_column_name,
        columns=item_column_name,
        values=value_column_name,
    )
    # The unicode cast turns missing worker/item cells into the string "nan".
    # NOTE(review): presumably krippendorff treats "nan" strings as missing
    # ratings -- confirm against the library documentation.
    reliability = worker_by_item.values.astype("U")

    alpha = krippendorff.alpha(
        reliability_data=reliability,
        level_of_measurement="ordinal",
        value_domain=domain,
    )
    return round(float(alpha), 3)

In [None]:
# Rating dimensions to analyse; the loops below read the "<name>_vote"
# (and "<name>_spam_probability") columns for each entry.
cols = [
    "correctness_topical",
    "coherence_logical",
    "coherence_stylistic",
    "coverage_broad",
    "coverage_deep",
    "consistency_internal",
    "quality_overall",
]

In [None]:
# Position-bias check: for every rating dimension, compare Krippendorff's alpha
# between the two presentation orders of each response pair, once on all votes
# ("uncorrected") and once after dropping likely-spam votes ("corrected").
RATINGS_PATH = "../data/artifacts/ratings.jsonl.gz"
# Votes whose spam probability is >= this value are removed in the corrected variant.
SPAM_PROBABILITY_THRESHOLD = 0.7


def _orient_and_explode(frame, col, spam_threshold=None):
    """Turn per-row vote lists into one order-normalised row per (pair, worker).

    Rows with ``response_a > response_b`` get their votes flipped via
    ``invert_votes`` and are labelled group "X"; all other rows keep their
    votes and are labelled group "Y". The list columns are then exploded and
    only the first row per (pair, worker) is kept.

    Args:
        frame: Ratings with "pair", "response_a", "response_b", "worker",
            f"{col}_vote" and f"{col}_spam_probability" columns.
        col: Name of the rating dimension.
        spam_threshold: When given, the spam-probability column is exploded
            as well, rows with a probability >= the threshold are dropped,
            and the column is removed. When ``None`` the column is left
            untouched (still holding its per-row lists).

    Returns:
        A long-format frame with "pair", "worker", "vote" and "group" columns.
    """
    vote_col = f"{col}_vote"
    spam_col = f"{col}_spam_probability"

    # Computed once and reused for both the vote flip and the group label.
    swapped = frame["response_a"] > frame["response_b"]
    votes = pd.Series(
        [invert_votes(row_votes, flip) for row_votes, flip in zip(frame[vote_col], swapped)],
        index=frame.index,
    )
    oriented = (
        frame
        .assign(vote=votes, group=swapped.replace({True: "X", False: "Y"}))
        .drop(columns=[vote_col])
    )

    if spam_threshold is None:
        exploded = oriented.explode(["worker", "vote"])
    else:
        exploded = (
            oriented
            .explode(["worker", "vote", spam_col])
            .query(f"{spam_col} < {spam_threshold}")
            .drop(columns=spam_col)
        )

    return (
        exploded
        .drop(columns=["response_a", "response_b"])
        .drop_duplicates(subset=["pair", "worker"], keep="first")
    )


def _alphas_by_order(votes_long):
    """Return ``(alpha_x, alpha_y, alpha_both)`` for a frame from ``_orient_and_explode``."""
    columns = dict(item_column_name="pair", worker_column_name="worker", value_column_name="vote")
    return (
        get_agreement(votes_long[votes_long["group"] == "X"], **columns),
        get_agreement(votes_long[votes_long["group"] == "Y"], **columns),
        get_agreement(votes_long, **columns),
    )


def _max_delta(alpha_both, alpha_x, alpha_y):
    """Largest absolute gap (rounded to 2 decimals) between the pooled alpha and a per-order alpha."""
    return max(round(abs(alpha_both - alpha_x), 2), round(abs(alpha_both - alpha_y), 2))


# Loading, pair hashing and the both-orders filter do not depend on the rating
# dimension, so they are done once instead of once per loop iteration.
ratings = (
    pd.read_json(RATINGS_PATH, lines=True)
    # Order-independent pair id: hash of the sorted (response_a, response_b) values.
    .assign(pair=lambda frame: frame.loc[:, ["response_a", "response_b"]].apply(sorted, axis=1).apply(lambda cell: sha256(cell.__str__().encode("utf-8")).hexdigest()))
)
# Keep only pairs that occur in exactly two rows.
rows_per_pair = ratings.groupby("pair")["response_a"].count()
ratings_both_orders = ratings[ratings["pair"].isin(rows_per_pair[rows_per_pair == 2].index)]

data = []

for col in cols:
    df = ratings_both_orders.loc[:, ["pair", "response_a", "response_b", f"{col}_vote", f"{col}_spam_probability", "worker"]]

    df_uncorrected = _orient_and_explode(df, col)
    df_corrected = _orient_and_explode(df, col, spam_threshold=SPAM_PROBABILITY_THRESHOLD)

    alpha_x, alpha_y, alpha_both = _alphas_by_order(df_uncorrected)
    alpha_x_corrected, alpha_y_corrected, alpha_both_corrected = _alphas_by_order(df_corrected)

    data.append({
        "dimension": col,
        "alpha_x": alpha_x,
        "alpha_y": alpha_y,
        "alpha_both": alpha_both,
        "delta": _max_delta(alpha_both, alpha_x, alpha_y),
        "alpha_x_corrected": alpha_x_corrected,
        "alpha_y_corrected": alpha_y_corrected,
        "alpha_both_corrected": alpha_both_corrected,
        "delta_corrected": _max_delta(alpha_both_corrected, alpha_x_corrected, alpha_y_corrected),
    })
    

In [None]:
# One row per rating dimension with the collected agreement statistics,
# rounded to two decimals for display.
agreement_table = pd.DataFrame(data).set_index("dimension")
agreement_table.round(2)

In [None]:
# Mean of every agreement statistic across the rating dimensions,
# rounded to two decimals for display.
dimension_means = pd.DataFrame(data).set_index("dimension").mean(axis=0)
dimension_means.round(2)

In [None]:
# Number of distinct response pairs in `df`, the frame left over from the
# last iteration of the loop above.
pair_ids = df["pair"]
pair_ids.nunique()

In [None]:
# Compare agreement between items with a low item_index and items with a high
# item_index, per rating dimension.
RATINGS_PATH = "../data/artifacts/ratings.jsonl.gz"
STUDY2_RESPONSES_PATH = "../data/raw/study2_responses.jsonl.gz"
# Rows with item_index <= this value feed alpha_first; rows above it feed alpha_last.
FIRST_PART_MAX_ITEM_INDEX = 7

# Neither input depends on the rating dimension, so both files are read (and
# the pairs hashed) once instead of once per loop iteration.
ratings_ordered_pairs = (
    pd.read_json(RATINGS_PATH, lines=True)
    # Order-sensitive pair id: hash of [response_a, response_b] as given (not sorted).
    .assign(pair=lambda frame: frame.loc[:, ["response_a", "response_b"]].apply(list, axis=1).apply(lambda cell: sha256(cell.__str__().encode("utf-8")).hexdigest()))
)
item_positions = (
    pd.read_json(STUDY2_RESPONSES_PATH, lines=True)
    .loc[:, ["worker", "response_a", "response_b", "item_index"]]
)

data = []

for col in cols:
    df = (
        ratings_ordered_pairs
        .loc[:, ["pair", "response_a", "response_b", f"{col}_vote", "worker"]]
        .explode([f"{col}_vote", "worker"])
        # Attach item_index to every individual vote; votes without a match
        # get NaN and therefore fall into neither subset below.
        .merge(item_positions, on=["worker", "response_a", "response_b"], how="left")
        .rename(columns={f"{col}_vote": "vote"})
        .drop_duplicates(subset=["pair", "worker"], keep="first")
    )
    alpha_first = get_agreement(df[df.item_index <= FIRST_PART_MAX_ITEM_INDEX], item_column_name="pair", worker_column_name="worker", value_column_name="vote")
    alpha_last = get_agreement(df[df.item_index > FIRST_PART_MAX_ITEM_INDEX], item_column_name="pair", worker_column_name="worker", value_column_name="vote")
    data.append({"dimension": col, "alpha_first": alpha_first, "alpha_last": alpha_last, "delta": abs(alpha_first - alpha_last)})

In [None]:
# One row per rating dimension with alpha_first, alpha_last and their
# absolute difference, rounded to two decimals for display.
first_vs_last_table = pd.DataFrame(data).set_index("dimension")
first_vs_last_table.round(2)