In [None]:
from typing import List
import pandas as pd
import krippendorff

def has_majority_vote(data: List[str], min_votes: int = 3) -> bool:
    """Tell whether the most frequent value in ``data`` occurs often enough.

    Args:
        data: The individual votes cast for one question.
        min_votes: Minimum number of occurrences the most frequent value needs
            in order to count as a majority. The default of 3 is equivalent to
            the former hard-coded ``count > 2`` check.

    Returns:
        True if some value occurs at least ``min_votes`` times, otherwise False.
        Empty input (or input that only holds missing values, which
        ``value_counts`` drops) yields False instead of raising ``IndexError``.
    """
    # dtype="object" keeps an empty list from triggering pandas' dtype-inference warning.
    counts = pd.Series(data, dtype="object").value_counts()
    if counts.empty:
        return False
    # value_counts() sorts by frequency in descending order, so the first
    # entry is the count of the most common vote.
    return bool(counts.iloc[0] >= min_votes)

def get_agreement(
    df,
    item_column_name: str = "item",
    worker_column_name: str = "worker",
    value_column_name: str = "value",
    value_domain: List[str] = None
):
    """Compute Krippendorff's alpha (ordinal) for long-format rating data.

    Args:
        df: Frame with one row per (worker, item) rating. Each (worker, item)
            pair must be unique, otherwise ``DataFrame.pivot`` raises.
        item_column_name: Column identifying the rated item.
        worker_column_name: Column identifying the rater.
        value_column_name: Column holding the rating label.
        value_domain: Rating labels handed to ``krippendorff.alpha`` as the
            value domain. Defaults to ``["A", "N", "B"]``.
            NOTE(review): the list order presumably defines the ordinal
            ranking of the labels; confirm against the krippendorff docs.

    Returns:
        The alpha coefficient as a float rounded to three decimals.
    """
    domain = ["A", "N", "B"] if value_domain is None else value_domain

    # Reshape to a raters x items matrix (rows: workers, columns: items).
    wide = df.pivot(
        index=worker_column_name,
        columns=item_column_name,
        values=value_column_name,
    )
    # Casting to unicode turns the NaN cells of unrated (worker, item) pairs
    # into the string "nan".
    # NOTE(review): presumably krippendorff treats "nan" as missing since it
    # is not part of the value domain; confirm.
    reliability_matrix = wide.values.astype("U")

    coefficient = krippendorff.alpha(
        reliability_data=reliability_matrix,
        level_of_measurement="ordinal",
        value_domain=domain,
    )
    return round(float(coefficient), 3)

In [None]:
# Rating dimensions for which agreement is computed, one alpha per entry.
cols = [
    "correctness_topical",
    "coherence_logical",
    "coherence_stylistic",
    "coverage_broad",
    "coverage_deep",
    "consistency_internal",
    "quality_overall",
]

In [None]:
# Expert annotations, restricted to three annotators. The "validity" column is
# renamed to "correctness_topical" so it can be addressed through `cols`.
# NOTE(review): this file carries a .jsonl extension but is read without
# lines=True, unlike the ratings file below; confirm that matches its layout.
df_expert = (
    pd.read_json("../data/raw/expert-response.jsonl.gz")
    .query("prolific_pid in ['annot1', 'annot2', 'annot4']")
    .rename(columns={"validity": "correctness_topical"})
)

# The crowd ratings do not depend on the dimension, so they are read once here
# instead of several times per loop iteration. Every derived frame below is
# built with non-mutating pandas operations, so sharing this frame is safe.
crowd_ratings = pd.read_json("../data/artifacts/ratings.jsonl.gz", lines=True)

# Votes whose spam probability exceeds this value are dropped in the
# "corrected" variants.
MAX_SPAM_PROBABILITY = 0.7


def _decidable_mask(ratings):
    """Flag rows in which more than 4 vote columns have a majority vote."""
    vote_columns = [name for name in ratings.columns if "vote" in name]
    return (
        ratings.loc[:, vote_columns]
        # Per row: does each question's vote list have a majority?
        .apply(lambda row: row.apply(has_majority_vote), axis=1)
        # Per row: do more than 4 questions have a majority?
        .apply(lambda row: row.sum() > 4, axis=1)
    )


def _crowd_agreement(ratings, dimension, max_spam_probability=None):
    """Krippendorff's alpha over the crowd votes of one dimension.

    Args:
        ratings: Frame with the columns query_id, response_a, response_b,
            worker, ``<dimension>_vote`` and ``<dimension>_spam_probability``.
            NOTE(review): vote, spam probability and worker are exploded
            together, so they are presumably parallel lists; confirm.
        dimension: Name of the rating dimension (an entry of ``cols``).
        max_spam_probability: If given, only votes with a spam probability
            less than or equal to this value are kept.

    Returns:
        The alpha value returned by ``get_agreement``.
    """
    vote_column = f"{dimension}_vote"
    spam_column = f"{dimension}_spam_probability"
    votes = (
        ratings
        .loc[:, ["query_id", "response_a", "response_b", vote_column, spam_column, "worker"]]
        # One row per individual vote.
        .explode([vote_column, spam_column, "worker"])
    )
    if max_spam_probability is not None:
        votes = votes.query(f"{spam_column} <= {max_spam_probability}")
    return get_agreement(
        votes
        .drop_duplicates()
        .assign(
            item=lambda df: df["response_a"] + df["response_b"],
            value=lambda df: df[vote_column],
        )
    )


# Ratings with mirrored (a, b) / (b, a) pairs collapsed to their first occurrence.
# The order-independent key is a tuple rather than the list `sorted` returns,
# because drop_duplicates needs hashable values.
crowd_ratings_unique_pairs = (
    crowd_ratings
    .assign(
        ab=lambda df: df.loc[:, ["response_a", "response_b"]].apply(
            lambda pair: tuple(sorted(pair)), axis=1
        )
    )
    .drop_duplicates(subset="ab")
    .drop(columns="ab")
)

# Ratings restricted to "decidable" rows (see _decidable_mask).
crowd_ratings_easy = (
    crowd_ratings
    .assign(decidable=_decidable_mask)
    .query("decidable == True")
)

agreement_data = []
for col in cols:
    # Agreement among the expert annotators.
    alpha_expert_hard = get_agreement(
        df_expert.assign(
            item=lambda df: df["response_a"] + df["response_b"],
            worker=lambda df: df["prolific_pid"],
            value=lambda df: df[col],
        )
    )

    # Crowd agreement on the rows that match the expert frame; the merge joins
    # on the shared query_id / response_a / response_b columns.
    alpha_crowd_hard = get_agreement(
        df_expert.loc[:, ["query_id", "response_a", "response_b"]]
        .merge(
            crowd_ratings
            .loc[:, ["query_id", "response_a", "response_b", f"{col}_vote", "worker"]]
        )
        .explode([f"{col}_vote", "worker"])
        # The expert frame holds one row per annotator, so the merge repeats
        # crowd rows; drop the exact repeats.
        .drop_duplicates()
        .assign(
            item=lambda df: df["response_a"] + df["response_b"],
            value=lambda df: df[f"{col}_vote"],
        )
    )

    # NOTE(review): only this variant drops mirrored pairs before computing
    # alpha; the other crowd variants use every row. Kept as in the original
    # analysis; confirm the asymmetry is intended.
    alpha_crowd_corrected = _crowd_agreement(
        crowd_ratings_unique_pairs, col, max_spam_probability=MAX_SPAM_PROBABILITY
    )
    alpha_crowd_uncorrected = _crowd_agreement(crowd_ratings, col)
    alpha_crowd_easy_uncorrected = _crowd_agreement(crowd_ratings_easy, col)
    alpha_crowd_easy_corrected = _crowd_agreement(
        crowd_ratings_easy, col, max_spam_probability=MAX_SPAM_PROBABILITY
    )

    agreement_data.append({
        "dimension": col,
        "expert_hard": alpha_expert_hard,
        "crowd_hard": alpha_crowd_hard,
        "crowd_complete_uncorrected": alpha_crowd_uncorrected,
        "crowd_complete_corrected": alpha_crowd_corrected,
        "crowd_easy_uncorrected": alpha_crowd_easy_uncorrected,
        "crowd_easy_corrected": alpha_crowd_easy_corrected,
    })

# Emit the table as LaTeX source.
print(pd.DataFrame(agreement_data).round(2).to_latex())

In [None]:
# Agreement values per dimension, rounded to two decimals for display.
agreement_table = pd.DataFrame(agreement_data)
agreement_table.round(2)

In [None]:
# Column-wise mean of the agreement values across all dimensions.
mean_agreement = (
    pd.DataFrame(agreement_data)
    .set_index("dimension")
    .mean(axis=0)
)
mean_agreement.round(2)

In [None]:
# Total number of rows in the crowd ratings file.
all_crowd_ratings = pd.read_json("../data/artifacts/ratings.jsonl.gz", lines=True)
len(all_crowd_ratings)

In [None]:
# Number of rating rows that are not "decidable", i.e. rows in which at most
# 4 of the vote columns have a majority vote.
ratings_for_decidability = pd.read_json("../data/artifacts/ratings.jsonl.gz", lines=True)

# Restrict to the per-question vote columns.
question_vote_columns = [
    name for name in ratings_for_decidability.columns if "vote" in name
]
# Per row and question: is there a majority vote?
majority_per_question = (
    ratings_for_decidability
    .loc[:, question_vote_columns]
    .apply(lambda row: row.apply(has_majority_vote), axis=1)
)
# Per row: do more than 4 questions have a majority vote?
decidable_flags = majority_per_question.apply(lambda row: row.sum() > 4, axis=1)

undecidable_ratings = (
    ratings_for_decidability
    .assign(decidable=decidable_flags)
    .query("decidable == False")
)
len(undecidable_ratings)

In [None]:
# Number of distinct (response_a, response_b) combinations among the rows of
# the three selected expert annotators.
expert_rows = (
    pd.read_json("../data/raw/expert-response.jsonl.gz")
    .query("prolific_pid in ['annot1', 'annot2', 'annot4']")
)
expert_pair_counts = expert_rows.groupby(["response_a", "response_b"]).count()
len(expert_pair_counts)