In [None]:
import pandas as pd
import pyterrier as pt
dataset = pt.get_dataset('irds:cord19/fulltext/trec-covid')
from collections import Counter
import krippendorff
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def maj_vote(nums):
    """Return the most frequent value in ``nums`` (majority vote).

    Ties are resolved in favour of the largest value: the values are counted
    in descending order, and ``Counter.most_common`` keeps first-seen order
    among equal counts.

    Returns ``None`` when ``nums`` is empty.
    """
    if nums:
        descending = sorted(nums, reverse=True)
        tally = Counter(descending)
        # most_common(1) yields a single (value, count) pair.
        (winner, _votes), = tally.most_common(1)
        return winner
    return None

def avg_vote(nums):
    """Return the arithmetic mean of ``nums`` rounded to two decimals.

    An empty list yields ``0``.
    """
    return round(sum(nums) / len(nums), 2) if nums else 0

def extract_labels(row, only_human=True):
    """Collect the label values stored in one row of the merged ratings table.

    Parameters
    ----------
    row : pandas.Series
        A single row whose index holds the column names.
    only_human : bool, default True
        If True, keep only entries whose name contains ``'label_rater'``.
        If False, keep every entry whose name contains ``'label'``.

    Returns
    -------
    list
        The selected values, in the order they appear in ``row``.
    """
    # No printing here: this helper is applied to every row, so any debug
    # output would be repeated once per row and drown the notebook output.
    name_fragment = 'label_rater' if only_human else 'label'
    selected = row[row.index.str.contains(name_fragment)]
    return list(selected)

def calc_cohens_kappa(df, names, vis=True):
    """Compute quadratic-weighted Cohen's kappa between two label columns.

    Parameters
    ----------
    df : DataFrame holding both label columns.
    names : sequence whose first two entries are the column names to compare.
    vis : if True, also draw the confusion matrix (labels 0, 1, 2) as a heatmap.

    Returns
    -------
    The weighted kappa score.
    """
    first_name, second_name = names[0], names[1]
    first_labels = df[first_name]
    second_labels = df[second_name]

    # Quadratic weights penalise large disagreements more than adjacent ones.
    kappa = cohen_kappa_score(first_labels, second_labels, weights='quadratic')

    if not vis:
        return kappa

    grades = [0, 1, 2]
    conf_matrix = confusion_matrix(first_labels, second_labels, labels=grades)

    # Rows follow the first rater, columns follow the second rater.
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=grades,
        yticklabels=grades,
    )
    plt.xlabel(f'{names[1]} Labels')
    plt.ylabel(f'{names[0]} Labels')
    plt.title(f'Confusion Matrix between {names[0]} and {names[1]}')
    plt.show()

    return kappa

In [None]:
# Root folder of all rating files; change this one constant to relocate the data.
DATA_DIR = "/workspaces/CORD19_Plus/data"

# Human raters 1-5: one JSON file per rater.
ratings_r1, ratings_r2, ratings_r3, ratings_r4, ratings_r5 = (
    pd.read_json(f"{DATA_DIR}/ratings_r{rater_id}.json") for rater_id in range(1, 6)
)

rating_gpt5 = pd.read_json(f"{DATA_DIR}/labeling/table_pool_qrels5.json")

# Rename BEFORE touching 'label_surrogate', so the cell works whether the file
# stores the column as 'label' or already as 'label_surrogate'.
rating_surrogate = pd.read_json(f"{DATA_DIR}/ratings_surrogate.json").rename(
    columns={'label': 'label_surrogate'}
)

# Set missing values (encoded as -1) to 0.
rating_surrogate['label_surrogate'] = rating_surrogate['label_surrogate'].replace(-1, 0)

# Give every rater its own label / parsing column names so the frames can be
# merged side by side on (qid, docno).
df1_renamed, df2_renamed, df3_renamed, df4_renamed, df5_renamed = (
    frame.rename(columns={'label': f'label_rater_{rater_id}', 'parsing': f'pars_{rater_id}'})
    for rater_id, frame in enumerate(
        (ratings_r1, ratings_r2, ratings_r3, ratings_r4, ratings_r5), start=1
    )
)

# Independent copy, so later edits to df6_renamed never leak into rating_surrogate.
df6_renamed = rating_surrogate.copy()

df7_renamed = rating_gpt5.rename(columns={'label': 'label_gpt5'})

dfs = [df1_renamed, df2_renamed, df3_renamed, df4_renamed, df5_renamed, df6_renamed, df7_renamed]


In [None]:
# Also include a surrogate rater whose judgments originate from the TREC-COVID dataset.
#label_map = {}
#surrogate_ratings = dataset.get_qrels()
#for _, row in surrogate_ratings.iterrows():
#    label_map[(row['qid'], row['docno'])] = row['label']

#dummy_df['label_surrogate'] = dummy_df.apply(lambda row: label_map[(str(row['qid']), row['docno'].split("_")[0])] if (str(row['qid']), row['docno'].split("_")[0]) in label_map.keys() else -1, axis=1)


In [None]:
# Inner-join all rating tables on the (query, document) key, one table at a
# time, so only pairs judged by every rater survive.
merge_keys = ['qid', 'docno']

merged_df = df1_renamed
for df in (df2_renamed, *dfs[2:]):
    merged_df = merged_df.merge(df, on=merge_keys, how='inner')

In [None]:
# Aggregate the human rater labels of every row into a majority-vote column
# and an average-vote column.
for vote_column, vote_fn in (('maj_vote', maj_vote), ('avg_vote', avg_vote)):
    merged_df[vote_column] = merged_df.apply(
        lambda row, vote_fn=vote_fn: vote_fn(extract_labels(row)),
        axis=1,
    )


In [None]:
# Show the merged ratings table (including the vote columns) for inspection.
merged_df

In [None]:
# Select the human rater label columns.
rater_columns = list(filter(lambda col: col.startswith('label_rater'), merged_df.columns))
ratings = merged_df.loc[:, rater_columns]

# krippendorff expects raters as rows and items as columns, hence the transpose.
reliability_data = ratings.T.to_numpy()

# Ordinal level of measurement: the labels are treated as ordered grades.
alpha = krippendorff.alpha(
    reliability_data,
    level_of_measurement='ordinal',
)

print(f"Krippendorff's Alpha (ordinal): {alpha:.4f}")

In [None]:
# Columns to compare: the majority vote first, then every column whose name
# contains 'label'.
all_names = ['maj_vote', *(name for name in merged_df.columns if 'label' in name)]


In [None]:
# Every unordered pair of names, keeping the order in which they appear in all_names.
possible_pairs = [
    (first, second)
    for position, first in enumerate(all_names)
    for second in all_names[position + 1:]
]

In [None]:
# Report agreement only for pairs whose first member is the majority vote.
for pair in possible_pairs:
    if "maj" not in pair[0]:
        continue
    print(f"Cohen's Weighted Kappa for {str(pair):<45} : {calc_cohens_kappa(merged_df, pair, vis=False):>10.4f}")

In [None]:
# Columns whose name starts with 'pars' (one per human rater after renaming).
parsing_cols = list(filter(lambda col: col.startswith('pars'), merged_df.columns))

In [None]:
# Mean over the per-column means of the parsing columns.
merged_df[parsing_cols].mean().mean()

In [None]:
# Standard deviation across the per-column means of the parsing columns.
merged_df[parsing_cols].mean().std()

In [None]:
# Column-name prefixes for the human raters and the GPT labels.
rater_prefix, gpt_prefix = 'label_rater', 'label_gpt'

# Human rater label columns, in the order they appear in merged_df.
labeler_cols = list(filter(lambda col: col.startswith(rater_prefix), merged_df.columns))

print("Labeler Columns:", labeler_cols)

In [None]:
# Sum each rater's labels per query, then turn 'qid' back into a regular column.
label_sums_by_qid = merged_df.groupby('qid')[labeler_cols].sum()
aggregated_df = label_sums_by_qid.reset_index()

print(aggregated_df)

In [None]:
# Long format: one row per (qid, labeler) holding that labeler's label sum.
melted_aggregated_df = pd.melt(
    aggregated_df,
    id_vars='qid',
    var_name='labeler',
    value_name='sum_labels',
)

print(melted_aggregated_df)

In [None]:
# Back to wide format: one row per qid, one column per labeler; gaps become 0.
sums_wide = melted_aggregated_df.pivot(index='qid', columns='labeler', values='sum_labels')
pivot_df = sums_wide.fillna(0).reset_index()

print(pivot_df)