In [1]:
import pandas as pd
import numpy as np
import ast
import krippendorff
from scipy.stats import mode
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
from statsmodels.stats.proportion import test_proportions_2indep
from sklearn.metrics import cohen_kappa_score

In [2]:
# Read in dataset key
df = pd.read_csv("output/human_eval/cochrane_full.csv")
df['order'] = df['order'].apply(ast.literal_eval)

order_dict = {row['id']: row['order'] for i, row in df.iterrows()}

In [3]:
# Specify human eval files
files = [
    # "output/human_eval/Full Simplification Human Evaluation - Lj - Cochrane - Arman Criteria.csv",
    # "output/human_eval/Full Simplification Human Evaluation - Kejian - Cochrane - Arman Criteria.csv",
    # "output/human_eval/Full Simplification H
    # uman Evaluation - Heyuan - Cochrane - Arman Criteria.csv",
    "output/human_eval/Full Simplification Human Evaluation - Lj - Cochrane.csv",
    "output/human_eval/Full Simplification Human Evaluation - Kejian - Cochrane.csv",
    "output/human_eval/Full Simplification Human Evaluation - Heyuan - Cochrane.csv",
]

In [4]:
def reorder(df_scores_, order_dict_):
    df_scores = df_scores_.copy()
    order_dict = order_dict_.copy()

    for i, row in df_scores.loc[1:].iterrows():
        report_id = row[0]
        curr_order = order_dict[int(report_id)]
        right_order = np.argsort(curr_order)
        
        df_scores.iloc[i, 1:5] = np.array(row[1:5])[right_order]
        df_scores.iloc[i, 5:9] = np.array(row[5:9])[right_order]
    
    return df_scores

In [5]:
df_scores_list = []

for f in files:
    df_human = pd.read_csv(f) # Read
    df_human.iloc[1:,1:] = df_human.iloc[1:,1:].applymap(int) # Cast as int
    df_human = reorder(df_human, order_dict) # Reorder by model
    df_scores_list.append(df_human)

In [6]:
# Stack all the scores across evaluators
scores = np.stack([x.values[1:,1:] for x in df_scores_list], axis=2)

# Get the majority vote
scores = mode(scores, axis=2).mode.squeeze(2)

  scores = mode(scores, axis=2).mode.squeeze(2)
  scores = mode(scores, axis=2).mode.squeeze(2)


In [7]:
# from collections import Counter

# df_stacked = np.vstack([x.values[1:,1:] for x in df_scores_list])
# for i in range(8):
#     x = Counter(df_stacked[:, i])
#     print("/".join([str(round(x[k]/90,2)) for k in sorted(x.keys(), reverse=True)]))

# for i in range(8):
#     x = Counter(scores[:, i])
#     print("/".join([str(round(x[k]/30, 2)) for k in sorted(x.keys(), reverse=True)]))

In [10]:
# Krippendorff's Alpha
for col_idx in range(8):
    krippen = krippendorff.alpha(
        [list(df_scores_list[rater_idx].values[1:,col_idx+1].astype(int)) for \
         rater_idx in range(3)]
         )
    print(col_idx, krippen)

0 0.1998001998001998
1 -0.10559006211180133
2 0.015585721468074465
3 0.24533634821933292
4 0.05066666666666664
5 -0.015376984126984183
6 0.023319615912208436
7 0.40622683469236476


In [None]:
# Average pairwise Cohen's kappa
for col_idx in range(8):
    kappa_lst = []
    for rater_idx_1 in range(2):
        for rater_idx_2 in range(rater_idx_1+1, 3):
            pair_kappa = cohen_kappa_score(
                y1=df_scores_list[rater_idx_1].values[1:,col_idx+1].astype(int),
                y2=df_scores_list[rater_idx_2].values[1:,col_idx+1].astype(int),
            )
            kappa_lst.append(pair_kappa)
    print(col_idx, np.mean(kappa_lst))

In [22]:
# Get the percentage stats
scores.mean(axis=0)

array([1.7, 0.6333333333333333, 1.3666666666666667, 1.0333333333333334,
       1.9666666666666666, 1.5666666666666667, 0.7666666666666667,
       1.0333333333333334], dtype=object)

In [23]:
scores_read = scores.sum(axis=0)[:4]
scores_fact = scores.sum(axis=0)[4:]

In [24]:
# Hypothesis testing
for i in [0,1]:
    for j in [2,3]:
        result = test_proportions_2indep(
            count1 = scores_read[i], 
            nobs1  = 30, 
            count2 = scores_read[j], 
            nobs2  = 30, 
            compare='diff', 
            alternative='two-sided')
        print(i, j, scores_read[i], scores_read[j], result.pvalue)

for i in [0,1]:
    for j in [2,3]:
        result = test_proportions_2indep(
            count1 = scores_fact[i], 
            nobs1  = 30, 
            count2 = scores_fact[j], 
            nobs2  = 30, 
            compare='diff', 
            alternative='two-sided')
        print(i, j, scores_fact[i], scores_fact[j], result.pvalue)

0 2 51 41 nan
0 3 51 31 nan
1 2 19 41 nan
1 3 19 31 1.1771339097614998e-05
0 2 59 23 nan
0 3 59 31 nan
1 2 47 23 nan
1 3 47 31 nan


  statistic = diff_stat / np.sqrt(var)
  statistic, pvalue = _zstat_generic2(diff_stat, np.sqrt(var),


In [40]:
# Print Fleiss-Kappa agreement scores
for i in range(1,9):
    temp_df = pd.concat([x.iloc[1:,i] for x in df_scores_list], axis=1)
    print(i, fleiss_kappa(aggregate_raters(temp_df)[0]))

1 0.2799999999999999
2 -0.30937098844672645
3 -0.20000000000000004
4 0.15275994865211812
5 0.2546583850931675
6 -0.3396004700352528
7 -0.1538461538461544
8 0.2546583850931675
