In [1]:
import pandas as pd
import numpy as np
import ast
from scipy.stats import mode
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
from statsmodels.stats.proportion import test_proportions_2indep

In [2]:
# Read in dataset key
df = pd.read_csv("output/human_eval/cochrane_full.csv")
df['order'] = df['order'].apply(ast.literal_eval)

order_dict = {row['id']: row['order'] for i, row in df.iterrows()}

In [3]:
# Specify human eval files
files = [
    "output/human_eval/Full Simplification Human Evaluation - Lj - Cochrane.csv",
    "output/human_eval/Full Simplification Human Evaluation - Kejian - Cochrane.csv",
    "output/human_eval/Full Simplification Human Evaluation - Heyuan - Cochrane.csv",
]

In [4]:
def reorder(df_scores_, order_dict_):
    df_scores = df_scores_.copy()
    order_dict = order_dict_.copy()

    for i, row in df_scores.loc[1:].iterrows():
        report_id = row[0]
        curr_order = order_dict[int(report_id)]
        right_order = np.argsort(curr_order)
        
        df_scores.iloc[i, 1:5] = np.array(row[1:5])[right_order]
        df_scores.iloc[i, 5:9] = np.array(row[5:9])[right_order]
    
    return df_scores

In [5]:
df_scores_list = []

for f in files:
    df_human = pd.read_csv(f) # Read
    df_human.iloc[1:,1:] = df_human.iloc[1:,1:].applymap(int) # Cast as int
    df_human = reorder(df_human, order_dict) # Reorder by model
    df_scores_list.append(df_human)

In [6]:
# Stack all the scores across evaluators
scores = np.stack([x.values[1:,1:] for x in df_scores_list], axis=2)

# Get the majority vote
scores = mode(scores, axis=2).mode.squeeze(2)

  scores = mode(scores, axis=2).mode.squeeze(2)
  scores = mode(scores, axis=2).mode.squeeze(2)


In [7]:
# Get the percentage stats
scores.mean(axis=0)

array([0.9333333333333333, 0.03333333333333333, 0.43333333333333335,
       0.26666666666666666, 0.9, 0.5, 0.03333333333333333, 0.2],
      dtype=object)

In [8]:
scores_read = scores.sum(axis=0)[:4]
scores_fact = scores.sum(axis=0)[4:]

In [9]:
# Hypothesis testing
for i in [0,1]:
    for j in [2,3]:
        result = test_proportions_2indep(
            count1 = scores_read[i], 
            nobs1  = 30, 
            count2 = scores_read[j], 
            nobs2  = 30, 
            compare='diff', 
            alternative='two-sided')
        print(i, j, scores_read[i], scores_read[j], result.pvalue)

for i in [0,1]:
    for j in [2,3]:
        result = test_proportions_2indep(
            count1 = scores_fact[i], 
            nobs1  = 30, 
            count2 = scores_fact[j], 
            nobs2  = 30, 
            compare='diff', 
            alternative='two-sided')
        print(i, j, scores_fact[i], scores_fact[j], result.pvalue)

0 2 28 13 4.054435145443759e-06
0 3 28 8 4.15930479501648e-11
1 2 1 13 0.00012150197564308133
1 3 1 8 0.01537780600910247
0 2 27 1 3.4571313718918064e-29
0 3 27 6 2.346254838823322e-12
1 2 15 1 8.384597405174142e-06
1 3 15 6 0.014193453412194966


In [10]:
# Print Fleiss-Kappa agreement scores
for i in range(1,9):
    temp_df = pd.concat([x.iloc[1:,i] for x in df_scores_list], axis=1)
    print(i, fleiss_kappa(aggregate_raters(temp_df)[0], 'fleiss'))

1 0.1908091908091907
2 -0.11801242236024921
3 0.004524886877828309
4 0.23685698134539257
5 0.03999999999999907
6 -0.026785714285714236
7 0.012345679012345146
8 0.3995552260934028
