In [1]:
# requirements
import numpy as np
import pandas as pd
import krippendorff
import json
import itertools

from utils.iaa_util import chonker, no_noise_lists, filter_noise_scores, dia_ex, sum_ex

# Inter Annotator Agreement & Correlation Calculation

This notebook runs the inter-annotator agreement (IAA) and correlation calculations and gathers all results in one place.\
The results are only shown in this notebook; they are not stored anywhere else on this drive.

A script version of different sections of this notebook can be found in the same folder as this file:\
`ablation_corr.py` and `originalVpresent_iaa_corr.py`\
These scripts store their results on the drive as CSV files in the 'results' folder.

To run this notebook correctly, adjust the file paths in the cell below so that they point to the correct files. Once they are set, running all cells will output the results in the respective sections of this notebook.


In [2]:
# Filepaths -- adjust these so they point to the annotation files.
# Ablation results (Excel workbooks, read sheet by sheet further down)
ann1_ablation_path = 'data/ablation_ann1.xlsx'
ann2_ablation_path = 'data/ablation_ann2.xlsx'
ann3_ablation_path = 'data/ablation_ann3.xlsx'

# Full results (CSV files)
# Plain string literals: there is nothing to interpolate, so no f-prefix.
ann1_results_path = 'data/saved_df_ann1.csv'
ann2_results_path = 'data/saved_df_ann2.csv'
ann3_results_path = 'data/saved_df_ann3.csv'

# Original paper results (JSON Lines, one record per line)
original_results_path = 'data/original_human_judgment.jsonl'

# Load data

In [3]:
# Present paper data: one semicolon-delimited CSV per annotator
ann1 = pd.read_csv(ann1_results_path, delimiter=';')
ann2 = pd.read_csv(ann2_results_path, delimiter=';')
ann3 = pd.read_csv(ann3_results_path, delimiter=';')

# Original data: a JSON Lines file, parsed into a list with one object per line.
# json.loads() tolerates the trailing newline, so each line is parsed as-is
# (str.rstrip takes a *set of characters*, so stripping with '\n|\r' would
# also remove literal '|' characters). Blank lines are skipped so that an
# empty trailing line does not raise a JSONDecodeError.
fname = original_results_path
with open(fname, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]


In [4]:
# transform original data to match our data
# a row in the data: id, dialogue, summary, annotations, model_id
# Only 'annotations' is read here: it is indexed by annotator (0, 1, 2) and each
# entry is looked up by dimension name.
dimension_names = ['consistency', 'coherence', 'fluency', 'relevance']
result_columns = [f'{name}_results' for name in dimension_names]

# original_annotator_scores[annotator][dimension] -> one score per row of `data`,
# with the dimensions in the order of `dimension_names`.
original_annotator_scores = []
for annotator in [0, 1, 2]:
    # Collect this annotator's annotation of every row once, instead of once
    # per dimension.
    annotator_rows = [row.get('annotations')[annotator] for row in data]
    # .get() keeps the lenient behaviour: a missing dimension yields None.
    original_annotator_scores.append(
        [[annotation.get(name) for annotation in annotator_rows] for name in dimension_names]
    )


def _scores_to_frame(per_dimension_scores):
    """Build a frame with one '<dimension>_results' column per score list.

    `per_dimension_scores` holds one list per entry of `dimension_names`
    (same order); row i of the result combines the i-th score of every list.
    """
    return pd.DataFrame(zip(*per_dimension_scores), columns=result_columns)


# One frame per original annotator, with the same column names as our own data.
orig_ann1, orig_ann2, orig_ann3 = (
    _scores_to_frame(scores) for scores in original_annotator_scores
)

In [5]:
# Ablation data
# Every workbook is read with one sheet per tab name ('A' .. 'N'); the
# resulting sheets are concatenated into a single frame per annotator.
tab_names = list('ABCDEFGHIJKLMN')
df_1, df_2, df_3 = (
    pd.concat(pd.read_excel(workbook_path, sheet_name=tab_names))
    for workbook_path in (ann1_ablation_path, ann2_ablation_path, ann3_ablation_path)
)

# The full data has its text displayed differently from the ablation data.
# Derive 'dialogue' and 'summary' columns from 'texts' to align the two:
for full_frame in (ann1, ann2, ann3):
    full_frame['dialogue'] = full_frame['texts'].transform(dia_ex)
    full_frame['summary'] = full_frame['texts'].transform(sum_ex)

# Extract from the full data the rows whose summary also occurs in the same
# annotator's ablation data.
ann1_selection = ann1.loc[ann1['summary'].isin(df_1['summary'].tolist())]
ann2_selection = ann2.loc[ann2['summary'].isin(df_2['summary'].tolist())]
ann3_selection = ann3.loc[ann3['summary'].isin(df_3['summary'].tolist())]


# Prepare data
Set up variables to do IAA and correlations.\
This section does the following:
1. Split every dimension's results into nested lists of 14 scores (1 list per dialogue). 
2. Replace noisy scores by `np.nan` (see paper for a discussion).
3. Calculate an averaged score for each summary. This results in an array of shape (100, 14).

In [6]:
# 1.
# Transform every list of evaluation values into: [[n*14]*100]
SCORES_PER_DIALOGUE = 14


def _chunk_scores(frame, column):
    """Pass `column` of `frame`, as a plain list, to chonker with a chunk size of 14."""
    return chonker(list(frame[column]), SCORES_PER_DIALOGUE)


# Present paper annotations
present_annotators = (ann1, ann2, ann3)
coherence_ann1, coherence_ann2, coherence_ann3 = (
    _chunk_scores(frame, 'coherence_results') for frame in present_annotators
)
consistency_ann1, consistency_ann2, consistency_ann3 = (
    _chunk_scores(frame, 'consistency_results') for frame in present_annotators
)
fluency_ann1, fluency_ann2, fluency_ann3 = (
    _chunk_scores(frame, 'fluency_results') for frame in present_annotators
)
relevance_ann1, relevance_ann2, relevance_ann3 = (
    _chunk_scores(frame, 'relevance_results') for frame in present_annotators
)

# Original annotators
original_annotators = (orig_ann1, orig_ann2, orig_ann3)
coherence_orig_ann1, coherence_orig_ann2, coherence_orig_ann3 = (
    _chunk_scores(frame, 'coherence_results') for frame in original_annotators
)
consistency_orig_ann1, consistency_orig_ann2, consistency_orig_ann3 = (
    _chunk_scores(frame, 'consistency_results') for frame in original_annotators
)
fluency_orig_ann1, fluency_orig_ann2, fluency_orig_ann3 = (
    _chunk_scores(frame, 'fluency_results') for frame in original_annotators
)
relevance_orig_ann1, relevance_orig_ann2, relevance_orig_ann3 = (
    _chunk_scores(frame, 'relevance_results') for frame in original_annotators
)

In [7]:
# 1.
# Do the same as the cell before, but for the data used in the ablation comparison.
# Both parts are meant to end up as: [[n*14]*10]
N_ABLATION_DIALOGUES = 10


def _selection_scores(frame, column):
    """Pass `column` of a selection frame, as a plain list, to chonker with chunk size 14."""
    return chonker(list(frame[column]), 14)


def _ablation_scores(frame, column):
    """Regroup one ablation column into one list per position within a chunk.

    The column is passed to chonker with a chunk size of 10; the i-th values of
    all chunks are then gathered into the i-th output list (i = 0..9).
    NOTE(review): presumably every chunk is one workbook tab holding exactly
    10 rows -- confirm against the ablation files.
    """
    # Chunk once per column (not once per index); list() makes the chunks
    # safe to iterate repeatedly whatever container chonker returns.
    chunks = list(chonker(list(frame[column]), N_ABLATION_DIALOGUES))
    return [[chunk[i] for chunk in chunks] for i in range(N_ABLATION_DIALOGUES)]


# Our full annotation (but selection)
selection_frames = (ann1_selection, ann2_selection, ann3_selection)
coherence_ann1_sel, coherence_ann2_sel, coherence_ann3_sel = (
    _selection_scores(frame, 'coherence_results') for frame in selection_frames
)
consistency_ann1_sel, consistency_ann2_sel, consistency_ann3_sel = (
    _selection_scores(frame, 'consistency_results') for frame in selection_frames
)
fluency_ann1_sel, fluency_ann2_sel, fluency_ann3_sel = (
    _selection_scores(frame, 'fluency_results') for frame in selection_frames
)
relevance_ann1_sel, relevance_ann2_sel, relevance_ann3_sel = (
    _selection_scores(frame, 'relevance_results') for frame in selection_frames
)

# new ablation
ablation_frames = (df_1, df_2, df_3)
coherence_ann1_abl, coherence_ann2_abl, coherence_ann3_abl = (
    _ablation_scores(frame, 'Coherence') for frame in ablation_frames
)
consistency_ann1_abl, consistency_ann2_abl, consistency_ann3_abl = (
    _ablation_scores(frame, 'Consistency') for frame in ablation_frames
)
fluency_ann1_abl, fluency_ann2_abl, fluency_ann3_abl = (
    _ablation_scores(frame, 'Fluency') for frame in ablation_frames
)
relevance_ann1_abl, relevance_ann2_abl, relevance_ann3_abl = (
    _ablation_scores(frame, 'Relevance') for frame in ablation_frames
)

In [8]:
# 2.
# Noise replacement for the original annotators.
# no_noise_lists takes the three annotators' chunked scores of one dimension
# and returns three results, one per annotator (same order).
(
    orig_coherence_nonoise1,
    orig_coherence_nonoise2,
    orig_coherence_nonoise3,
) = no_noise_lists(coherence_orig_ann1, coherence_orig_ann2, coherence_orig_ann3)

(
    orig_consistency_nonoise1,
    orig_consistency_nonoise2,
    orig_consistency_nonoise3,
) = no_noise_lists(consistency_orig_ann1, consistency_orig_ann2, consistency_orig_ann3)

(
    orig_fluency_nonoise1,
    orig_fluency_nonoise2,
    orig_fluency_nonoise3,
) = no_noise_lists(fluency_orig_ann1, fluency_orig_ann2, fluency_orig_ann3)

(
    orig_relevance_nonoise1,
    orig_relevance_nonoise2,
    orig_relevance_nonoise3,
) = no_noise_lists(relevance_orig_ann1, relevance_orig_ann2, relevance_orig_ann3)


Number of scores that were not filtered: 3161
Number of scores that were not filtered: 3360
Number of scores that were not filtered: 3050
Number of scores that were not filtered: 3439


In [9]:
# Noise replacement for the present paper's annotators: one call per dimension,
# each returning one result per annotator (same order as the arguments).
(
    coherence_nonoise1,
    coherence_nonoise2,
    coherence_nonoise3,
) = no_noise_lists(coherence_ann1, coherence_ann2, coherence_ann3)

(
    consistency_nonoise1,
    consistency_nonoise2,
    consistency_nonoise3,
) = no_noise_lists(consistency_ann1, consistency_ann2, consistency_ann3)

(
    fluency_nonoise1,
    fluency_nonoise2,
    fluency_nonoise3,
) = no_noise_lists(fluency_ann1, fluency_ann2, fluency_ann3)

(
    relevance_nonoise1,
    relevance_nonoise2,
    relevance_nonoise3,
) = no_noise_lists(relevance_ann1, relevance_ann2, relevance_ann3)

Number of scores that were not filtered: 3607
Number of scores that were not filtered: 3754
Number of scores that were not filtered: 3625
Number of scores that were not filtered: 3394


In [10]:
# 3.
# For each dimension, filter out the noise and take the average.
# Last argument of filter_noise_scores; presumably the number of dialogues -- TODO confirm.
N_DIALOGUES = 100

# our results
coherence_filtered = filter_noise_scores(
    coherence_ann1, coherence_ann2, coherence_ann3, N_DIALOGUES
)
consistency_filtered = filter_noise_scores(
    consistency_ann1, consistency_ann2, consistency_ann3, N_DIALOGUES
)
fluency_filtered = filter_noise_scores(
    fluency_ann1, fluency_ann2, fluency_ann3, N_DIALOGUES
)
relevance_filtered = filter_noise_scores(
    relevance_ann1, relevance_ann2, relevance_ann3, N_DIALOGUES
)

# original results
orig_coherence_filtered = filter_noise_scores(
    coherence_orig_ann1, coherence_orig_ann2, coherence_orig_ann3, N_DIALOGUES
)
orig_consistency_filtered = filter_noise_scores(
    consistency_orig_ann1, consistency_orig_ann2, consistency_orig_ann3, N_DIALOGUES
)
orig_fluency_filtered = filter_noise_scores(
    fluency_orig_ann1, fluency_orig_ann2, fluency_orig_ann3, N_DIALOGUES
)
orig_relevance_filtered = filter_noise_scores(
    relevance_orig_ann1, relevance_orig_ann2, relevance_orig_ann3, N_DIALOGUES
)


In [11]:
# 3.
# Same filtering and averaging as for the full data, but for the ablation comparison.
# Last argument of filter_noise_scores; presumably the number of dialogues -- TODO confirm.
N_SELECTED_DIALOGUES = 10

# selection taken from the full annotations
coherence_filtered_sel = filter_noise_scores(
    coherence_ann1_sel, coherence_ann2_sel, coherence_ann3_sel, N_SELECTED_DIALOGUES
)
consistency_filtered_sel = filter_noise_scores(
    consistency_ann1_sel, consistency_ann2_sel, consistency_ann3_sel, N_SELECTED_DIALOGUES
)
fluency_filtered_sel = filter_noise_scores(
    fluency_ann1_sel, fluency_ann2_sel, fluency_ann3_sel, N_SELECTED_DIALOGUES
)
relevance_filtered_sel = filter_noise_scores(
    relevance_ann1_sel, relevance_ann2_sel, relevance_ann3_sel, N_SELECTED_DIALOGUES
)

# ablation annotations
coherence_filtered_abl = filter_noise_scores(
    coherence_ann1_abl, coherence_ann2_abl, coherence_ann3_abl, N_SELECTED_DIALOGUES
)
consistency_filtered_abl = filter_noise_scores(
    consistency_ann1_abl, consistency_ann2_abl, consistency_ann3_abl, N_SELECTED_DIALOGUES
)
fluency_filtered_abl = filter_noise_scores(
    fluency_ann1_abl, fluency_ann2_abl, fluency_ann3_abl, N_SELECTED_DIALOGUES
)
relevance_filtered_abl = filter_noise_scores(
    relevance_ann1_abl, relevance_ann2_abl, relevance_ann3_abl, N_SELECTED_DIALOGUES
)

# calculations

### IAA calculations
Using Krippendorff's alpha

In [12]:
# Original IAA
# Krippendorff's alpha per dimension over the three original annotators' raw
# score columns (one row of reliability data per annotator), on an interval scale.
original_raters = (orig_ann1, orig_ann2, orig_ann3)
original_alpha = {}
for dimension in ('coherence', 'fluency', 'consistency', 'relevance'):
    ratings = [list(rater[f'{dimension}_results']) for rater in original_raters]
    original_alpha[dimension] = krippendorff.alpha(
        reliability_data=ratings, level_of_measurement="interval"
    )

coherence_IAA = original_alpha['coherence']
fluency_IAA = original_alpha['fluency']
consistency_IAA = original_alpha['consistency']
relevance_IAA = original_alpha['relevance']

print(f'Original IAA scores:\nCoherence: {coherence_IAA} \nConsistency: {consistency_IAA} \nFluency: {fluency_IAA} \nRelevance: {relevance_IAA}')

Original IAA scores:
Coherence: 0.3785576910327453 
Consistency: 0.492762621233651 
Fluency: 0.13361325787541334 
Relevance: 0.38671951755027045


In [13]:
# Original IAA on the noise-cleaned scores: Krippendorff's alpha per dimension,
# with one row of reliability data per original annotator, on an interval scale.
cleaned_original_ratings = {
    'coherence': [orig_coherence_nonoise1, orig_coherence_nonoise2, orig_coherence_nonoise3],
    'fluency': [orig_fluency_nonoise1, orig_fluency_nonoise2, orig_fluency_nonoise3],
    'consistency': [orig_consistency_nonoise1, orig_consistency_nonoise2, orig_consistency_nonoise3],
    'relevance': [orig_relevance_nonoise1, orig_relevance_nonoise2, orig_relevance_nonoise3],
}
cleaned_original_alpha = {
    dimension: krippendorff.alpha(reliability_data=ratings, level_of_measurement="interval")
    for dimension, ratings in cleaned_original_ratings.items()
}

coherence_IAA = cleaned_original_alpha['coherence']
fluency_IAA = cleaned_original_alpha['fluency']
consistency_IAA = cleaned_original_alpha['consistency']
relevance_IAA = cleaned_original_alpha['relevance']

print(f'Original IAA scores (cleaned):\nCoherence: {coherence_IAA} \nConsistency: {consistency_IAA} \nFluency: {fluency_IAA} \nRelevance: {relevance_IAA}')

Original IAA scores (cleaned):
Coherence: 0.7564020052749023 
Consistency: 0.6709247287723659 
Fluency: 0.6781873766563293 
Relevance: 0.5620838662718799


In [14]:
# Our IAA
# Krippendorff's alpha per dimension over the three present-paper annotators'
# raw score columns (one row of reliability data per annotator), on an interval scale.
present_raters = (ann1, ann2, ann3)
present_alpha = {}
for dimension in ('coherence', 'fluency', 'consistency', 'relevance'):
    ratings = [list(rater[f'{dimension}_results']) for rater in present_raters]
    present_alpha[dimension] = krippendorff.alpha(
        reliability_data=ratings, level_of_measurement="interval"
    )

coherence_IAA = present_alpha['coherence']
fluency_IAA = present_alpha['fluency']
consistency_IAA = present_alpha['consistency']
relevance_IAA = present_alpha['relevance']

print(f'Our IAA scores:\nCoherence: {coherence_IAA} \nConsistency: {consistency_IAA} \nFluency: {fluency_IAA} \nRelevance: {relevance_IAA}')

Our IAA scores:
Coherence: 0.6074598852525589 
Consistency: 0.7938139086383895 
Fluency: 0.5233497800176004 
Relevance: 0.5159398644189812


In [15]:
# Our IAA on the noise-cleaned scores: Krippendorff's alpha per dimension,
# with one row of reliability data per annotator, on an interval scale.
cleaned_present_ratings = {
    'coherence': [coherence_nonoise1, coherence_nonoise2, coherence_nonoise3],
    'fluency': [fluency_nonoise1, fluency_nonoise2, fluency_nonoise3],
    'consistency': [consistency_nonoise1, consistency_nonoise2, consistency_nonoise3],
    'relevance': [relevance_nonoise1, relevance_nonoise2, relevance_nonoise3],
}
cleaned_present_alpha = {
    dimension: krippendorff.alpha(reliability_data=ratings, level_of_measurement="interval")
    for dimension, ratings in cleaned_present_ratings.items()
}

coherence_IAA = cleaned_present_alpha['coherence']
fluency_IAA = cleaned_present_alpha['fluency']
consistency_IAA = cleaned_present_alpha['consistency']
relevance_IAA = cleaned_present_alpha['relevance']

print(f'Our IAA scores (cleaned):\nCoherence: {coherence_IAA} \nConsistency: {consistency_IAA} \nFluency: {fluency_IAA} \nRelevance: {relevance_IAA}')

Our IAA scores (cleaned):
Coherence: 0.7780447258554797 
Consistency: 0.9227800328928509 
Fluency: 0.76903271937251 
Relevance: 0.7160557952595286


### Correlation between original paper annotations and present paper's
Using Pearson's R

In [18]:
# Flatten each dimension's filtered scores into a single long list,
# once for the present paper and once for the original results.
list_coh_fil = list(itertools.chain.from_iterable(coherence_filtered))
list_coh_fil_orig = list(itertools.chain.from_iterable(orig_coherence_filtered))

list_con_fil = list(itertools.chain.from_iterable(consistency_filtered))
list_con_fil_orig = list(itertools.chain.from_iterable(orig_consistency_filtered))

list_flu_fil = list(itertools.chain.from_iterable(fluency_filtered))
list_flu_fil_orig = list(itertools.chain.from_iterable(orig_fluency_filtered))

list_rel_fil = list(itertools.chain.from_iterable(relevance_filtered))
list_rel_fil_orig = list(itertools.chain.from_iterable(orig_relevance_filtered))

# np.corrcoef returns the 2x2 correlation matrix of the two lists; the
# off-diagonal entry [0, 1] is the correlation between them.
present_vs_original = (
    ('Coherence', list_coh_fil, list_coh_fil_orig),
    ('Consistency', list_con_fil, list_con_fil_orig),
    ('Fluency', list_flu_fil, list_flu_fil_orig),
    ('Relevance', list_rel_fil, list_rel_fil_orig),
)
for label, present_scores, original_scores in present_vs_original:
    print(f'{label} correlation', round(np.corrcoef(present_scores, original_scores)[0, 1], 3))

Coherence correlation 0.417
Consistency correlation 0.77
Fluency correlation 0.546
Relevance correlation 0.685


### Ablation study Correlation
Using Pearson's R

In [19]:
# Correlation between the selection taken from the full annotations (`*_sel`)
# and the ablation annotations (`*_abl`), per dimension.
# The flattened lists stay local to the loop: the `list_*_fil` /
# `list_*_fil_orig` names hold the original-vs-present comparison and must not
# be overwritten with ablation data.
selection_vs_ablation = {
    'Coherence': (coherence_filtered_sel, coherence_filtered_abl),
    'Consistency': (consistency_filtered_sel, consistency_filtered_abl),
    'Fluency': (fluency_filtered_sel, fluency_filtered_abl),
    'Relevance': (relevance_filtered_sel, relevance_filtered_abl),
}

for label, (selection_scores, ablation_scores) in selection_vs_ablation.items():
    # Flatten the nested filtered scores into one long list each.
    flat_selection = list(itertools.chain.from_iterable(selection_scores))
    flat_ablation = list(itertools.chain.from_iterable(ablation_scores))
    # Off-diagonal entry of the 2x2 correlation matrix = correlation of the two lists.
    pearson_r = np.corrcoef(flat_selection, flat_ablation)[0, 1]
    print(f'{label} correlation', round(pearson_r, 3))

Coherence correlation 0.705
Consistency correlation 0.664
Fluency correlation 0.767
Relevance correlation 0.51
