### For setup: change working directory to parent and load config for correct paths

In [1]:
import pandas as pd
import krippendorff
from pathlib import Path
import os
# Move the working directory one level up before importing `config`.
# NOTE(review): presumably the notebook lives in a subdirectory of the project
# and `config` sits in the parent -- confirm against the repository layout.
# Gotcha: the path is relative, so re-running this cell without restarting the
# kernel moves up one more directory each time.
os.chdir('..')
from config import Config
# project configuration object; later cells read `working_dir` and `output_dir` from it
cfg = Config.get()

Read evaluation files

In [2]:
# directories taken from the project configuration
work_path = cfg.working_dir
out_path = cfg.output_dir

# load each annotator's evaluation file (space-separated) and give the shared
# 'Topic_correct' column an annotator-specific name so both survive the merge
labels_a = (
    pd.read_csv(work_path.joinpath('image_eval_a.txt'), sep=' ')
    .rename(columns={'Topic_correct': 'topic_correct_a'})
)
labels_b = (
    pd.read_csv(work_path.joinpath('image_eval_b.txt'), sep=' ')
    .rename(columns={'Topic_correct': 'topic_correct_b'})
)

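Preprocessing: merge both annotators' labels per image, drop incomplete and duplicate records, and flag where the annotators agree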

In [3]:
# combine both annotators' rows per image; columns present in both files
# (other than the join key) receive the pandas suffixes _x (a) and _y (b)
merged = pd.merge(labels_a, labels_b, how='outer', on=['image_id'])

# keep only complete records and a single row per image
labels = merged.dropna().drop_duplicates(subset='image_id', keep="first")

# make sure both judgement columns are boolean
labels = labels.astype({'topic_correct_b': bool, 'topic_correct_a': bool})

# flag the images on which both annotators gave the same judgement
labels = labels.assign(agree=labels['topic_correct_a'].eq(labels['topic_correct_b']))

#### Compute the intercoder reliability data

In [4]:
def annotator_alpha(annotations):
    """Return Krippendorff's alpha (nominal level) between annotators a and b.

    Parameters
    ----------
    annotations : pd.DataFrame
        Rows to evaluate; must contain the columns 'topic_correct_b' and
        'topic_correct_a'.

    Returns
    -------
    The value returned by ``krippendorff.alpha`` for these rows.
    """
    ratings = annotations[['topic_correct_b', 'topic_correct_a']].astype(int)
    # transpose so that each annotator is a row and each image a column
    return krippendorff.alpha(ratings.transpose(), level_of_measurement='nominal')


# create intercoder reliability DataFrame: per topic, the number of images on
# which both annotators agree, the number of images, and their ratio
reliability = labels[['Topic_x', 'agree']].groupby(by='Topic_x').sum()
sizes = labels[['Topic_x']].groupby(by='Topic_x').size()
reliability['size'] = sizes
reliability['intercoder_reliability'] = (reliability['agree'] / reliability['size'])

reliability = reliability.reset_index().rename(columns={'Topic_x': 'topic_id'})

# compute overall values and append them as a final 'all' row
agree_sum = reliability['agree'].sum()
size_sum = reliability['size'].sum()
ir_all = agree_sum / size_sum
reliability.loc[len(reliability)] = ['all', agree_sum, size_sum, ir_all]

# compute Krippendorff's Alpha for each topic.
# Iterate the groups in sorted topic order -- the same order as the rows of
# `reliability`, which also come from a groupby on 'Topic_x'. Looping over
# labels['Topic_x'].unique() instead would follow order of first appearance
# and could attach an alpha to the wrong topic.
alphas = [annotator_alpha(rows) for _, rows in labels.groupby('Topic_x', sort=True)]

# compute the overall alpha (belongs to the trailing 'all' row)
alphas.append(annotator_alpha(labels))

# add to reliability dataframe
reliability['krippendorff'] = alphas

Show reliability results

In [5]:
# display the per-topic reliability table; the last row ('all') holds the overall values
reliability

Unnamed: 0,topic_id,agree,size,intercoder_reliability,krippendorff
0,51,86,101,0.851485,0.655704
1,55,90,100,0.9,0.647413
2,76,82,100,0.82,0.478147
3,81,90,99,0.909091,0.67438
4,100,87,100,0.87,0.689025
5,all,435,500,0.87,0.636648


#### Compute the validity data

In [6]:
# build the topic validity table
# per-annotator views of the merged labels
validity_b = labels[['Topic_y', 'topic_correct_b']]
validity_a = labels[['Topic_x', 'topic_correct_a']]

# put both judgements side by side (joined on the row index) and keep a single topic column
per_image = (
    pd.merge(validity_b, validity_a, left_index=True, right_index=True)
    .drop(columns='Topic_y')
    .rename(columns={'Topic_x': 'topic'})
)

# True only where both annotators judged the topic to be correct
per_image['topic_correct_both'] = per_image['topic_correct_b'] & per_image['topic_correct_a']

# count the positive judgements per topic and relate them to the topic size
per_topic = per_image.groupby(by='topic').sum()
per_topic['size'] = sizes
for rater in ('b', 'a', 'both'):
    per_topic[f'percent_correct_{rater}'] = per_topic[f'topic_correct_{rater}'] / per_topic['size']

validity = per_topic.reset_index()

Show the validity data

In [7]:
# display the per-topic validity table (counts and shares of "topic correct" judgements)
validity

Unnamed: 0,topic,topic_correct_b,topic_correct_a,topic_correct_both,size,percent_correct_b,percent_correct_a,percent_correct_both
0,51,70,69,62,101,0.693069,0.683168,0.613861
1,55,86,80,78,100,0.86,0.8,0.78
2,76,86,70,69,100,0.86,0.7,0.69
3,81,81,84,78,99,0.818182,0.848485,0.787879
4,100,67,74,64,100,0.67,0.74,0.64
