# Manual Stereotype and Omission Annotation: IAA By Description

Calculate IAA measures at the description level, rather than the annotation level, for comparison with the document classification models.

In [None]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from intervaltree import Interval, IntervalTree

import utils

Generalize manual annotators' labels from the token level to the description level so their IAA can be compared to the model's performance.

First, load the manual annotator data for the Contextual labels, and keep only the rows with Stereotype and Omission labels. 

In [2]:
# Directory holding the manual annotators' label files (kept as a string because
# later cells build output paths with `manual_ann_dir + "<name>.csv"`)
manual_ann_dir = "../data/annotator_data/"
f0 = manual_ann_dir+"labels0C.csv"
f3 = manual_ann_dir+"labels3.csv"
f4 = manual_ann_dir+"labels4.csv"
ann_files = [f0, f3, f4]
# Read every annotator's file first and concatenate once: growing a frame with
# pd.concat inside a loop re-copies all rows on each pass, and seeding it with an
# empty DataFrame depends on pandas' (deprecated) handling of empty frames in concat
manual = pd.concat([pd.read_csv(f, index_col=0) for f in ann_files])
manual.annotator.value_counts()  # Looks good

Annotator 0    9243
Annotator 3    5125
Annotator 4    4486
Name: annotator, dtype: int64

In [3]:
# Keep only the rows labeled Omission or Stereotype
is_target_label = manual.label.isin(["Omission", "Stereotype"])
manual = manual[is_target_label]
manual.head()  # Looks good

Unnamed: 0,file,entity,label,start,end,text,annotator,category
6,Coll-1444_00100.ann,T7,Omission,444,448,M.Ed,Annotator 0,Contextual
20,Coll-1444_00100.ann,T21,Omission,2065,2069,Bell,Annotator 0,Contextual
31,Coll-1326_00100.ann,T5,Omission,48,62,Dr. Rutherford,Annotator 0,Contextual
33,Coll-1326_00100.ann,T7,Omission,67,78,Dr. Gregory,Annotator 0,Contextual
57,Coll-1326_00100.ann,T31,Stereotype,1484,1528,considered to be one of the leading chemists,Annotator 0,Contextual


In [4]:
manual["annotator"].value_counts()  # Looks good

Annotator 0    6507
Annotator 3    2715
Annotator 4    2710
Name: annotator, dtype: int64

In [5]:
# Load only the columns needed to locate each description inside its file
desc_cols = ["description_id", "file", "start_offset", "end_offset"]
desc_df = pd.read_csv("../data/crc_metadata/annot_descs.csv", usecols=desc_cols)
print(desc_df.shape)
desc_df.head()

(27908, 4)


Unnamed: 0,description_id,file,start_offset,end_offset
0,0,AA5_00100.txt,0,16
1,1,AA5_00100.txt,17,76
2,2,AA5_00100.txt,77,633
3,3,AA5_00100.txt,634,1725
4,4,AA6_00100.txt,0,16


In [6]:
def removeFileType(file_list):
    """Return the names in `file_list` with their file extension removed.

    The extension is whatever follows the last dot of the name (".ann", ".txt",
    ".json", ...), so extensions of any length are handled and names without an
    extension are returned unchanged instead of losing their last four characters.

    Parameters
    ----------
    file_list : iterable of str
        File names, e.g. "AA5_00100.ann".

    Returns
    -------
    list of str
        One extension-less name per input name, in the same order.
    """
    return [os.path.splitext(f)[0] for f in file_list]

In [7]:
# Add the extension-less file name as the last column of both tables so that the
# .ann annotation files and the .txt description files can be matched on it.
# assign() returns a new frame (and overwrites an existing "filename" column), so
# this cell can be re-run safely, unlike an in-place insert().
manual = manual.assign(filename=removeFileType(list(manual.file)))
desc_df = desc_df.assign(filename=removeFileType(list(desc_df.file)))
manual.head()

Unnamed: 0,file,entity,label,start,end,text,annotator,category,filename
6,Coll-1444_00100.ann,T7,Omission,444,448,M.Ed,Annotator 0,Contextual,Coll-1444_00100
20,Coll-1444_00100.ann,T21,Omission,2065,2069,Bell,Annotator 0,Contextual,Coll-1444_00100
31,Coll-1326_00100.ann,T5,Omission,48,62,Dr. Rutherford,Annotator 0,Contextual,Coll-1326_00100
33,Coll-1326_00100.ann,T7,Omission,67,78,Dr. Gregory,Annotator 0,Contextual,Coll-1326_00100
57,Coll-1326_00100.ann,T31,Stereotype,1484,1528,considered to be one of the leading chemists,Annotator 0,Contextual,Coll-1326_00100


In [8]:
# Order the annotations by file and position, then number them 0..n-1 in that
# order; the number becomes the first column, "manual_id".
# No groupby is needed for this: an identity groupby().apply() leaves the rows
# unchanged and is deprecated in recent pandas, where the grouping column
# ("filename") is excluded from the groups passed to apply().
# The original (non-unique) CSV row index is discarded.
manual_grouped = (
    manual
    .sort_values(by=["filename", "start", "end"])
    .reset_index(drop=True)
    .rename_axis("manual_id")
    .reset_index()
)
manual_grouped.head()

Unnamed: 0,manual_id,file,entity,label,start,end,text,annotator,category,filename
0,0,AA5_00100.ann,T7,Stereotype,34,63,The Very Rev Prof James Whyte,Annotator 4,Contextual,AA5_00100
1,1,AA5_00100.ann,T15,Stereotype,696,723,leading Scottish Theologian,Annotator 4,Contextual,AA5_00100
2,2,AA6_00100.ann,T14,Stereotype,655,675,to William and Agnes,Annotator 3,Contextual,AA6_00100
3,3,AA6_00100.ann,T10,Omission,658,665,William,Annotator 4,Contextual,AA6_00100
4,4,AA6_00100.ann,T11,Omission,670,675,Agnes,Annotator 4,Contextual,AA6_00100


In [9]:
# Every annotation row must have its own manual_id
assert manual_grouped.manual_id.is_unique

Only save description IDs for descriptions in files that have manual annotations:

In [10]:
# Distinct names of the files that contain at least one manual annotation
manually_annot_files = list(set(manual_grouped.filename))
in_annotated_file = desc_df.filename.isin(manually_annot_files)
desc_df = desc_df[in_annotated_file]
print(desc_df.shape)  # Looks good

(20975, 5)


In [11]:
# Use the same column names as the annotation table (start/end), order the
# descriptions by file and position, and index them by description_id
desc_df = (
    desc_df
    .drop(columns=["file"])
    .rename(columns={"start_offset":"start", "end_offset":"end"})
    .sort_values(by=["filename", "start", "end"])
    .set_index("description_id")
)
desc_df.head()

Unnamed: 0_level_0,start,end,filename
description_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,16,AA5_00100
1,17,76,AA5_00100
2,77,633,AA5_00100
3,634,1725,AA5_00100
4,0,16,AA6_00100


Assign description IDs to the manual annotators' data using the filenames and the start and end offsets (no annotation extends beyond a single description):

In [12]:
# description_id -> {"start": ..., "end": ..., "filename": ...}
desc_dict = desc_df.to_dict("index")

In [13]:
# manual_id -> description_id; an annotation that does not lie completely inside
# any description keeps the value None
manual_dict = dict.fromkeys(list(manual_grouped.manual_id))
# Split the annotations by file once instead of re-filtering the whole table for
# every description
manual_by_file = {filename: file_df for filename, file_df in manual_grouped.groupby("filename")}
for desc_id,d in desc_dict.items():
    manual_file_df = manual_by_file.get(d["filename"])
    if manual_file_df is None:
        continue  # no manual annotations in this description's file
    # Annotations whose span lies entirely within this description's span
    is_inside = (manual_file_df.start >= d["start"]) & (manual_file_df.end <= d["end"])
    for manual_id in list(manual_file_df[is_inside].manual_id):
        manual_dict[manual_id] = desc_id
print(manual_dict[4596]) # Looks good

4514


In [14]:
# Two-column lookup table: one row per manual annotation
manualid_to_descid = pd.DataFrame({
    "manual_id": list(manual_dict.keys()),
    "description_id": list(manual_dict.values()),
})
manualid_to_descid.tail()

Unnamed: 0,manual_id,description_id
11927,11927,27866
11928,11928,27873
11929,11929,27879
11930,11930,27887
11931,11931,27900


In [15]:
# Attach each annotation's description_id by looking up its manual_id
descid_by_manualid = manualid_to_descid.set_index("manual_id")
manual_joined = manual_grouped.join(descid_by_manualid, on="manual_id")
manual_joined.tail()

Unnamed: 0,manual_id,file,entity,label,start,end,text,annotator,category,filename,description_id
11927,11927,Coll-1497_00300.ann,T10,Omission,4760,4769,Wolfenden,Annotator 4,Contextual,Coll-1497_00300,27866
11928,11928,Coll-1497_00300.ann,T11,Omission,5407,5416,Wolfenden,Annotator 4,Contextual,Coll-1497_00300,27873
11929,11929,Coll-1497_00400.ann,T2,Omission,433,442,Wolfenden,Annotator 4,Contextual,Coll-1497_00400,27879
11930,11930,Coll-1497_00400.ann,T5,Omission,1325,1333,Longford,Annotator 4,Contextual,Coll-1497_00400,27887
11931,11931,Coll-1497_00400.ann,T8,Omission,3681,3690,Wolfenden,Annotator 4,Contextual,Coll-1497_00400,27900


In [16]:
# desc_df[desc_df.index == 27900] # all in head and tail look good!

In [17]:
# Index the annotations by manual_id (this is also the index written to the CSV below)
manual_joined = manual_joined.set_index(keys="manual_id")
manual_joined.head()

Unnamed: 0_level_0,file,entity,label,start,end,text,annotator,category,filename,description_id
manual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,AA5_00100.ann,T7,Stereotype,34,63,The Very Rev Prof James Whyte,Annotator 4,Contextual,AA5_00100,1
1,AA5_00100.ann,T15,Stereotype,696,723,leading Scottish Theologian,Annotator 4,Contextual,AA5_00100,3
2,AA6_00100.ann,T14,Stereotype,655,675,to William and Agnes,Annotator 3,Contextual,AA6_00100,7
3,AA6_00100.ann,T10,Omission,658,665,William,Annotator 4,Contextual,AA6_00100,7
4,AA6_00100.ann,T11,Omission,670,675,Agnes,Annotator 4,Contextual,AA6_00100,7


In [18]:
# Save the annotations together with their description IDs
withdescid_path = manual_ann_dir+"labels0C34_withdescid.csv"
manual_joined.to_csv(withdescid_path)

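Calculate IAA between annotators. Annotations are compared per file and per label using their character offsets (`start`, `end`); the `description_id` column added above is not used in this comparison: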

In [19]:
# One table of annotations per annotator
labels0C, labels3, labels4 = (
    manual_joined[manual_joined.annotator == annotator_name]
    for annotator_name in ("Annotator 0", "Annotator 3", "Annotator 4")
)

In [37]:
# Find the files both input annotators labeled
def findCommonFiles(df_a, df_b):
    """Return the distinct `file` values that occur in both `df_a` and `df_b`.

    Both inputs must expose a `file` column. The result is a list whose order
    follows set iteration order, so it should not be relied upon.
    """
    files_b = set(df_b.file)
    return [f for f in set(df_a.file) if f in files_b]

def makeCommonFileLabelOffsetsDf(ann_df, common_files):
    """Build a file/label/offset table restricted to the files in `common_files`.

    Parameters
    ----------
    ann_df : DataFrame
        Annotations with `file`, `label`, `start` and `end` columns.
    common_files : iterable of str
        File names to keep.

    Returns
    -------
    DataFrame
        Columns `file`, `label`, `start_offset`, `end_offset` (the latter two
        copied from `start` and `end`), with a fresh 0..n-1 index and the rows in
        the same order as in `ann_df`.
    """
    # Set lookup instead of scanning the list of common files once per row
    wanted_files = set(common_files)
    df_files, df_labels, df_starts, df_ends = [], [], [], []
    for file, label, start, end in zip(ann_df.file, ann_df.label, ann_df.start, ann_df.end):
        if file in wanted_files:
            df_files.append(file)
            df_labels.append(label)
            df_starts.append(start)
            df_ends.append(end)
    return pd.DataFrame({"file":df_files, "label":df_labels, "start_offset":df_starts, "end_offset":df_ends})

# Create an interval tree for one annotator for a specified file and specified label
def createIntervalTree(df, filename, labelname):
    """Return an IntervalTree of the (start_offset, end_offset) spans in `df`
    that belong to file `filename` and carry label `labelname`.

    `df` must have `file`, `label`, `start_offset` and `end_offset` columns; the
    offsets are cast to int before the tree is built.
    NOTE(review): presumably identical (start, end) pairs end up as a single
    interval in the tree - confirm against the intervaltree documentation.
    """
    in_file_with_label = (df.file == filename) & (df.label == labelname)
    selected = df[in_file_with_label].astype({"start_offset":int, "end_offset":int})
    spans = zip(selected.start_offset, selected.end_offset)
    return IntervalTree.from_tuples(spans)

# Count all matches including exactly matching, overlapping, and enveloping annotations
def iaaPerFile(tree_exp, tree_pred):
    """Return (tp, fn, fp) for one file and one label.

    tp: number of (expected, predicted) pairs whose spans overlap.
    fn: number of expected intervals absent from `tree_pred` (set difference).
    fp: number of predicted intervals absent from `tree_exp` (set difference).

    NOTE(review): tp is overlap-based while fn/fp come from the set difference of
    the two trees, so a pair that overlaps without being identical appears to be
    counted in tp and also in fn and fp - confirm this is the intended measure.
    """
    missed = tree_exp.difference(tree_pred)
    spurious = tree_pred.difference(tree_exp)
    matched = sum(len(tree_pred.overlap(expected)) for expected in tree_exp)
    return matched, len(missed), len(spurious)

# Create lists of all true positive, false negative, and false_positive counts for all common files between
# expected and predicted annotators for the input label
def iaaAcrossFiles(df_exp, df_pred, commonfiles, labelname):
    """Run iaaPerFile on every file in `commonfiles` for label `labelname`.

    Returns three lists (true positives, false negatives, false positives), each
    with one count per file, in the order of `commonfiles`.
    """
    per_file_counts = [
        iaaPerFile(
            createIntervalTree(df_exp, f, labelname),
            createIntervalTree(df_pred, f, labelname),
        )
        for f in commonfiles
    ]
    true_positives = [tp for tp, _, _ in per_file_counts]
    false_negatives = [fn for _, fn, _ in per_file_counts]
    false_positives = [fp for _, _, fp in per_file_counts]
    return true_positives, false_negatives, false_positives

def IAA(tp, fn, fp):
    """Compute precision, recall and F1 from true/false positive/negative counts.

    Parameters
    ----------
    tp, fn, fp : number
        Counts of true positives, false negatives and false positives
        (note the argument order: fn comes before fp).

    Returns
    -------
    tuple
        (precision, recall, f_1). Precision is 1 when tp+fp is 0 and recall is 1
        when tp+fn is 0. F1 is 0.0 when precision and recall are both 0, which
        happens when tp is 0 while fp and fn are both positive.
    """
    if tp+fp == 0:
        precision = 1
    else:
        precision = (tp/(tp+fp))
    if tp+fn == 0:
        recall = 1
    else:
        recall = (tp/(tp+fn))
    if precision+recall == 0:
        # No matches at all: the harmonic mean is undefined (0/0), report 0
        f_1 = 0.0
    else:
        f_1 = (2*precision*recall)/(precision+recall)
    return precision, recall, f_1

def initDF():
    """Return an empty results table with one column per reported IAA value."""
    column_names = ["expected", "predicted", "label", "true_positive", "false_positive",
                    "false_negative", "precision", "recall", "F_1"]
    return pd.DataFrame({column: [] for column in column_names})

def appendRow(df, exp_name, pred_name, label_name, tp, fp, fn, precision, recall, f_1):
    """Return a new DataFrame made of `df` followed by one row of IAA results.

    `df` itself is not modified. The appended row keeps its own index label (0),
    so index labels repeat in the returned frame.
    """
    row_values = {
        "expected": exp_name,
        "predicted": pred_name,
        "label": label_name,
        "true_positive": tp,
        "false_positive": fp,
        "false_negative": fn,
        "precision": precision,
        "recall": recall,
        "F_1": f_1,
    }
    new_row = pd.DataFrame({column: [value] for column, value in row_values.items()})
    return pd.concat([df, new_row])


In [38]:
# Files shared by each pair of annotators
commonC03 = findCommonFiles(labels0C, labels3)
commonC04 = findCommonFiles(labels0C, labels4)
commonC34 = findCommonFiles(labels3, labels4)
print(*(len(common) for common in (commonC03, commonC04, commonC34)))

343 125 45


In [39]:
# For every annotator pair, restrict both annotators' labels to the files they share
df0_with3, df3_with0 = (makeCommonFileLabelOffsetsDf(ann, commonC03) for ann in (labels0C, labels3))

df0_with4, df4_with0 = (makeCommonFileLabelOffsetsDf(ann, commonC04) for ann in (labels0C, labels4))

df3_with4, df4_with3 = (makeCommonFileLabelOffsetsDf(ann, commonC34) for ann in (labels3, labels4))

In [40]:
iaa_df = initDF()

labels = ["Omission", "Stereotype"]

# (expected annotator, predicted annotator, expected labels, predicted labels, shared files)
annotator_pairs = [
    ("Annotator 0", "Annotator 3", df0_with3, df3_with0, commonC03),
    ("Annotator 0", "Annotator 4", df0_with4, df4_with0, commonC04),
    ("Annotator 3", "Annotator 4", df3_with4, df4_with3, commonC34),
]

# NOTE(review): the tables compared here hold annotation-level character offsets
# (start_offset/end_offset); description_id is not part of them, although the
# results are saved under a "*_bydesc" file name - confirm this is intended.
for labelname in labels:
    for exp_name, pred_name, exp_labels, pred_labels, shared_files in annotator_pairs:
        tp_list, fn_list, fp_list = iaaAcrossFiles(exp_labels, pred_labels, shared_files, labelname)
        # Pool the per-file counts before computing the scores
        tp_sum, fn_sum, fp_sum = sum(tp_list), sum(fn_list), sum(fp_list)
        prec, rec, f_1 = IAA(tp_sum, fn_sum, fp_sum)
        iaa_df = appendRow(iaa_df, exp_name, pred_name, labelname,
                           tp_sum, fp_sum, fn_sum, prec, rec, f_1)

iaa_df

Unnamed: 0,expected,predicted,label,true_positive,false_positive,false_negative,precision,recall,F_1
0,Annotator 0,Annotator 3,Omission,1376.0,902.0,2891.0,0.604039,0.322475,0.420474
0,Annotator 0,Annotator 4,Omission,416.0,308.0,868.0,0.574586,0.323988,0.414343
0,Annotator 3,Annotator 4,Omission,215.0,305.0,155.0,0.413462,0.581081,0.483146
0,Annotator 0,Annotator 3,Stereotype,505.0,450.0,216.0,0.528796,0.700416,0.602625
0,Annotator 0,Annotator 4,Stereotype,507.0,522.0,585.0,0.492711,0.464286,0.478076
0,Annotator 3,Annotator 4,Stereotype,34.0,58.0,158.0,0.369565,0.177083,0.239437


In [41]:
# Save the pairwise annotator agreement scores
iaa_path = manual_ann_dir+"labels0C34_iaa_bydesc.csv"
iaa_df.to_csv(iaa_path)

Next, calculate annotators' IAA with the aggregated dataset:

In [46]:
# Separate the input DataFrame's offsets column into 'start' offset and 'end' offset columns of type int
def splitOffsets(df):
    """Return a copy of `df` with integer `start` and `end` columns parsed from
    its `ann_offsets` column.

    Each `ann_offsets` value is a string such as "(18688, 18706)": the first and
    last characters are dropped and the remainder is split on commas. The input
    frame is left untouched (assigning columns on it directly would modify the
    caller's data and, for a filtered slice, raise SettingWithCopyWarning).

    Raises IndexError if a value contains no comma and ValueError if a part is
    not an integer.
    """
    start, end = [], []
    for o in df.ann_offsets:
        pair_list = o[1:-1].split(",")
        start.append(int(pair_list[0]))  # int() tolerates the space after the comma
        end.append(int(pair_list[1]))
    return df.assign(start=start, end=end).astype({"start":int, "end":int})

In [52]:
# Load the aggregated annotations, keep the Omission/Stereotype rows and parse
# their offsets into integer start/end columns
agg = pd.read_csv("../data/aggregated_data/aggregated_final.csv")
is_target_label = agg.label.isin(["Omission", "Stereotype"])
agg = splitOffsets(agg[is_target_label])
agg.head()

Unnamed: 0,agg_ann_id,file,text,ann_offsets,label,category,associated_genders,description_id,start,end
249,249,Coll-1036_00600.ann,"two boys, one girl","(18688, 18706)",Stereotype,Contextual,Multiple,1093,18688,18706
250,250,Coll-1357_00100.ann,daughter of the Rev. J. S. Whale,"(4252, 4284)",Stereotype,Contextual,Masculine,5533,4252,4284
251,251,Coll-1434_12800.ann,his wife,"(4822, 4830)",Stereotype,Contextual,Multiple,12903,4822,4830
252,252,Coll-1036_00600.ann,"two boys, one girl","(18688, 18706)",Omission,Contextual,Multiple,1093,18688,18706
253,253,Coll-1434_12800.ann,farmer's wife,"(4762, 4775)",Omission,Contextual,Feminine,12903,4762,4775


In [53]:
# Files each annotator shares with the aggregated dataset
commonC0agg, commonC3agg, commonC4agg = (
    findCommonFiles(ann_labels, agg) for ann_labels in (labels0C, labels3, labels4)
)

In [54]:
# Restrict each annotator's labels, and the aggregated labels, to their shared files
df0C, agg_with0C = (makeCommonFileLabelOffsetsDf(src, commonC0agg) for src in (labels0C, agg))
df3, agg_with3 = (makeCommonFileLabelOffsetsDf(src, commonC3agg) for src in (labels3, agg))
df4, agg_with4 = (makeCommonFileLabelOffsetsDf(src, commonC4agg) for src in (labels4, agg))

In [55]:
iaa_df = initDF()
labels = ["Omission", "Stereotype"]
# (predicted annotator, aggregated labels, annotator labels, shared files);
# the aggregated dataset is always the expected side
agg_comparisons = [
    ("Annotator 0", agg_with0C, df0C, commonC0agg),
    ("Annotator 3", agg_with3, df3, commonC3agg),
    ("Annotator 4", agg_with4, df4, commonC4agg),
]
for labelname in labels:
    for pred_name, agg_labels, ann_labels, shared_files in agg_comparisons:
        tp_list, fn_list, fp_list = iaaAcrossFiles(agg_labels, ann_labels, shared_files, labelname)
        # Pool the per-file counts before computing the scores
        tp_sum, fn_sum, fp_sum = sum(tp_list), sum(fn_list), sum(fp_list)
        prec, rec, f_1 = IAA(tp_sum, fn_sum, fp_sum)
        iaa_df = appendRow(iaa_df, "Aggregated", pred_name, labelname,
                           tp_sum, fp_sum, fn_sum, prec, rec, f_1)

iaa_df

Unnamed: 0,expected,predicted,label,true_positive,false_positive,false_negative,precision,recall,F_1
0,Aggregated,Annotator 0,Omission,5916.0,12.0,1167.0,0.997976,0.835239,0.909384
0,Aggregated,Annotator 3,Omission,2310.0,13.0,3102.0,0.994404,0.426829,0.597285
0,Aggregated,Annotator 4,Omission,1876.0,5.0,960.0,0.997342,0.661495,0.795421
0,Aggregated,Annotator 0,Stereotype,1748.0,11.0,966.0,0.993746,0.644068,0.781578
0,Aggregated,Annotator 3,Stereotype,1089.0,9.0,266.0,0.991803,0.80369,0.887892
0,Aggregated,Annotator 4,Stereotype,1400.0,2.0,697.0,0.998573,0.66762,0.800229


In [56]:
# Save each annotator's agreement with the aggregated dataset
iaa_withagg_path = manual_ann_dir+"labels0C34_iaa_withagg_bydesc.csv"
iaa_df.to_csv(iaa_withagg_path)