In [1]:
!wget https://raw.githubusercontent.com/andabi/deep-text-corrector/master/data/conll14st-test-data/alt/official-2014.combined-withalt.m2

--2020-03-27 14:58:07--  https://raw.githubusercontent.com/andabi/deep-text-corrector/master/data/conll14st-test-data/alt/official-2014.combined-withalt.m2
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 515159 (503K) [text/plain]
Saving to: ‘official-2014.combined-withalt.m2’


2020-03-27 14:58:08 (2.21 MB/s) - ‘official-2014.combined-withalt.m2’ saved [515159/515159]



In [2]:
pip install multiset

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Read the downloaded corpus into a list of raw lines (trailing newlines kept).
# The encoding is pinned explicitly so the result does not depend on the
# platform's locale default.
with open('official-2014.combined-withalt.m2', 'r', encoding='utf-8') as f:
    corpus_lines = f.readlines()

In [4]:
from difflib import SequenceMatcher

# get edit script in terms of 'insert' and 'delete' operations only
def edit_script(x, y):
    """Return the edit script that turns sequence ``x`` into sequence ``y``.

    The script is expressed with two kinds of operations only:

    * ``('delete', i)``          -- the element at index ``i`` of ``x`` is removed;
    * ``('insert', item, i)``    -- ``item`` (taken from ``y``) is inserted at
      index ``i`` of ``x``.

    Indices always refer to positions in the original ``x``. A 'replace'
    opcode reported by ``SequenceMatcher`` is decomposed into the deletes of
    the replaced ``x`` range followed by the inserts of the ``y`` range, all
    inserts being anchored at the start of the replaced range.

    Args:
        x: source sequence of hashable items.
        y: target sequence of hashable items.

    Returns:
        A list of operation tuples, in the order the opcodes are reported.
    """
    ops = []
    # autojunk=False disables SequenceMatcher's "popular element" heuristic.
    matcher = SequenceMatcher(None, x, y, autojunk=False)
    for tag, x_s, x_e, y_s, y_e in matcher.get_opcodes():
        # 'equal' opcodes match neither test and therefore contribute nothing.
        if tag in ('delete', 'replace'):
            ops.extend(('delete', i) for i in range(x_s, x_e))
        if tag in ('insert', 'replace'):
            ops.extend(('insert', y[i], x_s) for i in range(y_s, y_e))
    return ops

# Sanity check on a toy sentence pair: every substituted token should show up
# as a 'delete' of the old token plus an 'insert' of the new token at the same index.
s1 = ['Do', 'ya', 'like', 'dawgs', '?']
s2 = ['Do', 'you', 'like', 'dogs', '?']

edit_script(s1, s2)

[('delete', 1), ('insert', 'you', 1), ('delete', 3), ('insert', 'dogs', 3)]

In [5]:
class Annotation:
    """One correction proposed by one annotator for a token span.

    Attributes:
        start, end: integer token offsets of the span being replaced.
        category: non-empty label of the correction.
        replacement: replacement text exactly as given (may be empty).
        tokenized_replacement: ``replacement`` split on whitespace.
        annotator: non-empty identifier of the annotator.
    """

    # Class-level defaults; __init__ overwrites every one of them per instance.
    category = None
    start = None
    end = None
    replacement = None
    tokenized_replacement = None
    annotator = None

    def __init__(self, start, end, category, replacement, annotator):
        # Validate the arguments before storing anything.
        assert category
        assert isinstance(start, int)
        assert isinstance(end, int)
        assert replacement is not None
        assert annotator

        self.start, self.end = start, end
        self.category = category
        self.annotator = annotator
        self.replacement = replacement
        # An empty replacement (pure deletion) tokenizes to an empty list.
        self.tokenized_replacement = replacement.split()

    def __str__(self):
        return f"{self.annotator}:[{self.start} {self.end}]|{self.category}|{self.replacement}"

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return str(self)

class AnnotatedSentence:
    """A tokenized sentence together with the annotations made on it.

    Attributes:
        tokens: non-empty list of the sentence's tokens.
        annotations: annotation objects exposing ``start``, ``end``,
            ``category``, ``tokenized_replacement`` and ``annotator``.
    """

    # Class-level defaults; __init__ overwrites both per instance.
    tokens = None
    annotations = None

    def __init__(self, tokens, annotations):
        assert tokens
        self.tokens = tokens
        self.annotations = annotations

    def _apply_annotations(self, annotations):
        """Return a copy of ``self.tokens`` with ``annotations`` applied.

        Annotations are applied in order of their start offset. Processing
        stops at the first annotation whose category is 'noop'.
        """
        result = self.tokens.copy()
        # Running difference between positions in `result` and positions in
        # the original token list, caused by the edits applied so far.
        shift = 0
        ordered = sorted(annotations, key=lambda item: item.start)
        for edit in ordered:
            if edit.category == 'noop':
                break
            replacement = edit.tokenized_replacement
            lo = shift + edit.start
            hi = shift + edit.end
            result[lo:hi] = replacement
            shift += len(replacement) - (edit.end - edit.start)
        return result

    def edit_ops(self, annotations):
        """Return the edit script from the original tokens to the corrected ones."""
        corrected = self._apply_annotations(annotations)
        return edit_script(self.tokens, corrected)

    def annotations_by_annotator(self):
        """Group ``self.annotations`` into ``{annotator: [annotation, ...]}``."""
        grouped = {}
        for item in self.annotations:
            grouped.setdefault(item.annotator, []).append(item)
        return grouped

    def annotations_by_annotator_and_category(self):
        """Group ``self.annotations`` into ``{annotator: {category: [annotation, ...]}}``."""
        grouped = {}
        for item in self.annotations:
            per_category = grouped.setdefault(item.annotator, {})
            per_category.setdefault(item.category, []).append(item)
        return grouped

    def __str__(self):
        sentence = ' '.join(self.tokens)
        annotation_lines = '\n'.join(str(item) for item in self.annotations)
        return f"{sentence}\n{annotation_lines}"

    def __repr__(self):
        # The repr is deliberately the same text as str().
        return str(self)
    
def parse_annotation(line):
    """Build an ``Annotation`` from one annotation line.

    The line must consist of six '|||'-separated fields. The first holds a
    leading marker followed by the start and end offsets; the others are the
    category, the replacement text, two fields that are ignored
    ('REQUIRED' and '-NONE-' in the sample data) and the annotator id.
    """
    span_field, category, replacement, _unused_a, _unused_b, annotator = line.split('|||')
    _marker, start, end = span_field.split()
    return Annotation(int(start), int(end), category, replacement, annotator)

def parse_tokens(line):
    """Split a sentence line on whitespace and drop its first field (the line marker)."""
    fields = line.split()
    return fields[1:]

def parse_sentence_lines(lines):
    """Parse one sentence group: a sentence line followed by its annotation lines.

    Returns the resulting ``AnnotatedSentence`` only when it was annotated by
    more than one annotator; otherwise returns ``None``.
    """
    header, annotation_lines = lines[0], lines[1:]
    sentence = AnnotatedSentence(
        parse_tokens(header),
        [parse_annotation(entry) for entry in annotation_lines],
    )
    if len(sentence.annotations_by_annotator()) <= 1:
        # Nothing to compare with a single annotator (or none at all).
        return None
    return sentence

def parse_corpus_lines(lines):
    """Parse an iterable of corpus lines into a list of annotated sentences.

    Lines are stripped and blank ones are skipped. A line starting with 'S'
    opens a new group; every other line is appended to the current group.
    Each group is handed to ``parse_sentence_lines`` and only truthy results
    are kept.
    """
    groups = []
    current = []
    for raw in lines:
        text = raw.strip()
        if not text:
            continue
        if text.startswith('S') and current:
            # A new sentence begins: close the group collected so far.
            groups.append(current)
            current = []
        current.append(text)
    if current:
        # Flush the last group, which has no following 'S' line to close it.
        groups.append(current)

    candidates = (parse_sentence_lines(group) for group in groups)
    return [sentence for sentence in candidates if sentence]

def parse_corpus(corpus):
    """Parse a corpus given as one multi-line string (see ``parse_corpus_lines``)."""
    corpus_lines = corpus.splitlines()
    return parse_corpus_lines(corpus_lines)

In [6]:
import itertools
from multiset import Multiset

# Agreement between annotators A and B is the number of 'insert'/'delete' operations
# shared by A's and B's annotations (counted once per annotator, i.e. doubled),
# divided by the total number of 'insert'/'delete' operations that A's and B's annotations produce.
def annotators_agreements(corpus):
    """Compute per-sentence agreement for every pair of annotators.

    For each sentence and each pair of its annotators, the two edit scripts
    are compared as multisets: the score is twice the number of shared
    operations divided by the total number of operations of both annotators.
    When neither annotator produced any operation the score is 1.

    Args:
        corpus: iterable of sentences exposing ``annotations_by_annotator()``
            and ``edit_ops(annotations)``.

    Returns:
        ``{frozenset({annotator_a, annotator_b}): [score, ...]}`` with one
        score per sentence annotated by both members of the pair.
    """
    agreements = {}
    for sent in corpus:
        per_annotator = sent.annotations_by_annotator()
        for first, second in itertools.combinations(per_annotator, 2):
            first_ops = Multiset(sent.edit_ops(per_annotator[first]))
            second_ops = Multiset(sent.edit_ops(per_annotator[second]))
            total = len(first_ops) + len(second_ops)
            if total:
                shared = first_ops.intersection(second_ops)
                # Each shared operation occurs once in both scripts, hence "* 2".
                score = len(shared) * 2 / total
            else:
                # Both annotators left the sentence untouched.
                score = 1
            pair = frozenset({first, second})
            agreements.setdefault(pair, []).append(score)
    return agreements

In [7]:
# Example: two annotators correct the same token (index 8, 'secret') differently.
lines = '''S Above all , life is more important than secret .
A 8 9|||Nn|||secrets|||REQUIRED|||-NONE-|||0
A 8 9|||Wform|||secrecy|||REQUIRED|||-NONE-|||1'''

# Both annotators delete the 'secret' token -> their 'delete' operations are the same.
# They replace it with 'secrets' and 'secrecy' respectively -> their 'insert' operations differ.
# Therefore agreement = two matching 'delete's / (two 'delete's + two 'insert's) = 0.5
annotators_agreements(parse_corpus(lines))

{frozenset({'0', '1'}): [0.5]}

In [8]:
# Example: annotator 0 uses two edits (fix the typo, drop 'out') where annotator 1
# uses a single edit over the span 3-5; both produce the same corrected sentence.
lines = '''S When we are diagonosed out with certain genetic disease , are we suppose to disclose this result to our relatives ?
A 3 4|||Mec|||diagnosed|||REQUIRED|||-NONE-|||0
A 4 5|||Prep||||||REQUIRED|||-NONE-|||0
A 3 5|||Rloc-|||diagnosed|||REQUIRED|||-NONE-|||1'''

# The edit scripts are derived from the corrected token sequences, which are identical,
# so all operations match - full agreement.
annotators_agreements(parse_corpus(lines))

{frozenset({'0', '1'}): [1.0]}

In [9]:
# Example: annotator 0 deletes 'into' and 'at'; annotator 1 deletes only 'at'.
lines = '''S He entered into at school .
A 2 4|||Foo||||||REQUIRED|||-NONE-|||0
A 3 4|||Bar||||||REQUIRED|||-NONE-|||1'''

# The 'delete' of the 'at' token is common to both annotators (two matching 'delete's),
# but the 'delete' of 'into' is made by annotator 0 only.
# Therefore agreement = (two 'delete's of 'at') / (two 'delete's of 'at' + one 'delete' of 'into') = 0.66
annotators_agreements(parse_corpus(lines))

{frozenset({'0', '1'}): [0.6666666666666666]}

In [10]:
# Example: annotator 0 replaces 'entered into at' with 'enters' (three 'delete's + one 'insert');
# annotator 1 only deletes 'into' (one 'delete').
lines = '''S He entered into at school .
A 1 4|||Foo|||enters|||REQUIRED|||-NONE-|||0
A 2 3|||Bar||||||REQUIRED|||-NONE-|||1'''

# The 'delete' of 'into' appears in both scripts, so 2 out of the 5 operations in total are the same.
annotators_agreements(parse_corpus(lines))

{frozenset({'0', '1'}): [0.4]}

In [11]:
# Example with five annotators (ids 0-4) on a single sentence:
# a score is produced for each of the ten annotator pairs.
lines = '''S If certain disease genetic test is very accurate and it is unavoidable and necessary to get treatment and known by others , it is OK to disclose the result .
A 1 1|||ArtOrDet|||a|||REQUIRED|||-NONE-|||0
A 2 3|||Rloc-||||||REQUIRED|||-NONE-|||0
A 18 19|||Wci|||tell|||REQUIRED|||-NONE-|||0
A 19 20|||Prep||||||REQUIRED|||-NONE-|||0
A 1 1|||ArtOrDet|||a|||REQUIRED|||-NONE-|||1
A 2 5|||WOinc|||genetic test for a disease|||REQUIRED|||-NONE-|||1
A 18 18|||Spar|||to make it|||REQUIRED|||-NONE-|||1
A 19 20|||Prep|||to|||REQUIRED|||-NONE-|||1
A 1 1|||ArtOrDet|||a|||REQUIRED|||-NONE-|||2
A 2 3|||Rloc-||||||REQUIRED|||-NONE-|||2
A 18 18|||Spar|||to make it|||REQUIRED|||-NONE-|||2
A 19 20|||Prep|||to|||REQUIRED|||-NONE-|||2
A 1 1|||ArtOrDet|||a|||REQUIRED|||-NONE-|||3
A 2 3|||Rloc-||||||REQUIRED|||-NONE-|||3
A 18 19|||Wci|||tell|||REQUIRED|||-NONE-|||3
A 19 20|||Prep||||||REQUIRED|||-NONE-|||3
A 27 30|||WOinc|||results|||REQUIRED|||-NONE-|||3
A 1 1|||ArtOrDet|||a|||REQUIRED|||-NONE-|||4
A 2 3|||Rloc-||||||REQUIRED|||-NONE-|||4
A 18 19|||Wci|||tell|||REQUIRED|||-NONE-|||4
A 19 20|||Prep||||||REQUIRED|||-NONE-|||4
A 28 29|||Nn|||results|||REQUIRED|||-NONE-|||4'''

annotators_agreements(parse_corpus(lines))

{frozenset({'0', '1'}): [0.4],
 frozenset({'0', '2'}): [0.5],
 frozenset({'0', '3'}): [0.7142857142857143],
 frozenset({'0', '4'}): [0.8333333333333334],
 frozenset({'1', '2'}): [0.8235294117647058],
 frozenset({'1', '3'}): [0.3157894736842105],
 frozenset({'1', '4'}): [0.35294117647058826],
 frozenset({'2', '3'}): [0.375],
 frozenset({'2', '4'}): [0.42857142857142855],
 frozenset({'3', '4'}): [0.75]}

In [12]:
from statistics import mean

# Score the whole corpus once, then derive both summaries from that result.
parsed_corpus = parse_corpus_lines(corpus_lines)
pair_agreements = annotators_agreements(parsed_corpus)

# Mean per-sentence agreement for every annotator pair.
anntr_agr = {pair: mean(scores) for pair, scores in pair_agreements.items()}

# Every per-sentence score of every pair, pooled into one flat list.
all_agrs = [score for scores in pair_agreements.values() for score in scores]

In [13]:
# Mean per-sentence agreement for each annotator pair over the whole corpus.
anntr_agr

{frozenset({'0', '1'}): 0.30903002378094613,
 frozenset({'0', '2'}): 0.40142757985057864,
 frozenset({'1', '2'}): 0.47211829964379404,
 frozenset({'0', '3'}): 0.4705591944591578,
 frozenset({'0', '4'}): 0.6320067388178843,
 frozenset({'1', '3'}): 0.581600891527528,
 frozenset({'1', '4'}): 0.5627751333633687,
 frozenset({'2', '3'}): 0.5319040640999786,
 frozenset({'2', '4'}): 0.46823104431800083,
 frozenset({'3', '4'}): 0.45396825396825397}

In [14]:
# Mean of all per-sentence pairwise scores, pooled across annotator pairs.
mean(all_agrs)

0.3747484216311601

In [15]:
def annotator_agreements_by_cat(corpus):
    """Compute per-sentence agreement for every annotator pair, split by category.

    For each sentence, each pair of its annotators and each category used by
    at least one of the two:

    * if only one of them used the category, the score is 0;
    * otherwise the edit scripts built from just that category's annotations
      are compared as multisets: twice the number of shared operations
      divided by the total number of operations (1 when there are none).

    Args:
        corpus: iterable of sentences exposing
            ``annotations_by_annotator_and_category()`` and
            ``edit_ops(annotations)``.

    Returns:
        ``{(category, frozenset({annotator_a, annotator_b})): [score, ...]}``.
    """
    agreements = {}
    for sent in corpus:
        grouped = sent.annotations_by_annotator_and_category()
        for first, second in itertools.combinations(grouped, 2):
            first_cats, second_cats = grouped[first], grouped[second]
            pair = frozenset({first, second})
            for cat in set([*first_cats, *second_cats]):
                first_anns = first_cats.get(cat)
                second_anns = second_cats.get(cat)
                if first_anns is None or second_anns is None:
                    # The category was used by one annotator only.
                    score = 0
                else:
                    first_ops = Multiset(sent.edit_ops(first_anns))
                    second_ops = Multiset(sent.edit_ops(second_anns))
                    total = len(first_ops) + len(second_ops)
                    if total:
                        shared = first_ops.intersection(second_ops)
                        # Each shared operation occurs once in both scripts, hence "* 2".
                        score = len(shared) * 2 / total
                    else:
                        score = 1
                agreements.setdefault((cat, pair), []).append(score)
    return agreements

In [16]:
# Example: both annotators use the category 'WOinc', but on different spans (2-5 vs 27-30),
# so none of their edit operations coincide and the agreement for the category is 0.
lines = '''S If certain disease genetic test is very accurate and it is unavoidable and necessary to get treatment and known by others , it is OK to disclose the result .
A 2 5|||WOinc|||genetic test for a disease|||REQUIRED|||-NONE-|||1
A 27 30|||WOinc|||results|||REQUIRED|||-NONE-|||3'''

annotator_agreements_by_cat(parse_corpus(lines))

{('WOinc', frozenset({'1', '3'})): [0.0]}

In [17]:
# Example: same category for both annotators. Annotator 1 deletes tokens 2 and 3,
# annotator 3 deletes token 3 only -> one shared 'delete' out of three operations: 2 * 1 / 3.
lines = '''S He entered into at school.
A 2 4|||Foo||||||REQUIRED|||-NONE-|||1
A 3 4|||Foo||||||REQUIRED|||-NONE-|||3'''

annotator_agreements_by_cat(parse_corpus(lines))

{('Foo', frozenset({'1', '3'})): [0.6666666666666666]}

In [18]:
# Example: the annotators use different categories. Each category is present for
# only one annotator of the pair, so it scores 0 without the edits being compared.
lines = '''S He entered into at school.
A 2 4|||Foo||||||REQUIRED|||-NONE-|||1
A 3 4|||Bar||||||REQUIRED|||-NONE-|||3'''

annotator_agreements_by_cat(parse_corpus(lines))

{('Foo', frozenset({'1', '3'})): [0], ('Bar', frozenset({'1', '3'})): [0]}

In [19]:
# Pool the per-sentence scores of every annotator pair under their category.
scores_per_cat = {}
for (cat, _pair), scores in annotator_agreements_by_cat(parsed_corpus).items():
    scores_per_cat.setdefault(cat, []).extend(scores)

# Collapse each category's pooled scores into their mean.
agr_by_cat = {cat: mean(scores) for cat, scores in scores_per_cat.items()}

agr_by_cat

{'noop': 0,
 'WOadv': 0.16037735849056603,
 'Vt': 0.32094749694749697,
 'Mec': 0.2685179520907307,
 'ArtOrDet': 0.31416750391109366,
 'Prep': 0.3293407731490288,
 'Nn': 0.3009072249589491,
 'Rloc-': 0.1503034319950808,
 'Wform': 0.33197278911564626,
 'Wci': 0.2249104216774839,
 'Vform': 0.27911458333333333,
 'Vm': 0.1808875931324911,
 'Others': 0.16039787184825352,
 'Trans': 0.20051239209133945,
 'Ssub': 0.21583695496738975,
 'V0': 0.24915824915824916,
 'WOinc': 0.1364071038251366,
 'Npos': 0.23950617283950618,
 'Spar': 0.1111111111111111,
 'Pref': 0.17404040404040405,
 'SVA': 0.3983589743589744,
 'Srun': 0.20082815734989648,
 'Pform': 0.23484848484848486,
 'Um': 0.15165559078602556,
 'Sfrag': 0.1,
 'Smod': 0.125,
 'Wtone': 0.08333333333333333,
 'Wa': 0.2}