In [157]:
from itertools import combinations
from collections import defaultdict, Counter, namedtuple

# A corpus sentence: its text plus the (mutable) list of annotations attached to it.
Sentence = namedtuple('Sentence', ['text', 'anns'])
# A single annotation: (start, end) span, error category, proposed correction
# and the id of the annotator who made it.
Annotation = namedtuple('Annotation', ['span', 'error_type', 'correction', 'ann_id'])

# TODO: figure out how to handle partial agreement
def agree_by_correction(ann1, ann2):
    """Tell whether two annotations agree exactly.

    Two annotations agree when they cover the same span and propose the
    same correction; the error type is not compared.
    """
    if not (ann1.span == ann2.span):
        return False
    return ann1.correction == ann2.correction

def span_agreements(anns):
    """Score pairwise annotator agreement over the annotations of one sentence.

    For every pair of annotator ids present in ``anns`` the score is
    ``2 * matches / (len(anns1) + len(anns2))``, where ``matches`` counts the
    (ann1, ann2) combinations accepted by ``agree_by_correction``.

    Args:
        anns: iterable of annotations exposing ``ann_id`` (plus the fields
            read by ``agree_by_correction``). It is consumed only once, so a
            generator works too. Annotator ids must be mutually orderable.

    Returns:
        A list of ``((id1, id2), score)`` tuples with ``id1 < id2``, ordered
        by annotator pair. Empty when fewer than two annotators are present.
    """
    # Group once instead of re-filtering the whole list for every pair.
    by_annotator = defaultdict(list)
    for ann in anns:
        by_annotator[ann.ann_id].append(ann)

    result = []

    # Iterate ids in sorted order: set/hash iteration order is not guaranteed,
    # and sorted input makes every produced pair already ordered (id1 < id2).
    for id1, id2 in combinations(sorted(by_annotator), 2):
        anns1 = by_annotator[id1]
        anns2 = by_annotator[id2]

        matches = sum(1 for ann1 in anns1
                        for ann2 in anns2
                        if agree_by_correction(ann1, ann2))

        # Both lists are non-empty (each id comes from at least one
        # annotation), so the denominator is never zero.
        score = (2 * matches) / (len(anns1) + len(anns2))

        result.append(((id1, id2), score))

    return result

In [153]:
# Case: two annotators (0 and 1) with two annotations each; they agree on exactly one of them.
# S From a practitioner 's perspective these findings are important because if lay constructs of the family and kinship are a social construct they may not be in line with geneticists ' views of family relationships , or about which blood ( or non-blood ) relatives should be informed and by whom ; this is also likely to be dependent on the cultural and ethnic context .
# A 5 5|||Mec|||,|||REQUIRED|||-NONE-|||0
# A 22 22|||Mec|||,|||REQUIRED|||-NONE-|||0
# A 10 10|||Mec|||,|||REQUIRED|||-NONE-|||1
# A 22 22|||Mec|||,|||REQUIRED|||-NONE-|||1
anns = [
    Annotation(span=(5, 5), error_type='Mec', correction=',', ann_id=0),
    Annotation(span=(22, 22), error_type='Mec', correction=',', ann_id=0),
    Annotation(span=(10, 10), error_type='Mec', correction=',', ann_id=1),
    Annotation(span=(22, 22), error_type='Mec', correction=',', ann_id=1),
]
# One shared annotation out of 2 + 2 gives 2 * 1 / 4.
expected = [((0, 1), 0.5)]
result = span_agreements(anns)
print(result)
assert result == expected

[((0, 1), 0.5)]


In [147]:
def error_agreements(anns):
    """Score error-type agreement for annotations that share a span.

    For every pair of annotators, each pair of annotations with identical
    spans yields two entries, one per annotation: ``(error_type, 1)`` when
    both annotators chose the same error type, ``(error_type, 0)`` otherwise.

    Args:
        anns: iterable of annotations exposing ``ann_id``, ``span`` and
            ``error_type``. It is consumed only once, so a generator works
            too. Annotator ids must be mutually orderable.

    Returns:
        A flat list of ``(error_type, score)`` tuples, ordered by annotator
        pair and then by the annotations' original order.
    """
    # Group once instead of re-filtering the whole list for every pair.
    by_annotator = defaultdict(list)
    for ann in anns:
        by_annotator[ann.ann_id].append(ann)

    result = []

    # Sorted ids keep the output order independent of set/hash iteration order.
    for id1, id2 in combinations(sorted(by_annotator), 2):
        for ann1 in by_annotator[id1]:
            for ann2 in by_annotator[id2]:
                if ann1.span != ann2.span:
                    continue

                score = 1 if ann1.error_type == ann2.error_type else 0
                # Credit (or penalise) both labels so that each error type
                # records the outcome of every comparison it took part in.
                result.append((ann1.error_type, score))
                result.append((ann2.error_type, score))

    return result

In [148]:
# Case: two annotators (0 and 1), one annotation each on the same span, different error types.
# S Here I want to share Forest'view on this issue .
# A 5 6|||Npos|||Forests'sview|||REQUIRED|||-NONE-|||0
# A 5 6|||Mec|||Forest 's view|||REQUIRED|||-NONE-|||1
anns = [
    Annotation(span=(5, 6), error_type='Npos', correction='Forests\'sview', ann_id=0),
    Annotation(span=(5, 6), error_type='Mec', correction='Forest \'s view', ann_id=1),
]
# Same span, different labels: both error types are recorded with score 0.
expected = [('Npos', 0), ('Mec', 0)]
result = error_agreements(anns)
print(result)
assert result == expected

[('Npos', 0), ('Mec', 0)]


In [149]:
# Case: three annotators. 0 and 1 have two annotations each and agree by type on
# span (10, 11); annotator 2 labels that same span with a different type.
# S Besides that , the risk of the known genetic is very serious that it can not be described .
# A 9 9|||Wci|||disease|||REQUIRED|||-NONE-|||0
# A 10 11|||Wci|||so|||REQUIRED|||-NONE-|||0
# A 8 9|||Others|||disorder|||REQUIRED|||-NONE-|||1
# A 10 11|||Wci|||so|||REQUIRED|||-NONE-|||1
# NOTE(review): annotator 2 below has no matching "A" line in the excerpt above —
# presumably added by hand to exercise the three-annotator path; confirm.
anns = [
    Annotation(span=(9, 9), error_type='Wci', correction='disease', ann_id=0),
    Annotation(span=(10, 11), error_type='Wci', correction='so', ann_id=0),
    Annotation(span=(8, 9), error_type='Others', correction='disorder', ann_id=1),
    Annotation(span=(10, 11), error_type='Wci', correction='so', ann_id=1),
    Annotation(span=(10, 11), error_type='Wtone', correction='so', ann_id=2),
]
# Pairs (0, 1), (0, 2), (1, 2) each contribute two entries for span (10, 11).
expected = [
    ('Wci', 1), ('Wci', 1),
    ('Wci', 0), ('Wtone', 0),
    ('Wci', 0), ('Wtone', 0),
]
result = error_agreements(anns)
print(result)
assert result == expected

[('Wci', 1), ('Wci', 1), ('Wci', 0), ('Wtone', 0), ('Wci', 0), ('Wtone', 0)]


In [159]:
def process(data, agreement_fn):
    """Accumulate agreement scores over every sentence in ``data``.

    ``agreement_fn`` is called with each sentence's ``anns`` and must return
    an iterable of ``(key, score)`` pairs.

    Returns a ``defaultdict`` mapping each key to a ``Counter`` holding the
    summed scores under ``'score'`` and the number of observations under
    ``'total'``.
    """
    totals = defaultdict(Counter)

    for sentence in data:
        for key, score in agreement_fn(sentence.anns):
            tally = totals[key]
            tally['score'] += score
            tally['total'] += 1

    return totals

def print_avg(scores):
    """Print the unweighted mean of the per-key average scores.

    Every key contributes its own mean (``score / total``) with equal weight,
    no matter how many observations it accumulated.

    Args:
        scores: mapping of key -> mapping with ``'score'`` and ``'total'``
            entries.
    """
    if not scores:
        # Nothing was scored: report that instead of dividing by zero.
        print('Avg agreement: n/a')
        return

    per_key = [entry['score'] / entry['total'] for entry in scores.values()]
    avg = sum(per_key) / len(per_key)
    print('Avg agreement: {:.2f}'.format(avg))

def print_scores(scores):
    """Print one line per key: the key, its mean score and its observation count."""
    row = '{} {:.2f} | {}'
    for key, result in scores.items():
        mean = result['score'] / result['total']
        print(row.format(key, mean, result['total']))
    
def make_ann(text):
    """Build an Annotation from a '|||'-separated annotation record.

    The record must have exactly six fields: span (space-separated integers),
    error type, correction, two fields that are ignored, and the annotator id.
    """
    span_field, error_type, correction, _, _, annotator = text.split('|||')
    bounds = tuple(int(tok) for tok in span_field.split(' '))
    return Annotation(span=bounds,
                      error_type=error_type,
                      correction=correction,
                      ann_id=int(annotator))

# Parse the corpus: an "S" line opens a new sentence and every following "A"
# line adds one annotation to it.
data = []

# Decode explicitly so parsing does not depend on the platform's default
# encoding. NOTE(review): assumes the corpus file is UTF-8 — confirm.
with open('official-2014.combined-withalt.m2', encoding='utf-8') as lines:
    for line in lines:
        line = line.strip()

        if line.startswith('S'):
            # Drop the two-character "S " prefix.
            s = Sentence(line[2:], [])
            data.append(s)
        elif line.startswith('A'):
            # Attach to the most recently opened sentence.
            s.anns.append(make_ann(line[2:]))

# The file is closed at this point; the report only needs the parsed sentences.
print('General inter-annotator agreement:')
span_scores = process(data, span_agreements)
print_avg(span_scores)
print_scores(span_scores)

print()

print('Inter-annotator agreement by error:')
error_scores = process(data, error_agreements)
print_scores(error_scores)

General inter-annotator agreement:
Avg agreement: 0.43
(0, 1) 0.23 | 1128
(0, 2) 0.35 | 349
(1, 2) 0.40 | 349
(0, 3) 0.42 | 59
(0, 4) 0.59 | 6
(1, 3) 0.49 | 59
(1, 4) 0.48 | 6
(2, 3) 0.47 | 59
(2, 4) 0.44 | 6
(3, 4) 0.47 | 6

Inter-annotator agreement by error:
Mec 0.85 | 694
Prep 0.95 | 658
ArtOrDet 0.92 | 894
Vt 0.79 | 354
Wci 0.81 | 846
Wform 0.75 | 222
Vform 0.71 | 270
Vm 0.64 | 94
Others 0.71 | 68
Ssub 0.62 | 97
Rloc- 0.61 | 270
WOinc 0.75 | 48
WOadv 0.89 | 18
Npos 0.79 | 66
Spar 0.46 | 26
Nn 0.94 | 440
SVA 0.88 | 326
Trans 0.78 | 174
Pref 0.65 | 203
Srun 0.51 | 35
Pform 0.60 | 90
Um 0.55 | 44
Sfrag 0.33 | 12
Wtone 0.59 | 17
V0 0.76 | 63
Wa 0.50 | 4
Smod 0.67 | 3


## Спостереження і Висновки Щодо Якості Анотування в Корпусі та Рівня Згоди Анотувальників.
- Чим більше анотацій зробили обидва анотувальники, тим нижчий між ними рівень згоди,
- анотувальники часто не погоджуються щодо точних проміжків, які містять помилку (порахувати),
- анотувальники по-різному розуміють, що є помилкою та як її можна виправити,
- помилки можуть бути пояснені декількома способами (додати прикладів; спробувати порахувати),
- існують помилки, категоризація яких є неоднозначною (додати прикладів; спробувати порахувати),
- є помилки, категоризація яких є неоднозначною (можливо визначити декілька класів),
- рівень згоди між анотувальниками і потенційні проблеми при анотуванні викликають сумніви щодо якості анотування в корпусі.