In [11]:
from collections import defaultdict
import pandas as pd

In [3]:
# Relative path of the .m2 annotation file that is handed to AnnotationsParser.
data_filename = './data/official-2014.combined-withalt.m2'

In [4]:
class AnnotationsParser:
    """Parse an .m2 annotation file into per-sentence lists of annotations.

    Sentence lines start with ``'S '``; the annotation lines that belong to a
    sentence follow it and start with ``'A '``, with fields separated by
    ``'|||'``. All other lines are ignored.
    """

    def __init__(self, filename):
        """Store the path of the annotation file; nothing is read yet.

        Args:
            filename: path of the file that ``get_annotations`` will open.
        """
        self.filename = filename

    def get_annotations(self):
        """Read the file and group the parsed annotations by sentence.

        Returns:
            dict: maps each sentence line (leading ``'S '`` included) to the
            list of annotation dicts parsed from the ``'A '`` lines that
            follow it, in file order. Only sentences with more than one
            annotation are kept. Identical sentence lines share one entry.
        """
        annotations_dict = defaultdict(list)
        current_sentence = None

        # Decode explicitly so the result does not depend on the platform's
        # default encoding (assumes the file is UTF-8 -- confirm for other data).
        with open(self.filename, encoding='utf-8') as f:
            # Stream the file instead of materialising every line up front.
            for raw_line in f:
                line = raw_line.strip()
                if line.startswith('S '):
                    current_sentence = line
                elif line.startswith('A ') and current_sentence is not None:
                    # Annotations that precede the first sentence have nothing
                    # to attach to; skip them rather than file them under None.
                    annotations_dict[current_sentence].append(self._parse_annotation(line))

        return {
            sentence: annotations
            for sentence, annotations in annotations_dict.items()
            if len(annotations) > 1
        }

    def _parse_annotation(self, line):
        """Turn one ``'A '`` line into a dict.

        Example input:
            'A 25 27|||WOadv|||will definitely|||REQUIRED|||-NONE-|||1'

        Args:
            line: a stripped annotation line, including its ``'A '`` prefix.

        Returns:
            dict with keys ``'start'`` and ``'end'`` (ints from the first
            field), ``'error_tag'`` (second field), ``'correction'`` (third
            field, or ``None`` when it is ``'-NONE-'``) and ``'annotator_id'``
            (int from the sixth field). The fourth and fifth fields are unused.

        Raises:
            ValueError: if the span or the annotator id is not an integer, or
                the span does not consist of exactly two numbers.
            IndexError: if the line has fewer than six ``'|||'``-separated fields.
        """
        fields = line[2:].split('|||')
        # split() without an argument tolerates repeated spaces in the span.
        start, end = (int(offset) for offset in fields[0].split())
        correction = fields[2]
        return {
            'start': start,
            'end': end,
            'error_tag': fields[1],
            'correction': None if correction == '-NONE-' else correction,
            'annotator_id': int(fields[5])
        }
        

In [5]:
# Parse the annotation file once; the resulting dict is reused by later cells.
parser = AnnotationsParser(data_filename)
annotations_dict = parser.get_annotations()

# Peek at the sixth sentence key (iterating a dict yields its keys).
list(annotations_dict)[5]

'S And both are not what we want since most of us just want to live as normal people .'

In [7]:
# Show every annotation recorded for the example sentence key displayed above.
annotations_dict['S And both are not what we want since most of us just want to live as normal people .']

[{'start': -1,
  'end': -1,
  'error_tag': 'noop',
  'correction': None,
  'annotator_id': 0},
 {'start': 7,
  'end': 7,
  'error_tag': 'Mec',
  'correction': ',',
  'annotator_id': 1}]

In [14]:
def get_annotator_ids(annotations_dict):
    """Return the distinct annotator ids found in ``annotations_dict``.

    Args:
        annotations_dict: mapping whose values are iterables of annotation
            dicts, each carrying an ``'annotator_id'`` key. The keys are not
            inspected.

    Returns:
        list: the unique annotator ids in ascending order, so the result is
        stable across runs instead of following set iteration order. Empty
        when there are no annotations.
    """
    return sorted({
        annotation['annotator_id']
        for annotations in annotations_dict.values()
        for annotation in annotations
    })

# Distinct annotator ids that occur in the parsed annotations.
annotator_ids = get_annotator_ids(annotations_dict)