# Brat to Brat conversion

Converts a Brat dataset into another Brat dataset by converting a set of entity tags and attributes to another set.

Relations and events are not considered: they are not copied into the converted dataset.

In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
from os.path import isfile, isdir, join
from os import makedirs, walk, listdir
from shutil import rmtree, copy
import re
from random import randint

from brat_conversion_schemas import BRAT_CONVERSION_SCHEMAS

# Directory containing the original Brat annotations
INDIR = '../../data/release/train2021'

# Name of the conversion schema to be used (among those listed in BRAT_CONVERSION_SCHEMAS)
CONVERSION_SCHEMA = 'sosy_and_pathologies_with_aggregated_attributes'

# Directory containing the output (Brat annotations after mapping)
OUTDIR = f'../../data/work/brat_mapping/{CONVERSION_SCHEMA}'

# Validate the input location with an explicit exception: an `assert` is
# stripped when Python runs with -O, and the output directory would then be
# wiped even though there is nothing to convert.
if not isdir(INDIR):
    raise FileNotFoundError(f'Input directory does not exist: {INDIR}')

# Always start from an empty output directory (any previous result is deleted).
if isdir(OUTDIR):
    rmtree(OUTDIR)
makedirs(OUTDIR)

In [14]:
# Resolve the selected schema, failing early with a readable message when the
# configured name is not one of the registered conversion schemas.
schema_is_known = CONVERSION_SCHEMA in BRAT_CONVERSION_SCHEMAS
if not schema_is_known:
    raise KeyError(f'Unknown schema {CONVERSION_SCHEMA}')

conversion_schema = BRAT_CONVERSION_SCHEMAS[CONVERSION_SCHEMA]

In [15]:
conversion_schema

{'name': 'sosy_and_pathologies_with_aggregated_attributes',
 'desc': '"sosy" and "pathology" are NOT grouped together into "sosypath", they are left alone, and each"assertion" attribute leads to an entity sosy_nonfactual and pathology_nonfactual. The result is "sosy" (no assertion attribute), "sosy_nonfactual","pathologie" (no assertion attribute), "pathologie_nonfactual"',
 'skip_if_absent': True,
 'mapping': {('sosy', '', ''): ('sosy', '', ''),
  ('sosy', 'assertion', 'absent'): ('sosy_absent', '', ''),
  ('sosy', 'assertion', 'hypothétique'): ('sosy_hypothetique', '', ''),
  ('sosy', 'assertion', 'non-associé'): ('sosy_non_associe', '', ''),
  ('pathologie', '', ''): ('pathologie', '', ''),
  ('pathologie', 'assertion', 'absent'): ('pathologie_absent', '', ''),
  ('pathologie', 'assertion', 'hypothétique'): ('pathologie_hypothetique',
   '',
   ''),
  ('pathologie', 'assertion', 'non-associé'): ('pathologie_non_associe',
   '',
   ''),
  ('sosy', '*', '*'): ('sosy', '', ''),
  ('pat

In [16]:
# Conversion table of the selected schema. Keys and values are
# (entity type, attribute key, attribute value) triplets; the conversion loop
# below treats '*' as a wildcard and '' as "no attribute".
mapping = conversion_schema['mapping']

In [17]:
# Raw strings are used so that regex escapes such as \d reach the `re` module
# untouched (in a normal string literal, \d is an invalid escape sequence).

# Entity line: "<T id>\t<type> <offsets>\t<covered text>"
ENTITY_REGEX = re.compile(r'^(T\d+)\t([^ ]+) ([^\t]+)\t(.*)$')
# Attribute line: "<A id>\t<key> <T id>[ <value>]"; the value group is the
# empty string when the line carries no value.
ATTRIBUTE_REGEX = re.compile(r'(A\d+)\t([^ ]+) (T\d+) ?(.*)$')

In [18]:
from functools import cmp_to_key
def sort_triplets(x, y):
    """Three-way comparator ordering mapping keys from most to least specific.

    Intended for use with ``functools.cmp_to_key``. The tuples are compared
    position by position; at the first position where they differ, the
    wildcard ``'*'`` sorts after any concrete value, and two concrete values
    are ordered alphabetically so that the result is deterministic.

    Works for tuples of any length (triplets as well as the (key, value)
    pairs sorted by the conversion loop); only the common prefix is compared.

    Args:
        x: first tuple of strings.
        y: second tuple of strings.

    Returns:
        -1 if ``x`` sorts before ``y``, 1 if it sorts after, 0 if equivalent.
    """
    for left, right in zip(x, y):
        if left == right:
            # Same value (including two wildcards): decided by a later position.
            continue
        if left == '*':
            return 1
        if right == '*':
            return -1
        # A comparator must return a signed int, not the boolean `left < right`.
        return -1 if left < right else 1
    return 0
            


In [19]:
def _format_attribute_line(a_id, a_key, t_id, a_value):
    """Return one Brat attribute line (no trailing space when the value is empty)."""
    value_suffix = f' {a_value}' if a_value else ''
    return f'{a_id}\t{a_key} {t_id}{value_suffix}\n'


# Convert every "*.ann" file of INDIR: the matching text file is copied as is,
# and the entities/attributes are rewritten into OUTDIR according to `mapping`.
for filename in listdir(INDIR):
    if not filename.endswith('.ann'):
        continue
    annpath = join(INDIR, filename)
    txtpath = join(INDIR, filename[:-4] + '.txt')
    # copy text file
    if not isfile(txtpath):
        raise FileNotFoundError(f'Missing text file for {annpath}: {txtpath}')
    copy(txtpath, OUTDIR)
    outannpath = join(OUTDIR, filename)

    entities = {}           # entity id -> (type, offsets, text)
    attributes = {}         # entity id -> list of (attribute id, key, value)
    attribute_ids = set()   # every attribute id already used in this file

    # Read Brat annotation informations. Lines matching neither pattern
    # (e.g. relations, events) are ignored.
    with open(annpath, 'r', encoding='utf-8') as f_ann:
        for line in f_ann:
            line = line.strip()
            # parse entity
            entity_match = ENTITY_REGEX.match(line)
            if entity_match is not None:
                t_id, t_type, t_offsets, t_text = entity_match.groups()
                entities[t_id] = (t_type, t_offsets, t_text)
                continue
            # parse attribute
            attribute_match = ATTRIBUTE_REGEX.match(line)
            if attribute_match is not None:
                # The value group always exists; it is '' for attributes
                # written without a value.
                a_id, a_key, t_id, a_value = attribute_match.groups()
                attributes.setdefault(t_id, []).append((a_id, a_key, a_value))
                attribute_ids.add(a_id)

    # Counter used to allocate ids for newly built attributes: deterministic
    # (reproducible output) and never colliding with an existing or
    # previously generated id.
    next_attribute_number = 1

    # Parse the results and convert
    with open(outannpath, 'w', encoding='utf-8') as f_out:
        for t_id, (t_type, t_offset, t_text) in entities.items():
            entity_attributes = attributes.get(t_id, [])
            # Candidate (attribute key, attribute value) patterns to look up.
            if entity_attributes:
                attribute_loop = [('*', '*')]
                for _, a_key, a_value in entity_attributes:
                    attribute_loop.append((a_key, '*'))
                    attribute_loop.append((a_key, a_value))
            else:
                # if the entity has no attribute, it can be mapped to
                # "no attribute" or to any attribute
                attribute_loop = [('', ''), ('*', '*')]
            # Drop duplicates (repeated keys/values add nothing to the lookup),
            # then sort from most specific to less specific.
            attribute_loop = sorted(
                dict.fromkeys(attribute_loop),
                key=cmp_to_key(sort_triplets),
            )

            found = None
            # the entity type can be mapped to its name or to "*"
            # (starting by the most specific)
            for t in (t_type, '*'):
                for (k, v) in attribute_loop:
                    if (t, k, v) in mapping:
                        found = (t, k, v), mapping[(t, k, v)]
                        break
                if found is not None:
                    break

            # Entities without any matching rule are left out of the output.
            if found is None:
                continue

            # to be converted!
            (_, old_k, old_v), (new_t, new_k, new_v) = found
            # None in the target triplet means "keep the matched value".
            if new_t is None:
                new_t = t_type
            if new_k is None:
                new_k = old_k
            if new_v is None:
                new_v = old_v
            f_out.write(f'{t_id}\t{new_t} {t_offset}\t{t_text}\n')

            if new_k == '':
                # the converted entity carries no attribute
                continue
            if new_k == '*':
                # reproduce all attributes
                for (a_id, a_key, a_value) in entity_attributes:
                    f_out.write(_format_attribute_line(a_id, a_key, t_id, a_value))
            elif new_v == '*':
                # reproduce only the attributes having the target key
                for (a_id, a_key, a_value) in entity_attributes:
                    if a_key == new_k:
                        f_out.write(_format_attribute_line(a_id, a_key, t_id, a_value))
            else:
                # build new attribute with an unused id
                while f'A{next_attribute_number}' in attribute_ids:
                    next_attribute_number += 1
                new_a_id = f'A{next_attribute_number}'
                attribute_ids.add(new_a_id)
                f_out.write(_format_attribute_line(new_a_id, new_k, t_id, new_v))

# Write description as a README (once, not once per annotation file)
with open(join(OUTDIR, 'README'), 'w', encoding='utf-8') as f_out:
    f_out.write('# ' + conversion_schema['name'] + '\n\n')
    f_out.write(conversion_schema['desc'])

In [20]:
OUTDIR

'../../data/work/brat_mapping/sosy_and_pathologies_with_aggregated_attributes'