## Demo on how to generate data from BRAT or BIO format for biaffine training

- The default data format is shown below. Only `en_text`, `en_type`, and the start/end indexes are read; you can append other information to each entity after these elements for post-processing.
```
[
    {"tokens": [xx, xx, xx, ...], 
    "entities": [[en_text, en_type, (start_idx, end_idx)], [en_text, en_type, (start_idx, end_idx)], ...]
    },
    ...
]
```
- Note that `start_idx` and `end_idx` are token indexes into `tokens` (both inclusive), not absolute positions in the original document.

In [18]:
from pathlib import Path
import logging
from collections import defaultdict
import warnings
from sklearn.model_selection import train_test_split
import json

In [19]:
import sys

# get NLPpreprocessing from https://github.com/uf-hobi-informatics-lab/NLPreprocessing
sys.path.append("./NLPpreprocessing/")
sys.path.append("./NLPpreprocessing/text_process/")

In [20]:
from annotation2BIO import pre_processing, read_annotation_brat
from annotation2BIO import logger as l1
from sentence_tokenization import logger as l2
l1.setLevel(logging.ERROR)
l2.setLevel(logging.ERROR)

In [21]:
def json_dump(data, fn):
    """Write ``data`` to the file ``fn`` as JSON, replacing any existing content."""
    with open(fn, "w") as out_handle:
        json.dump(obj=data, fp=out_handle)


def get_sent_bound(sents):
    """Map each sentence index to the (start, end) offsets the sentence spans.

    Each sentence is a sequence of tokens where ``token[1]`` is an indexable
    ``(start, end)`` pair; the sentence range runs from the first token's
    start to the last token's end.

    Args:
        sents: list of tokenized sentences.

    Returns:
        dict mapping sentence index -> (start_offset, end_offset). An empty
        trailing sentence is left out of the mapping.

    Raises:
        RuntimeError: if a sentence other than the last one is empty.
    """
    sent_bound_range = dict()  # key: sent id; value: boundary range
    last_idx = len(sents) - 1
    for i, each in enumerate(sents):
        try:
            sent_bound_range[i] = (each[0][1][0], each[-1][1][1])
        except (IndexError, TypeError) as ex:
            # an empty sentence is only tolerated at the very end of the document
            if i != last_idx:
                raise RuntimeError(f'The {i}th sentence is an empty sentence') from ex
    return sent_bound_range


def get_sent_idx(en, r2i, fid):
    """Find the sentence whose offset range fully contains entity ``en``.

    Args:
        en: entity record; ``en[2]`` is its (start, end) offset pair.
        r2i: dict mapping a sentence (start, end) range to its sentence index.
        fid: document id, only used in the warning message.

    Returns:
        The sentence index, or None (after emitting a warning) when no single
        sentence range contains the entity.
    """
    en_start, en_end = en[2]
    for (sent_start, sent_end), sent_id in r2i.items():
        # the entity must be non-empty and lie entirely inside the sentence
        if sent_start <= en_start < en_end <= sent_end:
            return sent_id

    warnings.warn(f"entity {en} in {fid} - cannot be mapped to one sentence. Will skip this entity.")
    return None


def get_en_idx_in_sent(en, sent):
    """Translate an entity's offsets into token indexes within ``sent``.

    Exact matches are tried first: a token whose start equals the entity
    start, and a token whose end equals the entity end (the last matching
    token wins). If a boundary has no exact match, the first token that
    strictly contains that boundary offset is used instead.

    Args:
        en: entity record; ``en[2]`` is its (start, end) offset pair.
        sent: sequence of tokens where ``token[1]`` is a (start, end) pair.

    Returns:
        Tuple (start_token_idx, end_token_idx); the end index is the index of
        the token that closes the entity.

    Raises:
        AssertionError: if either boundary cannot be mapped to a token.
    """
    en_start, en_end = en[2][0], en[2][1]

    s, e = None, None
    for idx, word in enumerate(sent):
        if word[1][0] == en_start:
            s = idx
        if word[1][1] == en_end:
            e = idx

    # fall back to the token that the boundary falls strictly inside of
    if s is None:
        s = next(
            (idx for idx, word in enumerate(sent) if word[1][0] < en_start < word[1][1]),
            None,
        )
    if e is None:
        e = next(
            (idx for idx, word in enumerate(sent) if word[1][0] < en_end < word[1][1]),
            None,
        )

    # raised explicitly (not via `assert`) so the check survives `python -O`
    if s is None or e is None:
        raise AssertionError(f"{en}\n{sent}")
    return s, e

## training data from brat to biaffine

In [22]:
# biaffine entity format (text, type, start, end) => 
# later we need to translate the start end to indexes in sentence to construct labels and mask
# skip BIO to overcome overlap entities issue
# final formatted data is a JSON list of {"tokens": [...], "entities": [[en_text, en_type, (s_index, e_index), original_offsets, sent_idx, file_id], ...]}

In [23]:
# we use 2018 n2c2 data as brat example, you can replace with any brat formatted data here
p = Path("./data/2018_n2c2_ade/track2-training_data/")

# document ids are the .ann file names without their extension
fids = [fn.stem for fn in p.glob("*.ann")]
fids[:3]

['114220', '189471', '102324']

In [24]:
# split the document ids 90/10 into train/dev; the fixed random_state keeps the split reproducible
train_ids, dev_ids = train_test_split(fids, train_size=0.9, random_state=13, shuffle=True)
len(train_ids), len(dev_ids)

(272, 31)

In [25]:
def brat2biaffine_data(file_ids, file_path, test=False):
    """Convert BRAT-annotated documents into biaffine-format sentence records.

    For every id in ``file_ids`` the text file ``<id>.txt`` is split into
    tokenized sentences with ``pre_processing``. Unless ``test`` is true, the
    entities of ``<id>.ann`` are read with ``read_annotation_brat`` and each
    one is attached to the sentence whose offset range contains it; entities
    that cannot be mapped to a single sentence are skipped.

    Args:
        file_ids: iterable of document ids (file names without extension).
        file_path: directory (``pathlib.Path``) holding the .txt/.ann files.
        test: when true, annotation files are not read and every sentence
            gets an empty entity list.

    Returns:
        A list with one dict per sentence (sentences without entities are
        kept): ``{"tokens": [...], "entities": [...]}``. Each entity is
        ``[en[0], en[1], (start_token_idx, end_token_idx), en[2], sent_idx,
        fid]`` and entities are sorted by start token index.
    """
    biaffine_data = []

    for fid in file_ids:
        t_fn = file_path / f"{fid}.txt"
        a_fn = file_path / f"{fid}.ann"

        if test:
            ens = []
        else:
            # only the entity list is needed; the id map and relations are ignored
            _, ens, _ = read_annotation_brat(a_fn)

        _, sents = pre_processing(t_fn, max_len=200)

        sent_bound = get_sent_bound(sents)
        range2idx = {bound: idx for idx, bound in sent_bound.items()}

        # reset per document: sentence indexes restart at 0 for every file, so a
        # shared tracker would leak entities from earlier files into this one
        en_track = defaultdict(list)

        for en in ens:
            sent_idx = get_sent_idx(en, range2idx, fid)
            # compare against None explicitly: sentence index 0 is valid but falsy
            if sent_idx is None:
                # skip the en that cannot be mapped
                continue
            # translate the entity offsets into token indexes inside its sentence
            s_idx, e_idx = get_en_idx_in_sent(en, sents[sent_idx])
            en_track[sent_idx].append([en[0], en[1], (s_idx, e_idx), en[2], sent_idx, fid])

        # we still need to keep sentences that have no entities
        for i, sent in enumerate(sents):
            biaffine_data.append(
                {
                    "tokens": [tok[0] for tok in sent],
                    "entities": sorted(en_track[i], key=lambda x: x[2][0]),
                }
            )

    return biaffine_data

In [26]:
# convert the train and dev document splits (with their .ann annotations) to biaffine records
biaffine_train = brat2biaffine_data(train_ids, p)
biaffine_dev = brat2biaffine_data(dev_ids, p)



In [27]:
# NOTE(review): this is a machine-specific absolute path, unlike the relative
# training-data path used earlier - point it at your own test directory
p_test = Path("/Users/alexgre/workspace/data/2018_n2c2_ade/gold_standard_test/")

# test ids come from the .txt files; test=True means no .ann files are read,
# so every test sentence is emitted with an empty entity list
test_fids = [fn.stem for fn in p_test.glob("*.txt")]
biaffine_test = brat2biaffine_data(test_fids, p_test, test=True)

In [28]:
len(biaffine_train), len(biaffine_dev), len(biaffine_test)

(34930, 3831, 25766)

In [None]:
# write the three splits as JSON; json_dump only opens the file for writing,
# so the ./data/n2c2/ directory must already exist
json_dump(biaffine_train, "./data/n2c2/train.json")
json_dump(biaffine_dev, "./data/n2c2/dev.json")
json_dump(biaffine_test, "./data/n2c2/test.json")

## training from IOB to biaffine (no offset) (i2b2 2010)

In [29]:
# We use the 2010 i2b2 data as the BIO example; you can replace it with any BIO-formatted data here.
# For example, you can use CoNLL-2003 (we have that data in the ./test_data directory).
p = Path("./data/2010_i2b2")

In [30]:
def load_data(fn):
    """Read a BIO/CoNLL-style file into a list of sentences.

    The file holds one token per line with space-separated columns; the first
    column is the token text and the last column is its label. Sentences are
    separated by one or more blank lines.

    Args:
        fn: path of the file to read.

    Returns:
        List of sentences, each a list of ``[word_text, label, index]`` where
        ``index`` is the token's position inside its sentence. Blank lines
        never produce tokens or empty sentences, so an empty file gives ``[]``.
    """
    nsents = []
    nsent = []
    with open(fn, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # a blank line closes the current sentence; repeated blank
                # lines must not create empty sentences
                if nsent:
                    nsents.append(nsent)
                    nsent = []
                continue
            info = line.split(" ")
            nsent.append([info[0], info[-1], len(nsent)])

    # the last sentence may not be followed by a blank line
    if nsent:
        nsents.append(nsent)

    return nsents


def data2biaffine_format(data):
    """Convert BIO-labelled sentences into biaffine-format records.

    Args:
        data: list of sentences; each sentence is a list of words where
            ``word[0]`` is the token text, ``word[1]`` its label ("O",
            "B-<type>" or "I-<type>") and ``word[-1]`` its index in the
            sentence.

    Returns:
        A list with one dict per sentence:
        ``{"tokens": [...], "entities": [[text, type, (start, end)], ...]}``
        where ``start``/``end`` are the (inclusive) indexes of the entity's
        first and last token and ``text`` is the space-joined tokens.

    An entity is closed by an "O" label, by a "B-" label, by a non-"B" label
    of a different type, or by the end of the sentence. A non-"B" label that
    does not continue the open entity starts a new entity of its own type.
    """
    biaffine_data = []

    for sent in data:
        tokens = []
        entities = []

        # state of the currently open entity; label_ty == "O" means none is open
        label_ty = "O"
        temp_text = []
        idx_s = idx_e = None

        for word in sent:
            tokens.append(word[0])
            label = word[1]

            is_inside_tag = label != "O" and label[0] != "B"
            if is_inside_tag and label_ty == label[2:]:
                # the token extends the entity that is currently open
                temp_text.append(word[0])
                idx_e = word[-1]
                continue

            # any other label ends the open entity (if there is one)
            if label_ty != "O":
                entities.append([" ".join(temp_text), label_ty, (idx_s, idx_e)])

            if label == "O":
                label_ty = "O"
                temp_text = []
            else:
                # "B-XX", or an "I-XX" that does not continue the open entity
                label_ty = label[2:]
                temp_text = [word[0]]
                idx_s = idx_e = word[-1]

        # flush an entity that runs up to the last token of the sentence
        if label_ty != "O":
            entities.append([" ".join(temp_text), label_ty, (idx_s, idx_e)])

        biaffine_data.append(
            {
                "tokens": tokens,
                "entities": entities
            }
        )

    return biaffine_data

In [31]:
# read the three BIO files into per-sentence [text, label, index] token lists
train_bio = load_data(p / "train.txt")
dev_bio = load_data(p / "dev.txt")
test_bio = load_data(p / "test.txt")

# convert each split to biaffine records (one record per sentence)
i2b22010_biaffine_train = data2biaffine_format(train_bio)
i2b22010_biaffine_dev = data2biaffine_format(dev_bio)
i2b22010_biaffine_test = data2biaffine_format(test_bio)

In [32]:
len(train_bio), len(i2b22010_biaffine_train)

(14987, 14987)

In [33]:
idx = 26
train_bio[idx], i2b22010_biaffine_train[idx]

([['At', 'O', 0],
  ['the', 'O', 1],
  ['end', 'O', 2],
  ['of', 'O', 3],
  ['a', 'O', 4],
  ['January', 'O', 5],
  ['1967', 'O', 6],
  ['concert', 'O', 7],
  ['in', 'O', 8],
  ['the', 'O', 9],
  ['English', 'B-MISC', 10],
  ['city', 'O', 11],
  ['of', 'O', 12],
  ['Nottingham', 'B-LOC', 13],
  ['he', 'O', 14],
  ['threw', 'O', 15],
  ['the', 'O', 16],
  ['sheet', 'O', 17],
  ['of', 'O', 18],
  ['paper', 'O', 19],
  ['into', 'O', 20],
  ['the', 'O', 21],
  ['audience', 'O', 22],
  [',', 'O', 23],
  ['where', 'O', 24],
  ['it', 'O', 25],
  ['was', 'O', 26],
  ['retrieved', 'O', 27],
  ['by', 'O', 28],
  ['a', 'O', 29],
  ['fan', 'O', 30],
  ['.', 'O', 31]],
 {'tokens': ['At',
   'the',
   'end',
   'of',
   'a',
   'January',
   '1967',
   'concert',
   'in',
   'the',
   'English',
   'city',
   'of',
   'Nottingham',
   'he',
   'threw',
   'the',
   'sheet',
   'of',
   'paper',
   'into',
   'the',
   'audience',
   ',',
   'where',
   'it',
   'was',
   'retrieved',
   'by',
   'a'

In [34]:
idx = 202
dev_bio[idx], i2b22010_biaffine_dev[idx]

([[':', 'O', 0]], {'tokens': [':'], 'entities': []})

In [None]:
# write the three splits as JSON; json_dump only opens the file for writing,
# so the ./data/i2b22010/ directory must already exist
json_dump(i2b22010_biaffine_train, "./data/i2b22010/train.json")
json_dump(i2b22010_biaffine_dev, "./data/i2b22010/dev.json")
json_dump(i2b22010_biaffine_test, "./data/i2b22010/test.json")