## Converting output of NER

We wish to convert the output of NER to a format that can be fed to the RE module.

In [1]:
from pathlib import Path
import shutil
import pprint
import bisect
import re
from collections import defaultdict 
import csv

import scispacy
import spacy

## Function to generate output

In [2]:
# This is just one file. Can be easily generalized.
# The spaCy pipeline is loaded once at module level and shared by every call
# to generate_output, which uses it to split single-line documents into
# sentences (via doc.sents).

nlp = spacy.load("en_core_sci_sm")
 
def _read_sentences(text_path):
    """Split the text file into sentences with document-level offsets.

    Returns a list of ``(start_char, end_char, text)`` tuples, where the
    offsets index into the whole document and ``end_char`` is exclusive.
    A file made of a single line is segmented with the module-level spaCy
    pipeline ``nlp``; otherwise every physical line is treated as one sentence.
    """
    with open(text_path, 'r', encoding='utf-8') as f:
        input_lines = f.readlines()

    if len(input_lines) == 1:
        doc = nlp(input_lines[0])
        return [(sent.start_char, sent.end_char, sent.text) for sent in doc.sents]

    # Multi-line document: a line's offsets are the running character count.
    sentences = []
    offset = 0
    for line in input_lines:
        sentences.append((offset, offset + len(line), line))
        offset += len(line)
    return sentences


def _parse_entity(fields):
    """Parse one tab-split .ann row into ``(id, label, start, end, text)``.

    Returns ``None`` for rows that are not usable entity annotations: blank
    rows, rows with fewer than three fields, rows whose id does not start
    with ``T`` (relations, notes, ...), or rows whose offsets are not integers.
    For a discontinuous span (``"Label 0 5;8 12"``) the overall extent
    (first start, last end) is used.
    """
    if len(fields) < 3 or not fields[0].startswith('T'):
        return None
    label, _, span_spec = fields[1].partition(' ')
    offsets = span_spec.replace(';', ' ').split()
    try:
        start, end = int(offsets[0]), int(offsets[-1])
    except (IndexError, ValueError):
        return None
    return fields[0], label, start, end, fields[2]


def generate_output(text_path, ann_path, output_path, verbose=True):
    """Convert one annotated document into the input files of the RE module.

    For every sentence, each (Chemical, Gene) entity pair found in that
    sentence yields one copy of the sentence in which the two mentions are
    replaced by ``@CHEMICAL$`` and ``@GENE$``.

    The following files are written to ``output_path / <text file stem>``
    (the directory and any missing parents are created):

    * a copy of the text file, under its own name;
    * a copy of the annotation file, prefixed with ``old_``;
    * ``test.tsv`` - a header row followed by one masked sentence per row;
    * ``pairs.tsv`` - the two annotation ids of each pair, in the order the
      mentions appear in the sentence; row *k* matches sentence row *k*.

    Args:
        text_path: path of the document text (``str`` or ``Path``).
        ann_path: path of the tab-separated annotation file.
        output_path: root directory that receives the per-document folder.
        verbose: when true (the default) print every located entity next to
            the surface text recorded in the annotation file.

    Annotation rows that cannot be parsed as entities, and entities whose
    start offset does not fall inside any sentence, are skipped.
    """
    text_path = Path(text_path)
    ann_path = Path(ann_path)
    output_path = Path(output_path)

    sentences = _read_sentences(text_path)
    sentence_ends = [end for _, end, _ in sentences]
    named_entities = [defaultdict(list) for _ in sentences]

    # QUOTE_NONE: entity text may contain double quotes that must stay literal.
    with open(ann_path, 'r', encoding='utf-8', newline='') as f:
        for fields in csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE):
            entity = _parse_entity(fields)
            if entity is None:
                continue
            ann_id, label, start, end, surface = entity

            # Sentence ends are exclusive, so bisect_right returns the index
            # of the first sentence ending strictly after `start`.
            idx = bisect.bisect_right(sentence_ends, start)
            if idx >= len(sentences) or start < sentences[idx][0]:
                continue
            baseline, _, sent_text = sentences[idx]
            named_entities[idx][label].append(
                (start - baseline, end - baseline, ann_id, surface))
            if verbose:
                print(sent_text[start - baseline:end - baseline], surface)

    ## Creating anonymous sentences

    output_sent = []
    id_list = []
    for (_, _, sent_text), entities in zip(sentences, named_entities):
        for start1, end1, id1, _ in entities['Chemical']:
            for start2, end2, id2, _ in entities['Gene']:
                if start1 < start2:
                    output_str = (sent_text[:start1] + "@CHEMICAL$"
                                  + sent_text[end1:start2] + "@GENE$"
                                  + sent_text[end2:])
                    token_tup = (id1, id2)
                else:
                    output_str = (sent_text[:start2] + "@GENE$"
                                  + sent_text[end2:start1] + "@CHEMICAL$"
                                  + sent_text[end1:])
                    token_tup = (id2, id1)
                output_sent.append(output_str)
                id_list.append(token_tup)

    ## Generating input for RE

    new_dir = output_path / text_path.stem
    new_dir.mkdir(parents=True, exist_ok=True)

    shutil.copy(text_path, new_dir / text_path.name)
    shutil.copy(ann_path, new_dir / ("old_" + ann_path.name))

    with open(new_dir / 'test.tsv', 'w', encoding='utf-8') as f:
        f.write("dummy\tdummy\tsentence\n")
        for sent in output_sent:
            # Exactly one line terminator per row, whether or not the
            # sentence text already carried one.
            f.write("a\ta\t " + sent.rstrip('\r\n') + '\n')

    with open(new_dir / 'pairs.tsv', 'w', encoding='utf-8') as f:
        for t1, t2 in id_list:
            f.write(t1 + "\t" + t2 + "\n")
# # pprint.pprint(named_entities)
                 

In [3]:
## This block is just a test. Comment this out.
# It runs generate_output on one hard-coded document so the conversion can be
# checked in isolation before the whole directory tree is processed.

text_path = Path("../data/original/sb3000673.txt")
ann_path = Path("../data/original/sb3000673.ann")
output_path = Path("../data/processed/")
generate_output(text_path,ann_path,output_path)

## Iterating through brat files

Given the root, we iterate over every .txt file under that path that has a matching .ann file next to it. Nested subdirectories are visited as well, so the traversal is recursive.

In [4]:
# Directory tree that is searched for .txt/.ann pairs (passed to walk_dir).
root = Path("../data/original/")
# Destination root; walk_dir reads this module-level name when it calls
# generate_output.
output_dir = Path("../data/processed/")

def walk_dir(root):
    """Recursively convert every annotated text file found under ``root``.

    Directories are descended into. A file is processed only when its suffix
    is ``.txt`` and a file with the same name but an ``.ann`` suffix exists
    next to it; the pair is then handed to ``generate_output`` together with
    the module-level ``output_dir``. Everything else is ignored.
    """
    if not root.is_dir():
        # Leaf of the traversal: keep only .txt files with an annotation.
        if root.suffix != '.txt':
            return
        annotation = root.with_suffix(".ann")
        if annotation.exists():
            generate_output(root, annotation, output_dir)
        return

    for entry in root.iterdir():
        walk_dir(entry)

In [5]:
# Process every .txt/.ann pair found under `root`.
walk_dir(root)