### This code assumes that the text annotated by the `.ann` files in the input directory is exactly the same as the text in the JSON files

In [8]:
import os
import json
import copy

# Alternative input directory, kept for reference:
# in_dir = "refined_annotations"
# Directory that main() scans for annotation (.ann) files.
in_dir = "ann"
# Directory holding the ACS data in JSON format, one <name>.json per .ann file.
json_dir = "json"
# Directory where the JSON data enriched with NER annotations is written.
out_dir = "json_out"

In [9]:
def make_json(para_data, sent_span):
    """Turn one paragraph's grouped mentions into a list of keyed records.

    Args:
        para_data: dict mapping a mention string to a two-item sequence
            ``[appearance_count, occurrences]``, where each occurrence is a
            sequence ``(instance_id, entity_type, span_start, span_end)``.
        sent_span: offset of the paragraph start; it is subtracted from the
            global spans to obtain the paragraph-local spans.

    Returns:
        A list with one dict per mention, holding the keys ``ner_mention``,
        ``appearance_count`` and ``ner_instance_array``.
    """

    def _instance_record(occurrence):
        # Label the positional fields of a single occurrence.
        start = occurrence[2]
        end = occurrence[3]
        return {
            "instance_id": occurrence[0],
            "entity_type": occurrence[1],
            "global_span_start": start,
            "global_span_end": end,
            "local_span_start": start - sent_span,
            "local_span_end": end - sent_span,
        }

    return [
        {
            "ner_mention": name,
            "appearance_count": details[0],
            "ner_instance_array": [_instance_record(occ) for occ in details[1]],
        }
        for name, details in para_data.items()
    ]

In [10]:
# def get_text(json_file):
#     new_text = ""
    
#     data = copy.deepcopy(json_file)
    
#     para_list = data["body"]
#     para_list.insert(0, {"text": data["abstract"][0]})
    
#     sent_span = 0
#     # data["body"] is a list of dicts
#     for i, para in enumerate(para_list):
#         # Empty sections issue
#         if "section_header" in para and para["section_header"] == para["text"]:
# #             print("Hello")
#             continue
    
#         print(i, sent_span, sent_span + len(para["text"]))
        
#         sent_span += len(para["text"]) + 1
        
#         new_text += para["text"] + "\n"
#     with open("test.txt", "w") as fp:
#         fp.write(new_text)

In [11]:
# with open(os.path.join(json_dir, "sb7b00428" + ".json"), "r") as fp:
#     json_file = json.load(fp)
# get_text(json_file)

In [12]:
def add_annotations(ann_file, txt_file, json_file):
    """Attach the entity annotations from an .ann file to a copy of a JSON document.

    Each non-blank annotation line is tab-separated: the first field is the
    instance id, the second is ``"<entity> <span_start> <span_end>"``
    (space-separated, integer spans) and the last field is the mention text.
    Spans are offsets into the text formed by the body paragraphs, each
    followed by one separator character; the abstract is not part of that
    offset space.

    Args:
        ann_file: list of annotation lines (e.g. from ``readlines()``).
        txt_file: text the annotations were made on. Currently unused; kept so
            existing callers keep working.
        json_file: dict with an ``"abstract"`` list (only its first element is
            used) and a ``"body"`` list of paragraph dicts that have a
            ``"text"`` key and optionally a ``"section_header"`` key.

    Returns:
        ``(data, status)`` where ``data`` is a deep copy of ``json_file`` with
        ``"ner_annotations"`` added, and ``status`` is True only when every
        parsed annotation fell inside some annotated paragraph.

    Raises:
        IndexError / ValueError: if a non-blank annotation line does not have
            the expected layout, or if ``json_file["abstract"]`` is empty.
    """
    data = copy.deepcopy(json_file)

    # Parse the annotation lines once instead of once per paragraph. Blank
    # lines are skipped so that they neither crash the parser nor count as
    # uncovered annotations.
    annotations = []
    for line in ann_file:
        stripped = line.strip()
        if not stripped:
            continue
        fields = stripped.split("\t")
        ent_info = fields[1].strip().split(" ")
        annotations.append(
            (fields[0], ent_info[0], int(ent_info[1]), int(ent_info[2]), fields[-1])
        )

    # Abstracts are not annotated: keep only the first abstract entry and give
    # it an empty annotation container.
    data["abstract"] = [{"text": data["abstract"][0], "ner_annotations": {}}]

    num_covered = 0
    sent_span = 0  # offset of the current paragraph in the annotated text

    for para in data["body"]:
        para_text = para["text"]

        # Empty section: the header text is part of the annotated text, so
        # advance the offset, but the paragraph itself receives no annotations.
        if "section_header" in para and para["section_header"] == para_text:
            sent_span += len(para["section_header"]) + 1
            continue

        para_end = sent_span + len(para_text)

        # mention -> [appearance_count, [(instance_id, entity, start, end), ...]]
        para_data = {}
        for instance_id, entity, span_start, span_end, mention in annotations:
            # A mention belongs to the paragraph in which it starts.
            if sent_span <= span_start < para_end:
                num_covered += 1
                occurrence = (instance_id, entity, span_start, span_end)
                if mention in para_data:
                    para_data[mention][0] += 1
                    para_data[mention][1].append(occurrence)
                else:
                    para_data[mention] = [1, [occurrence]]

        # Convert the grouped mentions into the structure required by the schema.
        para["ner_annotations"] = make_json(para_data, sent_span)

        sent_span = para_end + 1  # +1 for newline or a single whitespace

    return data, num_covered == len(annotations)

In [13]:
def main():
    """Annotate every document that has an .ann file in ``in_dir``.

    For each ``<name>.ann`` in ``in_dir`` this reads the matching
    ``txt/<name>.txt`` and ``<json_dir>/<name>.json``, calls
    ``add_annotations`` and, when it reports success, writes the annotated
    document to ``<out_dir>/<name>.json``. One status line is printed per
    document.
    """
    ann_suffix = ".ann"
    # Match on the real extension: a substring test would also pick up names
    # such as "x.ann.bak" and then strip the wrong four characters.
    ner_files = [
        f_name[:-len(ann_suffix)]
        for f_name in os.listdir(in_dir)
        if f_name.endswith(ann_suffix)
    ]

    # Make sure the output directory exists before the first write.
    os.makedirs(out_dir, exist_ok=True)

    for f_name in ner_files:
        with open(os.path.join(in_dir, f_name + ".ann"), "r") as fp:
            ann_file = fp.readlines()
        with open(os.path.join("txt", f_name + ".txt"), "r") as fp:
            txt_file = fp.read()
        with open(os.path.join(json_dir, f_name + ".json"), "r") as fp:
            json_file = json.load(fp)

        json_ann_data, status = add_annotations(ann_file, txt_file, json_file)

        if status:
            # Context manager so the file is flushed and closed right away.
            with open(os.path.join(out_dir, f_name + ".json"), "w") as fp:
                json.dump(json_ann_data, fp, indent=4)
            print(f"Successfully processed {f_name}" )
        else:
            print(f"Could not process {f_name}" )

In [14]:
# Run the conversion only when this code is executed as the main module.
if __name__=="__main__":
    main()

Successfully processed sb6b00010
Successfully processed sb5b00012
Successfully processed sb5b00007
Could not process sb5b00002
Successfully processed sb6b00009


#### The following code copies a few selected files to another directory

In [200]:
# Setup for transferring selected files to the final output directory.
from shutil import copyfile
# Destination directory for the copied .json and .txt files.
final_out_dir = "output"

In [219]:
# Base names (without extension) of the documents to transfer.
files_list = ["sb5b00012", "sb6b00027", "sb7b00086", "sb8b00534", "sb9b00103", 
         "sb400051t", "sb3001084", "sb2000275", "sb6b00135", "sb8b00242" ]

# Create the destination up front; copyfile() fails if the directory is missing.
os.makedirs(final_out_dir, exist_ok=True)

for f_name in files_list:
    # Annotated JSON taken from out_dir.
    src1 = os.path.join(out_dir, f_name + ".json")
    dst1 = os.path.join(final_out_dir, f_name + ".json")
    copyfile(src1, dst1)
    # Matching plain-text file taken from the "txt" directory.
    src2 = os.path.join("txt", f_name + ".txt")
    dst2 = os.path.join(final_out_dir, f_name + ".txt")
    copyfile(src2, dst2)

In [1]:
# with open("vcu/sb5b00012.txt", "r") as fp:
#     text_vcu = fp.read()
# with open("acs_research/sb5b00012.txt", "r") as fp:
#     text_acs = fp.read()
    
# for i in range(len(text_acs)):
#     if text_acs[i] != text_vcu[i]:
#         print(i, text_acs[i], text_vcu[i])
        