In [None]:
import os
import json
from pprint import pprint

from tqdm.notebook import tqdm

In [None]:
# Root folder that every other path in this notebook is derived from.
data_dir = "../../data/acs-data"

# Sub-folders of data_dir, in order: the ".ann" annotation files, the source
# JSON documents, and the folder the annotated JSON documents are written to.
ann_dir, json_dir, output_dir = (
    os.path.join(data_dir, subfolder)
    for subfolder in ("acs-ner-20210205", "json-files", "json-files-ner")
)

In [None]:
def _read_ann_entities(ann_path):
    """Parse the text-bound ("T") annotations of a standoff ``.ann`` file.

    Each kept line is expected to look like ``ID<TAB>TYPE START END<TAB>TEXT``.
    Discontinuous annotations (``TYPE S1 E1;S2 E2``) are collapsed to a single
    span running from the first fragment's start to the last fragment's end.

    Args:
        ann_path: Path of the UTF-8 encoded ``.ann`` file.

    Returns:
        A list of dicts with keys ``id``, ``type``, ``start``, ``end`` and
        ``text``, in file order.

    Raises:
        IndexError: If a "T" line has fewer than three tab-separated fields.
        ValueError: If an offset is not an integer.
    """
    entities = []
    with open(ann_path, "r", encoding="utf-8") as ann_fin:
        for line in ann_fin:
            # Lines that do not start with "T" (relations, notes, ...) carry
            # no text span of their own and are ignored.
            if not line.startswith("T"):
                continue
            items = line.strip("\n").split("\t")
            ent_type, _, offsets = items[1].strip().partition(" ")
            # One "start end" pair per fragment; ";" separates the fragments
            # of a discontinuous annotation.
            fragments = [fragment.split() for fragment in offsets.split(";")]
            entities.append({
                "id": items[0],
                "type": ent_type,
                "start": int(fragments[0][0]),
                "end": int(fragments[-1][1]),
                "text": items[2],
            })
    return entities


def process_one_file(ann_path, json_path, output_path):
    """Attach the entities of an ``.ann`` file to the paragraphs of a JSON document.

    Every paragraph in ``json_obj["body"]`` receives a ``"ner_annotations"``
    list. Entities are grouped by their mention text; each group records the
    mention, how many times it occurs in the paragraph, and one instance per
    occurrence with global offsets and offsets relative to the paragraph start.

    An entity belongs to a paragraph when its start offset lies in
    ``[global_span_start, global_span_end)``. Entities that start outside every
    paragraph (before the first one, in a gap, or after the last one) are left
    out rather than being attached with a negative local offset.

    Assumes the paragraphs in ``body`` are listed in increasing offset order,
    since the entities are consumed in a single forward pass.

    Args:
        ann_path: Path of the UTF-8 ``.ann`` annotation file.
        json_path: Path of the UTF-8 source JSON document.
        output_path: Path the annotated JSON document is written to.
    """
    # Sort by start offset so one forward sweep can serve all paragraphs.
    entities = sorted(_read_ann_entities(ann_path), key=lambda ent: ent["start"])

    with open(json_path, "r", encoding="utf-8") as json_fin:
        json_obj = json.load(json_fin)

    ann_idx = 0
    for para in json_obj["body"]:
        para_start, para_end = para["global_span_start"], para["global_span_end"]
        ent_bucket = {}
        while ann_idx < len(entities):
            ent = entities[ann_idx]
            if ent["start"] >= para_end:  # this ent is for a later paragraph
                break
            ann_idx += 1
            if ent["start"] < para_start:
                # Starts before this paragraph: it is not part of the
                # paragraph, and its local offset would be negative.
                continue
            ent_bucket.setdefault(ent["text"], []).append({
                "instance_id": ent["id"],
                "type": ent["type"],
                "global_span_start": ent["start"],
                "global_span_end": ent["end"],
                "local_span_start": ent["start"] - para_start,
                "local_span_end": ent["end"] - para_start,
            })
        para["ner_annotations"] = [
            {
                "ner_mention": mention,
                "appearance_count": len(instances),
                "ner_instance_array": instances,
            }
            for mention, instances in ent_bucket.items()
        ]

    with open(output_path, "w", encoding="utf-8") as json_fout:
        json.dump(json_obj, json_fout)

In [None]:
# Make sure the destination exists; process_one_file opens its output file
# for writing and would fail if the folder were missing.
os.makedirs(output_dir, exist_ok=True)

ann_suffix = ".ann"
for filename in tqdm(os.listdir(ann_dir)):
    if filename.endswith(ann_suffix):
        # Strip only the trailing suffix so stems that themselves contain dots
        # (e.g. "a.b.ann") still map to the matching "a.b.json".
        pub_num = filename[:-len(ann_suffix)]
        process_one_file(
            os.path.join(ann_dir, f"{pub_num}.ann"),
            os.path.join(json_dir, f"{pub_num}.json"),
            os.path.join(output_dir, f"{pub_num}.json")
        )

In [None]:
# Sanity check: load one annotated document and inspect a single paragraph.
sample_path = os.path.join(output_dir, "sb7b00029.json")
with open(sample_path) as fin:
    obj = json.load(fin)

sample_para = obj["body"][10]
pprint(sample_para)
# Slice the newline-joined paragraph text at fixed offsets; presumably these
# are the span of one entity shown above - TODO confirm.
print("\n".join(sample_para["text"])[2612:2629])