In [None]:
import os
import json
from collections import defaultdict, Counter

# Directory holding the raw RadGraph 1.0.0 release, as downloaded from
# https://physionet.org/content/radgraph/1.0.0/
root_path = (
    "/Users/liao/Desktop/RadGraph/"
    "radgraph-extracting-clinical-entities-and-relations-from-radiology-reports-1.0.0"
)

In [None]:
import shutil

# Directory that receives the converted files.
output_dir = (
    "/Users/liao/myProjects/VSCode_workspace/cxr_graph/"
    "graph_annotation_process/outputs/radgraph/json4ner_re"
)

# Always start from an empty directory: delete whatever an earlier run left
# behind, then create the directory again.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

In [None]:
# Convert each RadGraph split into JSON-lines files: one document per line with
# the keys "doc_key", "sentences", "ner" and "relations" (the last three are
# parallel lists holding one entry per sentence).

# For every relation: |subject sentence index - object sentence index|, grouped
# under "<dataset>_<split>". A value of 0 means both ends share a sentence.
cross_sent_relations = defaultdict(list)

for out_file_name in ["train", "dev", "test", "test1"]:
    # "test1" has no input file of its own: it re-reads test.json and exports
    # the "labeler_2" entities, while "test" exports the "labeler_1" entities.
    if out_file_name == "test1":
        in_file_name = "test"
    else:
        in_file_name = out_file_name

    input_file_path = os.path.join(root_path, f"{in_file_name}.json")
    # json.load parses the whole file, so the input may be stored on a single
    # line or pretty-printed over several lines.
    with open(input_file_path, "r", encoding="utf-8") as f:
        docs_dict = json.load(f)

    is_test_split = "test" in out_file_name

    # Serialized documents grouped by output file name. They are written once
    # per split in "w" mode after the loop, so re-running this cell overwrites
    # the files instead of appending duplicate records to them.
    lines_by_file = defaultdict(list)

    for doc_key, doc in docs_dict.items():
        output_dict = {
            "doc_key": doc_key,
            "sentences": [],
            "ner": [],
            "relations": [],
        }

        # Split the space-separated text into sentences: a sentence ends at a
        # "." token or at the last token of the document. tokidx2sentidx maps
        # each document-level token index to the index of its sentence.
        sent_idx = 0
        tokidx2sentidx = []
        tokens = doc["text"].split(" ")
        sent = []
        for token_id, token in enumerate(tokens):
            tokidx2sentidx.append(sent_idx)
            sent.append(token)
            if token == "." or token_id == len(tokens) - 1:
                output_dict["sentences"].append(sent)
                output_dict["ner"].append([])
                output_dict["relations"].append([])
                sent_idx += 1
                sent = []
        # Sanity check: splitting must neither drop nor duplicate tokens.
        assert len(tokens) == sum(len(s) for s in output_dict["sentences"])

        if out_file_name == "test":
            doc_entities = doc["labeler_1"]["entities"]
        elif out_file_name == "test1":
            doc_entities = doc["labeler_2"]["entities"]
        else:
            doc_entities = doc["entities"]

        # Test documents whose key does not contain "txt" are labelled
        # "chexpert"; every other document is labelled "mimic".
        dataset = "mimic"
        if is_test_split and "txt" not in doc_key:
            dataset = "chexpert"

        for entity in doc_entities.values():
            subj_start = entity["start_ix"]
            subj_end = entity["end_ix"]
            # An entity is filed under the sentence that contains its first token.
            subj_sent_idx = tokidx2sentidx[subj_start]
            output_dict["ner"][subj_sent_idx].append([subj_start, subj_end, entity["label"]])
            for rel in entity["relations"]:
                # rel is [relation label, id of the object entity].
                rel_label = rel[0]
                rel_obj_idx = rel[1]
                obj_start = doc_entities[rel_obj_idx]["start_ix"]
                obj_end = doc_entities[rel_obj_idx]["end_ix"]
                obj_sent_idx = tokidx2sentidx[obj_start]
                cross_sent_relations[f"{dataset}_{out_file_name}"].append(abs(subj_sent_idx - obj_sent_idx))
                # A relation is stored with its subject's sentence, even when
                # the object lies in a different sentence.
                output_dict["relations"][subj_sent_idx].append([subj_start, subj_end, obj_start, obj_end, rel_label])

        line = json.dumps(output_dict) + "\n"
        lines_by_file[f"{out_file_name}.json"].append(line)
        if is_test_split:
            # Test splits are additionally exported per source dataset,
            # e.g. test_mimic.json / test_chexpert.json.
            lines_by_file[f"{out_file_name}_{dataset}.json"].append(line)

    for file_name, lines in lines_by_file.items():
        output_path = os.path.join(output_dir, file_name)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(lines)