# Prepare the data for annotation

We annotate the same data as RadGraph. Its test set contains 50 MIMIC-CXR reports (the model test set) and 50 CheXpert reports (for testing generalization).

The config file is in "../config/graph_annotation_process.yaml"

Prepare:
1. Put the RadGraph train.json, dev.json and test.json files into /{project_root}/resources/radgraph/
2. Run the script

In [34]:
import sys
sys.path.append("../../src")
import os
import json

In [35]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra configuration used by every cell below.
# `config` is pre-set to None so the name exists even if initialisation fails.
config = None
with initialize(
    version_base=None,
    config_path="../config",
    job_name="radgraph_to_brat",
):
    # Select the "radgraph" variant of the graph_annotation_process config group.
    config = compose(
        config_name="graph_annotation_process",
        overrides=["graph_annotation_process@_global_=radgraph"],
    )
# Dump the composed config (interpolations left unresolved) for inspection.
print(OmegaConf.to_yaml(config))

machine:
  work_dir: /Users/liao/myProjects/VSCode_workspace/structured_reporting
  fast_coref_dir: /Users/liao/myProjects/VSCode_workspace/fast-coref
work_dir: ${machine.work_dir}
src_dir: ${work_dir}/src
output_dir: ${work_dir}/output
resource_dir: ${work_dir}/resources
base_output_dir: ${work_dir}/output
mimic_cxr_output_dir: ${base_output_dir}/mimic_cxr
i2b2_output_dir: ${base_output_dir}/i2b2
radgraph_output_dir: ${base_output_dir}/radgraph
log_dir: ${work_dir}/logs/${hydra.job.config_name}
logging_level: INFO
ignore_source_path: ${work_dir}/config/ignore/common_ignore
fast_coref_dir: ${machine.fast_coref_dir}
coref_scorer_dir: ${machine.fast_coref_dir}/coref_resources/reference-coreference-scorers
input:
  base_dir: ${resource_dir}/radgraph
  train_file: ${input.base_dir}/train.json
  dev_file: ${input.base_dir}/dev.json
  test_file: ${input.base_dir}/test.json
output:
  base_dir: ${radgraph_output_dir}
  for_inspection_dir: ${output.base_dir}/originalData_bratFormatted
  for_ann

In [36]:
def get_pos_list(doc_tokens_list):
    """Return the start offset of every token in a single-space-joined text.

    The first token starts at offset 0; each following token starts one
    position (the separator) after the end of the previous token.

    Args:
        doc_tokens_list: Iterable of token strings, in document order.

    Returns:
        A list with one integer start offset per token.
    """
    # Running offsets: the last element is always where the *next* token would start.
    offsets = [0]
    for token in doc_tokens_list:
        offsets.append(offsets[-1] + len(token) + 1)
    # Drop the trailing offset, which points past the final token.
    return offsets[:-1]


In [37]:
import itertools


class AnnEntityClass:
    """One text-bound entity line ("T" line) of a brat .ann file."""

    def __init__(self, incremental_id) -> None:
        # The id number is pulled from the caller-supplied iterator, so the
        # caller controls numbering (e.g. restarting at 1 for every document).
        self.id = "T" + str(next(incremental_id))
        # The remaining fields start empty and are filled in by the caller.
        self.type = ""
        self.start_index = ""
        self.end_index = ""
        self.token_str = ""

    def get_ann_str(self) -> str:
        """Return the entity as a newline-terminated .ann line.

        Layout: ``<id> TAB <type> <start> <end> TAB <text> NEWLINE``.
        """
        span = f"{self.type} {self.start_index} {self.end_index}"
        return f"{self.id}\t{span}\t{self.token_str}\n"

    def __repr__(self) -> str:
        return self.get_ann_str()

    def __str__(self) -> str:
        return self.get_ann_str()


class AnnRelationClass:
    """One binary relation line ("R" line) of a brat .ann file.

    The id number is always taken from the iterator passed to ``__init__``;
    the class keeps no counter of its own.
    """

    def __init__(self, incremental_id) -> None:
        # The caller owns the counter, so numbering can restart per document.
        self.id = f"R{next(incremental_id)}"
        # Relation type plus the ids of the two entities it links;
        # all start empty and are filled in by the caller.
        self.type = ""
        self.arg1 = ""
        self.arg2 = ""

    def get_ann_str(self) -> str:
        """Return the relation as a newline-terminated .ann line.

        Layout: ``<id> TAB <type> Arg1:<arg1> Arg2:<arg2> TAB NEWLINE``.
        """
        return f"{self.id}\t{self.type} Arg1:{self.arg1} Arg2:{self.arg2}\t\n"

    def __repr__(self) -> str:
        return self.get_ann_str()

    def __str__(self) -> str:
        return self.get_ann_str()


## Convert test.json to brat format

The test set was annotated by two annotators (labeler_1 and labeler_2).

In [38]:
# Load the RadGraph test split; the cell below iterates it as
# {doc_id: doc_info} pairs.
input_file_path = config.input.test_file

# json.load parses directly from the file handle, so there is no need to read
# the lines into a list and join them back together first.
with open(input_file_path, "r", encoding="utf-8") as f:
    data_dict = json.load(f)

In [39]:
# Map RadGraph label names to the type names written into the brat .ann files.
entity_type_mapper: dict[str, str] = {"ANAT-DP": "Anatomy", "OBS-DP": "Observation-Present", "OBS-DA": "Observation-Absent", "OBS-U": "Observation-Uncertain"}
relation_type_mapper: dict[str, str] = {"modify": "modify", "suggestive_of": "suggestive_of", "located_at": "located_at"}


for doc_id, doc_info in data_dict.items():
    # Remove only a trailing ".txt" extension. str.strip(".txt") would instead
    # strip every leading/trailing '.', 't' or 'x' character and could mangle ids
    # that start or end with those letters.
    output_file_name = f'{doc_info["data_source"]}_{doc_id.replace("/","_").removesuffix(".txt")}'

    output_dir = os.path.join(config.output.for_ann_dir, doc_info["data_source"], "brat_data", doc_info["data_split"])
    os.makedirs(output_dir, exist_ok=True)

    # We want to show the unlabelled data and the two labelers' results in the same file,
    # so the text is written three times and each labeler's token positions are shifted
    # by one (labeler_1) or two (labeler_2) copies of the text plus the "\n\n" separator.
    doc_tokens_list = doc_info["text"].split(" ")
    copy_offset = len(doc_info["text"]) + 2
    doc_tokens_pos_list0 = get_pos_list(doc_tokens_list)
    doc_tokens_pos_list1 = [i + copy_offset for i in doc_tokens_pos_list0]
    doc_tokens_pos_list2 = [i + copy_offset for i in doc_tokens_pos_list1]
    pos_list_mapper: dict[str, list] = {"labeler_1": doc_tokens_pos_list1, "labeler_2": doc_tokens_pos_list2}

    # Create txt file: unlabelled copy, labeler_1 copy, labeler_2 copy.
    with open(os.path.join(output_dir, output_file_name+".txt"), "w", encoding="utf-8") as f:
        f.write(doc_info["text"]+"\n\n"+doc_info["text"]+"\n\n"+doc_info["text"])

    # Create ann file
    ann_entitiy_list: list[AnnEntityClass] = []
    ann_relation_list: list[AnnRelationClass] = []

    # Ids restart at 1 for every document and are shared across both labelers.
    entity_incremental_id = itertools.count(1)
    relation_incremental_id = itertools.count(1)
    for labeler_id, doc_tokens_pos_list in pos_list_mapper.items():
        # Temporarily buffer the raw relations, since an entity may link forward
        # to another entity that has not been recorded yet.
        # (A, modify, B) == A -> modify -> B
        temp_ann_entitiy_dict: dict[str, AnnEntityClass] = {}
        temp_raw_relation_list: list[tuple[str, str, str]] = []

        entities_dict = doc_info[labeler_id]["entities"]
        for entity_id, entity_info in entities_dict.items():
            ann_entity = AnnEntityClass(entity_incremental_id)
            ann_entity.type = entity_type_mapper[entity_info["label"]]
            ann_entity.token_str = entity_info["tokens"]
            # Character span: start of the first token plus the length of the entity text.
            ann_entity.start_index = doc_tokens_pos_list[entity_info["start_ix"]]
            ann_entity.end_index = ann_entity.start_index + len(ann_entity.token_str)
            for relation in entity_info["relations"]:
                temp_raw_relation_list.append((entity_id, relation[0], relation[1]))
            temp_ann_entitiy_dict[entity_id] = ann_entity
        ann_entitiy_list.extend(temp_ann_entitiy_dict.values())

        # All entities of this labeler are known now, so relations can be resolved to brat ids.
        for entity1_idstr, relation_type, entity2_idstr in temp_raw_relation_list:
            ann_relation = AnnRelationClass(relation_incremental_id)
            # Unknown relation names pass through unchanged.
            ann_relation.type = relation_type_mapper.get(relation_type, relation_type)
            ann_relation.arg1 = temp_ann_entitiy_dict[entity1_idstr].id
            ann_relation.arg2 = temp_ann_entitiy_dict[entity2_idstr].id
            ann_relation_list.append(ann_relation)

    # Dir to save the ids of the labels that are created for display only,
    # which should be ignored when resolving the final ann data.
    label_inUse_dir = os.path.join(config.output.for_ann_dir, doc_info["data_source"], "label_in_use", doc_info["data_split"])
    os.makedirs(label_inUse_dir, exist_ok=True)

    with open(os.path.join(output_dir, output_file_name+".ann"), "w", encoding="utf-8") as f1, \
        open(os.path.join(label_inUse_dir, output_file_name+".txt"), "w", encoding="utf-8") as f2:
        for label_obj in ann_entitiy_list+ann_relation_list:
            f1.write(label_obj.get_ann_str())
            f2.write(str(label_obj.id)+"\n")

## Convert train/dev.json to brat format

The train and dev sets were annotated by a single annotator.

In [42]:
# Map RadGraph label names to the type names written into the brat .ann files.
entity_type_mapper: dict[str, str] = {"ANAT-DP": "Anatomy", "OBS-DP": "Observation-Present", "OBS-DA": "Observation-Absent", "OBS-U": "Observation-Uncertain"}
relation_type_mapper: dict[str, str] = {"modify": "modify", "suggestive_of": "suggestive_of", "located_at": "located_at"}


for input_file_path in [config.input.train_file, config.input.dev_file]:
    # json.load parses directly from the file handle.
    with open(input_file_path, "r", encoding="utf-8") as f:
        data_dict = json.load(f)

    for doc_id, doc_info in data_dict.items():
        # Remove only a trailing ".txt" extension. str.strip(".txt") would instead
        # strip every leading/trailing '.', 't' or 'x' character and could mangle ids
        # that start or end with those letters (e.g. an id beginning with "train").
        output_file_name = f'{doc_info["data_source"]}_{doc_id.replace("/","_").removesuffix(".txt")}'

        output_dir = os.path.join(config.output.for_ann_dir, doc_info["data_source"], "brat_data", doc_info["data_split"])
        os.makedirs(output_dir, exist_ok=True)

        # We show the unlabelled text followed by one labelled copy in the same file,
        # so the annotation positions are shifted by one copy of the text plus the
        # "\n\n" separator.
        doc_tokens_list = doc_info["text"].split(" ")
        copy_offset = len(doc_info["text"]) + 2
        doc_tokens_pos_list = [i + copy_offset for i in get_pos_list(doc_tokens_list)]

        # Create txt file: unlabelled copy, then the labelled copy.
        with open(os.path.join(output_dir, output_file_name+".txt"), "w", encoding="utf-8") as f:
            f.write(doc_info["text"]+"\n\n"+doc_info["text"])

        # Create ann file
        ann_entitiy_list: list[AnnEntityClass] = []
        ann_relation_list: list[AnnRelationClass] = []

        # Ids restart at 1 for every document.
        entity_incremental_id = itertools.count(1)
        relation_incremental_id = itertools.count(1)

        # Temporarily buffer the raw relations, since an entity may link forward
        # to another entity that has not been recorded yet.
        # (A, modify, B) == A -> modify -> B
        temp_ann_entitiy_dict: dict[str, AnnEntityClass] = {}
        temp_raw_relation_list: list[tuple[str, str, str]] = []

        entities_dict = doc_info["entities"]
        for entity_id, entity_info in entities_dict.items():
            ann_entity = AnnEntityClass(entity_incremental_id)
            ann_entity.type = entity_type_mapper[entity_info["label"]]
            ann_entity.token_str = entity_info["tokens"]
            # Character span: start of the first token plus the length of the entity text.
            ann_entity.start_index = doc_tokens_pos_list[entity_info["start_ix"]]
            ann_entity.end_index = ann_entity.start_index + len(ann_entity.token_str)
            for relation in entity_info["relations"]:
                temp_raw_relation_list.append((entity_id, relation[0], relation[1]))
            temp_ann_entitiy_dict[entity_id] = ann_entity
        ann_entitiy_list.extend(temp_ann_entitiy_dict.values())

        # All entities are known now, so relations can be resolved to brat ids.
        for entity1_idstr, relation_type, entity2_idstr in temp_raw_relation_list:
            ann_relation = AnnRelationClass(relation_incremental_id)
            # Unknown relation names pass through unchanged.
            ann_relation.type = relation_type_mapper.get(relation_type, relation_type)
            ann_relation.arg1 = temp_ann_entitiy_dict[entity1_idstr].id
            ann_relation.arg2 = temp_ann_entitiy_dict[entity2_idstr].id
            ann_relation_list.append(ann_relation)

        # Dir to save the ids of the labels that are created for display only,
        # which should be ignored when resolving the final ann data.
        label_inUse_dir = os.path.join(config.output.for_ann_dir, doc_info["data_source"], "label_in_use", doc_info["data_split"])
        os.makedirs(label_inUse_dir, exist_ok=True)

        with open(os.path.join(output_dir, output_file_name+".ann"), "w", encoding="utf-8") as f1, \
            open(os.path.join(label_inUse_dir, output_file_name+".txt"), "w", encoding="utf-8") as f2:
            for label_obj in ann_entitiy_list+ann_relation_list:
                f1.write(label_obj.get_ann_str())
                f2.write(str(label_obj.id)+"\n")