# Prepare the data for annotation

We are annotating the same data as RadGraph.

Prepare:
1. Set `radgraph_root_dir` to the downloaded RadGraph root dir
2. Run the script

Outputs:
1. `brat_data/..` contains the files for the BRAT Annotation Tool; the relevant config files are in `graph_annotation_process/brat_config`.
2. `label_in_use/..` contains the files that store the IDs of the labels that already exist before annotation. We use these to tell which labels were created during the annotation process.

In [15]:
# Root directory of the downloaded RadGraph dataset; edit this to match the local machine.
radgraph_root_dir = "/Users/liao/Desktop/RadGraph/radgraph-extracting-clinical-entities-and-relations-from-radiology-reports-1.0.0"
# Root directory under which the converted BRAT files and label-id files are written.
output_root_dir = "./outputs/to_be_annotated"

In [16]:
import os
import json
import itertools

In [17]:
def get_pos_list(doc_tokens_list):
    """Return the start character offset of every token.

    The offsets assume the tokens are joined by exactly one separator
    character, so each token starts one position after the previous token
    ends. An empty input yields an empty list.
    """
    # starts[-1] always holds the offset at which the *next* token would begin.
    starts = [0]
    for token in doc_tokens_list:
        starts.append(starts[-1] + len(token) + 1)
    # The final element is the offset after the last token, not a token start.
    return starts[:-1]


In [18]:
class AnnEntityClass:
    """One entity (text-bound) line of a BRAT ``.ann`` file.

    The id is ``T`` followed by the next value drawn from the iterator passed
    to the constructor; all other fields start as empty strings and are
    filled in by the caller.
    """

    def __init__(self,incremental_id) -> None:
        self.id = f"T{next(incremental_id)}"
        # Placeholders until the caller assigns the real values.
        self.label = self.start_index = self.end_index = self.token_str = ""

    def get_ann_str(self) -> str:
        """Return the line ``<id>\\t<label> <start> <end>\\t<tokens>\\n``."""
        span_field = " ".join(
            (f"{self.label}", f"{self.start_index}", f"{self.end_index}")
        )
        columns = (f"{self.id}", span_field, f"{self.token_str}")
        return "\t".join(columns) + "\n"

    def __repr__(self) -> str:
        """Use the ``.ann`` line as the debug representation."""
        return self.get_ann_str()

    def __str__(self) -> str:
        """Use the ``.ann`` line as the string form."""
        return self.get_ann_str()


class AnnRelationClass:
    """One relation line of a BRAT ``.ann`` file.

    The id is ``R`` followed by the next value drawn from the iterator passed
    to the constructor. ``label``, ``arg1`` and ``arg2`` start as empty strings
    and are filled in by the caller; ``arg1``/``arg2`` hold the ids of the
    entities the relation links.
    """

    # The id counter is always supplied by the caller, so the class keeps no
    # counter of its own.

    def __init__(self,incremental_id) -> None:
        self.id = f"R{next(incremental_id)}"
        self.label = ""
        self.arg1 = ""
        self.arg2 = ""

    def get_ann_str(self) -> str:
        """Return the line ``<id>\\t<label> Arg1:<arg1> Arg2:<arg2>\\t\\n``."""
        return f"{self.id}\t{self.label} Arg1:{self.arg1} Arg2:{self.arg2}\t\n"

    def __repr__(self) -> str:
        """Use the ``.ann`` line as the debug representation."""
        return self.get_ann_str()

    def __str__(self) -> str:
        """Use the ``.ann`` line as the string form."""
        return self.get_ann_str()


## Convert test.json to brat format

The test split has two annotators (`labeler_1` and `labeler_2`).

In [19]:
# Load the RadGraph test split as a dict.
input_test_path = os.path.join(radgraph_root_dir, "test.json")

# json.load parses straight from the file handle; there is no need to read the
# lines and join them back together first.
with open(input_test_path,"r",encoding="utf-8") as f:
    data_dict = json.load(f)

In [20]:
# Map RadGraph labels to the label names written to the BRAT .ann files.
entity_label_mapper: dict[str, str] = {"ANAT-DP": "Anatomy", "OBS-DP": "Observation-Present", "OBS-DA": "Observation-Absent", "OBS-U": "Observation-Uncertain"}
relation_label_mapper: dict[str, str] = {"modify": "modify", "suggestive_of": "suggestive_of", "located_at": "located_at"}


for doc_id, doc_info in data_dict.items():
    # Drop only a literal ".txt" extension. str.strip(".txt") would instead strip
    # every leading/trailing ".", "t" and "x" character and could mangle the name.
    output_file_name = f'{doc_info["data_source"]}_{doc_id.replace("/","_").removesuffix(".txt")}'

    output_dir = os.path.join(output_root_dir, doc_info["data_source"], "brat_data", doc_info["data_split"])
    os.makedirs(output_dir, exist_ok=True)

    # We want to show the unlabelled data and the two labelers' results in the same file.
    # The .txt file holds three copies of the text separated by "\n\n", so each
    # labeler's token positions are shifted by len(text) + 2 per preceding copy.
    doc_tokens_list = doc_info["text"].split(" ")
    doc_tokens_pos_list0 = get_pos_list(doc_tokens_list)
    doc_tokens_pos_list1 = [i+len(doc_info["text"])+2 for i in doc_tokens_pos_list0]
    doc_tokens_pos_list2 = [i+len(doc_info["text"])+2 for i in doc_tokens_pos_list1]
    pos_list_mapper:dict[str, list] = {"labeler_1":doc_tokens_pos_list1, "labeler_2":doc_tokens_pos_list2}

    # Create txt file
    with open(os.path.join(output_dir, output_file_name+".txt"), "w", encoding="utf-8") as f:
        f.write(doc_info["text"]+"\n\n"+doc_info["text"]+"\n\n"+doc_info["text"])

    # Create ann file
    ann_entitiy_list: list[AnnEntityClass] = []
    ann_relation_list: list[AnnRelationClass] = []

    # Ids are numbered continuously across both labelers within one document.
    entity_incremental_id = itertools.count(1)
    relation_incremental_id = itertools.count(1)
    for labeler_id, doc_tokens_pos_list in pos_list_mapper.items():
        # Temporarily save the raw relations, since one entity may link forward to
        # another entity that has not been recorded yet.
        # (A, modify, B) == A -> modify -> B
        temp_ann_entitiy_dict: dict[str,AnnEntityClass] = {}
        temp_raw_relation_list: list[tuple[str,str,str]] = []

        entities_dict = doc_info[labeler_id]["entities"]
        for entity_id, entity_info in entities_dict.items():
            ann_entity = AnnEntityClass(entity_incremental_id)
            ann_entity.label = entity_label_mapper[entity_info["label"]]
            ann_entity.token_str = entity_info["tokens"]
            ann_entity.start_index = doc_tokens_pos_list[entity_info["start_ix"]]
            ann_entity.end_index = ann_entity.start_index + len(ann_entity.token_str)
            for relation in entity_info["relations"]:
                temp_raw_relation_list.append((entity_id, relation[0], relation[1]))
            temp_ann_entitiy_dict[entity_id] = ann_entity
        ann_entitiy_list.extend(temp_ann_entitiy_dict.values())

        # All entities of this labeler are known now, so the relations can be resolved.
        for entity1_idstr, relation_label, entity2_idstr in temp_raw_relation_list:
            ann_relation = AnnRelationClass(relation_incremental_id)
            # Apply the relation mapper; labels it does not list pass through unchanged.
            ann_relation.label = relation_label_mapper.get(relation_label, relation_label)
            ann_relation.arg1 = temp_ann_entitiy_dict[entity1_idstr].id
            ann_relation.arg2 = temp_ann_entitiy_dict[entity2_idstr].id
            ann_relation_list.append(ann_relation)

    # Dir to save the ids of the labels that are created for display only,
    # which should be ignored when resolving the final ann data.
    label_inUse_dir = os.path.join(output_root_dir, doc_info["data_source"], "label_in_use", doc_info["data_split"])
    os.makedirs(label_inUse_dir, exist_ok=True)

    # Write the .ann file and, in parallel, the list of ids it contains.
    with open(os.path.join(output_dir, output_file_name+".ann"), "w", encoding="utf-8") as f1, \
        open(os.path.join(label_inUse_dir, output_file_name+".txt"), "w", encoding="utf-8") as f2:
        for label_obj in ann_entitiy_list+ann_relation_list:
            f1.write(label_obj.get_ann_str())
            f2.write(str(label_obj.id)+"\n")

## Convert train/dev.json to brat format

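The train/dev splits have one annotator. The first copy of the text is not pre-annotated; the existing labels are shown on the second copy only.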

In [21]:
# Map RadGraph labels to the label names written to the BRAT .ann files.
entity_label_mapper: dict[str, str] = {"ANAT-DP": "Anatomy", "OBS-DP": "Observation-Present", "OBS-DA": "Observation-Absent", "OBS-U": "Observation-Uncertain"}
relation_label_mapper: dict[str, str] = {"modify": "modify", "suggestive_of": "suggestive_of", "located_at": "located_at"}

input_dev_path = os.path.join(radgraph_root_dir, "dev.json")
input_train_path = os.path.join(radgraph_root_dir, "train.json")

for input_file_path in [input_train_path, input_dev_path]:
    # json.load parses straight from the file handle.
    with open(input_file_path,"r",encoding="utf-8") as f:
        data_dict = json.load(f)

    for doc_id, doc_info in data_dict.items():
        # Drop only a literal ".txt" extension. str.strip(".txt") would instead strip
        # every leading/trailing ".", "t" and "x" character and could mangle the name.
        output_file_name = f'{doc_info["data_source"]}_{doc_id.replace("/","_").removesuffix(".txt")}'

        output_dir = os.path.join(output_root_dir, doc_info["data_source"], "brat_data", doc_info["data_split"])
        os.makedirs(output_dir, exist_ok=True)

        # The .txt file holds two copies of the text separated by "\n\n". The labels
        # are placed on the second copy, so shift every token position by len(text) + 2.
        doc_tokens_list = doc_info["text"].split(" ")
        doc_tokens_pos_list = [i+len(doc_info["text"])+2 for i in get_pos_list(doc_tokens_list)]

        # Create txt file
        with open(os.path.join(output_dir, output_file_name+".txt"), "w", encoding="utf-8") as f:
            f.write(doc_info["text"]+"\n\n"+doc_info["text"])

        # Create ann file
        ann_entitiy_list: list[AnnEntityClass] = []
        ann_relation_list: list[AnnRelationClass] = []

        entity_incremental_id = itertools.count(1)
        relation_incremental_id = itertools.count(1)

        # Temporarily save the raw relations, since one entity may link forward to
        # another entity that has not been recorded yet.
        # (A, modify, B) == A -> modify -> B
        temp_ann_entitiy_dict: dict[str,AnnEntityClass] = {}
        temp_raw_relation_list: list[tuple[str,str,str]] = []

        entities_dict = doc_info["entities"]
        for entity_id, entity_info in entities_dict.items():
            ann_entity = AnnEntityClass(entity_incremental_id)
            ann_entity.label = entity_label_mapper[entity_info["label"]]
            ann_entity.token_str = entity_info["tokens"]
            ann_entity.start_index = doc_tokens_pos_list[entity_info["start_ix"]]
            ann_entity.end_index = ann_entity.start_index + len(ann_entity.token_str)
            for relation in entity_info["relations"]:
                temp_raw_relation_list.append((entity_id, relation[0], relation[1]))
            temp_ann_entitiy_dict[entity_id] = ann_entity
        ann_entitiy_list.extend(temp_ann_entitiy_dict.values())

        # All entities are known now, so the relations can be resolved.
        for entity1_idstr, relation_label, entity2_idstr in temp_raw_relation_list:
            ann_relation = AnnRelationClass(relation_incremental_id)
            # Apply the relation mapper; labels it does not list pass through unchanged.
            ann_relation.label = relation_label_mapper.get(relation_label, relation_label)
            ann_relation.arg1 = temp_ann_entitiy_dict[entity1_idstr].id
            ann_relation.arg2 = temp_ann_entitiy_dict[entity2_idstr].id
            ann_relation_list.append(ann_relation)

        # Dir to save the ids of the labels that are created for display only,
        # which should be ignored when resolving the final ann data.
        label_inUse_dir = os.path.join(output_root_dir, doc_info["data_source"], "label_in_use", doc_info["data_split"])
        os.makedirs(label_inUse_dir, exist_ok=True)

        # Write the .ann file and, in parallel, the list of ids it contains.
        with open(os.path.join(output_dir, output_file_name+".ann"), "w", encoding="utf-8") as f1, \
            open(os.path.join(label_inUse_dir, output_file_name+".txt"), "w", encoding="utf-8") as f2:
            for label_obj in ann_entitiy_list+ann_relation_list:
                f1.write(label_obj.get_ann_str())
                f2.write(str(label_obj.id)+"\n")

One annotator, with the pre-annotated labels repeated on both copies of the text (this variant is currently commented out).

In [22]:
# entity_label_mapper: dict[str, str] = {"ANAT-DP": "Anatomy", "OBS-DP": "Observation-Present", "OBS-DA": "Observation-Absent", "OBS-U": "Observation-Uncertain"}
# relation_label_mapper: dict[str, str] = {"modify": "modify", "suggestive_of": "suggestive_of", "located_at": "located_at"}


# input_dev_path = os.path.join(radgraph_root_dir, "dev.json")
# input_train_path = os.path.join(radgraph_root_dir, "train.json")

# for input_file_path in [input_train_path, input_dev_path]:
#     with open(input_file_path,"r",encoding="utf-8") as f:
#         radgraph_test = f.readlines()
#         data_dict = json.loads("".join(radgraph_test))

#     for doc_id, doc_info in data_dict.items():
#         output_file_name = f'{doc_info["data_source"]}_{doc_id.replace("/","_").strip(".txt")}'
        
#         output_dir = os.path.join(output_root_dir, doc_info["data_source"], "brat_data", doc_info["data_split"])
#         if not os.path.exists(output_dir):
#             os.makedirs(output_dir)


#         # The text is written twice and the existing labels are placed on both copies
#         # ("remove" on the first copy, "keep" on the second), so the second copy's token pos must be offset.
#         doc_tokens_list = doc_info["text"].split(" ")
#         doc_tokens_pos_list0 = get_pos_list(doc_tokens_list)
#         doc_tokens_pos_list1 = [i+len(doc_info["text"])+2 for i in doc_tokens_pos_list0]
#         pos_list_mapper:dict[str, list] = {"keep":doc_tokens_pos_list1, "remove":doc_tokens_pos_list0}

#         # Create txt file
#         with open(os.path.join(output_dir, output_file_name+".txt"), "w", encoding="utf-8") as f:
#             f.write(doc_info["text"]+"\n\n"+doc_info["text"])

#         # For ann file
#         ann_entitiy_list: list[AnnEntityClass] = []
#         ann_relation_list: list[AnnRelationClass] = []
        
#         # For label_in_use txt file
#         labelinuse_ann_entitiy_list: list[AnnEntityClass] = []
#         labelinuse_ann_relation_list: list[AnnRelationClass] = []
        
#         entity_incremental_id = itertools.count(1)
#         relation_incremental_id = itertools.count(1)
#         for action, doc_tokens_pos_list in pos_list_mapper.items():
#             # Temporarily save the entity relations, since one entity may link forward to another entity that has not been recorded yet.
#             # (A, modify, B) == A -> modify -> B
#             temp_ann_entitiy_dict: dict[str,AnnEntityClass] = {}
#             temp_raw_relation_list: list[tuple[str,str,str]] = []

#             entities_dict = doc_info["entities"]
#             for entity_id, entity_info in entities_dict.items():
#                 ann_entity = AnnEntityClass(entity_incremental_id)
#                 ann_entity.label = entity_label_mapper[entity_info["label"]]
#                 ann_entity.token_str = entity_info["tokens"]
#                 ann_entity.start_index = doc_tokens_pos_list[entity_info["start_ix"]]
#                 ann_entity.end_index = ann_entity.start_index + len(ann_entity.token_str)
#                 for relation in entity_info["relations"]:
#                     temp_raw_relation_list.append((entity_id, relation[0], relation[1]))
#                 temp_ann_entitiy_dict[entity_id] = ann_entity
#             ann_entitiy_list.extend(temp_ann_entitiy_dict.values())
#             if action == "keep":
#                 labelinuse_ann_entitiy_list.extend(temp_ann_entitiy_dict.values())
            
#             for entity1_idstr, relation_label, entity2_idstr in temp_raw_relation_list:
#                 ann_relation = AnnRelationClass(relation_incremental_id)
#                 ann_relation.label = relation_label
#                 ann_relation.arg1 = temp_ann_entitiy_dict[entity1_idstr].id
#                 ann_relation.arg2 = temp_ann_entitiy_dict[entity2_idstr].id
#                 ann_relation_list.append(ann_relation)
#                 if action == "keep":
#                     labelinuse_ann_relation_list.append(ann_relation)
        
#         # Dir to save the ids of the labels that are created for display only,
#         # which should be ignored when resolving the final ann data.
#         label_inUse_dir = os.path.join(output_root_dir, doc_info["data_source"], "label_in_use", doc_info["data_split"])
#         if not os.path.exists(label_inUse_dir):
#             os.makedirs(label_inUse_dir)

#         with open(os.path.join(output_dir, output_file_name+".ann"), "w", encoding="utf-8") as f1:
#             for label_obj in ann_entitiy_list + ann_relation_list:
#                 f1.write(label_obj.get_ann_str())
                
#         with open(os.path.join(label_inUse_dir, output_file_name+".txt"), "w", encoding="utf-8") as f2:
#             for label_obj in labelinuse_ann_entitiy_list + labelinuse_ann_relation_list:
#                 f2.write(str(label_obj.id)+"\n")