In [1]:
# Input/output locations used by the cells below.
# TSV of synonym terms; later cells read its "snomed_code" and "term" columns.
synonyms_path = "./all_gazetteer_terms_from_umls.tsv"
# Directory that is scanned for the per-split text files and their ".ann" annotation files.
train_split_path = "path_to_dataset_split_into_sentences"
# Directory the augmented "a_<name>.txt" / "a_<name>.ann" files are written to.
output_path = "output_path"
# TSV of gold annotations; later cells read its "filename", "text" and "code" columns.
gold_annotations_with_snomed_code_path = "./symptemist_tsv_train_subtask2.tsv" 
# NOTE(review): not referenced by any of the visible cells — confirm whether it is still needed.
gazetteer_path = "./symptemist_gazetter_snomed_ES_v2.tsv" 

In [2]:
import os
import pandas as pd

In [3]:
# Load the synonym table with the SNOMED code column parsed as text from the start.
# Casting after the read is too late: if pandas infers the column as numeric and any
# cell is missing, the values become floats ("123.0", with precision loss on long
# identifiers) and no longer match the codes of the gold annotations.
synonyms = pd.read_csv(
    synonyms_path,
    sep="\t",
    encoding="utf8",
    dtype={"snomed_code": "string"},
)

In [4]:
# Load the gold annotations with the code column parsed as text from the start.
# Casting after the read is too late: if pandas infers the column as numeric and any
# cell is missing, the values become floats ("123.0", with precision loss on long
# identifiers) and no longer match the codes of the synonym table.
gold_annotations = pd.read_csv(
    gold_annotations_with_snomed_code_path,
    sep="\t",
    encoding="utf8",
    dtype={"code": "string"},
)

In [5]:
def is_empty_file(file_path):
    """Tell whether ``file_path`` is an existing regular file of zero bytes.

    Returns ``False`` for paths that do not exist or are not regular files
    (e.g. directories), and for files with any content.
    """
    if not os.path.isfile(file_path):
        return False
    return os.path.getsize(file_path) == 0

In [6]:
def get_term_snomed_code(term, source_report_file_name):
    """Look up the gold-annotation code of ``term`` within one source report.

    Rows of the module-level ``gold_annotations`` frame are matched on an exact
    "filename" and "text" comparison. The "code" of the first matching row is
    returned, or ``None`` when no row matches.
    """
    same_report = gold_annotations["filename"] == source_report_file_name
    same_term = gold_annotations["text"] == term
    matching_codes = gold_annotations.loc[same_report & same_term, "code"]
    return None if matching_codes.empty else matching_codes.iloc[0]

In [7]:
# Codes that were found in the gold annotations but have no row in the synonym table.
codes_without_synonyms = []
def get_synonyms(term, source_report_file_name):
    """Return every synonym-table term that shares the code of ``term``.

    The code is resolved through ``get_term_snomed_code`` for the given source
    report. An empty list is returned when the term has no gold annotation,
    when its code is missing or the "NO_CODE" placeholder, or when the synonym
    table has no row for the code; in that last case the code is also recorded
    in ``codes_without_synonyms``.
    """
    code = get_term_snomed_code(term, source_report_file_name)

    # Test for missingness first. A missing cell in a string-dtype column is
    # pd.NA, and `pd.NA == "NO_CODE"` is itself NA, whose truth value is
    # ambiguous and raises TypeError. pd.isna also covers None.
    if pd.isna(code) or code == "NO_CODE":
        return []

    matching_terms = synonyms.loc[synonyms["snomed_code"] == code, "term"]

    if matching_terms.empty:
        codes_without_synonyms.append(code)
        return []

    return matching_terms.tolist()

In [8]:
def insert_string_at_position(original_string, insertion, position):
    """Return ``original_string`` with ``insertion`` spliced in at ``position``.

    ``position`` is used as a slice boundary, so values past the end append
    and negative values count from the end of the string.
    """
    head, tail = original_string[:position], original_string[position:]
    return head + insertion + tail

In [9]:
def get_start_pos(row, src_text):
    """Return the index of the first occurrence of ``row["text"]`` in ``src_text``.

    Follows ``str.find`` semantics: -1 is returned when the text is absent.
    """
    needle = row["text"]
    return src_text.find(needle)

def get_end_pos(row):
    """Return the exclusive end offset: ``row["start_pos"]`` plus the length of ``row["text"]``."""
    begin = row["start_pos"]
    length = len(row["text"])
    return begin + length

In [10]:
import random

# Synonym-replacement augmentation.
#
# For every "<name>.txt" file in train_split_path that has a non-empty "<name>.ann"
# companion, one randomly chosen annotation with more than one synonym is replaced
# by a randomly chosen synonym. The augmented text and the re-aligned annotations
# are written to output_path as "a_<name>.txt" / "a_<name>.ann".

# Base names of files that were left un-augmented because none of their
# annotations had more than one synonym.
unchanged_files = []

all_annotations = []

# Make sure the destination exists so the first write does not fail.
os.makedirs(output_path, exist_ok=True)

for file_name in os.listdir(train_split_path):
    # Each ".txt" file drives one iteration; its ".ann" companion is opened
    # explicitly below, so every other directory entry is skipped.
    if not file_name.endswith(".txt"):
        continue

    # Slice the extension off. str.rstrip(".txt") strips any trailing run of the
    # characters '.', 't' and 'x', so it would also eat the end of stems such
    # as "...-text".
    file_name_no_ext = file_name[:-len(".txt")]
    source_report_file_name = file_name_no_ext.split("-b-")[0]
    annotations_file_path = os.path.join(train_split_path, f"{file_name_no_ext}.ann")
    augmentation_file_name_no_ext = f"a_{file_name_no_ext}"

    if is_empty_file(annotations_file_path):
        continue

    annotations = pd.read_csv(
        annotations_file_path,
        sep="\t",
        names=["ann_type", "entity_type", "text"],
        encoding="utf-8",
    )

    # The second column holds "<entity type> <start> <end>"; split it once.
    span_fields = annotations["entity_type"].str.split()
    annotations["start_pos"] = span_fields.str[1].astype(int)
    annotations["end_pos"] = span_fields.str[2].astype(int)
    annotations["entity_type"] = span_fields.str[0]
    annotations["synonyms"] = [
        get_synonyms(text, source_report_file_name) for text in annotations["text"]
    ]
    annotations["has_multiple_synonyms"] = (
        annotations["synonyms"].map(lambda found: len(found) > 1).astype(bool)
    )

    candidate_annotations = annotations[annotations["has_multiple_synonyms"]]

    if candidate_annotations.empty:
        unchanged_files.append(file_name_no_ext)
        continue

    with open(os.path.join(train_split_path, file_name), "r", encoding="utf8") as og_file:
        report_text = og_file.read()

    # Pick one annotation at random, then one of its synonyms at random. The copy
    # makes the one-row frame safe to modify further down.
    annotation_to_replace = candidate_annotations.sample(1).copy()
    chosen = annotation_to_replace.iloc[0]
    chosen_index = annotation_to_replace.index[0]
    original_term = chosen["text"]
    replacement_term = random.choice(chosen["synonyms"])
    replaced_start = int(chosen["start_pos"])
    replaced_end = replaced_start + len(original_term)
    length_delta = len(replacement_term) - len(original_term)

    # Replace the annotated span itself. Removing the first textual occurrence of
    # the term instead would hit the wrong place whenever the term also appears
    # earlier in the text.
    report_text = report_text[:replaced_start] + replacement_term + report_text[replaced_end:]

    with open(os.path.join(output_path, f"{augmentation_file_name_no_ext}.txt"), "w+", encoding="utf8") as aug_file:
        aug_file.write(report_text)

    # Re-align the remaining annotations. The chosen row is removed by index, so
    # unrelated duplicate rows survive. Rows overlapping the replaced span are
    # dropped as well, because their surface text no longer exists verbatim.
    overlaps_replaced = (annotations["start_pos"] < replaced_end) & (annotations["end_pos"] > replaced_start)
    keep_row = (annotations.index != chosen_index) & ~overlaps_replaced
    result_annotations = annotations.loc[keep_row].copy()

    # Everything located after the replaced span moves by the length difference;
    # everything before it keeps its offsets.
    after_replaced = result_annotations["start_pos"] >= replaced_end
    result_annotations.loc[after_replaced, ["start_pos", "end_pos"]] += length_delta

    # Add the replacement annotation: same start, new text and end.
    annotation_to_replace["text"] = replacement_term
    annotation_to_replace["end_pos"] = replaced_start + len(replacement_term)
    result_annotations = pd.concat([result_annotations, annotation_to_replace], axis=0, ignore_index=True)

    # Rebuild the "<entity type> <start> <end>" column of the .ann format.
    result_annotations["entity_type_with_positions"] = (
        result_annotations["entity_type"]
        + " "
        + result_annotations["start_pos"].astype(str)
        + " "
        + result_annotations["end_pos"].astype(str)
    )
    result_annotations.to_csv(
        os.path.join(output_path, f"{augmentation_file_name_no_ext}.ann"),
        encoding="utf8",
        sep="\t",
        columns=["ann_type", "entity_type_with_positions", "text"],
        index=False,
        header=False,
    )

In [314]:
# Number of split files left un-augmented because none of their annotations had more than one synonym.
len(unchanged_files)

1038