# Preprocess data

## This file converts the original CSV data into JSON format and also tags the data with UMLS concepts.

In [None]:
# Root folder that holds the CSV inputs and receives the generated JSON files.
DATA_DIR = "../data/"
# Folder passed to QuickUMLS as its resource path.
RESOURCE_DIR = "../resources/des/"

In [None]:
# CONVERT CSV format to JSON format

import json
import pandas as pd
from glob import glob
import os

#dataset,id,doctor_name,doctor_gender,patient_gender,patient_age,patient_firstname,patient_familyname,cc,2nd_complaints,note,dialogue
#

def check_dir(dir_path):
    """Ensure that the directory ``dir_path`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        dir_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(dir_path, exist_ok=True)

def output_json_versions(all_files):
    """Write a JSON version of every non-metadata CSV file in ``all_files``.

    Paths containing ``"_metadata"`` are skipped; the remaining files are
    processed in sorted order. Each CSV row becomes one record of the form
    ``{"src": <dialogue>, "tgt": <note>, "file": "<id>-<dataset>"}``, where
    ``<id>`` is the ``encounter_id`` column when present and the row index
    otherwise. The records are dumped as ``{"data": [...]}``.

    The output path is derived from the input path by replacing
    ``src_experiment_data`` with ``src_experiment_data_json``,
    ``challenge_data`` with ``challenge_data_json`` and ``.csv`` with
    ``.json``. The output directory is not created here.

    Args:
        all_files: Iterable of CSV file paths. The CSVs must provide the
            ``dialogue``, ``note`` and ``dataset`` columns.
    """
    csv_files = sorted(path for path in all_files if "_metadata" not in path)
    for csv_path in csv_files:
        df = pd.read_csv(csv_path, encoding="utf-8")
        # The column check does not depend on the row, so do it once per file.
        has_encounter_id = "encounter_id" in df

        records = []
        for ind in df.index:
            record_id = df["encounter_id"][ind] if has_encounter_id else ind
            records.append({
                "src": df["dialogue"][ind],
                "tgt": df["note"][ind],
                # str() on both parts: ids parsed as numbers by pandas would
                # otherwise break the string concatenation.
                "file": str(record_id) + "-" + str(df["dataset"][ind]),
            })

        out_name = (
            csv_path.replace("src_experiment_data", "src_experiment_data_json")
            .replace("challenge_data", "challenge_data_json")
            .replace(".csv", ".json")
        )
        with open(out_name, "w", encoding="utf-8") as f:
            json.dump({"data": records}, f, indent=4)
        

# Convert both CSV collections to JSON: first the baseline experiment data,
# then the ablation experiment data. Each entry pairs the glob pattern of the
# CSV inputs with the directory that receives the JSON outputs.
for csv_pattern, json_dir in (
    ("%s/challenge_data/*.csv", "%s/challenge_data_json"),  # baseline
    ("%s/src_experiment_data/*.csv", "%s/src_experiment_data_json"),  # ablation
):
    all_files = glob(csv_pattern % DATA_DIR)
    # The output directory has to exist before the JSON files are written.
    check_dir(json_dir % DATA_DIR)
    output_json_versions(all_files)

# Label JSON data with UMLS concepts

This step is only needed for the ablation study, in which the UMLS concepts in the dialogue are labeled.

In [None]:
# Set up the fact-based extractor: a QuickUMLS matcher plus a spaCy pipeline.
import spacy
from quickumls import QuickUMLS

from semantics import SEMANTICS

# Resource path handed to QuickUMLS.
quickumls_fp = "%s" % RESOURCE_DIR

# the window size for transitioning (passed to QuickUMLS as ``window``)
WINDOW_SIZE = 5
# NOTE(review): COUNT_THRESHOLD and ENCODING are not referenced in the visible
# part of this file.
COUNT_THRESHOLD = 50
ENCODING = "utf-8"

# NOTE(review): ``nlp`` is loaded here but not used in the visible code.
nlp = spacy.load("en_ner_bc5cdr_md")

# Only concepts whose semantic type is listed in SEMANTICS are accepted.
# threshold=1 presumably restricts results to exact matches - confirm against
# the QuickUMLS documentation.
matcher = QuickUMLS(
    quickumls_fp,
    window=WINDOW_SIZE,
    threshold=1,
    accepted_semtypes=SEMANTICS,
)

In [None]:
def add_umls(text, umls_matcher=None):
    """Return ``text`` with every matched UMLS concept wrapped in brackets.

    The matcher is queried with ``ignore_syntax=True``. For each returned
    match group, the ``start``/``end`` offsets of its first candidate define
    the span to mark, so ``"has fever"`` becomes ``"has [fever]"``. Semantic
    types are not written to the output. When nothing is matched, ``text`` is
    returned unchanged.

    Args:
        text: The string to annotate.
        umls_matcher: Optional object exposing
            ``match(text, ignore_syntax=True)`` and returning an iterable of
            candidate lists whose first element has ``"start"`` and ``"end"``
            keys. Defaults to the module-level ``matcher``.

    Returns:
        The annotated string.
    """
    active_matcher = matcher if umls_matcher is None else umls_matcher
    matches = active_matcher.match(text, ignore_syntax=True)

    # Order the spans by (start, end) so the text can be rebuilt left to right.
    spans = sorted((match[0]["start"], match[0]["end"]) for match in matches)

    pieces = []
    cursor = 0
    for start, end in spans:
        pieces.append(text[cursor:start])  # untouched text before the span
        pieces.append("[{}]".format(text[start:end]))
        cursor = end
    # Remaining tail; with no spans at all this is simply the whole text.
    pieces.append(text[cursor:])
    return "".join(pieces)

import shutil
import json
from tqdm import tqdm

# Tag the "src" field of every record in each challenge split with UMLS
# concepts and write the result next to the input JSON file.
for dataset in ["train","valid","clinicalnlp_taskB_test1","clinicalnlp_taskC_test2","clef_taskC_test3"]:
    file = "%s/challenge_data_json/" % DATA_DIR + "{}.json".format(dataset)
    # A context manager makes sure the input handle is closed again.
    with open(file, encoding="utf-8") as f:
        dic = json.load(f)["data"]

    for entry in tqdm(dic):
        entry["src"] = add_umls(entry["src"])

    # NOTE(review): both output files receive identical content - confirm
    # that this is intended.
    for suffix in ("_UMLS.json", "_UMLS_full.json"):
        with open(file.replace(".json", suffix), "w", encoding="utf-8") as f:
            json.dump({"data": dic}, f, indent=4)