# baseline_transcript_retrieval.ipynb

### This file contains the transcript-copy and IR retrieval baselines.

In [None]:
# Root folders, given relative to the directory the notebook is run from.
RESOURCE_DIR = './../resources'
DATA_DIR = './../data'

# Evaluation splits; each name is formatted into
# "{DATA_DIR}/challenge_data_json/{name}.json" by the baseline cells.
testsets = [
    "valid",
    "clinicalnlp_taskB_test1",
    "clinicalnlp_taskC_test2",
    "clef_taskC_test3",
]

# retrieval-based: spacy sentence similarity

In [None]:
import os

# Default to GPU index 1, but keep any device selection the user already
# exported before starting the kernel instead of overwriting it.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

In [None]:
#spacy similarity
import spacy
import json
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np
import os
from tqdm import tqdm

# spaCy pipeline whose Doc.similarity() ranks the training transcripts.
nlp = spacy.load("en_core_web_md")

train_file = "{}/challenge_data_json/train.json".format(DATA_DIR)
# Read inside a context manager so the file handle is closed right away.
with open(train_file, encoding="utf-8") as f:
    data = json.load(f)["data"]

# Parse every training transcript once; the parsed docs are reused for all test sets.
source = [nlp(dic["src"]) for dic in data]

for testset in testsets:
    valid_file = "{}/challenge_data_json/{}.json".format(DATA_DIR, testset)
    with open(valid_file, encoding="utf-8") as f:
        tgt = json.load(f)["data"]

    out = []
    for dic in tgt:
        embedding = nlp(dic["src"])
        similarity = [embedding.similarity(d) for d in source]
        # Most similar training transcript (first one wins on ties); its
        # target note is emitted as the prediction.
        index = np.argmax(similarity)
        out.append({"source": dic["src"], "true": dic["tgt"], "pred": data[index]["tgt"]})

    dirt = f"experiments/spacy_similarity_{testset}/"
    # makedirs also creates the parent "experiments/" folder and tolerates re-runs;
    # a plain os.mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(dirt, exist_ok=True)
    with open(dirt + "prediction.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

# retrieval-based: UMLS similarity

In [None]:
#UMLS similarity
import spacy
import json
import numpy as np
import os
from tqdm import tqdm
# Path handed to QuickUMLS as its first argument, located under RESOURCE_DIR.
quickumls_fp = f"{RESOURCE_DIR}/des/"
# Passed as the `window` argument when the QuickUMLS matcher is constructed.
WINDOW_SIZE = 5
# NOTE(review): COUNT_THRESHOLD and ENCODING are not referenced by the code
# visible in this notebook -- confirm whether they are still needed.
COUNT_THRESHOLD = 50
ENCODING = "utf-8"
from semantics import SEMANTICS
def get_matches(text,use_umls=True):
    """Extract concepts from ``text``.

    With ``use_umls=True`` the module-level ``matcher`` is queried and every
    match dict contributes its ``'cui'`` under its ``'term'``. Otherwise the
    module-level ``nlp`` pipeline is run and every entity contributes its raw
    text under the key ``(lower-cased text, label)``.

    Returns:
        (concepts, cui_list): ``concepts`` maps each key to the list of
        distinct values recorded for it, in first-seen order; ``cui_list``
        holds every recorded value in the order it was added (the same value
        can appear more than once when it was recorded under different keys).
    """
    concepts = {}
    cui_list = []

    def _record(key, value):
        # A value is stored at most once per key; each stored value is also
        # appended to the flat list.
        bucket = concepts.setdefault(key, [])
        if value not in bucket:
            bucket.append(value)
            cui_list.append(value)

    if not use_umls:
        for ent in nlp(text).ents:
            _record((ent.text.lower(), ent.label_), ent.text)
        return concepts, cui_list

    for candidates in matcher.match(text, ignore_syntax=True):
        for candidate in candidates:
            _record(candidate['term'], candidate['cui'])
    return concepts, cui_list

from quickumls import QuickUMLS
# QuickUMLS matcher over the index at quickumls_fp, restricted to the semantic
# types listed in SEMANTICS. threshold=1 is the value passed as QuickUMLS's
# similarity threshold (presumably exact matches only -- confirm against the
# QuickUMLS documentation).
matcher = QuickUMLS(quickumls_fp, window=WINDOW_SIZE, threshold=1, accepted_semtypes=SEMANTICS)

train_file = "{}/challenge_data_json/train.json".format(DATA_DIR)
# Read inside a context manager so the file handle is closed right away.
with open(train_file, encoding="utf-8") as f:
    data = json.load(f)["data"]

# Raw training transcripts; concept extraction happens at scoring time.
source = [dic['src'] for dic in data]
def umls_score_individual(reference,prediction,use_umls=True):
    """Concept-overlap F1 between two texts.

    Concepts are extracted from both texts with ``get_matches``. A reference
    concept counts as matched when at least one of its identifiers appears
    among the identifiers extracted from ``prediction``.

    Args:
        reference: text whose concepts are treated as the gold set.
        prediction: text whose concepts are scored against the reference.
        use_umls: forwarded unchanged to ``get_matches``.

    Returns:
        The F1 of precision (matched / number of prediction concepts) and
        recall (matched / number of reference concepts), or 0.0 when either
        text yields no concepts or nothing matches.
    """
    true_concept, _ = get_matches(reference, use_umls)
    pred_concept, pred_cuis = get_matches(prediction, use_umls)

    # Explicit guards replace the former bare ``except:``, which hid real
    # errors (and KeyboardInterrupt) behind a silent score of 0.
    if not true_concept or not pred_concept:
        return 0.0

    pred_cui_set = set(pred_cuis)
    num_t = sum(
        1
        for cuis in true_concept.values()
        if any(cui in pred_cui_set for cui in cuis)
    )
    if num_t == 0:
        # precision + recall would be 0, so the F1 is defined as 0 here.
        return 0.0

    precision = num_t / len(pred_concept)
    recall = num_t / len(true_concept)
    return 2 * (precision * recall) / (precision + recall)
for testset in testsets:
    valid_file = "{}/challenge_data_json/{}.json".format(DATA_DIR, testset)
    # Read inside a context manager so the file handle is closed right away.
    with open(valid_file, encoding="utf-8") as f:
        tgt = json.load(f)["data"]

    out = []
    for dic in tqdm(tgt):
        # Score the test transcript against every training transcript.
        # NOTE: umls_score_individual extracts concepts from both texts on every call.
        similarity = [umls_score_individual(reference, dic['src'], use_umls=True) for reference in source]
        # Best-scoring training transcript (first one wins on ties); its
        # target note is emitted as the prediction.
        index = np.argmax(similarity)
        out.append({"source": dic["src"], "true": dic["tgt"], "pred": data[index]["tgt"]})

    dirt = f"experiments/UMLS_similarity_{testset}/"
    # makedirs also creates the parent "experiments/" folder and tolerates re-runs;
    # a plain os.mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(dirt, exist_ok=True)
    with open(dirt + "prediction.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

# transcript as baseline

In [4]:
#transcript as baseline
import json
import os
for testset in testsets:
    valid_file = "{}/challenge_data_json/{}.json".format(DATA_DIR, testset)
    # Read inside a context manager so the file handle is closed right away.
    with open(valid_file, encoding="utf-8") as f:
        tgt = json.load(f)["data"]

    # The prediction is simply the unmodified source transcript.
    out = [{"source": dic["src"], "true": dic["tgt"], "pred": dic["src"]} for dic in tgt]

    dirt = f"experiments/transcript_{testset}/"
    # makedirs also creates the parent "experiments/" folder and tolerates re-runs;
    # a plain os.mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(dirt, exist_ok=True)
    with open(dirt + "prediction.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

# part of transcript as baseline

In [5]:
#first 2 doctor turns and the last 10 doctor turns
import json
import os


for testset in testsets:
    valid_file = "{}/challenge_data_json/{}.json".format(DATA_DIR, testset)
    # Read inside a context manager so the file handle is closed right away.
    with open(valid_file, encoding="utf-8") as f:
        tgt = json.load(f)["data"]

    out = []
    for dic in tgt:
        # Treat guest speakers as the patient, then cut the transcript at every [doctor] tag.
        doctors = dic["src"].replace("[patient_guest]", "[patient]").split("[doctor]")

        # Keep only the text up to the next [patient] tag of each non-empty piece.
        # NOTE(review): text before the first [doctor] tag is kept as a
        # (possibly empty) turn as well.
        doctors = [d.split("[patient]")[0] for d in doctors if d]
        if len(doctors) > 12:
            # First 2 and last 10 turns.
            to_include = doctors[:2] + doctors[-10:]
        else:
            to_include = doctors
        assert len(to_include) <= 12
        out.append({"source": dic["src"], "true": dic["tgt"], "pred": "\n".join(to_include)})

    dirt = f"experiments/12_doctor_turns_{testset}/"
    # makedirs also creates the parent "experiments/" folder and tolerates re-runs;
    # a plain os.mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(dirt, exist_ok=True)
    with open(dirt + "prediction.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

In [6]:
#first 2 speaker turns and the last 10 speaker turns (all speakers, not only the doctor)
import json
import os


for testset in testsets:
    valid_file = "{}/challenge_data_json/{}.json".format(DATA_DIR, testset)
    # Read inside a context manager so the file handle is closed right away.
    with open(valid_file, encoding="utf-8") as f:
        tgt = json.load(f)["data"]

    out = []
    for dic in tgt:
        transcript = dic["src"].replace("[patient_guest]", "[patient]")
        # NOTE: splitting on every "[" also cuts at brackets that are not speaker tags.
        pieces = transcript.split("[")
        # pieces[0] is whatever precedes the first "["; every later piece was
        # preceded by a "[", so that bracket is put back. Previously the turns
        # were re-joined with "[", which dropped the opening bracket of the
        # first tag whenever the transcript started with one ("doctor] ...").
        doctors = ([pieces[0]] if pieces[0] else []) + ["[" + p for p in pieces[1:] if p]

        if len(doctors) > 12:
            # First 2 and last 10 turns.
            to_include = doctors[:2] + doctors[-10:]
        else:
            to_include = doctors
        assert len(to_include) <= 12
        out.append({"source": dic["src"], "true": dic["tgt"], "pred": "".join(to_include)})

    dirt = f"experiments/12_speaker_turns_{testset}/"
    # makedirs also creates the parent "experiments/" folder and tolerates re-runs;
    # a plain os.mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(dirt, exist_ok=True)
    with open(dirt + "prediction.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

In [7]:
#longest doctor turn (by whitespace-separated word count)
import json
import os


for testset in testsets:
    valid_file = "{}/challenge_data_json/{}.json".format(DATA_DIR, testset)
    # Read inside a context manager so the file handle is closed right away.
    with open(valid_file, encoding="utf-8") as f:
        tgt = json.load(f)["data"]

    out = []
    for dic in tgt:
        # Treat guest speakers as the patient, cut at every [doctor] tag, and
        # keep only the text up to the next [patient] tag of each non-empty piece.
        doctors = dic["src"].replace("[patient_guest]", "[patient]").split("[doctor]")
        doctors = [d.split("[patient]")[0] for d in doctors if d]

        # Reset per transcript: without this, a transcript with no non-empty
        # doctor turn reused the previous transcript's turn (or raised
        # NameError on the very first one).
        to_include = ""
        current_length = 0
        for d in doctors:
            # Strict ">" keeps the earliest turn among equally long ones.
            if len(d.split()) > current_length:
                to_include = d
                current_length = len(d.split())
        out.append({"source": dic["src"], "true": dic["tgt"], "pred": to_include})

    dirt = f"experiments/longest_doctor_turn_{testset}/"
    # makedirs also creates the parent "experiments/" folder and tolerates re-runs;
    # a plain os.mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(dirt, exist_ok=True)
    with open(dirt + "prediction.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

In [None]:
#longest speaker turn (doctor or patient)
import json
import os

for testset in testsets:
    valid_file = "{}/challenge_data_json/{}.json".format(DATA_DIR, testset)
    # Read inside a context manager so the file handle is closed right away.
    with open(valid_file, encoding="utf-8") as f:
        tgt = json.load(f)["data"]

    out = []
    for dic in tgt:
        # Map every speaker tag onto [doctor] so a single split yields one
        # piece per turn, whoever is speaking.
        doctors = (
            dic["src"]
            .replace("[patient_guest]", "[patient]")
            .replace("[patient]", "[doctor]")
            .split("[doctor]")
        )

        # Reset per transcript: without this, a transcript whose turns are all
        # empty reused the previous transcript's turn (or raised NameError on
        # the very first one).
        to_include = ""
        current_length = 0
        for d in doctors:
            # Strict ">" keeps the earliest turn among equally long ones.
            if len(d.split()) > current_length:
                to_include = d
                current_length = len(d.split())
        out.append({"source": dic["src"], "true": dic["tgt"], "pred": to_include})

    dirt = f"experiments/longest_speaker_turn_{testset}/"
    # makedirs also creates the parent "experiments/" folder and tolerates re-runs;
    # a plain os.mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(dirt, exist_ok=True)
    with open(dirt + "prediction.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)