# Post-process

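This notebook post-processes the experiment outputs: it removes unneeded model files, combines the division-based predictions, creates the bash script that calls the evaluation for each test file, and exports the results as tables.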

In [None]:
# remove all the model logs to save space
from glob import glob
import os
import shutil

# Inside every experiment folder, delete the "runs" logs and every
# sub-folder whose path does not mention "best_model".
for experiment_dir in glob("experiments/*"):
    runs_dir = experiment_dir + "/runs"
    if os.path.isdir(runs_dir):
        shutil.rmtree(runs_dir)
    for child in glob(experiment_dir + "/*"):
        if "best_model" in child or not os.path.isdir(child):
            continue
        shutil.rmtree(child)

# Delete the remaining weight files that are not part of a "best_model" path.
# NOTE(review): without recursive=True, "**" matches like "*", so only
# experiments/<dir>/*.bin is covered here -- confirm that depth is intended.
for weights_file in glob("experiments/**/*.bin"):
    if "best_model" in weights_file:
        continue
    os.remove(weights_file)

In [None]:
# combine predictions from division-based approaches
import json

# Header line written in front of a division when the model output does not
# contain it (used for the LED runs below).
sec_map = {
    "subjective": "chief complaint :",
    "objective_exam": "physical examination :",
    "objective_results": "results :",
    "assessment_and_plan": "ASSESSMENT AND PLAN",
}

# Divisions are concatenated in this order to form the full note.
division_types = ["subjective", "objective_exam", "objective_results", "assessment_and_plan"]

for testset in ["clinicalnlp_taskB_test1", "clinicalnlp_taskC_test2", "clef_taskC_test3"]:
    # The reference data does not depend on the model: read it once per test set.
    pred_dir = "../data/challenge_data_json/{}.json".format(testset)
    with open(pred_dir) as reference_file:
        old_pred = json.load(reference_file)["data"]

    # Each model appears once; listing one twice only rebuilds the same file.
    for model in ["bart-large-xsum-samsum", "BioBART", "BART_large", "LED", "LED_pubmed"]:
        out_dir = "experiments/{}_{}_division_combined".format(model, testset)
        os.makedirs(out_dir, exist_ok=True)

        # Start from the source/reference pairs with an empty prediction that
        # is filled division by division.
        pred = [{"source": p["src"], "true": p["tgt"], "pred": ""} for p in old_pred]

        for sec in division_types:
            # For BART-based models: fixed prediction file name.
            candidates = glob(
                "experiments/{}_{}_{}/prediction_{}_{}.json".format(model, testset, sec, testset, sec)
            )

            # For LED-based models: any JSON file with "prediction" in its path.
            if not candidates:
                candidates = [
                    file
                    for file in glob("experiments/{}_{}_{}/*.json".format(model, testset, sec))
                    if "prediction" in file
                ]

            if len(candidates) != 1:
                raise ValueError(
                    "expected exactly one prediction file for {}/{}/{}, found {}".format(
                        model, testset, sec, candidates
                    )
                )
            source_file = candidates[0]

            with open(source_file) as section_file:
                section = json.load(section_file)
            if len(section) != len(pred):
                raise ValueError(
                    "{}: {} predictions for {} references".format(source_file, len(section), len(pred))
                )

            # LED models are unable to learn the section header, so add it here.
            add_header = "LED" in source_file
            for combined, division in zip(pred, section):
                combined["pred"] += "\n"
                if add_header:
                    combined["pred"] += sec_map[sec] + "\n"
                combined["pred"] += division["pred"]

        with open(out_dir + "/prediction.json", "w", encoding="utf-8") as f:
            json.dump(pred, f, indent=4)

# evaluate the result

1. Extract the predictions from the baseline/experiment folders and reformat them as CSV files in the predictions folder.
2. Generate the bash script that runs the evaluation.

In [None]:
import json
from glob import glob
import pandas as pd
import os

# Root folders, given relative to the directory the notebook runs in.
DATA_DIR = "../data/"
RESOURCE_DIR = "../resource/"

CHALLENGE_DATA_DIR = f"{DATA_DIR}challenge_data_json/"
SRCEXP_DATA_dir = f"{DATA_DIR}src_experiment_data/"

# Short test-set names are the ones in effect; the long names
# ("clinicalnlp_taskB_test1", "clinicalnlp_taskC_test2", "clef_taskC_test3")
# were assigned first and then immediately overridden.
testsets = ["test{}".format(number) for number in (1, 2, 3)]

# Folder that receives the CSV-formatted predictions.
PRED_DIR = "predictions/"
if not os.path.isdir(PRED_DIR):
    os.mkdir(PRED_DIR)

In [None]:
# Map a key that appears in a prediction file name to the reference CSV the
# prediction has to be scored against.
dataset_map = {}

# Map for the ablation study: one reference per (test set, transcript source).
for testset in testsets:
    for testset2 in ["aci_asrcorr", "aci_asr", "virtscribe_asr", "virtscribe_humantrans"]:
        reference = "{}src_experiment_data/{}_{}.csv".format(DATA_DIR, testset, testset2)
        # Registered with a trailing "_" and a trailing "."; presumably so that
        # "aci_asr" does not also match "aci_asrcorr" file names.
        dataset_map["{}_{}_".format(testset, testset2)] = reference
        dataset_map["{}_{}.".format(testset, testset2)] = reference

# Map for the real study. These keys come after the ablation keys, so the
# more specific ablation keys are tried first in the loop below.
dataset_map.update({
    "test1": "%schallenge_data/clinicalnlp_taskB_test1.csv" % DATA_DIR,
    "taskB": "%schallenge_data/clinicalnlp_taskB_test1.csv" % DATA_DIR,
    "test2": "%schallenge_data/clinicalnlp_taskC_test2.csv" % DATA_DIR,
    "taskC_test2": "%schallenge_data/clinicalnlp_taskC_test2.csv" % DATA_DIR,
    "test3": "%schallenge_data/clef_taskC_test3.csv" % DATA_DIR,
    "taskC_test3": "%schallenge_data/clef_taskC_test3.csv" % DATA_DIR,
})

all_paths = glob("experiments/*")

# A prediction file is evaluated only if its path contains one of these names.
results_to_evaluate = ["full", "division", "12_doctor_turns", "12_speaker_turns",
                       "longest_speaker_turn", "longest_doctor_turn",
                       "spacy_similarity", "UMLS_similarity", "transcript"]

with open("evaluation_script.sh", "w") as f:
    for path in all_paths:
        pred_files = sorted(
            file for file in glob(path + "/*.json")
            if "prediction" in file and "epoch" not in file
        )
        if not pred_files:
            continue
        # When several prediction files exist, use the last one in sorted order.
        file = pred_files[-1]

        # Only paths containing "BART" (case-insensitive) and one of the
        # reported settings are included in the table.
        if "BART" not in file.upper():
            continue
        if not any(r in file for r in results_to_evaluate):
            continue

        for key, reference_csv in dataset_map.items():
            if key not in file:
                continue
            outname = PRED_DIR + "{}.csv".format(path.split("/")[-1])

            # Generate the prediction file: the reference CSV with its "note"
            # column replaced by the model output.
            with open(file) as pred_file:
                pred = json.load(pred_file)
            src_df = pd.read_csv(reference_csv, encoding="utf-8")
            if len(pred) != len(src_df):
                print([file, key, "error"])
                continue

            # Assign the whole column at once; element-wise chained assignment
            # (df[col][i] = ...) is not guaranteed to write into the frame.
            src_df["note"] = [p["pred"] for p in pred]
            if pred:
                # NOTE(review): the original code appended "-<index>" to the
                # "dataset" value of the LAST row only (the statement sat
                # outside the loop). That behaviour is kept here; confirm
                # whether every row was meant to receive the suffix.
                last = len(pred) - 1
                src_df.loc[last, "dataset"] = src_df.loc[last, "dataset"] + "-{}".format(last)

            src_df.to_csv(outname, index=False)

            # The script is run from the parent folder (see the bash command
            # in the next section), so the leading "../" is stripped from the
            # data paths and "baselines/" is prefixed to the prediction path.
            f.write("python evaluation/evaluate_fullnote.py \\\n")
            f.write(reference_csv[3:] + " \\\n")  # ref
            f.write("baselines/" + outname + " \\\n")  # prediction

            meta_file = reference_csv.replace(".csv", "_metadata.csv")
            if os.path.isfile(meta_file):
                f.write(meta_file[3:] + "\n")  # write meta-data
            else:
                print(meta_file)
            f.write("\n\n\n")
            break

# save the results to tables

First, run the generated evaluation script:

```
bash ./baselines/evaluation_script.sh
```

In [None]:
# renaming the files
import os
from glob import glob
# Substitutions applied, in this order, to each result file name. Order
# matters: organizer names are first mapped to the long test-set names, which
# are then shortened to "test1"/"test2"/"test3".
name_pairs = {
    "taskB_organizer": "clinicalnlp_taskB_test1",
    "taskC_organizer": "clinicalnlp_taskC_test2",
    "clef_organizer": "clef_taskC_test3",
    "chatgpt_run2": "ChatGPT_",
    "davinci2_run2": "Text-Davinci-002_",
    "davinci3_run2": "Text-Davinci-003_",
    "gpt4_run2": "GPT-4_",
    "clinicalnlp_taskB_test1": "test1",
    "clinicalnlp_taskC_test2": "test2",
    "clef_taskC_test3": "test3",
    "_1024": "",
    "_gl1024": "",
    # Only has an effect on CSV names, e.g. when this cell is pointed at
    # ../baselines/predictions/*.csv instead of the JSON results.
    ".csv": "",
}


def normalize_result_name(name):
    """Return *name* with every ``name_pairs`` substitution applied in order."""
    for old, new in name_pairs.items():
        name = name.replace(old, new)
    return name


files = glob("../results/*.json")
for file in files:
    # Rewrite the file name only, never the directory part of the path.
    directory, base_name = os.path.split(file)
    new_name = os.path.join(directory, normalize_result_name(base_name))
    if new_name != file:
        os.rename(file, new_name)

In [None]:
from glob import glob
import json
import os
# Snapshot of the available result files; get_model_result searches this list.
files = glob("../results/*.json")


def get_model_result(dataset):
    """Map every table row label to the result JSON of that system.

    Args:
        dataset: test-set name used inside the result file names,
            e.g. ``"test1"``.

    Returns:
        A dict whose keys are the row labels (already formatted for LaTeX)
        and whose values are result file paths. The value is ``""`` when no
        result file was found for a model. The insertion order is the row
        order used by the table cells.
    """
    model_map = {
        # The misspelled label is kept on purpose: the table cells compare
        # against this exact string.
        "longest spearker turn": '../results/longest_speaker_turn_{}.json'.format(dataset),
        "longest doctor turn": '../results/longest_doctor_turn_{}.json'.format(dataset),
        "12 speaker turns": '../results/12_speaker_turns_{}.json'.format(dataset),
        "12 doctor turns": '../results/12_doctor_turns_{}.json'.format(dataset),
        "transcript": "../results/transcript_{}.json".format(dataset),
        "train$_{\\mathrm{UMLS}}$": "../results/UMLS_similarity_{}.json".format(dataset),
        "train$_{\\mathrm{sent}}$": '../results/spacy_similarity_{}.json'.format(dataset),
    }

    # Models that also have a division-based variant.
    finetuned_models = ["BART_large", "bart-large-xsum-samsum", "BioBART", "LED", "LED_pubmed"]
    openai_models = ["Text-Davinci-002", "Text-Davinci-003", "ChatGPT", "GPT-4"]

    for model in finetuned_models + openai_models:
        outfile = '../results/{}_{}_full.json'.format(model, dataset)

        # OpenAI models use a different file name pattern.
        if not os.path.isfile(outfile):
            outfile = '../results/{}{}_.json'.format(dataset, model)
        # Neither exact name is available: fall back to a substring search and
        # take the last match in sorted order.
        if outfile not in files:
            matches = sorted(
                file for file in files if "{}_{}_full".format(model, dataset) in file
            )
            outfile = matches[-1] if matches else ""

        # Format the row label for the LaTeX table.
        key = (
            model.replace("bart-large", "BART")
            .replace("_large", "")
            .replace("-xsum-samsum", "+FT$_{\\mathrm{SAMSum}}$")
            .replace("_pubmed", "+FT$_{\\mathrm{PubMed}}$")
        )
        model_map[key] = outfile

        if model in finetuned_models:
            modelname = f"{model}_{dataset}_division_combined"
            # Sorted like the lookup above so the chosen file does not depend
            # on the order glob happened to return.
            matches = sorted(file for file in files if modelname in file)
            model_map[key + " (Division)"] = matches[-1] if matches else ""
    return model_map


#testsets=["clinicalnlp_taskB_test1","clinicalnlp_taskC_test2","clef_taskC_test3"] #"train","valid",
testsets = ["test1", "test2", "test3"]

headers = ['Model', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'BLEURT', 'Fact', "average"]
score_names = ['rouge1', 'rouge2', 'rougeLsum', 'bertscore-f1', 'bleurt', 'umls']

# Group title written on its own line before the first row of each group.
group_titles = {
    "BART": "BART-based\n",
    "LED": "LED-based\n",
    "longest spearker turn": "Transcript-based\n",
    "train$_{\\mathrm{UMLS}}$": "Retrieval-based\n",
    "Text-Davinci-002": "OpenAI (wo FT)\n",
}

os.makedirs("../tables", exist_ok=True)

# One CSV per test set, holding one sub-table per result type.
for dataset in testsets:
    model_map = get_model_result(dataset)

    # Parse every available result file once instead of once per result type.
    results = {}
    for key, result_path in model_map.items():
        if os.path.isfile(result_path):
            with open(result_path) as result_file:
                results[key] = json.load(result_file)

    with open("../tables/{}.csv".format(dataset), "w") as f:
        for result_type in ["ALL", "division-subjective", "division-objective_exam",
                            "division-objective_results", "division-assessment_and_plan"]:
            f.write("\n\n" + result_type + "\n")
            f.write(",".join(headers) + "\n")
            for key in model_map:
                if key in group_titles:
                    f.write(group_titles[key])
                if key in results:
                    result = results[key][result_type]
                    scores = [float(result[name]) for name in score_names]
                    # Average of four terms: the mean of the three ROUGE
                    # scores, BERTScore, BLEURT and the fact score.
                    scores.append((sum(scores[:3]) / 3 + sum(scores[3:6])) / 4)
                    row = [key] + ["{:.2f}".format(round(score * 100, 2)) for score in scores]
                else:
                    row = [key] + ["NA"] * 7
                f.write(",".join(row) + "\n")

# output to latex table

In [None]:
#full note
# Opening of the full-note LaTeX table, up to and including the header row.
# The text begins and ends with a newline.
Full_note_begin = "\n".join([
    "",
    r"\begin{table}[]",
    r"\centering",
    r"\begin{tabular}{lcccc}",
    r"\hline",
    r"\textbf{Model}" + " " * 27
    + r"& \textbf{ROUGE-1} & \textbf{ROUGE-2} & \textbf{ROUGE-L} & \textbf{Fact} \\ \hline",
    "",
])

# Closing of the full-note LaTeX table. "test set 1" and "test1" are
# placeholders that the table-writing cell replaces per test set.
Full_note_end = "\n".join([
    "",
    r"\hline",
    r"\end{tabular}",
    r"\caption{Results of the summarization models fine-tuned (FT) and evaluated on the ACI demo corpus, test set 1.}",
    r"\label{tab:test1}",
    r"\end{table}",
    "",
])

In [None]:
def write_tex(text,filename):
    """Write *text* to *filename* as UTF-8, replacing any existing content."""
    with open(filename, mode="w", encoding="utf-8") as tex_file:
        tex_file.write(text)

# Write one LaTeX table per test set with the full-note ("ALL") scores.
result_type = "ALL"
metrics = ['rouge1', 'rouge2', 'rougeLsum', 'umls']

# Group header row inserted in front of the first model of each group.
group_headers = {
    "BART": "\\cline{1-1}\n\\textbf{BART-based} & & & &\\\\",
    "LED": "\\cline{1-1}\n\\textbf{LED-based} & & & &\\\\",
    "longest spearker turn": "\\textbf{Transcript-copy-and-paste} & & & &\\\\",
    "train$_{\\mathrm{UMLS}}$": "\\cline{1-1}\n\\textbf{Retrieval-based} & & & &\\\\",
    "Text-Davinci-002": "\\cline{1-1}\n\\textbf{OpenAI (wo FT)} & & & &\\\\",
}

for dataset in testsets:
    filename = f"../tables/{dataset}.tex"
    model_map = get_model_result(dataset)

    # First pass: read the scores and track the best value of every metric.
    max_values = {}
    table_rows = []  # (row label, list of scores or None when no result file)
    for key, result_path in model_map.items():
        scores = None
        if os.path.isfile(result_path):
            with open(result_path) as result_file:
                result = json.load(result_file)[result_type]
            scores = [float(result[metric]) for metric in metrics]
            for metric, score in zip(metrics, scores):
                max_values[metric] = max(max_values.get(metric, score), score)
        table_rows.append((key, scores))

    # Formatted best value per column. Comparing formatted cells keeps the
    # highlighting confined to the right column and also marks ties, and an
    # empty max_values (no result file at all) is handled without an error.
    best_cells = {
        metric: "{:.2f}".format(round(value * 100, 2))
        for metric, value in max_values.items()
    }

    # Second pass: emit the rows, highlighting the best cell(s) per column.
    text = Full_note_begin
    for key, scores in table_rows:
        text += group_headers.get(key, "")
        if scores is None:
            cells = ["NA"] * len(metrics)
        else:
            cells = []
            for metric, score in zip(metrics, scores):
                cell = "{:.2f}".format(round(score * 100, 2))
                if cell == best_cells[metric]:
                    cell = "\\textbf{" + cell + "}"
                cells.append(cell)
        text += "&".join([key] + cells) + "\\\\ \n"

    text += Full_note_end

    # Keep each \cline on the same line as the row that precedes it.
    text = text.replace("\n\\cline{1-1}", "\\cline{1-1}")

    # Adapt the caption and label, which are written for test set 1.
    text = text.replace("test set 1", "test set {}".format(dataset[-1]))
    text = text.replace("test1", dataset)
    write_tex(text, filename)

In [None]:
# Division-based tables.
# Opening of the per-division LaTeX table, up to and including the header row.
# "assessment\_and\_plan" is a placeholder for the division name.
division_begin = "\n".join([
    "",
    r"\begin{table}[]",
    r"\centering",
    r"\begin{tabular}{lccccccc}",
    r"\hline",
    r"\textbf{} & \multicolumn{7}{c}{\textbf{Evaluation score on the assessment\_and\_plan division}} \\ \cline{2-8}" + " ",
    r"\textbf{Model} & \textbf{ROUGE-1} & \textbf{ROUGE-2} & \textbf{ROUGE-L} & \textbf{BERTScore} & \textbf{BLEURT} & \textbf{Fact} & \textbf{Average} \\ \hline",
    "",
])

# Closing of the per-division LaTeX table; the caption and label carry the
# "test set 1" / "test1" / division-name placeholders.
division_end = "\n".join([
    "",
    r"\hline",
    r"\end{tabular}",
    r"\caption{Results of the summarization models on the assessment\_and\_plan division, test set 1.}",
    r"\label{tab:test1_assessment_and_plan}",
    r"\end{table}",
    "",
])

In [None]:
# Write one LaTeX table per (division, test set) pair.
metrics = ['rouge1', 'rouge2', 'rougeLsum', 'bertscore-f1', 'bleurt', 'umls']
columns = metrics + ["Average"]

# Seven empty cells that complete a group header row of the 8-column table.
empty_cells = "&" + "  &" * 6 + "\\\\ \n"

# Group header row inserted in front of the first model of each group.
group_headers = {
    "BART": "\\cline{1-1}\n\\textbf{BART-based }" + empty_cells,
    "LED": "\\cline{1-1}\n\\textbf{LED-based }" + empty_cells,
    "train$_{\\mathrm{UMLS}}$": "\\textbf{Retrieval-based} " + empty_cells,
    "Text-Davinci-002": "\\cline{1-1}\n\\textbf{OpenAI (wo FT)} " + empty_cells,
}

for result_type in ["subjective", "objective_exam", "objective_results", "assessment_and_plan"]:
    for dataset in testsets:
        filename = f"../tables/{dataset}_{result_type}.tex"
        model_map = get_model_result(dataset)

        # The first five entries of model_map are the transcript-based rows;
        # they are left out of the per-division tables.
        keys = list(model_map.keys())[5:]

        # First pass: read the scores and track the best value per column.
        max_values = {}
        table_rows = []  # (row label, list of scores or None when no result file)
        for key in keys:
            scores = None
            if os.path.isfile(model_map[key]):
                with open(model_map[key]) as result_file:
                    result = json.load(result_file)["division-" + result_type]
                scores = [float(result[metric]) for metric in metrics]
                # Average of four terms: the mean of the three ROUGE scores,
                # BERTScore, BLEURT and the fact score.
                scores.append((sum(scores[:3]) / 3 + sum(scores[3:6])) / 4)
                for column, score in zip(columns, scores):
                    max_values[column] = max(max_values.get(column, score), score)
            table_rows.append((key, scores))

        # Formatted best value per column. Comparing formatted cells keeps the
        # highlighting confined to the right column and also marks ties, and
        # an empty max_values (no result file) is handled without an error.
        best_cells = {
            column: "{:.2f}".format(round(value * 100, 2))
            for column, value in max_values.items()
        }

        # Second pass: emit the rows, highlighting the best cell(s) per column.
        text = division_begin
        for key, scores in table_rows:
            text += group_headers.get(key, "")
            if scores is None:
                cells = ["NA"] * len(columns)
            else:
                cells = []
                for column, score in zip(columns, scores):
                    cell = "{:.2f}".format(round(score * 100, 2))
                    if cell == best_cells[column]:
                        cell = "\\textbf{" + cell + "}"
                    cells.append(cell)
            text += "&".join([key] + cells) + "\\\\ \n"

        text += division_end

        # Keep each \cline on the same line as the row that precedes it.
        text = text.replace("\n\\cline{1-1}", "\\cline{1-1}")

        # Adapt caption, label and title, which are written for the
        # assessment_and_plan division of test set 1.
        text = text.replace("test set 1", "test set {}".format(dataset[-1]))
        text = text.replace("test1", dataset)
        text = text.replace("assessment_and_plan", result_type)
        text = text.replace("assessment\\_and\\_plan", result_type.replace("_", "\\_"))
        write_tex(text, filename)

# read prediction example.

In [None]:
import json

# NOTE(review): absolute, machine-specific paths; adjust them before running
# this cell on another machine.
file1="/home/velvinfu/code/aci-demo-benchmark-private-main/baselines/experiments/2_bart-large-xsum-samsum_clinicalnlp_taskC_test2_full/prediction_clinicalnlp_taskC_test2_full.json"
file2="/home/velvinfu/code/clef2023-internal/predictions/bart-large-xsum-samsum_test2_full_ori.json"
file3="/home/velvinfu/code/aci-demo-benchmark-private-main/baselines/experiments/bart-large-xsum-samsum_clinicalnlp_taskC_test2_full/prediction_clinicalnlp_taskC_test2_full.json"


def first_prediction(path):
    """Return the "pred" field of the first entry of the JSON list in *path*."""
    # The context manager closes the file once it has been parsed.
    with open(path) as prediction_file:
        return json.load(prediction_file)[0]["pred"]


result1 = first_prediction(file1)
result2 = first_prediction(file2)
result3 = first_prediction(file3)

In [None]:
# Check whether the first predictions loaded from file1 and file2 are identical.
result1==result2

In [None]:
# Display the first prediction loaded from file1.
result1

In [None]:
# Display the first prediction loaded from file2.
result2

In [None]:
# Display the first prediction loaded from file3.
result3