# Post-process

This code creates the evaluation bash script that calls the evaluation for each test file.

In [37]:
# remove all the model logs to save space
from glob import glob
import os
import shutil

# Pass 1: inside every experiment folder, drop the "runs" log directory and every
# other sub-directory whose path does not mention "best_model".
for path in glob("experiments/*"):
    runs_dir = f"{path}/runs"
    if os.path.isdir(runs_dir):
        shutil.rmtree(runs_dir)

    stale_dirs = [
        candidate
        for candidate in glob(f"{path}/*")
        if os.path.isdir(candidate) and "best_model" not in candidate
    ]
    for subpath in stale_dirs:
        shutil.rmtree(subpath)

# Pass 2: delete leftover *.bin files whose path does not mention "best_model".
# glob() is called without recursive=True, so "**" behaves like a single "*"
# and only files exactly one folder below experiments/ are matched.
for file in glob("experiments/**/*.bin"):
    if "best_model" in file:
        continue
    os.remove(file)

In [38]:
# combine predictions from division-based approaches
import json

# Section name -> heading string. Nothing in this cell reads it; it is kept
# because other cells may rely on the name being defined.
sec_map={
    "subjective": "chief complaint :", 
    "objective_exam":"physical examination :", 
    "objective_results": "results :",
    "assessment_and_plan": "ASSESSMENT AND PLAN"    
}

# Order in which the per-section predictions are concatenated.
division_types=["subjective","objective_exam", "objective_results","assessment_and_plan"]


def _read_json(json_path):
    """Parse the UTF-8 JSON file at ``json_path`` and close the handle afterwards."""
    with open(json_path,encoding="utf-8") as handle:
        return json.load(handle)


for fine_tune in ["","finetune3_"]:
    for trainset in ["virtscribe_asr","aci_asrcorr"]:
        for testset in ["test1","test2","test3"]:
            for testset2 in ["aci_asr","aci_asrcorr","virtscribe_asr","virtscribe_humantrans"]:
                model="bart-large-xsum-samsum"

                out_dir=f"experiments/{model}_{testset}_{testset2}_division_combined_train_{fine_tune}{trainset}"
                # makedirs also creates a missing "experiments" parent and is a
                # no-op when the folder is already there.
                os.makedirs(out_dir,exist_ok=True)

                pred_dir=f"../data/scr_experiment_data_json/{testset}_{testset2}.json"
                old_pred=_read_json(pred_dir)['data']

                # One record per test example; "pred" starts empty and is
                # extended with each section's prediction below.
                pred=[
                    {"source":p["src"],"true":p["tgt"],"pred":""}
                    for p in old_pred
                ]

                for sec in division_types: 
                    #For bart-based models
                    #ablation_bart-large-xsum-samsum_${testset}_${testset2}_${section}_train_${trainset}
                    source_dir=glob(f"experiments/ablation_{model}_{testset}_{testset2}_{sec}_train_{fine_tune}{trainset}/prediction_{testset}_{testset2}_{sec}.json")
                    # Exactly one prediction file must exist for this section.
                    assert len(source_dir)==1,source_dir
                    source_dir=source_dir[-1]
                    section=_read_json(source_dir)
                    assert len(section)==len(pred), [len(section),len(pred)]
                    # Every section is prefixed with a newline, so the combined
                    # note also starts with one.
                    for entry,section_entry in zip(pred,section):
                        entry["pred"]+="\n"+section_entry["pred"]

                with open(out_dir+"/prediction.json","w",encoding="utf-8") as f :
                    json.dump(pred,f,indent=4)

# evaluate the result

1. Extract the predictions from each experiment folder and rewrite them as CSV files in the `predictions/` folder.
2. Generate the bash script that runs the evaluation.

In [39]:
import json
from glob import glob
import pandas as pd
import os

# Input locations, relative to the notebook's working directory.
DATA_DIR = '../data/'
RESOURCE_DIR = '../resource/'

CHALLENGE_DATA_DIR = f"{DATA_DIR}challenge_data_json/"
SRCEXP_DATA_dir = f"{DATA_DIR}scr_experiment_data/"

# Short test-set names. An earlier assignment used the long challenge names
# "clinicalnlp_taskB_test1", "clinicalnlp_taskC_test2" and "clef_taskC_test3";
# it was overwritten immediately, so only the short names are ever in effect.
testsets = ["test1", "test2", "test3"]

# Output folders, created next to the notebook when they do not exist yet.
PRED_DIR = "predictions/"
RESULT_DIR = "results/"
for _output_folder in (PRED_DIR, RESULT_DIR):
    if not os.path.isdir(_output_folder):
        os.mkdir(_output_folder)

In [40]:
# Define the reference CSV for each test split that goes into the table.
dataset_map={}

# Map for the ablation study: the key is the "<testset>_<testset2>" marker as it
# appears inside an experiment path, the value is the reference CSV to score against.
for testset in testsets:
    for testset2 in [ "aci_asrcorr", "aci_asr", "virtscribe_asr", "virtscribe_humantrans" ]:
        #{model}_{testset}_{testset2}_division_combined_train_{trainset}/prediction.json
        reference_csv="{}scr_experiment_data/{}_{}.csv".format(DATA_DIR,testset,testset2)
        # The marker is registered twice, once followed by "_" and once by ".".
        # The trailing character keeps "aci_asr" from also matching "aci_asrcorr".
        dataset_map["{}_{}_".format(testset,testset2)]=reference_csv
        dataset_map["{}_{}.".format(testset,testset2)]=reference_csv


# The original filter tested `"ablation" in path`, where `path` was a loop
# variable leaked from an earlier cell, so it kept either every folder or none.
# The test now runs on each candidate folder itself.
# NOTE(review): "division_combined" is accepted next to "ablation" because the
# combined folders written by the combining cell carry no "ablation" marker,
# while the table cell reads results named after them. Confirm this selection.
experiment_markers=("ablation","division_combined")
all_paths=[
    p for p in glob("experiments/*")
    if any(marker in p for marker in experiment_markers)
]

results_to_evaluate=["full","division"]

with open("ablation_evaluation_script.sh","w") as f:
    for path in all_paths:
        pred_files=sorted(
            file for file in glob(path+"/*.json")
            if "prediction" in file and "epoch" not in file
        )
        if not pred_files:
            continue
        # Only the alphabetically last prediction file of a folder is used.
        file=pred_files[-1]

        # Skip results that will not be included in the table.
        if not any(r in file for r in results_to_evaluate):
            continue

        for key in dataset_map:
            if key not in file:
                continue

            outname="predictions/{}.csv".format(path.split("/")[-1])

            # Generate the prediction file: the reference CSV with its "note"
            # column replaced by the model output.
            with open(file,encoding="utf-8") as pred_handle:
                pred=json.load(pred_handle)
            src_df=pd.read_csv(dataset_map[key],encoding="utf-8")
            if len(pred)!=len(src_df):
                print([file,key,"error"])
                continue

            # Whole-column / .loc assignment instead of chained indexing
            # (src_df['note'][i] = ...), which pandas does not guarantee to
            # write through.
            src_df['note']=[p['pred'] for p in pred]
            if pred:
                # NOTE(review): as in the original, only the LAST row gets the
                # "-<index>" suffix, because the statement sat outside the
                # per-row loop. Confirm whether every row was meant to be tagged.
                last_row=len(pred)-1
                src_df.loc[last_row,'dataset']=src_df.loc[last_row,'dataset']+"-{}".format(last_row)

            src_df.to_csv(outname,index=False)

            # Validate both inputs before anything is written, so a failure
            # cannot leave half a command in the script.
            assert os.path.isfile(dataset_map[key]), dataset_map[key]
            assert  os.path.isfile(outname),outname

            # [3:] strips the leading "../": the script is written with paths
            # one directory level above this notebook, hence also "baselines/".
            f.write("python evaluation/evaluate_fullnote.py \\\n")
            f.write(dataset_map[key][3:]+" \\\n") # ref
            f.write("baselines/"+outname+" \\\n") # prediction

            meta_file=dataset_map[key].replace(".csv","_metadata.csv")
            if os.path.isfile(meta_file):
                f.write(meta_file[3:]+"\n") #write meta-data
            else:
                # No metadata next to the reference: report the missing path.
                print(meta_file)
            f.write("\n\n\n")
            # The first matching key that yields a valid prediction file wins.
            break

# save the results to tables

First, run the generated evaluation script from the parent directory of `baselines/`:

```
bash ./baselines/ablation_evaluation_script.sh
```

# output to latex table

In [31]:
#ASR and human

# LaTeX scaffolding for the ablation tables. Both templates are raw strings, so
# each backslash below is exactly the backslash that ends up in the .tex file.
# CATEGORY, LABEL1, LABEL2, TRAIN_LABEL and the bare "tab:ablation_" label are
# placeholder tokens replaced with str.replace when a table is built.

# Table opening plus the seven-column header row.
abalation_begin=r"""
\begin{table}[]
\centering
\begin{tabular}{ccccccc}
\hline
\textbf{\begin{tabular}[c]{@{}c@{}}Test\\ set\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}Bart\\ Fine-tuning\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}Test\\ Split\end{tabular}} & \textbf{ROUGE-1} & \textbf{ROUGE-2} & \textbf{ROUGE-L} & \textbf{Fact} \\ \hline
"""

# Table closing: caption, label and \end{table}.
abalation_end=r"""
\end{tabular}
\caption{Model performance on different test sets splits, comparison between \textit{CATEGORY} dialogues with LABEL1 and LABEL2 transcript. The model finetuned on the train set is the BART+FT$_{\mathrm{SAMSum}}$ (Division) fine-tuned with 10 epochs on the original train set, as in the baseline methods. The train + train$_{\mathrm{TRAIN_LABEL}}$ model refers to the BART+FT$_{\mathrm{SAMSum}}$ (Division) finetuned for 3 more epochs on the \textit{CATEGORY} with TRAIN_LABEL split of the train set.}
\label{tab:ablation_}
\end{table}
"""

In [62]:
def write_tex(text,filename):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content.

    Args:
        text: The string to store (here: the LaTeX source of one table).
        filename: Path of the file to create or overwrite.

    Returns:
        None.
    """
    with open(filename,mode="w",encoding="utf-8") as tex_file:
        tex_file.write(text)

# Key of the results JSON that is reported, and the metrics shown per row.
result_type="ALL"
metrics=['rouge1', 'rouge2', 'rougeLsum', 'umls']
testsets=["test1","test2","test3"]
# Each trainsets entry pairs "" (no extra fine-tuning) with the split used for
# the additional "finetune3_" run. It is zipped with the two test splits of
# the same category, so one table is produced per category.
trainsets=[["","virtscribe_asr"],["","aci_asrcorr"]]
testsets2=[["virtscribe_asr","virtscribe_humantrans"],["aci_asr","aci_asrcorr"]]
model="bart-large-xsum-samsum"
for trainset, testset2 in zip(trainsets,testsets2):
    
    # Second half of the split name; plain "asr" is shown upper-cased.
    finetune_type=trainset[1].split("_")[1]
    finetune_type=finetune_type.upper() if finetune_type=="asr" else finetune_type

    # Short labels for the two test splits, e.g. "humantrans" -> "human".
    labels=[t.split("_")[1].replace("trans","") for t in testset2]
    
    # The file name uses the lower-case labels, the table body the display form.
    outname=f"../tables/ablation_{labels[0]}_vs_{labels[1]}.tex"
    
    labels=[l.upper() if l=="asr" else l for l in labels ]
    
    text=abalation_begin
    # Best value seen per "<metric><testset>", used for the bold highlighting.
    max_values={}
    for testset in testsets:

        # One \multirow cell labelled with the test-set number spans the
        # four rows (2 test splits x 2 models) of this test set.
        text+="\\multirow{4}{*}{"+str(testset[-1])+"}"
        
        for test_type,label in zip(testset2,labels):
            for train_type in trainset:
                if not train_type:
                    filename=f"../results/{model}_{testset}_{test_type}_division_combined_train_{trainset[-1]}.json"
                    text+="&train" 
                else:
                    filename=f"../results/{model}_{testset}_{test_type}_division_combined_train_finetune3_{trainset[-1]}.json"
                    text+="&+train$_{\\mathrm{"+finetune_type+"}}$"
                text+=f"&{label}&"

                if os.path.isfile(filename):
                    # Context manager so the result file is closed right away.
                    with open(filename,encoding="utf-8") as result_file:
                        result=json.load(result_file)[result_type]
                    row=[]
                    for metric in metrics:
                        row.append(float(result[metric]))
                        max_values[metric+testset]=max(max_values.get(metric+testset,row[-1]),row[-1])
                    row=["{:.2f}".format(r*100) for r in row]
                else:
                    # Missing result file: keep the row, filled with placeholders.
                    row=["NA"]*4
                text+="&".join(row)+"\\\\ \n"

        text+="\\cline{1-3}" if testset!=testsets[-1] else "\\hline"
    text+=abalation_end

    # Highlight the max values. A number is only put in bold when its formatted
    # form occurs exactly once in the whole table; otherwise the ambiguous
    # case is printed for manual inspection.
    for metric in max_values:
            number="{:.2f}".format(round(max_values[metric]*100,2))
            if text.count(number)==1:
                text=text.replace(number,"\\textbf{"+number+"}")
            else:
                print([text.count(number),metric])

    # Fill in the caption/label placeholders of the closing template.
    text=text.replace("CATEGORY",trainset[1].split("_")[0])
    text=text.replace("LABEL1",labels[0])
    text=text.replace("LABEL2",labels[1])
    text=text.replace("TRAIN_LABEL",finetune_type)
    text=text.replace("asrcorr","ASRcorr")
    text=text.replace("tab:ablation_",f"tab:ablation_{labels[0]}_{labels[1]}".lower())

    # Make sure the target folder exists so the finished table is not lost to
    # a FileNotFoundError at the very last step.
    os.makedirs(os.path.dirname(outname),exist_ok=True)
    write_tex(text,outname)

In [57]:
max_values

{'rouge1test3': 0.5312651200192207,
 'rouge2test3': 0.23528875463861948,
 'rougeLsumtest3': 0.4772540734469096,
 'umlstest3': 0.46379720063786994}

In [48]:
filename

'results/bart-large-xsum-samsum_test3_aci_asrcorr_division_combined_train_finetune3_aci_asrcorr.json'

# read prediction example.

In [69]:
import json

# NOTE(review): absolute, machine-specific paths; this cell only runs on the
# machine that has these files. Consider making them relative to a configurable
# base directory.
file1="/home/velvinfu/code/aci-demo-benchmark-private-main/baselines/experiments/2_bart-large-xsum-samsum_clinicalnlp_taskC_test2_full/prediction_clinicalnlp_taskC_test2_full.json"
file2="/home/velvinfu/code/clef2023-internal/predictions/bart-large-xsum-samsum_test2_full_ori.json"
file3="/home/velvinfu/code/aci-demo-benchmark-private-main/baselines/experiments/bart-large-xsum-samsum_clinicalnlp_taskC_test2_full/prediction_clinicalnlp_taskC_test2_full.json"


def _first_prediction(json_path):
    """Return the "pred" text of the first record in a prediction JSON file.

    The file is opened as UTF-8 and closed before returning.
    """
    with open(json_path,encoding="utf-8") as handle:
        return json.load(handle)[0]["pred"]


result1=_first_prediction(file1)
result2=_first_prediction(file2)
result3=_first_prediction(file3)

In [65]:
result1==result2

False

In [66]:
result1

'CHIEF COMPLAINT\n\nOsteoarthritis follow-up.\n\nHISTORY OF PRESENT ILLNESS\n\nThe patient is a 49-year-old female who presents for follow up of her chronic problems. She is a right-handed female who reports she has been experiencing pain in her right elbow and right hand with typing. She has a history of gout and psoriasis. The last episode of her last episode was about 3 months ago. She denies any other issues. She reports pain with flexion and extension of the right arm and pain with pronation and supination. She also reports numbness and tingling in her hands when she is typing for long periods of time. She states she tries to shake her arms a little bit to help relieve the pain. The patient has had edema and inflammation of her right olecranon bursa and there is some tenderness and an effusion right there. When she turns her arm, that hurts a bit too. She experiences pain when she flexes and straightens her arm. Her right toe has some inflammation of the toe, but she has not seen 

In [67]:
result2

'CHIEF COMPLAINT\n\nOsteoarthritis follow-up.\n\nHISTORY OF PRESENT ILLNESS\n\nCarolyn is a 49-year-old female who is here for follow up of her chronic problems. She is a right-handed female who has a history of gout and psoriasis. The last episode of her last episode was about 3 months ago. She reports pain with flexion and extension of the right arm and pain with pronation and supination. She has pain with palpation of the olecranon bursa. She experiences numbness and tingling in her hands when she is typing for long periods of time. She tries to shake her arms a little bit to relieve the pain. The right elbow does bother her more than the other, but she is primarily typing all day versus writing. She denies any other joint pain. She had some inflammation of her right toe, but the medication she was given for that has controlled it and she has not seen a flare-up since the last episode. She also had an autoimmune response to clobetasol for her scalp, which has been doing well.\n'

In [70]:
result3

'CHIEF COMPLAINT\n\nThe patient is a 49-year-old female who is here for follow-up of her chronic problems.\n\nOsteoarthritis has been flaring up a little bit lately. The patient reports pain with extension of the right arm and pain with pronation and supination. She also reports pain-to- palpation of the olecranon bursa. She has had a history of gout and has not seen a flare-up since the last episode 3 months ago. She reports that the medication she was given for gout has been effective in controlling her symptoms. She states that her psoriasis has been under control for the last 3 months. She does not appreciate any cervical lymphadenopathy. Her heart rate is a nice regular rate and rhythm, and her lungs sound clear. She experiences numbing or tingling in her hands when typing for long periods of time.'