In [167]:
import os
import sys
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [168]:
# Directory that is listed for result CSV files by the loading cell below
# (relative to the notebook's working directory).
results_dir = './results'

In [169]:
# Fields describing one results file; also the column order of `results_files_df`.
results_files_columns = ['method', 'model', 'system', 'prompt', 'temperature', 'run', 'dataset']


def _parse_results_filename(results_file):
    """Split a results file name into its run-configuration fields.

    The name, with the trailing ``.csv`` removed, is split on ``_``; the first
    token selects how the remaining tokens are interpreted. Fields that a
    given layout does not provide are set to the string ``'NA'``.

    Parameters
    ----------
    results_file : str
        File name ending in ``.csv``.

    Returns
    -------
    list of str
        Values in the order of ``results_files_columns``.

    Raises
    ------
    ValueError
        If the first token is not a known prefix, or the name has too few
        tokens for an LLM layout.
    IndexError
        If the name has too few tokens for the ``exact``/``deepcadrme`` layouts.
    """
    parts = results_file[:-len('.csv')].split('_')
    prefix = parts[0]

    system, prompt, temperature, run = ['NA'] * 4

    if prefix == 'code-llama-34b':
        # <model>_<system>_<prompt>_<temperature>_<dataset>_<run>
        method, model = 'LLM', prefix
        system, prompt, temperature, dataset, run = parts[1:6]
    elif prefix == 'OpenAI':
        # OpenAI_<model>_<system>_<prompt>_<temperature>_<dataset>_<run>
        method, model = 'LLM', parts[1]
        system, prompt, temperature, dataset, run = parts[2:7]
    elif prefix == 'exact':
        # exact_<dataset>
        method, model = prefix, 'exact'
        dataset = parts[1]
    elif prefix == 'deepcadrme':
        # deepcadrme_<system>_<dataset>
        method, model = 'BERT', 'deepcadrme'
        system = parts[1]
        dataset = parts[2]
    else:
        raise ValueError(
            f"Unrecognized results file prefix {prefix!r} in {results_file!r}"
        )

    return [method, model, system, prompt, temperature, run, dataset]


results_files = []
overall_frames = []

for results_file in sorted(os.listdir(results_dir)):
    # Ignore anything that is not a CSV (checkpoint folders, OS metadata, ...).
    if not results_file.endswith('.csv'):
        continue
    # The per-evaltype summaries are read below via their base results file.
    if results_file.endswith(('overall.csv', 'granular.csv')):
        continue

    results_files.append(_parse_results_filename(results_file))

    stem = results_file[:-len('.csv')]
    for evaltype in ('strict', 'lenient'):
        overall_results_file = f"{stem}_{evaltype}_overall.csv"
        res = pd.read_csv(os.path.join(results_dir, overall_results_file), index_col=0)
        # Each insert goes to position 0, so the metadata columns end up in
        # reverse order of `results_files_columns`, preceded by 'evaltype'.
        for name, colval in zip(results_files_columns, results_files[-1]):
            res.insert(0, name, colval)
        res.insert(0, 'evaltype', evaltype)
        overall_frames.append(res)

# Concatenate once instead of growing the frame inside the loop.
# Stays None when no results files were found.
overall_results_df = pd.concat(overall_frames) if overall_frames else None

results_files_df = pd.DataFrame(results_files, columns=results_files_columns)

In [170]:
# Count the parsed results files for each configuration; 'run' is the only
# column left outside the grouping keys, so it carries the count.
results_files_df.groupby(
    by=['dataset', 'method', 'model', 'system', 'prompt', 'temperature'],
).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,run
dataset,method,model,system,prompt,temperature,Unnamed: 6_level_1
test,BERT,deepcadrme,005,,,1
test,BERT,deepcadrme,010,,,1
test,BERT,deepcadrme,015,,,1
test,BERT,deepcadrme,025,,,1
test,BERT,deepcadrme,050,,,1
test,BERT,deepcadrme,075,,,1
test,BERT,deepcadrme,100,,,1
test,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,pharmexpert-v1,temp0,3
test,LLM,gpt-4-1106-preview,fatal-prompt-v2,pharmexpert-v1,temp0,2
test,LLM,gpt-4-1106-preview,fatal-prompt-v3,pharmexpert-v1,temp0,1


In [246]:
# Default view: group by every configuration field and keep only the rows
# whose ade_type is 'all'.
groupbycols = [
    'dataset', 'ade_type', 'section',
    'method', 'model', 'system', 'prompt',
]
additional_querystr = " & ade_type=='all'"

# Alternative view: evaluate the prompt.
# (Earlier finding: pharmexpert-v1 is working a bit better than the others.)
# groupbycols = ['dataset', 'ade_type', 'section', 'prompt', 'method']
# additional_querystr = " & model != 'code-llama-34b' & method == 'LLM'"

# Alternative view: compare ade_types.
# NOTE(review): 'ade_type' is listed twice below -- confirm that is intended.
# groupbycols = ['ade_type', 'dataset', 'ade_type', 'section', 'method', 'model', 'system', 'prompt']
# additional_querystr = ""

metric = 'recall'
querystr = (
    "evaltype=='lenient' & dataset=='test' & section == 'adverse reactions'"
    + additional_querystr
)

# Summary statistics of the macro-averaged metric for each group.
(
    overall_results_df
    .query(querystr)
    .groupby(groupbycols)[f"macro_{metric}"]
    .agg(['count', 'mean', 'min', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,count,mean,min,max
dataset,ade_type,section,method,model,system,prompt,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
test,all,adverse reactions,BERT,deepcadrme,005,,1,0.962558,0.962558,0.962558
test,all,adverse reactions,BERT,deepcadrme,010,,1,0.97102,0.97102,0.97102
test,all,adverse reactions,BERT,deepcadrme,015,,1,0.962563,0.962563,0.962563
test,all,adverse reactions,BERT,deepcadrme,025,,1,0.962357,0.962357,0.962357
test,all,adverse reactions,BERT,deepcadrme,050,,1,0.96331,0.96331,0.96331
test,all,adverse reactions,BERT,deepcadrme,075,,1,0.965114,0.965114,0.965114
test,all,adverse reactions,BERT,deepcadrme,100,,1,0.96564,0.96564,0.96564
test,all,adverse reactions,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,pharmexpert-v1,3,0.763918,0.7548,0.774791
test,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v2,pharmexpert-v1,2,0.85081,0.848175,0.853445
test,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v3,pharmexpert-v1,1,0.854655,0.854655,0.854655


In [247]:
# Same filter and grouping as the previous cell, applied to the
# micro-averaged column of the selected metric.
(
    overall_results_df
    .query(querystr)
    .groupby(groupbycols)[f"micro_{metric}"]
    .agg(['count', 'mean', 'min', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,count,mean,min,max
dataset,ade_type,section,method,model,system,prompt,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
test,all,adverse reactions,BERT,deepcadrme,005,,1,0.960137,0.960137,0.960137
test,all,adverse reactions,BERT,deepcadrme,010,,1,0.966781,0.966781,0.966781
test,all,adverse reactions,BERT,deepcadrme,015,,1,0.960994,0.960994,0.960994
test,all,adverse reactions,BERT,deepcadrme,025,,1,0.961852,0.961852,0.961852
test,all,adverse reactions,BERT,deepcadrme,050,,1,0.96078,0.96078,0.96078
test,all,adverse reactions,BERT,deepcadrme,075,,1,0.962495,0.962495,0.962495
test,all,adverse reactions,BERT,deepcadrme,100,,1,0.962709,0.962709,0.962709
test,all,adverse reactions,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,pharmexpert-v1,3,0.739534,0.73682,0.743463
test,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v2,pharmexpert-v1,2,0.841299,0.837334,0.845264
test,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v3,pharmexpert-v1,1,0.843978,0.843978,0.843978
