In [167]:
import os
import sys
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [168]:
# Directory that is scanned for result CSV files by the loading cell.
results_dir = "./results"

In [169]:
# Build two tables from the files found in `results_dir`:
#   * results_files_df   - one row per base results file, holding the run
#                          metadata parsed from its file name;
#   * overall_results_df - every `<base>_{strict,lenient}_overall.csv` file
#                          stacked together and tagged with that metadata
#                          (None when no base results file was found).
results_files = list()
overall_frames = list()
results_files_columns = ['method', 'model', 'system', 'prompt', 'temperature', 'run', 'dataset']

for results_file in sorted(os.listdir(results_dir)):
    # Ignore anything that is not a CSV (e.g. checkpoint folders, OS metadata);
    # the name parsing below is only meaningful for `.csv` result files.
    if not results_file.endswith('.csv'):
        continue
    # The overall/granular summaries are derived files; the overall ones are
    # loaded further down through the name of their base results file.
    if results_file.endswith('overall.csv') or results_file.endswith('granular.csv'):
        continue

    # Strip only the trailing extension so that a '.csv' occurring elsewhere
    # in the name cannot truncate or corrupt the parsed parts.
    results_file_stem = results_file[:-len('.csv')]
    results_file_parts = results_file_stem.split('_')

    # Fields that a given file-name layout does not provide stay 'NA'.
    system, prompt, temperature, run = ['NA']*4

    if results_file_parts[0] == 'code-llama-34b':
        # code-llama-34b_<system>_<prompt>_<temperature>_<dataset>_<run>
        method = 'LLM'
        model = results_file_parts[0]
        system, prompt, temperature = results_file_parts[1:4]
        dataset = results_file_parts[4]
        run = results_file_parts[5]
    elif results_file_parts[0] == 'OpenAI':
        # OpenAI_<model>_<system>_<prompt>_<temperature>_<dataset>_<run>
        method = 'LLM'
        model = results_file_parts[1]
        system, prompt, temperature = results_file_parts[2:5]
        dataset = results_file_parts[5]
        run = results_file_parts[6]
    elif results_file_parts[0] == 'exact':
        # exact_<dataset>
        method = results_file_parts[0]
        model = 'exact'
        dataset = results_file_parts[1]
    elif results_file_parts[0] == 'deepcadrme':
        # deepcadrme_<system>_<dataset>
        method = 'BERT'
        model = 'deepcadrme'
        system = results_file_parts[1]
        dataset = results_file_parts[2]
    else:
        # ValueError is still caught by any handler that catches Exception.
        raise ValueError(
            f"Unrecognized results file prefix {results_file_parts[0]!r} in {results_file!r}"
        )

    results_files.append([method, model, system, prompt, temperature, run, dataset])

    for evaltype in ('strict', 'lenient'):
        overall_results_file = f"{results_file_stem}_{evaltype}_overall.csv"
        res = pd.read_csv(os.path.join(results_dir, overall_results_file), index_col=0)
        # Each insert goes to position 0, so the metadata columns end up in
        # reverse order of `results_files_columns`, with 'evaltype' first.
        for name, colval in zip(results_files_columns, results_files[-1]):
            res.insert(0, name, colval)
        res.insert(0, 'evaltype', evaltype)

        overall_frames.append(res)

# Concatenate once at the end instead of re-copying the accumulated frame on
# every iteration; keep None when nothing was loaded.
overall_results_df = pd.concat(overall_frames) if overall_frames else None

results_files_df = pd.DataFrame(results_files, columns=results_files_columns)

In [170]:
# Count the parsed result files for each combination of the grouping keys.
(
    results_files_df
    .groupby(by=['dataset', 'method', 'model', 'system', 'prompt', 'temperature'])
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,run
dataset,method,model,system,prompt,temperature,Unnamed: 6_level_1
test,BERT,deepcadrme,005,,,1
test,BERT,deepcadrme,010,,,1
test,BERT,deepcadrme,015,,,1
test,BERT,deepcadrme,025,,,1
test,BERT,deepcadrme,050,,,1
test,BERT,deepcadrme,075,,,1
test,BERT,deepcadrme,100,,,1
test,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,pharmexpert-v1,temp0,3
test,LLM,gpt-4-1106-preview,fatal-prompt-v2,pharmexpert-v1,temp0,2
test,LLM,gpt-4-1106-preview,fatal-prompt-v3,pharmexpert-v1,temp0,1


In [198]:
# Summary statistics (count / mean / min / max) of the `macro_<metric>` column
# for the rows selected by `querystr`, grouped by `groupbycols`.
metric = 'f1'
querystr = "evaltype=='lenient' & dataset=='train' & section == 'adverse reactions' & ade_type=='all'"
# Alternative, finer-grained grouping:
#   ['dataset', 'ade_type', 'section', 'method', 'model', 'system', 'prompt']
groupbycols = ['dataset', 'ade_type', 'section', 'prompt', 'method']
(
    overall_results_df
    .query(querystr)
    .groupby(groupbycols)[f"macro_{metric}"]
    .agg(['count', 'mean', 'min', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean,min,max
dataset,ade_type,section,prompt,method,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
train,all,adverse reactions,,exact,1,0.802733,0.802733,0.802733
train,all,adverse reactions,no-system-prompt,LLM,6,0.818003,0.728818,0.906223
train,all,adverse reactions,pharmexpert-v0,LLM,7,0.800015,0.522793,0.906695
train,all,adverse reactions,pharmexpert-v1,LLM,11,0.835717,0.518157,0.909465


In [199]:
# Summary statistics of the `micro_<metric>` column; relies on `querystr`,
# `groupbycols` and `metric` already being defined in the kernel.
(
    overall_results_df
    .query(querystr)
    .groupby(groupbycols)[f"micro_{metric}"]
    .agg(['count', 'mean', 'min', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean,min,max
dataset,ade_type,section,prompt,method,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
train,all,adverse reactions,,exact,1,0.835516,0.835516,0.835516
train,all,adverse reactions,no-system-prompt,LLM,6,0.803102,0.703835,0.894816
train,all,adverse reactions,pharmexpert-v0,LLM,7,0.775095,0.420294,0.897992
train,all,adverse reactions,pharmexpert-v1,LLM,11,0.818136,0.429824,0.90126
