In [269]:
import os
import sys
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [270]:
# Directory scanned (via os.listdir) for evaluation result CSV files.
# The path is relative, so it is resolved against the kernel's working directory.
results_dir = './results/evals'

In [271]:
# Scan `results_dir`, parse the run configuration encoded in each results file
# name, and load the matching "<name>_{strict,lenient}_overall.csv" summaries.
#
# Leading '_'-separated fields recognised in a file name:
#   code-llama-34b_<system>_<prompt>_<temperature>_<dataset>_<run>
#   OpenAI_<model>_<system>_<prompt>_<temperature>_<dataset>_<run>
#   exact_<dataset>
#   deepcadrme_<system>_<dataset>
# Names whose first field is 'granular' or 'overall' are skipped; any other
# first field raises so that unknown files are not silently ignored.
#
# Produces:
#   results_files      - set of unique configuration tuples (ordered as
#                        `results_files_columns`)
#   overall_results_df - all overall summaries concatenated, each row tagged
#                        with its configuration and 'evaltype'; None when no
#                        summary file was found
#   results_files_df   - one row per unique configuration
results_files = set()
overall_results_frames = []
results_files_columns = ['method', 'model', 'system', 'prompt', 'temperature', 'run', 'dataset']

for results_file in sorted(os.listdir(results_dir)):

    results_file_parts = results_file.split('.csv')[0].split('_')

    # Fields that a given file-name layout does not carry stay 'NA'.
    system, prompt, temperature, run = ['NA']*4

    if results_file_parts[0] == 'code-llama-34b':
        method = 'LLM'
        model = results_file_parts[0]
        system, prompt, temperature = results_file_parts[1:4]
        dataset = results_file_parts[4]
        run = results_file_parts[5]
    elif results_file_parts[0] == 'OpenAI':
        method = 'LLM'
        model = results_file_parts[1]
        system, prompt, temperature = results_file_parts[2:5]
        dataset = results_file_parts[5]
        run = results_file_parts[6]
    elif results_file_parts[0] == 'exact':
        method = results_file_parts[0]
        model = 'exact'
        dataset = results_file_parts[1]
    elif results_file_parts[0] == 'deepcadrme':
        method = 'BERT'
        model = 'deepcadrme'
        system = results_file_parts[1]
        dataset = results_file_parts[2]
    elif results_file_parts[0] in ('granular', 'overall'):
        continue
    else:
        raise Exception(results_file_parts[0])

    # A tuple (not a list) is required here: set elements must be hashable.
    # The tuple order must match `results_files_columns`.
    file_config = (method, model, system, prompt, temperature, run, dataset)
    results_files.add(file_config)

    for evaltype in ('strict', 'lenient'):
        overall_results_file = results_file.replace('.csv', f"_{evaltype}_overall.csv")
        overall_results_path = os.path.join(results_dir, overall_results_file)
        if not os.path.exists(overall_results_path):
            continue
        res = pd.read_csv(overall_results_path, index_col=0)
        # Use this file's own configuration rather than indexing into the
        # (unordered, non-subscriptable) set.
        for name, colval in zip(results_files_columns, file_config):
            res.insert(0, name, [colval]*res.shape[0])
        res.insert(0, 'evaltype', [evaltype]*res.shape[0])
        overall_results_frames.append(res)

# Concatenate once instead of re-copying the accumulated frame per file.
if overall_results_frames:
    overall_results_df = pd.concat(overall_results_frames)
else:
    # Kept as None for backward compatibility; say so explicitly because later
    # cells that call methods on it would otherwise fail with an opaque error.
    overall_results_df = None
    print(f"No '*_overall.csv' summaries found in {results_dir!r}; overall_results_df is None.",
          file=sys.stderr)

# pandas refuses a raw set (unordered), so pass a deterministically sorted list.
results_files_df = pd.DataFrame(sorted(results_files), columns=results_files_columns)

In [272]:
# Count the non-null `run` entries recorded for every configuration.
config_keys = ['dataset', 'method', 'model', 'system', 'prompt', 'temperature']
results_files_df.groupby(config_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,run
dataset,method,model,system,prompt,temperature,Unnamed: 6_level_1


In [273]:
# Active configuration: default evaluation, restricted to ade_type 'all'.
groupbycols = [
    'dataset', 'ade_type', 'section',
    'method', 'model', 'system', 'prompt', 'temperature',
]
additional_querystr = " & ade_type=='all'"

# Alternative configuration: evaluate the prompt
# (found that pharmexpert-v1 is working a bit better than the others)
# groupbycols = ['dataset', 'ade_type', 'section', 'prompt', 'method']
# additional_querystr = " & model != 'code-llama-34b' & method == 'LLM'"

# Alternative configuration: compare ade_types
# groupbycols = ['ade_type', 'dataset', 'ade_type', 'section', 'method', 'model', 'system', 'prompt']
# additional_querystr = ""

# Base filter (lenient evaluation, train split, 'adverse reactions' section)
# followed by the configuration-specific suffix chosen above.
querystr = f"evaltype=='lenient' & dataset=='train' & section == 'adverse reactions'{additional_querystr}"
metric = 'f1'

# Summarise the macro-averaged metric per group.
macro_column = f"macro_{metric}"
selected_rows = overall_results_df.query(querystr)
selected_rows.groupby(groupbycols)[macro_column].agg(['count', 'mean', 'min', 'max'])

AttributeError: 'NoneType' object has no attribute 'query'

In [268]:
# Summarise the micro-averaged metric per group, reusing the filter, grouping
# columns and metric name defined in the evaluation-configuration cell.
micro_column = f"micro_{metric}"
micro_rows = overall_results_df.query(querystr)
micro_rows.groupby(groupbycols)[micro_column].agg(['count', 'mean', 'min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,count,mean,min,max
dataset,ade_type,section,method,model,system,prompt,temperature,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
train,all,adverse reactions,LLM,code-llama-34b,fatal-prompt-v2,pharmexpert-v0,temp0,1,0.420294,0.420294,0.420294
train,all,adverse reactions,LLM,code-llama-34b,fatal-prompt-v2,pharmexpert-v1,temp0,1,0.429824,0.429824,0.429824
train,all,adverse reactions,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,no-system-prompt,temp0,3,0.711965,0.703835,0.720075
train,all,adverse reactions,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,pharmexpert-v0,temp0,3,0.775151,0.773829,0.776471
train,all,adverse reactions,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,pharmexpert-v0,temp0.5,3,0.778933,0.772114,0.783626
train,all,adverse reactions,LLM,gpt-3.5-turbo-0125,fatal-prompt-v2,pharmexpert-v1,temp0,4,0.799266,0.767364,0.822303
train,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v2,no-system-prompt,temp0,3,0.89424,0.893668,0.894816
train,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v2,pharmexpert-v0,temp0,3,0.893306,0.890133,0.897992
train,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v2,pharmexpert-v0,temp0.5,3,0.894164,0.887528,0.898406
train,all,adverse reactions,LLM,gpt-4-1106-preview,fatal-prompt-v2,pharmexpert-v0,temp1.0,3,0.892385,0.888134,0.895062
