In [1]:
import os
import sys
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
results_dir = './results/evals'

In [42]:
results_files = set()
overall_results_df = None
results_files_columns = ['method', 'model', 'system', 'prompt', 'temperature', 'run', 'dataset']

for results_file in sorted(os.listdir(results_dir)):
    
    results_file_parts = results_file.split('.csv')[0].split('_')
    
    system, prompt, temperature, run = ['NA']*4

    if results_file_parts[0] in ('code-llama-34b', 'Mixtral-8x7B-Instruct-v0.1'):
        method = 'LLM'
        model = results_file_parts[0]
        prompt, system, temperature = results_file_parts[1:4]
        dataset = results_file_parts[4]
        run = results_file_parts[5]
    elif results_file_parts[0] == 'OpenAI':
        method = 'LLM'
        model = results_file_parts[1]
        prompt, system, temperature = results_file_parts[2:5]
        dataset = results_file_parts[5]
        run = results_file_parts[6]
    elif results_file_parts[0] == 'exact':
        method = results_file_parts[0]
        model = 'exact'
        dataset = results_file_parts[1]
    elif results_file_parts[0] == 'deepcadrme':
        method = 'BERT'
        model = 'deepcadrme'
        system = results_file_parts[1]
        dataset = results_file_parts[2]
    elif results_file_parts[0] in ('granular', 'overall', 'ember-v1'):
        continue
    else:
        raise Exception(results_file_parts[0])

    results_files.add((method, model, system, prompt, temperature, run, dataset))
    
results_files_df = pd.DataFrame(results_files, columns=results_files_columns)


In [59]:
for method, model, system, prompt, temperature, run, dataset in results_files:
    for evaltype in ('strict', 'lenient', 'ember-v1'):
        if model == 'exact':
            if evaltype == 'ember-v1':
                continue
            else:
                overall_results_file = '_'.join([model, dataset, evaltype, 'overall.csv'])
        elif model == 'deepcadrme':
            if evaltype == 'ember-v1':
                continue
            else:
                overall_results_file = '_'.join([model, system, dataset, evaltype, 'overall.csv'])
        elif model.startswith('gpt'):
            overall_results_file = '_'.join(['OpenAI', model, prompt, system, temperature, dataset, run, evaltype, 'overall.csv'])
        else:
            overall_results_file = '_'.join([model, prompt, system, temperature, dataset, run, evaltype, 'overall.csv'])
        # overall_results_file = results_file.replace('.csv', f"_{evaltype}_overall.csv")
        
        if not os.path.exists(os.path.join(results_dir, overall_results_file)):
            print(model, system, prompt)
            print(os.path.join(results_dir, overall_results_file))
            continue
        
        res = pd.read_csv(os.path.join(results_dir, overall_results_file), index_col=0)
        
        for name, colval in zip(results_files_columns, list(results_files)[-1]):
            res.insert(0, name, [colval]*res.shape[0])
        res.insert(0, 'evaltype', [evaltype]*res.shape[0])
        
        if overall_results_df is None:
            overall_results_df = res
        else:
            overall_results_df = pd.concat([overall_results_df, res])

overall_results_df.shape

gpt-4-1106-preview pharmexpert-v0 fatal-prompt-v2
./results/evals/OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v0_temp1.5_train_run0_lenient_overall.csv
gpt-4-1106-preview pharmexpert-v1 fatal-prompt-v2
./results/evals/OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_test_run0_ember-v1_overall.csv
gpt-4-1106-preview pharmexpert-v0 gpt-written-prompt
./results/evals/OpenAI_gpt-4-1106-preview_gpt-written-prompt_pharmexpert-v0_temp0_test_run0_ember-v1_overall.csv
gpt-4-1106-preview pharmexpert-v0 only-positives-v0
./results/evals/OpenAI_gpt-4-1106-preview_only-positives-v0_pharmexpert-v0_temp0_test_run0_ember-v1_overall.csv


(18406, 19)

In [51]:
results_files_df.groupby(['dataset', 'method', 'model', 'system', 'prompt', 'temperature']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,run
dataset,method,model,system,prompt,temperature,Unnamed: 6_level_1
test,BERT,deepcadrme,002,,,1
test,BERT,deepcadrme,005,,,1
test,BERT,deepcadrme,010,,,1
test,BERT,deepcadrme,015,,,1
test,BERT,deepcadrme,025,,,1
test,BERT,deepcadrme,050,,,1
test,BERT,deepcadrme,075,,,1
test,BERT,deepcadrme,100,,,1
test,LLM,gpt-3.5-turbo-0125,pharmexpert-v1,fatal-prompt-v2,temp0,3
test,LLM,gpt-4-1106-preview,pharmexpert-v0,gpt-written-prompt,temp0,1


In [60]:
overall_results_df

Unnamed: 0,evaltype,dataset,run,temperature,prompt,system,model,method,section,ade_type,tp,fp,fn,micro_precision,micro_recall,micro_f1,macro_precision,macro_recall,macro_f1
0,strict,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,adverse reactions,all,2577.0,385.0,2832.0,0.870020,0.476428,0.615697,0.826498,0.547706,0.650816
1,strict,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,adverse reactions,discontinuous,18.0,,379.0,,0.045340,,,0.035179,
2,strict,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,adverse reactions,exact-meddra,2325.0,,1838.0,,0.558491,,,0.625631,
3,strict,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,adverse reactions,hypothetical,20.0,,60.0,,0.250000,,,0.306034,
4,strict,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,adverse reactions,negated,14.0,,59.0,,0.191781,,,0.164762,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,ember-v1,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,warnings and precautions,discontinuous,343.0,,1291.0,,0.209914,,,0.263334,
17,ember-v1,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,warnings and precautions,exact-meddra,1144.0,,600.0,,0.655963,,,0.578699,
18,ember-v1,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,warnings and precautions,negated,49.0,,590.0,,0.076682,,,0.114180,
19,ember-v1,train,run0,temp0,gpt-written-prompt,pharmexpert-v0,gpt-4-turbo-preview,LLM,warnings and precautions,non-meddra,1301.0,,704.0,,0.648878,,,0.623480,


In [61]:
# default eval
groupbycols = ['dataset', 'ade_type', 'section', 'method', 'model', 'system', 'prompt', 'temperature']
additional_querystr = " & ade_type=='all'"

# evaluate the prompt
# found that pharmexpert-v1 is working a bit better than the others
# groupbycols = ['dataset', 'ade_type', 'section', 'prompt', 'method']
# additional_querystr = " & model != 'code-llama-34b' & method == 'LLM'"

# compare ade_types
# groupbycols = ['ade_type', 'dataset', 'ade_type', 'section', 'method', 'model', 'system', 'prompt']
# additional_querystr = ""

querystr = "evaltype=='lenient' & dataset=='train' & section == 'adverse reactions'" + additional_querystr
metric = 'f1'
overall_results_df.query(querystr).groupby(groupbycols)[f"macro_{metric}"].agg(['count', 'mean', 'min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,count,mean,min,max
dataset,ade_type,section,method,model,system,prompt,temperature,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
train,all,adverse reactions,LLM,gpt-4-turbo-preview,pharmexpert-v0,gpt-written-prompt,temp0,317,0.844977,0.364051,0.957095


In [62]:
overall_results_df.query(querystr).groupby(groupbycols)[f"micro_{metric}"].agg(['count', 'mean', 'min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,count,mean,min,max
dataset,ade_type,section,method,model,system,prompt,temperature,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
train,all,adverse reactions,LLM,gpt-4-turbo-preview,pharmexpert-v0,gpt-written-prompt,temp0,317,0.830978,0.185248,0.959057
