In [1]:
import os
from tqdm import tqdm
import pandas as pd
from openai import OpenAI, AzureOpenAI
import csv
import numpy as np
import concurrent
import time
import json
from common_string import common_lenient_performance

## Set Up

### Functions

In [2]:
def perform_extraction(api_source, config, gpt_model, system_content, prompt, text, temperature):
    """Send one piece of label text to a chat model and return its raw reply.

    Parameters
    ----------
    api_source : str
        'OpenAI' or 'Azure'. Selects the client class and the section of
        `config` the credentials are read from.
    config : dict
        Parsed configuration. `config[api_source]['openai_api_key']` is always
        read; `config[api_source]['openai_api_endpoint']` is read for 'Azure' only.
    gpt_model : str
        Value passed as `model` to the chat completions call.
    system_content : str
        Content of the system message (may be an empty string).
    prompt : str
        Template with one positional `{}` placeholder, filled with `text`
        through `str.format`.
    text : str
        Label text to extract from.
    temperature : float
        Sampling temperature passed to the API.

    Returns
    -------
    The `message.content` of the first returned choice, unprocessed.

    Raises
    ------
    ValueError
        If `api_source` is neither 'OpenAI' nor 'Azure'.
    """
    # Validate before touching `config` so a typo in the source name is
    # reported as such rather than as a KeyError on the config lookup.
    if api_source not in ('OpenAI', 'Azure'):
        raise ValueError(f"Unexpected API source requested: {api_source}")

    credentials = config[api_source]
    if api_source == 'OpenAI':
        client = OpenAI(api_key=credentials['openai_api_key'])
    else:
        client = AzureOpenAI(
            api_key=credentials['openai_api_key'],
            api_version="2023-12-01-preview",
            azure_endpoint=credentials['openai_api_endpoint'],
        )

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": prompt.format(text)},
        ],
        model=gpt_model,
        temperature=temperature,
    )
    return chat_completion.choices[0].message.content

def perform_cleanup(extraction, openai_api):
    """Ask a chat model to turn a raw extraction into a comma separated list.

    The request always goes to the OpenAI service (model "gpt-3.5-turbo-16k",
    temperature 0), with an empty system message and an instruction to strip
    any preamble/postamble from the extraction.

    Parameters
    ----------
    extraction : str
        Raw model reply that should be normalised.
    openai_api : str
        OpenAI API key used to build the client.

    Returns
    -------
    The `message.content` of the first returned choice.
    """
    instruction_template = (
        "The following text is an extraction of adverse event terms from a drug label. "
        "Please remove any preamble or postamble from the list and turn the list of ADEs "
        "into a comma separated list. \n"
        "The text: {}"
    )
    conversation = [
        {"role": "system", "content": ""},
        {"role": "user", "content": instruction_template.format(extraction)},
    ]

    response = OpenAI(api_key=openai_api).chat.completions.create(
        messages=conversation,
        model="gpt-3.5-turbo-16k",
        temperature=0,
    )
    return response.choices[0].message.content



In [3]:
# function for running GPT
def extract_ade_terms(api_source, config, gpt_model, system_content, prompt, text, temperature):
    """Extract ADE terms from `text`, tidying the reply if it spans several lines.

    The reply of `perform_extraction` is returned unchanged when it contains
    no newline. Otherwise it is passed through `perform_cleanup`, which always
    uses the key stored under config['OpenAI'], whatever `api_source` is.
    """
    raw_reply = perform_extraction(api_source, config, gpt_model, system_content, prompt, text, temperature)
    if raw_reply.find('\n') == -1:
        return raw_reply
    return perform_cleanup(raw_reply, config['OpenAI']['openai_api_key'])


In [4]:
def evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'all', lenient=False):
    '''
    For a given drug, evaluate the performance of GPT on a given subtype of ADEs.

    Only rows whose section_name is "adverse reactions" are compared, on both
    the manual and the GPT side.

    Parameters
    ----------
    manual_ades : pandas.DataFrame
        Manual annotations. Reads the columns drug_name, section_name and
        reaction_string, plus the 0/1 flag column of the requested subtype
        (meddra_exact_term, negated_term or discontinuous_term).
    gpt_output : pandas.DataFrame
        Model output with the columns drug_name, section_name and gpt_output
        (a comma separated string of terms). If several rows match the drug,
        only the first one is scored.
    drug : str
        Drug name to evaluate.
    subtype : str
        'all', 'exact-meddra', 'non-meddra', 'negated' or 'discontinuous'.
        Any other value leaves the manual terms unfiltered but is still
        treated as a subtype when the result is assembled.
    lenient : bool
        False compares terms by exact string equality; True delegates the
        counting to common_lenient_performance.

    Returns
    -------
    list
        [drug, subtype, n_manual, n_gpt, TP, FP, FN, precision, recall, f1].
        FP, precision and f1 are NaN for every subtype other than 'all'.
        All six metrics are NaN when gpt_output has no row for the drug.
    '''
    # Boolean masks instead of DataFrame.query: a query string assembled with
    # str.format is a syntax error for drug names that contain a quote.
    in_scope = (manual_ades['drug_name'] == drug) & (manual_ades['section_name'] == 'adverse reactions')
    drug_df = manual_ades[in_scope]

    # subtype -> (flag column, flag value selecting that subtype)
    subtype_flags = {
        'exact-meddra': ('meddra_exact_term', 1),
        'non-meddra': ('meddra_exact_term', 0),
        'negated': ('negated_term', 1),
        'discontinuous': ('discontinuous_term', 1),
    }
    if subtype in subtype_flags:
        flag_column, flag_value = subtype_flags[subtype]
        drug_df = drug_df[drug_df[flag_column] == flag_value]

    # NOTE(review): manual terms are compared as stored while GPT terms are
    # lower-cased below - assumes reaction_string is already lower case.
    manual = set(drug_df['reaction_string'].to_list())

    gpt_rows = gpt_output[
        (gpt_output['drug_name'] == drug)
        & (gpt_output['section_name'] == "adverse reactions")
    ]
    term_lists = (
        gpt_rows["gpt_output"].astype(str)
        .str.lower()
        .str.replace('\n-', ', ', regex=False)  # literal match, same on every pandas version
        .str.split(",")
        .tolist()
    )

    if not term_lists:
        # No GPT output for this drug/section: nothing can be scored.
        return [drug, subtype, len(manual), 0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]

    # Strip first, then drop empties, so "a, b, " does not produce an empty
    # term that would be counted as a false positive.
    gpt_drug = {term.strip() for term in term_lists[0]} - {''}

    if not lenient:
        TP = len(manual & gpt_drug)
        FP = len(gpt_drug - manual)
        FN = len(manual - gpt_drug)
        # np.nan (lower case) is the spelling available in every NumPy version.
        precision = TP / (TP + FP) if (TP + FP) > 0 else np.nan
        recall = TP / (TP + FN) if (TP + FN) > 0 else np.nan
        # NOTE(review): f1 stays NaN (not 0) whenever there is no true
        # positive, so such drugs drop out of a macro-averaged f1. Kept as in
        # the original to leave reported numbers comparable - confirm intent.
        if precision > 0 and recall > 0:
            f1 = (2 * precision * recall) / (precision + recall)  # 2*TP/(2*TP+FP+FN)
        else:
            f1 = np.nan
    else:
        [TP, FP, FN, precision, recall, f1] = common_lenient_performance(gpt_drug, manual)

    if subtype != 'all':
        # GPT terms carry no subtype label, so false positives (and anything
        # derived from them) can't be computed for the subtypes.
        precision = np.nan
        f1 = np.nan
        FP = np.nan

    return [drug, subtype, len(manual), len(gpt_drug), TP, FP, FN, precision, recall, f1]


In [5]:
def evaluation(manual_ades, gpt_output, lenient=False, limit = 1000):
    """Score every drug in `gpt_output` against the manual annotations.

    All ADE types are evaluated together (subtype 'all'); see
    `evaluation_granular` for the per-subtype breakdown.

    Parameters
    ----------
    manual_ades : pandas.DataFrame
        Manual annotations, passed through to `evaluation_subtype`.
    gpt_output : pandas.DataFrame
        Model output; its distinct drug_name values define what is scored.
    lenient : bool
        Passed through to `evaluation_subtype` to choose exact or lenient matching.
    limit : int
        Maximum number of distinct drugs to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per drug with the columns drug_name, exclude, n_manual, n_gpt,
        tp, fp, fn, precision, recall, f1. The 'exclude' column holds the
        subtype label returned by `evaluation_subtype` (always 'all' here).
    """
    drugs = gpt_output['drug_name'].unique()
    # `lenient` must go by keyword: the fourth positional parameter of
    # evaluation_subtype is `subtype`, not `lenient`.
    results = [
        evaluation_subtype(manual_ades, gpt_output, drug, subtype='all', lenient=lenient)
        for drug in tqdm(drugs[:limit])
    ]
    return pd.DataFrame(
        results,
        columns=['drug_name', 'exclude', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1'],
    )

In [6]:
def evaluation_granular(manual_ades, gpt_output, limit = 1000, lenient=False):
    """Score each drug in `gpt_output` overall and for every ADE subtype.

    For at most `limit` distinct drugs, `evaluation_subtype` is called once per
    ADE type ('all', 'exact-meddra', 'non-meddra', 'negated', 'discontinuous').

    Returns
    -------
    pandas.DataFrame
        Five rows per drug with the columns drug_name, ade_type, n_manual,
        n_gpt, tp, fp, fn, precision, recall, f1.
    """
    ade_types = ('all', 'exact-meddra', 'non-meddra', 'negated', 'discontinuous')
    rows = []
    # unique() never repeats a drug, so the running position equals the
    # number of distinct drugs seen so far.
    for position, drug in enumerate(tqdm(gpt_output['drug_name'].unique()), start=1):
        if position > limit:
            break
        rows.extend(
            evaluation_subtype(manual_ades, gpt_output, drug, subtype = ade_type, lenient=lenient)
            for ade_type in ade_types
        )

    column_names = ['drug_name', 'ade_type', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1']
    return pd.DataFrame(rows, columns=column_names)

### Variables

In [7]:
# Label text that is sent to the model; the leading token of this file name
# ("train"/"test") is later used to tag the output files.
drug_file = 'data/train_drug_label_text.csv'
# Manually annotated ADE terms, the reference the model output is scored against.
manual_file = 'data/train_drug_label_text_manual_ades.csv'
# Upper bound on the number of distinct drugs selected for a run.
my_max = 10000

In [8]:
# Load the label sections and the manual annotations they are scored against.
drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)
# Dataset split = leading token of the file name ("train_..." or "test_...").
# basename() keeps this correct however many directories precede the file.
set_type = os.path.basename(drug_file).split('_')[0]

## Run GPT

In [11]:
# In-memory store of model outputs, keyed by run name.
# Re-running this cell discards everything stored so far.
outputs = {}

In [38]:
# Credentials and the model name are read from a local JSON file so that no
# key is hardcoded in the notebook. The context manager closes the file.
with open('./config.json') as config_fh:
    config = json.load(config_fh)

organization = ""

api_source = 'OpenAI'

api_key = config[api_source]['openai_api_key']
# Only the Azure client is given an endpoint, so a missing entry is tolerated here.
api_endpoint = config[api_source].get('openai_api_endpoint')

gpt_model = config[api_source]["gpt_model"]
# gpt_model = "gpt-4-turbo-preview"
# gpt_model = "gpt-3.5-turbo-0125"

temperature = 0

In [39]:
# Number of repeated runs for the selected configuration; the run index is
# appended to the output file name.
nruns = 2

# Candidate system messages, keyed by the short name used in output file names.
system_options = {
    "no-system-prompt": "",
    "pharmexpert-v0": "You are an expert in pharmacology.",
    "pharmexpert-v1": "You are an expert in medical natural language processing, adverse drug reactions, pharmacology, and clinical trials."
}

# Candidate user prompts, keyed by name; the '{}' placeholder receives the label text.
# NOTE(review): the first sentence of "fatal-prompt-v2" is split by a period and
# a newline ("...all synonyms." / "mentioned in the text...") - this looks
# unintended. Presumably existing results were produced with this exact wording,
# so a corrected prompt should get a new name rather than replace this one.
prompt_options = {
    "fatal-prompt-v2": """
Extract all adverse reactions as they appear, including all synonyms.
mentioned in the text and provide them as a comma-separated list.
If a fatal event is listed add 'death' to the list.
The text is :'{}' 
"""
}

# Configuration selected for this session.
system_name = "no-system-prompt"
system_content = system_options[system_name]

prompt_name = "fatal-prompt-v2"
prompt = prompt_options[prompt_name]

# Extra model parameters that become part of the output file name.
gpt_params = [f"temp{temperature}"]

# Base name shared by all result files of this configuration:
# source, model, prompt, system prompt, parameters and dataset split.
output_file_basename = '{}_{}_{}_{}_{}_{}'.format(api_source, gpt_model, prompt_name, system_name, '-'.join(gpt_params), set_type)
output_file_basename

'OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train'

In [40]:
# if there is a max
# Keep the "adverse reactions" rows of the first `my_max` distinct drugs.
new_rows = []
unique_drugs = set()
for i, row in drugs.iterrows():
    unique_drugs.add(row["drug_name"])
    # Stop at the first row that would introduce one drug too many.
    if len(unique_drugs) > my_max:
        break
    if row['section_name'] == 'adverse reactions':
        new_rows.append(row)

In [41]:
# run GPT
# Make sure the output directory exists before any slow API work starts;
# otherwise the final to_csv fails and nothing is written to disk.
os.makedirs('results', exist_ok=True)

for i in range(nruns):
    run_key = "{}_run{}".format(output_file_basename, i)
    run_path = 'results/{}.csv'.format(run_key)
    print(run_key)
    if run_key in outputs:
        print(f"Run {run_key} already completed and stored. Skipping.")
        continue

    # A results file left by an earlier session is reused instead of re-running.
    if os.path.exists(run_path):
        gpt_output = pd.read_csv(run_path)
        outputs[run_key] = gpt_output
        print(f"Run {run_key} started, loading from disk.")
        continue

    start = time.time()
    results = list()
    n_failed = 0
    for row in tqdm(new_rows):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text']
        try:
            gpt_out = extract_ade_terms(api_source, config, gpt_model, system_content, prompt, text, temperature)
            results.append([name, section, gpt_out])
        except Exception as err:
            # Best effort: one failing request must not abort the whole run,
            # but the failure is counted so a partial run is visible below.
            n_failed += 1
            print(f"Encountered an exception for row: {name} {section}. Error message below:")
            print(err)
            continue

    gpt_output = pd.DataFrame(
        results,
        columns=['drug_name', 'section_name', 'gpt_output']
    )
    end = time.time()

    if gpt_output.shape[0] > 0:
        outputs[run_key] = gpt_output
        gpt_output.to_csv(run_path)

    if n_failed:
        print(f"Warning: {n_failed} of {len(new_rows)} rows failed and are missing from run {run_key}.")
    print(f"Run: {run_key}, time elapsed: {end-start}s.")

OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run0


100%|██████████| 101/101 [21:11<00:00, 12.59s/it]


Run: OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run0, time elapsed: 1271.3513581752777s.
OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run1


100%|██████████| 101/101 [18:15<00:00, 10.85s/it]

Run: OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run1, time elapsed: 1095.4016880989075s.





## Exact Match Algorithm

TODO: Exact match really doesn't fit here any longer, but we need it at least once to show how well (or poorly) it does. Refactor it out into its own notebook/script.

In [42]:
# run_key = 'exact_{}'.format(set_type)

# if not os.path.exists('results/{}.csv'.format(run_key)):
#     # load the meddra terms
#     fh = open('data/meddra_llt_pt_map.txt')
#     reader = csv.reader(fh, delimiter='|')
#     header = next(reader)

#     meddra_llt_terms = set()
#     meddra_pt_terms = set()

#     for row in reader:
#         meddra_llt_terms.add(row[1].lower())
#         meddra_pt_terms.add(row[4].lower())
    
#     fh.close()

#     meddra_terms = meddra_llt_terms | meddra_pt_terms
#     len(meddra_llt_terms), len(meddra_pt_terms), len(meddra_terms)

#     results = list()
#     for row in tqdm(new_rows):
#         name, section = row['drug_name'], row['section_name']
#         text = row['section_text'].lower()
        
#         found_terms = set()
#         for term in meddra_terms:
#             if text.find(term) != -1:
#                 found_terms.add(term)
        
#         exact_out = ', '.join(list(found_terms))
        
#         results.append([name, section, exact_out])

#     exact_output = pd.DataFrame(
#         [r for r in results if r is not None],
#         columns=['drug_name', 'section_name', 'gpt_output']
#     )
#     exact_output.to_csv('results/{}.csv'.format(run_key))
    
#     outputs[run_key] = exact_output

## Evaluation

In [43]:
# Strict (exact string match) evaluation of every stored run.
for run_key, output in outputs.items():
    granular_save_filename = f'results/{run_key}_strict_granular.csv'
    overall_save_filename = f'results/{run_key}_strict_overall.csv'

    results_granular = evaluation_granular(manual_ades, output)
    by_ade_type = results_granular.groupby('ade_type')

    # Micro scores: pool the counts over all drugs, then take the ratios.
    # min_count=1 leaves a column with no valid values as NaN instead of 0.
    overall_results = by_ade_type[['tp', 'fp', 'fn']].sum(min_count = 1).reset_index()
    micro_precision = overall_results['tp'] / (overall_results['tp'] + overall_results['fp'])
    micro_recall = overall_results['tp'] / (overall_results['tp'] + overall_results['fn'])
    overall_results['micro_precision'] = micro_precision
    overall_results['micro_recall'] = micro_recall
    overall_results['micro_f1'] = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall)  # 2*tp_total/(2*tp_total+fp_total+fn_total)

    # Macro scores: mean of the per-drug values within each ADE type.
    macro_results = by_ade_type[['precision', 'recall', 'f1']].mean(numeric_only=True).reset_index()
    for metric in ('precision', 'recall', 'f1'):
        overall_results['macro_' + metric] = macro_results[metric]

    overall_results.to_csv(overall_save_filename)
    results_granular.to_csv(granular_save_filename)

    print(run_key)
    print(overall_results)

100%|██████████| 101/101 [00:00<00:00, 138.38it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run0
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3957  465.0  1380         0.894844      0.741428  0.810944   
1  discontinuous    45    NaN   343              NaN      0.115979       NaN   
2   exact-meddra  3509    NaN   654              NaN      0.842902       NaN   
3        negated    19    NaN    13              NaN      0.593750       NaN   
4     non-meddra   448    NaN   727              NaN      0.381277       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.877682      0.768117  0.820351  
1              NaN      0.132069       NaN  
2              NaN      0.860161       NaN  
3              NaN      0.583333       NaN  
4              NaN      0.477187       NaN  


100%|██████████| 101/101 [00:00<00:00, 147.68it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run1
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3930  449.0  1407         0.897465      0.736369  0.808975   
1  discontinuous    52    NaN   336              NaN      0.134021       NaN   
2   exact-meddra  3482    NaN   681              NaN      0.836416       NaN   
3        negated    16    NaN    16              NaN      0.500000       NaN   
4     non-meddra   448    NaN   727              NaN      0.381277       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.881005      0.766863  0.819508  
1              NaN      0.154932       NaN  
2              NaN      0.858272       NaN  
3              NaN      0.472222       NaN  
4              NaN      0.464327       NaN  


100%|██████████| 101/101 [00:00<00:00, 148.82it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run2
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  4001  448.0  1336         0.899303      0.749672  0.817699   
1  discontinuous    52    NaN   336              NaN      0.134021       NaN   
2   exact-meddra  3544    NaN   619              NaN      0.851309       NaN   
3        negated    15    NaN    17              NaN      0.468750       NaN   
4     non-meddra   457    NaN   718              NaN      0.388936       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.876943      0.771264  0.829339  
1              NaN      0.144458       NaN  
2              NaN      0.862802       NaN  
3              NaN      0.456349       NaN  
4              NaN      0.492795       NaN  


100%|██████████| 101/101 [00:00<00:00, 145.94it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run3
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3928  430.0  1409         0.901331      0.735994  0.810315   
1  discontinuous    54    NaN   334              NaN      0.139175       NaN   
2   exact-meddra  3488    NaN   675              NaN      0.837857       NaN   
3        negated    17    NaN    15              NaN      0.531250       NaN   
4     non-meddra   440    NaN   735              NaN      0.374468       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.883397      0.769179  0.822937  
1              NaN      0.160018       NaN  
2              NaN      0.861248       NaN  
3              NaN      0.488095       NaN  
4              NaN      0.471232       NaN  


100%|██████████| 101/101 [00:00<00:00, 149.05it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run4
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3984  442.0  1353         0.900136      0.746487  0.816143   
1  discontinuous    52    NaN   336              NaN      0.134021       NaN   
2   exact-meddra  3527    NaN   636              NaN      0.847226       NaN   
3        negated    18    NaN    14              NaN      0.562500       NaN   
4     non-meddra   457    NaN   718              NaN      0.388936       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.877745      0.770290  0.830443  
1              NaN      0.147796       NaN  
2              NaN      0.861019       NaN  
3              NaN      0.535714       NaN  
4              NaN      0.476221       NaN  


100%|██████████| 101/101 [00:00<00:00, 127.90it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run0
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3181  439.0  2156         0.878729      0.596028  0.710282   
1  discontinuous    36    NaN   352              NaN      0.092784       NaN   
2   exact-meddra  2804    NaN  1359              NaN      0.673553       NaN   
3        negated    11    NaN    21              NaN      0.343750       NaN   
4     non-meddra   377    NaN   798              NaN      0.320851       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.840049      0.650779  0.716432  
1              NaN      0.093203       NaN  
2              NaN      0.731902       NaN  
3              NaN      0.400794       NaN  
4              NaN      0.399779       NaN  


100%|██████████| 101/101 [00:00<00:00, 150.65it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run1
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3107  442.0  2230         0.875458      0.582162  0.699302   
1  discontinuous    27    NaN   361              NaN      0.069588       NaN   
2   exact-meddra  2733    NaN  1430              NaN      0.656498       NaN   
3        negated    10    NaN    22              NaN      0.312500       NaN   
4     non-meddra   374    NaN   801              NaN      0.318298       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.848372      0.646647  0.709302  
1              NaN      0.101250       NaN  
2              NaN      0.722106       NaN  
3              NaN      0.353175       NaN  
4              NaN      0.409181       NaN  


100%|██████████| 101/101 [00:00<00:00, 150.08it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run2
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3240  462.0  2097         0.875203      0.607083  0.716893   
1  discontinuous    32    NaN   356              NaN      0.082474       NaN   
2   exact-meddra  2882    NaN  1281              NaN      0.692289       NaN   
3        negated    10    NaN    22              NaN      0.312500       NaN   
4     non-meddra   358    NaN   817              NaN      0.304681       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.851308      0.658846   0.72259  
1              NaN      0.086891       NaN  
2              NaN      0.743430       NaN  
3              NaN      0.353175       NaN  
4              NaN      0.385765       NaN  


100%|██████████| 101/101 [00:00<00:00, 151.06it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run3
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  2895  413.0  2442         0.875151      0.542440  0.669751   
1  discontinuous    34    NaN   354              NaN      0.087629       NaN   
2   exact-meddra  2532    NaN  1631              NaN      0.608215       NaN   
3        negated    13    NaN    19              NaN      0.406250       NaN   
4     non-meddra   363    NaN   812              NaN      0.308936       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.848337      0.638924  0.698644  
1              NaN      0.111335       NaN  
2              NaN      0.716213       NaN  
3              NaN      0.436508       NaN  
4              NaN      0.407525       NaN  


100%|██████████| 101/101 [00:00<00:00, 150.47it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_no-system-prompt_temp0_train_run0
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  2443  313.0  2894          0.88643      0.457748  0.603732   
1  discontinuous    25    NaN   363              NaN      0.064433       NaN   
2   exact-meddra  2147    NaN  2016              NaN      0.515734       NaN   
3        negated    12    NaN    20              NaN      0.375000       NaN   
4     non-meddra   296    NaN   879              NaN      0.251915       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.859126      0.554514  0.638758  
1              NaN      0.074666       NaN  
2              NaN      0.625797       NaN  
3              NaN      0.380952       NaN  
4              NaN      0.354505       NaN  


100%|██████████| 101/101 [00:00<00:00, 149.17it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_no-system-prompt_temp0_train_run1
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  2591  303.0  2746         0.895301      0.485479  0.629571   
1  discontinuous    25    NaN   363              NaN      0.064433       NaN   
2   exact-meddra  2263    NaN  1900              NaN      0.543598       NaN   
3        negated    12    NaN    20              NaN      0.375000       NaN   
4     non-meddra   328    NaN   847              NaN      0.279149       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.858238      0.570654  0.652824  
1              NaN      0.088414       NaN  
2              NaN      0.637834       NaN  
3              NaN      0.380952       NaN  
4              NaN      0.375312       NaN  


100%|██████████| 101/101 [00:00<00:00, 144.02it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run0
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3905  407.0  1432         0.905612      0.731684   0.80941   
1  discontinuous    46    NaN   342              NaN      0.118557       NaN   
2   exact-meddra  3466    NaN   697              NaN      0.832573       NaN   
3        negated    15    NaN    17              NaN      0.468750       NaN   
4     non-meddra   439    NaN   736              NaN      0.373617       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.881586      0.759287  0.823956  
1              NaN      0.118779       NaN  
2              NaN      0.850220       NaN  
3              NaN      0.424603       NaN  
4              NaN      0.483704       NaN  


100%|██████████| 101/101 [00:00<00:00, 149.06it/s]

OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run1
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3884  389.0  1453         0.908963      0.727750  0.808325   
1  discontinuous    50    NaN   338              NaN      0.128866       NaN   
2   exact-meddra  3447    NaN   716              NaN      0.828009       NaN   
3        negated    17    NaN    15              NaN      0.531250       NaN   
4     non-meddra   437    NaN   738              NaN      0.371915       NaN   

   macro_precision  macro_recall  macro_f1  
0          0.88394      0.754128  0.822647  
1              NaN      0.129035       NaN  
2              NaN      0.844196       NaN  
3              NaN      0.488095       NaN  
4              NaN      0.484584       NaN  





#### Lenient Matching using Longest Common Substring

In [44]:
# using lenient matching method 'longest common substring'
for run_key, output in outputs.items():
    granular_save_filename = f'results/{run_key}_lenient_granular.csv'
    overall_save_file = f'results/{run_key}_lenient_overall.csv'

    results_granular = evaluation_granular(manual_ades, output, lenient=True)
    by_ade_type = results_granular.groupby('ade_type')

    # Micro scores: pool the counts over all drugs, then take the ratios.
    # min_count=1 leaves a column with no valid values as NaN instead of 0.
    overall_results = by_ade_type[['tp', 'fp', 'fn']].sum(min_count = 1).reset_index()
    micro_precision = overall_results['tp'] / (overall_results['tp'] + overall_results['fp'])
    micro_recall = overall_results['tp'] / (overall_results['tp'] + overall_results['fn'])
    overall_results['micro_precision'] = micro_precision
    overall_results['micro_recall'] = micro_recall
    overall_results['micro_f1'] = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall)  # 2*tp_total/(2*tp_total+fp_total+fn_total)

    # Macro scores: mean of the per-drug values within each ADE type.
    macro_results = by_ade_type[['precision', 'recall', 'f1']].mean(numeric_only=True).reset_index()
    for metric in ('precision', 'recall', 'f1'):
        overall_results['macro_' + metric] = macro_results[metric]

    overall_results.to_csv(overall_save_file)
    results_granular.to_csv(granular_save_filename)

    print(run_key)
    print(overall_results)

100%|██████████| 101/101 [00:06<00:00, 14.53it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run0
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4514  217.0  823         0.954132      0.845794  0.896702   
1  discontinuous   193    NaN  195              NaN      0.497423       NaN   
2   exact-meddra  3799    NaN  364              NaN      0.912563       NaN   
3        negated    26    NaN    6              NaN      0.812500       NaN   
4     non-meddra   716    NaN  459              NaN      0.609362       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.949246      0.881093  0.908541  
1              NaN      0.466871       NaN  
2              NaN      0.939424       NaN  
3              NaN      0.825397       NaN  
4              NaN      0.696340       NaN  


100%|██████████| 101/101 [00:06<00:00, 14.65it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run1
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4489  205.0  848         0.956327      0.841109  0.895025   
1  discontinuous   195    NaN  193              NaN      0.502577       NaN   
2   exact-meddra  3770    NaN  393              NaN      0.905597       NaN   
3        negated    24    NaN    8              NaN      0.750000       NaN   
4     non-meddra   720    NaN  455              NaN      0.612766       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.950712      0.876779  0.905758  
1              NaN      0.474899       NaN  
2              NaN      0.932467       NaN  
3              NaN      0.730159       NaN  
4              NaN      0.687107       NaN  


100%|██████████| 101/101 [00:07<00:00, 14.36it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run2
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4541  199.0  796         0.958017      0.850853   0.90126   
1  discontinuous   193    NaN  195              NaN      0.497423       NaN   
2   exact-meddra  3816    NaN  347              NaN      0.916647       NaN   
3        negated    23    NaN    9              NaN      0.718750       NaN   
4     non-meddra   726    NaN  449              NaN      0.617872       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.951567      0.880015  0.908425  
1              NaN      0.465079       NaN  
2              NaN      0.937799       NaN  
3              NaN      0.746032       NaN  
4              NaN      0.694165       NaN  


100%|██████████| 101/101 [00:06<00:00, 14.64it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run3
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4484  188.0  853          0.95976      0.840172  0.895994   
1  discontinuous   195    NaN  193              NaN      0.502577       NaN   
2   exact-meddra  3768    NaN  395              NaN      0.905117       NaN   
3        negated    24    NaN    8              NaN      0.750000       NaN   
4     non-meddra   717    NaN  458              NaN      0.610213       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.951563      0.876052  0.906807  
1              NaN      0.466871       NaN  
2              NaN      0.932972       NaN  
3              NaN      0.730159       NaN  
4              NaN      0.687356       NaN  


100%|██████████| 101/101 [00:06<00:00, 14.59it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run4
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4523  197.0  814         0.958263      0.847480  0.899473   
1  discontinuous   198    NaN  190              NaN      0.510309       NaN   
2   exact-meddra  3794    NaN  369              NaN      0.911362       NaN   
3        negated    25    NaN    7              NaN      0.781250       NaN   
4     non-meddra   729    NaN  446              NaN      0.620426       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.953285      0.879039  0.909465  
1              NaN      0.476700       NaN  
2              NaN      0.936835       NaN  
3              NaN      0.777778       NaN  
4              NaN      0.680824       NaN  


100%|██████████| 101/101 [00:05<00:00, 17.68it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run0
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3756  202.0  1581         0.948964      0.703766  0.808176   
1  discontinuous   145    NaN   243              NaN      0.373711       NaN   
2   exact-meddra  3120    NaN  1043              NaN      0.749460       NaN   
3        negated    21    NaN    11              NaN      0.656250       NaN   
4     non-meddra   636    NaN   539              NaN      0.541277       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.925193      0.767233  0.814083  
1              NaN      0.387205       NaN  
2              NaN      0.818923       NaN  
3              NaN      0.694444       NaN  
4              NaN      0.613370       NaN  


100%|██████████| 101/101 [00:05<00:00, 17.49it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run1
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3685  203.0  1652         0.947788      0.690463  0.798916   
1  discontinuous   139    NaN   249              NaN      0.358247       NaN   
2   exact-meddra  3059    NaN  1104              NaN      0.734807       NaN   
3        negated    20    NaN    12              NaN      0.625000       NaN   
4     non-meddra   626    NaN   549              NaN      0.532766       NaN   

   macro_precision  macro_recall  macro_f1  
0          0.93027      0.752597  0.806651  
1              NaN      0.373829       NaN  
2              NaN      0.797174       NaN  
3              NaN      0.670635       NaN  
4              NaN      0.608733       NaN  


100%|██████████| 101/101 [00:05<00:00, 17.00it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run2
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3864  198.0  1473         0.951256      0.724002  0.822215   
1  discontinuous   146    NaN   242              NaN      0.376289       NaN   
2   exact-meddra  3230    NaN   933              NaN      0.775883       NaN   
3        negated    21    NaN    11              NaN      0.656250       NaN   
4     non-meddra   634    NaN   541              NaN      0.539574       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.937195      0.772967  0.827579  
1              NaN      0.376313       NaN  
2              NaN      0.824337       NaN  
3              NaN      0.662698       NaN  
4              NaN      0.610508       NaN  


100%|██████████| 101/101 [00:04<00:00, 20.80it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v1_temp0_train_run3
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3447  200.0  1890          0.94516      0.645868  0.767364   
1  discontinuous   152    NaN   236              NaN      0.391753       NaN   
2   exact-meddra  2826    NaN  1337              NaN      0.678837       NaN   
3        negated    23    NaN     9              NaN      0.718750       NaN   
4     non-meddra   621    NaN   554              NaN      0.528511       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.927089      0.741726  0.793523  
1              NaN      0.407644       NaN  
2              NaN      0.788140       NaN  
3              NaN      0.757937       NaN  
4              NaN      0.603159       NaN  


100%|██████████| 101/101 [00:03<00:00, 25.29it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_no-system-prompt_temp0_train_run0
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  2973  138.0  2364         0.955641      0.557055  0.703835   
1  discontinuous   141    NaN   247              NaN      0.363402       NaN   
2   exact-meddra  2438    NaN  1725              NaN      0.585635       NaN   
3        negated    19    NaN    13              NaN      0.593750       NaN   
4     non-meddra   535    NaN   640              NaN      0.455319       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.939865      0.651969  0.728818  
1              NaN      0.390567       NaN  
2              NaN      0.694286       NaN  
3              NaN      0.623016       NaN  
4              NaN      0.527415       NaN  


100%|██████████| 101/101 [00:04<00:00, 24.00it/s]


OpenAI_gpt-3.5-turbo-0125_fatal-prompt-v2_no-system-prompt_temp0_train_run1
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3083  143.0  2254         0.955673      0.577665  0.720075   
1  discontinuous   128    NaN   260              NaN      0.329897       NaN   
2   exact-meddra  2545    NaN  1618              NaN      0.611338       NaN   
3        negated    19    NaN    13              NaN      0.593750       NaN   
4     non-meddra   538    NaN   637              NaN      0.457872       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.933304      0.660527   0.73411  
1              NaN      0.369577       NaN  
2              NaN      0.704971       NaN  
3              NaN      0.623016       NaN  
4              NaN      0.527547       NaN  


100%|██████████| 101/101 [00:06<00:00, 15.24it/s]


OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run0
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4460  178.0  877         0.961621      0.835675  0.894236   
1  discontinuous   184    NaN  204              NaN      0.474227       NaN   
2   exact-meddra  3746    NaN  417              NaN      0.899832       NaN   
3        negated    25    NaN    7              NaN      0.781250       NaN   
4     non-meddra   715    NaN  460              NaN      0.608511       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.956018      0.871882  0.906223  
1              NaN      0.450282       NaN  
2              NaN      0.928735       NaN  
3              NaN      0.777778       NaN  
4              NaN      0.693977       NaN  


100%|██████████| 101/101 [00:06<00:00, 15.16it/s]

OpenAI_gpt-4-1106-preview_fatal-prompt-v2_no-system-prompt_temp0_train_run1
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4446  167.0  891         0.963798      0.833052  0.893668   
1  discontinuous   179    NaN  209              NaN      0.461340       NaN   
2   exact-meddra  3727    NaN  436              NaN      0.895268       NaN   
3        negated    24    NaN    8              NaN      0.750000       NaN   
4     non-meddra   719    NaN  456              NaN      0.611915       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.957266      0.865878  0.903949  
1              NaN      0.448641       NaN  
2              NaN      0.921725       NaN  
3              NaN      0.730159       NaN  
4              NaN      0.693704       NaN  



