In [1]:
import os
from tqdm import tqdm
import pandas as pd
from openai import OpenAI, AzureOpenAI
import openai
import constants
import csv
import numpy as np
import concurrent
import time
import json

## Set Up

### Functions

In [2]:
# function for running GPT
def extract_ade_terms(api_source, api_endpoint, gpt_model, system_content, prompt, text, openai_api):
    """Send one label text to a chat-completion model and return its reply.

    Parameters
    ----------
    api_source : str
        'OpenAI' or 'Azure'; selects which client class is instantiated.
    api_endpoint : str
        Passed as ``azure_endpoint``; only used when ``api_source == 'Azure'``.
    gpt_model : str
        Model name forwarded as ``model=`` to the completion call.
    system_content : str
        Content of the system message.
    prompt : str
        Template for the user message; ``text`` is substituted with
        ``prompt.format(text)``.
    text : str
        The label text inserted into the prompt.
    openai_api : str
        API key handed to the client constructor.

    Returns
    -------
    The ``content`` of the first choice of the completion.

    Raises
    ------
    Exception
        If ``api_source`` is neither 'OpenAI' nor 'Azure'.
    """
    # Reject unsupported sources up front, before any client is built.
    if api_source not in ('OpenAI', 'Azure'):
        raise Exception(f"Unexpected API source requested: {api_source}")

    if api_source == 'OpenAI':
        client = OpenAI(api_key=openai_api)
    else:
        client = AzureOpenAI(
            api_key=openai_api,
            api_version="2023-12-01-preview",
            azure_endpoint=api_endpoint,
        )

    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": prompt.format(text)},
    ]
    response = client.chat.completions.create(messages=messages, model=gpt_model)
    return response.choices[0].message.content

In [3]:
def evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'all'):
    '''
    For a given drug, evaluate the performance of GPT on a given subtype of ADEs.

    Only rows whose section_name is 'adverse reactions' are considered, in both
    the manual annotations and the GPT output.

    Parameters
    ----------
    manual_ades : pd.DataFrame
        Manual annotations. Columns read here: drug_name, section_name,
        reaction_string, meddra_exact_term, negated_term, discontinuous_term.
    gpt_output : pd.DataFrame
        Model output. Columns read here: drug_name, section_name, gpt_output
        (a comma-separated, or newline-and-dash separated, list of terms).
    drug : str
        Drug name to evaluate.
    subtype : str
        'all', 'exact-meddra', 'non-meddra', 'negated' or 'discontinuous'.
        Any other value leaves the manual terms unfiltered.

    Returns
    -------
    list
        [drug, subtype, n_manual, n_gpt, TP, FP, FN, precision, recall, f1].
        When the drug has no GPT row, n_gpt is 0 and every metric is NaN.
        For subtypes other than 'all', FP, precision and f1 are NaN because
        GPT terms are not labelled with a subtype.
    '''
    section = 'adverse reactions'

    # Boolean masks rather than DataFrame.query(): a query string built with
    # str.format breaks on drug names that contain a quote character.
    drug_df = manual_ades[
        (manual_ades['drug_name'] == drug) & (manual_ades['section_name'] == section)
    ]
    if subtype == 'exact-meddra':
        drug_df = drug_df[drug_df['meddra_exact_term'] == 1]
    if subtype == 'non-meddra':
        drug_df = drug_df[drug_df['meddra_exact_term'] == 0]
    if subtype == 'negated':
        drug_df = drug_df[drug_df['negated_term'] == 1]
    if subtype == 'discontinuous':
        drug_df = drug_df[drug_df['discontinuous_term'] == 1]

    manual = set(drug_df['reaction_string'].to_list())

    # One list of raw term strings per matching GPT row.
    gpt_rows = (
        gpt_output.loc[
            (gpt_output['drug_name'] == drug) & (gpt_output['section_name'] == section),
            'gpt_output',
        ]
        .astype(str)
        .str.lower()
        # Bulleted answers ("\n- term") are folded into the comma-separated form.
        .str.replace('\n-', ', ', regex=False)
        .str.split(',')
        .tolist()
    )

    # No GPT answer for this drug: nothing to score.
    if not gpt_rows:
        return [drug, subtype, len(manual), 0,
                np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]

    # Only the first matching row is scored.
    gpt_drug = {term.strip() for term in gpt_rows[0]}

    #overall
    TP = len(manual & gpt_drug)
    FP = len(gpt_drug - manual)
    FN = len(manual - gpt_drug)

    # np.nan (lowercase) is used throughout: the np.NAN alias was removed in NumPy 2.0.
    precision = TP / (TP + FP) if (TP + FP) > 0 else np.nan
    recall = TP / (TP + FN) if (TP + FN) > 0 else np.nan

    # NOTE(review): F1 is left as NaN (not 0) when there is no overlap, which
    # matches the previous behaviour; such drugs are therefore skipped by any
    # NaN-ignoring macro average. Confirm this is intended.
    if TP > 0:
        f1 = (2 * precision * recall)/(precision + recall)  # 2*TP/(2*TP+FP+FN)
    else:
        f1 = np.nan

    if subtype != 'all':
        # these can't be computed for the subtypes
        precision = np.nan
        f1 = np.nan
        FP = np.nan

    return [drug, subtype, len(manual), len(gpt_drug), TP, FP, FN, precision, recall, f1]

In [4]:
def evaluation(manual_ades, gpt_output, limit = 1000):
    """Score every drug in ``gpt_output`` against the manual annotations.

    Calls ``evaluation_subtype`` with its default subtype once per distinct
    ``drug_name`` in ``gpt_output``.

    Parameters
    ----------
    manual_ades : pd.DataFrame
        Manual annotations, forwarded to ``evaluation_subtype``.
    gpt_output : pd.DataFrame
        Model output; must contain a ``drug_name`` column.
    limit : int
        Maximum number of distinct drugs to evaluate. Previously accepted but
        ignored; now honoured the same way ``evaluation_granular`` does.

    Returns
    -------
    pd.DataFrame
        One row per evaluated drug with columns drug_name, exclude, n_manual,
        n_gpt, tp, fp, fn, precision, recall, f1.
    """
    drugs = gpt_output['drug_name'].unique()
    results = []
    for n_seen, drug in enumerate(tqdm(drugs), start=1):
        if n_seen > limit:
            break
        results.append(evaluation_subtype(manual_ades, gpt_output, drug))

    # The second column holds the subtype label returned by evaluation_subtype;
    # the historical name 'exclude' is kept so existing consumers keep working.
    results = pd.DataFrame(results, columns=['drug_name', 'exclude', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1'])
    return results

In [5]:
def evaluation_granular(manual_ades, gpt_output, limit = 1000):
    """Score each drug in ``gpt_output`` once per ADE subtype.

    For at most ``limit`` distinct drug names, ``evaluation_subtype`` is called
    for 'all', 'exact-meddra', 'non-meddra', 'negated' and 'discontinuous', in
    that order. Returns a DataFrame with one row per (drug, subtype) and columns
    drug_name, ade_type, n_manual, n_gpt, tp, fp, fn, precision, recall, f1.
    """
    ade_types = ('all', 'exact-meddra', 'non-meddra', 'negated', 'discontinuous')
    column_names = ['drug_name', 'ade_type', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1']

    rows = []
    # Drug names are already unique, so the running position is the number of
    # distinct drugs seen so far.
    for position, drug in enumerate(tqdm(gpt_output['drug_name'].unique()), start=1):
        if position > limit:
            break
        for ade_type in ade_types:
            rows.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = ade_type))

    return pd.DataFrame(rows, columns=column_names)

### Variables

In [7]:
# CSV with the drug label text (read into `drugs`).
drug_file = 'data/train_drug_label_text.csv'
# CSV with the manually annotated ADE terms (read into `manual_ades`).
manual_file = 'data/train_drug_label_text_manual_ades.csv'
# Upper bound on the number of distinct drugs taken from `drugs` when selecting rows to send to GPT.
my_max = 10000

In [8]:
drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)
# Dataset split taken from the file name, assuming it follows the format
# "train_..." or "test_...". basename() makes this independent of how many
# directories precede the file (the old split('/')[1] required exactly one).
set_type = os.path.basename(drug_file).split('_')[0]

## Run GPT

In [9]:
# Maps a method key (e.g. "gpt0", "exact") to its output DataFrame.
# Re-running this cell discards everything held in memory.
outputs = {}

In [18]:
# Credentials and endpoints live in config.json, keyed by API source.
# The context manager closes the file even if parsing fails.
with open('./config.json') as config_fh:
    config = json.load(config_fh)

organization = ""

api_source = 'OpenAI'

api_key = config[api_source]['openai_api_key'] #constants.AZURE_OPENAI_KEY
api_endpoint = config[api_source]['openai_api_endpoint']

gpt_model = config[api_source]["gpt_model"]
# The configured model is deliberately overridden here; the model name also
# becomes part of the results file name.
gpt_model = "gpt-4-0613"


In [19]:
# Number of independent GPT runs; each run is stored under the key "gpt<i>".
nruns = 1

# Candidate system messages, keyed by a short name that is embedded in the output file name.
system_options = {
    "pharmexpert-v0": "You are an expert in pharmacology.",
    "pharmexpert-v1": "You are an expert in medical natural language processing, adverse drug reactions, pharmacology, and clinical trials."
}

# Candidate user prompts; '{}' is filled with the label text via str.format.
# NOTE(review): the first sentence of the prompt is split by a stray period
# ("...synonyms." / "mentioned in the text..."). It is left verbatim because
# changing the wording changes the experiment the prompt name identifies.
prompt_options = {
    "fatal-prompt-v2": """
Extract all adverse reactions as they appear, including all synonyms.
mentioned in the text and provide them as a comma-separated list.
If a fatal event is listed add 'death' to the list.
The text is :'{}' 
"""
}

# Active system message.
system_name = "pharmexpert-v0"
system_content = system_options[system_name]

# Active prompt.
prompt_name = "fatal-prompt-v2"
prompt = prompt_options[prompt_name]

# Prefix shared by every results file written for this configuration.
output_file_basename = '{}_{}_{}_{}_{}'.format(api_source, gpt_model, prompt_name, system_name, set_type)
output_file_basename

'OpenAI_gpt-4-0613_fatal-prompt-v2_pharmexpert-v0_train'

In [20]:
# if there is a max
new_rows = list()
unique_drugs = set()
for i, row in drugs.iterrows():
    unique_drugs.add(row["drug_name"])
    if len(unique_drugs) > my_max: 
        break
    if row['section_name'] != 'adverse reactions':
        continue

    new_rows.append(row)

In [21]:
# run GPT
for i in range(nruns):
    if f"gpt{i}" in outputs:
        print(f"Run {i} already completed and stored. Skipping.")
        continue
    
    if os.path.exists('results/{}_run{}.csv'.format(output_file_basename, i)):
        gpt_output = pd.read_csv('results/{}_run{}.csv'.format(output_file_basename, i))
        outputs[f"gpt{i}"] = gpt_output
        print(f"Run {i} already completed and loaded from disk.")
        continue
    
    start = time.time()
    results = list()
    for row in tqdm(new_rows):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text']
        try:
            gpt_out = extract_ade_terms(api_source, api_endpoint, gpt_model, system_content, prompt, text, api_key)
            time.sleep(5)
            results.append([name, section, gpt_out])
        except Exception as err:
            print(f"Encountered an exception for row: {name} {section}. Error message below:")
            print(err)
            continue
            
    gpt_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'section_name', 'gpt_output']
    )
    end = time.time()

    if gpt_output.shape[0] > 0:
        outputs[f"gpt{i}"] = gpt_output
        gpt_output.to_csv('results/{}_run{}.csv'.format(output_file_basename, i))
    
    print(f"Run: {i}, time elapsed: {end-start}s.")

 65%|██████▌   | 66/101 [21:08<05:24,  9.28s/it]  

Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 10144 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


100%|██████████| 101/101 [28:42<00:00, 17.06s/it]

Run: 0, time elapsed: 1722.6388738155365s.





## Exact Match Algorithm

In [23]:
# When True, the exact-match baseline is skipped (it runs only if this flag is False).
exact_match_run = True

In [24]:
if not exact_match_run:
    # load the meddra terms
    meddra_llt_terms = set()
    meddra_pt_terms = set()

    # newline='' is what the csv module expects; the context manager closes
    # the file even if a malformed row raises.
    with open('meddra_llt_pt_map.txt', newline='') as fh:
        reader = csv.reader(fh, delimiter='|')
        header = next(reader)  # skip the header row

        for row in reader:
            # Fields at index 1 and 4 are collected as the LLT and PT terms respectively.
            meddra_llt_terms.add(row[1].lower())
            meddra_pt_terms.add(row[4].lower())

    meddra_terms = meddra_llt_terms | meddra_pt_terms
    print(f"MedDRA terms loaded - LLT: {len(meddra_llt_terms)}, PT: {len(meddra_pt_terms)}, combined: {len(meddra_terms)}")

    results = list()
    for row in tqdm(new_rows):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text'].lower()

        # Plain substring search: a term also matches when it occurs inside a longer word.
        found_terms = {term for term in meddra_terms if term in text}

        # Sorted so the stored string does not depend on set iteration order.
        exact_out = ', '.join(sorted(found_terms))

        results.append([name, section, exact_out])

    # Same column layout as the GPT output so the evaluation code can be reused unchanged.
    exact_output = pd.DataFrame(
        results,
        columns=['drug_name', 'section_name', 'gpt_output']
    )
    outputs['exact'] = exact_output

## Evaluation

In [25]:
for method, output in outputs.items():
    results_granular = evaluation_granular(manual_ades, output)
    by_type = results_granular.groupby('ade_type')

    # Columns are selected with a list: tuple-style selection on a groupby
    # (['tp', 'fp', 'fn'] without the inner brackets) is deprecated and raises
    # in pandas >= 2.0.
    overall_results = by_type[['tp', 'fp', 'fn']].sum()

    # Micro-averaged metrics: computed from the counts pooled over all drugs.
    # NOTE(review): for the subtype rows every per-drug fp is NaN, which sums
    # to 0, so micro_precision comes out as 1.0 and micro_f1 is derived from
    # it. Only micro_recall is meaningful for those rows.
    overall_results['micro_precision'] = overall_results['tp']/(overall_results['tp']+overall_results['fp'])
    overall_results['micro_recall'] = overall_results['tp']/(overall_results['tp']+overall_results['fn'])
    overall_results['micro_f1'] = (2 * overall_results['micro_precision'] * overall_results['micro_recall'])/(overall_results['micro_precision'] + overall_results['micro_recall']) # 2*tp_total/(2*tp_total+fp_total+fn_total)

    # Macro-averaged metrics: mean of the per-drug values, NaNs skipped.
    macro_results = by_type[['precision', 'recall', 'f1']].mean()
    overall_results['macro_precision'] = macro_results['precision']
    overall_results['macro_recall'] = macro_results['recall']
    overall_results['macro_f1'] = macro_results['f1']

    save_filename = 'results/{}_{}_granular.csv'.format(output_file_basename, method)
    overall_results.to_csv(save_filename)
    print(method)
    print(overall_results)

100%|██████████| 101/101 [00:00<00:00, 154.67it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt0
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3864.0  529.0  1473.0         0.879581      0.724002  0.794245   
discontinuous    49.0    0.0   339.0         1.000000      0.126289  0.224256   
exact-meddra   3423.0    0.0   740.0         1.000000      0.822244  0.902452   
negated          14.0    0.0    18.0         1.000000      0.437500  0.608696   
non-meddra      441.0    0.0   734.0         1.000000      0.375319  0.545792   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.856748      0.756106  0.802578  
discontinuous              NaN      0.169050       NaN  
exact-meddra               NaN      0.842792       NaN  
negated                    NaN      0.408730       NaN  
non-meddra                 NaN      0.479649       NaN  


100%|██████████| 101/101 [00:00<00:00, 150.80it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt1
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3816.0  489.0  1521.0         0.886411      0.715008  0.791537   
discontinuous    46.0    0.0   342.0         1.000000      0.118557  0.211982   
exact-meddra   3395.0    0.0   768.0         1.000000      0.815518  0.898386   
negated          15.0    0.0    17.0         1.000000      0.468750  0.638298   
non-meddra      421.0    0.0   754.0         1.000000      0.358298  0.527569   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.863214      0.749849  0.809641  
discontinuous              NaN      0.146801       NaN  
exact-meddra               NaN      0.842385       NaN  
negated                    NaN      0.464286       NaN  
non-meddra                 NaN      0.478864       NaN  


100%|██████████| 101/101 [00:00<00:00, 156.08it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt2
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3846.0  564.0  1491.0         0.872109      0.720630  0.789166   
discontinuous    42.0    0.0   346.0         1.000000      0.108247  0.195349   
exact-meddra   3440.0    0.0   723.0         1.000000      0.826327  0.904906   
negated          15.0    0.0    17.0         1.000000      0.468750  0.638298   
non-meddra      406.0    0.0   769.0         1.000000      0.345532  0.513599   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.851455      0.747144  0.803974  
discontinuous              NaN      0.107512       NaN  
exact-meddra               NaN      0.841754       NaN  
negated                    NaN      0.428571       NaN  
non-meddra                 NaN      0.461309       NaN  


100%|██████████| 101/101 [00:00<00:00, 152.64it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt3
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3900.0  612.0  1437.0         0.864362      0.730748  0.791959   
discontinuous    51.0    0.0   337.0         1.000000      0.131443  0.232346   
exact-meddra   3473.0    0.0   690.0         1.000000      0.834254  0.909639   
negated          14.0    0.0    18.0         1.000000      0.437500  0.608696   
non-meddra      427.0    0.0   748.0         1.000000      0.363404  0.533084   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.858624      0.757191  0.806293  
discontinuous              NaN      0.172924       NaN  
exact-meddra               NaN      0.847599       NaN  
negated                    NaN      0.448413       NaN  
non-meddra                 NaN      0.472399       NaN  


100%|██████████| 101/101 [00:00<00:00, 149.60it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt4
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3902.0  537.0  1435.0         0.879027      0.731122  0.798282   
discontinuous    49.0    0.0   339.0         1.000000      0.126289  0.224256   
exact-meddra   3494.0    0.0   669.0         1.000000      0.839299  0.912629   
negated          16.0    0.0    16.0         1.000000      0.500000  0.666667   
non-meddra      408.0    0.0   767.0         1.000000      0.347234  0.515477   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.857887      0.754961  0.811239  
discontinuous              NaN      0.126719       NaN  
exact-meddra               NaN      0.854049       NaN  
negated                    NaN      0.472222       NaN  
non-meddra                 NaN      0.455963       NaN  


100%|██████████| 101/101 [00:00<00:00, 149.08it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt5
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3886.0  607.0  1451.0         0.864901      0.728124  0.790641   
discontinuous    46.0    0.0   342.0         1.000000      0.118557  0.211982   
exact-meddra   3472.0    0.0   691.0         1.000000      0.834014  0.909496   
negated          16.0    0.0    16.0         1.000000      0.500000  0.666667   
non-meddra      414.0    0.0   761.0         1.000000      0.352340  0.521082   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.849404      0.753791  0.800686  
discontinuous              NaN      0.125418       NaN  
exact-meddra               NaN      0.851448       NaN  
negated                    NaN      0.440476       NaN  
non-meddra                 NaN      0.447550       NaN  


100%|██████████| 101/101 [00:00<00:00, 148.13it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt6
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3905.0  521.0  1432.0         0.882286      0.731684  0.799959   
discontinuous    58.0    0.0   330.0         1.000000      0.149485  0.260090   
exact-meddra   3477.0    0.0   686.0         1.000000      0.835215  0.910209   
negated          13.0    0.0    19.0         1.000000      0.406250  0.577778   
non-meddra      428.0    0.0   747.0         1.000000      0.364255  0.533999   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.864534      0.762666  0.802307  
discontinuous              NaN      0.157515       NaN  
exact-meddra               NaN      0.856517       NaN  
negated                    NaN      0.392857       NaN  
non-meddra                 NaN      0.447476       NaN  


100%|██████████| 101/101 [00:00<00:00, 153.62it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt7
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3949.0  521.0  1388.0         0.883445      0.739929  0.805343   
discontinuous    54.0    0.0   334.0         1.000000      0.139175  0.244344   
exact-meddra   3507.0    0.0   656.0         1.000000      0.842421  0.914472   
negated          13.0    0.0    19.0         1.000000      0.406250  0.577778   
non-meddra      442.0    0.0   733.0         1.000000      0.376170  0.546691   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                    0.87552      0.776765  0.824629  
discontinuous              NaN      0.185262       NaN  
exact-meddra               NaN      0.869156       NaN  
negated                    NaN      0.392857       NaN  
non-meddra                 NaN      0.491277       NaN  


100%|██████████| 101/101 [00:00<00:00, 153.10it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt8
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3828.0  547.0  1509.0         0.874971      0.717257  0.788303   
discontinuous    43.0    0.0   345.0         1.000000      0.110825  0.199536   
exact-meddra   3421.0    0.0   742.0         1.000000      0.821763  0.902162   
negated          14.0    0.0    18.0         1.000000      0.437500  0.608696   
non-meddra      407.0    0.0   768.0         1.000000      0.346383  0.514539   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.870708      0.762647   0.80554  
discontinuous              NaN      0.130122       NaN  
exact-meddra               NaN      0.857405       NaN  
negated                    NaN      0.412698       NaN  
non-meddra                 NaN      0.445008       NaN  


100%|██████████| 101/101 [00:00<00:00, 156.37it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt9
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3876.0  579.0  1461.0         0.870034      0.726251  0.791667   
discontinuous    44.0    0.0   344.0         1.000000      0.113402  0.203704   
exact-meddra   3453.0    0.0   710.0         1.000000      0.829450  0.906775   
negated          15.0    0.0    17.0         1.000000      0.468750  0.638298   
non-meddra      423.0    0.0   752.0         1.000000      0.360000  0.529412   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.862227      0.766594  0.804296  
discontinuous              NaN      0.127990       NaN  
exact-meddra               NaN      0.859293       NaN  
negated                    NaN      0.424603       NaN  
non-meddra                 NaN      0.468520       NaN  


100%|██████████| 101/101 [00:00<00:00, 154.10it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt10
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3876.0  563.0  1461.0          0.87317      0.726251  0.792962   
discontinuous    57.0    0.0   331.0          1.00000      0.146907  0.256180   
exact-meddra   3448.0    0.0   715.0          1.00000      0.828249  0.906057   
negated          14.0    0.0    18.0          1.00000      0.437500  0.608696   
non-meddra      428.0    0.0   747.0          1.00000      0.364255  0.533999   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.850053      0.751685  0.806194  
discontinuous              NaN      0.170760       NaN  
exact-meddra               NaN      0.841465       NaN  
negated                    NaN      0.416667       NaN  
non-meddra                 NaN      0.464813       NaN  


100%|██████████| 101/101 [00:00<00:00, 153.89it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt11
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3772.0  521.0  1565.0          0.87864      0.706764  0.783385   
discontinuous    47.0    0.0   341.0          1.00000      0.121134  0.216092   
exact-meddra   3357.0    0.0   806.0          1.00000      0.806390  0.892819   
negated          17.0    0.0    15.0          1.00000      0.531250  0.693878   
non-meddra      415.0    0.0   760.0          1.00000      0.353191  0.522013   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.859031      0.749093  0.798729  
discontinuous              NaN      0.151156       NaN  
exact-meddra               NaN      0.840660       NaN  
negated                    NaN      0.523810       NaN  
non-meddra                 NaN      0.461671       NaN  


100%|██████████| 101/101 [00:00<00:00, 157.30it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt12
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3908.0  671.0  1429.0         0.853461      0.732247  0.788221   
discontinuous    49.0    0.0   339.0         1.000000      0.126289  0.224256   
exact-meddra   3486.0    0.0   677.0         1.000000      0.837377  0.911492   
negated          16.0    0.0    16.0         1.000000      0.500000  0.666667   
non-meddra      422.0    0.0   753.0         1.000000      0.359149  0.528491   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.850202      0.767362    0.8082  
discontinuous              NaN      0.147133       NaN  
exact-meddra               NaN      0.862607       NaN  
negated                    NaN      0.440476       NaN  
non-meddra                 NaN      0.467240       NaN  


100%|██████████| 101/101 [00:00<00:00, 156.42it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt13
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3863.0  588.0  1474.0         0.867895      0.723815  0.789334   
discontinuous    52.0    0.0   336.0         1.000000      0.134021  0.236364   
exact-meddra   3424.0    0.0   739.0         1.000000      0.822484  0.902597   
negated          14.0    0.0    18.0         1.000000      0.437500  0.608696   
non-meddra      439.0    0.0   736.0         1.000000      0.373617  0.543990   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.844568      0.750524  0.795905  
discontinuous              NaN      0.138168       NaN  
exact-meddra               NaN      0.842457       NaN  
negated                    NaN      0.376984       NaN  
non-meddra                 NaN      0.474090       NaN  


100%|██████████| 101/101 [00:00<00:00, 156.66it/s]

gpt14
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3963.0  594.0  1374.0         0.869651      0.742552  0.801092   
discontinuous    59.0    0.0   329.0         1.000000      0.152062  0.263982   
exact-meddra   3524.0    0.0   639.0         1.000000      0.846505  0.916873   
negated          15.0    0.0    17.0         1.000000      0.468750  0.638298   
non-meddra      439.0    0.0   736.0         1.000000      0.373617  0.543990   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.853476      0.762947  0.815717  
discontinuous              NaN      0.157197       NaN  
exact-meddra               NaN      0.856362       NaN  
negated                    NaN      0.464286       NaN  
non-meddra                 NaN      0.466423       NaN  



  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
