In [1]:
import os
from tqdm import tqdm
import pandas as pd
from openai import OpenAI
import openai
import constants
import csv
import numpy as np
import concurrent
import time
import json

## Set Up

### Functions

In [2]:
# function for running GPT
def extract_ade_terms(gpt_model, system_content, prompt, text, openai_api):
  client = OpenAI(api_key=openai_api,)
  chat_completion = client.chat.completions.create(
      messages=[
          {"role": "system", "content": system_content},
          {
              "role": "user",
              "content": prompt.format(text)
          }
      ],
      model=gpt_model,
  )
  term = chat_completion.choices[0].message.content
  return term

In [3]:
def evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'all'):
    '''
    For a given drug, evaluate the performance of GPT on a given subtype of ADEs. 
    '''
    
    drug_df = manual_ades.query("(drug_name == '{}') & (section_name == 'adverse reactions')".format(drug))
    if subtype == 'exact-meddra': drug_df = drug_df[drug_df.meddra_exact_term == 1]
    if subtype == 'non-meddra': drug_df = drug_df[drug_df.meddra_exact_term == 0]
    if subtype == 'negated': drug_df = drug_df[drug_df.negated_term == 1]
    if subtype == 'discontinuous': drug_df = drug_df[drug_df.discontinuous_term == 1]

    manual = set(drug_df['reaction_string'].to_list())
    gpt_drug = (gpt_output[
        (gpt_output['drug_name'] == drug)
        &
        (gpt_output['section_name'] == "adverse reactions")
        ]["gpt_output"].astype(str)
        .str.lower()
        .str.replace('\n-', ', ')
        .str.split(",").tolist())

    try:
        gpt_drug = [x.strip() for x in gpt_drug[0]]
        gpt_drug = set(gpt_drug)
    except:
        return [drug, subtype, len(manual), len(gpt_drug), np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
    
    #overall
    TP = len(manual.intersection(gpt_drug))
    FP = len(gpt_drug.difference(manual))
    FN = len(manual.difference(gpt_drug))
    if TP == 0 and FP == 0:
        precision = np.NAN
    else:
        precision = TP/(TP+FP)
    if TP == 0 and FN == 0:
        recall = np.NAN
    else:
        recall = TP/(TP+FN)
    if precision != 0 and recall != 0:
        f1 = (2 * precision * recall)/(precision + recall)# 2*TP/(2*TP+FP+FN)
    else:
        f1 = np.NAN
    
    if subtype != 'all':
        # these can't be computed for the subtypes
        precision = np.nan
        f1 = np.nan
        FP = np.nan
    
    return [drug, subtype, len(manual), len(gpt_drug), TP, FP, FN, precision, recall, f1]

In [4]:
def evaluation(manual_ades, gpt_output, limit = 1000):
    drugs = gpt_output['drug_name'].unique()
    drugs_set = set()
    results = []
    for drug in tqdm(drugs):

        results.append(evaluation_subtype(manual_ades, gpt_output, drug))
        
    results = pd.DataFrame(results, columns=['drug_name', 'exclude', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1'])
    return results

In [5]:
def evaluation_granular(manual_ades, gpt_output, limit = 1000):
    drugs = gpt_output['drug_name'].unique()
    drugs_set = set()
    results = []
    for drug in tqdm(drugs):
        drugs_set.add(drug)
        if len(drugs_set) > limit:
            break
        
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'all'))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'exact-meddra'))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'non-meddra'))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'negated'))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'discontinuous'))

    results = pd.DataFrame(results, columns=['drug_name', 'ade_type', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1'])
    return results

In [6]:
config = json.load(open('./config.json'))

openai.organization = ""
openai.api_key = config['openai_api_key'] #constants.AZURE_OPENAI_KEY

### Variables

In [7]:
drug_file = 'data/train_drug_label_text.csv'
manual_file = 'data/train_drug_label_text_manual_ades.csv'
my_max = 10000

In [8]:
drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)
set_type = drug_file.split('/')[1].split('_')[0] # assuming file follows format "train_..." or "test...."

In [9]:
outputs = {}

## Run GPT

In [10]:
# gpt_model = 'gpt-4-1106-preview'
gpt_model = 'gpt-3.5-turbo-0125'
nruns = 15

system_options = {
    "pharmexpert-v0": "You are an expert in pharmacology.",
    "pharmexpert-v1": "You are an expert in adverse drug reactions, pharmacology, and clinical trials."
}

prompt_options = {
    "fatal-prompt-v2": """
Extract all adverse reactions as they appear, including all synonyms.
mentioned in the text and provide them as a comma-separated list.
If a fatal event is listed add 'death' to the list.
The text is :'{}' 
"""
}

system_name = "pharmexpert-v0"
system_content = system_options[system_name]

prompt_name = "fatal-prompt-v2"
prompt = prompt_options[prompt_name]

output_file_basename = '{}_{}_{}_{}'.format(gpt_model, prompt_name, system_name, set_type)
output_file_basename

'gpt-3.5-turbo-0125_fatal-prompt-v2_pharmexpert-v0_train'

In [11]:
# if there is a max
new_rows = list()
unique_drugs = set()
for i, row in drugs.iterrows():
    unique_drugs.add(row["drug_name"])
    if len(unique_drugs) > my_max: 
        break
    if row['section_name'] != 'adverse reactions':
        continue

    new_rows.append(row)

In [12]:
# run GPT
for i in range(nruns):
    if f"gpt{i}" in outputs:
        print(f"Run {i} already completed and stored. Skipping.")
        continue
    
    if os.path.exists('results/{}_run{}.csv'.format(output_file_basename, i)):
        gpt_output = pd.read_csv('results/{}_run{}.csv'.format(output_file_basename, i))
        outputs[f"gpt{i}"] = gpt_output
        print(f"Run {i} already completed and loaded from disk.")
        continue
    
    start = time.time()
    def run_iteration(row):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text']
        try:
            gpt_out = extract_ade_terms(gpt_model, system_content, prompt, text, openai.api_key)
            return [name, section, gpt_out]
        except Exception as err:
            print(err)
            return None
            
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as exec:
        results = list(tqdm(
            exec.map(run_iteration, new_rows), 
            total=len(new_rows)
        ))

    gpt_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'section_name', 'gpt_output']
    )
    gpt_output.to_csv('results/{}_run{}.csv'.format(output_file_basename, i))
    end = time.time()

    outputs[f"gpt{i}"] = gpt_output
    
    print(f"Run: {i}, time elapsed: {end-start}s.")

  0%|          | 0/101 [00:00<?, ?it/s]

  1%|          | 1/101 [00:10<17:56, 10.77s/it]

## Exact Match Algorithm

In [179]:
exact_match_run = True

In [180]:
if not exact_match_run:
    # load the meddra terms
    fh = open('meddra_llt_pt_map.txt')
    reader = csv.reader(fh, delimiter='|')
    header = next(reader)

    meddra_llt_terms = set()
    meddra_pt_terms = set()

    for row in reader:
        meddra_llt_terms.add(row[1].lower())
        meddra_pt_terms.add(row[4].lower())

    fh.close()

    meddra_terms = meddra_llt_terms | meddra_pt_terms
    len(meddra_llt_terms), len(meddra_pt_terms), len(meddra_terms)

    results = list()
    for row in tqdm(new_rows):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text'].lower()
        
        found_terms = set()
        for term in meddra_terms:
            if text.find(term) != -1:
                found_terms.add(term)
        
        exact_out = ', '.join(list(found_terms))
        
        results.append([name, section, exact_out])

    exact_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'section_name', 'gpt_output']
    )
    outputs['exact'] = exact_output

## Evaluation

In [181]:
for method, output in outputs.items():
    results_granular = evaluation_granular(manual_ades, output)
    overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
    overall_results['micro_precision'] = overall_results['tp']/(overall_results['tp']+overall_results['fp'])
    overall_results['micro_recall'] = overall_results['tp']/(overall_results['tp']+overall_results['fn'])
    overall_results['micro_f1'] = (2 * overall_results['micro_precision'] * overall_results['micro_recall'])/(overall_results['micro_precision'] + overall_results['micro_recall']) # 2*tp_total/(2*tp_total+fp_total+fn_total)
    macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
    overall_results['macro_precision'] = macro_results['precision']
    overall_results['macro_recall'] = macro_results['recall']
    overall_results['macro_f1'] = macro_results['f1']
    save_filename = 'results/{}_{}_granular.csv'.format(output_file_basename, method)
    overall_results.to_csv(save_filename)
    print(method)
    print(overall_results)

100%|██████████| 101/101 [00:00<00:00, 149.61it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt0
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3849.0  473.0  1488.0          0.89056      0.721192  0.796977   
discontinuous    37.0    0.0   351.0          1.00000      0.095361  0.174118   
exact-meddra   3431.0    0.0   732.0          1.00000      0.824165  0.903608   
negated          16.0    0.0    16.0          1.00000      0.500000  0.666667   
non-meddra      418.0    0.0   757.0          1.00000      0.355745  0.524796   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                    0.87551      0.756264  0.812291  
discontinuous              NaN      0.097744       NaN  
exact-meddra               NaN      0.850229       NaN  
negated                    NaN      0.472222       NaN  
non-meddra                 NaN      0.445092       NaN  


100%|██████████| 101/101 [00:00<00:00, 147.39it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt1
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3904.0  510.0  1433.0         0.884459      0.731497  0.800738   
discontinuous    52.0    0.0   336.0         1.000000      0.134021  0.236364   
exact-meddra   3465.0    0.0   698.0         1.000000      0.832332  0.908495   
negated          17.0    0.0    15.0         1.000000      0.531250  0.693878   
non-meddra      439.0    0.0   736.0         1.000000      0.373617  0.543990   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.872065      0.766317  0.816463  
discontinuous              NaN      0.160105       NaN  
exact-meddra               NaN      0.858389       NaN  
negated                    NaN      0.523810       NaN  
non-meddra                 NaN      0.484580       NaN  


100%|██████████| 101/101 [00:00<00:00, 145.25it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt2
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3920.0  472.0  1417.0         0.892532      0.734495  0.805838   
discontinuous    42.0    0.0   346.0         1.000000      0.108247  0.195349   
exact-meddra   3490.0    0.0   673.0         1.000000      0.838338  0.912061   
negated          17.0    0.0    15.0         1.000000      0.531250  0.693878   
non-meddra      430.0    0.0   745.0         1.000000      0.365957  0.535826   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                    0.86646      0.756071  0.815931  
discontinuous              NaN      0.109462       NaN  
exact-meddra               NaN      0.846832       NaN  
negated                    NaN      0.519841       NaN  
non-meddra                 NaN      0.471401       NaN  


100%|██████████| 101/101 [00:00<00:00, 150.77it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt3
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3898.0  451.0  1439.0         0.896298      0.730373  0.804873   
discontinuous    33.0    0.0   355.0         1.000000      0.085052  0.156770   
exact-meddra   3472.0    0.0   691.0         1.000000      0.834014  0.909496   
negated          17.0    0.0    15.0         1.000000      0.531250  0.693878   
non-meddra      426.0    0.0   749.0         1.000000      0.362553  0.532167   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                    0.87195      0.755393  0.818401  
discontinuous              NaN      0.091599       NaN  
exact-meddra               NaN      0.850562       NaN  
negated                    NaN      0.452381       NaN  
non-meddra                 NaN      0.462978       NaN  


100%|██████████| 101/101 [00:00<00:00, 155.31it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt4
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3918.0  440.0  1419.0         0.899036      0.734120  0.808252   
discontinuous    57.0    0.0   331.0         1.000000      0.146907  0.256180   
exact-meddra   3469.0    0.0   694.0         1.000000      0.833293  0.909067   
negated          17.0    0.0    15.0         1.000000      0.531250  0.693878   
non-meddra      449.0    0.0   726.0         1.000000      0.382128  0.552956   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.873001      0.756857  0.820757  
discontinuous              NaN      0.125330       NaN  
exact-meddra               NaN      0.849240       NaN  
negated                    NaN      0.523810       NaN  
non-meddra                 NaN      0.487961       NaN  


100%|██████████| 101/101 [00:00<00:00, 155.55it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt5
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3880.0  431.0  1457.0         0.900023      0.727000  0.804312   
discontinuous    53.0    0.0   335.0         1.000000      0.136598  0.240363   
exact-meddra   3455.0    0.0   708.0         1.000000      0.829930  0.907062   
negated          13.0    0.0    19.0         1.000000      0.406250  0.577778   
non-meddra      425.0    0.0   750.0         1.000000      0.361702  0.531250   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.873747      0.757469  0.820567  
discontinuous              NaN      0.155238       NaN  
exact-meddra               NaN      0.850485       NaN  
negated                    NaN      0.436508       NaN  
non-meddra                 NaN      0.466687       NaN  


100%|██████████| 101/101 [00:00<00:00, 150.11it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt6
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3967.0  468.0  1370.0         0.894476      0.743301  0.811912   
discontinuous    54.0    0.0   334.0         1.000000      0.139175  0.244344   
exact-meddra   3528.0    0.0   635.0         1.000000      0.847466  0.917436   
negated          19.0    0.0    13.0         1.000000      0.593750  0.745098   
non-meddra      439.0    0.0   736.0         1.000000      0.373617  0.543990   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.879163      0.774956  0.815748  
discontinuous              NaN      0.126414       NaN  
exact-meddra               NaN      0.871072       NaN  
negated                    NaN      0.547619       NaN  
non-meddra                 NaN      0.471347       NaN  


100%|██████████| 101/101 [00:00<00:00, 154.13it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt7
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3925.0  484.0  1412.0         0.890225      0.735432  0.805459   
discontinuous    52.0    0.0   336.0         1.000000      0.134021  0.236364   
exact-meddra   3494.0    0.0   669.0         1.000000      0.839299  0.912629   
negated          17.0    0.0    15.0         1.000000      0.531250  0.693878   
non-meddra      431.0    0.0   744.0         1.000000      0.366809  0.536737   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.870562      0.761709  0.821836  
discontinuous              NaN      0.145542       NaN  
exact-meddra               NaN      0.857307       NaN  
negated                    NaN      0.519841       NaN  
non-meddra                 NaN      0.461440       NaN  


100%|██████████| 101/101 [00:00<00:00, 152.65it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt8
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3934.0  507.0  1403.0         0.885837      0.737118  0.804664   
discontinuous    38.0    0.0   350.0         1.000000      0.097938  0.178404   
exact-meddra   3508.0    0.0   655.0         1.000000      0.842662  0.914613   
negated          16.0    0.0    16.0         1.000000      0.500000  0.666667   
non-meddra      426.0    0.0   749.0         1.000000      0.362553  0.532167   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.862602      0.757628  0.815643  
discontinuous              NaN      0.126364       NaN  
exact-meddra               NaN      0.851567       NaN  
negated                    NaN      0.507937       NaN  
non-meddra                 NaN      0.456810       NaN  


100%|██████████| 101/101 [00:00<00:00, 153.24it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt9
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3927.0  485.0  1410.0         0.890073      0.735807  0.805621   
discontinuous    54.0    0.0   334.0         1.000000      0.139175  0.244344   
exact-meddra   3487.0    0.0   676.0         1.000000      0.837617  0.911634   
negated          15.0    0.0    17.0         1.000000      0.468750  0.638298   
non-meddra      440.0    0.0   735.0         1.000000      0.374468  0.544892   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.866206      0.755328  0.815375  
discontinuous              NaN      0.151517       NaN  
exact-meddra               NaN      0.843772       NaN  
negated                    NaN      0.424603       NaN  
non-meddra                 NaN      0.467362       NaN  


100%|██████████| 101/101 [00:00<00:00, 146.28it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt10
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3853.0  526.0  1484.0         0.879881      0.721941  0.793125   
discontinuous    45.0    0.0   343.0         1.000000      0.115979  0.207852   
exact-meddra   3434.0    0.0   729.0         1.000000      0.824886  0.904041   
negated          17.0    0.0    15.0         1.000000      0.531250  0.693878   
non-meddra      419.0    0.0   756.0         1.000000      0.356596  0.525721   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.860292      0.751828  0.809991  
discontinuous              NaN      0.110966       NaN  
exact-meddra               NaN      0.844679       NaN  
negated                    NaN      0.555556       NaN  
non-meddra                 NaN      0.474358       NaN  


100%|██████████| 101/101 [00:00<00:00, 145.21it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt11
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3858.0  497.0  1479.0         0.885878      0.722878  0.796121   
discontinuous    43.0    0.0   345.0         1.000000      0.110825  0.199536   
exact-meddra   3448.0    0.0   715.0         1.000000      0.828249  0.906057   
negated          13.0    0.0    19.0         1.000000      0.406250  0.577778   
non-meddra      410.0    0.0   765.0         1.000000      0.348936  0.517350   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.865456      0.748061   0.81144  
discontinuous              NaN      0.103138       NaN  
exact-meddra               NaN      0.843244       NaN  
negated                    NaN      0.396825       NaN  
non-meddra                 NaN      0.452177       NaN  


100%|██████████| 101/101 [00:00<00:00, 149.25it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt12
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3885.0  489.0  1452.0         0.888203      0.727937  0.800124   
discontinuous    43.0    0.0   345.0         1.000000      0.110825  0.199536   
exact-meddra   3453.0    0.0   710.0         1.000000      0.829450  0.906775   
negated          18.0    0.0    14.0         1.000000      0.562500  0.720000   
non-meddra      432.0    0.0   743.0         1.000000      0.367660  0.537648   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.877343      0.767090  0.819531  
discontinuous              NaN      0.127689       NaN  
exact-meddra               NaN      0.858716       NaN  
negated                    NaN      0.547619       NaN  
non-meddra                 NaN      0.475297       NaN  


100%|██████████| 101/101 [00:00<00:00, 144.92it/s]
  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


gpt13
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3878.0  492.0  1459.0         0.887414      0.726625  0.799011   
discontinuous    51.0    0.0   337.0         1.000000      0.131443  0.232346   
exact-meddra   3451.0    0.0   712.0         1.000000      0.828969  0.906488   
negated          17.0    0.0    15.0         1.000000      0.531250  0.693878   
non-meddra      427.0    0.0   748.0         1.000000      0.363404  0.533084   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.869834      0.750553  0.806777  
discontinuous              NaN      0.128458       NaN  
exact-meddra               NaN      0.844940       NaN  
negated                    NaN      0.515873       NaN  
non-meddra                 NaN      0.467192       NaN  


100%|██████████| 101/101 [00:00<00:00, 152.37it/s]

gpt14
                   tp     fp      fn  micro_precision  micro_recall  micro_f1  \
ade_type                                                                        
all            3884.0  513.0  1453.0          0.88333      0.727750  0.798028   
discontinuous    37.0    0.0   351.0          1.00000      0.095361  0.174118   
exact-meddra   3449.0    0.0   714.0          1.00000      0.828489  0.906201   
negated          18.0    0.0    14.0          1.00000      0.562500  0.720000   
non-meddra      435.0    0.0   740.0          1.00000      0.370213  0.540373   

               macro_precision  macro_recall  macro_f1  
ade_type                                                
all                   0.871128      0.759044  0.813051  
discontinuous              NaN      0.107796       NaN  
exact-meddra               NaN      0.850862       NaN  
negated                    NaN      0.531746       NaN  
non-meddra                 NaN      0.468709       NaN  



  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)
  macro_results = results_granular.groupby('ade_type')['precision', 'recall', 'f1'].apply(np.mean)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
