In [1]:
import os
from tqdm import tqdm
import pandas as pd
from openai import OpenAI, AzureOpenAI
import csv
import numpy as np
import concurrent
import time
import json
from common_string import common_lenient_performance

## Set Up

### Functions

In [2]:
# function for running GPT
def extract_ade_terms(api_source, api_endpoint, gpt_model, system_content, prompt, text, openai_api):
  if api_source == 'OpenAI':
    client = OpenAI(api_key=openai_api)
  elif api_source == 'Azure':
    client = AzureOpenAI(api_key=openai_api, api_version="2023-12-01-preview", azure_endpoint=api_endpoint)
  else:
    raise Exception(f"Unexpected API source requested: {api_source}")
  
  chat_completion = client.chat.completions.create(
      messages=[
          {"role": "system", "content": system_content},
          {
              "role": "user",
              "content": prompt.format(text)
          }
      ],
      model=gpt_model
  )
  term = chat_completion.choices[0].message.content
  return term

In [38]:
def evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'all', lenient=False):
    '''
    For a given drug, evaluate the performance of GPT on a given subtype of ADEs. 
    '''
    
    drug_df = manual_ades.query("(drug_name == '{}') & (section_name == 'adverse reactions')".format(drug))
    if subtype == 'exact-meddra': drug_df = drug_df[drug_df.meddra_exact_term == 1]
    if subtype == 'non-meddra': drug_df = drug_df[drug_df.meddra_exact_term == 0]
    if subtype == 'negated': drug_df = drug_df[drug_df.negated_term == 1]
    if subtype == 'discontinuous': drug_df = drug_df[drug_df.discontinuous_term == 1]

    
    manual = set(drug_df['reaction_string'].to_list())
    gpt_drug = (gpt_output[
        (gpt_output['drug_name'] == drug)
        &
        (gpt_output['section_name'] == "adverse reactions")
        ]["gpt_output"].astype(str)
        .str.lower()
        .str.replace('\n-', ', ')
        .str.split(",").tolist())

    try:
        gpt_drug = [x.strip() for x in gpt_drug[0]]
        gpt_drug = set(gpt_drug)
    except:
        return [drug, subtype, len(manual), len(gpt_drug), np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
        
    if not lenient:    
        #overall
        TP = len(manual.intersection(gpt_drug))
        FP = len(gpt_drug.difference(manual))
        FN = len(manual.difference(gpt_drug))
        if TP == 0 and FP == 0:
            precision = np.NAN
        else:
            precision = TP/(TP+FP)
        if TP == 0 and FN == 0:
            recall = np.NAN
        else:
            recall = TP/(TP+FN)
        if precision != 0 and recall != 0:
            f1 = (2 * precision * recall)/(precision + recall)# 2*TP/(2*TP+FP+FN)
        else:
            f1 = np.NAN
    else:
        [TP, FP, FN, precision, recall, f1] = common_lenient_performance(gpt_drug, manual)
    
    if subtype != 'all':
            # these can't be computed for the subtypes
            precision = np.nan
            f1 = np.nan
            FP = np.nan
    
    return [drug, subtype, len(manual), len(gpt_drug), TP, FP, FN, precision, recall, f1]


In [21]:
def evaluation(manual_ades, gpt_output, lenient=False, limit = 1000):
    drugs = gpt_output['drug_name'].unique()
    drugs_set = set()
    results = []
    for drug in tqdm(drugs):
            results.append(evaluation_subtype(manual_ades, gpt_output, drug, lenient))        
    results = pd.DataFrame(results, columns=['drug_name', 'exclude', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1'])
    return results

In [33]:
def evaluation_granular(manual_ades, gpt_output, limit = 1000, lenient=False):
    drugs = gpt_output['drug_name'].unique()
    drugs_set = set()
    results = []
    for drug in tqdm(drugs):
        drugs_set.add(drug)
        if len(drugs_set) > limit:
            break
        
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'all', lenient=lenient))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'exact-meddra',lenient=lenient))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'non-meddra',lenient=lenient))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'negated',lenient=lenient))
        results.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'discontinuous',lenient=lenient))

    results = pd.DataFrame(results, columns=['drug_name', 'ade_type', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1'])
    return results

### Variables

In [23]:
drug_file = 'data/train_drug_label_text.csv'
manual_file = 'data/train_drug_label_text_manual_ades.csv'
my_max = 10000

In [24]:
drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)
set_type = drug_file.split('/')[1].split('_')[0] # assuming file follows format "train_..." or "test...."

## Run GPT

In [25]:
outputs = {}

In [26]:
config = json.load(open('./config.json'))

organization = ""

api_source = 'OpenAI'

api_key = config[api_source]['openai_api_key'] #constants.AZURE_OPENAI_KEY
api_endpoint = config[api_source]['openai_api_endpoint'] 

gpt_model = config[api_source]["gpt_model"]
gpt_model = "gpt-4-turbo-preview"


In [27]:
nruns = 1

system_options = {
    "pharmexpert-v0": "You are an expert in pharmacology.",
    "pharmexpert-v1": "You are an expert in medical natural language processing, adverse drug reactions, pharmacology, and clinical trials."
}

prompt_options = {
    "fatal-prompt-v2": """
Extract all adverse reactions as they appear, including all synonyms.
mentioned in the text and provide them as a comma-separated list.
If a fatal event is listed add 'death' to the list.
The text is :'{}' 
"""
}

system_name = "pharmexpert-v0"
system_content = system_options[system_name]

prompt_name = "fatal-prompt-v2"
prompt = prompt_options[prompt_name]

output_file_basename = '{}_{}_{}_{}_{}'.format(api_source, gpt_model, prompt_name, system_name, set_type)
output_file_basename

'OpenAI_gpt-4-turbo-preview_fatal-prompt-v2_pharmexpert-v0_train'

In [28]:
# if there is a max
new_rows = list()
unique_drugs = set()
for i, row in drugs.iterrows():
    unique_drugs.add(row["drug_name"])
    if len(unique_drugs) > my_max: 
        break
    if row['section_name'] != 'adverse reactions':
        continue

    new_rows.append(row)

In [29]:
# run GPT
for i in range(nruns):
    run_key = "{}_run{}".format(output_file_basename, i)

    if run_key in outputs:
        print(f"Run {run_key} already completed and stored. Skipping.")
        continue
    
    if os.path.exists('results/{}.csv'.format(run_key)):
        gpt_output = pd.read_csv('results/{}.csv'.format(run_key))
        outputs[run_key] = gpt_output
        print(f"Run {run_key} already completed and loaded from disk.")
        continue
    
    start = time.time()
    results = list()
    for row in tqdm(new_rows):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text']
        try:
            gpt_out = extract_ade_terms(api_source, api_endpoint, gpt_model, system_content, prompt, text, api_key)
            # time.sleep(5)
            results.append([name, section, gpt_out])
        except Exception as err:
            print(f"Encountered an exception for row: {name} {section}. Error message below:")
            print(err)
            continue
            
    gpt_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'section_name', 'gpt_output']
    )
    end = time.time()

    if gpt_output.shape[0] > 0:
        outputs[run_key] = gpt_output
        gpt_output.to_csv('results/{}.csv'.format(run_key))
    
    print(f"Run: {run_key}, time elapsed: {end-start}s.")

Run 0 already completed and loaded from disk.


## Exact Match Algorithm

TODO: Exact match really doesn't fit here any longer but we need it at least once to show how well (or poorly) it does. Refactor out into it's own notebook/script.

In [30]:
exact_match_run = True

In [31]:
run_key = 'exact_{}_{}_{}'.format(prompt_name, system_name, set_type)

if not os.path.exists('results/{}.csv'.format(run_key)):
    # load the meddra terms
    fh = open('data/meddra_llt_pt_map.txt')
    reader = csv.reader(fh, delimiter='|')
    header = next(reader)

    meddra_llt_terms = set()
    meddra_pt_terms = set()

    for row in reader:
        meddra_llt_terms.add(row[1].lower())
        meddra_pt_terms.add(row[4].lower())
    
    fh.close()

    meddra_terms = meddra_llt_terms | meddra_pt_terms
    len(meddra_llt_terms), len(meddra_pt_terms), len(meddra_terms)

    results = list()
    for row in tqdm(new_rows):
        name, section = row['drug_name'], row['section_name']
        text = row['section_text'].lower()
        
        found_terms = set()
        for term in meddra_terms:
            if text.find(term) != -1:
                found_terms.add(term)
        
        exact_out = ', '.join(list(found_terms))
        
        results.append([name, section, exact_out])

    exact_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'section_name', 'gpt_output']
    )
    exact_output.to_csv('results/{}.csv'.format(run_key))
    
    outputs[run_key] = exact_output

## Evaluation

In [36]:
for run_key, output in outputs.items():
    results_granular = evaluation_granular(manual_ades, output)
    overall_results = results_granular.groupby('ade_type')[['tp', 'fp', 'fn']].sum(min_count = 1).reset_index()
    overall_results['micro_precision'] = overall_results['tp']/(overall_results['tp']+overall_results['fp'])
    overall_results['micro_recall'] = overall_results['tp']/(overall_results['tp']+overall_results['fn'])
    overall_results['micro_f1'] = (2 * overall_results['micro_precision'] * overall_results['micro_recall'])/(overall_results['micro_precision'] + overall_results['micro_recall']) # 2*tp_total/(2*tp_total+fp_total+fn_total)
    macro_results = results_granular.groupby('ade_type')[['precision', 'recall', 'f1']].mean(numeric_only=True).reset_index()
    overall_results['macro_precision'] = macro_results['precision']
    overall_results['macro_recall'] = macro_results['recall']
    overall_results['macro_f1'] = macro_results['f1']
    save_filename = 'results/{}_granular.csv'.format(run_key)

    overall_results.to_csv(save_filename)
    
    print(run_key)
    print(overall_results)

100%|██████████| 100/100 [00:01<00:00, 60.63it/s]

gpt0
        ade_type    tp     fp    fn  micro_precision  micro_recall  micro_f1  \
0            all  3650  495.0  1466         0.880579      0.713448  0.788252   
1  discontinuous    22    NaN   348              NaN      0.059459       NaN   
2   exact-meddra  3253    NaN   736              NaN      0.815493       NaN   
3        negated    12    NaN    19              NaN      0.387097       NaN   
4     non-meddra   397    NaN   731              NaN      0.351950       NaN   

   macro_precision  macro_recall  macro_f1  
0         0.854821      0.736277  0.792026  
1              NaN      0.062585       NaN  
2              NaN      0.828049       NaN  
3              NaN      0.400000       NaN  
4              NaN      0.448830       NaN  





#### Lenient Matching using Longest Commmon Substring

In [39]:
# using lenient matching method 'longest common substring'
for method, output in outputs.items():
    results_granular = evaluation_granular(manual_ades, output, lenient=True)
    overall_results = results_granular.groupby('ade_type')[['tp', 'fp', 'fn']].sum(min_count = 1).reset_index()
    overall_results['micro_precision'] = overall_results['tp']/(overall_results['tp']+overall_results['fp'])
    overall_results['micro_recall'] = overall_results['tp']/(overall_results['tp']+overall_results['fn'])
    overall_results['micro_f1'] = (2 * overall_results['micro_precision'] * overall_results['micro_recall'])/(overall_results['micro_precision'] + overall_results['micro_recall']) # 2*tp_total/(2*tp_total+fp_total+fn_total)
    macro_results = results_granular.groupby('ade_type')[['precision', 'recall', 'f1']].mean(numeric_only=True).reset_index()
    overall_results['macro_precision'] = macro_results['precision']
    overall_results['macro_recall'] = macro_results['recall']
    overall_results['macro_f1'] = macro_results['f1']
    # if method == 'exact':
    #     save_filename = 'results/exact_{}_{}_{}.csv'.format(prompt_name, system_name, set_type)
    # else: 
    #     save_filename = 'results/{}_{}_granular.csv'.format(output_file_basename, method)

    # overall_results.to_csv(save_filename)
    print(method)
    print(overall_results)

100%|██████████| 100/100 [00:09<00:00, 10.67it/s]

gpt0
        ade_type    tp     fp   fn  micro_precision  micro_recall  micro_f1  \
0            all  4249  169.0  867         0.961747      0.830532  0.891336   
1  discontinuous   140    NaN  230              NaN      0.378378       NaN   
2   exact-meddra  3572    NaN  417              NaN      0.895463       NaN   
3        negated    23    NaN    8              NaN      0.741935       NaN   
4     non-meddra   677    NaN  451              NaN      0.600177       NaN   

   macro_precision  macro_recall  macro_f1  
0          0.95712      0.867860  0.904589  
1              NaN      0.343329       NaN  
2              NaN      0.922861       NaN  
3              NaN      0.787500       NaN  
4              NaN      0.698721       NaN  



