In [1]:
from tqdm import tqdm
import pandas as pd
from openai import OpenAI
import openai
import constants
import csv
import numpy as np

## Set Up

### Functions

In [2]:
# function for running GPT
def extract_ade_terms(gpt_model, prompt, text, openai_api):
  client = OpenAI(api_key=openai_api,)
  chat_completion = client.chat.completions.create(
      messages=[
          {"role": "system", "content": "You are an expert in pharmacology."},
          {
              "role": "user",
              "content": prompt.format(text)
          }
      ],
      model=gpt_model,
  )
  term = chat_completion.choices[0].message.content
  return term

In [3]:
def evaluation(manual_ades, gpt_output, limit = 1000):
    drugs = gpt_output['drug_name'].unique()
    drugs_set = set()
    results = []
    for drug in tqdm(drugs):
        drugs_set.add(drug)
        if len(drugs_set) > limit:
            break
        
        drug_df = manual_ades.query("(drug_name == '{}') & (section_name == 'adverse reactions')".format(drug))
        manual = set(drug_df['reaction_string'].to_list())
        gpt_drug = (gpt_output[
            (gpt_output['drug_name'] == drug)
            &
            (gpt_output['section_name'] == "adverse reactions")
            ]["gpt_output"].astype(str)
            .str.lower()
            .str.replace('\n-', ', ')
            .str.split(",").tolist())
    
        try:
            gpt_drug = [x.strip() for x in gpt_drug[0]]
            gpt_drug = set(gpt_drug)
        except:
            results.append([drug, len(manual), len(gpt_drug), np.nan, np.nan,
                             np.nan, np.nan, np.nan, np.nan])
            continue

        TP = len(manual.intersection(gpt_drug))
        FP = len(gpt_drug.difference(manual))
        FN = len(manual.difference(gpt_drug))
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        if precision != 0 and recall != 0:
            f1 = (2 * precision * recall)/(precision + recall)# 2*TP/(2*TP+FP+FN)
        else:
            f1 = np.NAN

        results.append([drug, len(manual), len(gpt_drug), TP, FP, FN, precision, recall, f1])
    results = pd.DataFrame(results, columns=['drug_name', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1'])
    return results

In [4]:
openai.organization = ""
openai.api_key = constants.AZURE_OPENAI_KEY

### Variables

In [37]:
drug_file = 'test_drug_label_text.csv'
manual_file = 'test_drug_label_text_manual_ades.csv'
my_max = 10000
gpt_model = 'gpt-4-1106-preview'

In [38]:
prompt_name = 'baseline-ade'
prompt = """ Extract all adverse reactions as they appear, including all synonyms,
mentioned in this text provide them as a comma-separated list: 
'{}' """

## Run GPT

In [39]:
drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)
set_type = drug_file.split('_')[0] # assuming file follows format "train_..." or "test...."

In [None]:
# run GPT    
gpt_output = []
unique_drugs = set()
for i, row in tqdm(drugs.iterrows()):
    unique_drugs.add(row['drug_name'])
    if len(unique_drugs) > my_max: 
        break
    if row['section_name'] != 'adverse reactions':
        continue
    name, section = row['drug_name'], row['section_name']
    text = row['section_text']
    try:
        gpt_out = extract_ade_terms(gpt_model, prompt, text, openai.api_key)
    except:
        gpt_out = None
        continue
    gpt_output.append([name, section, gpt_out])
gpt_output = pd.DataFrame(gpt_output, columns=['drug_name', 'section_name', 'gpt_output'])
gpt_output.to_csv('{}_{}_{}.csv'.format(gpt_model, prompt_name, set_type))

104it [14:16,  7.54s/it]

In [32]:
results = evaluation(manual_ades, gpt_output)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 189.13it/s]


In [32]:
results_test = evaluation(manual_ades, gpt_output)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 177.01it/s]


In [33]:
results_test

Unnamed: 0,drug_name,n_manual,n_gpt,tp,fp,fn,precision,recall,f1
0,KYPROLIS,128,93,88,5,40,0.946237,0.687500,0.796380
1,MULTAQ,39,32,25,7,14,0.781250,0.641026,0.704225
2,JUBLIA,5,4,4,0,1,1.000000,0.800000,0.888889
3,TEFLARO,30,38,29,9,1,0.763158,0.966667,0.852941
4,DATSCAN,11,10,10,0,1,1.000000,0.909091,0.952381
...,...,...,...,...,...,...,...,...,...
96,BENLYSTA,26,20,18,2,8,0.900000,0.692308,0.782609
97,TOVIAZ,37,35,32,3,5,0.914286,0.864865,0.888889
98,ENTEREG,1,1,1,0,0,1.000000,1.000000,1.000000
99,VIZAMYL,8,8,7,1,1,0.875000,0.875000,0.875000


In [34]:
[tp_total, fp_total, fn_total] =  results_test[['tp', 'fp', 'fn']].sum()
precision = tp_total/(tp_total+fp_total)
recall =  tp_total/(tp_total+fn_total)
f1 = (2 * precision * recall)/(precision + recall) # 2*tp_total/(2*tp_total+fp_total+fn_total) 
print("prompt: {}".format(prompt_name))
print("precision: {}\nrecall: {}\nf1: {}".format(precision, recall, f1)) 

prompt: baseline-ade
precision: 0.8817081525462435
recall: 0.7234401349072512
f1: 0.7947715109098394


In [19]:
results_test.precision.mean()
results_test.recall.mean()

0.7647715316537429

In [36]:
# model, data, prompt, precision, recall, f1
with open('gpt_model_results.csv', 'a') as file:
    file.write('{}, {}, {}, {}, {}, {}\n'.format(gpt_model, set_type, prompt_name, precision, recall, f1))