In [7]:
import concurrent
import concurrent.futures
import csv
import os
import time

import numpy as np
import openai
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

import constants

## Set Up

### Functions

In [8]:
# function for running GPT
def extract_ade_terms(gpt_model, prompt, text, openai_api):
    """Ask the chat-completions API to extract terms from one piece of text.

    Parameters
    ----------
    gpt_model
        Model name forwarded as ``model`` to the API call.
    prompt
        Template containing a ``{}`` placeholder; ``text`` is inserted with
        ``str.format``.
    text
        The text substituted into ``prompt``.
    openai_api
        API key used to construct the ``OpenAI`` client.

    Returns
    -------
    The ``content`` of the first choice's message in the API response.
    """
    # A new client is constructed on every call.
    api_client = OpenAI(api_key=openai_api)

    system_message = {"role": "system", "content": "You are an expert in pharmacology."}
    user_message = {"role": "user", "content": prompt.format(text)}

    response = api_client.chat.completions.create(
        messages=[system_message, user_message],
        model=gpt_model,
    )
    return response.choices[0].message.content

In [9]:
def evaluation(manual_ades, gpt_output, limit = 1000):
    """Score GPT-extracted terms against the manual terms, one row per drug.

    Only rows whose ``section_name`` is ``'adverse reactions'`` are compared.
    For each drug the GPT text is lower-cased, ``'\\n-'`` bullet separators are
    turned into commas, and the text is split on commas into a set of stripped
    terms. When a drug has several matching GPT rows, only the first is used.

    Parameters
    ----------
    manual_ades : pandas.DataFrame
        Needs the columns ``drug_name``, ``section_name`` and
        ``reaction_string``.
    gpt_output : pandas.DataFrame
        Needs the columns ``drug_name``, ``section_name`` and ``gpt_output``.
    limit : int
        Maximum number of distinct drugs (in order of first appearance in
        ``gpt_output``) to evaluate.

    Returns
    -------
    pandas.DataFrame
        Columns ``drug_name, n_manual, n_gpt, tp, fp, fn, precision, recall,
        f1``. Precision/recall are NaN when their denominator is zero, F1 is
        NaN when there are no true positives, and all metric columns are NaN
        when the drug has no GPT row for the section.
    """
    section = 'adverse reactions'
    columns = ['drug_name', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1']

    # unique() already de-duplicates, so slicing keeps the first `limit` drugs.
    drugs = gpt_output['drug_name'].unique()[:max(limit, 0)]

    results = []
    for drug in tqdm(drugs):
        # Boolean masks rather than a formatted DataFrame.query string: a drug
        # name containing a quote character would make the query unparsable.
        manual_rows = manual_ades[
            (manual_ades['drug_name'] == drug)
            & (manual_ades['section_name'] == section)
        ]
        manual = set(manual_rows['reaction_string'].to_list())

        gpt_lists = (
            gpt_output.loc[
                (gpt_output['drug_name'] == drug)
                & (gpt_output['section_name'] == section),
                'gpt_output',
            ]
            .astype(str)
            .str.lower()
            .str.replace('\n-', ', ', regex=False)
            .str.split(",")
            .tolist()
        )

        # No GPT row for this drug/section: keep the drug in the table with
        # undefined metrics instead of hiding the case behind a bare except.
        if not gpt_lists:
            results.append([drug, len(manual), 0] + [np.nan] * 6)
            continue

        gpt_drug = {term.strip() for term in gpt_lists[0]}

        TP = len(manual & gpt_drug)
        FP = len(gpt_drug - manual)
        FN = len(manual - gpt_drug)

        # Guard both ratios; a drug without manual terms used to raise
        # ZeroDivisionError in the recall computation.
        precision = TP / (TP + FP) if (TP + FP) else np.nan
        recall = TP / (TP + FN) if (TP + FN) else np.nan

        # With no true positives F1 is reported as undefined (NaN), as before.
        # np.nan is the canonical spelling of the constant.
        if TP:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = np.nan

        results.append([drug, len(manual), len(gpt_drug), TP, FP, FN, precision, recall, f1])

    return pd.DataFrame(results, columns=columns)

In [26]:
def evaluation_subtype(manual_ades, gpt_output, drug, subtype = 'all'):
    '''
    For a given drug, evaluate the performance of GPT on a given subtype of ADEs.

    Only rows whose ``section_name`` is ``'adverse reactions'`` are used. The
    manual terms are optionally narrowed by ``subtype``:

    * ``'exact-meddra'``  -> ``meddra_exact_term == 1``
    * ``'non-meddra'``    -> ``meddra_exact_term == 0``
    * ``'negated'``       -> ``negated_term == 1``
    * ``'discontinuous'`` -> ``discontinuous_term == 1``
    * anything else (including the default ``'all'``) -> no extra filter

    The GPT terms are never filtered by subtype, so for a narrowed subtype
    every GPT term that belongs to a different subtype is counted as a false
    positive; precision and F1 for the narrowed subtypes should be read with
    that in mind.

    Returns a list
    ``[drug, subtype, n_manual, n_gpt, TP, FP, FN, precision, recall, f1]``.
    Precision/recall are NaN when their denominator is zero, F1 is NaN when
    there are no true positives, and the six metric entries are all NaN when
    the drug has no GPT row for the section. When several GPT rows match, only
    the first is used.
    '''
    section = 'adverse reactions'
    # subtype -> (flag column in manual_ades, value that selects the subtype)
    subtype_filters = {
        'exact-meddra': ('meddra_exact_term', 1),
        'non-meddra': ('meddra_exact_term', 0),
        'negated': ('negated_term', 1),
        'discontinuous': ('discontinuous_term', 1),
    }

    # Boolean masks rather than a formatted DataFrame.query string: a drug
    # name containing a quote character would make the query unparsable.
    drug_df = manual_ades[
        (manual_ades['drug_name'] == drug)
        & (manual_ades['section_name'] == section)
    ]
    if subtype in subtype_filters:
        flag_column, flag_value = subtype_filters[subtype]
        drug_df = drug_df[drug_df[flag_column] == flag_value]

    manual = set(drug_df['reaction_string'].to_list())

    gpt_lists = (
        gpt_output.loc[
            (gpt_output['drug_name'] == drug)
            & (gpt_output['section_name'] == section),
            'gpt_output',
        ]
        .astype(str)
        .str.lower()
        .str.replace('\n-', ', ', regex=False)
        .str.split(",")
        .tolist()
    )

    # No GPT row for this drug/section: metrics are undefined. Handled
    # explicitly instead of through a bare except.
    if not gpt_lists:
        return [drug, subtype, len(manual), 0] + [np.nan] * 6

    gpt_drug = {term.strip() for term in gpt_lists[0]}

    TP = len(manual & gpt_drug)
    FP = len(gpt_drug - manual)
    FN = len(manual - gpt_drug)

    precision = TP / (TP + FP) if (TP + FP) else np.nan
    recall = TP / (TP + FN) if (TP + FN) else np.nan

    # With no true positives F1 is reported as undefined (NaN), as before.
    # np.nan is the canonical spelling of the constant.
    if TP:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = np.nan

    return [drug, subtype, len(manual), len(gpt_drug), TP, FP, FN, precision, recall, f1]

In [11]:
def evaluation_granular(manual_ades, gpt_output, limit = 1000):
    """Run ``evaluation_subtype`` for every drug and every ADE subtype.

    Drugs are taken from ``gpt_output['drug_name']`` in order of first
    appearance; at most ``limit`` of them are evaluated. Each drug contributes
    five rows, one per subtype, in the order ``all``, ``exact-meddra``,
    ``non-meddra``, ``negated``, ``discontinuous``.

    Returns a DataFrame with the columns ``drug_name, ade_type, n_manual,
    n_gpt, tp, fp, fn, precision, recall, f1``.
    """
    ade_types = ('all', 'exact-meddra', 'non-meddra', 'negated', 'discontinuous')
    column_names = ['drug_name', 'ade_type', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn', 'precision', 'recall', 'f1']

    rows = []
    # The names come from unique(), so a running position equals the number of
    # distinct drugs seen so far.
    for n_seen, drug in enumerate(tqdm(gpt_output['drug_name'].unique()), start=1):
        if n_seen > limit:
            break
        for ade_type in ade_types:
            rows.append(evaluation_subtype(manual_ades, gpt_output, drug, subtype = ade_type))

    return pd.DataFrame(rows, columns=column_names)

In [15]:
openai.organization = ""
# The API key is read from the environment so that it is never stored in the
# notebook or its history. NOTE(review): a literal key was previously committed
# on this line; treat it as leaked and revoke it with the provider.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

### Variables

In [16]:
# CSV with the label sections whose text is sent to GPT.
drug_file = 'test_drug_label_text.csv'
# CSV with the manual terms the GPT output is scored against.
manual_file = 'test_drug_label_text_manual_ades.csv'
# Cap on the number of distinct drugs kept when selecting rows to process.
my_max = 10000
# Model name passed to the chat-completions call.
gpt_model = 'gpt-4-1106-preview'

In [17]:
# Label for this prompt variant; it is used in the output file names and in the
# results log.
prompt_name = 'fatal-prompt-v2'
# Prompt template. The single '{}' placeholder is filled with the section text
# via str.format, so the template must not contain any other braces.
# NOTE(review): the first sentence of the prompt ends with a period in the
# middle of the instruction ("...synonyms." / "mentioned in the text...");
# left untouched here because editing it changes the experiment.
prompt = """
Extract all adverse reactions as they appear, including all synonyms.
mentioned in the text and provide them as a comma-separated list.
If a fatal event is listed add 'death' to the list.
The text is :'{}' """

## Run GPT

In [18]:
# Load the label sections to process and the manual reference terms.
drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)

# The data-split tag is the text before the first underscore of the file name
# (assumes names such as "train_..." or "test_...").
set_type = drug_file.partition('_')[0]

# Show the file name the GPT output is saved under.
print('{}_{}_{}.csv'.format(gpt_model, prompt_name, set_type))

gpt-4-1106-preview_fatal-prompt-v2_test.csv


In [19]:
# Collect the 'adverse reactions' rows for at most `my_max` distinct drugs.
new_rows = []
unique_drugs = set()
for row_index, row in drugs.iterrows():
    unique_drugs.add(row["drug_name"])
    # Stop as soon as a drug beyond the cap shows up.
    if len(unique_drugs) > my_max:
        break
    if row['section_name'] == 'adverse reactions':
        new_rows.append(row)

In [20]:
# run GPT
start = time.time()

# (drug_name, error) for every request that failed. The evaluation only looks
# at drugs present in `gpt_output`, so a silently dropped drug would change the
# reported scores without any trace.
failed_drugs = []

def run_iteration(row):
    """Query GPT for one label section.

    Returns ``[drug_name, section_name, gpt_output]``. When the request raises,
    the drug name and the error are appended to ``failed_drugs`` and ``None``
    is returned so the remaining rows are still processed.
    """
    name, section = row['drug_name'], row['section_name']
    text = row['section_text']
    try:
        gpt_out = extract_ade_terms(gpt_model, prompt, text, openai.api_key)
    except Exception as err:  # best effort per row, but never silent
        failed_drugs.append((name, repr(err)))
        return None
    return [name, section, gpt_out]

# `executor` rather than `exec`, which would shadow the builtin.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(tqdm(
        executor.map(run_iteration, new_rows),
        total=len(new_rows)
    ))

gpt_output = pd.DataFrame(
    [r for r in results if r is not None],
    columns=['drug_name', 'section_name', 'gpt_output']
)
gpt_output.to_csv('{}_{}_{}.csv'.format(gpt_model, prompt_name, set_type))
end = time.time()

if failed_drugs:
    print('{} of {} requests failed and are missing from gpt_output:'.format(len(failed_drugs), len(new_rows)))
    for failed_name, failed_error in failed_drugs:
        print('  {}: {}'.format(failed_name, failed_error))

100%|██████████| 99/99 [02:39<00:00,  1.62s/it]


In [21]:
# Wall-clock seconds between `start` and `end`, i.e. the GPT requests plus
# writing the output CSV.
print(end - start)

160.09201788902283


In [22]:
# Per-drug counts and scores. Note that this rebinds `results`, which until
# here held the raw rows returned by the GPT run.
results = evaluation(manual_ades, gpt_output)

  0%|          | 0/99 [00:00<?, ?it/s]

100%|██████████| 99/99 [00:00<00:00, 594.45it/s]


In [23]:
# Micro-averaged scores: pool the counts over all drugs, then divide once.
tp_total, fp_total, fn_total = results[['tp', 'fp', 'fn']].sum()

precision = tp_total / (tp_total + fp_total)
recall = tp_total / (tp_total + fn_total)
f1 = (2 * precision * recall) / (precision + recall)  # 2*tp_total/(2*tp_total+fp_total+fn_total)

print(
    "prompt: {}".format(prompt_name),
    "precision: {}\nrecall: {}\nf1: {}".format(precision, recall, f1),
    sep="\n",
)

prompt: fatal-prompt-v2
precision: 0.8749677751997937
recall: 0.7273896270895842
f1: 0.7943826799297835


In [24]:
# model, data, prompt, precision, recall, f1
# Appends one line per run to the shared results log. Fields are joined with
# ", " (comma plus space) and no header row is written here.
with open('gpt_model_results.csv', 'a') as file:
    file.write('{}, {}, {}, {}, {}, {}\n'.format(gpt_model, set_type, prompt_name, precision, recall, f1))

----

In [27]:
# One row per (drug, ADE subtype) with counts and scores.
results_granular = evaluation_granular(manual_ades, gpt_output)

100%|██████████| 99/99 [00:00<00:00, 137.81it/s]


In [31]:
# Pool TP/FP/FN over all drugs within each ADE subtype, then derive the
# micro-averaged scores per subtype.
# The columns are selected with a list: selecting them with a bare tuple
# ('tp', 'fp', 'fn') is deprecated and rejected by pandas 2.x. The built-in
# GroupBy.sum() replaces .apply(sum).
overall_results = results_granular.groupby('ade_type')[['tp', 'fp', 'fn']].sum()
overall_results['precision'] = overall_results['tp']/(overall_results['tp']+overall_results['fp'])
overall_results['recall'] = overall_results['tp']/(overall_results['tp']+overall_results['fn'])
overall_results['f1'] = (2 * overall_results['precision'] * overall_results['recall'])/(overall_results['precision'] + overall_results['recall']) # 2*tp_total/(2*tp_total+fp_total+fn_total)
overall_results.to_csv('{}_{}_{}_granular.csv'.format(gpt_model, prompt_name, set_type))
overall_results.head()

  overall_results = results_granular.groupby('ade_type')['tp', 'fp', 'fn'].apply(sum)


Unnamed: 0_level_0,tp,fp,fn,precision,recall,f1
ade_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
all,3394,485,1272,0.874968,0.72739,0.794383
discontinuous,20,3859,329,0.005156,0.057307,0.009461
exact-meddra,3003,876,539,0.774169,0.847826,0.809325
negated,21,3858,28,0.005414,0.428571,0.010692
non-meddra,391,3488,733,0.100799,0.347865,0.156306


In [None]:
# NOTE(review): unexecuted scratch cell. It only builds a GroupBy object and
# never aggregates it, so it produces no table; consider removing it.
results_granular.groupby(['ade_type'])