In [40]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [43]:
# Load the two input tables from CSV files in the working directory:
#   gpt_output  - one row per (drug_name, section_name) with the model's
#                 comma-separated term list in `gpt4_output` (see preview below).
#   manual_ades - manual annotations (per the file name); `evaluation` reads its
#                 `drug_name`, `section_name` and `reaction_string` columns.
gpt_output = pd.read_csv('./train_drug_label_gpt4_output.csv')
manual_ades = pd.read_csv('./train_drug_label_text_manual_ades.csv')
# Preview the first rows of the GPT output table.
gpt_output.head()

Unnamed: 0,drug_name,section_name,gpt4_output
0,KYPROLIS,adverse reactions,"Cardiac Toxicities, Acute Renal Failure, Tumor..."
1,KYPROLIS,warnings and precautions,"Cardiac toxicities, cardiac failure, myocardia..."
2,MULTAQ,adverse reactions,"New or worsening heart failure, Liver Injury, ..."
3,MULTAQ,boxed warnings,"Increased risk of death, Stroke, Heart failure..."
4,MULTAQ,warnings and precautions,"Cardiac rhythm, Liver injury, Pulmonary toxici..."


In [5]:
# All distinct drug names present in the GPT output, in order of first
# appearance (this is the full list, not a random sample of 10).
drugs = gpt_output['drug_name'].unique()

In [59]:
def evaluation(manual_ades, gpt_output, output_column, limit = 1000):
    """Score extracted terms against manual annotations, per drug label section.

    For every row of ``gpt_output`` belonging to one of the first ``limit``
    distinct drugs, the text in ``output_column`` is lower-cased, bullet
    separators (``"\\n-"``) are turned into commas, and the text is split on
    commas into a set of whitespace-stripped terms. That set is compared by
    exact string match with the set of ``reaction_string`` values that
    ``manual_ades`` holds for the same ``drug_name`` and ``section_name``.

    Parameters
    ----------
    manual_ades : pandas.DataFrame
        Reference annotations; must have the columns ``drug_name``,
        ``section_name`` and ``reaction_string``.
    gpt_output : pandas.DataFrame
        Model output; must have the columns ``drug_name``, ``section_name``
        and ``output_column``.
    output_column : str
        Name of the column in ``gpt_output`` holding the comma-separated terms.
    limit : int, default 1000
        Maximum number of distinct drugs (in order of first appearance) to score.

    Returns
    -------
    pandas.DataFrame
        One row per scored ``gpt_output`` row, with the columns ``drug_name``,
        ``n_manual``, ``n_gpt``, ``tp``, ``fp``, ``fn``, ``precision``,
        ``recall`` and ``f1``. The count and metric columns are NaN when either
        term set is empty; ``f1`` is also NaN when there is no true positive.

    Notes
    -----
    Only the extracted side is lower-cased; the reference strings are compared
    as stored. A missing value in ``output_column`` is stringified before
    splitting (so it becomes the single term ``"nan"``), and an empty piece
    between two commas is kept as the term ``""``.
    """
    columns = ['drug_name', 'n_manual', 'n_gpt', 'tp', 'fp', 'fn',
               'precision', 'recall', 'f1']
    unscored = [np.nan] * 6  # tp, fp, fn, precision, recall, f1
    results = []

    drugs = gpt_output['drug_name'].unique()
    for n_seen, drug in enumerate(tqdm(drugs), start=1):
        if n_seen > limit:
            break

        # Filter once per drug instead of once per section.
        gpt_rows = gpt_output[gpt_output['drug_name'] == drug]
        manual_rows = manual_ades[manual_ades['drug_name'] == drug]

        for section in gpt_rows['section_name'].values:
            # Boolean masks instead of a string-built ``query`` expression, so
            # names containing quote characters cannot break the lookup.
            manual = set(
                manual_rows.loc[manual_rows['section_name'] == section,
                                'reaction_string'].to_list())
            section_outputs = gpt_rows.loc[
                gpt_rows['section_name'] == section, output_column]

            if section_outputs.empty:
                # Happens when the section value never equals itself (NaN).
                gpt_terms = set()
            else:
                # Only the first matching row is scored for a (drug, section).
                raw_text = (str(section_outputs.iloc[0])
                            .lower()
                            .replace('\n-', ', '))
                gpt_terms = {term.strip() for term in raw_text.split(',')}

            if not gpt_terms or not manual:
                results.append([drug, len(manual), len(gpt_terms)] + unscored)
                continue

            TP = len(manual & gpt_terms)
            FP = len(gpt_terms - manual)
            FN = len(manual - gpt_terms)
            # Both denominators are non-zero: each equals the size of a
            # non-empty set.
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            if TP > 0:
                f1 = (2 * precision * recall) / (precision + recall)
            else:
                # precision + recall is 0 here; keep the NaN marker.
                # ``np.nan`` replaces the ``np.NAN`` alias removed in NumPy 2.0.
                f1 = np.nan

            results.append([drug, len(manual), len(gpt_terms),
                            TP, FP, FN, precision, recall, f1])

    return pd.DataFrame(results, columns=columns)

In [64]:
# Score the `gpt4_output` column against the manual annotations, using the
# default limit of 1000 drugs; yields one row of counts/metrics per scored section.
results = evaluation(manual_ades, gpt_output, 'gpt4_output')

100%|██████████| 101/101 [00:00<00:00, 115.67it/s]


In [65]:
# Micro-averaged scores: pool the per-section counts first, then derive the
# metrics from the pooled totals (rows with NaN counts are skipped by `sum`).
totals = results[['tp', 'fp', 'fn']].sum()
tp_total = totals['tp']
fp_total = totals['fp']
fn_total = totals['fn']

predicted_total = tp_total + fp_total
reference_total = tp_total + fn_total
precision = tp_total / predicted_total
recall = tp_total / reference_total
f1 = 2*tp_total/(2*tp_total+fp_total+fn_total)

summary_template = "precision: {}\nrecall: {}\nf1: {}"
print(summary_template.format(precision, recall, f1))

precision: 0.6551868962620747
recall: 0.19376474972053162
f1: 0.299079754601227
