In [3]:
import os
import csv
import numpy as np
import pandas as pd

from tqdm import tqdm
from glob import glob
from bs4 import BeautifulSoup
from collections import defaultdict

from sklearn.metrics import confusion_matrix
from common_string import longest_common_substring_percentage

In [4]:
# Load the UMLS MedDRA export and index its terms two ways:
#   meddra_terms     - set of every STR value, lower-cased
#   meddra_code2term - CODE (as int) -> lower-cased STR value
umls_file = 'data/umls_meddra_en.csv'

meddra_terms = set()
meddra_code2term = dict()

# `with` closes the handle even when a row fails mid-loop (e.g. a CODE that is
# not an integer raises ValueError); newline='' is what the csv module asks for.
with open(umls_file, newline='') as fh:
    reader = csv.reader(fh)
    header = next(reader)

    for row in reader:
        d = dict(zip(header, row))
        term = d['STR'].lower()
        meddra_terms.add(term)
        # if a code appears on several rows, the last row wins
        meddra_code2term[int(d['CODE'])] = term

In [5]:
# load the testing set
# For every gold XML file this collects, keyed by drug name (file name up to the first '.'):
#   drug2mentions[drug]  - lower-cased `str` of each AdverseReaction <Mention> in section 'S1'
#   drug2reactions[drug] - `str` of every <Reaction> element
folder = 'data/TAC2017/'

test_labels = glob(folder+'gold_xml/*')

drug2mentions = defaultdict(set)
drug2reactions = defaultdict(set)

for label in tqdm(test_labels):
    # basename copes with whichever path separator glob returns on this platform
    drug_name = os.path.basename(label).split('.')[0]
    with open(label, 'r') as f:
        soup = BeautifulSoup(f, 'xml')

    for mention in soup.find_all('Mention'):
        # only adverse-reaction mentions annotated in section 'S1' are kept
        if mention['type'] != 'AdverseReaction' or mention['section'] != 'S1':
            continue
        drug2mentions[drug_name].add(mention['str'].lower())

    for reaction in soup.find_all('Reaction'):
        # NOTE(review): mentions are lower-cased above but reactions are stored as-is;
        # the set comparisons downstream presumably rely on the gold file already
        # being lower-case here - confirm.
        drug2reactions[drug_name].add(reaction['str'])

len(drug2mentions), len(drug2reactions)

  0%|          | 0/99 [00:00<?, ?it/s]

100%|██████████| 99/99 [00:00<00:00, 217.78it/s]


(99, 99)

In [6]:
# Build the reference rows and a per-drug summary.
#   task3ref   - [drug, mention, flag]; flag is 1 when the mention string is also one of
#                the drug's reaction strings, 0 when it is not
#   diffs_list - one row of counts per drug, turned into the `diffs` frame below
task3ref = list()
diffs_list = list()

for drug in drug2mentions.keys():
    mentions = drug2mentions[drug]
    reactions = drug2reactions[drug]

    in_both = mentions & reactions
    mention_only = mentions - reactions

    task3ref.extend([drug, rxn, 1] for rxn in in_both)
    task3ref.extend([drug, rxn, 0] for rxn in mention_only)

    diffs_list.append([
        drug,
        len(mentions),
        len(reactions),
        len(mentions) - len(reactions),
        len(mention_only),
        # counts every mention that is an exact MedDRA term, not only the mention-only ones
        len(meddra_terms & mentions),
    ])

diffs = pd.DataFrame(
    diffs_list,
    columns=['drug', 'nmentions', 'nreactions', 'diff', 'setdiff', 'nmeddraexact'],
)
diffs.shape, len(task3ref)

((99, 6), 4743)

## OnSIDES BERT

In [7]:
# Load the OnSIDES BERT scores (best model, evaluated on the testing set) and
# place the MedDRA term for each pt_meddra_id in a new "event" column.
ob_pred = pd.read_csv(
    'data/task3/grouped-mean-final-bydrug-PMB_14-AR-125-all_222_TAC_25_2.5e-05_256_32.csv',
    index_col=0,
)

# one dictionary lookup per row; a code missing from meddra_code2term raises KeyError
events = [meddra_code2term[code] for code in ob_pred['pt_meddra_id']]

ob_pred.insert(3, "event", events)
ob_pred

Unnamed: 0,section,drug,pt_meddra_id,event,class,Pred0,Pred1,scored,split
0,AR,ACTEMRA,10000081,abdominal pain,not_event,0.270623,0.000000,scored,test
1,AR,ACTEMRA,10000087,abdominal pain upper,is_event,0.000000,7.458376,scored,test
2,AR,ACTEMRA,10000220,abortion induced,not_event,9.525578,0.000000,scored,test
3,AR,ACTEMRA,10000269,abscess,is_event,0.000000,2.609661,scored,test
4,AR,ACTEMRA,10000565,acquired immunodeficiency syndrome,not_event,9.456971,0.000000,scored,test
...,...,...,...,...,...,...,...,...,...
11022,AR,XEOMIN,10048961,localised oedema,is_event,0.000000,0.000000,not_scored,valid
11023,AR,XIAFLEX,10061225,limb injury,is_event,0.000000,0.000000,not_scored,valid
11024,AR,XTANDI,10057167,mental impairment disorders,is_event,0.000000,0.000000,not_scored,valid
11025,AR,XALKORI,10011906,death,is_event,0.000000,0.000000,not_scored,valid


In [8]:
# Turn the OnSIDES BERT scores into one 0/1 prediction per reference (drug, mention) pair.

# from releases.json file in onsides
threshold = 0.4633

# Index Pred1 by (drug, event) once instead of building a DataFrame.query string per pair.
# That removes the quote juggling the query strings needed (a term containing both ' and "
# could not be expressed) and the float() call on a one-row Series, which recent pandas
# deprecates and which fails outright when more than one row matches.
# When several rows share a (drug, event) key the highest Pred1 is used.
ob_pred1 = ob_pred.groupby(['drug', 'event'])['Pred1'].max().to_dict()

ob_predictions = list()

for drug, rxn, label in tqdm(task3ref):

    # NOTE: leniency is irrelevant here because OnSIDES BERT only considers
    # NOTE: terms that are exact matches from the label. So each term from OnSIDES BERT
    # NOTE: must be present in the reference if mentioned.

    # a missing key means: not an exact match, or not scored by OnSIDES BERT
    pred1 = float(ob_pred1.get((drug, rxn), 0.0))

    ob_predictions.append(1 if pred1 >= threshold else 0)

len(ob_predictions), sum(ob_predictions), len(task3ref)

100%|██████████| 4743/4743 [00:03<00:00, 1392.03it/s]


(4743, 2693, 4743)

## DeepCADRME

In [9]:
# DeepCADRME output, restricted to rows whose section_name is 'adverse reactions'
d_pred = (
    pd.read_csv("results/extract/deepcadrme_100_test.csv", index_col=0)
    .query("section_name == 'adverse reactions'")
)
d_pred

Unnamed: 0,drug_name,section_name,gpt_output
0,IMPAVIDO,adverse reactions,"nausea, vomiting, diarrhea, headache, decrease..."
3,LIVALO,adverse reactions,"rhabdomyolysis, myoglobinuria, acute renal fai..."
5,XENAZINE,adverse reactions,"depression, suicidality, akathisia, restlessne..."
8,LINZESS,adverse reactions,"diarrhea, abdominal pain, flatulence, abdomina..."
11,OPSUMIT,adverse reactions,"embryo fetal toxicity, hepatotoxicity, decreas..."
...,...,...,...
223,AUBAGIO,adverse reactions,"hepatotoxicity, bone marrow effects, immunosup..."
226,POMALYST,adverse reactions,"fetal risk, venous, arterial thromboembolism, ..."
229,SURFAXIN,adverse reactions,"endotracheal tube reflux, pallor, endotracheal..."
231,ARZERRA,adverse reactions,"infusion reactions, hepatitis b virus reactiva..."


In [10]:
# Turn the DeepCADRME extractions into one 0/1 prediction per reference (drug, mention) pair.

# Split every drug's comma-separated output once, up front, instead of re-querying and
# re-splitting the frame for each reference pair. drop_duplicates keeps the first row per
# drug, which is the row the per-pair lookup used to select with [0].
d_first_rows = d_pred.drop_duplicates('drug_name')
d_extractions = {
    name: ([] if pd.isna(output) else str(output).split(', '))
    for name, output in zip(d_first_rows['drug_name'], d_first_rows['gpt_output'])
}

d_predictions = list()

for drug, rxn, label in tqdm(task3ref):

    # a drug with no row (or an empty output) simply has no extractions -> prediction 0
    extractions = d_extractions.get(drug, [])

    # strict alternative: rxn in extractions
    # lenient: any extraction whose longest common substring score with the mention exceeds 0.8
    matched = any(longest_common_substring_percentage(rxn, x) > 0.8 for x in extractions)
    d_predictions.append(1 if matched else 0)

len(d_predictions), sum(d_predictions), len(task3ref)

100%|██████████| 4743/4743 [00:05<00:00, 857.45it/s] 


(4743, 4560, 4743)

## OnSIDES LLM

In [15]:
# Results file for the LLM extraction run. Other runs that were tried:
#   "results/extract/OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_test_run0.csv"
#   'results/extract/OpenAI_gpt-4-1106-preview_only-positives-v0_pharmexpert-v0_temp0_test_run0.csv'
# NOTE(review): the output saved with this cell is a FileNotFoundError for a
# "gpt-4-turbo-preview" file name, not the one below - re-run to refresh it.
fn = "results/extract/OpenAI_gpt-4-1106-preview_gpt-written-prompt_pharmexpert-v0_temp0_test_run0.csv"

# keep only the rows whose section_name is 'adverse reactions'
ol_pred = pd.read_csv(fn, index_col=0)
ol_pred = ol_pred.query("section_name == 'adverse reactions'")
ol_pred

FileNotFoundError: [Errno 2] No such file or directory: 'results/extract/OpenAI_gpt-4-turbo-preview_gpt-written-prompt_pharmexpert-v0_temp0_test_run0.csv'

In [14]:
# Turn the LLM extractions into one 0/1 prediction per reference (drug, mention) pair.

# Split every drug's comma-separated output once, up front, instead of re-querying and
# re-splitting the frame for each reference pair. drop_duplicates keeps the first row per
# drug, which is the row the per-pair lookup used to select with [0].
ol_first_rows = ol_pred.drop_duplicates('drug_name')
ol_extractions = {
    name: ([] if pd.isna(output) else str(output).split(', '))
    for name, output in zip(ol_first_rows['drug_name'], ol_first_rows['gpt_output'])
}

ol_predictions = list()

for drug, rxn, label in tqdm(task3ref):

    # a drug with no row (or an empty output) simply has no extractions -> prediction 0
    extractions = ol_extractions.get(drug, [])

    # strict alternative: rxn in extractions
    # lenient: any extraction whose longest common substring score with the mention exceeds 0.8
    matched = any(longest_common_substring_percentage(rxn, x) > 0.8 for x in extractions)
    ol_predictions.append(1 if matched else 0)

len(ol_predictions), sum(ol_predictions), len(task3ref)

100%|██████████| 4743/4743 [00:03<00:00, 1331.95it/s]


(4743, 3364, 4743)

## Evaluation

In [142]:
# compile results: one row per reference (drug, mention) pair holding the reference label
# and each model's 0/1 prediction (OB = OnSIDES BERT, D = DeepCADRME, OL = OnSIDES LLM)
model_predictions = {"OB": ob_predictions, "D": d_predictions, "OL": ol_predictions}

# Zipping the lists together would silently truncate to the shortest one, hiding a
# prediction list left over from an earlier run; fail loudly instead.
for colname, values in model_predictions.items():
    assert len(values) == len(task3ref), (
        f"{colname} has {len(values)} predictions for {len(task3ref)} reference rows"
    )

predictions = (
    pd.DataFrame(task3ref, columns=["drug", "event", "label"])
    .assign(**model_predictions)
)

# flip all the labels
flip_labels = True
if flip_labels:
    for colname in ('label', 'OB', 'D', 'OL'):
        # every value is 0 or 1, so this swaps them: 0 -> 1, 1 -> 0
        predictions[colname] = 1 - predictions[colname]

predictions

Unnamed: 0,drug,event,label,OB,D,OL
0,IMPAVIDO,alt increase,0,1,0,1
1,IMPAVIDO,vomiting,0,0,0,0
2,IMPAVIDO,malaise,0,0,0,0
3,IMPAVIDO,melena,0,1,0,0
4,IMPAVIDO,ast increase,0,1,0,1
...,...,...,...,...,...,...
4738,ESBRIET,pruritus,0,0,0,0
4739,ESBRIET,nausea,0,0,0,0
4740,ESBRIET,asthenia,0,0,0,0
4741,ESBRIET,photosensitivity reaction,0,0,0,1


In [143]:
# Print specificity, recall, FPR, precision and F1 for each model column, grouped by metric.

# The confusion matrix is computed once per model. labels=[0, 1] pins it to 2x2, so the
# four counts (tn, fp, fn, tp) are always available even if a column holds a single class.
confusions = {
    key: confusion_matrix(predictions['label'], predictions[key], labels=[0, 1]).ravel()
    for key in ('D', 'OB', 'OL')
}

# (row title, formula over the four counts); titles are padded to a common width.
# The counts are lambda parameters, so the notebook-level `fn` (results file path)
# is no longer overwritten by this cell.
metric_rows = (
    ("Specificity", lambda tn, fp, fn, tp: tn/(tn+fp)),
    ("Recall/Sens", lambda tn, fp, fn, tp: tp/(tp+fn)),
    ("FPR        ", lambda tn, fp, fn, tp: fp/(fp+tn)),
    ("Precision  ", lambda tn, fp, fn, tp: tp/(tp+fp)),
    ("F1         ", lambda tn, fp, fn, tp: tp/(tp+0.5*(fp+fn))),
)

for position, (title, formula) in enumerate(metric_rows):
    if position > 0:
        # blank line between metric groups
        print()
    for key, counts in confusions.items():
        print(f"{key:2s} {title}: {formula(*counts):5.3f}")


D  Specificity: 0.962
OB Specificity: 0.575
OL Specificity: 0.716

D  Recall/Sens: 0.065
OB Recall/Sens: 0.857
OL Recall/Sens: 0.714

D  FPR        : 0.038
OB FPR        : 0.425
OL FPR        : 0.284

D  Precision  : 0.027
OB Precision  : 0.032
OL Precision  : 0.040

D  F1         : 0.038
OB F1         : 0.062
OL F1         : 0.076
