In [1]:
import os
import csv
import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict

from evaluation_functions import evaluation_granular, evaluate


In [2]:
section_names = {
    'S1': 'adverse reactions',
    'S2': 'boxed warnings',
    'S3': 'warnings and precautions'
}

In [3]:
folder = 'data/DeepCADRME/'

In [4]:
guess_labels = glob(folder+'guess_xml/*')

extractions = list()
for label in tqdm(guess_labels):
    drug_name = label.split('/')[-1].split('.')[0]
    with open(label, 'r') as f:
        soup = BeautifulSoup(f, 'xml')
    
    mentions = defaultdict(list)
    for mention in soup.find_all('Mention'):
        if mention['type'] == 'AdverseReaction':
          section_name = mention['section']
          mention_str = mention['str'].lower()
          mentions[section_name].append(mention_str)
    
    for section_code, ades in mentions.items():
      
      extractions.append([drug_name, section_names[section_code], ', '.join(ades)])

extractions_df = pd.DataFrame(extractions, columns=['drug_name', 'section_name', 'gpt_output'])
extractions_df.to_csv(os.path.join(folder, 'deepcadrme_guess_ades.csv'))

100%|██████████| 99/99 [00:00<00:00, 547.11it/s]


In [5]:
extractions_df

Unnamed: 0,drug_name,section_name,gpt_output
0,IMPAVIDO,adverse reactions,"nausea, vomiting, diarrhea, headache, decrease..."
1,IMPAVIDO,warnings and precautions,"embryo fetal toxicity, testicular atrophy, imp..."
2,IMPAVIDO,boxed warnings,"embryo fetal toxicity, embryo fetal toxicity, ..."
3,LIVALO,adverse reactions,"rhabdomyolysis, myoglobinuria, acute renal fai..."
4,LIVALO,boxed warnings,"skeletal muscle effects, myopathy, rhabdomyoly..."
...,...,...,...
231,ARZERRA,adverse reactions,"infusion reactions, hepatitis b virus reactiva..."
232,ARZERRA,warnings and precautions,"infusion reactions, tumor lysis syndrome, cyto..."
233,ARZERRA,boxed warnings,"hepatitis virus hbv reactivation, hepatitis re..."
234,ESBRIET,adverse reactions,"liver enzyme elevations, photosensitivity reac..."


In [6]:
# test
drug_file = 'data/test_drug_label_text.csv'
manual_file = 'data/test_drug_label_text_manual_ades.csv'

# my_max = 10000

drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)
set_type = drug_file.split('/')[1].split('_')[0] # assuming file follows format "train_..." or "test...."

In [7]:
outputs = dict()
outputs['deepcadrme_test'] = extractions_df

In [8]:
evaluate(outputs, manual_ades, 'strict')
evaluate(outputs, manual_ades, 'lenient')

Running strict evaluation and saving results to disk.
deepcadrme_test


100%|██████████| 99/99 [00:01<00:00, 50.82it/s]


Running lenient evaluation and saving results to disk.
deepcadrme_test


100%|██████████| 99/99 [00:10<00:00,  9.10it/s]
