In [1]:
import os
import csv
import numpy as np
import pandas as pd

from glob import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict

from evaluation_functions import evaluation_granular, evaluate


In [2]:
# Section codes that appear in the XML `section` attribute, mapped to the
# section names written to the `section_name` column of the results.
section_names = dict(
    S1='adverse reactions',
    S2='boxed warnings',
    S3='warnings and precautions',
)

In [3]:
folder = 'data/DeepCADRME/'
size = 2
!ls $folder

deepcadrme_guess_ades.csv guess_xml_2               guess_xml_75
guess_xml_10              guess_xml_25              guess_xml_98
guess_xml_100             guess_xml_5
guess_xml_15              guess_xml_50


In [4]:
# Parse every prediction file in `guess_xml_<size>` and collect, per drug and
# per label section, the strings of all mentions typed 'AdverseReaction'.
# The result is one row per (drug, section) with the terms joined by ', ',
# stored in a column named `gpt_output`, and is also written to results/.
guess_labels = glob(folder+f"guess_xml_{size}/*")

extractions = list()
for label in tqdm(guess_labels):
    # basename() handles whichever path separator glob returns on this OS;
    # the drug name is the part of the file name before the first '.'.
    drug_name = os.path.basename(label).split('.')[0]
    with open(label, 'r') as f:
        soup = BeautifulSoup(f, 'xml')

    mentions = defaultdict(list)
    for mention in soup.find_all('Mention'):
        # .get() so a <Mention> without a `type` attribute is skipped
        # instead of raising KeyError.
        if mention.get('type') != 'AdverseReaction':
            continue
        # Touch the section first so it still yields a row even when every
        # one of its mentions turns out to be blank.
        section_terms = mentions[mention['section']]
        mention_str = mention['str'].lower()
        # Drop blank mentions here. The previous post-hoc
        # `.replace(', ,', '')` turned "a, , b" into "a b", silently fusing
        # two separate terms into one.
        if mention_str.strip():
            section_terms.append(mention_str)

    for section_code, ades in mentions.items():
        # An unknown section code raises KeyError on purpose: it signals an
        # input the section_names mapping does not cover.
        extractions.append([drug_name, section_names[section_code], ', '.join(ades)])

extractions_df = pd.DataFrame(extractions, columns=['drug_name', 'section_name', 'gpt_output'])
# Make sure the output directory exists before writing.
os.makedirs('results', exist_ok=True)
extractions_df.to_csv(os.path.join('results', f"deepcadrme_{str(size).zfill(3)}_test.csv"))

100%|██████████| 99/99 [00:00<00:00, 626.60it/s]


In [5]:
# Inputs for the test split: the drug label text and, going by the file name,
# the manually extracted ADEs used as the reference in the evaluation.
drug_file, manual_file = (
    'data/test_drug_label_text.csv',
    'data/test_drug_label_text_manual_ades.csv',
)

drugs = pd.read_csv(drug_file)
manual_ades = pd.read_csv(manual_file)

# Split name is the leading token of the file name; this relies on the path
# being "<dir>/<split>_..." with <split> such as "train" or "test".
set_type = drug_file.split('/')[1].partition('_')[0]

In [6]:
# Mapping of run name -> extractions frame, in the form passed to evaluate().
outputs = {f"deepcadrme_{str(size).zfill(3)}_test": extractions_df}

In [7]:
# Score the extractions against the manual annotations, once per matching
# mode, in the same order as before (strict first, then lenient).
for eval_mode in ('strict', 'lenient'):
    evaluate(outputs, manual_ades, eval_mode)

  output['gpt_output'] = gpt_output['gpt_output'].str.lower().str.replace('.', '').str.replace('\n-', ', ').str.split(', ')


Running strict evaluation and saving results to disk.
deepcadrme_002_test
saving results to results/deepcadrme_002_test_strict_granular.csv and results/deepcadrme_002_test_strict_overall.csv


100%|██████████| 99/99 [00:01<00:00, 82.14it/s]
  output['gpt_output'] = gpt_output['gpt_output'].str.lower().str.replace('.', '').str.replace('\n-', ', ').str.split(', ')


Running lenient evaluation and saving results to disk.
deepcadrme_002_test
saving results to results/deepcadrme_002_test_lenient_granular.csv and results/deepcadrme_002_test_lenient_overall.csv


100%|██████████| 99/99 [00:07<00:00, 12.38it/s]
