In [5]:
import csv
import os
from collections import defaultdict
from glob import glob

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [6]:
# Build the set of lowercased term strings found in the 'STR' column of the
# CSV below (presumably a UMLS export of English MedDRA terms, judging by the
# file name -- confirm against the data source).
umls_file = 'data/umls_meddra_en.csv'

meddra_terms = set()

# The context manager closes the file even if a row fails to parse;
# newline='' is how the csv module expects its input files to be opened.
with open(umls_file, newline='') as fh:
    reader = csv.reader(fh)
    header = next(reader)
    # Resolve the column position once instead of building a dict per row.
    str_col = header.index('STR')
    for row in reader:
        meddra_terms.add(row[str_col].lower())

In [7]:
# Root directory of the TAC2017 data. Later paths are built by plain string
# concatenation (folder + 'train_xml/*'), so the trailing slash is required.
folder = 'data/TAC2017/'

In [8]:
# For every XML label file in the training folder, collect two sets keyed by
# drug name (the file name up to its first dot):
#   drug2mentions  -- lowercased 'str' of each <Mention type="AdverseReaction">
#   drug2reactions -- lowercased 'str' of each <Reaction>
train_labels = glob(folder+'train_xml/*')

drug2mentions = defaultdict(set)
drug2reactions = defaultdict(set)

for label in tqdm(train_labels):
    # basename() copes with whichever separator glob used to join the file
    # name onto the directory, so the key is the bare file stem on any OS.
    drug_name = os.path.basename(label).split('.')[0]
    with open(label, 'r') as f:
        soup = BeautifulSoup(f, 'xml')

    for mention in soup.find_all('Mention'):
        if mention['type'] == 'AdverseReaction':
            drug2mentions[drug_name].add(mention['str'].lower())

    for reaction in soup.find_all('Reaction'):
        # Normalize case the same way as the mentions: the two sets are
        # compared with set difference later, so a case mismatch would
        # otherwise show up as a spurious difference.
        drug2reactions[drug_name].add(reaction['str'].lower())

# Number of drugs seen in each mapping (displayed as the cell output).
len(drug2mentions), len(drug2reactions)

  0%|          | 0/101 [00:00<?, ?it/s]

100%|██████████| 101/101 [00:00<00:00, 209.58it/s]


(101, 101)

In [16]:
# One row per drug comparing its unique mention strings with its unique
# reaction strings:
#   diff         -- len(mentions) - len(reactions)
#   setdiff      -- number of mention strings that are not reaction strings
#   nmeddraexact -- number of mention strings that are also in meddra_terms
diffs_list = []

for drug, mentions in drug2mentions.items():
    reactions = drug2reactions[drug]
    n_mentions, n_reactions = len(mentions), len(reactions)
    only_in_mentions = mentions - reactions
    exact_meddra_hits = meddra_terms & mentions
    diffs_list.append([
        drug,
        n_mentions,
        n_reactions,
        n_mentions - n_reactions,
        len(only_in_mentions),
        len(exact_meddra_hits),
    ])

diffs = pd.DataFrame(
    diffs_list,
    columns=['drug', 'nmentions', 'nreactions', 'diff', 'setdiff', 'nmeddraexact'],
)
diffs

Unnamed: 0,drug,nmentions,nreactions,diff,setdiff,nmeddraexact
0,XEOMIN,81,72,9,9,59
1,QUTENZA,45,45,0,0,37
2,ZYTIGA,68,68,0,0,45
3,TOVIAZ,46,46,0,0,33
4,STRIBILD,136,135,1,1,80
...,...,...,...,...,...,...
96,CARBAGLU,23,23,0,0,21
97,PROMACTA,82,82,0,0,56
98,BESIVANCE,9,9,0,0,6
99,INLYTA,116,113,3,3,85


In [17]:
# Consistency check: 'diff' (difference of the set sizes) equals 'setdiff'
# (size of the set difference) exactly when every reaction string is also a
# mention string, so any row returned here has a reaction with no identical
# lowercased mention.
diffs.query("diff != setdiff")

Unnamed: 0,drug,nmentions,nreactions,diff,setdiff,nmeddraexact


In [18]:
# Drugs with more unique reaction strings than unique lowercased mention strings.
diffs.query("diff < 0")

Unnamed: 0,drug,nmentions,nreactions,diff,setdiff,nmeddraexact


In [20]:
# Totals over all drugs: the summed mention-minus-reaction count difference and
# the summed number of mention strings found verbatim in meddra_terms.
tuple(diffs[column].sum() for column in ("diff", "nmeddraexact"))

(344, 5071)

In [21]:
# Drugs with more unique lowercased mention strings than unique reaction strings.
diffs.query("diff > 0")

Unnamed: 0,drug,nmentions,nreactions,diff,setdiff,nmeddraexact
0,XEOMIN,81,72,9,9,59
4,STRIBILD,136,135,1,1,80
9,PRISTIQ,148,128,20,20,104
10,TAFINLAR,158,155,3,3,114
11,MULTAQ,57,56,1,1,34
12,TANZEUM,60,54,6,6,37
13,BELEODAQ,51,50,1,1,37
14,HALAVEN,66,65,1,1,56
15,XIAFLEX,95,94,1,1,56
16,CIMZIA,172,146,26,26,110


In [14]:
# Inspect a single drug: mention strings that have no identical reaction string.
drug = 'NATAZIA'
drug2mentions[drug].difference(drug2reactions[drug])

{'adverse lipid changes',
 'arterial thromboses',
 'birth defects',
 'breakthrough bleeding',
 'breast cancer',
 'cardiac anomalies',
 'cerebrovascular events',
 'cervical cancer',
 'cervical intraepithelial neoplasia',
 'cholestasis',
 'decrease glucose tolerance',
 'hemorrhagic strokes',
 'hepatic adenomas',
 'hepatocellular carcinoma',
 'increase in blood pressure',
 'limb-reduction defects',
 'liver cancers',
 'migraine',
 'myocardial infarctions',
 'pancreatitis',
 'spotting',
 'strokes',
 'teratogenic effect',
 'thromboembolic disease',
 'thrombotic strokes',
 'thyroid-binding globulin increase',
 'venous thromboembolism'}