In [132]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import re
from math import floor
from random import randint, seed

In [2]:
import os
os.path.exists('train_xml/')

True

In [3]:
train_labels = glob('train_xml/*.xml')
drug_label_text = []
for label in tqdm(train_labels):
    drug_name = label.split('/')[-1].split('.')[0]
    with open(label, 'r') as f:
        soup = BeautifulSoup(f, 'xml')
    for mention in soup.find_all('Mention'):
        if mention['type'] == "AdverseReaction":
            section = mention['section']
            start = mention['start']
            length = mention['len']
            term = mention['str']

            drug_label_text.append([drug_name, section, start, length, term])
drug_label_text = pd.DataFrame(drug_label_text, columns=['drug_name', 'section', 'start', 'len', 'term'])
drug_label_text.sort_values(by = 'drug_name').head()

  0%|          | 0/101 [00:00<?, ?it/s]

100%|██████████| 101/101 [00:00<00:00, 135.35it/s]


Unnamed: 0,drug_name,section,start,len,term
11196,ADCETRIS,S1,17377,7,dyspnea
11052,ADCETRIS,S1,516,20,Tumor Lysis Syndrome
11053,ADCETRIS,S1,595,18,Increased Toxicity
11054,ADCETRIS,S1,715,18,Increased Toxicity
11055,ADCETRIS,S1,849,14,Hepatotoxicity


In [4]:
data = pd.read_csv('data/train_drug_label_text.csv').query('section_name == "adverse reactions"')
data.sort_values(by = 'drug_name').head()

Unnamed: 0,drug_name,section_name,section_text
212,ADCETRIS,adverse reactions,6 ADVERSE REACTIONS\n\n The following ser...
59,ADREVIEW,adverse reactions,6 ADVERSE REACTIONS\n\n EXCERPT: Seriou...
157,AFINITOR,adverse reactions,6 ADVERSE REACTIONS\n\n The following...
237,AMPYRA,adverse reactions,6 ADVERSE REACTIONS\n\n Because clinical ...
176,AMYVID,adverse reactions,6 ADVERSE REACTIONS\n\n EXCERPT: The mo...


In [144]:
seed(100)
count = 0
overlap_data = []
context_size = 100
for row in drug_label_text.itertuples():
    drug = row.drug_name
    sub_data = data[data.drug_name == drug].section_text.values[0]
    adverse_event = row.term
    if len(row.start.split(',')) <= 1:
        before = re.sub(r'[^A-Za-z ]+', '', 
        sub_data[max(0, int(row.start)-context_size)
                 : int(row.start)]).split()
        after = re.sub(r'[^A-Za-z ]+', '',
        sub_data[int(row.start)+int(row.len):
                 min(int(row.start)+int(row.len)+context_size, len(sub_data))]).split()
        term = sub_data[int(row.start):int(row.start)+int(row.len)]
        num_overlap = randint(0, 10)
        num_extra_before = min([5, len(before)])
        num_extra_after = min([5, len(after)])
        for i in range(num_overlap):
            option = randint(0, 4)
            if len(before) > 1:
                start_diff = randint(1, num_extra_before) * -1
                before_terms = before[start_diff:-1]
            else:
                continue
            if len(after) > 1:
                end_diff = randint(0, num_extra_after)
                after_terms = after[0:end_diff]
            else:
                continue
            
            if option == 0: # start - X: end + Y
                overlap_data.append([drug, adverse_event,
                                     ' '.join(before_terms + [term] + after_terms), 1])
            elif option == 1: # start + X: end + Y
                if len(term.split()) > 1:
                    term_diff = randint(0, len(term.split())-2)
                    sub_term = term.split()[term_diff:]
                    comp_term = ' '.join(sub_term + after_terms)
                    is_overlap = 1
                else:
                    comp_term = ' '.join([term] + after_terms)
                    is_overlap = 1
            elif option == 2: # start - X: end - Y
                if len(term.split()) > 1:
                    term_diff = randint(1, len(term.split()))
                    sub_term = term.split()[:term_diff]
                    comp_term = ' '.join(before_terms + sub_term)
                    is_overlap = 1
                else:
                    comp_term = ' '.join([term] + after_terms)
                    is_overlap = 1
                    
            elif option in [3,4,]: # no overlap
                # either context after or context from before
                before_or_after = randint(0, 1)
                if before_or_after == 0:
                    try:
                        start = randint(0, (len(before))-1)
                        end = randint(start+1, len(before))
                        comp_term = ' '.join(before[start:end])
                        is_overlap = 0
                    except:
                        continue
                else:
                    try:
                        start = randint(0, len(after)-3)
                        end = randint(start+1, len(after))
                        comp_term = ' '.join(after[start:end])
                        is_overlap = 0
                    except:
                        continue
            
            if len(comp_term) == 0:
                            continue
            overlap_data.append([drug, adverse_event, comp_term, is_overlap])

In [145]:
overlap_data_df = pd.DataFrame(overlap_data, columns=['drug', 'adverse_event', 'comp_term', 'is_overlapping'])
overlap_data_df.to_csv('data/overlap_data.csv', index=False)