In [72]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
#change to appropriate home folder
folder = ''

In [4]:
#read in drug label text csv generated in `1.extract_data.ipynb`
drug_label_text = pd.read_csv(folder+'train_drug_label_text.csv')
drug_label_text.head(10)

Unnamed: 0,drug_name,section_name,section_text
0,KYPROLIS,adverse reactions,6 ADVERSE REACTIONS\n\n The following adv...
1,KYPROLIS,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...
2,MULTAQ,adverse reactions,6 ADVERSE REACTIONS\n\n The following saf...
3,MULTAQ,boxed warnings,\n\n BOXED WARNING: WARNING: INCREASED RISK...
4,MULTAQ,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...
5,JUBLIA,adverse reactions,6 ADVERSE REACTIONS\n\n EXCERPT: The mo...
6,TEFLARO,adverse reactions,6. ADVERSE REACTIONS\n\n The following se...
7,TEFLARO,warnings and precautions,5. WARNINGS AND PRECAUTIONS\n\n\n\n EXCE...
8,DATSCAN,adverse reactions,6 ADVERSE REACTIONS\n\n EXCERPT: Hypers...
9,DATSCAN,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...


----

In [110]:
def organize_bw(strings):
    organized_list = []
    current_title = None
    current_content = []
    for string in strings:
        if string.isupper():
            if current_title:
                organized_list.append([current_title, current_content])
            current_title = string.strip()
            current_content = []
        else:
            current_content.append(string.strip())
    # Adding the last title and content if they exist
    if current_title:
        organized_list.append([current_title, current_content])
    return organized_list
def organize_wp(strings):
    organized_list = []
    current_subtitle = None
    current_context = []
    for string in strings:
        if string.startswith('5'):
            if current_subtitle:
                organized_list.append([current_subtitle, current_context])
            current_subtitle = string
            current_context = []
        else:
            current_context.append(string)
    # Adding the last subtitle and context if they exist
    if current_subtitle:
        organized_list.append([current_subtitle, current_context])
    return organized_list
def organize_ar(strings):
    organized_list = []
    current_subtitle = None
    current_context = []
    for string in strings:
        string = string.strip()
        if string.startswith('6') or string.startswith('Table'):
            if current_subtitle:
                organized_list.append([current_subtitle, current_context])
            current_subtitle = string
            current_context = []
        else:
            current_context.append(string)
    # Adding the last subtitle and context if they exist
    if current_subtitle:
        organized_list.append([current_subtitle, current_context])
    return organized_list

----

In [111]:
subsections = []
for i, row in tqdm(drug_label_text.iterrows()):
    section_name = row['section_name']
    text = row['section_text']
    if section_name == 'warnings and precautions':
        wp_text = [' '.join(i.strip().split()) for i in text.strip().split('\n\n') if i.strip() != '']
        wp_text = organize_wp(wp_text)
        for subtitle, context in wp_text:
            subsections.append([row['drug_name'], section_name, subtitle, context])
    if section_name == 'boxed warnings':
        bw_text = [' '.join(i.strip().split()) for i in text.strip().split('\n\n') if i.strip() != '']
        bw_text = organize_bw(bw_text)
        for subtitle, context in bw_text:
            subsections.append([row['drug_name'], section_name, subtitle, context])
    if section_name == 'adverse reactions':
        ar_text = [' '.join(i.strip().split()) for i in text.strip().split('\n\n') if i.strip() != '']
        ar_text = organize_ar(ar_text)
        for subtitle, context in ar_text:
            subsections.append([row['drug_name'], section_name, subtitle, context])
        #subsections.append([row['drug_name'], section_name, text.split('6.1 ')[0]]) #general, non-subsection adverse reactions
        #if '6.1 ' in text:
        #    subsections.append([row['drug_name'], section_name, '6.1', text.split('6.1 ')[-1].split('6.2 ')[0]]) #subsection 1
        #    if '6.2 ' in text:
        #        subsections.append([row['drug_name'], section_name, '6.2', text.split('6.2 ')[-1]]) #subsection 2

239it [00:00, 8169.57it/s]


In [115]:
subsections_df = pd.DataFrame(subsections, columns=['drug_name', 'section_name', 'subsection_name', 'subsection_text'])
subsections_df['subsection_text'] = subsections_df['subsection_text'].apply(lambda x: ' '.join(x))
subsections_df.to_csv('train_drug_label_text_subsections.csv', index=False)

---

In [116]:
test_labels = glob(folder+'gold_xml/*')
drug_label_text = []
for label in tqdm(test_labels):
    drug_name = label.split('/')[-1].split('.')[0]
    with open(label, 'r') as f:
        soup = BeautifulSoup(f, 'xml')
    for section in soup.find_all('Section'):
        section_name = section['name']
        drug_label_text.append([drug_name, section_name, section.text])
drug_label_text = pd.DataFrame(drug_label_text, columns=['drug_name', 'section_name', 'section_text'])
drug_label_text.to_csv(folder+'test_drug_label_text.csv', index=False)
drug_label_text.head()

100%|██████████| 99/99 [00:00<00:00, 283.13it/s]


Unnamed: 0,drug_name,section_name,section_text
0,IMPAVIDO,adverse reactions,6 ADVERSE REACTIONS\n\n Because clinical ...
1,IMPAVIDO,boxed warnings,\n\n BOXED WARNING: WARNING: EMBRYO-FETAL T...
2,IMPAVIDO,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...
3,LIVALO,adverse reactions,6 ADVERSE REACTIONS\n\n The following ser...
4,LIVALO,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...


In [117]:
subsections = []
for i, row in tqdm(drug_label_text.iterrows()):
    section_name = row['section_name']
    text = row['section_text']
    if section_name == 'warnings and precautions':
        wp_text = [' '.join(i.strip().split()) for i in text.strip().split('\n\n') if i.strip() != '']
        wp_text = organize_wp(wp_text)
        for subtitle, context in wp_text:
            subsections.append([row['drug_name'], section_name, subtitle, context])
    if section_name == 'boxed warnings':
        bw_text = [' '.join(i.strip().split()) for i in text.strip().split('\n\n') if i.strip() != '']
        bw_text = organize_bw(bw_text)
        for subtitle, context in bw_text:
            subsections.append([row['drug_name'], section_name, subtitle, context])
    if section_name == 'adverse reactions':
        ar_text = [' '.join(i.strip().split()) for i in text.strip().split('\n\n') if i.strip() != '']
        ar_text = organize_ar(ar_text)
        for subtitle, context in ar_text:
            subsections.append([row['drug_name'], section_name, subtitle, context])

237it [00:00, 10441.48it/s]


In [118]:
subsections_df = pd.DataFrame(subsections, columns=['drug_name', 'section_name', 'subsection_name', 'subsection_text'])
subsections_df['subsection_text'] = subsections_df['subsection_text'].apply(lambda x: ' '.join(x))
subsections_df.to_csv('test_drug_label_text_subsections.csv', index=False)