In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Base directory for the CSV files this notebook reads and writes.
# Keep the trailing slash: file paths below are built with `folder + '<name>.csv'`.
# Change this to wherever the data lives on your machine.
folder = 'data/TAC2017/'

## train

In [4]:
# Break every training section into individual sentences so that boilerplate
# sentences (assumed to recur across many labels) can be identified below.
# Step 1: one row per non-blank line; step 2: one row per '. '-separated sentence.
drug_label_text = (
    pd.read_csv(folder+'train_drug_label_text.csv')
    .assign(section_text=lambda frame: frame['section_text'].apply(
        lambda text: [line.strip() for line in text.strip().split('\n') if line.strip() != '']))
    .explode('section_text')
    .reset_index(drop=True)
    .assign(section_text=lambda frame: frame['section_text'].apply(lambda line: line.split('. ')))
    .explode('section_text')
    .reset_index(drop=True)
    .assign(section_text=lambda frame: frame['section_text'].apply(lambda sentence: sentence.strip()))
)
drug_label_text.head(10)

Unnamed: 0,drug_name,section_name,section_text
0,XEOMIN,adverse reactions,6 ADVERSE REACTIONS
1,XEOMIN,adverse reactions,The following adverse reactions to XEOMIN are ...
2,XEOMIN,adverse reactions,* Hypersensitivity [ see Contraindications (...
3,XEOMIN,adverse reactions,* Dysphagia and Breathing Difficulties in Tre...
4,XEOMIN,adverse reactions,* Spread of Effects from Toxin [see Warnings...
5,XEOMIN,adverse reactions,EXCERPT: Cervical Dystonia: The most comm...
6,XEOMIN,adverse reactions,Blepharospasm: The most commonly observed adv...
7,XEOMIN,adverse reactions,Glabellar Lines: The most commonly observed a...
8,XEOMIN,adverse reactions,"To report SUSPECTED ADVERSE REACTIONS, contact..."
9,XEOMIN,adverse reactions,6.1 Clinical Trials Experience


In [5]:
# Sentences that carry a "[see ...]" cross-reference (0-3 spaces after the
# bracket) are truncated at their first '[' so only the leading text remains.
# A raw-string regex replaces the four non-raw '\[' literals (invalid escape
# sequences), and the masked .loc assignment avoids writing to a filtered slice.
# NOTE(review): the cut happens at the FIRST '[' in the sentence, which is not
# necessarily the "[see" bracket -- confirm this is the intended behavior.
has_see_reference = drug_label_text['section_text'].str.contains(r'\[ {0,3}see', na=False)
drug_label_text.loc[has_see_reference, 'section_text'] = (
    drug_label_text.loc[has_see_reference, 'section_text'].map(lambda sentence: sentence.split('[')[0])
)
drug_label_text.head()

Unnamed: 0,drug_name,section_name,section_text
0,XEOMIN,adverse reactions,6 ADVERSE REACTIONS
1,XEOMIN,adverse reactions,The following adverse reactions to XEOMIN are ...
2,XEOMIN,adverse reactions,* Hypersensitivity
3,XEOMIN,adverse reactions,* Dysphagia and Breathing Difficulties in Tre...
4,XEOMIN,adverse reactions,* Spread of Effects from Toxin


In [6]:
# Template boilerplate sentences. Sentences whose edit distance to one of these
# templates falls under a per-template threshold are collected for removal.
# Each template is built by implicit string concatenation, so every fragment
# must carry its own trailing space (the first template previously read
# "conditions,adverse" because that space was missing).
generics = [
    ('Because clinical trials are conducted under widely varying conditions, '
     'adverse reaction rates observed in the clinical trials of a drug cannot be directly compared to rates in the clinical trials '
     'of another drug and may not reflect the rates observed in practice.'),
    ('Because these reactions are reported voluntarily from a population of uncertain size, '
     'it is not always possible to reliably estimate their frequency or establish a causal relationship to drug exposure.'),
    'See full prescribing information for complete boxed warning. ',
    ('To report SUSPECTED ADVERSE REACTIONS, contact Amgen Medical Information at 1-800-77-AMGEN (1-800-772-6436) '
     'or FDA at 1-800-FDA-1088 or    www.fda.gov/medwatch    .'),
]
def remove_similar_strings(string_list, generic, threshold=100):
    """Return the strings whose Levenshtein distance to `generic` is below `threshold`.

    Despite the name, nothing is removed here: the function only collects the
    near-duplicates of `generic` so the caller can strip them later.

    Parameters
    ----------
    string_list : iterable of str
        Candidate strings (e.g. the unique sentences of a corpus).
    generic : str
        Template string the candidates are compared against.
    threshold : int, default 100
        Exclusive upper bound on the edit distance for a candidate to be returned.

    Returns
    -------
    list of str
        Matching candidates, in input order (duplicates are kept).
    """
    def levenshtein_distance(s1, s2):
        # Two-row dynamic programme; the shorter string indexes the row so the
        # row stays as small as possible.
        if len(s1) > len(s2):
            s1, s2 = s2, s1
        previous_row = list(range(len(s1) + 1))
        for i2, c2 in enumerate(s2):
            current_row = [i2 + 1]
            for i1, c1 in enumerate(s1):
                if c1 == c2:
                    current_row.append(previous_row[i1])
                else:
                    current_row.append(1 + min(previous_row[i1], previous_row[i1 + 1], current_row[-1]))
            previous_row = current_row
        return previous_row[-1]

    generic_length = len(generic)
    omit = []
    for candidate in string_list:
        # The length difference is a lower bound on the edit distance, so a
        # candidate that already fails on length can skip the quadratic DP
        # without changing the result.
        if abs(len(candidate) - generic_length) >= threshold:
            continue
        if levenshtein_distance(generic, candidate) < threshold:
            omit.append(candidate)
    return omit

In [7]:
# Edit-distance cutoffs, paired positionally with the templates in `generics`.
thresholds = [100, 100, 25, 50]
# Gather, template by template, every unique sentence that is close enough to
# that template to be treated as boilerplate.
omit = [
    sentence
    for threshold, generic in zip(thresholds, generics)
    for sentence in remove_similar_strings(drug_label_text.section_text.unique(), generic, threshold = threshold)
]
omit

['Because clinical trials are conducted under widely varying conditions, adverse reaction rates observed in the clinical trials of a drug cannot be directly compared to rates in the clinical trials of another drug, and may not reflect the rates observed in practice.',
 'Because clinical trials are conducted under widely varying conditions, adverse reaction rates observed in the clinical trials of a drug cannot be directly compared to rates in the clinical trials of another drug and may not reflect the rates observed in practice.',
 'Because clinical trials are conducted under widely varying conditions, adverse reaction rates observed in the clinical trials of a drug cannot be directly compared to rates in the clinical trials of other drugs and may not reflect the rates observed in clinical practice.',
 'Because clinical trials are conducted under widely varying conditions, adverse reaction rates observed in the clinical trials of a drug cannot be directly compared to rates in the clini

In [8]:
# Sentences that are entirely wrapped in parentheses: first char '(' and last char ')'.
bracket_omit = (
    drug_label_text['section_text']
    .loc[lambda sentences: (sentences.str[0] == '(') & (sentences.str[-1] == ')')]
    .unique()
    .tolist()
)
# Sentences that begin with the character '5' or '6'.
# NOTE(review): presumably meant to catch numbered headings such as
# "6 ADVERSE REACTIONS" / "6.1 ...", but any sentence starting with those
# digits is selected as well -- confirm this is acceptable.
subtitle_omit = (
    drug_label_text['section_text']
    .loc[lambda sentences: sentences.str.startswith('5') | sentences.str.startswith('6')]
    .unique()
    .tolist()
)

---

In [9]:
# Reload the training CSV without any sentence splitting, so each row still
# holds a whole section's text. The removal lists built above (omit,
# bracket_omit, subtitle_omit) are applied to this copy in the cells below.
drug_label_text_raw = pd.read_csv(folder+'train_drug_label_text.csv')
drug_label_text_raw.head(1)

Unnamed: 0,drug_name,section_name,section_text
0,XEOMIN,adverse reactions,6 ADVERSE REACTIONS\n\n The following adv...


In [10]:
# Strip every "[see ...]" cross-reference (0-3 spaces after the bracket, up to
# the closing ']') from the full section text.
# The previous implementation extracted only the FIRST bracketed span of each
# section and removed just that string, so any further cross-references in the
# same section were left in place (and a leading non-"see" bracket was removed
# by mistake). A single regex substitution handles all occurrences, and the
# raw-string pattern avoids the invalid '\[' escape in a normal string literal.
drug_label_text_raw['section_text'] = drug_label_text_raw['section_text'].str.replace(
    r'\[ {0,3}see[^\]]*\]', '', regex=True
)
drug_label_text_raw.head()

Unnamed: 0,drug_name,section_name,section_text
0,XEOMIN,adverse reactions,6 ADVERSE REACTIONS\n\n The following adv...
1,XEOMIN,boxed warnings,\n\n BOXED WARNING: WARNING: DISTANT SPREAD...
2,XEOMIN,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCERP...
3,QUTENZA,adverse reactions,6 ADVERSE REACTIONS\n\n The following ser...
4,QUTENZA,warnings and precautions,5 WARNINGS AND PRECAUTIONS\n\n\n\n EXCER...


In [11]:
# Delete every collected boilerplate string from each full section text.
# The three lists are applied in the same order as before: near-template
# sentences, fully parenthesised sentences, then '5'/'6'-prefixed sentences.
removal_targets = omit + bracket_omit + subtitle_omit
section_texts_removed = []
for raw_section in drug_label_text_raw['section_text']:
    cleaned_section = raw_section
    for target in removal_targets:
        cleaned_section = cleaned_section.replace(target, '')
    section_texts_removed.append(cleaned_section.strip())
drug_label_text_raw['section_text_removed'] = section_texts_removed

In [12]:
# Number of characters removed from each section by the boilerplate stripping.
drug_label_text_raw['len_shortened'] = [
    len(before) - len(after)
    for before, after in zip(drug_label_text_raw['section_text'], drug_label_text_raw['section_text_removed'])
]

In [13]:
# Summary statistics of the per-section character reduction, as a sanity check
# on how much text the removal step stripped.
drug_label_text_raw['len_shortened'].describe()

count     239.000000
mean      337.150628
std       248.554245
min        14.000000
25%       105.000000
50%       303.000000
75%       533.000000
max      1318.000000
Name: len_shortened, dtype: float64

In [14]:
# Persist the cleaned training sections; the diagnostic length column is left out of the file.
train_output = drug_label_text_raw.drop(columns='len_shortened')
train_output.to_csv(folder+'train_drug_label_text_remove_unnecessary_info.csv', index = False)

## test

In [15]:
# Break every test-set section into individual sentences so that boilerplate
# sentences (assumed to recur across many labels) can be identified below.
# Step 1: one row per non-blank line; step 2: one row per '. '-separated sentence.
drug_label_text = (
    pd.read_csv(folder+'test_drug_label_text.csv')
    .assign(section_text=lambda frame: frame['section_text'].apply(
        lambda text: [line.strip() for line in text.strip().split('\n') if line.strip() != '']))
    .explode('section_text')
    .reset_index(drop=True)
    .assign(section_text=lambda frame: frame['section_text'].apply(lambda line: line.split('. ')))
    .explode('section_text')
    .reset_index(drop=True)
    .assign(section_text=lambda frame: frame['section_text'].apply(lambda sentence: sentence.strip()))
)
drug_label_text.head(10)

Unnamed: 0,drug_name,section_name,section_text
0,IMPAVIDO,adverse reactions,6 ADVERSE REACTIONS
1,IMPAVIDO,adverse reactions,Because clinical trials are conducted under wi...
2,IMPAVIDO,adverse reactions,EXCERPT: * Adverse reactions occurring i...
3,IMPAVIDO,adverse reactions,"To report SUSPECTED ADVERSE REACTIONS, contact..."
4,IMPAVIDO,adverse reactions,at 1-888-550-6060 or FDA at 1-800-FDA-1088 ...
5,IMPAVIDO,adverse reactions,6.1 Clinical Trials Experience
6,IMPAVIDO,adverse reactions,Visceral Leishmaniasis
7,IMPAVIDO,adverse reactions,One Phase 3 trial was conducted in patients >=...
8,IMPAVIDO,adverse reactions,Two-hundred and ninety-nine (299) patients (21...
9,IMPAVIDO,adverse reactions,Patients ranged between 12 and 64 years of age


In [16]:
# Sentences that carry a "[see ...]" cross-reference (0-3 spaces after the
# bracket) are truncated at their first '[' so only the leading text remains.
# A raw-string regex replaces the four non-raw '\[' literals (invalid escape
# sequences), and the masked .loc assignment avoids writing to a filtered slice.
# NOTE(review): the cut happens at the FIRST '[' in the sentence, which is not
# necessarily the "[see" bracket -- confirm this is the intended behavior.
has_see_reference = drug_label_text['section_text'].str.contains(r'\[ {0,3}see', na=False)
drug_label_text.loc[has_see_reference, 'section_text'] = (
    drug_label_text.loc[has_see_reference, 'section_text'].map(lambda sentence: sentence.split('[')[0])
)
drug_label_text.head()

Unnamed: 0,drug_name,section_name,section_text
0,IMPAVIDO,adverse reactions,6 ADVERSE REACTIONS
1,IMPAVIDO,adverse reactions,Because clinical trials are conducted under wi...
2,IMPAVIDO,adverse reactions,EXCERPT: * Adverse reactions occurring i...
3,IMPAVIDO,adverse reactions,"To report SUSPECTED ADVERSE REACTIONS, contact..."
4,IMPAVIDO,adverse reactions,at 1-888-550-6060 or FDA at 1-800-FDA-1088 ...


In [17]:
# Template boilerplate sentences. Sentences whose edit distance to one of these
# templates falls under a per-template threshold are collected for removal.
# Each template is built by implicit string concatenation, so every fragment
# must carry its own trailing space (the first template previously read
# "conditions,adverse" because that space was missing).
generics = [
    ('Because clinical trials are conducted under widely varying conditions, '
     'adverse reaction rates observed in the clinical trials of a drug cannot be directly compared to rates in the clinical trials '
     'of another drug and may not reflect the rates observed in practice.'),
    ('Because these reactions are reported voluntarily from a population of uncertain size, '
     'it is not always possible to reliably estimate their frequency or establish a causal relationship to drug exposure.'),
    'See full prescribing information for complete boxed warning. ',
    ('To report SUSPECTED ADVERSE REACTIONS, contact Amgen Medical Information at 1-800-77-AMGEN (1-800-772-6436) '
     'or FDA at 1-800-FDA-1088 or    www.fda.gov/medwatch    .'),
]
def remove_similar_strings(string_list, generic, threshold=100):
    """Return the strings whose Levenshtein distance to `generic` is below `threshold`.

    Despite the name, nothing is removed here: the function only collects the
    near-duplicates of `generic` so the caller can strip them later.

    Parameters
    ----------
    string_list : iterable of str
        Candidate strings (e.g. the unique sentences of a corpus).
    generic : str
        Template string the candidates are compared against.
    threshold : int, default 100
        Exclusive upper bound on the edit distance for a candidate to be returned.

    Returns
    -------
    list of str
        Matching candidates, in input order (duplicates are kept).
    """
    def levenshtein_distance(s1, s2):
        # Two-row dynamic programme; the shorter string indexes the row so the
        # row stays as small as possible.
        if len(s1) > len(s2):
            s1, s2 = s2, s1
        previous_row = list(range(len(s1) + 1))
        for i2, c2 in enumerate(s2):
            current_row = [i2 + 1]
            for i1, c1 in enumerate(s1):
                if c1 == c2:
                    current_row.append(previous_row[i1])
                else:
                    current_row.append(1 + min(previous_row[i1], previous_row[i1 + 1], current_row[-1]))
            previous_row = current_row
        return previous_row[-1]

    generic_length = len(generic)
    omit = []
    for candidate in string_list:
        # The length difference is a lower bound on the edit distance, so a
        # candidate that already fails on length can skip the quadratic DP
        # without changing the result.
        if abs(len(candidate) - generic_length) >= threshold:
            continue
        if levenshtein_distance(generic, candidate) < threshold:
            omit.append(candidate)
    return omit
# Edit-distance cutoffs, paired positionally with the templates in `generics`.
thresholds = [100, 100, 25, 50]
# Gather, template by template, every unique test-set sentence that is close
# enough to that template to be treated as boilerplate.
omit = [
    sentence
    for threshold, generic in zip(thresholds, generics)
    for sentence in remove_similar_strings(drug_label_text.section_text.unique(), generic, threshold = threshold)
]
# Sentences that are entirely wrapped in parentheses: first char '(' and last char ')'.
bracket_omit = (
    drug_label_text['section_text']
    .loc[lambda sentences: (sentences.str[0] == '(') & (sentences.str[-1] == ')')]
    .unique()
    .tolist()
)
# Sentences that begin with the character '5' or '6'.
# NOTE(review): presumably meant to catch numbered headings, but any sentence
# starting with those digits is selected as well -- confirm this is acceptable.
subtitle_omit = (
    drug_label_text['section_text']
    .loc[lambda sentences: sentences.str.startswith('5') | sentences.str.startswith('6')]
    .unique()
    .tolist()
)

In [18]:
# Reload the test CSV without any sentence splitting, so each row still holds a
# whole section's text; the removal lists built above are applied to this copy.
drug_label_text_raw = pd.read_csv(folder+'test_drug_label_text.csv')
# Not the last expression of the cell, so this preview is not rendered.
drug_label_text_raw.head(1)
# Strip every "[see ...]" cross-reference (0-3 spaces after the bracket, up to
# the closing ']') from the full section text.
# The previous implementation extracted only the FIRST bracketed span of each
# section and removed just that string, so any further cross-references in the
# same section were left in place (and a leading non-"see" bracket was removed
# by mistake). A single regex substitution handles all occurrences, and the
# raw-string pattern avoids the invalid '\[' escape in a normal string literal.
drug_label_text_raw['section_text'] = drug_label_text_raw['section_text'].str.replace(
    r'\[ {0,3}see[^\]]*\]', '', regex=True
)
# Delete every collected boilerplate string from each full section text.
# The three lists are applied in the same order as before: near-template
# sentences, fully parenthesised sentences, then '5'/'6'-prefixed sentences.
removal_targets = omit + bracket_omit + subtitle_omit
section_texts_removed = []
for raw_section in drug_label_text_raw['section_text']:
    cleaned_section = raw_section
    for target in removal_targets:
        cleaned_section = cleaned_section.replace(target, '')
    section_texts_removed.append(cleaned_section.strip())
drug_label_text_raw['section_text_removed'] = section_texts_removed
# Number of characters removed from each section by the boilerplate stripping.
drug_label_text_raw['len_shortened'] = [
    len(before) - len(after)
    for before, after in zip(drug_label_text_raw['section_text'], drug_label_text_raw['section_text_removed'])
]
# Computed mid-cell, so the summary is not rendered.
drug_label_text_raw['len_shortened'].describe()
# Persist the cleaned test sections; the diagnostic length column is left out of the file.
test_output = drug_label_text_raw.drop(columns='len_shortened')
test_output.to_csv(folder+'test_drug_label_text_remove_unnecessary_info.csv', index = False)
