In [1]:
import os
import string
try:
    # cElementTree was removed in Python 3.9; ElementTree offers the same API.
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import nltk
import pandas as pd

### Get the files for parsing

In [3]:
# set to the appropriate folder on your local drive
# A notebook has no __file__, so the working directory stands in for the code folder.
codefolder = os.getcwd()
print(codefolder)
# The dataset is expected next to the "Code" folder. Only the final path component
# is swapped, so a "Code" appearing elsewhere in the path is left untouched.
parent_folder, leaf_folder = os.path.split(codefolder)
if leaf_folder == "Code":
    datafolder = os.path.join(parent_folder, "Dataset", "training-RiskFactors-Complete-Set1")
else:
    datafolder = codefolder
print (datafolder)

filenames = []
xml_contents = []

# os.listdir returns entries in arbitrary order; sorting keeps positional
# lookups such as filenames[30] reproducible between runs and machines.
for file in sorted(os.listdir(datafolder)):
    filename = os.path.join(datafolder, file)
    if filename.endswith('.xml'): # select xml files
        filenames.append(filename)


C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final Project\lheart-disease-risk-prediction\Code
C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final Project\lheart-disease-risk-prediction\Dataset\training-RiskFactors-Complete-Set1


### Define function to get tokens & their attributes

In [4]:
def spans(text):
    """
    Tokenize ``text`` and locate every token inside it.

    The text is lower-cased and split with ``nltk.word_tokenize``. Each token is then
    searched for from left to right in the lower-cased text, so the returned offsets
    are character positions into ``text.lower()``.

    Input:
    text: string to tokenize

    Returns:
    list of (lower-cased) tokens, list of start offsets, list of end offsets (exclusive).
    A token that cannot be located gets -1 for both offsets.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    # word_tokenize rewrites a double quote as `` or '', so a quote token has to be
    # searched for under every spelling the raw text may have used.
    quote_spellings = ('"', "``", "''")
    cursor = 0
    start_indices = []
    end_indices = []
    for token in tokens:
        candidates = quote_spellings if token in quote_spellings else (token,)
        hits = [(lowered.find(candidate, cursor), len(candidate)) for candidate in candidates]
        hits = [hit for hit in hits if hit[0] >= 0]
        if not hits:
            # Keep the cursor where it is: feeding str.find's -1 back into it would
            # restart the search at the beginning and corrupt every later offset.
            start_indices.append(-1)
            end_indices.append(-1)
            continue
        start, length = min(hits)  # earliest match wins
        cursor = start + length
        start_indices.append(start)
        end_indices.append(cursor)
    return tokens, start_indices, end_indices

In [5]:
# spot-check: show one of the collected file paths (position 30 in the list)
print(filenames[30])

C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final Project\lheart-disease-risk-prediction\Dataset\training-RiskFactors-Complete-Set1\226-04.xml


### Define function to generate BIO Coding (Model 14 and 15)

BIO coding is a technique for extracting entities, wherein the input sentence is tokenized and analyzed for occurrences of words that belong to an entity of interest.  It uses the following scheme:

* B - marks beginning of the entity
* I - marks inside of the entity
* O - marks that the token is NOT part of any entity


In [6]:
def Generate_BIO_Coding(file_path, tag, attribute):
    """
    Tokenize the TEXT of an annotated XML file and BIO-code its tokens for one tag.

    Every element under TAGS whose name equals ``tag`` contributes the character
    spans of its same-named children that carry a 'start' attribute. The label for
    those spans is "<tag>.<attribute value>", lower-cased with spaces replaced by
    underscores.

    A token whose start offset equals the start of such a span is labelled
    "B-<label>". The tokens after it that begin inside the span are labelled
    "I-<label>". Every other token is labelled "O". Labels are assigned from token
    offsets, so the two returned lists always have the same length.

    Input:
    file_path: path of the file to be read in for processing
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc.
    attribute: specific attribute within the tag, from which to extract the value from

    Returns:
    list of tokens, list of labels (BIO coding)

    Raises:
    KeyError: a matching tag has no ``attribute``, or an annotation has 'start' but no 'end'
    """
    root = ET.ElementTree(file=file_path).getroot()

    text = root.find('TEXT').text.lower()
    tokens, start, _ = spans(text)

    # start offset -> (end offset, label). Only annotations of the requested tag are
    # kept, so another tag starting at the same offset cannot mask one of them.
    # When several annotations share a start, the first one wins.
    span_at = {}
    tags_element = root.find("TAGS")
    for item in (tags_element if tags_element is not None else ()):
        if item.tag != tag:
            continue
        label = (item.tag + "." + item.attrib[attribute]).lower().replace(" ", "_")
        for sub_item in item.findall(item.tag):
            if 'start' in sub_item.attrib:
                span_at.setdefault(
                    int(sub_item.attrib['start']),
                    (int(sub_item.attrib['end']), label),
                )

    bio_labels = []
    count = 0
    token_count = len(start)
    while count < token_count:
        span = span_at.get(start[count])
        if span is None:
            bio_labels.append("O")
            count += 1
            continue

        span_start = start[count]
        span_end, word_label = span
        bio_labels.append("B-" + word_label)
        count += 1
        # Continue the entity over the tokens that begin inside the annotated span.
        # The lower bound stops the run if an offset is missing (-1) or jumps back.
        while count < token_count and span_start <= start[count] < span_end:
            bio_labels.append("I-" + word_label)
            count += 1

    return tokens, bio_labels

In [7]:
def getBIOCoding_data(tag, attrib, filenames):
    """
    Run Generate_BIO_Coding on every file in ``filenames`` and concatenate the results.

    Input:
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attrib: attribute within the tag whose value is passed on as ``attribute`` (string)
    filenames: list of paths of the files to process

    Returns:
    list of tokens, list of labels (BIO coding), accumulated over all files in list order
    """
    collected_tokens, collected_labels = [], []

    for path in filenames:
        print("processing file ... ", path)
        file_tokens, file_labels = Generate_BIO_Coding(file_path=path, tag=tag, attribute=attrib)

        collected_tokens += file_tokens
        collected_labels += file_labels
        # running totals, so a token/label length mismatch is visible per file
        print("finished processing file ", path, "; and token length is ", len(collected_tokens), "; and label length: ", len(collected_labels))

    return collected_tokens, collected_labels


### Generate Data for Models

In [14]:
# Model #14: BIO-code every file for the SMOKER tag, labelled by its 'status' value
tag, attribute = 'SMOKER', 'status'

smoker_indicator_tokens, smoker_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)

In [18]:
# one row per token, with its BIO label alongside
dfs = pd.DataFrame(
    data={
        'token': smoker_indicator_tokens,
        'label': smoker_indicator_labels,
    },
)

In [19]:
# show only the tokens that received a B-/I- label
dfs.loc[dfs['label'] != 'O']

Unnamed: 0,label,token
97,B-smoker.past,no
98,I-smoker.past,smoking
99,I-smoker.past,for
100,I-smoker.past,3
101,I-smoker.past,months
519,B-smoker.past,smoking
520,I-smoker.past,:
521,I-smoker.past,quit
1007,B-smoker.past,smoking
1008,I-smoker.past,:


In [34]:
# every occurrence of the token 'allergic' together with the label it received
dfs.loc[dfs['token'] == 'allergic']

Unnamed: 0,label,token
1430,O,allergic
2328,O,allergic
3295,O,allergic
123603,O,allergic
174148,O,allergic
243868,O,allergic
255938,O,allergic
291284,O,allergic
297283,O,allergic
297290,O,allergic


In [28]:
# Model #15: BIO-code every file for the MEDICATION tag, labelled by its 'type1' value
tag, attribute = 'MEDICATION', 'type1'

medication1_indicator_tokens, medication1_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)


In [26]:
# one row per token, with its MEDICATION/type1 BIO label alongside
dfm1 = pd.DataFrame(
    data={
        'token': medication1_indicator_tokens,
        'label': medication1_indicator_labels,
    },
)

In [27]:
# show only the tokens that received a B-/I- label
dfm1.loc[dfm1['label'] != 'O']

Unnamed: 0,label,token
31,B-medication.diuretic,hctz
189,B-medication.calcium_channel_blocker,norvasc
201,B-medication.thienopyridine,plavix
209,B-medication.beta_blocker,atenolol
218,B-medication.aspirin,asa
231,B-medication.ace_inhibitor,zestril
243,B-medication.statin,lipitor
255,B-medication.diuretic,hctz
267,B-medication.nitrate,nitroglycerin
618,B-medication.aspirin,asa


In [30]:
# repeat the MEDICATION extraction, this time labelling by the 'type2' attribute
tag, attribute = 'MEDICATION', 'type2'

medication2_indicator_tokens, medication2_indicator_labels = getBIOCoding_data(
    tag=tag, attrib=attribute, filenames=filenames
)


In [31]:
# one row per token, with its MEDICATION/type2 BIO label alongside
dfm2 = pd.DataFrame(
    data={
        'token': medication2_indicator_tokens,
        'label': medication2_indicator_labels,
    },
)

In [38]:
# show only the tokens that received a B-/I- label
dfm2.loc[dfm2['label'] != 'O']

Unnamed: 0,label,token
31,B-medication.,hctz
189,B-medication.,norvasc
201,B-medication.,plavix
209,B-medication.,atenolol
218,B-medication.,asa
231,B-medication.,zestril
243,B-medication.,lipitor
255,B-medication.,hctz
267,B-medication.,nitroglycerin
618,B-medication.,asa
