In [170]:
import xml.etree.cElementTree as ET
import os
import nltk
import string
import pandas as pd

### Get the files for parsing

In [159]:
# set to the appropriate folder on your local drive
codefolder = os.path.dirname(os.path.abspath('__file__'))
datafolder = (codefolder.replace("Code", "Dataset\\training-RiskFactors-Complete-Set1"))
#print (datafolder)

filenames = []
xml_contents = []

for file in os.listdir(datafolder):
    filename = os.fsdecode(os.fsencode((str(datafolder)+'\\'+file)))
    if filename.endswith( ('.xml') ): # select xml files
        #print(filename)
        filenames.append(filename)


### Define function to get tokens & their attributes

In [160]:
def spans(text):
    tokens = nltk.word_tokenize(text.lower())
    offset = 0
    start_indices = []
    end_indices = []
    for token in tokens:
        offset = text.find(token, offset)
        start, end = offset, offset+len(token)
        offset += len(token)
        start_indices.append(start)
        end_indices.append(end)
    return tokens, start_indices, end_indices

In [161]:
print(filenames[30])

C:\Users\sudha\Documents\W266-NLP\Final-Project-W266\Dataset\training-RiskFactors-Complete-Set1\226-04.xml


### Define function to generate IO Coding (Model2)

IO coding is a technique for extracting entities, where in the input sentence is tokenized and analyzed for occurences of words that belong to an entity of interest.  It follows the following scheme:

* I - marks beginning/inside of the entity
* O - marks that the token is NOT part of any entity


In [162]:
def Generate_IO_Coding(file_path, tag, attribute):
    """
    Function defined to take in a file for processing, tag and attributes to identify
    within the file (xml).  The tags are first read into objects, capturing the label,
    start and end values.  These are references in the 'text' object, where based on
    which the annotators did identify the tag in context. 
    
    This information is used to parse through the text (as tokens) and perform the BIO
    coding based on the start position matches.
    
    Input: 
    file_path: path of the file to be read in for processing
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc.
    attribute: specific attribute within the tag, from which to extract the value from
    
    Returns: 
    list of tokens, list of labels (IO coding)
    """
    
    tree = ET.ElementTree(file=file_path)
    root = tree.getroot()

    text = root.find('TEXT').text.lower()
    
    tokens, start, end = spans(text)
    labels_list = []
    label_start = []
    label_end = []
    filename = []


    for item in root.find("TAGS"):
        if item.tag  == tag:
            label = (item.tag + "." + item.attrib[attribute]).lower().replace(" ", "_")
        else:
            label = ""

        for sub_item in item.findall(item.tag):
            if 'start' in sub_item.attrib.keys():
                labels_list.append(label)
                label_start.append(int(sub_item.attrib['start']))
                label_end.append(int(sub_item.attrib['end']))

        bio_labels = []

        count = 0

    while len(start) > count:
        if start[count] in label_start:
            label_start_index = label_start.index(start[count])
            end_index = label_end[label_start_index]
            word_label = labels_list[label_start_index]
            phrase = text[start[count]:end_index]
            phrase_tokens = nltk.word_tokenize(phrase)
            # update the tag to 'I-' so that this generates IO-Coding
            next_tag="I-"
            for word in phrase_tokens:
                if (len(word_label) > 0):
                    new_label = next_tag + word_label
                    bio_labels.append(new_label)
                    next_tag="I-"
                else:
                    bio_labels.append("O")
                count += 1
        else:
            bio_labels.append("O")
            count += 1 

    # build this list to hold name of the file the token belongs to
    # this is for the purpose of evaluation of the model from test results
    for i in range(0, len(tokens)):
        filename.append(file_path)
        
    return filename, tokens, bio_labels

In [163]:
def getIOCoding_data(tag, attrib, filenames):

    """
    All files in the list (which holds the list of files in the directory) are parsed through
    and the Generate_BIO_Coding function is called by passing individual files within the folder.
    
    The tags and attributes are passed on to the function as parameters.
    
    Input: 
    filenames: names of the file to be read in for processing in a list object
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attribute: specific attribute within the tag, from which to extract the value from (string)
    
    Returns: 
    list of tokens, list of labels (BIO coding, done across all files in the path for the tag/attribute)
    """
    
    all_tokens = []
    all_labels = []
    all_filenames = []

    for file in filenames:
        #print("processing file ... ", file)
        filename, tokens, bio_labels = Generate_IO_Coding(file_path=file, tag=tag, attribute=attrib)

        all_tokens.extend(tokens)
        all_labels.extend(bio_labels)
        all_filenames.extend(filename)  # this is for the purpose of validating test results
        #print("finished processing file ", file, "; and token length is ", len(all_tokens), "; and label length: ", len(all_labels))
        
    return all_filenames, all_tokens, all_labels


### Generate Data for Models

The goal is to generate the tokens and the corresponding labels specific to model of interest.  Below, we will be building the data for 'indicator' attribute for the following tags:

* DIABETES
* CAD
* HYPERTENSION
* HYPERLIPIDEMIA
* OBESE
* FAMILY_HIST

In [164]:
# get data for model #2
tag = 'DIABETES'
attribute = 'indicator'

diabetes_indicator_filenames, diabetes_indicator_tokens, diabetes_indicator_labels = getIOCoding_data(tag, attribute, filenames)
#diabetes_indicator_labels

In [165]:
# get data for model #4
tag = 'CAD'
attribute = 'indicator'

cad_indicator_filenames, cad_indicator_tokens, cad_indicator_labels = getIOCoding_data(tag, attribute, filenames)
#cad_indicator_labels

In [166]:
# get data for model #5
tag = 'HYPERTENSION'
attribute = 'indicator'

hypertension_indicator_filenames, hypertension_indicator_tokens, hypertension_indicator_labels = getIOCoding_data(tag, attribute, filenames)
#hypertension_indicator_labels

In [167]:
# get data for model #6
tag = 'HYPERLIPIDEMIA'
attribute = 'indicator'

hyperlipidemia_indicator_filenames, hyperlipidemia_indicator_tokens, hyperlipidemia_indicator_labels = getIOCoding_data(tag, attribute, filenames)


In [168]:
# get data for model #7
tag = 'OBESE'
attribute = 'indicator'

obese_indicator_filenames, obese_indicator_tokens, obese_indicator_labels = getIOCoding_data(tag, attribute, filenames)


In [169]:
# get data for model #8
tag = 'FAMILY_HIST'
attribute = 'indicator'

familyhist_indicator_filenames, familyhist_indicator_tokens, familyhist_indicator_labels = getIOCoding_data(tag, attribute, filenames)


### Write to Dataframe for processing

Capture into a dataframe, the data for model we are attempting to build. 

In [198]:
# capture data into dataframe to work with it
df = pd.DataFrame({'filename': cad_indicator_filenames, 'token': cad_indicator_tokens, 'label': cad_indicator_labels})

In [199]:
df[df['label']!='O']

Unnamed: 0,filename,token,label
145,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,known,I-cad.mention
146,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,hx,I-cad.mention
147,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,cad,I-cad.mention
166,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,coronary,I-cad.mention
167,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,artery,I-cad.mention
168,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,disease,I-cad.mention
169,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,s/p,I-cad.event
170,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,ant,I-cad.event
171,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,semi,I-cad.event
172,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,+,I-cad.event


In [201]:
df['label'].unique()

array(['O', 'I-cad.mention', 'I-cad.event', 'I-cad.test', 'I-cad.symptom'],
      dtype=object)

In [203]:
df['label'].value_counts()

O                404299
I-cad.test         1433
I-cad.event        1425
I-cad.mention       629
I-cad.symptom       285
Name: label, dtype: int64

In [204]:
#train_df = df[0:50000]  # use this to get a small set of training data
train_df = df

In [205]:
train_df[train_df['label']!='O']

Unnamed: 0,filename,token,label
145,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,known,I-cad.mention
146,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,hx,I-cad.mention
147,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,cad,I-cad.mention
166,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,coronary,I-cad.mention
167,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,artery,I-cad.mention
168,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,disease,I-cad.mention
169,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,s/p,I-cad.event
170,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,ant,I-cad.event
171,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,semi,I-cad.event
172,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,+,I-cad.event


### Form Test Data


In [72]:
# this was used for initial round of testing (using only files that start with 11 in testing folder)
# set to the appropriate folder on your local drive
codefolder = os.path.dirname(os.path.abspath('__file__'))
datafolder = (codefolder.replace("Code", "Dataset\\testing-RiskFactors-Complete"))
#print (datafolder)

testfilenames = []
xml_contents = []

for file in os.listdir(datafolder):
    filename = os.fsdecode(os.fsencode((str(datafolder)+'\\'+file)))
    #if filename.endswith( ('.xml') ): # select xml files
    if file.startswith( ('11') ):
        #print(filename)
        testfilenames.append(filename)


### Get Test Tokens for 'Hypertension' (to test bert output)

In [66]:
# get data for model #5
tag = 'HYPERTENSION'
attribute = 'indicator'

test_hypertension_indicator_filenames, test_hypertension_indicator_tokens, test_hypertension_indicator_labels = getIOCoding_data(tag, attribute, testfilenames)
#hypertension_indicator_labels

In [86]:
# capture data into dataframe to work with it
test_df_hypertension = pd.DataFrame({'filename': test_hypertension_indicator_filenames, 'test_token': test_hypertension_indicator_tokens, 'test_label': test_hypertension_indicator_labels})

In [68]:
# actual counts of labels in test set for hypertension mention and high_bp
test_df_hypertension['test_label'].value_counts()

O                         23091
I-hypertension.mention       45
I-hypertension.high_bp       24
Name: test_label, dtype: int64

### Value Counts of Labels from BERT Classifier

Value counts of labels from BERT classifier (manually obtained from test_results file which holds probabilities for each class):

* Class 0: 23097
* Class 1: 47
* Class 2: 16

### Running manual count checks

In [88]:
tdf.shape

(23160, 3)

In [89]:
# check random values to see if the labels match output from bert (bert_run1_test_results)

tdf[488:495]
# bert output predicted I-hypertension.mention for token in position 490

Unnamed: 0,test_token,test_label,file
488,),O,110-02.xml
489,hypertension,I-hypertension.mention,110-02.xml
490,.,O,110-02.xml
491,under,O,110-02.xml
492,good,O,110-02.xml
493,control,O,110-02.xml
494,.,O,110-02.xml


#### Extract only filename from full path

In [87]:
def get_filename(fullpath):
    return fullpath.replace("C:\\Users\\sudha\\Documents\\W266-NLP\\Final-Project-W266\\Dataset\\testing-RiskFactors-Complete\\", "")
    
tdf = test_df_hypertension
tdf['file'] = tdf['filename'].apply(get_filename)
tdf.drop('filename', 1, inplace=True)
tdf.head(10)

Unnamed: 0,test_token,test_label,file
0,record,O,110-01.xml
1,date,O,110-01.xml
2,:,O,110-01.xml
3,2069-04-07,O,110-01.xml
4,mr.,O,110-01.xml
5,villegas,O,110-01.xml
6,is,O,110-01.xml
7,seen,O,110-01.xml
8,today,O,110-01.xml
9,.,O,110-01.xml


### Extract Labeling by XML File

Using the dataset captured above, extract information on the count of I-hypertension.mention and I-hypertension.high_bp tags in each of the files passed in test dataset and map them to the corresponding TAG and INDICATOR values.  This gives a high level counts of the tags identified in each of the files included in the test data.  This will be useful for error analysis and should be a base point for constructing the tags and start/end points if needed.


In [104]:
tdf.groupby(['file', 'test_label']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,test_token
file,test_label,Unnamed: 2_level_1
110-01.xml,I-hypertension.mention,1
110-01.xml,O,333
110-02.xml,I-hypertension.high_bp,2
110-02.xml,I-hypertension.mention,1
110-02.xml,O,250
110-03.xml,I-hypertension.high_bp,3
110-03.xml,I-hypertension.mention,1
110-03.xml,O,510
110-04.xml,I-hypertension.mention,1
110-04.xml,O,787


## GENERATE TEST DATA FOR PREDICTION ON CLOUD

The steps below need to be generated only once for all MODELS.  Although individual models are built to predict each of the tokens such as I-hypertension.mention, I-diabetes.mention etc, test set is the same and is formed using XML files in the 'complete' folder (testing).


### Use code below to generate test dataset for BERT Classification Model

For initial testing, only a subset of the test files have been used. The code to fetch filenmaes need to be updated to get the full set (PLEASE REMEMBER TO FIX THIS FOR THE FINAL PREDICTIONS)


In [171]:
# set to the appropriate folder on your local drive
codefolder = os.path.dirname(os.path.abspath('__file__'))
datafolder = (codefolder.replace("Code", "Dataset\\testing-RiskFactors-Complete"))
#print (datafolder)

testfilenames = []
xml_contents = []

for file in os.listdir(datafolder):
    filename = os.fsdecode(os.fsencode((str(datafolder)+'\\'+file)))
    if filename.endswith( ('.xml') ): # select xml files
        #print(filename)
        testfilenames.append(filename)

In [176]:
print("There are {} test XML files for validating the model.".format(len(testfilenames)))

There are 514 test XML files for validating the model.


### Get the tokens and labels for TEST Dataset

ALthough filenames are captured, they can be ignored for generating the test dataset to use to predict against the model.  That will be needed only for validating the predictions from the BERT Model itself.

In [177]:
testset_filenames, testset_tokens, testset_labels = getIOCoding_data('', '', testfilenames)

In [181]:
#testset_tokens
test_df = pd.DataFrame({'token': testset_tokens})
test_df.head()

Unnamed: 0,token
0,record
1,date
2,:
3,2069-04-07
4,mr.


In [182]:
test_df.shape  # the number of tokens in the test file (to run predictions against)

(381953, 1)

### Generate train / dev / test files for Training BERT

In [206]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame

le = LabelEncoder()

train_df.head()

Unnamed: 0,filename,token,label
0,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,record,O
1,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,date,O
2,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,:,O
3,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,2067-05-03,O
4,C:\Users\sudha\Documents\W266-NLP\Final-Projec...,narrative,O


In [207]:
# Creating train and dev dataframes according to BERT
df_bert = pd.DataFrame({'user_id':train_df.index,
            'label':le.fit_transform(train_df['label']),
            'alpha':['a']*train_df.shape[0],
            'text':train_df['token'].replace(r'\n',' ',regex=True)})
 
# don't use the 10% random split
#df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.01)

In [208]:
# set train and dev sets to be a sequential set of values
print("Unique labels in the dataset: ", df_bert['label'].unique())
print("Value counts by labels: \n", df_bert['label'].value_counts())

Unique labels in the dataset:  [4 1 0 3 2]
Value counts by labels: 
 4    404299
3      1433
0      1425
1       629
2       285
Name: label, dtype: int64


In [209]:
df_bert_train = df_bert[0:350000]
df_bert_dev   = df_bert[350001:]

In [210]:
print("train set: ", df_bert_train.shape, " and dev set: ", df_bert_dev.shape)

train set:  (350000, 4)  and dev set:  (58070, 4)


In [211]:
df_bert_train.head()

Unnamed: 0,user_id,label,alpha,text
0,0,4,a,record
1,1,4,a,date
2,2,4,a,:
3,3,4,a,2067-05-03
4,4,4,a,narrative


In [212]:
 # Creating test dataframe according to BERT (CoLA Format)
# Saving dataframes to .tsv format as required by BERT
# Save into folder meant for that model (hypertension / diabetes / CAD) 
#df_bert_train.to_csv('data_for_bert/diabetes_ind/train.tsv', sep='\t', index=False, header=False)
#df_bert_dev.to_csv('data_for_bert/diabetes_ind/dev.tsv', sep='\t', index=False, header=False)

# train / dev sets for CAD-Indicator
df_bert_train.to_csv('data_for_bert/CAD_ind/train.tsv', sep='\t', index=False, header=False)
df_bert_dev.to_csv('data_for_bert/CAD_ind/dev.tsv', sep='\t', index=False, header=False)



In [183]:
 # Creating test dataframe according to BERT (CoLA Format)
df_bert_test = pd.DataFrame({'id':test_df.index,
                 'sentence':test_df['token'].replace(r'\n',' ',regex=True)})

# Saving dataframes to .tsv format as required by BERT - same for all models (so generated only once)
df_bert_test.to_csv('data_for_bert/test.tsv', sep='\t', index=False, header=True)

### Prepare Sequence Labeling Dataset 

Preapare the train / test / dev datasets so that only the token and the tags are included.  This is for use to predict sequential labels.  This can be ignored for now, as we are using the 'Cola' model approach.

In [26]:
# columns are: {'token', 'label'}
df_bert_train, df_bert_dev = train_test_split(train_df, test_size=0.01)

In [28]:
# Saving dataframes to .txt format as required by BERT
df_bert_train.to_csv('../bert/data/train.txt', sep='\t', index=False, header=False)
df_bert_dev.to_csv('../bert/data/dev.txt', sep='\t', index=False, header=False)
test_df.to_csv('../bert/data/test.txt', sep='\t', index=False, header=True)