In [1]:
import xml.etree.cElementTree as ET
import os
import nltk
import string
import random
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from pandas import DataFrame
import nltk.data
from nltk import sent_tokenize

### Get Training Files for Parsing

In [98]:
# Collect the training XML files that live under the notebook's working directory.
# NOTE: os.path.abspath('__file__') resolves the *literal* string '__file__'
# against the current working directory, so `wd` is effectively the cwd.
wd = os.path.dirname(os.path.abspath('__file__'))
folders = ["\\Dataset\\training-RiskFactors-Complete-Set1\\", "\\Dataset\\training-RiskFactors-Complete-Set2\\"]

filenames = []

for folder in folders:
    # Rebuild the folder path from its components so the backslash-separated
    # entries above also resolve on platforms that use a different separator.
    folder_path = os.path.join(str(wd), *folder.strip("\\").split("\\"))
    # os.listdir returns entries in arbitrary order; sorting keeps the seeded
    # shuffle that is applied to this list later reproducible across machines.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.xml'):  # select xml files
            filenames.append(os.path.join(folder_path, file))


In [99]:
# Report how many training XML files were collected.
training_summary = "There are in total, {} files as part of the training set. "
print(training_summary.format(len(filenames)))

There are in total, 790 files as part of the training set. 


### Get Testing specific files

In [100]:
# Collect the held-out test XML files that live under the working directory.
# NOTE: os.path.abspath('__file__') resolves the *literal* string '__file__'
# against the current working directory, so `wd` is effectively the cwd.
wd = os.path.dirname(os.path.abspath('__file__'))
datafolder = ["\\Dataset\\testing-RiskFactors-Complete\\"]

testfilenames = []

for folder in datafolder:
    # Rebuild the folder path from its components so the backslash-separated
    # entry above also resolves on platforms that use a different separator.
    folder_path = os.path.join(str(wd), *folder.strip("\\").split("\\"))
    # os.listdir returns entries in arbitrary order; sorting gives the test
    # rows (and the ids derived from their position) a stable order.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.xml'):  # select xml files
            testfilenames.append(os.path.join(folder_path, file))

print("There are {} test XML files for validating the model.".format(len(testfilenames)))


There are 514 test XML files for validating the model.


### Define function to get sentences and labels

In [26]:
def get_sentences(file):
    """Return the non-blank text lines of a record, sentence by sentence.

    The content of the ``TEXT`` element is split with ``sent_tokenize``; each
    resulting sentence is further split on newlines, and pieces that consist
    only of spaces (or are empty) are discarded.

    Input:
    file: path (or file object) of the XML record to read

    Returns:
    List of strings, in document order
    """
    root = ET.ElementTree(file=file).getroot()
    text = root.find('TEXT').text

    all_sentences = []
    for sent in sent_tokenize(text):
        if sent == '\n':
            continue
        for piece in sent.split('\n'):
            # only spaces are stripped for the blank check, as in the original logic
            if piece.replace(' ', '') != '':
                all_sentences.append(piece)

    return all_sentences

### Function definition for processing a file

This function takes the filename, tag and attribute as inputs and generates the sentences and their corresponding labels, which form the dataset for training the model. The logic of the function 'get_sentences' is incorporated (inlined) into the function below.

In [95]:
def process_file(file, tag, attribute):
    """Label every sentence of one annotated XML record for a tag/attribute pair.

    The record's ``TEXT`` is split into sentences (``sent_tokenize``, then a
    split on newlines, dropping pieces that are empty or only spaces).  The
    annotations nested under each ``TAGS/<tag>`` element supply pairs of
    (evidence text, attribute value).  A sentence that contains an evidence
    text receives that annotation's attribute value as its label; every other
    sentence is labelled ``'Other'``.

    Input:
    file: path (or file object) of the XML record to read
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attribute: attribute of the nested annotations whose value becomes the label (string)

    Returns:
    Dict mapping sentence -> label (identical sentences within a record share
    one entry), or an empty dict when no sentence matched any evidence text.
    """
    root = ET.ElementTree(file=file).getroot()

    # get all sentences in the file
    text = root.find('TEXT').text
    all_sentences = [
        piece
        for sent in sent_tokenize(text) if sent != '\n'
        for piece in sent.split('\n')
        if piece.replace(' ', '') != ''
    ]

    # Gather (evidence text, attribute value) pairs in document order.
    sub_tags = []
    tags_node = root.find("TAGS")
    for item in (tags_node if tags_node is not None else []):
        if item.tag != tag:
            continue
        for sub_item in item.findall(item.tag):
            evidence = sub_item.attrib.get('text')
            # An empty evidence string is a substring of every sentence and
            # would label the whole record, so it is skipped; annotations that
            # lack the requested attribute cannot provide a label either.
            if evidence and attribute in sub_item.attrib:
                sub_tags.append((evidence, sub_item.attrib[attribute]))

    sent_label = {}
    matched_any = False
    for sent in all_sentences:
        label = 'Other'
        # Iterate in document order (not over a set) so that, when several
        # annotations match one sentence, the winner is the last annotation in
        # the file on every run instead of depending on hash ordering.
        for evidence, value in sub_tags:
            if evidence in sent:
                label = value
                matched_any = True
        sent_label[sent] = label

    # return empty dict if no tag found in file
    # else, return the sentences with the labels
    return sent_label if matched_any else {}


In [96]:
def get_TrainingData(tag, attrib, filenames, pct_split=0.9):

    """
    All files in the list (which holds the list of files in the directory) are parsed
    through to generate the training / dev datasets for the tag/attribute in context.

    The files are shuffled with a fixed seed and split by position: the first
    ``pct_split`` share feeds the training set, the remainder the dev set.
    The shuffle works on a copy with a private generator, so the caller's list
    and the global ``random`` state are left untouched and repeated calls
    (e.g. once per tag) always produce the same file split.

    Input:
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attrib: specific attribute within the tag, from which to extract the value from (string)
    filenames: names of the file to be read in for processing in a list object
    pct_split: fraction of the files assigned to the training set (default 0.9)

    Returns:
    Tuple (train_df, dev_df) of dataframes with the columns
    'filename', 'sentence' and 'label' (for the tag/attribute)
    """

    # Same permutation as random.seed(42) + random.shuffle, but without
    # mutating the argument or reseeding the module-level generator.
    shuffled = list(filenames)
    random.Random(42).shuffle(shuffled)

    # using a 90/10 split by default unless specified as parameter
    split_index = int(len(shuffled) * pct_split)

    def _frame_for(files):
        # One row per (file, sentence) pair produced by process_file.
        rows = [
            (path, sentence, label)
            for path in files
            for sentence, label in process_file(file=path, tag=tag, attribute=attrib).items()
        ]
        return pd.DataFrame({'filename': [row[0] for row in rows],
                             'sentence': [row[1] for row in rows],
                             'label': [row[2] for row in rows]})

    train_df = _frame_for(shuffled[:split_index])
    dev_df = _frame_for(shuffled[split_index:])
    return train_df, dev_df


In [97]:
def get_TestData(tag, attrib, filenames):

    """
    Build the test dataset for one tag/attribute pair.

    Every file in the list is run through ``process_file``; each
    (sentence, label) pair it yields becomes one row, tagged with the
    file it came from.

    Input:
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attrib: specific attribute within the tag, from which to extract the value from (string)
    filenames: names of the file to be read in for processing in a list object

    Returns:
    Dataframe of the test dataset (columns 'filename', 'sentence', 'label')
    """

    rows = []
    for path in filenames:
        labelled = process_file(file=path, tag=tag, attribute=attrib)
        rows.extend((path, sentence, label) for sentence, label in labelled.items())

    columns = {
        'filename': [row[0] for row in rows],
        'sentence': [row[1] for row in rows],
        'label': [row[2] for row in rows],
    }
    return pd.DataFrame(columns)


In [103]:
def get_filename(fullpath):
    """Return only the file name (e.g. ``284-04.xml``) of a training file path.

    The name is taken as everything after the last path separator, so the
    result no longer depends on the notebook living under one particular
    absolute directory.  Both ``\\`` and ``/`` are treated as separators.

    Input:
    fullpath: path of the file (string)

    Returns:
    The last path component (string); a bare file name is returned unchanged
    """
    return fullpath.replace("\\", "/").rsplit("/", 1)[-1]


In [104]:
def get_testfilename(fullpath):
    """Return only the file name (e.g. ``110-03.xml``) of a test file path.

    The name is taken as everything after the last path separator, so the
    result no longer depends on the notebook living under one particular
    absolute directory.  Both ``\\`` and ``/`` are treated as separators.

    Input:
    fullpath: path of the file (string)

    Returns:
    The last path component (string); a bare file name is returned unchanged
    """
    return fullpath.replace("\\", "/").rsplit("/", 1)[-1]


### Generate Data for Models

The goal is to generate the sentences and the corresponding labels specific to the model of interest.  Below, we will be building the data for the 'indicator' attribute of the following tags:

* DIABETES
* CAD
* HYPERTENSION
* HYPERLIPIDEMIA
* OBESE
* FAMILY_HIST

Update the tag / attribute in the cell below to generate the training / dev / test datasets for each of them.

In [107]:
# get data for model #2
tag = 'DIABETES'
attribute = 'indicator'

# training & dev datasets
df_train, df_dev = get_TrainingData(tag, attribute, filenames)

# Swap the full path for the bare file name.  `columns=` is spelled out because
# recent pandas releases no longer accept the axis as a positional argument of
# DataFrame.drop, and reassigning (instead of inplace=True) keeps the cell
# safe to re-run.
df_train['file'] = df_train['filename'].apply(get_filename)
df_train = df_train.drop(columns='filename')

df_dev['file'] = df_dev['filename'].apply(get_filename)
df_dev = df_dev.drop(columns='filename')

# test dataset
df_test = get_TestData(tag, attribute, testfilenames)
df_test['file'] = df_test['filename'].apply(get_testfilename)
df_test = df_test.drop(columns='filename')


In [113]:
# Preview the first three rows of the test frame.
df_test.head(3)

Unnamed: 0,sentence,label,file
0,Record date: 2080-02-18,Other,110-03.xml
1,SDU JAR Admission Note,Other,110-03.xml
2,Name: \t Yosef Villegas,Other,110-03.xml


In [114]:
# Preview the first three rows of the training frame.
df_train.head(3)

Unnamed: 0,sentence,label,file
0,Record date: 2072-12-04,Other,284-04.xml
1,HPI: 81 y.o.w.,Other,284-04.xml
2,"with multiple medical problems including DM, H...",mention,284-04.xml


In [115]:
# Preview the first three rows of the dev frame.
df_dev.head(3)

Unnamed: 0,sentence,label,file
0,Record date: 2097-03-25,Other,270-02.xml
1,"Patient Name: Whitaker, Vincent",Other,270-02.xml
2,MRN: 29964344,Other,270-02.xml


In [112]:
# Print the (rows, columns) shape of the train, dev and test frames, in that order
for split_frame in (df_train, df_dev, df_test):
    print(split_frame.shape)

(47888, 3)
(4482, 3)
(35556, 3)


### Write the TRAINING & DEV Datasets

Capture the TRAINING and DEV datasets for the tag/attribute in context and write to the appropriate folder for processing.


In [144]:
le = LabelEncoder()

# refer (https://blog.insightdatascience.com/using-bert-for-state-of-the-art-pre-training-for-natural-language-processing-1d87142c29e7)

# Fit the encoder ONCE on the labels of both splits and only transform afterwards.
# Calling fit_transform separately on train and dev re-learns the mapping each
# time, so the same class could receive different integer ids in the two files
# whenever one split is missing a class.
le.fit(pd.concat([df_train['label'], df_dev['label']]))

# Tabs are replaced together with newlines: the frames are written as
# tab-separated files further down, where a raw tab inside a sentence would
# collide with the column delimiter.

# get the training set for BERT in the required format
df_train_bert = pd.DataFrame({'user_id':df_train.index,
            'label':le.transform(df_train['label']),
            'alpha':['a']*df_train.shape[0],
            'text':df_train['sentence'].replace(r'[\t\r\n]',' ',regex=True)})


# get the dev set for BERT in the required format
df_dev_bert = pd.DataFrame({'user_id':df_dev.index,
            'label':le.transform(df_dev['label']),
            'alpha':['a']*df_dev.shape[0],
            'text':df_dev['sentence'].replace(r'[\t\r\n]',' ',regex=True)})


# Creating test dataframe according to BERT (CoLA Format)
df_test_bert = pd.DataFrame({'id':df_test.index,
                 'sentence':df_test['sentence'].replace(r'[\t\r\n]',' ',regex=True)})


In [130]:
# get train/dev sets for BERT-NER

# get the training set for BERT-NER in the required format (only token and label)
#df_train_bert_ner = DI_train[['token', 'label']]
#df_dev_bert_ner = DI_dev[['token', 'label']]


In [131]:
# Preview the first rows of the BERT-formatted training frame.
df_train_bert.head()

Unnamed: 0,user_id,label,alpha,text
0,0,1,a,Record date: 2072-12-04
1,1,1,a,HPI: 81 y.o.w.
2,2,3,a,"with multiple medical problems including DM, H..."
3,3,1,a,accompanied by her great great niece who provi...
4,4,1,a,missed few appointments - last time seen 8 mon...


In [132]:
# Show which encoded labels occur in the training and dev frames and how often
for position, (split_name, bert_frame) in enumerate([("training", df_train_bert), ("dev", df_dev_bert)]):
    if position:
        # blank separator between the two reports
        print("\n")
    encoded_labels = bert_frame['label']
    print("Unique labels in the {} dataset: ".format(split_name), encoded_labels.unique())
    print("Value counts by labels ({} dataset): \n".format(split_name), encoded_labels.value_counts())


Unique labels in the training dataset:  [1 3 0 2]
Value counts by labels (training dataset): 
 1    46581
3     1176
0      105
2       26
Name: label, dtype: int64


Unique labels in the dev dataset:  [1 3 0 2]
Value counts by labels (dev dataset): 
 1    4375
3      95
0      10
2       2
Name: label, dtype: int64


In [149]:
# Count how many test sentences carry each (string) label.
df_test['label'].value_counts()

Other      34642
mention      779
A1C           91
glucose       44
Name: label, dtype: int64

### Save the TRAINING & DEV data to folder

Write the files into the appropriate folder so that they can be uploaded to the cloud for processing / building the model.

In [145]:
# Output folder is derived from the tag and the first three letters of the attribute
write_path = 'data_for_bert_sent/'+tag.lower()+'_'+attribute[:3]

# to_csv does not create missing directories, so make sure the target exists
# (a no-op when the folder is already there)
os.makedirs(write_path, exist_ok=True)

# train / dev sets for TAG-ATTRIBUTE
df_train_bert.to_csv(write_path+'/train.tsv', sep='\t', index=False, header=False)
df_dev_bert.to_csv(write_path+'/dev.tsv', sep='\t', index=False, header=False)

# test set for TAG-ATTRIBUTE
df_test_bert.to_csv(write_path+'/test.tsv', sep='\t', index=False, header=True)


### Check folder & Get Training

The format of the datasets has been adjusted so that multi-class classification can be performed and predictions can be made against the test dataset.  The dev set is used for calculating the accuracy.

In [152]:
# Count how many test sentences carry each (string) label.
df_test['label'].value_counts()

Other      34642
mention      779
A1C           91
glucose       44
Name: label, dtype: int64

In [153]:
# Inspect a ten-row slice (rows 1270 through 1279) of the test frame.
df_test[1270:1280]

Unnamed: 0,sentence,label,file
1270,"Diabetes (insulin-dependent, 3/66 HBA1C was 8.90)",mention,117-02.xml
1271,2.,Other,117-02.xml
1272,Hypercholesterolemia,Other,117-02.xml
1273,3.,Other,117-02.xml
1274,Hypertension,Other,117-02.xml
1275,4.,Other,117-02.xml
1276,History of angina,Other,117-02.xml
1277,5.,Other,117-02.xml
1278,"History of cyst near scapular, upper left back",Other,117-02.xml
1279,6.,Other,117-02.xml
