In [1]:
import xml.etree.cElementTree as ET
import os
import nltk
import string
import random
import pandas as pd
from pandas import DataFrame

NOTE: The following functions, although used for generating train / test datasets, are left in here so the test files can be generated if needed.

### Get the files for parsing

### Define function to get tokens & their attributes

In [2]:
def spans(text):
    tokens = nltk.word_tokenize(text.lower())
    offset = 0
    start_indices = []
    end_indices = []
    for token in tokens:
        offset = text.find(token, offset)
        start, end = offset, offset+len(token)
        offset += len(token)
        start_indices.append(start)
        end_indices.append(end)
    return tokens, start_indices, end_indices

### Define function to generate IO Coding (Model2)

IO coding is a technique for extracting entities, where in the input sentence is tokenized and analyzed for occurences of words that belong to an entity of interest.  It follows the following scheme:

* I - marks beginning/inside of the entity
* O - marks that the token is NOT part of any entity


In [3]:
def Generate_IO_Coding(file_path, tag, attribute):
    """
    Function defined to take in a file for processing, tag and attributes to identify
    within the file (xml).  The tags are first read into objects, capturing the label,
    start and end values.  These are references in the 'text' object, where based on
    which the annotators did identify the tag in context. 
    
    This information is used to parse through the text (as tokens) and perform the BIO
    coding based on the start position matches.
    
    Input: 
    file_path: path of the file to be read in for processing
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc.
    attribute: specific attribute within the tag, from which to extract the value from
    
    Returns: 
    list of tokens, list of labels (IO coding)
    """
    
    tree = ET.ElementTree(file=file_path)
    root = tree.getroot()

    text = root.find('TEXT').text.lower()
    
    tokens, start, end = spans(text)
    labels_list = []
    label_start = []
    label_end = []
    filename = []


    for item in root.find("TAGS"):
        if item.tag  == tag:
            label = (item.tag + "." + item.attrib[attribute]).lower().replace(" ", "_")
        else:
            label = ""

        for sub_item in item.findall(item.tag):
            if 'start' in sub_item.attrib.keys():
                labels_list.append(label)
                label_start.append(int(sub_item.attrib['start']))
                label_end.append(int(sub_item.attrib['end']))

    bio_labels = []

    count = 0
    while len(start) > count:
        if start[count] in label_start:
            label_start_index = label_start.index(start[count])
            end_index = label_end[label_start_index]
            word_label = labels_list[label_start_index]
            phrase = text[start[count]:end_index]
            phrase_tokens = nltk.word_tokenize(phrase)
            # update the tag to 'I-' so that this generates IO-Coding
            next_tag="I-"
            for word in phrase_tokens:
                if (len(word_label) > 0):
                    new_label = next_tag + word_label
                    bio_labels.append(new_label)
                    next_tag="I-"
                else:
                    bio_labels.append("O")
                count += 1
        else:
            bio_labels.append("O")
            count += 1 

    # build this list to hold name of the file the token belongs to
    # this is for the purpose of evaluation of the model from test results
    for i in range(0, len(tokens)):
        filename.append(file_path)
        
    return filename, tokens, bio_labels

In [7]:
def getIOCoding_data(tag, attrib, filenames):

    """
    All files in the list (which holds the list of files in the directory) are parsed through
    and the Generate_BIO_Coding function is called by passing individual files within the folder.
    
    The tags and attributes are passed on to the function as parameters.
    
    Input: 
    filenames: names of the file to be read in for processing in a list object
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attribute: specific attribute within the tag, from which to extract the value from (string)
    
    Returns: 
    list of tokens, list of labels (BIO coding, done across all files in the path for the tag/attribute)
    """
    
    test_tokens = []
    test_labels = []
    test_filenames = []

    for file in filenames:
        #print("processing file ... ", file)
        filename, tokens, labels = Generate_IO_Coding(file_path=file, tag=tag, attribute=attrib)

        test_tokens.extend(tokens)
        test_labels.extend(labels)
        test_filenames.extend(filename)  # this is for the purpose of validating test results
        

    return test_filenames, test_tokens, test_labels


### Form Test Data

In the script in cell below, we have chosen the test xml files that start with '11'.  This is just to do a sample prediction against the model built using BERT as classifier.

The condition should be removed to generate / test the complete set across all XML files in the 'testing' folder.

In [8]:
# set to the appropriate folder on your local drive
wd = os.path.dirname(os.path.abspath('__file__'))
datafolder = ["\\Dataset\\testing-RiskFactors-Complete\\"]
#print (datafolder)

testfilenames = []

for folder in datafolder:
    for file in os.listdir(str(wd)+folder):
        filename = os.fsdecode(os.fsencode((str(wd)+folder+file))) 
        if filename.endswith( ('.xml') ): # select xml files
            #print(filename)
            testfilenames.append(filename)

In [9]:
''' Do not use as gold dataset does not have start and end attributes in tags 
    and hence cannot be parsed using the above function
'''

# set to the appropriate folder on your local drive
wd = os.path.dirname(os.path.abspath('__file__'))
datafolder = ["\\Dataset\\testing-RiskFactors-Gold\\"]
#print (datafolder)

goldfilenames = []

for folder in datafolder:
    for file in os.listdir(str(wd)+folder):
        filename = os.fsdecode(os.fsencode((str(wd)+folder+file))) 
        if filename.endswith( ('.xml') ): # select xml files
            #print(filename)
            goldfilenames.append(filename)


### Get Test Tokens for 'Hypertension' (to test bert output)

In [10]:
# get data for model #5
tag = 'HYPERTENSION'
attribute = 'indicator'

test_hypertension_indicator_filenames, test_hypertension_indicator_tokens, test_hypertension_indicator_labels = getIOCoding_data(tag, attribute, testfilenames)
gold_hypertension_indicator_filenames, gold_hypertension_indicator_tokens, gold_hypertension_indicator_labels = getIOCoding_data(tag, attribute, goldfilenames)
#hypertension_indicator_labels

In [11]:
# capture data into dataframe to work with it
test_df_hypertension = pd.DataFrame({'filename': test_hypertension_indicator_filenames, 'test_token': test_hypertension_indicator_tokens, 'test_label': test_hypertension_indicator_labels})

In [12]:
# actual counts of labels in test set for hypertension mention and high_bp
test_df_hypertension['test_label'].value_counts()

O                         380516
I-hypertension.high_bp       733
I-hypertension.mention       696
Name: test_label, dtype: int64

In [13]:
# capture gold data into dataframe to work with it
gold_df_hypertension = pd.DataFrame({'filename': gold_hypertension_indicator_filenames, 'gold_token': gold_hypertension_indicator_tokens, 'gold_label': gold_hypertension_indicator_labels})

In [14]:
# actual counts of labels in gold set for hypertension mention and high_bp
gold_df_hypertension['gold_label'].value_counts()

O    381945
Name: gold_label, dtype: int64

** All the labels in the gold set are getting the value 'O' because there are not "start" and "end" positions specified in the tags in the gold dataset.

### Value Counts of Labels from BERT Classifier

Value counts of labels from BERT classifier (manually obtained from test_results file which holds probabilities for each class) is obtained as below.

In [15]:
def get_filename(fullpath):
    return fullpath.replace("C:\\Users\\sudha\\Documents\\W266-NLP\\Final-Project-W266\\Code\\Dataset\\testing-RiskFactors-Complete\\", "")
    #return fullpath.replace("C:\\Users\\Kalyan\\Documents\\Anu\\W266 - NLP\\Final Project\\lheart-disease-risk-prediction\\Code\\Dataset\\testing-RiskFactors-Gold\\", "")

In [16]:
def compare_with_bert_results(tag, attribute, testfilenames):
#def compare_with_bert_results(tag, attribute, testfilenames, goldfilenames):
    # get data for model #5

    
    test_filenames, test_tokens, test_labels = getIOCoding_data(tag, attribute, testfilenames)
    
    # capture data into dataframe to work with it
    test_df = pd.DataFrame({'filename': test_filenames, 'test_token': test_tokens, 'test_label': test_labels})

    print("\nThe value counts by labels: ", test_df['test_label'].value_counts(), "\n\n")
    
    #number of labels for the tag / attribute
    num_labels = (test_df['test_label'].unique()).shape[0]
    
    tdf = test_df
    tdf['file'] = tdf['filename'].apply(get_filename)
    tdf.drop('filename', 1, inplace=True)
    #print(tdf.head(10))

    '''
    # get gold data to validate
    gold_filenames, gold_tokens, gold_labels = getIOCoding_data(tag, attribute, goldfilenames)
    
    # capture data into dataframe to work with it
    gold_df = pd.DataFrame({'filename': gold_filenames, 'gold_token': gold_tokens, 'gold_label': gold_labels})

    print("\nThe value counts by labels: ", gold_df['gold_label'].value_counts(), "\n\n")
    
    #number of labels for the tag / attribute
    num_labels = (gold_df['gold_label'].unique()).shape[0]
    
    gdf = gold_df
    gdf['file'] = gdf['filename'].apply(get_filename)
    gdf.drop('filename', 1, inplace=True)
    #print(gdf.head(10))
    '''

    file_path="bert_output_results/"+tag.lower()+"/test_results.tsv"
    bert_results = pd.read_csv(file_path, sep='\t',header=None)

    class_list=[]
    for i in range(0,num_labels):
        class_list.append("Class-"+str(i))
        
    # set the column names for the dataframe that holds BERT output
    bert_results.columns=class_list
    bert_results['classLabel'] = bert_results.idxmax(axis=1)
    #print("BERT Results: \n", bert_results.head(10))
    print("\nBERT Results value counts by labels: ", bert_results['classLabel'].value_counts(), "\n\n")
    
    test_combined = pd.concat([tdf, bert_results['classLabel']], axis=1)
    #test_combined = pd.concat([tdf, gdf, bert_results['classLabel']], axis=1)
    
    return test_combined

In [17]:
# get results for HYPERTENSION - Indicator
tag = 'HYPERTENSION'
attribute = 'indicator'

df_HT_results = compare_with_bert_results(tag, attribute, testfilenames)

#df_combined = compare_with_bert_results(tag, attribute, testfilenames, goldfilenames)


The value counts by labels:  O                         380516
I-hypertension.high_bp       733
I-hypertension.mention       696
Name: test_label, dtype: int64 



BERT Results value counts by labels:  Class-2    380916
Class-1       785
Class-0       252
Name: classLabel, dtype: int64 




In [18]:
df_HT_results.head(10)

Unnamed: 0,test_label,test_token,file,classLabel
0,O,record,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
1,O,date,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
2,O,:,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
3,O,2069-04-07,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
4,O,mr.,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
5,O,villegas,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
6,O,is,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
7,O,seen,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
8,O,today,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2
9,O,.,C:\Users\Kalyan\Documents\Anu\W266 - NLP\Final...,Class-2


In [19]:
list(df_HT_results)

['test_label', 'test_token', 'file', 'classLabel']

In [21]:
df_HT_results[ df_HT_results['test_Label'] == 'I-hypertension.high_bp']

KeyError: 'test_Label'

In [23]:
df_HT_results[ df_HT_results['classLabel'] == 'Class-1'].count()

test_label    785
test_token    785
file          785
classLabel    785
dtype: int64

#### Class Mappings for Hypertension  are as follows:

Class1 --> I-hypertension.high_bp

Class2 --> I-hypertension.mention

Class3 --> O

In [None]:
def HT_set_labels(classlabel):
    if (classlabel=='Class-0'):
        return 'I-hypertension.mention'
    elif (classlabel=='Class-1'):
        return 'I-hypertension.high_bp'
    else:
        return 'O'

In [None]:
df_HT_results['PredictedLabel'] = df_HT_results['classLabel'].apply(HT_set_labels)
#df_HT_results.drop('classLabel', 1, inplace=True)

df_HT_results.head(10)


In [None]:
def DI_set_labels(classlabel):
    if (classlabel=='Class0'):
        return 'I-diabetes.a1c'
    elif (classlabel=='Class1'):
        return 'I-diabetes.glucose'
    else:
        return 'O'

In [None]:
bert_DI_results['PredictedLabel'] = bert_DI_results['classLabel'].apply(DI_set_labels)
bert_DI_results.drop('classLabel', 1, inplace=True)


bert_DI_results.head(10)


#### Run counts for Diabetes in TEST Set

In [None]:
# get data for model #5
tag = 'DIABETES'
attribute = 'indicator'

test_diabetes_indicator_filenames, test_diabetes_indicator_tokens, test_diabetes_indicator_labels = getIOCoding_data(tag, attribute, testfilenames)
gold_diabetes_indicator_filenames, gold_diabetes_indicator_tokens, gold_diabetes_indicator_labels = getIOCoding_data(tag, attribute, goldfilenames)
#diabetes_indicator_labels

In [None]:
# capture data into dataframe to work with it
test_df_diabetes = pd.DataFrame({'filename': test_diabetes_indicator_filenames, 'test_token': test_diabetes_indicator_tokens, 'test_label': test_diabetes_indicator_labels})


In [None]:
# actual counts of labels in test set for diabetes mention, glucose and a1c
test_df_diabetes['test_label'].value_counts()

In [None]:
# capture data into dataframe to work with it
gold_df_diabetes = pd.DataFrame({'filename': gold_diabetes_indicator_filenames, 'gold_token': gold_diabetes_indicator_tokens, 'gold_label': gold_diabetes_indicator_labels})


In [None]:
# actual counts of labels in test set for diabetes mention, glucose and a1c
gold_df_diabetes['gold_label'].value_counts()

In [None]:
# get results for DIABETES - Indicator
tag = 'CAD'
attribute = 'indicator'

df_combined = compare_with_bert_results(tag, attribute, testfilenames)

In [None]:
df_combined.head(10)

In [None]:
df_combined[df_combined['classLabel'] != 'Class-4']

In [None]:
df_combined[1110:1125]

### Running manual count checks

In [None]:
test_df_diabetes.head(2)

### Test LABELS for TOKENS in TEST Dataset against BERT Outputs

BERT Classifier has returned results for the tokens passed in 'test.tsv' file.  The returned values are probabilities, that need to be converted into equivalent class labels based on majority class.  Then, the class label should be compared against the actual label from the code above to extract the IO-Coding from the xml files.  This is a brute-force approach or a manual way of verifying the validity of the predictions


Read in results from BERT Predicitons to the above dataset
The above dataset is derived from IO-Coding applied as done on the training set. This is what should be based on the annotation process. Nowe, we have to read in the predictions from bert, which is a set of class probabilities across all 3 classes and we have to merget that with the above dataset for comparison and error analysis.

In [None]:
# read in the test results captured for BERT Hypertension model and specify columns as the actual file has no header
bert_DI_results = pd.read_csv("bert_output_results/diabetes/test_results.tsv", sep='\t',header=None)
bert_DI_results.columns=["Class0", "Class1", "Class2", "Class3"]

In [None]:
bert_DI_results.head()

In [None]:
bert_DI_results['classLabel'] = bert_DI_results.idxmax(axis=1)

bert_DI_results.head(5)

In [None]:
bert_DI_results['classLabel'].value_counts()

In [None]:
def DI_set_labels(classlabel):
    if (classlabel=='Class0'):
        return 'I-diabetes.a1c'
    elif (classlabel=='Class1'):
        return 'I-diabetes.glucose'
    elif (classlabel=='Class2'):
        return 'I-diabetes.mention'
    else:
        return 'O'

In [None]:
bert_DI_results['PredictedLabel'] = bert_DI_results['classLabel'].apply(DI_set_labels)
bert_DI_results.drop('classLabel', 1, inplace=True)


bert_DI_results.head(10)


In [None]:
# validating the counts by label
bert_DI_results['PredictedLabel'].value_counts()

test_DI_combined = pd.concat([tdf, bert_DI_results['PredictedLabel']], axis=1)

In [None]:
test_DI_combined[test_DI_combined['test_label']!='O']

In [None]:
test_DI_combined[test_DI_combined['PredictedLabel']!='O']

In [None]:
test_DI_combined[2220:2230]

In [None]:
test_DI_combined[376900:376910]

In [None]:
test_DI_combined[3080:3090]

In [None]:
test_DI_combined[2460:2470]

#### Checking I-hypertension.mention labels 

In [None]:
tdf[75:85]
# bert output predicted I-hypertension.mention for token in position 78

In [None]:
tdf[710:715]
# bert output predicted I-hypertension.mention for token in position 712

In [None]:
tdf[1990:2000]
# bert output predicted I-hypertension.mention for token in position 1991

In [None]:
tdf[3080:3090]
# bert output predicted I-hypertension.mention for token in position 3083

In [None]:
tdf[6720:6730]
# bert output predicted I-hypertension.mention for token in position 6726

#### Checking I-hypertension.high_bp labels

In [None]:
tdf[445:455]
# bert output predicted I-hypertension.high_bp for token in position 451

In [None]:
tdf[790:800]
# bert output predicted I-hypertension.high_bp for token in position 794

In [None]:
tdf[2360:2370]
# bert output predicted I-hypertension.high_bp for token in position 2366

In [None]:
tdf[12570:12580]
# bert output predicted I-hypertension.high_bp for token in position 12574 and 12576

In [None]:
tdf[18405:18418]
# bert output predicted I-hypertension.high_bp for token in position 18415

### Extract Labeling by XML File

Using the dataset captured above, extract information on the count of I-hypertension.mention and I-hypertension.high_bp tags in each of the files passed in test dataset and map them to the corresponding TAG and INDICATOR values.  This gives a high level counts of the tags identified in each of the files included in the test data.  This will be useful for error analysis and should be a base point for constructing the tags and start/end points if needed.


In [None]:
tdf.groupby(['file', 'test_label']).count()

In [None]:
# Get counts by labels for hypertension mention and high_bp across all test files
tdf.head(10)

In [None]:
tdf.to_csv('test_results_withfilenames.csv')

### Read in results from BERT Predicitons to the above dataset

The above dataset is derived from IO-Coding applied as done on the training set. This is what should be based on the annotation process. Nowe, we have to read in the predictions from bert, which is a set of class probabilities across all 3 classes and we have to merget that with the above dataset for comparison and error analysis. 


In [None]:
# read in the test results captured for BERT Hypertension model and specify columns as the actual file has no header
bert_hypertension_results = pd.read_csv("bert_output_results/hypertension/bert_run1_test_results.tsv", sep='\t',header=None)
bert_hypertension_results.columns=["Class1", "Class2", "Class3"]

In [None]:
bert_hypertension_results.head(5)

#### Class Mappings

Class labels correspond as follows:

* Class1 --> I-hypertension.high_bp
* Class2 --> I-hypertension.mention
* Class3 --> O


In [None]:
import numpy as np

In [None]:
ntest = np.array(bert_hypertension_results)
ntest.argmax(axis=1)

In [None]:
ntest.shape

In [None]:
  bert_hypertension_results['classLabel'] = bert_hypertension_results.idxmax(axis=1)

In [None]:
bert_hypertension_results.head(5)

In [None]:
def set_labels(classlabel):
    if (classlabel=='Class1'):
        return 'I-hypertension.high_bp'
    elif (classlabel=='Class2'):
        return 'I-hypertension.mention'
    else:
        return 'O'

bert_hypertension_results['PredictedLabel'] = bert_hypertension_results['classLabel'].apply(set_labels)
bert_hypertension_results.drop('classLabel', 1, inplace=True)


In [None]:
bert_hypertension_results.head(10)

In [None]:
# validating the counts by label
bert_hypertension_results['PredictedLabel'].value_counts()

In [None]:
test_hypertension_combined = pd.concat([tdf, bert_hypertension_results['PredictedLabel']], axis=1)

In [None]:
test_hypertension_combined.head(10)

In [None]:
# testing (spot-checking) where model predicted labels 1 & 2
test_hypertension_combined[70:85]

In [None]:
test_hypertension_combined[485:495]

In [None]:
test_hypertension_combined[450:460]

In [None]:
test_hypertension_combined[790:800]

### Interpreting the predictions compared against actual test labels

As seen above, BERT predictions seem very accurate and it seems to predict only after it has seen the complete context.  Also, punctuation marks are not labeled as one of the relevant classes, although a human annotator has done based on the instructions provided as part of the annotation process.  