In [14]:
import xml.etree.cElementTree as ET
import os
import nltk
import string
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder

In [4]:
from tensorflow.python.client import device_lib

# List the devices TensorFlow reports as available on this machine.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 645310353365086580
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10193742398
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11584288440690811715
physical_device_desc: "device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


### Get the files for parsing

In [5]:
# set to the appropriate folder on your local drive
# NOTE: the quoted '__file__' is deliberate. abspath() of a relative name is
# rooted at the current working directory, so dirname() yields that directory
# (notebooks do not define a real __file__).
codefolder = os.path.dirname(os.path.abspath('__file__'))

# Entries keep their original spelling so any other cell that reuses
# `datafolders` is unaffected; they are split into components below.
datafolders = ['\\data_all\\06_training-RiskFactors-Complete-Set1\\','\\data_all\\09_training-RiskFactors-Complete-Set2\\']
filenames = []

for folder in datafolders:
    # Accept either separator and rebuild the path with os.path.join so the
    # same entries resolve on Windows and on POSIX systems.
    folder_parts = [part for part in folder.replace('\\', '/').split('/') if part]
    folder_path = os.path.join(str(codefolder), *folder_parts)
    # os.listdir returns entries in arbitrary order; sort so that the file
    # (and therefore token) order is reproducible between runs.
    for file in sorted(os.listdir(folder_path)):
        filename = os.path.join(folder_path, file)
        if filename.endswith('.xml'):
            filenames.append(filename)

filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-01.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-05.xml']

### Define function to get tokens & their attributes

In [6]:
def spans(text):
    """
    Tokenize `text` and locate every token in it.

    The text is lower-cased before tokenizing, and the tokens are searched for
    in that same lower-cased copy, so mixed-case input is located correctly.

    Input:
    text: string to tokenize

    Returns:
    list of (lower-cased) tokens, list of start offsets, list of end offsets
    (end is exclusive).  A token that cannot be located gets start == end == -1
    and does not disturb the offsets of the tokens that follow it.

    Note: offsets index the lower-cased text.  That matches `text` itself
    except for the rare characters whose lower-case form has a different length.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)

    start_indices = []
    end_indices = []
    offset = 0
    for token in tokens:
        # word_tokenize rewrites a double quote as `` or '', so the token text
        # may not occur verbatim; also look for the original quote character.
        surface_forms = [token]
        if token in ("``", "''"):
            surface_forms.append('"')

        hits = []
        for form in surface_forms:
            position = lowered.find(form, offset)
            if position >= 0:
                hits.append((position, len(form)))

        if hits:
            # Earliest occurrence wins when more than one surface form matches.
            start, length = min(hits)
            end = start + length
            offset = end
        else:
            # Leave the cursor where it is; resetting it to -1 would make every
            # later search restart near the beginning of the text.
            start = end = -1

        start_indices.append(start)
        end_indices.append(end)
    return tokens, start_indices, end_indices

### Define function to generate IO Coding (Model2)

IO coding is a technique for extracting entities, wherein the input sentence is tokenized and analyzed for occurrences of words that belong to an entity of interest.  It uses the following scheme:

* I - marks beginning/inside of the entity
* O - marks that the token is NOT part of any entity

In [7]:
def Generate_IO_Coding(file_path, tag, attribute):
    """
    Tokenize one annotated XML file and assign an IO label to every token.

    The file's 'TEXT' element is lower-cased and tokenized with `spans`.  Under
    'TAGS', every element named `tag` contributes one label, built from the tag
    name and the value of `attribute`; its same-named child elements carry the
    'start'/'end' character offsets of the annotated phrase.  A token whose
    start offset equals an annotation start, and every following token that
    begins before that annotation's end, is labelled 'I-<label>'.  All other
    tokens are labelled 'O'.

    Elements with a different tag name are ignored, so they can never hide an
    annotation of the requested tag that starts at the same offset.  If two
    annotations of the requested tag share a start offset, the first one wins.

    Input:
    file_path: path of the file to be read in for processing
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc.
    attribute: specific attribute within the tag, from which to extract the value from

    Returns:
    list holding file_path once per token, list of tokens, list of labels
    (IO coding).  The three lists always have the same length.

    Raises:
    KeyError if an element named `tag` has no `attribute`, or an annotated
    child element has 'start' but no 'end'.
    """

    tree = ET.ElementTree(file=file_path)
    root = tree.getroot()

    text = root.find('TEXT').text.lower()
    tokens, start, _ = spans(text)

    # annotation start offset -> (annotation end offset, IO label)
    annotations = {}
    tags_node = root.find("TAGS")
    for item in (tags_node if tags_node is not None else ()):
        if item.tag != tag:
            continue
        label = "I-" + (item.tag + "." + item.attrib[attribute]).lower().replace(" ", "_")

        for sub_item in item.findall(item.tag):
            if 'start' not in sub_item.attrib:
                continue
            span_start = int(sub_item.attrib['start'])
            span_end = int(sub_item.attrib['end'])
            # Negative offsets cannot point at a token; skipping them also keeps
            # them from matching a token that was reported as not located (-1).
            if span_start >= 0:
                annotations.setdefault(span_start, (span_end, label))

    io_labels = []
    index = 0
    total = len(tokens)
    while index < total:
        annotation = annotations.get(start[index])
        if annotation is None:
            io_labels.append("O")
            index += 1
            continue

        span_end, label = annotation
        # The token that opens the annotation is always labelled ...
        io_labels.append(label)
        index += 1
        # ... followed by every token that begins inside the annotated span.
        # Walking the document's own tokens (rather than re-tokenizing the
        # phrase) guarantees exactly one label per token.
        while index < total and 0 <= start[index] < span_end:
            io_labels.append(label)
            index += 1

    # build this list to hold name of the file the token belongs to
    # this is for the purpose of evaluation of the model from test results
    filename = [file_path] * len(tokens)

    return filename, tokens, io_labels

In [8]:
def getIOCoding_data(tag, attrib, filenames):
    """
    Run Generate_IO_Coding on every file in `filenames` and concatenate the
    per-file results, in the order the files are listed.

    Input:
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attrib: specific attribute within the tag, from which to extract the value from (string)
    filenames: paths of the files to process (list)

    Returns:
    list of file names (one entry per token), list of tokens, list of labels
    (IO coding), each spanning all files
    """
    all_filenames, all_tokens, all_labels = [], [], []

    for path in filenames:
        per_file_names, per_file_tokens, per_file_labels = Generate_IO_Coding(
            file_path=path, tag=tag, attribute=attrib
        )
        all_tokens += per_file_tokens
        all_labels += per_file_labels
        # kept alongside the tokens so test results can be traced back to a file
        all_filenames += per_file_names

    return all_filenames, all_tokens, all_labels


### Generate Data for Models

The goal is to generate the tokens and the corresponding labels specific to the model of interest.  Below, we will be building the data for the `time` attribute of the following tags:

* **Model 3a:** DIABETES
* **Model 3b:** CAD
* **Model 3c:** HYPERTENSION
* **Model 3d:** HYPERLIPIDEMIA
* **Model 3e:** OBESE
* **Model 3f:** MEDICATION

In [18]:
def preprocess_input(tag, attribute, filenames, input_type="train", label_encoder=None):
    """
    Build the token/label table for one tag/attribute pair and write it as a
    tab-separated file named '<input_type>.tsv' inside a '<tag>_<attribute>'
    folder under the current working directory.

    Input:
    tag: tag, as identified in the annotation.  Ex: DIABETES, HYPERTENSION etc. (string)
    attribute: specific attribute within the tag (string)
    filenames: paths of the annotated files to process (list)
    input_type: "train" or "dev" (columns user_id, label, alpha, text) or
                "test" (columns id, sentence)
    label_encoder: optional, already-fitted encoder used to turn labels into
                   integers for "train"/"dev".  Pass the same encoder for both
                   splits to get identical ids.  When omitted, a new
                   LabelEncoder is fitted on this call's labels only, so ids
                   from separate calls are comparable only if both calls see
                   the same set of labels.

    Returns:
    the DataFrame that was written to disk

    Raises:
    ValueError if input_type is not "train", "dev" or "test".
    """
    # Validate before parsing any file: an unknown value used to surface only
    # after all the work was done, as an unrelated unbound-variable error.
    if input_type not in ("train", "dev", "test"):
        raise ValueError(
            "input_type must be 'train', 'dev' or 'test', got %r" % (input_type,)
        )

    files, tokens, labels = getIOCoding_data(tag, attribute, filenames)
    df = pd.DataFrame({'filename': files, 'token': tokens, 'label': labels})
    text_column = df['token'].replace(r'\n', ' ', regex=True)

    if input_type in ("train", "dev"):
        if label_encoder is None:
            encoded_labels = LabelEncoder().fit_transform(df['label'])
        else:
            encoded_labels = label_encoder.transform(df['label'])
        # Train - CoLA Format
        df_bert = pd.DataFrame({'user_id': df.index,
                                'label': encoded_labels,
                                'alpha': ['a'] * df.shape[0],
                                'text': text_column})
    else:
        # Test -  CoLA Format
        df_bert = pd.DataFrame({'id': df.index,
                                'sentence': text_column})

    # abspath('__file__') resolves against the working directory, so this is
    # '<working directory>/<tag>_<attribute>'.
    path = os.path.join(os.path.dirname(os.path.abspath('__file__')), tag + "_" + attribute)
    os.makedirs(path, exist_ok=True)
    # NOTE(review): no header row is written for any input_type; confirm that
    # whatever reads test.tsv does not skip a header line.
    df_bert.to_csv(os.path.join(path, input_type + ".tsv"), sep='\t', index=False, header=False)
    return df_bert

In [19]:
# Build and write the training table for the DIABETES tag's `time` attribute.
df_train = preprocess_input(
    tag="DIABETES",
    attribute="time",
    filenames=filenames,
    input_type="train",
)