In [2]:
import xml.etree.cElementTree as ET
import os
import nltk
import string
import random
import pandas as pd
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder

In [3]:
from tensorflow.python.client import device_lib

# Show every compute device TensorFlow can see, to confirm the GPU is visible.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15282499900818875963
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10193742398
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11351185109022645623
physical_device_desc: "device: 0, name: GeForce GTX TITAN X, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


# Get the files for parsing

## Training Data

In [3]:
# Directory the notebook runs from. The literal string '__file__' (not the
# __file__ variable) makes abspath() resolve against the current working directory.
codefolder = os.path.dirname(os.path.abspath('__file__'))

# Training folders, relative to the notebook directory. os.path.join picks the
# right separator for the host OS instead of hard-coding Windows backslashes.
datafolders = [os.path.join('data_all', '06_training-RiskFactors-Complete-Set1'),
               os.path.join('data_all', '09_training-RiskFactors-Complete-Set2')]
filenames = []

for folder in datafolders:
    folder_path = os.path.join(codefolder, folder)
    # os.listdir() returns entries in arbitrary order; sorting gives the seeded
    # shuffle in the split cell the same starting order on every machine.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.xml'):
            filenames.append(os.path.join(folder_path, file))

filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-01.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-05.xml']

In [4]:
# use a 90/10 split
split_index = int(len(filenames)*0.9)

# Shuffle a copy instead of `filenames` itself: shuffling in place would make
# this cell non-idempotent, because re-running it would reshuffle the already
# shuffled list and silently produce a different train/dev split.
shuffled_filenames = list(filenames)
random.seed(42)
random.shuffle(shuffled_filenames)

train_filenames = shuffled_filenames[:split_index]
dev_filenames = shuffled_filenames[split_index:]

In [5]:
# Spot-check the first five training paths produced by the shuffled split.
train_filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\103-01.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\109-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\155-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\251-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\107-02.xml']

In [6]:
# Spot-check the first five held-out dev paths produced by the shuffled split.
dev_filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\335-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\149-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\366-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\181-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\180-01.xml']

## Test Data

In [7]:
# Test folder, relative to the notebook directory. os.path.join picks the right
# separator for the host OS instead of hard-coding Windows backslashes.
datafolder = os.path.join('data_all', '16_testing-RiskFactors-Complete')
test_folder_path = os.path.join(codefolder, datafolder)
test_filenames = []

# os.listdir() returns entries in arbitrary order; sort so the test set (and the
# test.tsv written from it) has the same row order on every run and machine.
for file in sorted(os.listdir(test_folder_path)):
    if file.endswith('.xml'):
        test_filenames.append(os.path.join(test_folder_path, file))

test_filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-01.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\111-01.xml']

# Define function to get tokens & their attributes

In [8]:
def spans(text):
    """Tokenize ``text`` and locate every token inside it.

    The text is lower-cased before tokenization, and tokens are searched for in
    that same lower-cased string, so mixed-case input can still be located.

    Args:
        text: The document text to tokenize.

    Returns:
        A ``(tokens, start_indices, end_indices)`` tuple of three equally long
        lists. ``start_indices[i]`` / ``end_indices[i]`` are the character
        offsets (end exclusive) of ``tokens[i]`` in the lower-cased text. A
        token that cannot be located gets ``-1`` for both offsets.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    offset = 0
    start_indices = []
    end_indices = []
    for token in tokens:
        # word_tokenize rewrites a straight double quote as `` or '', so the
        # token text may not exist verbatim; also look for the original character.
        if token in ("``", "''"):
            candidates = (token, '"')
        else:
            candidates = (token,)

        hits = []
        for candidate in candidates:
            position = lowered.find(candidate, offset)
            if position != -1:
                hits.append((position, len(candidate)))

        if hits:
            # Take the nearest match so the cursor never jumps past real text.
            start, length = min(hits)
            end = start + length
            offset = end
        else:
            # Keep the cursor where it is: letting a failed find() reset it
            # would corrupt the offsets of every following token.
            start = end = -1

        start_indices.append(start)
        end_indices.append(end)
    return tokens, start_indices, end_indices

# Define function to generate IO Coding
IO coding is a technique for extracting entities, wherein the input sentence is tokenized and analyzed for occurrences of words that belong to an entity of interest.  It uses the following scheme:

* I - marks beginning/inside of the entity
* O - marks that the token is NOT part of any entity

In [9]:
def Generate_IO_Coding(file_path):
    """Parse one annotated XML record and assign an IO label to each token.

    The record's ``TEXT`` element is lower-cased and tokenized with ``spans``.
    Every child of ``TAGS`` except ``PHI`` contributes a label built from its
    tag name and attributes; nested elements of the same tag that carry a
    ``start`` attribute give the character span the label applies to.

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A ``(filename, tokens, io_labels)`` tuple of three lists with one entry
        per token: the source file path (repeated), the token, and its label
        (``"I-..."`` inside an annotated span, ``"O"`` otherwise).

    Raises:
        KeyError: If an annotation lacks an attribute its label is built from.
    """
    root = ET.ElementTree(file=file_path).getroot()

    text = root.find('TEXT').text.lower()

    tokens, start, end = spans(text)

    # start offset -> (end offset, label). setdefault keeps the first annotation
    # seen for a given start, and the dict replaces repeated linear list scans.
    annotations = {}

    for item in root.find("TAGS"):
        if item.tag == 'PHI':
            # PHI is not a risk factor. Skip it entirely so it can neither reuse
            # the previous item's label nor hit an unassigned `label`.
            continue
        elif item.tag == 'SMOKER':
            label = "I-" + (item.tag + "." + item.attrib['status']).lower().replace(" ", "_")
        elif item.tag == 'FAMILY_HIST':
            label = "I-" + (item.tag + "." + item.attrib['indicator']).lower().replace(" ", "_")
        elif item.tag == 'MEDICATION':
            label = "I-" + (item.tag + "." + item.attrib['type1'] + "." + item.attrib['type2'] + "." + item.attrib['time']).lower().replace(" ", "_")
        else:
            label = "I-" + (item.tag + "." + item.attrib['indicator'] + "." + item.attrib['time']).lower().replace(" ", "_")

        for sub_item in item.findall(item.tag):
            if 'start' in sub_item.attrib:
                annotations.setdefault(int(sub_item.attrib['start']),
                                       (int(sub_item.attrib['end']), label))

    io_labels = []
    token_count = len(tokens)

    count = 0
    while count < token_count:
        annotation = annotations.get(start[count])
        if annotation is None:
            io_labels.append("O")
            count += 1
            continue

        end_index, word_label = annotation
        phrase_tokens = nltk.word_tokenize(text[start[count]:end_index])
        # Consume at least one token (an empty phrase would otherwise never
        # advance `count` and loop forever) and never more than remain, so
        # io_labels always stays aligned with tokens.
        labelled = min(max(len(phrase_tokens), 1), token_count - count)
        io_labels.extend([word_label] * labelled)
        count += labelled

    # One file path per token, so predictions can be traced back to their
    # source record when evaluating the model on test results.
    filename = [file_path] * token_count

    return filename, tokens, io_labels

In [10]:
def getIOCoding_data(filenames):
    """Run ``Generate_IO_Coding`` on every file and concatenate the results.

    Args:
        filenames: Iterable of XML file paths to process, in order.

    Returns:
        A ``(all_filenames, all_tokens, all_labels)`` tuple of flat lists that
        hold, token by token across all files, the source file path, the token
        and its IO label.
    """
    per_file = [Generate_IO_Coding(file_path=path) for path in filenames]

    # Each result is (filenames, tokens, labels); flatten every component.
    # The file names are kept so test predictions can be traced to their record.
    all_filenames = [name for names, _, _ in per_file for name in names]
    all_tokens = [token for _, tokens, _ in per_file for token in tokens]
    all_labels = [label for _, _, labels in per_file for label in labels]

    return all_filenames, all_tokens, all_labels

# Generate Data for Models

In [11]:
def preprocess_input(filenames, input_type="train", label_encoder=None):
    """Build a token-level table for one data split and write it as a TSV file.

    The table is written, without header or index, to
    ``Single_Model/<input_type>.tsv`` under the current working directory.

    Args:
        filenames: XML file paths belonging to the split.
        input_type: ``"train"``, ``"dev"`` or ``"test"``. Train/dev output has
            the columns ``user_id, label, alpha, text``; test output has
            ``id, sentence``.
        label_encoder: Optional encoder with ``fit_transform``/``transform``
            (e.g. a ``LabelEncoder``) used to turn labels into integer ids. If
            it is already fitted (has ``classes_``) it is only applied, so one
            encoder can be shared between calls to give train and dev the same
            label ids. With the default ``None`` a fresh encoder is fitted on
            this split alone, so ids from separate calls are NOT comparable.

    Returns:
        The DataFrame that was written to disk.

    Raises:
        ValueError: If ``input_type`` is not one of the supported values.
    """
    # Fail early with a clear message; previously an unknown value fell through
    # both branches and crashed later on the unassigned `df_bert`.
    if input_type not in ("train", "dev", "test"):
        raise ValueError(
            "input_type must be 'train', 'dev' or 'test', got %r" % (input_type,))

    files, tokens, labels = getIOCoding_data(filenames)
    df = pd.DataFrame({'filename': files, 'token': tokens, 'label': labels})

    # A newline inside a token would break the one-row-per-token TSV layout.
    text = df['token'].replace(r'\n', ' ', regex=True)

    if input_type == "test":
        # Test -  CoLA Format
        df_bert = pd.DataFrame({'id': df.index,
                                'sentence': text})
    else:
        encoder = LabelEncoder() if label_encoder is None else label_encoder
        if hasattr(encoder, 'classes_'):
            # Already fitted: reuse its mapping so ids stay consistent.
            label_ids = encoder.transform(df['label'])
        else:
            label_ids = encoder.fit_transform(df['label'])
        # Train - CoLA Format
        df_bert = pd.DataFrame({'user_id': df.index,
                                'label': label_ids,
                                'alpha': ['a'] * df.shape[0],
                                'text': text})

    path = os.path.join(os.path.dirname(os.path.abspath('__file__')), "Single_Model")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)
    df_bert.to_csv(os.path.join(path, input_type + ".tsv"), sep='\t', index=False, header=False)
    return df_bert

In [12]:
# Write train.tsv, dev.tsv and test.tsv, one split at a time.
input_types = [
    ['train', train_filenames],
    ['dev', dev_filenames],
    ['test', test_filenames],
]
for input_type in input_types:
    split_name, split_files = input_type
    preprocess_input(split_files, input_type=split_name)

# Load Data for QC

## Check number of categories

In [6]:
# Directory the notebook runs from ('__file__' is a literal string here, so
# abspath() resolves it against the current working directory).
codefolder = os.path.dirname(os.path.abspath('__file__'))
# Build the path with os.path.join so it points at the file preprocess_input()
# wrote on every OS; hard-coded backslashes only resolve on Windows.
filename = os.path.join(codefolder, 'Single_Model', 'train.tsv')
filename

'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\Single_Model\\train.tsv'

In [8]:
# The TSV was written without a header row, so columns are numbered 0..3.
train = pd.read_csv(
    filename,
    header=None,
    sep="\t",
)
train.head()

Unnamed: 0,0,1,2,3
0,0,91,a,record
1,1,91,a,date
2,2,91,a,:
3,3,91,a,2067-11-24
4,4,91,a,huntington


In [12]:
# Column 1 holds the integer label ids written by preprocess_input().
# NOTE(review): unique() already counts the distinct ids, so the "+ 1" reports
# one more than the number of labels present in train.tsv. "Zero-indexed" would
# justify max(id) + 1 instead. Confirm which value is intended downstream.
len(train[1].unique()) + 1 # zero-indexed

93