In [1]:
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, BertEmbeddings, CharacterEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from typing import List
import pandas as pd
import os
import random
import xml.etree.cElementTree as ET
from sklearn.preprocessing import LabelEncoder
import nltk.data
from nltk import sent_tokenize
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
import torch
torch.version.cuda

'10.0'

In [4]:
torch.backends.cudnn.version()

7301

In [5]:
torch.cuda.is_available()

True

In [6]:
torch.cuda.get_device_name(0)

'GeForce GTX TITAN X'

# Flair Text Classification

## Data Format
```
__label__<label_1> <text>
__label__<label_1> __label__<label_2> <text>
```

The text classification data format is based on the `FastText format`, in which each line in the file represents a texxt document. A document can have one or multiple labels that are defined at the beginning of the line starting with the prefix `__label__`. 

To create a `TaggedCorpus` for a text classification task, you need to have three files (train, dev, and test) in the above format located in one folder. This data folder structure could, for example, look like this:

```
/resources/tasks/project/train.txt
/resources/tasks/project/dev.txt
/resources/tasks/imdb/test.txt
```

### Get files for parsing

In [7]:
codefolder = os.path.dirname(os.path.abspath('__file__'))

datafolders = ['\\data_all\\06_training-RiskFactors-Complete-Set1\\','\\data_all\\09_training-RiskFactors-Complete-Set2\\']
filenames = []

for folder in datafolders:
    for file in os.listdir(str(codefolder)+folder):
        filename=os.fsdecode(os.fsencode((str(codefolder)+folder+file)))
        if filename.endswith(('.xml')):
            filenames.append(filename)
            
filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-01.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\220-05.xml']

In [8]:
# use a 90/10 split
split_index = int(len(filenames)*0.9)
random.seed(42)
random.shuffle(filenames)

train_filenames = filenames[:split_index]
dev_filenames = filenames[split_index:]

In [9]:
datafolder = '\\data_all\\16_testing-RiskFactors-Complete\\'
test_filenames = []

for file in os.listdir(str(codefolder)+datafolder):
    filename=os.fsdecode(os.fsencode((str(codefolder)+datafolder+file)))
    if filename.endswith(('.xml')):
        test_filenames.append(filename)

### Parse sentences & labels

In [10]:
def get_sentences(file):

    tree = ET.ElementTree(file=file)
    root = tree.getroot()

    text = root.find('TEXT').text
    sentences = [sent.split('\n') for sent in sent_tokenize(text) if sent!='\n']
    all_sentences = []

    for item in sentences:
        for sub_item in item:
            if sub_item.replace(' ','') != '':
                all_sentences.append(sub_item)    
    print("all_sentences:",all_sentences[:5])
    return all_sentences

In [11]:
def process_file(file):
    
    # get all sentences in the file
    tree = ET.ElementTree(file=file)
    root = tree.getroot()

    text = root.find('TEXT').text
    sentences = [sent.split('\n') for sent in sent_tokenize(text) if sent!='\n']
    all_sentences = []

    for item in sentences:
        for sub_item in item:
            if sub_item.replace(' ','') != '':
                all_sentences.append(sub_item)    
                
    #all_sent = get_sentences(file)
    sent_label = {}

    sub_tags = []
    for item in root.find("TAGS"):
        if item.tag == 'PHI':
            pass
        elif item.tag == 'SMOKER':
            label = (item.tag + "." + item.attrib['status']).lower().replace(" ", "_")
        elif item.tag == 'FAMILY_HIST':
            label = (item.tag + "." + item.attrib['indicator']).lower().replace(" ", "_")
        elif item.tag == 'MEDICATION':
            label = (item.tag + "." + item.attrib['type1'] + "." + item.attrib['type2'] + "." + item.attrib['time']).lower().replace(" ", "_")
        else:
            label = (item.tag + "." + item.attrib['indicator'] + "." + item.attrib['time']).lower().replace(" ", "_")
        for sub_item in item.findall(item.tag):
            #print("sub_item:", sub_item.tag)
            if ('text' in sub_item.attrib.keys()):
                #print(sub_item.attrib['text'])
                sub_tags.append((sub_item.attrib['text'], label))


    count=0
    for sent in all_sentences:
        label='Other'
        for tag in set(sub_tags):                                                       
            if tag[0] in sent:
                label = tag[1]
                count += 1

        sent_label[sent] = label
        
    # return empty dict if no tag found in file
    # else, return the sentences with the labels
    if count==0:
        return {}
    else:
        #print("sent_label:", sent_label)
        return sent_label

In [12]:
def get_TrainingData(filenames):
    train_files = []
    train_sent = []
    train_labels = []
    
    train_df = pd.DataFrame()
    for file in filenames:
        file_data = process_file(file=file)
        for i in range(0, len(file_data)):
            train_files.append(file)
            
        for key, value in file_data.items():
            train_sent.append(key)
            train_labels.append(value)
    
    train_df = pd.DataFrame({'filename': train_files, 'sentence': train_sent, 'label': train_labels})
    #print(train_df.head())
    return train_df

In [13]:
df_train = get_TrainingData(train_filenames)
df_dev = get_TrainingData(dev_filenames)
df_test = get_TrainingData(test_filenames)

In [14]:
train_filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\103-01.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\109-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\155-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\251-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\107-02.xml']

In [15]:
dev_filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\335-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\149-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\06_training-RiskFactors-Complete-Set1\\366-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\181-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\09_training-RiskFactors-Complete-Set2\\180-01.xml']

In [16]:
test_filenames[:5]

['E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-01.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-02.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-03.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\110-04.xml',
 'E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\data_all\\16_testing-RiskFactors-Complete\\111-01.xml']

In [17]:
df_train.head()

Unnamed: 0,filename,sentence,label
0,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,Record date: 2067-11-24,Other
1,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,HUNTINGTON EMERGENCY DEPT...,Other
2,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,"THOMAS-YOSEF,JULIA 840-91-51-9 VI...",Other
3,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,This patient was seen with Dr. Earley.,Other
4,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,The patient was interviewed,Other


In [18]:
df_dev.head()

Unnamed: 0,filename,sentence,label
0,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,Record date: 2104-01-30,Other
1,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,NEUROLOGY RESIDENT CONSULT NOTE,Other
2,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,PATIENT NAME: Dalila Haynes,Other
3,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,MRN: 78361343,Other
4,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,DATE OF CONSULT: 1/30/04,Other


In [19]:
df_test.head()

Unnamed: 0,filename,sentence,label
0,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,Record date: 2069-04-07,smoker.unknown
1,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,Mr. Villegas is seen today.,Other
2,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,I have not seen him since November.,Other
3,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,About three weeks ago he stopped his Prednison...,Other
4,E:\Google Drive\Berkeley\Courses\w266_NLP\Fina...,he was gaining weight.,Other


In [20]:
df_train_flair = df_train[['label', 'sentence']].copy()
df_train_flair['prefix'] = '__label__'
df_train_flair = df_train_flair[['prefix', 'label', 'sentence']]
df_train_flair['formatted'] = df_train_flair['prefix']+df_train_flair['label']+' '+df_train_flair['sentence']
df_train_flair['formatted'].head()

0               __label__Other Record date: 2067-11-24
1    __label__Other                      HUNTINGTON...
2    __label__Other THOMAS-YOSEF,JULIA   840-91-51-...
3    __label__Other This patient was seen with Dr. ...
4          __label__Other The patient was interviewed 
Name: formatted, dtype: object

In [21]:
df_dev_flair = df_dev[['label', 'sentence']].copy()
df_dev_flair['prefix'] = '__label__'
df_dev_flair = df_dev_flair[['prefix', 'label', 'sentence']]
df_dev_flair['formatted'] = df_dev_flair['prefix']+df_dev_flair['label']+' '+df_dev_flair['sentence']
df_dev_flair['formatted'].head()

0             __label__Other Record date: 2104-01-30
1    __label__Other NEUROLOGY RESIDENT CONSULT NOTE 
2         __label__Other PATIENT NAME: Dalila Haynes
3                       __label__Other MRN: 78361343
4            __label__Other DATE OF CONSULT: 1/30/04
Name: formatted, dtype: object

In [22]:
df_test_flair = df_test[['label', 'sentence']].copy()
df_test_flair['prefix'] = '__label__'
df_test_flair = df_test_flair[['prefix', 'label', 'sentence']]
df_test_flair['formatted'] = df_test_flair['prefix']+df_test_flair['label']+' '+df_test_flair['sentence']
df_test_flair['formatted'].head()

0      __label__smoker.unknown Record date: 2069-04-07
1           __label__Other Mr. Villegas is seen today.
2    __label__Other I have not seen him since Novem...
3    __label__Other About three weeks ago he stoppe...
4                __label__Other he was gaining weight.
Name: formatted, dtype: object

In [23]:
df_train_flair['formatted'].to_csv('E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\FLAIR_data\\train.txt', index=False, header=False)
df_dev_flair['formatted'].to_csv('E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\FLAIR_data\\dev.txt', index=False, header=False)
df_test_flair['formatted'].to_csv('E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\FLAIR_data\\test.txt', index=False, header=False)

# Models

In [24]:
data_folder = Path('E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\FLAIR_data\\')

# load corpus containing training, test, and dev data
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(data_folder,
                                                                     test_file='test.txt',
                                                                     dev_file='dev.txt',
                                                                     train_file='train.txt')

2019-04-11 12:54:20,842 Reading data from E:\Google Drive\Berkeley\Courses\w266_NLP\Final Project\FLAIR_data
2019-04-11 12:54:20,847 Train: E:\Google Drive\Berkeley\Courses\w266_NLP\Final Project\FLAIR_data\train.txt
2019-04-11 12:54:20,851 Dev: E:\Google Drive\Berkeley\Courses\w266_NLP\Final Project\FLAIR_data\dev.txt
2019-04-11 12:54:20,854 Test: E:\Google Drive\Berkeley\Courses\w266_NLP\Final Project\FLAIR_data\test.txt


In [25]:
# create label dictionary
label_dict = corpus.make_label_dictionary()

In [26]:
# make a list of word embeddings
word_embeddings = [#WordEmbeddings('glove'),
                   BertEmbeddings(),
                   #FlairEmbeddings('pubmed-forward'),
                   #FlairEmbeddings('pubmed-backward'),
                   #CharacterEmbeddings()
                  ]

In [27]:
# initialize document embedding by passing list of word embeddings
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
                                                                     reproject_words_dimension=256,
                                                                     )

In [28]:
# create text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

In [29]:
# initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

In [30]:
trainer.train('E:\\Google Drive\\Berkeley\\Courses\\w266_NLP\\Final Project\\FLAIR_data\\BERT',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=20)

2019-04-11 12:54:46,041 ----------------------------------------------------------------------------------------------------
2019-04-11 12:54:46,043 Evaluation method: MICRO_F1_SCORE
2019-04-11 12:54:46,072 ----------------------------------------------------------------------------------------------------
2019-04-11 12:54:49,627 epoch 1 - iter 0/1558 - loss 0.14745218
2019-04-11 13:06:11,925 epoch 1 - iter 155/1558 - loss 0.02543820


RuntimeError: $ Torch: not enough memory: you tried to allocate 0GB. Buy new RAM! at ..\aten\src\TH\THGeneral.cpp:201