### Named Entity Recognition (NER)

Named entity recognition is a technique to detect entities of interests from word tokens. in NER, each sentence in splitted into word tokens. Each word token is tagged with the corresponding entity. Possible entities could be - person's name, orgnaization name, phone number, address, currency etc. In clinical domain, we can have entities like - disease condition, test, procedure, medications etc.

We use i2b2 dataset which contains discharge summaries of several patients. The dataset is tagged with <b>person</b>, <b>treatment</b>, <b>test</b>, <b>problem</b>, <b>pronoun</b>. Words that are not tagged with any of the entities are tagged with <b>O</b>.

We use Bidirectional LSTM model for token classification.

In [1]:
import os
import re

import pickle
from collections import Counter
from glob import glob

import pandas as pd
import numpy as np

from nltk.tokenize import sent_tokenize, word_tokenize

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Layer, Input, LSTM, Embedding, Dense, Conv1D, TimeDistributed, Dropout, Bidirectional, BatchNormalization, GlobalAveragePooling1D, SpatialDropout1D
#from keras_contrib.layers import CRF
from tensorflow.keras.callbacks import EarlyStopping

from tqdm import tqdm
#import os, csv, math, codecs

#import spacy
from sklearn_crfsuite.metrics import flat_classification_report

pd.options.display.max_rows = 1000

### Data Preprocessing

Convert raw input data to tabular coNLL03 format.

In [2]:
def process_concept(concept_str):
    """
    takes string like
    'c="asymptomatic" 16:2 16:2||t="problem"'
    and returns dictionary like
    {'t': 'problem', 'start_line': 16, 'start_pos': 2, 'end_line': 16, 'end_pos': 2}
    """
    try:
        position_bit, problem_bit = concept_str.split('||')
        t = problem_bit[3:-1]
        
        start_and_end_span = next(re.finditer('\s\d+:\d+\s\d+:\d+', concept_str)).span()
        c = concept_str[3:start_and_end_span[0]-1]
        c = [y for y in c.split(' ') if y.strip() != '']
        c = ' '.join(c)

        start_and_end = concept_str[start_and_end_span[0]+1 : start_and_end_span[1]]
        start, end = start_and_end.split(' ')
        start_line, start_pos = [int(x) for x in start.split(':')]
        end_line, end_pos = [int(x) for x in end.split(':')]
        
    except:
        print(concept_str)
        raise
    
    return {
        't': t, 'start_line': start_line, 'start_pos': start_pos, 'end_line': end_line, 'end_pos': end_pos,
        'c': c, 
    }

def build_label_vocab(base_dirs):
    seen, label_vocab, label_vocab_size = set(['O']), {'O': 'O'}, 0
    
    for base_dir in base_dirs:
        concept_dir = os.path.join(base_dir, 'concepts')

        assert os.path.isdir(concept_dir), "Directory structure doesn't match!"

        ids = set([x.split('.')[0] for x in os.listdir(concept_dir) if x.endswith('.con')])

        for i in ids:
            with open(os.path.join(concept_dir, '%s.txt.con' % i)) as f:
                concepts = [process_concept(x.strip()) for x in f.readlines()]
            for c in concepts:
                if c['t'] not in seen:
                    label_vocab_size += 1
                    label_vocab[c['t']] = c['t'] # label_vocab_size
                    seen.update([c['t']])
    return label_vocab, label_vocab_size

def reformatter(base, label_vocab, txt_dir = None, concept_dir = None):
    if txt_dir is None: txt_dir = os.path.join(base, 'docs')
    if concept_dir is None: concept_dir = os.path.join(base, 'concepts')
    
    assert os.path.isdir(txt_dir) and os.path.isdir(concept_dir), "Directory structure doesn't match!"
    
    txt_ids = set([x.split('.')[0] for x in os.listdir(txt_dir) if x.endswith('.txt')])
    concept_ids = set([x.split('.')[0] for x in os.listdir(concept_dir) if x.endswith('.con')])
    
    assert txt_ids == concept_ids, (
        "id set doesn't match: txt - concept = %s, concept - txt = %s"
        "" % (str(txt_ids - concept_ids), str(concept_ids - txt_ids))
    )
    
    ids = txt_ids
    
    reprocessed_texts = {}
    for i in ids:
        with open(os.path.join(txt_dir, '%s.txt' % i), mode='r') as f:
            lines = f.readlines()
            txt = [[y for y in x.strip().split(' ') if y.strip() != ''] for x in lines]
            line_starts_with_space = [x.startswith(' ') for x in lines]
        with open(os.path.join(concept_dir, '%s.txt.con' % i), mode='r') as f:
            concepts = [process_concept(x.strip()) for x in f.readlines()]
            
        labels = [['O' for _ in line] for line in txt]
        for c in concepts:
            if c['start_line'] == c['end_line']:
                line = c['start_line']-1
                p_modifier = -1 if line_starts_with_space[line] else 0
                text = (' '.join(txt[line][c['start_pos']+p_modifier:c['end_pos']+1+p_modifier])).lower()
                #assert text == c['c'], (
                #    "Text mismatch! %s vs. %s (id: %s, line: %d)\nFull line: %s"
                #    "" % (c['c'], text, i, line, txt[line])
                #)
                
            for line in range(c['start_line']-1, c['end_line']):
                p_modifier = -1 if line_starts_with_space[line] else 0
                start_pos = c['start_pos']+p_modifier if line == c['start_line']-1 else 0
                end_pos   = c['end_pos']+1+p_modifier if line == c['end_line']-1 else len(txt[line])
                
                if line == c['end_line'] - 1: labels[line][end_pos-1] = label_vocab[c['t']]                
                if line == c['start_line'] - 1: labels[line][start_pos] = label_vocab[c['t']]
                for j in range(start_pos + 1, end_pos-1): labels[line][j] = label_vocab[c['t']]
            
        joined_words_and_labels = [zip(txt_line, label_line) for txt_line, label_line in zip(txt, labels)]

        out_str = '\n\n'.join(
            ['\n'.join(['%s\t%s' % p for p in joined_line]) for joined_line in joined_words_and_labels]
        )
        
        reprocessed_texts[i] = out_str
        
    return reprocessed_texts

In [3]:
label_vocab, label_vocab_size = build_label_vocab([
    './NERdata/raw/Beth_Train/'
])

In [4]:
print (label_vocab)

{'O': 'O', 'person': 'person', 'treatment': 'treatment', 'test': 'test', 'problem': 'problem', 'pronoun': 'pronoun'}


In [5]:
reprocessed_texts = reformatter('./NERdata/raw/Beth_Train/', label_vocab)

In [6]:
print (reprocessed_texts['clinical-273'])

Admission	O
Date	O
:	O

2014-10-14	O

Discharge	O
Date	O
:	O

2014-10-17	O

Date	O
of	O
Birth	O
:	O

1959-12-09	O

Sex	O
:	O

M	O

Service	O
:	O

CCU	O

HISTORY	O
OF	O
PRESENT	O
ILLNESS	O
:	O

This	pronoun
is	O
a	O
55	O
-	O
year-old	O
Caucasian	O
speaking	O
male	O
who	person
is	O
a	O
smoker	O
and	O
has	O
a	O
family	person
history	O
of	O
coronary	problem
artery	problem
disease	problem
,	O
as	O
well	O
as	O
a	O
personal	O
history	O
of	O
hypertension	problem
,	O
who	person
experienced	O
multiple	O
episodes	O
of	O
10/10	problem
substernal	problem
chest	problem
pain	problem
radiating	O
down	O
his	person
left	O
arm	O
last	O
night	O
with	O
his	person
daily	O
activities	O
.	O

Each	problem
episode	problem
lasted	O
approximately	O
15	O
minutes	O
in	O
duration	O
and	O
resolved	O
on	O
their	pronoun
own	O
.	O

This	pronoun
morning	O
while	O
landscaping	O
the	person
patient	person
had	O
unremitting	problem
12-05	problem
pain	problem
with	O
shortness	problem
of	problem
breath	problem
and	O
diaphoresi

In [7]:
merged_txt = '\n'.join(
    [val for key,val in reprocessed_texts.items()]
)

In [8]:
with open('./NERdata.txt', mode='w') as f: f.write(merged_txt)

In [9]:
#all_ner_train_files = glob('./NERdata/*/train.tsv')
#all_ner_val_files = glob('./NERdata/*/devel.tsv')

Read coNLL format data and store it in a Pandas dataframe

In [10]:
class Dataset(object):
    def __init__(self, filepath):
        self.words, self.start_list, self.end_list  = self.read_conll_format(filepath)
        self.labels = self.read_conll_format_labels(filepath)

        assert len(self.words) == len(self.labels)

        self.sentence = ["sentence_{}".format(i+1) for i in range(len(self.words))]
        
    def read_conll_format_labels(self, filename):
        lines = self.read_lines(filename) + ['']
        posts, post = [], []
        for line in lines:
            if line:
                probs = line.split("\t")[1]
                post.append(probs)
                #print("post: ", post)
            elif post:
                posts.append(post)
                post = []
        # a list of lists of words/ labels
        return posts

    def read_conll_format(self, filename):
        lines = self.read_lines(filename) + ['']
        posts, post = [], []
        start_list, end_list, starts, ends = [], [], [], []
        start, end = 0, 0
        for line in lines:
            if line:
                start = end + 1
                words = line.split("\t")[0]
                end = start + len(words) 
                # print("words: ", words)
                post.append(words.lower())
                starts.append(start)
                ends.append(end)
            elif post:
                posts.append(post)
                start_list.append(starts)
                end_list.append(ends)
                post = []
                start, end = 0, 0
        # a list of lists of words/ labels
        return posts, start_list, end_list

    def read_lines(self, filename):
        with open(filename, 'r') as fp:
            lines = [line.strip() for line in fp]
        return lines

In [11]:
data = Dataset('./NERdata.txt')

In [12]:
data_df = pd.DataFrame()
data_df['Sentence'] = data.sentence
data_df['Word'] = data.words
data_df['Entity'] = data.labels

In [13]:
data_df.head(10)

Unnamed: 0,Sentence,Word,Entity
0,sentence_1,"[admission, date, :]","[O, O, O]"
1,sentence_2,[2012-05-21],[O]
2,sentence_3,"[discharge, date, :]","[O, O, O]"
3,sentence_4,[2012-05-25],[O]
4,sentence_5,"[date, of, birth, :]","[O, O, O, O]"
5,sentence_6,[1957-01-05],[O]
6,sentence_7,"[sex, :]","[O, O]"
7,sentence_8,[f],[O]
8,sentence_9,"[service, :]","[O, O]"
9,sentence_10,"[cmed, ccu]","[O, O]"


In [14]:
data_df['Sentence_length'] = data_df.Word.apply(lambda x: len(x))

In [15]:
data_df.Sentence_length.describe()

count    14380.000000
mean        10.095271
std          9.111970
min          1.000000
25%          3.000000
50%          7.000000
75%         14.000000
max         98.000000
Name: Sentence_length, dtype: float64

We use texts which has atleast 10 words and at most 50 words

In [16]:
min_length = 10
max_length = 50

data_df = data_df[data_df.Sentence_length >= min_length][data_df.Sentence_length <= max_length].reset_index(drop=True)

  after removing the cwd from sys.path.


In [17]:
from spacy import displacy

In [28]:
def plot_entity(text,labels):
    original_text = " ".join(text)
    ents = []
    start, end, start_tag = 0, 0, 0
    for i, w in enumerate(text):
        if i > 0:
            start = end + 1
            if labels[i-1] != labels[i]:
                start_tag = start
                
        end = start + len(w)
        
        if labels[i] != 'O':
            if i < len(text):
                if labels[i+1] != labels[i]:
                    ents.append({'start':start_tag,'end':end,'label':labels[i]})
            else:
                ents.append({'start':start_tag,'end':end,'label':labels[i]})
    
    sentence = [{'text': original_text, 'ents': ents,'title': None}]
    
    displacy.render(sentence,style='ent',manual=True)

In [31]:
plot_entity(data_df.Word.iloc[0],data_df.Entity.iloc[0])

In [29]:
plot_entity(data_df.Word.iloc[1],data_df.Entity.iloc[1])

In [30]:
plot_entity(data_df.Word.iloc[2],data_df.Entity.iloc[2])

Split data into train and test datasets

In [34]:
kf = GroupKFold(n_splits=5)

for train_index, val_index in kf.split(data_df.Word,data_df.Entity,data_df.Sentence):
    break

In [35]:
train = data_df.iloc[train_index].reset_index(drop=True)
val = data_df.iloc[val_index].reset_index(drop=True)

In [36]:
print (train.shape, val.shape)

(4520, 4) (1130, 4)


In [37]:
def flatten(elems):
    return [e for elem in elems for e in elem]

corpus = flatten(train.Word)
elems, freqs = zip(*Counter(corpus).most_common())

In [38]:
print ("Total {} words are in the train dataset".format(len(elems)))

Total 8598 words are in the train dataset


### Fix Model Hyperparameters

In [39]:
BATCH_SIZE = 32
EPOCHS = 25
embed_dim = 200 #dimension of the embedding
lstm_out = 100 #dimension of the lstm output 
n_layers = 1 #number of LSTM layers
MAX_NB_WORDS = 6000
tags = list(label_vocab.values())
n_tags = len(tags) + 1

### Tokenization

We tokenize both the input and the output data. For input text data we use top 6000 words. Tokenization is done using keras tokenizer. Unknown words are replace with UNK tag. For shorter texts we use zero padding to make all the texts of same length.

For output, we convert text entities into unique integer tokens and perform one-hot encoding.

In [40]:
tokenizer = Tokenizer(oov_token='UNK', num_words=MAX_NB_WORDS+1)
tokenizer.fit_on_texts(train.Word)
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= MAX_NB_WORDS+1}

word2idx = tokenizer.word_index
word2idx["PAD"] = 0 # Padding

# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}

# Vocabulary Key:Label/Tag -> Value:tag_index
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

#tokenizer.word_index[tokenizer.oov_token] = MAX_NB_WORDS + 1

In [41]:
tag2idx

{'O': 1,
 'PAD': 0,
 'person': 2,
 'problem': 5,
 'pronoun': 6,
 'test': 4,
 'treatment': 3}

In [42]:
trainX = pad_sequences(maxlen=max_length, sequences=tokenizer.texts_to_sequences(train.Word), padding="post", value=word2idx["PAD"])
trainy = [[tag2idx[j] for j in i] for i in train.Entity]
trainy = pad_sequences(maxlen=max_length, sequences=trainy, padding="post", value=tag2idx["PAD"])
#trainy = [to_categorical(i, num_classes=n_tags) for i in tqdm(trainy)] 

valX = pad_sequences(maxlen=max_length, sequences=tokenizer.texts_to_sequences(val.Word), padding="post", value=word2idx["PAD"])
valy = [[tag2idx[j] for j in i] for i in val.Entity]
valy = pad_sequences(maxlen=max_length, sequences=valy, padding="post", value=tag2idx["PAD"])
#valy = [to_categorical(i, num_classes=n_tags) for i in tqdm(valy)] 

trainX = np.array(trainX)
trainy = np.array(trainy)
valX = np.array(valX)
valy = np.array(valy)

trainy = to_categorical(trainy)
valy = to_categorical(valy)

print (trainX.shape, valX.shape, trainy.shape, valy.shape)

(4520, 50) (1130, 50) (4520, 50, 7) (1130, 50, 7)


### Pretrained Embeddings

We use pretrained PubMed word embeddings. In our NER model, the first layer is embedding layer. Instead of learning word embeddings in the NER model directly, we use pretrained embeddings which already contains rich semantic embedding of each word tokens.

In [43]:
embeddings_index = {}
f = open('pubmed_wv.txt','r',encoding='utf8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    if coefs.shape[0] == embed_dim and word in word2idx:
        embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2665548it [07:57, 5576.69it/s] 

Found 4979 word vectors.





In [44]:
embedding_matrix = np.zeros((len(word2idx), embed_dim))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

### Model Building

We use 3 different models.

* BiLSTM model - model uses 1-layer LSTM, intermediate dense layer and output softmax layer
* BiLSTM-CNN model - model uses CNN layer for feature extraction, followed by BiLSTM, dense and output layer
* BiLSTM-attention model - model contains BiLSTM layer, followed by multi headed self-attention layer and output layer

We use categorical cross entropy to calculate loss between original entities and predicted entities

In [45]:
class Attention(Layer):
    """Multi-headed attention layer."""
    
    def __init__(self, hidden_size, 
                 num_heads = 8, 
                 attention_dropout=.1,
                 trainable=True,
                 name='Attention'):
        
        if hidden_size % num_heads != 0:
            raise ValueError("Hidden size must be evenly divisible by the number of heads.")
            
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.trainable = trainable
        self.attention_dropout = attention_dropout
        self.dense = tf.keras.layers.Dense(self.hidden_size, use_bias=False)
        super(Attention, self).__init__(name=name)

    def split_heads(self, x):
        """Split x into different heads, and transpose the resulting value.
        The tensor is transposed to insure the inner dimensions hold the correct
        values during the matrix multiplication.
        Args:
          x: A tensor with shape [batch_size, length, hidden_size]
        Returns:
          A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads]
        """
        with tf.name_scope("split_heads"):
            batch_size = tf.shape(x)[0]
            length = tf.shape(x)[1]

            # Calculate depth of last dimension after it has been split.
            depth = (self.hidden_size // self.num_heads)

            # Split the last dimension
            x = tf.reshape(x, [batch_size, length, self.num_heads, depth])

            # Transpose the result
            return tf.transpose(x, [0, 2, 1, 3])
    
    def combine_heads(self, x):
        """Combine tensor that has been split.
        Args:
          x: A tensor [batch_size, num_heads, length, hidden_size/num_heads]
        Returns:
          A tensor with shape [batch_size, length, hidden_size]
        """
        with tf.name_scope("combine_heads"):
            batch_size = tf.shape(x)[0]
            length = tf.shape(x)[2]
            x = tf.transpose(x, [0, 2, 1, 3])  # --> [batch, length, num_heads, depth]
            return tf.reshape(x, [batch_size, length, self.hidden_size])        

    def call(self, inputs):
        """Apply attention mechanism to inputs.
        Args:
          inputs: a tensor with shape [batch_size, length_x, hidden_size]
        Returns:
          Attention layer output with shape [batch_size, length_x, hidden_size]
        """
        # Google developper use tf.layer.Dense to linearly project the queries, keys, and values.
        q = self.dense(inputs)
        k = self.dense(inputs)
        v = self.dense(inputs)

        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)
        
        # Scale q to prevent the dot product between q and k from growing too large.
        depth = (self.hidden_size // self.num_heads)
        q *= depth ** -0.5
        
        logits = tf.matmul(q, k, transpose_b=True)
        # logits += self.bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        
        if self.trainable:
            weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout)
        
        attention_output = tf.matmul(weights, v)
        attention_output = self.combine_heads(attention_output)
        attention_output = self.dense(attention_output)
        return attention_output
        
    def compute_output_shape(self, input_shape):
        return tf.TensorShape(input_shape)

In [46]:
def model_lstm():
    input = Input(shape=(max_length,))
    x = Embedding(input_dim=len(word2idx), output_dim=embed_dim, weights=[embedding_matrix], trainable=False)(input)  # default: 100-dim embedding
    x = SpatialDropout1D(.2)(x)
    x = Bidirectional(LSTM(units=lstm_out, return_sequences=True))(x)  # biLSTM
    x = SpatialDropout1D(.2)(x)
    x = Dense(50)(x) # dense
    x = SpatialDropout1D(.2)(x)
    out = Dense(len(tag2idx),activation='softmax')(x) #output

    model = Model(input, out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [47]:
def model_cnn_lstm():
    input = Input(shape=(max_length,))
    x = Embedding(input_dim=len(word2idx), output_dim=embed_dim, weights=[embedding_matrix], trainable=False)(input)  # default: 100-dim embedding
    x = SpatialDropout1D(.2)(x)
    x = Conv1D(filters=lstm_out,kernel_size=5,strides=1,padding='same')(x)
    x = SpatialDropout1D(.2)(x)
    x = Bidirectional(LSTM(units=lstm_out, return_sequences=True))(x)  # biLSTM
    x = SpatialDropout1D(.2)(x)
    x = Dense(50)(x) # dense
    x = SpatialDropout1D(.2)(x)
    out = Dense(len(tag2idx),activation='softmax')(x) #output

    model = Model(input, out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [48]:
def model_lstm_attention():
    input = Input(shape=(max_length,))
    x = Embedding(input_dim=len(word2idx), output_dim=embed_dim, weights=[embedding_matrix], trainable=False)(input)  # default: 100-dim embedding
    x = SpatialDropout1D(.2)(x)
    x = Bidirectional(LSTM(units=lstm_out, return_sequences=True))(x)  # biLSTM
    x = Attention(hidden_size=2*lstm_out)(x)
    out = Dense(len(tag2idx),activation='softmax')(x) #output

    model = Model(input, out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [49]:
model1 = model_lstm()
model1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 200)           1200400   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 200)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 200)           240800    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 50, 200)           0         
_________________________________________________________________
dense (Dense)                (None, 50, 50)            10050     
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 50, 50)            0     

In [50]:
model2 = model_cnn_lstm()
model2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 200)           1200400   
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 50, 200)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 50, 100)           100100    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 50, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           160800    
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 50, 200)           0   

In [51]:
model3 = model_lstm_attention()
model3.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 50, 200)           1200400   
_________________________________________________________________
spatial_dropout1d_7 (Spatial (None, 50, 200)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50, 200)           240800    
_________________________________________________________________
Attention (Attention)        (None, None, 200)         40000     
_________________________________________________________________
dense_5 (Dense)              (None, None, 7)           1407      
Total params: 1,482,607
Trainable params: 282,207
Non-trainable params: 1,200,400
___________________________________________

### Model Fitting and Evaluation

We run each model on the training dataset for 25 epochs. As most of the word tokens are tagged with the tag "O", by default our problem is imbalance. In this case, accuracy metric is not suitable. We use a custom callback using macro F1 score. If the model achieves better F1 score, we store the model. Early stopping is performed using F1 score.

During evaluation we use entity level classification report which calculates F1, precision, recall for each entity separately. The report describes how the model performs for different entities.

In [52]:
class F1Callback(tf.keras.callbacks.Callback):
    def __init__(self, model, inputs, targets, filename, patience=5):
        self.model = model
        self.inputs = inputs
        self.targets = targets.argmax(-1).reshape(-1)
        self.best_score = -1
        self.bad_epoch = 0
        self.filename = filename
        self.patience = patience
        
    def on_epoch_end(self, epoch, logs):
        pred = self.model.predict(self.inputs).argmax(-1).reshape(-1)
        score = f1_score(self.targets, pred, average='macro')
        print(f'\nF1 Macro Score: {score:.5f}')
        
        if score > self.best_score:
            self.best_score = score
            self.bad_epoch = 0
            self.model.save_weights(self.filename)
            print ("\nModel saved in {}".format(self.filename))
        else:
            self.bad_epoch += 1
            
        if self.bad_epoch >= self.patience:
            print("\nEpoch %05d: early stopping Threshold" % epoch)
            self.model.stop_training = True

In [53]:
f1callback1 = F1Callback(model1,valX,valy,'lstm.h5')

history1 = model1.fit(trainX, trainy, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_data=(valX,valy),
                   callbacks = [f1callback1])

Epoch 1/25
F1 Macro Score: 0.65924

Model saved in lstm.h5
Epoch 2/25
F1 Macro Score: 0.81503

Model saved in lstm.h5
Epoch 3/25
F1 Macro Score: 0.83640

Model saved in lstm.h5
Epoch 4/25
F1 Macro Score: 0.84965

Model saved in lstm.h5
Epoch 5/25
F1 Macro Score: 0.85769

Model saved in lstm.h5
Epoch 6/25
F1 Macro Score: 0.86706

Model saved in lstm.h5
Epoch 7/25
F1 Macro Score: 0.86975

Model saved in lstm.h5
Epoch 8/25
F1 Macro Score: 0.87818

Model saved in lstm.h5
Epoch 9/25
F1 Macro Score: 0.87750
Epoch 10/25
F1 Macro Score: 0.88782

Model saved in lstm.h5
Epoch 11/25
F1 Macro Score: 0.88550
Epoch 12/25
F1 Macro Score: 0.89021

Model saved in lstm.h5
Epoch 13/25
F1 Macro Score: 0.89210

Model saved in lstm.h5
Epoch 14/25
F1 Macro Score: 0.89290

Model saved in lstm.h5
Epoch 15/25
F1 Macro Score: 0.89523

Model saved in lstm.h5
Epoch 16/25
F1 Macro Score: 0.89691

Model saved in lstm.h5
Epoch 17/25
F1 Macro Score: 0.89954

Model saved in lstm.h5
Epoch 18/25
F1 Macro Score: 0.89814
E

In [54]:
val_y_actual = valy.argmax(-1)
val_y_actual = np.array([[idx2tag[i] for i in j] for j in val_y_actual])

In [55]:
model1.load_weights('lstm.h5')

val_pred1 = model1.predict(valX).argmax(-1)
val_pred1 = np.array([[idx2tag[i] for i in j] for j in val_pred1])

report = flat_classification_report(y_pred=val_pred1, y_true=val_y_actual)
print(report)

              precision    recall  f1-score   support

           O       0.93      0.95      0.94     14494
         PAD       1.00      1.00      1.00     35337
      person       0.94      0.87      0.90      1303
     problem       0.84      0.81      0.82      2441
     pronoun       0.96      0.97      0.97       209
        test       0.89      0.83      0.86      1326
   treatment       0.87      0.79      0.83      1390

    accuracy                           0.97     56500
   macro avg       0.92      0.89      0.90     56500
weighted avg       0.97      0.97      0.97     56500



Simple BiLSTM model achieves 97% F1 score for entity pronoun, 90% for persons, 82% for problems, 86% for tests and 83% for treatments.

In [56]:
f1callback2 = F1Callback(model2,valX,valy,'cnn_lstm.h5')

history2 = model2.fit(trainX, trainy, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_data=(valX,valy),
                   callbacks = [f1callback2])

Epoch 1/25
F1 Macro Score: 0.67354

Model saved in cnn_lstm.h5
Epoch 2/25
  2/142 [..............................] - ETA: 5s - loss: 0.1756 - accuracy: 0.9444

  'precision', 'predicted', average, warn_for)


F1 Macro Score: 0.81853

Model saved in cnn_lstm.h5
Epoch 3/25
F1 Macro Score: 0.84471

Model saved in cnn_lstm.h5
Epoch 4/25
F1 Macro Score: 0.85065

Model saved in cnn_lstm.h5
Epoch 5/25
F1 Macro Score: 0.85545

Model saved in cnn_lstm.h5
Epoch 6/25
F1 Macro Score: 0.86827

Model saved in cnn_lstm.h5
Epoch 7/25
F1 Macro Score: 0.86540
Epoch 8/25
F1 Macro Score: 0.86811
Epoch 9/25
F1 Macro Score: 0.87897

Model saved in cnn_lstm.h5
Epoch 10/25
F1 Macro Score: 0.88060

Model saved in cnn_lstm.h5
Epoch 11/25
F1 Macro Score: 0.89141

Model saved in cnn_lstm.h5
Epoch 12/25
F1 Macro Score: 0.89232

Model saved in cnn_lstm.h5
Epoch 13/25
F1 Macro Score: 0.89051
Epoch 14/25
F1 Macro Score: 0.89563

Model saved in cnn_lstm.h5
Epoch 15/25
F1 Macro Score: 0.89258
Epoch 16/25
F1 Macro Score: 0.89092
Epoch 17/25
F1 Macro Score: 0.89397
Epoch 18/25
F1 Macro Score: 0.89668

Model saved in cnn_lstm.h5
Epoch 19/25
F1 Macro Score: 0.89732

Model saved in cnn_lstm.h5
Epoch 20/25
F1 Macro Score: 0.90021

In [57]:
model2.load_weights('cnn_lstm.h5')

val_pred2 = model2.predict(valX).argmax(-1)
val_pred2 = np.array([[idx2tag[i] for i in j] for j in val_pred2])

report = flat_classification_report(y_pred=val_pred2, y_true=val_y_actual)
print(report)

              precision    recall  f1-score   support

           O       0.94      0.94      0.94     14494
         PAD       0.99      1.00      1.00     35337
      person       0.93      0.87      0.90      1303
     problem       0.84      0.80      0.82      2441
     pronoun       0.97      0.97      0.97       209
        test       0.86      0.87      0.87      1326
   treatment       0.82      0.82      0.82      1390

    accuracy                           0.97     56500
   macro avg       0.91      0.90      0.90     56500
weighted avg       0.96      0.97      0.96     56500



In [58]:
f1callback3 = F1Callback(model3,valX,valy,'lstm_attention.h5')

history3 = model3.fit(trainX, trainy, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_data=(valX,valy),
                   callbacks = [f1callback3])

Epoch 1/25
F1 Macro Score: 0.30188

Model saved in lstm_attention.h5
Epoch 2/25
  1/142 [..............................] - ETA: 0s - loss: 0.3914 - accuracy: 0.8644

  'precision', 'predicted', average, warn_for)


F1 Macro Score: 0.49629

Model saved in lstm_attention.h5
Epoch 3/25
F1 Macro Score: 0.53163

Model saved in lstm_attention.h5
Epoch 4/25
F1 Macro Score: 0.56362

Model saved in lstm_attention.h5
Epoch 5/25
F1 Macro Score: 0.65382

Model saved in lstm_attention.h5
Epoch 6/25
F1 Macro Score: 0.69453

Model saved in lstm_attention.h5
Epoch 7/25
F1 Macro Score: 0.71582

Model saved in lstm_attention.h5
Epoch 8/25
F1 Macro Score: 0.75365

Model saved in lstm_attention.h5
Epoch 9/25
F1 Macro Score: 0.77377

Model saved in lstm_attention.h5
Epoch 10/25
F1 Macro Score: 0.79086

Model saved in lstm_attention.h5
Epoch 11/25
F1 Macro Score: 0.81423

Model saved in lstm_attention.h5
Epoch 12/25
F1 Macro Score: 0.80647
Epoch 13/25
F1 Macro Score: 0.82967

Model saved in lstm_attention.h5
Epoch 14/25
F1 Macro Score: 0.81639
Epoch 15/25
F1 Macro Score: 0.83342

Model saved in lstm_attention.h5
Epoch 16/25
F1 Macro Score: 0.82977
Epoch 17/25
F1 Macro Score: 0.83594

Model saved in lstm_attention.h5
E

In [59]:
model3.load_weights('lstm_attention.h5')

val_pred3 = model3.predict(valX).argmax(-1)
val_pred3 = np.array([[idx2tag[i] for i in j] for j in val_pred3])

report = flat_classification_report(y_pred=val_pred3, y_true=val_y_actual)
print(report)

              precision    recall  f1-score   support

           O       0.92      0.94      0.93     14494
         PAD       1.00      0.99      0.99     35337
      person       0.91      0.85      0.88      1303
     problem       0.80      0.76      0.78      2441
     pronoun       0.95      0.89      0.92       209
        test       0.85      0.78      0.81      1326
   treatment       0.79      0.75      0.77      1390

    accuracy                           0.96     56500
   macro avg       0.89      0.85      0.87     56500
weighted avg       0.96      0.96      0.96     56500



### Prediction Validation

We convert the fixed length outputs to original lengths and inspect predictions on validation data.

In [60]:
predicted_entity = []

for i,s in enumerate(tqdm(val.Sentence.unique())):
    len_sentence = len(val[val.Sentence == s].Word.iloc[0])
    if len_sentence < max_length:
        predicted_entity.append(val_pred1[i,:len_sentence].tolist())
    else:
        predicted_entity.append(val_pred1[i][:max_length].tolist() + ['O']*(len_sentence - max_length))
        
val['Predicted_Entity'] = predicted_entity

100%|█████████████████████████████████████████████████████████████████████████████| 1130/1130 [00:01<00:00, 938.72it/s]


In [61]:
val.head(5)

Unnamed: 0,Sentence,Word,Entity,Sentence_length,Predicted_Entity
0,sentence_13,"[the, patient, saw, her, pcp, and, was, known,...","[person, person, O, person, person, O, O, O, O...",22,"[person, person, O, person, person, O, O, O, O..."
1,sentence_16,"[chest, x-ray, was, remarkable, for, a, questi...","[test, test, O, O, O, O, O, problem, problem, ...",12,"[test, test, O, O, O, problem, problem, proble..."
2,sentence_19,"[asthma, /, copd, on, bipap, ,, history, of, i...","[problem, O, problem, O, treatment, O, O, O, t...",10,"[problem, O, problem, O, treatment, O, O, O, t..."
3,sentence_43,"[from, admission, ,, vital, signs, 98.3, ,, bl...","[O, O, O, test, test, O, O, test, test, O, O, ...",29,"[O, O, O, test, test, O, O, test, test, O, O, ..."
4,sentence_45,"[she, is, a, chronically, ill-appearing, femal...","[person, O, O, O, O, O, O, O, O, problem, prob...",12,"[person, O, O, O, O, O, O, O, O, problem, prob..."


In [62]:
print ("Actual :")
plot_entity(val.Word.iloc[0],val.Entity.iloc[0])
print ("Predicted :")
plot_entity(val.Word.iloc[0],val.Predicted_Entity.iloc[0])

Actual :


Predicted :


In [63]:
print ("Actual :")
plot_entity(val.Word.iloc[1],val.Entity.iloc[1])
print ("Predicted :")
plot_entity(val.Word.iloc[1],val.Predicted_Entity.iloc[1])

Actual :


Predicted :


In [64]:
print ("Actual :")
plot_entity(val.Word.iloc[2],val.Entity.iloc[2])
print ("Predicted :")
plot_entity(val.Word.iloc[2],val.Predicted_Entity.iloc[2])

Actual :


Predicted :


In [65]:
print ("Actual :")
plot_entity(val.Word.iloc[3],val.Entity.iloc[3])
print ("Predicted :")
plot_entity(val.Word.iloc[3],val.Predicted_Entity.iloc[3])

Actual :


Predicted :


In [66]:
print ("Actual :")
plot_entity(val.Word.iloc[4],val.Entity.iloc[4])
print ("Predicted :")
plot_entity(val.Word.iloc[4],val.Predicted_Entity.iloc[4])

Actual :


Predicted :
