Disease named entity recognition using bidirectional recurrent neural networks

In [119]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import unicodedata
import project_path
import lib
import utils
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [120]:
#Define constants 
PAD_LENGTH=180 
LSTM_UNITS=120 
BATCH_SIZE=48

In [121]:
#Import train and test datasets
train_data = pd.read_csv("train.csv", encoding="latin1")
test_data = pd.read_csv("test.csv",encoding="latin1")

In [122]:
train_data.head()

Unnamed: 0,id,Doc_ID,Sent_ID,Word,tag
0,1,1,1,Obesity,O
1,2,1,1,in,O
2,3,1,1,Low-,O
3,4,1,1,and,O
4,5,1,1,Middle-Income,O


In [123]:
test_data.head()

Unnamed: 0,id,Doc_ID,Sent_ID,Word
0,4543834,30001,191283,CCCVA
1,4543835,30001,191283,","
2,4543836,30001,191283,MANOVA
3,4543837,30001,191283,","
4,4543838,30001,191283,my


In [124]:
#Check number of unique docs, sentences and words in training data
train_data.nunique()

id         4543833
Doc_ID       30000
Sent_ID     191282
Word        184505
tag              3
dtype: int64

In [125]:
#Check number of unique docs, sentences and words in test data
test_data.nunique()

id         2994463
Doc_ID       20000
Sent_ID     125840
Word        139891
dtype: int64

In [126]:
#Creating dictionaries of unique words and tags
words=list(set(train_data["Word"].append(test_data["Word"]).values)) #removing duplicate entries using set
words.append("ENDPAD")  
tags=list(set(train_data["tag"].values)) 


len_words=len(words)

len_tags=len(tags) 

print("Length of word dictionary: ",len_words)
print("Length of tag dictionary: ",len_tags) 


Length of word dictionary:  257203
Length of tag dictionary:  3


In [127]:
#Preprocessing 
#Convert greek characters to ASCII characters
words=[unicodedata.normalize('NFKD',str(w)).encode('ascii','ignore') for w in words] 

In [128]:
#Create dictionaries that has all unique words/tags as keys and unique ID as values
word2idx=dict(zip(words,range(0,len_words-1))) 
tag2idx=dict(zip(tags,range(0,len_tags-1)))

In [129]:
dict(list(word2idx.items())[:3])

{b'nan': 0, b'detoxifying': 1, b'Operability': 2}

In [130]:
dict(list(tag2idx.items())[:3])

{'B-indications': 0, 'O': 1}

In [131]:
#Get training sentences in a list
train_sentences = get_tagged_sentences(train_data) 
print(train_sentences[0:2])

[[('Obesity', 'O'), ('in', 'O'), ('Low-', 'O'), ('and', 'O'), ('Middle-Income', 'O'), ('Countries', 'O'), (':', 'O'), ('Burden', 'O'), (',', 'O'), ('Drivers', 'O'), (',', 'O'), ('and', 'O'), ('Emerging', 'O'), ('Challenges', 'O'), ('.', 'O')], [('We', 'O'), ('have', 'O'), ('reviewed', 'O'), ('the', 'O'), ('distinctive', 'O'), ('features', 'O'), ('of', 'O'), ('excess', 'O'), ('weight', 'O'), (',', 'O'), ('its', 'O'), ('causes', 'O'), (',', 'O'), ('and', 'O'), ('related', 'O'), ('prevention', 'O'), ('and', 'O'), ('management', 'O'), ('efforts', 'O'), (',', 'O'), ('as', 'O'), ('well', 'O'), ('as', 'O'), ('data', 'O'), ('gaps', 'O'), ('and', 'O'), ('recommendations', 'O'), ('for', 'O'), ('future', 'O'), ('research', 'O'), ('in', 'O'), ('low-', 'O'), ('and', 'O'), ('middle-income', 'O'), ('countries', 'O'), ('(', 'O'), ('LMICs', 'O'), (')', 'O'), ('.', 'O')]]


In [132]:
#Get test sentences in a list
test_sentences=get_test_sentences(test_data)
print(test_sentences[0:2])

[['CCCVA', ',', 'MANOVA', ',', 'my', 'black', 'hen', '.'], ['Comments', 'on', 'repeated', 'measures', '.']]


In [133]:
#Feature Extraction for DL
#Convert words to indices for train and test sentences. Also convert greek characters to ASCII characters
X_train = [[word2idx[unicodedata.normalize('NFKD', str(w[0])).
               encode('ascii','ignore')] for w in s] for s in train_sentences]
X_test = [[word2idx[unicodedata.normalize('NFKD', str(w)).
                    encode('ascii','ignore')] for w in s] for s in test_sentences]


#Pad train and test sentences to PAD_LENGTH(180 words)
X_train=pad_sequences(maxlen=PAD_LENGTH,sequences=X_train,padding="post",value=len_words-1)
X_test=pad_sequences(maxlen=PAD_LENGTH,sequences=X_test,padding="post",value=len_words-1)


In [134]:
#Convert tags to indices for train sentences
word2idx = {w: i for i, w in enumerate(words)}
# Creating tags to indices dictionary.
tag2idx = {t: i for i, t in enumerate(tags)}
y = [[tag2idx[w[1]] for w in s] for s in train_sentences]
#Pad tag labels to PAD_LENGTH(180 words)
y=pad_sequences(maxlen=PAD_LENGTH,sequences=y,padding="post",value=tag2idx["O"])

#One hot encode labels
y=[to_categorical(i,num_classes=len_tags)for i in y]

In [None]:
#Input layer with input of 180 dimensional word indices 
input = Input(shape=(PAD_LENGTH,)) 

#Embedding layer 
model=Embedding(input_dim=len_words,output_dim=PAD_LENGTH,input_length=PAD_LENGTH)(input)

#Adding dropout layer
model=Dropout(0.2)(model) 

#Bidirectional LSTM layer 
model=Bidirectional(LSTM(units=LSTM_N,return_sequences=True,recurrent_dropout=0.1))(model) 

#Time distributed dense layer 
output=TimeDistributed(Dense(n_tags,activation="softmax"))(model) #Softmax output layer

model=Model(input,output)

model.compile(optimizer="adam",loss="categorical_crossentropy,metrics=["accuracy","mean_absolute_error"])
history=model.fit(X_train,np.array(y),batch_size=BATCH_SIZE,epochs=2,validation_split=0.05,verbose=1)