# Named-entity recognition with Bidirectional LSTM-CNNs

- Paper: https://arxiv.org/abs/1511.08308
- Keras code: https://github.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs
- Biomedical NER Data: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/tree/master/data

In [1]:
import numpy as np 
from validation import compute_f1
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from prepro import readfile,createBatches,createMatrices,iterate_minibatches,addCharInformatioin,padding
from keras.utils import Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform
from pprint import pprint

import gensim

Using TensorFlow backend.


In [2]:
epochs = 50
embedding_dim = 200

In [3]:
def tag_dataset(dataset):
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i,data in enumerate(dataset):    
        tokens, casing,char, labels = data
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1) #Predict the classes            
        correctLabels.append(labels)
        predLabels.append(pred)
        b.update(i)
    return predLabels, correctLabels

In [4]:
trainSentences = readfile("data/train.tsv")
devSentences = readfile("data/devel.tsv")
testSentences = readfile("data/test.tsv")

In [5]:
trainSentences[100]

[['The', 'O\n'],
 ['positive', 'O\n'],
 ['control', 'O\n'],
 ['for', 'O\n'],
 ['DMT1', 'O\n'],
 ['up', 'O\n'],
 ['-', 'O\n'],
 ['regulation', 'O\n'],
 ['was', 'O\n'],
 ['a', 'O\n'],
 ['murine', 'O\n'],
 ['model', 'O\n'],
 ['of', 'O\n'],
 ['dietary', 'B-Disease\n'],
 ['iron', 'I-Disease\n'],
 ['deficiency', 'I-Disease\n'],
 ['that', 'O\n'],
 ['demonstrated', 'O\n'],
 ['greatly', 'O\n'],
 ['increased', 'O\n'],
 ['levels', 'O\n'],
 ['of', 'O\n'],
 ['duodenal', 'O\n'],
 ['DMT1', 'O\n'],
 ['(', 'O\n'],
 ['IRE', 'O\n'],
 [')', 'O\n'],
 ['mRNA', 'O\n'],
 ['.', 'O\n']]

In [6]:
trainSentences = addCharInformatioin(trainSentences)
devSentences = addCharInformatioin(devSentences)
testSentences = addCharInformatioin(testSentences)

In [7]:
trainSentences[100]

[['The', ['T', 'h', 'e'], 'O\n'],
 ['positive', ['p', 'o', 's', 'i', 't', 'i', 'v', 'e'], 'O\n'],
 ['control', ['c', 'o', 'n', 't', 'r', 'o', 'l'], 'O\n'],
 ['for', ['f', 'o', 'r'], 'O\n'],
 ['DMT1', ['D', 'M', 'T', '1'], 'O\n'],
 ['up', ['u', 'p'], 'O\n'],
 ['-', ['-'], 'O\n'],
 ['regulation', ['r', 'e', 'g', 'u', 'l', 'a', 't', 'i', 'o', 'n'], 'O\n'],
 ['was', ['w', 'a', 's'], 'O\n'],
 ['a', ['a'], 'O\n'],
 ['murine', ['m', 'u', 'r', 'i', 'n', 'e'], 'O\n'],
 ['model', ['m', 'o', 'd', 'e', 'l'], 'O\n'],
 ['of', ['o', 'f'], 'O\n'],
 ['dietary', ['d', 'i', 'e', 't', 'a', 'r', 'y'], 'B-Disease\n'],
 ['iron', ['i', 'r', 'o', 'n'], 'I-Disease\n'],
 ['deficiency',
  ['d', 'e', 'f', 'i', 'c', 'i', 'e', 'n', 'c', 'y'],
  'I-Disease\n'],
 ['that', ['t', 'h', 'a', 't'], 'O\n'],
 ['demonstrated',
  ['d', 'e', 'm', 'o', 'n', 's', 't', 'r', 'a', 't', 'e', 'd'],
  'O\n'],
 ['greatly', ['g', 'r', 'e', 'a', 't', 'l', 'y'], 'O\n'],
 ['increased', ['i', 'n', 'c', 'r', 'e', 'a', 's', 'e', 'd'], 'O\n'],


In [8]:
labelSet = set()
words = {}

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for token,char,label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

In [9]:
# :: Create a mapping for the labels ::
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

In [10]:
# :: Hard coded case lookup ::
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')

In [12]:
# :: Read in word embeddings ::
word2Idx = {}

In [13]:
# Add 'PADDING_TOKEN' and 'UNKNOWN_TOKEN' to word2Idx
word2Idx['PADDING_TOKEN'] = 0
word2Idx['UNKNOWN_TOKEN'] = 1 

In [None]:
# 미리 학습한 Word2Vec 파일
pretrained_w2v_file = '../../PubMed-w2v.bin'

# 미리 학습한 Word2Vec 파일 읽기
pretrained_w2v = gensim.models.KeyedVectors.load_word2vec_format(pretrained_w2v_file, binary=True)

In [14]:
embedding_matrix = np.zeros((len(words)+2, embedding_dim))
for word, i in words.items():
    word2Idx[word] = len(word2Idx)
    if word in pretrained_w2v.vocab:
        embedding_matrix[i] = pretrained_w2v.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 0


In [15]:
char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_<>()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)

In [16]:
train_set = padding(createMatrices(trainSentences,word2Idx,  label2Idx, case2Idx,char2Idx))
dev_set = padding(createMatrices(devSentences,word2Idx, label2Idx, case2Idx,char2Idx))
test_set = padding(createMatrices(testSentences, word2Idx, label2Idx, case2Idx,char2Idx))

In [17]:
idx2Label = {v: k for k, v in label2Idx.items()}

In [18]:
train_batch,train_batch_len = createBatches(train_set)
dev_batch,dev_batch_len = createBatches(dev_set)
test_batch,test_batch_len = createBatches(test_set)

In [19]:
words_input = Input(shape=(None,),dtype='int32',name='words_input')
words = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],  weights=[embedding_matrix], trainable=False)(words_input)
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)
character_input=Input(shape=(None,52,),name='char_input')
embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)
output = concatenate([words, casing,char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)
model = Model(inputs=[words_input, casing_input,character_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')

In [20]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 52)     0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 30) 2910        char_input[0][0]                 
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, None, 52, 30) 0           char_embedding[0][0]             
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 52, 30) 2730        dropout_1[0][0]                  
__________________________________________________________________________________________________
time_distr

In [21]:
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(train_batch_len))
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len)):
        labels, tokens, casing,char = batch       
        model.train_on_batch([tokens, casing,char], labels)
        a.update(i)
    print(' ')

Epoch 0/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50

KeyboardInterrupt: 

In [22]:
#   Performance on dev dataset        
predLabels, correctLabels = tag_dataset(dev_batch)        
pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels, idx2Label)
print("Dev-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_dev, rec_dev, f1_dev))



In [23]:
#   Performance on test dataset       
predLabels, correctLabels = tag_dataset(test_batch)        
pre_test, rec_test, f1_test= compute_f1(predLabels, correctLabels, idx2Label)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_test, rec_test, f1_test))

