In [1]:
# Keras==1.0.6
import numpy as np
from keras.layers.recurrent import LSTM
from keras.layers.core import  Activation
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
import tensorflow_hub as hub
from keras.layers import Dense,TimeDistributed
import keras.layers as layers
from keras.models import Model,Sequential
import tensorflow as tf
from keras import backend as K
from keras.engine.topology import Layer
from keras.layers import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
raw = open('wikigold.conll.txt', 'r').readlines()
 
all_x = []
point = []
for line in raw:
    stripped_line = line.strip().split(' ')
    point.append(stripped_line)
    if line == '\n':
        all_x.append(point[:-1])
        point = []
all_x = all_x[:-1]
 
lengths = [len(x) for x in all_x]
print('Input sequence length range: ', max(lengths), min(lengths))
 
short_x = [x for x in all_x if len(x) < 64]
 
X = [[c[0] for c in x] for x in short_x]
y = [[c[1] for c in y] for y in short_x]
 
all_text = [c for x in X for c in x]


Input sequence length range:  144 1


In [3]:
words = list(set(all_text))
word2ind = {word: index for index, word in enumerate(words)}
ind2word = {index: word for index, word in enumerate(words)}
labels = list(set([c for x in y for c in x]))
label2ind = {label: (index + 1) for index, label in enumerate(labels)}
ind2label = {(index + 1): label for index, label in enumerate(labels)}
print('Vocabulary size:', len(word2ind), len(label2ind))

Vocabulary size: 8285 5


In [4]:
maxlen = max([len(x) for x in X])
print('Maximum sequence length:', maxlen)
print(label2ind)

Maximum sequence length: 63
{'I-MISC': 1, 'I-ORG': 2, 'I-PER': 3, 'I-LOC': 4, 'O': 5}


In [5]:
def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result
 
X_enc = [[word2ind[c] for c in x] for x in X]
max_label = max(label2ind.values()) + 1
y_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y]
y_enc = [[encode(c, max_label) for c in ey] for ey in y_enc]
 
X_enc = pad_sequences(X_enc, maxlen=maxlen)
y_enc = pad_sequences(y_enc, maxlen=maxlen)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size=11*32, train_size=45*32, random_state=42)
print('Training and testing tensor shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
 
max_features = len(word2ind)
embedding_size = 300
hidden_size = 32
out_size = len(label2ind) + 1
batch_size = 32

Training and testing tensor shapes: (1440, 63) (352, 63) (1440, 63, 6) (352, 63, 6)


In [7]:
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.


In [8]:
sess = tf.Session()
K.set_session(sess)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [9]:

class ELMoEmbedding(Layer):

    def __init__(self, idx2word, output_mode="default", trainable=True, **kwargs):
        assert output_mode in ["default", "word_emb", "lstm_outputs1", "lstm_outputs2", "elmo"]
        assert trainable in [True, False]
        self.idx2word = idx2word
        self.output_mode = output_mode
        self.trainable = trainable
        self.max_length = None
        self.word_mapping = None
        self.lookup_table = None
        self.elmo_model = None
        self.embedding = None
        super(ELMoEmbedding, self).__init__(**kwargs)

    def build(self, input_shape):
        self.max_length = input_shape[1]
        self.word_mapping = [x[1] for x in sorted(self.idx2word.items(), key=lambda x: x[0])]
        self.lookup_table = tf.contrib.lookup.index_to_string_table_from_tensor(self.word_mapping, default_value="<UNK>")
        self.lookup_table.init.run(session=K.get_session())
        self.elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=self.trainable)
        super(ELMoEmbedding, self).build(input_shape)

    def call(self, x):
        x = tf.cast(x, dtype=tf.int64)
        sequence_lengths = tf.cast(tf.count_nonzero(x, axis=1), dtype=tf.int32)
        strings = tf.squeeze(self.lookup_table.lookup(x))
        inputs = {
            "tokens": strings,
            "sequence_len": sequence_lengths
        }
        return self.elmo_model(inputs, signature="tokens", as_dict=True)[self.output_mode]

    def compute_output_shape(self, input_shape):
        if self.output_mode == "default":
            return (input_shape[0], 1024)
        if self.output_mode == "word_emb":
            return (input_shape[0], self.max_length, 512)
        if self.output_mode == "lstm_outputs1":
            return (input_shape[0], self.max_length, 1024)
        if self.output_mode == "lstm_outputs2":
            return (input_shape[0], self.max_length, 1024)
        if self.output_mode == "elmo":
            return (input_shape[0], self.max_length, 1024)

    def get_config(self):
        config = {
            'idx2word': self.idx2word,
            'output_mode': self.output_mode 
        }
        return list(config.items())


In [10]:
sentence_input = Input(shape=(X_train.shape[1],), dtype=tf.int64)
sentence_embedding = ELMoEmbedding(idx2word=ind2word, output_mode="elmo", trainable=True)(sentence_input) # These two are interchangeable

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [13]:
dropout = Dropout(0.5)(sentence_embedding)
lstm_ = LSTM(hidden_size,batch_size=batch_size, return_sequences=True)(sentence_embedding)
timed_ = layers.TimeDistributed(layers.Dense(out_size))(lstm_)
pred = layers.Activation('softmax')(timed_)

In [16]:
model = Model(inputs=[sentence_input], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 63)                0         
_________________________________________________________________
el_mo_embedding_1 (ELMoEmbed (None, 63, 1024)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 63, 32)            135296    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 63, 6)             198       
_________________________________________________________________
activation_1 (Activation)    (None, 63, 6)             0         
Total params: 135,494
Trainable params: 135,494
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10, validation_data=(X_test, y_test))
score = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Raw test score:', score)

  


Train on 1440 samples, validate on 352 samples
Epoch 1/10
 224/1440 [===>..........................] - ETA: 3:59 - loss: 1.1687

KeyboardInterrupt: 

In [52]:
def score(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr

In [54]:
y_prob = model.predict(X_train) 
y_classes = y_prob.argmax(axis=-1)

In [56]:
y_classes

array([[0, 0, 0, ..., 3, 3, 3],
       [0, 0, 0, ..., 3, 3, 3],
       [0, 0, 0, ..., 0, 0, 3],
       ...,
       [0, 0, 0, ..., 3, 3, 3],
       [0, 0, 0, ..., 1, 1, 3],
       [0, 0, 0, ..., 3, 1, 3]])

In [59]:
y_classes[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3])

In [62]:
yh = y_train.argmax(2)

array([[0, 0, 0, ..., 3, 3, 3],
       [0, 0, 0, ..., 3, 3, 3],
       [0, 0, 0, ..., 0, 0, 3],
       ...,
       [0, 0, 0, ..., 3, 3, 3],
       [0, 0, 0, ..., 1, 1, 3],
       [0, 0, 0, ..., 3, 1, 3]])

In [64]:
pr=y_classes
fyh, fpr = score(yh, pr)
print('Training accuracy:', accuracy_score(fyh, fpr))
print('Training confusion matrix:')
print(confusion_matrix(fyh, fpr))
precision_recall_fscore_support(fyh, fpr)

Training accuracy: 0.9404167790666307
Training confusion matrix:
[[    0     0     0     0     0     0]
 [    0   903    10    90    39    74]
 [    0    85   366   286    35   248]
 [    0    40    41 24544    16   111]
 [    0    35     1    62  1083    45]
 [    1   195    26   271    56   993]]


  'recall', 'true', average, warn_for)


(array([0.        , 0.71780604, 0.82432432, 0.97192413, 0.88120423,
        0.67505099]),
 array([0.        , 0.80913978, 0.35882353, 0.99159664, 0.88336052,
        0.64396887]),
 array([0.        , 0.76074136, 0.5       , 0.98166183, 0.88228106,
        0.65914371]),
 array([    0,  1116,  1020, 24752,  1226,  1542]))

In [66]:
y_prob = model.predict(X_test) 
pr = y_prob.argmax(axis=-1)
yh = y_test.argmax(2)
fyh, fpr = score(yh, pr)
print('Testing accuracy:', accuracy_score(fyh, fpr))
print('Testing confusion matrix:')
print(confusion_matrix(fyh, fpr))
precision_recall_fscore_support(fyh, fpr)

Testing accuracy: 0.9298578199052133
Testing confusion matrix:
[[   0    0    0    0    0    0]
 [   0  191    6   26   15   30]
 [   0   28   81   82    5   77]
 [   1    8    4 6142   13   36]
 [   0   23    1   15  231   13]
 [   0   31    6   75   23  222]]


  'recall', 'true', average, warn_for)


(array([0.        , 0.6797153 , 0.82653061, 0.96876972, 0.80487805,
        0.58730159]),
 array([0.        , 0.71268657, 0.2967033 , 0.99000645, 0.81625442,
        0.62184874]),
 array([0.        , 0.69581056, 0.43665768, 0.97927296, 0.81052632,
        0.60408163]),
 array([   0,  268,  273, 6204,  283,  357]))

In [159]:
from keras import backend as K
string = "John was a member of US Army"
wordlist1 = string.split(' ')
ip = []
for x in wordlist:
    ip.append(word2ind[x])
i=maxlen-len(ip)
temp=[0]*i
ip1=temp+ip

string = "US Army is war"
wordlist2 = string.split(' ')
ip = []
for x in wordlist:
    ip.append(word2ind[x])
i=maxlen-len(ip)
temp=[0]*i
ip2=temp+ip


input_layer = model.layers[1].input
output_layer = model.layers[4].output
op = K.function([input_layer], [output_layer])


In [142]:
xxxxx=[ip1,ip2]

In [146]:
ttt=np.array(xxxxx,dtype=np.int32)

In [147]:
ttt.shape

(2, 63)

In [135]:
y_prob2 = model.predict(X_test[10:14]) 
#pr2 = y_prob.argmax(axis=-1)

In [148]:
out = op([ttt])

In [164]:
out[0][0]
out[0][1].shape

(63, 6)

In [167]:
i=maxlen-len(ip1)

temp = []
while i<maxlen:
    for j in label2ind:        
        #if label2ind[j]==out[0][0][i].tolist().index(max(out[0][0][i])):
        if label2ind[j]==out[0][0][i].tolist().index(max(out[0][0][i])):
            temp.append(j)
    i=i+1
print(wordlist1)
print(temp)

['John', 'was', 'a', 'member', 'of', 'US', 'Army']
['I-PER', 'O', 'O', 'O', 'O', 'O', 'I-ORG']


In [169]:
i=maxlen-len(ip2)

temp = []
while i<maxlen:
    for j in label2ind:        
        #if label2ind[j]==out[0][0][i].tolist().index(max(out[0][0][i])):
        if label2ind[j]==out[0][1][i].tolist().index(max(out[0][1][i])):
            temp.append(j)
    i=i+1
print(wordlist2)
print(temp)

['US', 'Army', 'is', 'war']
['O', 'I-ORG', 'O', 'O']


In [166]:
len(ip1)

63