In [16]:
# Keras==1.0.6
import numpy as np
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import  Activation
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

In [17]:
from keras.layers import Dense
from keras.layers import TimeDistributed

In [18]:
raw = open('wikigold.conll.txt', 'r').readlines()
 
all_x = []
point = []
for line in raw:
    stripped_line = line.strip().split(' ')
    point.append(stripped_line)
    if line == '\n':
        all_x.append(point[:-1])
        point = []
all_x = all_x[:-1]
 
lengths = [len(x) for x in all_x]
print('Input sequence length range: ', max(lengths), min(lengths))
 
short_x = [x for x in all_x if len(x) < 64]
 
X = [[c[0] for c in x] for x in short_x]
y = [[c[1] for c in y] for y in short_x]
 
all_text = [c for x in X for c in x]


Input sequence length range:  144 1


In [19]:
words = list(set(all_text))
word2ind = {word: index for index, word in enumerate(words)}
ind2word = {index: word for index, word in enumerate(words)}
labels = list(set([c for x in y for c in x]))
label2ind = {label: (index + 1) for index, label in enumerate(labels)}
ind2label = {(index + 1): label for index, label in enumerate(labels)}
print('Vocabulary size:', len(word2ind), len(label2ind))

Vocabulary size: 8285 5


In [20]:
maxlen = max([len(x) for x in X])
print('Maximum sequence length:', maxlen)
print(label2ind)

Maximum sequence length: 63
{'I-LOC': 1, 'O': 2, 'I-PER': 3, 'I-ORG': 4, 'I-MISC': 5}


In [21]:
def encode(x, n):
    result = np.zeros(n)
    result[x] = 1
    return result
 
X_enc = [[word2ind[c] for c in x] for x in X]
max_label = max(label2ind.values()) + 1
y_enc = [[0] * (maxlen - len(ey)) + [label2ind[c] for c in ey] for ey in y]
y_enc = [[encode(c, max_label) for c in ey] for ey in y_enc]
 
X_enc = pad_sequences(X_enc, maxlen=maxlen)
y_enc = pad_sequences(y_enc, maxlen=maxlen)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size=11*32, train_size=45*32, random_state=42)
print('Training and testing tensor shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
 
max_features = len(word2ind)
embedding_size = 300
hidden_size = 32
out_size = len(label2ind) + 1

Training and testing tensor shapes: (1440, 63) (352, 63) (1440, 63, 6) (352, 63, 6)


In [23]:
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen, mask_zero=True))
model.add(LSTM(hidden_size, return_sequences=True))  


In [24]:
#model.add(TimeDistributedDense(out_size))
model.add(TimeDistributed(Dense(out_size)))

model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 63, 300)           2485500   
_________________________________________________________________
lstm_2 (LSTM)                (None, 63, 32)            42624     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 63, 6)             198       
_________________________________________________________________
activation_2 (Activation)    (None, 63, 6)             0         
Total params: 2,528,322
Trainable params: 2,528,322
Non-trainable params: 0
_________________________________________________________________


In [35]:
batch_size = 32
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10, validation_data=(X_test, y_test))
score = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Raw test score:', score)

Train on 1440 samples, validate on 352 samples
Epoch 1/10
  64/1440 [>.............................] - ETA: 3s - loss: 0.3400

  


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Raw test score: 0.33517230505293066


In [26]:
def score(yh, pr):
    coords = [np.where(yhh > 0)[0][0] for yhh in yh]
    yh = [yhh[co:] for yhh, co in zip(yh, coords)]
    ypr = [prr[co:] for prr, co in zip(pr, coords)]
    fyh = [c for row in yh for c in row]
    fpr = [c for row in ypr for c in row]
    return fyh, fpr
 
pr = model.predict_classes(X_train)
yh = y_train.argmax(2)
fyh, fpr = score(yh, pr)
print('Training accuracy:', accuracy_score(fyh, fpr))
print('Training confusion matrix:')
print(confusion_matrix(fyh, fpr))
precision_recall_fscore_support(fyh, fpr)
 
pr = model.predict_classes(X_test)
yh = y_test.argmax(2)
fyh, fpr = score(yh, pr)
print('Testing accuracy:', accuracy_score(fyh, fpr))
print('Testing confusion matrix:')
print(confusion_matrix(fyh, fpr))
precision_recall_fscore_support(fyh, fpr)

Training accuracy: 0.8576004855678446
Training confusion matrix:
[[    0     0     0     0     0     0]
 [    4     8   839     0   265     0]
 [    1     0 24747     1     3     0]
 [   33     0   931    85   177     0]
 [   20     0   952     1   569     0]
 [   35     0   844     2   115    24]]
Testing accuracy: 0.8519972918077183
Testing confusion matrix:
[[   0    0    0    0    0    0]
 [   5    2  227    0   34    0]
 [   7    0 6194    0    2    1]
 [  19    0  237   11   16    0]
 [   7    0  266    0   84    0]
 [   6    0  256    0   10    1]]


  'recall', 'true', average, warn_for)


(array([0.        , 1.        , 0.86267409, 1.        , 0.57534247,
        0.5       ]),
 array([0.        , 0.00746269, 0.99838814, 0.03886926, 0.23529412,
        0.003663  ]),
 array([0.        , 0.01481481, 0.92558279, 0.07482993, 0.33399602,
        0.00727273]),
 array([   0,  268, 6204,  283,  357,  273]))

In [27]:
from keras import backend as K
string = "John was a member of US Army"
wordlist = string.split(' ')
ip = []
for x in wordlist:
    ip.append(word2ind[x])
i=maxlen-len(ip)
temp=[0]*i
ip=temp+ip
input_layer = model.layers[0].input
output_layer = model.layers[3].output
op = K.function([input_layer], [output_layer])
out = op([[ip]])
temp = []
while i<maxlen:
    for j in label2ind:
        #print(out[0][i].tolist())
        if label2ind[j]==out[0][0][i].tolist().index(max(out[0][0][i])):
            temp.append(j)
    i=i+1
print(wordlist)
print(temp)

['John', 'was', 'a', 'member', 'of', 'US', 'Army']
['O', 'O', 'O', 'O', 'O', 'I-ORG']


In [34]:
out[0][0]

array([[1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 1.65256396e-01],
       [1.61958724e-01, 1.62683949e-01, 1.77147254e-01, 1.65883929e-01,
        1.67069778e-01, 

In [10]:
model.layers[3].name

'activation_1'

In [13]:
ip

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4768,
 6816,
 2857,
 3069,
 3651,
 3574,
 1428]