In [1]:
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Using nltk treebank corpus
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [3]:
tagged_sentences

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

### Restructuring data

In [4]:
sentences=[]
sentence_tags =[]
for tagged_sentence in tagged_sentences:
    sentence=[]
    tags=[]
    for (s,w) in tagged_sentence:
        sentence.append(s)
        tags.append(w)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

In [5]:
sentences[2]

array(['Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former',
       'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',',
       'was', 'named', '*-1', 'a', 'nonexecutive', 'director', 'of',
       'this', 'British', 'industrial', 'conglomerate', '.'], dtype='<U12')

In [6]:
sentence_tags[2]

array(['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', 'CC', 'JJ', 'NN', 'IN',
       'NNP', 'NNP', 'NNP', 'NNP', ',', 'VBD', 'VBN', '-NONE-', 'DT',
       'JJ', 'NN', 'IN', 'DT', 'JJ', 'JJ', 'NN', '.'], dtype='<U6')

In [7]:
(train_sentences, test_sentences, train_tags, test_tags)=train_test_split(sentences, sentence_tags, test_size=0.2)

In [8]:
len(train_sentences)

3131

### Constructing word to index and tag to index mapping

In [9]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())

In [10]:
for ts in train_tags:
    for t in ts:
        tags.add(t)

In [11]:
tags

{'#',
 '$',
 "''",
 ',',
 '-LRB-',
 '-NONE-',
 '-RRB-',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``'}

In [12]:
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for out-of-vocabulary words 
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [13]:
# Changing sentences and tags to integers
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    train_sentences_X.append(s_int)
    
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_sentences_X.append(s_int)
    
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
    
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

In [14]:
# function to convert sequence of tags to sequence of one hot encoded tags
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [15]:
#padding all sequences
MAX_LENGTH=0
for s in sentences:
    MAX_LENGTH=max(MAX_LENGTH,len(s))
MAX_LENGTH

271

In [16]:
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

### Using RNN

In [17]:
model1 = Sequential()
model1.add(InputLayer(input_shape=(MAX_LENGTH, )))
model1.add(Embedding(len(word2index), 128))
model1.add(SimpleRNN(256, return_sequences=True))
model1.add(TimeDistributed(Dense(len(tag2index))))
model1.add(Activation('softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [18]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [19]:
model1.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy',ignore_class_accuracy(0)])

In [20]:
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 271, 128)          1310720   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 271, 256)          98560     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 271, 47)           12079     
_________________________________________________________________
activation_1 (Activation)    (None, 271, 47)           0         
Total params: 1,421,359
Trainable params: 1,421,359
Non-trainable params: 0
_________________________________________________________________


In [21]:
history=model1.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 2504 samples, validate on 627 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40




In [22]:
print("Average training accuracy=",np.mean(history.history['acc']))

Average training accuracy= 0.9511339418590069


In [26]:
scalars = model1.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))



In [27]:
print("Accuracy= ",scalars[1]*100)

Accuracy=  98.59514629247089


In [28]:
#Testing on other samples
test_samples = [
    "running is very important for me .".split(),
    "I was running every day for a month .".split()
]

In [29]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')

In [30]:
predictions = model1.predict(test_samples_X)

In [31]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [32]:
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['VBG', 'VBZ', 'RB', 'JJ', 'IN', 'PRP', '.', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 

### Using LSTM

In [33]:
model2= Sequential()
model2.add(InputLayer(input_shape=(MAX_LENGTH, )))
model2.add(Embedding(len(word2index), 128))
model2.add(LSTM(256, return_sequences=True))
model2.add(TimeDistributed(Dense(len(tag2index))))
model2.add(Activation('softmax'))

In [34]:
model2.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy',ignore_class_accuracy(0)])
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 271, 128)          1310720   
_________________________________________________________________
lstm_1 (LSTM)                (None, 271, 256)          394240    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 271, 47)           12079     
_________________________________________________________________
activation_2 (Activation)    (None, 271, 47)           0         
Total params: 1,717,039
Trainable params: 1,717,039
Non-trainable params: 0
_________________________________________________________________


In [None]:
history=model2.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Train on 2504 samples, validate on 627 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40

In [None]:
print("Average training accuracy=",np.mean(history.history['acc']))

In [None]:
scalars = model2.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))

In [None]:
print("Accuracy= ",scalars[1]*100)

In [None]:
predictions = model2.predict(test_samples_X)

In [None]:
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))