In [None]:
import nltk
nltk.download('treebank')
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [None]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [None]:
tokens, pos_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    tokens.append(np.array(sentence))
    pos_tags.append(np.array(tags))

In [None]:
print(tokens[5])
print(pos_tags[5])

['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
 'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
 'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
 '.']
['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
 'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
 '.']


In [None]:
train_tokens, test_tokens, train_tags, test_tags = train_test_split(tokens, pos_tags, test_size=0.2)

In [None]:
words, tags = set([]), set([])
 
for s in train_tokens:
    for w in s:
        words.add(w.lower())
 
for ts in train_tags:
    for t in ts:
        tags.add(t)
 
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [None]:
X_train, X_test, y_train, y_test = [], [], [], []
 
for s in train_tokens:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    X_train.append(s_int)
 
for s in test_tokens:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    X_test.append(s_int)
 
for s in train_tags:
    y_train.append([tag2index[t] for t in s])
 
for s in test_tags:
    y_test.append([tag2index[t] for t in s])
 
print(X_train[0])
print(X_test[0])
print(y_train[0])
print(y_test[0])

[5852, 6996, 8349, 9209, 5852, 3028, 9419, 184, 7567, 1040, 184, 9122, 8974, 7025, 3043, 8159, 4536, 6047, 10063, 184, 5750, 7963, 9509, 3028, 1402, 6414, 2718, 7581, 6559, 10156, 6297, 184, 8181, 3250, 8181, 8712, 1599, 4885, 9816, 2854, 2937, 4885, 184, 2026, 3028, 6162, 184, 3028, 1346, 265, 7006, 9613, 4901, 110, 3092]
[9040, 6286, 2150, 3482, 7782, 7224, 184, 5400, 6093, 3260, 3028, 5743, 7062, 7955, 5154, 3028, 8082, 265, 10079, 7567, 380, 1, 7711, 2505, 1312, 1840, 632, 3092]
[37, 17, 31, 26, 37, 17, 6, 27, 37, 26, 27, 28, 25, 33, 8, 6, 6, 34, 6, 27, 3, 17, 37, 17, 31, 26, 6, 46, 37, 6, 14, 27, 3, 3, 37, 6, 8, 34, 6, 37, 6, 34, 27, 37, 17, 26, 27, 17, 6, 23, 6, 8, 6, 8, 15]
[37, 31, 4, 11, 3, 31, 27, 4, 11, 44, 17, 6, 26, 25, 33, 17, 26, 23, 26, 37, 17, 26, 37, 4, 23, 26, 4, 15]


In [None]:
MAX_LENGTH = len(max(X_train, key=len))
print(MAX_LENGTH)

271


In [None]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post')
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post')
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train, maxlen=MAX_LENGTH, padding='post')
y_test = tf.keras.preprocessing.sequence.pad_sequences(y_test, maxlen=MAX_LENGTH, padding='post')
 
print(X_train[0])
print(X_test[0])
print(y_train[0])
print(y_test[0])

[ 5852  6996  8349  9209  5852  3028  9419   184  7567  1040   184  9122
  8974  7025  3043  8159  4536  6047 10063   184  5750  7963  9509  3028
  1402  6414  2718  7581  6559 10156  6297   184  8181  3250  8181  8712
  1599  4885  9816  2854  2937  4885   184  2026  3028  6162   184  3028
  1346   265  7006  9613  4901   110  3092     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(MAX_LENGTH, )))
model.add(tf.keras.layers.Embedding(len(word2index), 100))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.Dense(len(tag2index), activation=tf.nn.softmax))
#model.add(tf.keras.layers.Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(0.001),
              metrics=['accuracy'])
 
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 271, 100)          1019600   
                                                                 
 bidirectional_2 (Bidirectio  (None, 271, 128)         84480     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 271, 47)           6063      
                                                                 
Total params: 1,110,143
Trainable params: 1,110,143
Non-trainable params: 0
_________________________________________________________________


In [None]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [None]:
cat_ytrain = to_categorical(y_train, len(tag2index))
print(cat_ytrain[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [None]:
model.fit(X_train, to_categorical(y_train, len(tag2index)), batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa9b042ce90>

In [None]:
scores = model.evaluate(X_test, to_categorical(y_test, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

accuracy: 98.92126321792603


In [None]:
test_samples = [
    "running is very important for me .".split(),
    "I was running every day for a month .".split()
]
print(test_samples)

[['running', 'is', 'very', 'important', 'for', 'me', '.'], ['I', 'was', 'running', 'every', 'day', 'for', 'a', 'month', '.']]


In [None]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
 
test_samples_X = tf.keras.preprocessing.sequence.pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)

[[7094 4573 7288 6026 7567 4129 3092    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [None]:
predictions = model.predict(test_samples_X)
print(predictions, predictions.shape)

[[[7.54597485e-02 2.39929679e-04 4.17178730e-03 ... 6.52788300e-03
   9.89545658e-02 1.31242198e-03]
  [7.01300474e-03 9.38352459e-05 1.83206541e-03 ... 1.88941807e-02
   1.73729844e-02 9.71579226e-04]
  [3.20604607e-03 1.12227957e-04 3.10662412e-03 ... 1.55645364e-03
   3.73521238e-03 1.61245500e-03]
  ...
  [9.99011159e-01 1.04446090e-05 1.04097944e-05 ... 6.07076936e-06
   9.75065632e-06 6.87468219e-06]
  [9.98229325e-01 2.45515894e-05 1.78659029e-05 ... 9.91485831e-06
   1.19246724e-05 1.42134895e-05]
  [9.97088850e-01 4.81694588e-05 2.76321261e-05 ... 1.48188183e-05
   1.37563929e-05 2.53818998e-05]]

 [[2.54528108e-03 1.43287893e-04 4.52591712e-03 ... 8.65982939e-03
   6.31608674e-03 2.45225197e-03]
  [9.86973057e-04 4.58473041e-05 7.21509452e-04 ... 3.06960591e-03
   1.58742536e-04 5.77664352e-04]
  [1.76105555e-02 2.05269142e-04 6.27550576e-03 ... 3.17853736e-03
   3.08294535e-01 1.59028894e-03]
  ...
  [9.99011159e-01 1.04452465e-05 1.04090204e-05 ... 6.06966978e-06
   9.75204

In [None]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [None]:
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['NNP', 'VBZ', 'RB', 'JJ', 'IN', 'PRP', '.', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', 