In [39]:
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [40]:
# Load the POS-tagged sentences of the treebank corpus that ships with NLTK.
# Each sentence is a sequence of (word, tag) pairs.
# NOTE(review): presumably the corpus must already be available locally
# (e.g. via nltk.download('treebank')) — confirm for fresh environments.
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [41]:
# Peek at the corpus; the repr shown below is abbreviated (it ends in "...").
tagged_sentences

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

### Restructuring data

In [42]:
# Split every tagged sentence into two parallel NumPy arrays: one holding its
# words and one holding the POS tag of each word, position by position.
sentences = []
sentence_tags = []
for tagged_sentence in tagged_sentences:
    sentence = [word for word, _ in tagged_sentence]
    tags = [tag for _, tag in tagged_sentence]
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

In [43]:
# Words of the sentence at index 2.
sentences[2]

array(['Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former',
       'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',',
       'was', 'named', '*-1', 'a', 'nonexecutive', 'director', 'of',
       'this', 'British', 'industrial', 'conglomerate', '.'], dtype='<U12')

In [44]:
# POS tags of the sentence at index 2, aligned one-to-one with its words.
sentence_tags[2]

array(['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', 'CC', 'JJ', 'NN', 'IN',
       'NNP', 'NNP', 'NNP', 'NNP', ',', 'VBD', 'VBN', '-NONE-', 'DT',
       'JJ', 'NN', 'IN', 'DT', 'JJ', 'JJ', 'NN', '.'], dtype='<U6')

In [45]:
# Hold out 20% of the sentences for testing. The split is seeded so that a
# "Restart Kernel -> Run All" reproduces the same train/test partition, and
# with it the same vocabulary and downstream results, on every run.
RANDOM_SEED = 42
(train_sentences, test_sentences, train_tags, test_tags) = train_test_split(
    sentences, sentence_tags, test_size=0.2, random_state=RANDOM_SEED
)

In [46]:
# Number of sentences that ended up in the training split.
len(train_sentences)

3131

### Constructing word-to-index and tag-to-index mappings

In [47]:
# Vocabulary of lower-cased word forms that occur in the training split.
# `tags` starts out empty here and is populated by a later cell.
words = {token.lower() for sent in train_sentences for token in sent}
tags = set()

In [48]:
# Collect the tag inventory from the WHOLE corpus rather than only the training
# split. The POS tags form a fixed label set, and tag2index has no
# out-of-vocabulary slot: a rare tag that happens to fall exclusively into the
# test split would otherwise raise a KeyError when the test tags are converted
# to integers.
for ts in sentence_tags:
    tags.update(ts)

In [49]:
# Inspect the collected set of distinct POS tags.
tags

{'#',
 '$',
 "''",
 ',',
 '-LRB-',
 '-NONE-',
 '-RRB-',
 '.',
 ':',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 '``'}

In [58]:
# Map every known word / tag to an integer id. Enumerating in sorted order keeps
# the ids stable across kernel restarts: the iteration order of a set of
# strings can differ from one Python process to the next (hash randomization),
# which would silently renumber the vocabulary and invalidate any saved model.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for out-of-vocabulary words

tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding

In [63]:
# Changing sentences and tags to integers
def encode_words(sentence, word_index):
    """Convert one sentence to an array of integer word ids.

    Each word is lower-cased before lookup; words that are not present in
    ``word_index`` are mapped to the id stored under the '-OOV-' key.
    """
    oov_id = word_index['-OOV-']
    return np.array([word_index.get(w.lower(), oov_id) for w in sentence])


def encode_tags(tag_sequence, tag_index):
    """Convert one tag sequence to an array of integer tag ids.

    Raises KeyError if a tag is missing from ``tag_index`` (there is no
    out-of-vocabulary id for tags).
    """
    return np.array([tag_index[t] for t in tag_sequence])


# The same encoding is applied to both splits, so the copy-pasted loops are
# replaced by one helper per kind of sequence.
train_sentences_X = [encode_words(s, word2index) for s in train_sentences]
test_sentences_X = [encode_words(s, word2index) for s in test_sentences]
train_tags_y = [encode_tags(ts, tag2index) for ts in train_tags]
test_tags_y = [encode_tags(ts, tag2index) for ts in test_tags]

In [65]:
# Length of the longest sentence in the whole corpus (train and test alike);
# this is the width every sequence gets padded to. Falls back to 0 when there
# are no sentences at all.
MAX_LENGTH = max((len(sent) for sent in sentences), default=0)
MAX_LENGTH

271

In [66]:
# Pad every word-id and tag-id sequence at the end ('post') up to MAX_LENGTH so
# that each collection becomes a rectangular 2-D array. pad_sequences fills
# with 0 by default, which is the id reserved for '-PAD-' in both mappings.
pad_options = dict(maxlen=MAX_LENGTH, padding='post')
train_sentences_X = pad_sequences(train_sentences_X, **pad_options)
test_sentences_X = pad_sequences(test_sentences_X, **pad_options)
train_tags_y = pad_sequences(train_tags_y, **pad_options)
test_tags_y = pad_sequences(test_tags_y, **pad_options)