# Building POS Tagger using Keras Library 

We are going to work with Twitter *Hindi-English* code mixed tweets. For the purpose of data, we have 1981 tweets which are in **conll** format. They are tagged manually with the *language* and *POS*

#### Load Dependencies

In [1]:
import keras
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.layers import Embedding # new!
from keras.layers import Conv1D, SpatialDropout1D, GlobalMaxPooling1D

from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint # new! 
from sklearn.model_selection import train_test_split
import os # new! 
from sklearn.metrics import roc_auc_score, roc_curve # new!
import pandas as pd
import matplotlib.pyplot as plt # new!
%matplotlib inline

Using TensorFlow backend.


In [2]:
# function to define arr with (word, lang, tag)
def make_arr(f1):
    twitter_file = open(f1, "r")
    sentences = []
    sent = []
    for line in twitter_file:
        temp = line.split('\t')
        
        if temp[0] == '\n':
            sentences.append(sent)
            sent = []
            continue

        check = list(temp[2])
        if '\n' in check:
            check.remove('\n')

        temp[2] = ''.join(check)
        sent.append((temp[0], temp[1], temp[2]))


    return sentences

#### Load data 

In [3]:
tagged_sentences = make_arr("Twitter_file.txt")
no_of_sentences = 1981 # number of sentences to take for corpus

In [4]:
len(tagged_sentences)

1981

#### Separate the sentences words and tags into two different arrays

In [5]:
sentences, sentence_tags = [], []
for sent in tagged_sentences:
    sentence, lang, tags = zip(*sent)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

In [6]:
len(sentences), len(sentence_tags)

(1981, 1981)

In [7]:
sentences[0], sentence_tags[0]

(array(['I', 'will', 'read', 'kal', 'pakka', '.', 'Have', 'to', 'study',
        'right', 'now', 'okay', '?', '\xf0\x9f\x98\x80',
        'https://t.co/C2SrZhfJfK'], dtype='|S23'),
 array(['PR_PRP', 'V_VAUX', 'V_VM', 'RB_ALC', 'JJ', 'RD_PUNC', 'V_VM',
        'RP_RPD', 'N_NN', 'RB_AMN', 'RB_AMN', 'RP_INJ', 'RD_PUNC', 'E',
        'U'], dtype='|S7'))

In [8]:
len(sentences[0]), len(sentence_tags[0])

(15, 15)

### Train the required number of sentences *

In [9]:
sentences = sentences[:no_of_sentences]
sentence_tags = sentence_tags[:no_of_sentences]

In [10]:
(train_sentences, test_sentences, train_tags, test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

In [11]:
print sentences[0], sentence_tags[0]

['I' 'will' 'read' 'kal' 'pakka' '.' 'Have' 'to' 'study' 'right' 'now'
 'okay' '?' '\xf0\x9f\x98\x80' 'https://t.co/C2SrZhfJfK'] ['PR_PRP' 'V_VAUX' 'V_VM' 'RB_ALC' 'JJ' 'RD_PUNC' 'V_VM' 'RP_RPD' 'N_NN'
 'RB_AMN' 'RB_AMN' 'RP_INJ' 'RD_PUNC' 'E' 'U']


#### For the embedding layer to work, find the count of unique words

In [12]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())

for ts in train_tags:
    for t in ts:
        tags.add(t)
        
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [13]:
print train_tags[0]

['@' 'PR_PRP' 'V_VM' 'RB_ALC' 'N_NN' 'DM_DMD' 'JJ' 'N_NNP' 'PSP' 'N_NST'
 'PSP' 'N_NN' 'V_VAUX' 'V_VM' 'DM_DMD' 'JJ' 'N_NN' 'V_VM' 'PR_PRP'
 'RP_INTF' 'JJ' 'RP_NEG' 'V_VAUX' 'RB_ALC' 'E']


#### Convert the words to the numpy array so that, numerical data can be used for training

In [14]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)

for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])

for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

print(train_sentences_X[0])
print(train_tags_y[0])
print(test_sentences_X[0])
print(test_tags_y[0])
print len(train_sentences_X[0]), len((train_tags_y[0]))

[4353, 267, 975, 3527, 5785, 3400, 2567, 1464, 5790, 3938, 5108, 1595, 7059, 2561, 3400, 4586, 2567, 400, 1628, 8501, 7035, 2394, 7333, 5545, 2904]
[18, 39, 3, 35, 17, 33, 5, 10, 36, 15, 36, 17, 29, 3, 33, 5, 17, 3, 39, 13, 5, 7, 29, 35, 20]
[7704, 8219, 1, 6622, 4811, 1, 3754, 6551, 1, 7638, 3953, 7059, 400, 2490, 1, 1044, 3754, 14]
[18, 18, 18, 4, 3, 10, 32, 17, 3, 17, 3, 29, 29, 39, 17, 10, 32, 18]
25 25


#### Pad the sequences because Keras can only work with fixed size sequences

In [15]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)  

207


In [16]:
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[4353  267  975 3527 5785 3400 2567 1464 5790 3938 5108 1595 7059 2561
 3400 4586 2567  400 1628 8501 7035 2394 7333 5545 2904    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [17]:
print(len(tags))

39


#### Define our neural architecture

In [18]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 207, 128)          1115776   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 207, 512)          788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 207, 40)           20520     
_________________________________________________________________
activation_1 (Activation)    (None, 207, 40)           0         
Total params: 1,924,776
Trainable params: 1,924,776
Non-trainable params: 0
_________________________________________________________________


#### Since we have 39 tags for each word we need to convert it to ONE HOT ENCODING

In [20]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [21]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print cat_train_tags_y[0]

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [22]:
model.compile(loss='categorical_crossentropy',optimizer=Adam(0.001),metrics=['accuracy'])

In [23]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Train on 1267 samples, validate on 317 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f1dd15fa190>

In [24]:
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print ({model.metrics_names[1]},  {scores[1] * 100})

(set(['acc']), set([95.0194091430539]))


#### Convert back the categorical to tokens

In [25]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [28]:
predictions = model.predict(test_sentences_X)

print len(predictions[0])
print len(test_tags_y[0])

# print first few predictions
token_sequences = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})
# print(logits_to_tokens(predictions[:2], {i: t for t, i in tag2index.items()}))
test_tags_sequences = logits_to_tokens(test_tags_y, {i: t for t, i in tag2index.items()})

print len(predictions), len(test_tags_sequences)
# token_sequences[0] = filter(lambda a: a != '-PAD-', token_sequences[0])
print len(token_sequences[0][:len(test_tags[0])])
print len(test_tags[0])

207
207
397 397
18
18
