In [1]:
##---------------------------------------------------------------------------------
## Summary : perform pos tagging using bidirectional LSTM
## Author  : Srinivas Venkata Vemparala
## Source  : https://github.com/neubig/nn4nlp-code
##---------------------------------------------------------------------------------

%matplotlib inline
import numpy as np
import pandas as pd
import dynet as dy
import time 
import matplotlib.pyplot as plt
import random
from collections import defaultdict

In [2]:
# lets convert the words into integer
# The default dictionary takes a function as input and outupts 
# it if key is not present in the map.
w2i = defaultdict(lambda : len(w2i))
t2i = defaultdict(lambda : len(t2i))

# lets write a method to read the data from file
def readData(fileName):
    retList = []
    
    with open(fileName,'r+',encoding='utf8') as f:
        for line in f:
            words = []
            tags = []
            for wt in line.strip().split():
                word,tag = wt.split('|')
                words.append(w2i[word])
                tags.append(t2i[tag])
            retList.append([words,tags])
    return retList
        

In [3]:
# read the train data 
train = readData('../data/tags/train.txt')
unk_word = w2i["<unk>"]
w2i = defaultdict(lambda: unk_word, w2i)
unk_tag = t2i["<unk>"]
t2i = defaultdict(lambda: unk_tag, t2i)

# read the test data
test = readData('../data/tags/dev.txt')

nwords = len(w2i)
ntags = len(t2i)
print(nwords,' : ',ntags)

29002  :  10


In [4]:
# lets declare the embedding size and hidden layer size
EMB_SIZE = 64
HID_SIZE = 64

# lets declare the model and trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

# lets declare the parameters to the model
W_emb = model.add_lookup_parameters((nwords,EMB_SIZE))

# BiRNN
lstm = dy.BiRNNBuilder(1,EMB_SIZE,HID_SIZE,model,dy.LSTMBuilder)

# Word-level softmax
W_sm = model.add_parameters((ntags, HID_SIZE))
b_sm = model.add_parameters((ntags))

In [5]:
# lets write a method to compute the scores
def computeScores(sent):
    # renew the cg
    dy.renew_cg()
    wSoftmax = dy.parameter(W_sm)
    bSoftmax = dy.parameter(b_sm)
        
    # get the embeddings for the words
    wordEmbs = [W_emb[x] for x in sent]
    
    # send the embeddings to the lstm
    wordReps = lstm.transduce(wordEmbs)
    
    scores = [dy.affine_transform([bSoftmax,wSoftmax,wordRep]) for wordRep in wordReps]
    
    return scores

In [6]:
# now lets define method to compute the loss 

# Calculate MLE loss for one example
def computeLoss(scores, tags):
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    return dy.esum(losses)

# compute the number of correct classifications
def computeCorr(scores,tags):
    correct = [(np.argmax(score.npvalue())==tag) for score,tag in zip(scores,tags)]
    return sum(correct)

In [7]:
# lets perform training
print('Starting training ....')
for i in range(20):
    # randomly shuffle the training examples
    random.shuffle(train)
    trainLoss = 0
    totalWords = 0
    numCorr = 0
    startTime = time.time()
    
    for sent,tags in train:
        scores = computeScores(sent)
        loss = computeLoss(scores,tags)
        loss.backward()
        trainer.update()
        trainLoss += loss.value()
        totalWords += len(sent)
        numCorr += computeCorr(scores,tags)
    print("Train_loss at iter : ",i," is ",trainLoss/totalWords,' Accuracy : ',(numCorr/totalWords),". Time taken : ",(-startTime+time.time()))
    
    testLoss = 0
    totalWords = 0
    numCorr = 0
    startTime = time.time()
    
    for sent,tags in test:
        scores = computeScores(sent)
        loss = computeLoss(scores,tags)
        testLoss += loss.value()
        totalWords += len(sent)
        numCorr += computeCorr(scores,tags)
    print("Test_loss at iter : ",i," is ",testLoss/totalWords,' Accuracy : ',(numCorr/totalWords),". Time taken : ",(-startTime+time.time()))

Starting training ....
Train_loss at iter :  0  is  0.33780106687746864  Accuracy :  0.8997089155693886 . Time taken :  87.308180809021
Test_loss at iter :  0  is  0.4073060695707017  Accuracy :  0.8711000589637757 . Time taken :  4.63921594619751
Train_loss at iter :  1  is  0.17900118804154638  Accuracy :  0.9465404494472621 . Time taken :  78.09503149986267
Test_loss at iter :  1  is  0.3669272116031442  Accuracy :  0.8911221062886149 . Time taken :  4.57405948638916
Train_loss at iter :  2  is  0.12352797027011744  Accuracy :  0.9645699449277934 . Time taken :  77.88749408721924
Test_loss at iter :  2  is  0.3790509220356239  Accuracy :  0.8948650242264209 . Time taken :  4.564040422439575
Train_loss at iter :  3  is  0.0898828662547492  Accuracy :  0.9751005894258138 . Time taken :  76.16704511642456
Test_loss at iter :  3  is  0.3820539796398534  Accuracy :  0.8995564898607942 . Time taken :  4.545998573303223
Train_loss at iter :  4  is  0.06462492268438766  Accuracy :  0.982611