In [1]:
##---------------------------------------------------------------------------------
## Summary : perform pos tagging using bidirectional LSTM
## Author  : Srinivas Venkata Vemparala
## Source  : https://github.com/neubig/nn4nlp-code
##---------------------------------------------------------------------------------

%matplotlib inline
import numpy as np
import pandas as pd
import dynet as dy
import time
import matplotlib.pyplot as plt
import random
from collections import defaultdict

In [2]:
# declare the combination we want to use
use_teacher_forcing = True
use_structure_perceptron = True
use_cost_augmented = True
use_hinge = True
use_schedule = True


In [3]:
# lets convert the words into integer
# The default dictionary takes a function as input and outupts 
# it if key is not present in the map.
w2i = defaultdict(lambda : len(w2i))
t2i = defaultdict(lambda : len(t2i))

# lets write a method to read the data from file
def readData(fileName):
    retList = []
    
    with open(fileName,'r+',encoding='utf8') as f:
        for line in f:
            words = []
            tags = []
            for wt in line.strip().split():
                word,tag = wt.split('|')
                words.append(w2i[word])
                tags.append(t2i[tag])
            retList.append([words,tags])
    return retList
        

In [4]:
# read the train data 
train = readData('../data/tags/train.txt')
unk_word = w2i["<unk>"]
w2i = defaultdict(lambda: unk_word, w2i)
unk_tag = t2i["<unk>"]
start_tag = t2i["<start>"]
t2i = defaultdict(lambda: unk_tag, t2i)

# read the test data
test = readData('../data/tags/dev.txt')

nwords = len(w2i)
ntags = len(t2i)
print(nwords,' : ',ntags)

29002  :  11


In [5]:
# write a class for always true parser
class AlwaysTrueSampler:
    #An always true sampler, only sample from true distribution.

    def sample_true(self):
        return True

    def decay(self):
        pass
    
    
# write a class for schedule sampler. This class initially gives
# correct values and slowly starts giving wrong labels by using
# simulated annealing
class ScheduleSampler:
    def __init__(self, start_rate=1, min_rate=0.2, decay_rate=0.1):
        self.min_rate = min_rate
        self.iter = 0
        self.decay_rate = decay_rate
        self.start_rate = start_rate
        self.reach_min = False
        self.sample_rate = start_rate
    
    # method for modifying decay rate
    def decay_func(self):
        if not self.reach_min:
            self.sample_rate = self.start_rate - self.iter * self.decay_rate
            if self.sample_rate < self.min_rate:
                self.reach_min = True
                self.sample_rate = self.min_rate

    def decay(self):
        self.iter += 1
        self.decay_func()
        print("Sample rate is now ",self.sample_rate)
        
    def sample_true(self):
        return (random.random() < self.sample_rate)

In [6]:
# Use the appropriate sampler
if use_schedule:
    sampler = ScheduleSampler()
else:
    sampler = AlwaysTrueSampler()



In [7]:
# lets declare the embedding size and hidden layer size
EMB_SIZE = 64
HID_SIZE = 64
TAG_EMBED_SIZE = 16

# lets declare the model and trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

# lets declare the parameters to the model
W_emb = model.add_lookup_parameters((nwords,EMB_SIZE))

# use tag embeddings if teacher forcing is present
if(use_teacher_forcing):
    tag_emb = model.add_lookup_parameters((ntags,TAG_EMBED_SIZE))
    
# declare the forward lstm
if(use_teacher_forcing):
    fwdLstm = dy.LSTMBuilder(1,EMB_SIZE+TAG_EMBED_SIZE,HID_SIZE/2,model)
else:
    fwdLstm = dy.LSTMBuilder(1,EMB_SIZE,HID_SIZE/2,model)
    
# backward lstm is same in both cases
bwdLstm = dy.LSTMBuilder(1,EMB_SIZE,HID_SIZE/2,model)

# Word-level softmax
W_sm = model.add_parameters((ntags, HID_SIZE))
b_sm = model.add_parameters((ntags))

In [8]:
# lets write a method to compute scores when teacher forcing is not present
def computeScores(sent):
    # renew cg
    dy.renew_cg()
    wSoftmax = dy.parameter(W_sm)
    bSoftmax = dy.parameter(b_sm)
        
    # get the embeddings for the words
    wordEmbs = [W_emb[x] for x in sent]
    
    # get the init state of fwdLSTM
    finit = fwdLstm.initial_state()
    fwd_word_reps = finit.transduce(wordEmbs)
    
    # get the init state of bwdLSTM
    binit = bwdLstm.initial_state()
    bwd_word_reps = binit.transduce(reversed(wordEmbs))
    
    # concatanate the two reps
    wordReps = [dy.concatenate((fwd,bwd) for fwd,bwd in zip(fwd_word_reps,reversed(bwd_word_reps)))]
    
    # compute the scores
    scores = [dy.affine_transform([bSoftmax,wSoftmax,wordRep]) for wordRep in wordReps]
                
    return scores


# lets define a method to compute the scores when teacher forcing is present
def computeScoresWithPrevTag(sent,tags=None):
    # renew cg
    dy.renew_cg()
    wSoftmax = dy.parameter(W_sm)
    bSoftmax = dy.parameter(b_sm)
        
    # get the embeddings for the words
    wordEmbs = [W_emb[x] for x in sent]
    
    # get the init state of bwdLSTM
    binit = bwdLstm.initial_state()
    bwd_word_reps = binit.transduce(reversed(wordEmbs))
    
    # get the init state of fwdLSTM
    finit = fwdLstm.initial_state()
    
    # initialize the previous tag to start tag
    prevTag = start_tag
    scores = []
    
    # iterate through the sentence and compute the scores
    index = 0
    for word,bwd_rep in zip(wordEmbs,reversed(bwd_word_reps)):
        finit = finit.add_input(dy.concatenate([word,tag_emb[prevTag]]))
        fullRep = dy.concatenate([finit.output(),bwd_rep])
        
        #perform softmax and compute score
        score = dy.affine_transform([bSoftmax,wSoftmax,fullRep])
        scores.append(score)
        prediction = np.argmax(score.npvalue())
        
        # check if tags are given
        if tags is not None:
            # if sampler returns true give correct tag
            if sampler.sample_true():
                prevTag = tags[index]
            # if sampler returns false give predicted tag 
            else :
                prevTag = prediction
            index += 1
        else :
            prevTag = prediction
            
    return scores
        

In [9]:
#  now lets define various methods to compute losses,scores,correct Labels

# compute MLE loss 
def computeMLELoss(scores, tags):
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    return dy.esum(losses)


# compute the hamming loss
def computeHammingLoss(predictions,actuals):
    return sum([pred != actual for pred,actual in zip(predictions,actuals)])


# compute the sequence scores. This is the score of entire sentence
def computeSeqScores(scores, tags):
    return dy.esum([score[int(tag)] for score, tag in zip(scores, tags)])


# compute the hamming augmented decoder
def hammingAugDecode(scores, tags):
    augmentedResult = []
    
    for score,tag in zip(scores,tags):
        originalScore = score.npvalue()
        cost = np.ones(originalScore.shape)
        # correct tag will have zero score added
        cost[tag] = 0
        augmentedResult.append(np.argmax(np.add(originalScore,cost)))
    
    return augmentedResult


def computePercepLoss(scores, tags):
    # compute the predictions
    if use_cost_augmented:
        predictions = hammingAugDecode(scores, tags)
    else:
        predictions = [np.argmax(score.npvalue()) for score in scores]

    # declare the margin
    margin = dy.scalarInput(-2)
    
    # check if predictions and tags are same
    if predictions!=tags:
        # compute the sequence scores
        predScore = computeSeqScores(scores,predictions)
        actualScore = computeSeqScores(scores,tags)
        
        # check if we want to use cost augmented loss
        if use_cost_augmented:
            hamming = dy.scalarInput(computeHammingLoss(predictions,tags))
            loss = predScore + hamming - actualScore
        else:
            loss = predScore - actualScore
        
        # check if we want to use hinge loss
        if use_hinge:
            loss = dy.emax([dy.scalarInput(0),loss-margin])
            
    else :
        loss = dy.scalarInput(0)
    
    return loss
  
# method to return loss    
def computeLoss(scores,tags):
    if use_structure_perceptron:
        return computePercepLoss(scores,tags)
    else:
        return computeMLELoss(scores,tags)
    
    
# compute the number of correct classifications
def computeCorr(scores,tags):
    correct = [(np.argmax(score.npvalue())==tag) for score,tag in zip(scores,tags)]
    return sum(correct) 



In [10]:
# lets perform training
print('Starting training ....')

for i in range(20):
    # randomly shuffle the training examples
    random.shuffle(train)
    trainLoss = 0
    totalWords = 0
    numCorr = 0
    startTime = time.time()
    
    for sent,tags in train:
        scores = computeScoresWithPrevTag(sent,tags)
        loss = computeLoss(scores,tags)
        loss.backward()
        trainer.update()
        trainLoss += loss.value()
        totalWords += len(sent)
        numCorr += computeCorr(scores,tags)
    print("Train_loss at iter : ",i," is ",trainLoss/totalWords,' Accuracy : ',(numCorr/totalWords),". Time taken : ",(-startTime+time.time()))
    
    testLoss = 0
    totalWords = 0
    numCorr = 0
    startTime = time.time()
    
    for sent,tags in test:
        scores = computeScoresWithPrevTag(sent)
        loss = computeLoss(scores,tags)
        testLoss += loss.value()
        totalWords += len(sent)
        numCorr += computeCorr(scores,tags)
    print("Test_loss at iter : ",i," is ",testLoss/totalWords,' Accuracy : ',(numCorr/totalWords),". Time taken : ",(-startTime+time.time()))

Starting training ....
Train_loss at iter :  0  is  0.25094247251978047  Accuracy :  0.9208427740910667 . Time taken :  105.90236020088196
Test_loss at iter :  0  is  0.37651000948397484  Accuracy :  0.8714076960545543 . Time taken :  7.030837535858154
Train_loss at iter :  1  is  0.15612875653871944  Accuracy :  0.9614615502463332 . Time taken :  100.41570615768433
Test_loss at iter :  1  is  0.376483876095111  Accuracy :  0.8838669982310867 . Time taken :  6.929711818695068
Train_loss at iter :  2  is  0.11741959584887873  Accuracy :  0.9739233504543658 . Time taken :  89.43639063835144
Test_loss at iter :  2  is  0.38909091671846774  Accuracy :  0.886635732048094 . Time taken :  7.466298818588257
Train_loss at iter :  3  is  0.09210035231969706  Accuracy :  0.9811803030180859 . Time taken :  83.35605072975159
Test_loss at iter :  3  is  0.4200089200420039  Accuracy :  0.8844309995641808 . Time taken :  7.807456016540527
Train_loss at iter :  4  is  0.07256802750115936  Accuracy :  0