In [1]:
##---------------------------------------------------------------------
## Summary : Implementing an optimized (mini-batched) version of an NN language model
## Author  : Srinivas Venkata Vemparala
## Source  : https://github.com/neubig/nn4nlp-code
##---------------------------------------------------------------------

%matplotlib inline
import numpy as np
import pandas as pd
import dynet as dy
import time
import matplotlib.pyplot as plt

import random 
from collections import defaultdict

In [2]:
# Vocabulary: looking up an unseen word assigns it the next free integer id.
w2i = defaultdict(lambda: len(w2i))

# Reserve the first two ids (in this order) for the '<s>' and '<unk>' tokens.
S, UNK = w2i['<s>'], w2i['<unk>']

# lets declare a method to read the data
def readDataset(fileName):
    """Read a corpus file and convert each line into a list of word ids.

    Every line is lower-cased, stripped of surrounding whitespace and split on
    single spaces; each resulting token is looked up in the module-level
    ``w2i`` mapping (so what happens for unseen tokens depends on how ``w2i``
    is currently configured).

    Args:
        fileName: path of the text file to read, one sentence per line.

    Returns:
        A list with one entry per line, each entry being the list of integer
        ids for that line's tokens.
    """
    retList = []
    # Open read-only: the file is never written to, and 'r+' would needlessly
    # require write permission (failing on read-only corpora).
    with open(fileName, 'r') as corpus:
        for line in corpus:
            # split(' ') is kept on purpose so tokenisation stays unchanged
            # (a blank line still yields a single empty-string token).
            retList.append([w2i[x] for x in line.lower().strip().split(' ')])

    return retList

In [3]:
# Read the training data first: the vocabulary is built from it alone.
train = readDataset('../data/ptb/train.txt')

# compute the number of words in the vocabulary
nWords = len(w2i)
print('Number of words in vocabulary : ',nWords)

# Freeze the dictionary: keep every known word -> id pair, but make unseen
# words resolve to UNK instead of receiving a fresh id. The second argument
# must be the existing mapping (passing an int here raises TypeError).
w2i = defaultdict(lambda: UNK, w2i)

# Reverse map (id -> word). Built right after freezing, because later lookups of
# unseen words insert extra keys that all share the UNK id.
i2w = {v:k for k,v in w2i.items()}

# Read the held-out data with the frozen vocabulary so that words never seen
# in training map to UNK and cannot enlarge the model's vocabulary.
test = readDataset('../data/ptb/valid.txt')

Number of words in vocabulary :  10000


In [4]:
# Sizes of the word embeddings and of the LSTM hidden state.
nEmb = 32
nHid = 32

# The model owns all trainable parameters; the Adam trainer updates them.
model = dy.Model()
trainer = dy.AdamTrainer(model)

# Embedding table: one nEmb-dimensional vector per vocabulary id.
W_emb = model.add_lookup_parameters((nWords,nEmb))

# LSTM built with arguments (1, nEmb, nHid): per the DyNet builder signature this
# is a single layer taking nEmb-sized inputs and producing nHid-sized states.
rnn = dy.LSTMBuilder(1,nEmb,nHid,model)

# Softmax layer: weights (nWords x nHid) and bias (nWords) that turn the LSTM
# output into one score per vocabulary word.
W_sm = model.add_parameters((nWords,nHid))
b_sm = model.add_parameters((nWords))

In [14]:
def computeLoss(sents):
    """Build the summed next-word negative log-likelihood for a mini-batch.

    The LSTM is started with ``<s>`` for every sentence and, at each time step,
    the current state is used to score the next word of all sentences at once.
    Sentences shorter than the longest one are padded with ``S`` and their
    padded positions are masked out of the loss. The sentences may be given in
    any order (a batch sorted longest-first produces the same result as before).

    Args:
        sents: non-empty list of sentences, each a list of word ids. At least
            one sentence is expected to be non-empty, otherwise there are no
            per-step losses to sum.

    Returns:
        A tuple ``(loss, totalWords)``: the DyNet expression holding the loss
        summed over all time steps and batch elements, and the number of real
        (unpadded) words that contributed to it.

    Raises:
        ValueError: if ``sents`` is empty.
    """
    if not sents:
        raise ValueError('computeLoss needs at least one sentence')

    # renew the computation graph
    dy.renew_cg()

    # lets get the softmax weights
    weightsSoftmax = dy.parameter(W_sm)
    biasesSoftmax = dy.parameter(b_sm)

    # initialize the rnn state
    state = rnn.initial_state()

    batchSize = len(sents)
    # Use the true longest sentence rather than assuming it comes first.
    maxLen = max(len(sent) for sent in sents)

    # get the wids and masks for each step
    totalWords = 0
    wids = []
    masks = []
    for i in range(maxLen):
        # i-th word of every sentence; sentences that already ended are padded with S
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)

        totalWords += sum(mask)

    # start the rnn by inputting <s>
    init_ids = [S] * batchSize
    state = state.add_input(dy.lookup_batch(W_emb, init_ids))

    allLosses = []

    # feed each word to rnn and predict the next word
    for wid, mask in zip(wids, masks):
        # compute the scores biasesSoftmax + weightsSoftmax*state.output()
        scores = dy.affine_transform([biasesSoftmax, weightsSoftmax, state.output()])
        loss = dy.pickneglogsoftmax_batch(scores, wid)

        # Zero the loss of padded positions. Checking for any 0 (instead of only
        # the last sentence) keeps this correct for batches in arbitrary order.
        if 0 in mask:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), batchSize)
            loss = loss * mask_expr

        allLosses.append(loss)

        # update the state of rnn
        state = state.add_input(dy.lookup_batch(W_emb, wid))

    return dy.sum_batches(dy.esum(allLosses)), totalWords
        
            

In [None]:
# Number of sentences scored together in one mini-batch.
MB_SIZE = 16

# Order both corpora by decreasing sentence length so that every mini-batch
# groups sentences of similar length with the longest one first.
train.sort(key=len, reverse=True)
test.sort(key=len, reverse=True)

# Start offset of each mini-batch; the last batch may hold fewer than MB_SIZE sentences.
trainOrder = list(range(0, len(train), MB_SIZE))
testOrder = list(range(0, len(test), MB_SIZE))


# Ten passes over the training data, each followed by an evaluation pass.
for i in range(10):
    # Visit the training batches in a new random order on every pass.
    random.shuffle(trainOrder)
    startTime = time.time()

    trainLoss = 0
    totalWords = 0
    for sid in trainOrder:
        batch = train[sid:sid + MB_SIZE]
        loss_exp, mb_words = computeLoss(batch)

        # Read the loss value first, then back-propagate and apply the update.
        trainLoss += loss_exp.scalar_value()
        totalWords += mb_words
        loss_exp.backward()
        trainer.update()

    # Report the average per-word training loss and the wall-clock time of the pass.
    print(
        'Iteration ', i, ' : ',
        ' TrainingLoss : ', trainLoss/totalWords,
        ' Number of words processed : ', totalWords,
        ' Time taken : ', (time.time()-startTime),
    )
    trainer.update_epoch(1.0)

    # Evaluation pass: accumulate the loss only, no parameter updates.
    testLoss = 0
    testTotalWords = 0
    for sid in testOrder:
        batch = test[sid:sid + MB_SIZE]
        loss_exp, mb_words = computeLoss(batch)

        testLoss += loss_exp.scalar_value()
        testTotalWords += mb_words

    print(
        'Iteration ', i, ' : ',
        ' TestingLoss : ', testLoss/testTotalWords,
        ' Number of words processed : ', testTotalWords,
    )

Iteration  0  :   TrainingLoss :  6.04545369317854  Number of words processed :  887521  Time taken :  298.27337551116943
Iteration  0  :   TestingLoss :  5.912341651965278  Number of words processed :  70390
Iteration  1  :   TrainingLoss :  5.780199519468154  Number of words processed :  887521  Time taken :  307.05250787734985
Iteration  1  :   TestingLoss :  5.732103717758974  Number of words processed :  70390
Iteration  2  :   TrainingLoss :  5.6164246383863485  Number of words processed :  887521  Time taken :  325.9535667896271
Iteration  2  :   TestingLoss :  5.625986420666228  Number of words processed :  70390
Iteration  3  :   TrainingLoss :  5.500422291772433  Number of words processed :  887521  Time taken :  312.1986541748047
Iteration  3  :   TestingLoss :  5.552818499398614  Number of words processed :  70390
Iteration  4  :   TrainingLoss :  5.408813760287597  Number of words processed :  887521  Time taken :  265.47605180740356
Iteration  4  :   TestingLoss :  5.4978