In [1]:
##---------------------------------------------------------------------------------
## Summary : Perform language translation using LSTM encoder decoder
## Author  : Srinivas Venkata Vemparala
## Source  : https://github.com/neubig/nn4nlp-code
##---------------------------------------------------------------------------------

%matplotlib inline
import numpy as np
import pandas as pd
import dynet as dy
import time 
import matplotlib.pyplot as plt
import random
from collections import defaultdict

In [2]:
# lets convert word to integer. Since source and target languages are different
# we use two different converters
w2i_src = defaultdict(lambda: len(w2i_src))
w2i_trg = defaultdict(lambda: len(w2i_trg))


# lets write a method to read the sentences from two languages at the same time
def readDataset(srcFileName,trgFileName):
    retList = []
    
    with open(srcFileName,'r+',encoding='utf8') as src, open(trgFileName,'r+',encoding='utf8') as trg:
        for srcLine,trgLine in zip(src,trg):
            srcWords = [w2i_src[x] for x in srcLine.lower().strip().split()]
            trgWords = [w2i_trg[x] for x in trgLine.strip().split()]
            retList.append([srcWords,trgWords])
    return retList

In [3]:
# read the train and test dataSets
train = readDataset('../data/parallel/train.ja','../data/parallel/train.en')

# insert UNK and eos into both languages
UNK_SRC = w2i_src['<unk>']
UNK_TRG = w2i_trg['<unk>']
EOS_SRC = w2i_src['</s>']
EOS_TRG = w2i_trg['</s>']
SOS_TRG = w2i_trg['<s>']

nWords_src = len(w2i_src)
nWords_trg = len(w2i_trg)


print('number of japanese words : ',nWords_src)
print('number of english words : ',nWords_trg)

w2i_src = defaultdict(lambda: UNK_SRC, w2i_src)
w2i_trg = defaultdict(lambda: UNK_TRG, w2i_trg)


# lets write a method to convert the integers back to words
i2w_trg = {v:k for k,v in w2i_trg.items()}

dev = readDataset('../data/parallel/dev.ja','../data/parallel/dev.en')

number of japanese words :  8059
number of english words :  7043


In [4]:
# lets declare the embedding size, Hidden layer size
nEmb = 64
nHid = 128

# lets declare the model and trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

# lets declare the parameters we use in the network
# embeddings
W_emb_src = model.add_lookup_parameters((nWords_src,nEmb))
W_emb_trg = model.add_lookup_parameters(((nWords_trg,nEmb)))

# LSTMS for encoder and decoder
lstmEncoder = dy.LSTMBuilder(1,nEmb,nHid,model)
lstmDecoder = dy.LSTMBuilder(1,nEmb,nHid,model)

# softmax weights
W_sm = model.add_parameters((nWords_trg,nHid))
b_sm = model.add_parameters((nWords_trg))


In [5]:
# lets write a method to compute the loss given a batch of sentences
def computeLoss(sent):
    # renew the computation graph
    dy.renew_cg()
    
    # initialize allLosses and get softmax weights and biases to computation graph
    allLosses = []
    weightsSoftMax = dy.parameter(W_sm)
    biasesSoftMax = dy.parameter(b_sm)
        
    # get the source and target
    src = sent[0]
    trg = sent[1]
    
    # initialize the encoder LSTM
    enc_init = lstmEncoder.initial_state()
    
    # pass the source sent through the encoder get the encoder output
    enc_out = enc_init.add_inputs([W_emb_src[x] for x in src])[-1].output()
    
    # now lets initialize the decoder output and set it enc_out and hidden to tanh(enc_out)
    dec_curState = lstmDecoder.initial_state().set_s([enc_out,dy.tanh(enc_out)])
    
   
    prev_word = trg[0]
    # now iterate through every word in the sent and compute the loss
    for next_word in trg[1:]:
        # feed the current word to the decoder
        dec_curState = dec_curState.add_input(W_emb_trg[prev_word])
        output = dec_curState.output()
        
        # compute the softmax output
        softMaxOut = dy.affine_transform([biasesSoftMax,weightsSoftMax,output]) 
        
        # compute the loss and update the losses
        allLosses.append(dy.pickneglogsoftmax(softMaxOut, next_word))
        
        # update the prev_word
        prev_word = next_word
        
    return dy.esum(allLosses)

In [11]:
# lets declare the maximum sentence size
MAX_SENT_SIZE = 100

# now lets write a method to find the translation
def translate(sent):
    # renew the computation graph
    dy.renew_cg()
    
    # get the weights and biases of softmax to the computation graph
    weightsSoftmax = dy.parameter(W_sm)
    biasesSoftmax = dy.parameter(b_sm)
    
    # pass the input sentence throught the encoder
    enc_init = lstmEncoder.initial_state()
    enc_out = enc_init.add_inputs([W_emb_src[x] for x in sent])[-1].output()
    
    # now lets initialize the decoder output and set it enc_out and hidden to tanh(enc_out)
    dec_curState = lstmDecoder.initial_state().set_s([enc_out,dy.tanh(enc_out)])
    
    # lets declare the first word to be SOS
    prev_word = SOS_TRG
    retSent = []
    
    for i in range(MAX_SENT_SIZE):
        dec_curState = dec_curState.add_input(W_emb_trg[prev_word])
        output = dec_curState.output()
        
        # compute the softmax output
        softMaxOut = dy.affine_transform([biasesSoftmax,weightsSoftmax,output]) 
        
        # compute probabilities of each word and choose the word with highest probability
        # prob = dy.log_softmax(softMaxOut).npvalue()
        next_word = np.argmax(softMaxOut.npvalue())
        
        if next_word == EOS_TRG:
            break
        
        # update the prev_word
        prev_word = next_word
        retSent.append(i2w_trg[next_word])
    return retSent

In [7]:
# lets start the training
print('started training.....')

for i in range(100):
    # shuffle the training examples
    random.shuffle(train)
    
    startTime = time.time()
    trainLoss = devLoss = 0
    
    for sent in train:
        loss = computeLoss(sent)
        trainLoss += loss.value()
        loss.backward()
        trainer.update()
    print("Training loss per sentence at iteration : ",i," is ",trainLoss/len(train),". Time taken : ",(-startTime+time.time()))
    
    startTime = time.time()
    for sent in dev:
        loss = computeLoss(sent)
        devLoss += loss.value()
        loss.backward()
        trainer.update()
    print("Training loss per sentence at iteration : ",i," is ",devLoss/len(dev),". Time taken : ",(-startTime+time.time()))

started training.....
Training loss per sentence at iteration :  0  is  48.786718786621094 . Time taken :  468.01820969581604
Training loss per sentence at iteration :  0  is  43.86825667572022 . Time taken :  22.597745656967163
Training loss per sentence at iteration :  1  is  41.67604865269661 . Time taken :  457.0389778614044
Training loss per sentence at iteration :  1  is  38.1275664434433 . Time taken :  22.603434801101685
Training loss per sentence at iteration :  2  is  38.42163778848648 . Time taken :  462.5360805988312
Training loss per sentence at iteration :  2  is  34.869053071975706 . Time taken :  22.546091079711914
Training loss per sentence at iteration :  3  is  35.774328156399726 . Time taken :  464.0526051521301
Training loss per sentence at iteration :  3  is  32.34425690937042 . Time taken :  22.766379594802856
Training loss per sentence at iteration :  4  is  33.42880968849659 . Time taken :  465.3874969482422
Training loss per sentence at iteration :  4  is  30.

Training loss per sentence at iteration :  78  is  8.67041477765739 . Time taken :  233.38076257705688
Training loss per sentence at iteration :  78  is  10.610354239225387 . Time taken :  11.468691349029541
Training loss per sentence at iteration :  79  is  8.585863729365915 . Time taken :  232.92167377471924
Training loss per sentence at iteration :  79  is  10.463948942124844 . Time taken :  11.390565872192383
Training loss per sentence at iteration :  80  is  8.661557903084159 . Time taken :  234.4440860748291
Training loss per sentence at iteration :  80  is  10.319949804782867 . Time taken :  13.01561689376831
Training loss per sentence at iteration :  81  is  8.604815092778951 . Time taken :  234.6403419971466
Training loss per sentence at iteration :  81  is  10.540611353874207 . Time taken :  11.46874713897705
Training loss per sentence at iteration :  82  is  8.549598021784425 . Time taken :  232.95106434822083
Training loss per sentence at iteration :  82  is  10.31531825113

In [12]:
# now lets translate few sentences
sents = []
sents.append("i like my steak medium")
sents.append("have you ever watched sumo wrestling ?")

sents_i = [[w2i_src[x] for x in sent.lower().strip().split(' ')] for sent in sents]

i=0
for sent,trans in dev:
    print(sent)
    out = translate(sent)
    print(out)
    i += 1
    if(i>5):
        break

[22, 8057, 1, 2049, 147, 41, 168, 183, 8]
['is', 'in', 'this', 'city', '.', '.', 'don', '&apos;t', 'done', '?', '.', 'don', '&apos;t', 'it', 'was', 'his', 'opinion', 'from', 'his', '.', 'don', '&apos;t', 'you', '?', 'don', '&apos;t', 'it', '?', 'wasn', '&apos;t', 'know', 'how', 'to', 'do', '.', 'about', 'the', 'not', '?', '&quot;', '&quot;', '&quot;', '&quot;', '&quot;', '&quot;', '&quot;', '&quot;', '&quot;', 'use', 'his', 'opinion', '.', '&quot;', 'no', 'use', 'the', 'city', '.', '&quot;', 'no', '.', 'we', 'can', 'off', 'his', 'paper', '.', 'too', 'the', 'crops', '.', '&quot;', 'goes', 'to', 'use', 'one', 'of', 'the', '.', 'opinion', '.', 'i', 'am', 'no', '.', 'i', 'can', '&apos;t', 'have', 'a', 'hat', '.', '&quot;', 'never', 'can', 'be', 'raining', '.', 'not', 'solve']
[2527, 16, 1204, 5, 37, 10, 35, 515, 167, 8057, 60, 10, 35, 733, 6, 452, 69, 6, 42, 184, 41, 168, 183, 8]
['i', 'have', 'to', 'have', 'follow', 'my', 'subject', '.', '&quot;', 'i', '&apos;ll', 'get', 'put', 'on', 'the