In [1]:
##---------------------------------------------------------------------------------
## Summary : Perform language translation using LSTM encoder decoder with attention
## Author  : Srinivas Venkata Vemparala
## Source  : https://github.com/neubig/nn4nlp-code
##---------------------------------------------------------------------------------

%matplotlib inline
import numpy as np
import pandas as pd
import dynet as dy
import time 
import matplotlib.pyplot as plt
import random
from collections import defaultdict

In [2]:
# lets convert word to integer. Since source and target languages are different
# we use two different converters

w2i_src = defaultdict(lambda: len(w2i_src))
w2i_trg = defaultdict(lambda: len(w2i_trg))


# insert UNK and eos into both languages
UNK_SRC = w2i_src['<unk>']
UNK_TRG = w2i_trg['<unk>']
EOS_SRC = w2i_src['</s>']
EOS_TRG = w2i_trg['</s>']
SOS_TRG = w2i_trg['<s>']


# lets write a method to read the sentences from two languages at the same time
def readDataset(srcFileName,trgFileName):
    retList = []
    
    with open(srcFileName,'r+',encoding='utf8') as src, open(trgFileName,'r+',encoding='utf8') as trg:
        for srcLine,trgLine in zip(src,trg):
            srcWords = [w2i_src[x] for x in srcLine.lower().strip().split()+['</s>']]
            trgWords = [w2i_trg[x] for x in ['<s>']+trgLine.strip().split()+['</s>']]
            retList.append([srcWords,trgWords])
    return retList

In [3]:
# read the train dataSet
train = readDataset('../data/parallel/train.ja','../data/parallel/train.en')
w2i_src = defaultdict(lambda: UNK_SRC, w2i_src)             # when the item is not present it is treated as UNK
w2i_trg = defaultdict(lambda: UNK_TRG, w2i_trg)             # when the item is not present it is treated as UNK

nWordsSrc = len(w2i_src)
nWordsTrg = len(w2i_trg)

print(nWordsSrc,' : ',nWordsTrg)

# read dev and test data 
dev = readDataset('../data/parallel/dev.ja','../data/parallel/dev.en')
test = readDataset('../data/parallel/test.ja','../data/parallel/test.en')

# method to convert the integers back to words
i2w_trg = {v: k for k, v in w2i_trg.items()}
i2w_src = {v: k for k, v in w2i_src.items()}

8059  :  7043


In [4]:
# Model parameters
EMBED_SIZE = 64
HIDDEN_SIZE = 128
ATTENTION_SIZE = 128
BATCH_SIZE = 16

# During the early stages of training and sometimes in later stages of training the model
# can generate very long sentences. To prevent that put a max limit on the sentence size
MAX_SENT_SIZE = 50


# lets declare the model and trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

# lets declare the parameters
# lookup params for the words
W_emb_src = model.add_lookup_parameters((nWordsSrc,EMBED_SIZE))
W_emb_trg = model.add_lookup_parameters((nWordsTrg,EMBED_SIZE))

# LSTMS for the encoders and decoders. We are using the BiDirectional LSTM for source and
# normal one for decoder
lstmSrc = dy.BiRNNBuilder(1,EMBED_SIZE,HIDDEN_SIZE,model,dy.LSTMBuilder)
lstmTrg = dy.LSTMBuilder(1,EMBED_SIZE,HIDDEN_SIZE,model)

# weights for layer before the softmax layer. This is additional component for attention
W_Atten = model.add_parameters((HIDDEN_SIZE,2*HIDDEN_SIZE))
b_Atten = model.add_parameters((HIDDEN_SIZE))

# softmax weights
W_sm = model.add_parameters((nWordsTrg,HIDDEN_SIZE))
b_sm = model.add_parameters((nWordsTrg))

# attention weights
w1_att_src_p = model.add_parameters((ATTENTION_SIZE, HIDDEN_SIZE))
w1_att_tgt_p = model.add_parameters((ATTENTION_SIZE, HIDDEN_SIZE))
w2_att_p = model.add_parameters((ATTENTION_SIZE))


In [5]:
# lets write a method to compute the attention given source output matrix,
# target embedding and fixed component
# we are using MLP w2*tanh(W1*(q,k))
def calc_attention(src_output_matrix,trg_out_embedding,fixed_component):
    # get the attention weights to computation graph
    w1_Trg = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    
    attention = dy.transpose(dy.tanh(dy.colwise_add(fixed_component,w1_Trg*trg_out_embedding)))*w2_att
    alignment = dy.softmax(attention)
    attention = src_output_matrix*alignment
    return attention,alignment

In [6]:
# lets write a method to compute the loss
def computeLoss(sents):
    # renew the computation graph
    dy.renew_cg()
    
    # get the source and target sentences
    src_sents = [x[0] for x in sents]
    trg_sents = [x[1] for x in sents]
    src_cws = []

    # get the length of source sentence 
    len_src = [len(x) for x in src_sents]
    max_src_len = np.max(len_src)
    num_words = 0
    
    # get all the ith words aligned
    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])
        
    # get the output of the encoder LSTM
    src_outputs = [dy.concatenate([x.output(),y.output()]) for x,y in lstmSrc.add_inputs([dy.lookup_batch(W_emb_src,cws) for cws in src_cws])]
    src_output = src_outputs[-1]          # output of encoder
    
    # get the fixed component to caluclate the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att = dy.parameter(w1_att_src_p)
    fixed_component = w1_att * src_output_matrix
    
    # lets do the decoder part now
    allLosses = []
    
    # we need to get the mask and get the target words
    len_trg = [len(x) for x in trg_sents]
    max_trg_len = np.max(len_trg)
    trg_cws = []
    masks = []
    
    for i in range(max_trg_len):
        trg_cws.append([sent[i] if len(sent) > i else EOS_TRG for sent in trg_sents])
        mask = [(1 if len(sent)>i else 0) for sent in trg_sents]
        masks.append(mask)
        num_words += sum(mask)
        
    # initialize the current state of the decoder
    cur_state = lstmTrg.initial_state().set_s([src_output,dy.tanh(src_output)])
    prev_words = trg_cws[0]
    
    # get the parameters to Computation Graph
    weightsAtten = dy.parameter(W_Atten)
    biasesAtten = dy.parameter(b_Atten)
    weightsSm = dy.parameter(W_sm)
    biasesSm = dy.parameter(b_sm)
        
    for next_words,mask in zip(trg_cws[1:],masks):
        # feed the current state into the network and get the output
        cur_state = cur_state.add_input(dy.lookup_batch(W_emb_trg,prev_words))
        output_emb = cur_state.output()
        attention,alignment = calc_attention(src_output_matrix,output_emb,fixed_component)
        middle_expr = dy.tanh(dy.affine_transform([biasesAtten,weightsAtten,dy.concatenate([output_emb,attention])]))
        
        # get the scores and compute the loss
        s = dy.affine_transform([biasesSm,weightsSm,middle_expr])
        loss = (dy.pickneglogsoftmax_batch(s,next_words))
        
        # get the mask
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        
        # compute the loss
        mask_loss = loss * mask_expr
        allLosses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(allLosses)), num_words

In [7]:
# now lets write a method to translate the sentence
def translate(sent):
    dy.renew_cg()
    
    # get the output of encoder
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in lstmSrc.add_inputs([W_emb_src[word] for word in sent])]
    src_output = src_outputs[-1]          # output of encoder
    
    # get the fixed component to caluclate the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att = dy.parameter(w1_att_src_p)
    fixed_component = w1_att * src_output_matrix
    
    
    # initialize the current state of the decoder
    cur_state = lstmTrg.initial_state().set_s([src_output,dy.tanh(src_output)])
    
    # initialize the source sent to SOS_TRG
    prev_word = SOS_TRG
    trg_sent = []
    attention_matrix = []
    
    # get the parameters to Computation Graph
    weightsAtten = dy.parameter(W_Atten)
    biasesAtten = dy.parameter(b_Atten)
    weightsSm = dy.parameter(W_sm)
    biasesSm = dy.parameter(b_sm)
    
    # generate the sentences while sentence length is less than max_sent_length
    for i in range(MAX_SENT_SIZE):
        cur_state = cur_state.add_input(W_emb_trg[prev_word])
        output_emb = cur_state.output()
        attention,alignment = calc_attention(src_output_matrix,output_emb,fixed_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([biasesAtten,weightsAtten,dy.concatenate([output_emb,attention])]))
        
        # get the scores
        s = dy.affine_transform([biasesSm,weightsSm,middle_expr]).value()
        next_word = np.argmax(s)
        
        if next_word == EOS_TRG:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent,dy.concatenate_cols(attention_matrix).value()

In [8]:
# lets write a method to get the data into batches
def create_batches(sorted_dataset, max_batch_size):
    source = [x[0] for x in sorted_dataset]
    src_lengths = [len(x) for x in source]
    batches = []
    prev = src_lengths[0]
    prev_start = 0
    batch_size = 1
    
    for i in range(1, len(src_lengths)):
        if src_lengths[i] != prev or batch_size == max_batch_size:
            batches.append((prev_start, batch_size))
            prev = src_lengths[i]
            prev_start = i
            batch_size = 1
        else:
            batch_size += 1
    
    return batches

In [9]:
# lets start the training
print('started training.....')

for i in range(10):
    # Perform training
    train.sort(key=lambda t: len(t[0]), reverse=True)
    dev.sort(key=lambda t: len(t[0]), reverse=True)
    
    train_order = create_batches(train, BATCH_SIZE)
    dev_order = create_batches(dev, BATCH_SIZE)
    train_words, train_loss = 0, 0.0
    startTime = time.time()
    
    for sent_id, (start, length) in enumerate(train_order):
        train_batch = train[start:start+length]
        my_loss, num_words = computeLoss(train_batch)
        train_loss += my_loss.value()
        train_words += num_words
        my_loss.backward()
        trainer.update()    
    print("Training loss per word at iteration : ",i," is ",train_loss/train_words,". Time taken : ",(-startTime+time.time()))
    
    startTime = time.time()
    dev_words, dev_loss = 0, 0.0
    for sent_id, (start, length) in enumerate(dev_order):
        dev_batch = dev[start:start+length]
        my_loss, num_words = computeLoss(dev_batch)
        dev_loss += my_loss.value()
        dev_words += num_words
    print("Dev loss per word at iteration : ",i," is ",dev_loss/dev_words,". Time taken : ",(-startTime+time.time()))
    

started training.....
Training loss per word at iteration :  0  is  5.027200011916914 . Time taken :  107.88814067840576
Dev loss per word at iteration :  0  is  5.085063047139328 . Time taken :  2.4066691398620605
Training loss per word at iteration :  1  is  4.330869026744074 . Time taken :  107.15615224838257
Dev loss per word at iteration :  1  is  4.7745328823138395 . Time taken :  2.3922231197357178
Training loss per word at iteration :  2  is  4.015895939109432 . Time taken :  107.11620306968689
Dev loss per word at iteration :  2  is  4.542926129296629 . Time taken :  2.4305260181427
Training loss per word at iteration :  3  is  3.7655969351224354 . Time taken :  108.68468761444092
Dev loss per word at iteration :  3  is  4.575525460769799 . Time taken :  2.396254301071167
Training loss per word at iteration :  4  is  3.5617951850061242 . Time taken :  107.58853387832642
Dev loss per word at iteration :  4  is  4.522007629004197 . Time taken :  2.4098799228668213
Training loss 

In [10]:
# lets generate some sentences
src = test[0][0]
trg = test[0][1]

output_sent, attention_matrix = translate(src)
print('original Translation : ',[i2w_trg[x] for x in trg])
print('obtained Translation : ',output_sent)


# lets generate some sentences
src = test[1][0]
trg = test[1][1]

output_sent, attention_matrix = translate(src)
print('original Translation : ',[i2w_trg[x] for x in trg])
print('obtained Translation : ',output_sent)


# lets generate some sentences
src = test[2][0]
trg = test[2][1]

output_sent, attention_matrix = translate(src)
print('original Translation : ',[i2w_trg[x] for x in trg])
print('obtained Translation : ',output_sent)

original Translation :  ['<s>', 'can', 'you', 'do', 'it', 'in', 'one', 'day', '?', '</s>']
obtained Translation :  ['do', 'you', 'have', 'a', 'drink', '?']
original Translation :  ['<s>', 'he', 'stared', 'at', 'me', 'with', 'a', 'dating', 'smile', '.', '</s>']
obtained Translation :  ['i', 'got', 'a', 'lot', 'of', 'breath', '.']
original Translation :  ['<s>', 'it', '&apos;s', 'time', 'to', 'leave', '.', '</s>']
obtained Translation :  ['we', 'are', 'my', 'regular', 'change', '.']
