In [1]:
##---------------------------------------------------------------------------------
## Summary : use LSTMS to retrieve english text from japanese query 
## Author  : Srinivas Venkata Vemparala
## Source  : https://github.com/neubig/nn4nlp-code
##---------------------------------------------------------------------------------

%matplotlib inline
import numpy as np
import pandas as pd
import dynet as dy
import time 
import matplotlib.pyplot as plt
import random
from collections import defaultdict

In [2]:
# lets convert word to integer. Since source and target languages are different
# we use two different converters
w2i_src = defaultdict(lambda: len(w2i_src))
w2i_trg = defaultdict(lambda: len(w2i_trg))

# insert UNK into both languages
UNK_SRC = w2i_src['<unk>']
UNK_TRG = w2i_trg['<unk>']

# lets write a method to read the sentences from two languages at the same time
def readDataset(srcFileName,trgFileName):
    retList = []
    
    with open(srcFileName,'r+',encoding='utf8') as src, open(trgFileName,'r+',encoding='utf8') as trg:
        for srcLine,trgLine in zip(src,trg):
            srcWords = [w2i_src[x] for x in srcLine.lower().strip().split()]
            trgWords = [w2i_trg[x] for x in trgLine.strip().split()]
            retList.append([srcWords,trgWords])
    return retList

In [3]:
# read the train and test dataSets
train = readDataset('../data/parallel/train.ja','../data/parallel/train.en')
test = readDataset('../data/parallel/dev.ja','../data/parallel/dev.en')

nWords_src = len(w2i_src)
nWords_trg = len(w2i_trg)

# lets freeze the dictionary
w2i_src = defaultdict(lambda:UNK, len(w2i_src))
w2i_trg = defaultdict(lambda:UNK, len(w2i_trg))

print('number of japanese words : ',nWords_src)
print('number of english words : ',nWords_trg)

number of japanese words :  8251
number of english words :  7212


In [4]:
# lets declare the embedding,batchSize and hidden layer sizes
nEmb = 64
nHid = 128
MB_SIZE = 16

# lets declare the model and trainer
model = dy.Model()
trainer = dy.AdamTrainer(model)

# lets declare the embeddings for src and target
W_emb_src = model.add_lookup_parameters((nWords_src,nEmb))
W_emb_trg = model.add_lookup_parameters((nWords_trg,nEmb))

# lets declare the lSTMS for the model.we will be using 2 for src and 2 for trg
fwdLSTM_src = dy.LSTMBuilder(1,nEmb,nHid/2,model)
bwdLSTM_src = dy.LSTMBuilder(1,nEmb,nHid/2,model)
fwdLSTM_trg = dy.LSTMBuilder(1,nEmb,nHid/2,model)
bwdLSTM_trg = dy.LSTMBuilder(1,nEmb,nHid/2,model)

# we don't need softmax layer as we will be directly comparing the hidden layer outputs

In [5]:
# lets write a method to encode the sentences
def encodeSent(weightsEMB,fwd,bwd,sents):
    embs = [[weightsEMB[x] for x in sent] for sent in sents]
    return [dy.concatenate([fwd.transduce(x)[-1], bwd.transduce(x)[-1]]) for x in embs]
    

In [6]:
def computeLoss(sents):
    # renew the computation graph
    dy.renew_cg()
    
    # get the initial state of the rnns
    fwd_src = fwdLSTM_src.initial_state()
    bwd_src = bwdLSTM_src.initial_state()
    fwd_trg = fwdLSTM_trg.initial_state()
    bwd_trg = bwdLSTM_trg.initial_state()
    
    # encode the sentences
    source_rep = encodeSent(W_emb_src,fwd_src,bwd_src,[src for src,trg in sents])
    target_rep = encodeSent(W_emb_trg,fwd_trg,bwd_trg,[trg for src,trg in sents])
    
    # concatanate the columns to form a matrix
    mat_src = dy.concatenate_cols(source_rep)
    mat_trg = dy.concatenate_cols(target_rep)
    
    # multiply the src and target matrices to get the similarity
    sim_mat = dy.transpose(mat_src)*mat_trg
    
    # compute the hinge loss over all the dimensions
    hingeLoss = dy.hinge_dim(sim_mat,list(range(len(sents))),d=1,m=0.1) # margin is taken as 0.1 and true labels are elements
    # we can also compute the loss in both directions
    # hingeLoss2 = dy.hinge_dim(sim_mat,list(range(len(sents))),d=0,m=0.1)
    
    return dy.sum_elems(hingeLoss)

In [7]:
# lets write a method to index the corpus. This is later used to retrieve text
def indexCorpus(sents):
    retList = []
    
    # we are doing batching hence we are doing several sentences at the same time
    for sid in range(0,len(sents),MB_SIZE):
        # renew the computation graph
        dy.renew_cg()
        
        # get the initial states of the rnns
        fwd_src = fwdLSTM_src.initial_state()
        bwd_src = bwdLSTM_src.initial_state()
        fwd_trg = fwdLSTM_trg.initial_state()
        bwd_trg = bwdLSTM_trg.initial_state()
        
        # encode the sentences in source and target
        src_rep = encodeSent(W_emb_src,fwd_src,bwd_src,[src for src,trg in sents[sid:min(sid+MB_SIZE,len(sents))]])
        trg_rep = encodeSent(W_emb_src,fwd_trg,bwd_trg,[trg for src,trg in sents[sid:min(sid+MB_SIZE,len(sents))]])
        
        for src_expr,trg_expr in zip(src_rep,trg_rep):
            retList.append([src_expr,trg_expr])
    
    return retList

In [12]:
# Perform retrieval, and return both scores and ranked order of candidates
def retrieve(src, db_mtx):
    scores = np.dot(db_mtx,src)
    ranks = np.argsort(-scores)
    return ranks, scores

In [9]:
# lets start the training

for i in range(20):
    trainLoss = 0
    startTime = time.time()
    for sid in range(0,len(train),MB_SIZE):
        # compute the loss
        loss = computeLoss(train[sid:min(sid+MB_SIZE,len(train))])
        trainLoss += loss.scalar_value()
        loss.backward()
        trainer.update()
    print("Training loss per sentence at iteration : ",i," is ",trainLoss/len(train),". Time taken : ",(startTime-time.time()))
    
    testLoss = 0
    startTime = time.time()
    for sid in range(0,len(test),MB_SIZE):
        # compute the loss
        loss = computeLoss(test[sid:min(sid+MB_SIZE,len(test))])
        testLoss += loss.scalar_value()
    print("Training loss per sentence at iteration : ",i," is ",testLoss/len(test),". Time taken : ",(startTime-time.time()))

Training loss per sentence at iteration :  0  is  0.5724111133098603 . Time taken :  -87.35064840316772
Training loss per sentence at iteration :  0  is  0.29522717463970183 . Time taken :  -1.3724799156188965
Training loss per sentence at iteration :  1  is  0.2053221986114979 . Time taken :  -81.56462407112122
Training loss per sentence at iteration :  1  is  0.21539960038661957 . Time taken :  -1.2559986114501953
Training loss per sentence at iteration :  2  is  0.1016724161490798 . Time taken :  -79.67263054847717
Training loss per sentence at iteration :  2  is  0.2190750108361244 . Time taken :  -1.2630300521850586
Training loss per sentence at iteration :  3  is  0.05503851629793644 . Time taken :  -77.73596835136414
Training loss per sentence at iteration :  3  is  0.2443950602710247 . Time taken :  -1.25600004196167
Training loss per sentence at iteration :  4  is  0.031489483015984296 . Time taken :  -79.1491470336914
Training loss per sentence at iteration :  4  is  0.257245

ValueError: Attempt to use a stale expression.

In [13]:
# evaluate the actual retrival
reps = indexCorpus(test)
trg_mat = np.stack(trg for src,trg in reps)
    
# initialize the losses
recall_1 = recall_5 = recall_10 = 0
for i,(src,trg) in enumerate(reps):
    ranks,scores = retrieve(src,trg_mat)
        
    # check if i is first
    if ranks[0] == i:
        recall_1 += 1
            
    # check if i is in top 5    
    if i in ranks[:5]:
        recall_5 += 1
        
    # check if i is in top 10
    if i in ranks[:10]:
        recall_10 += 1
            
    print('Recall values : ',recall_1/len(test),' : ',recall_5/len(test),' : ',recall_10/len(test))

ValueError: Attempt to use a stale expression.