In [1]:

import sys,os
# Only when running inside Google Colab: mount Google Drive and change the
# working directory to the assignment folder, so the relative data paths used
# below resolve against it.
if 'google.colab' in sys.modules:
  from google.colab import drive
  drive.mount('/content/gdrive')
  path_to_file = '/content/gdrive/My Drive/AI Sem II/NLP/A2'
  print(path_to_file)
  os.chdir(path_to_file)
  # Shell escape: print the working directory to confirm the chdir took effect.
  !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/AI Sem II/NLP/A2
/content/gdrive/My Drive/AI Sem II/NLP/A2


In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Paths (relative to the working directory) of the parallel French / English
# News Commentary v9 files; line i of one file is paired with line i of the other.
data_fr = 'data/training/news-commentary-v9.fr-en.fr'
data_en = 'data/training/news-commentary-v9.fr-en.en'

# Read each file as bytes and decode line by line as UTF-8.
# Every sentence keeps its trailing newline at this point.
with open(data_fr, 'rb') as fr: 
  sents_fr = [line.decode("utf-8") for line in fr]      
 # sents_cs = [value for value in sents_cs if value != '']   
with open(data_en, 'rb') as en: 
  sents_en = [line.decode("utf-8") for line in en]
 # sents_en = [value for value in sents_en if value != '']
# Display both line counts as the cell output.
len(sents_en), len(sents_fr)

(183251, 183251)

In [4]:
# Longest French sentence, counted in whitespace-separated tokens.
length_fr = [len(i.split()) for i in sents_fr]
max(length_fr)

223

In [5]:
# Longest English sentence, counted in whitespace-separated tokens.
length_en = [len(i.split()) for i in sents_en]
max(length_en)

171

In [6]:
# Reserved vocabulary indices for the start-of-sequence and end-of-sequence
# markers; they match the two entries every Lang starts with in index2word.
SOS_token = 0
EOS_token = 1

class Lang:
  """Vocabulary of one language: word <-> index maps plus word frequencies.

  Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so real
  words are numbered from 2 in order of first appearance.
  """

  def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {0: "SOS", 1: "EOS"}
    self.n_words = 2  # size of index2word, the two markers included

  def addSentence(self, sentence):
    """Register every token of `sentence`, splitting on single spaces."""
    for token in sentence.split(' '):
      self.addWord(token)

  def addWord(self, word):
    """Count `word`, giving it the next free index the first time it is seen."""
    if word in self.word2index:
      self.word2count[word] += 1
      return
    new_index = self.n_words
    self.word2index[word] = new_index
    self.index2word[new_index] = word
    self.word2count[word] = 1
    self.n_words = new_index + 1

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with combining marks (accents) removed.

    The string is NFD-decomposed so accents become separate code points of
    category 'Mn', which are then dropped. Other non-ASCII characters are
    left untouched.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    """Lowercase, trim, and reduce `s` to letters plus '.', '!' and '?'.

    Sentence punctuation is split off as its own space-separated token and
    every run of other non-letter characters collapses to one space.

    :param s: raw sentence (may carry a trailing newline)
    :return: normalised sentence without leading or trailing whitespace
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The substitution above can leave a space at either end (e.g. when the
    # sentence starts or ends with digits or quotes). Strip it so that
    # splitting on ' ' never produces an empty-string "word".
    return s.strip()

In [7]:
def sent_pairs(lang1=sents_fr, lang2=sents_en):
  """Pair up and normalise two aligned corpora and create empty vocabularies.

  :param lang1: source sentences (French by default)
  :param lang2: target sentences (English by default), aligned with `lang1`
  :return: an 11-tuple: five ('fr', 'en') pairs of empty `Lang` objects laid
           out as input_lang1, output_lang1, ..., input_lang5, output_lang5,
           followed by the list of normalised [lang1, lang2] sentence pairs
  """
  pairs = [[normalizeString(src), normalizeString(tgt)]
           for src, tgt in zip(lang1, lang2)]

  # One (input, output) vocabulary pair per set returned to the caller.
  vocabularies = []
  for _ in range(5):
    vocabularies.append(Lang('fr'))
    vocabularies.append(Lang('en'))

  return tuple(vocabularies) + (pairs,)
 

The full process for preparing the data is:

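-  Read the two text files, split them into lines, and zip the aligned lines into sentence pairs
-  Normalize the text (filtering by length and content is not applied in this notebook)
-  Make word lists (vocabularies) from the sentences in the training pairs of each fold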

In [8]:

def prepareData(lang1=sents_fr, lang2=sents_en):
    """Build the 5-fold cross-validation splits and one vocabulary pair per fold.

    The normalised pairs are shuffled with a fixed seed (3) and cut into five
    consecutive test slices of int(0.2 * len(pairs)) pairs each; the last
    slice also takes the remainder. For every fold, the vocabularies are built
    from that fold's training pairs only.

    :param lang1: source sentences (French by default)
    :param lang2: target sentences (English by default), aligned with `lang1`
    :return: a 20-tuple: input_lang1, output_lang1, ..., input_lang5,
             output_lang5, then train_pairs1, test_pairs1, ..., train_pairs5,
             test_pairs5
    """
    n_folds = 5
    prepared = sent_pairs(lang1, lang2)
    langs, pairs = prepared[:-1], prepared[-1]
    print("Read %s sentence pairs" % len(pairs))

    # Assumption: the dataset is news commentary on different topics, so exact
    # duplicate sentences across folds are highly unlikely.
    num_test = int(len(pairs)*0.2)
    print("Number of test pairs:", num_test)
    random.seed(3)
    random.shuffle(pairs)

    splits = []
    for fold in range(n_folds):
        start = fold * num_test
        # The last fold absorbs the remainder so every pair is tested once.
        stop = start + num_test if fold < n_folds - 1 else len(pairs)
        test_pairs = pairs[start:stop]
        train_pairs = pairs[:start] + pairs[stop:]
        print("Number of train pairs:", len(train_pairs))
        print("Counting words...")

        input_lang, output_lang = langs[2 * fold], langs[2 * fold + 1]
        for pair in train_pairs:
            input_lang.addSentence(pair[0])
            output_lang.addSentence(pair[1])
        print("Counted words:")
        print(input_lang.name, input_lang.n_words)
        print(output_lang.name, output_lang.n_words)

        splits.extend((train_pairs, test_pairs))

    return tuple(langs) + tuple(splits)


# Build all five folds once. The names bound here are module-level globals
# that the training and evaluation functions below read directly.
(input_lang1, output_lang1, input_lang2, output_lang2, input_lang3, output_lang3, input_lang4, output_lang4, input_lang5, output_lang5, train_pairs1, test_pairs1, train_pairs2,
 test_pairs2,train_pairs3, test_pairs3, train_pairs4, test_pairs4,train_pairs5, test_pairs5) = prepareData(sents_fr, sents_en)
# Spot-check one random [French, English] training pair from each fold.
print(random.choice(train_pairs1))
print(random.choice(train_pairs2))
print(random.choice(train_pairs3))
print(random.choice(train_pairs4))
print(random.choice(train_pairs5))

Read 183251 sentence pairs
Number of test pairs: 36650
Number of train pairs: 146601
Counting words...
Counted words:
fr 57432
en 42977
Number of train pairs: 146601
Counting words...
Counted words:
fr 57510
en 42960
Number of train pairs: 146601
Counting words...
Counted words:
fr 57715
en 43042
Number of train pairs: 146601
Counting words...
Counted words:
fr 57623
en 43010
Number of train pairs: 146600
Counting words...
Counted words:
fr 57515
en 42988
['de petits groupes de palestiniens demolissent les vestiges des infrastructures industrielles aneanties par les bombes des blocs de beton qui polluent le paysage sablonneux .', 'small groups of palestinians smash up the remains of gaza s bombed industrial infrastructure the concrete blocks that litter the sandy landscape .']
['les hierarques chiites ayant coutume de releguer l avenement du mahdi a un avenir eloigne le penchant millenariste d ahmadinejad les agacent .', 'for the shia religious hierarchy long accustomed to relegating t

In [9]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call."""

    def __init__(self, input_size, hidden_size):
        """
        :param input_size: size of the source vocabulary
        :param hidden_size: width of both the embedding and the GRU state
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the GRU by one step.

        :param input: tensor holding a single token index
        :param hidden: previous hidden state, shape (1, 1, hidden_size)
        :return: (output, hidden), both of shape (1, 1, hidden_size)
        """
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        parameters, so it always matches the weights wherever the module was
        moved, rather than relying on a module-level `device` global.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

class DecoderRNN(nn.Module):
    """Single-layer GRU decoder without attention, producing log-probabilities."""

    def __init__(self, hidden_size, output_size):
        """
        :param hidden_size: width of both the embedding and the GRU state
        :param output_size: size of the target vocabulary
        """
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        :param input: tensor holding the index of the previous output token
        :param hidden: previous hidden state, shape (1, 1, hidden_size)
        :return: (log-probabilities of shape (1, output_size), new hidden state)
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(embedded, hidden)
        return self.softmax(self.out(gru_out[0])), hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        parameters instead of a module-level `device` global.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)
# Number of rows in the encoder-output buffer, i.e. the longest token sequence
# the training / evaluation helpers can hold.
MAX_LENGTH = 235
# Multiplicative attention https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN4(nn.Module):
    """GRU decoder with multiplicative attention over the encoder outputs.

    The attention score of encoder position i is the dot product of
    encoder_outputs[i] with fc(hidden), where `fc` is a bias-free linear map.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """
        :param hidden_size: width of the embedding, the GRU state and the encoder outputs
        :param output_size: size of the target vocabulary
        :param dropout_p: dropout probability applied to the input embedding
        :param max_length: stored on the instance; not used by forward()
        """
        super(AttnDecoderRNN4, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.fc = nn.Linear(hidden_size, hidden_size, bias=False)  # multiplicative attention weights
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step while attending over `encoder_outputs`.

        :param input: tensor holding the index of the previous output token
        :param hidden: previous hidden state, shape (1, 1, hidden_size)
        :param encoder_outputs: matrix of shape (length, hidden_size)
        :return: (log-probabilities (1, output_size), new hidden state,
                  attention weights (1, length))
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Project the hidden state, then score every encoder row against it:
        # (1, length, hidden) x (1, hidden, 1) -> (1, length).
        # NOTE(review): the callers in this notebook pass a zero-padded buffer,
        # so padded rows score 0 and still receive some softmax mass.
        query = self.fc(hidden)
        scores = encoder_outputs.unsqueeze(0).bmm(query.view(1, -1, 1)).view(1, -1)
        attn_weights = F.softmax(scores, dim=1)

        # Weighted sum of the encoder rows: (1, 1, length) x (1, length, hidden).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        parameters instead of a module-level `device` global.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

def indexesFromSentence(lang, sentence):
    """Map each space-separated token of `sentence` to its index in `lang`.

    Tokens that are missing from `lang.word2index` are skipped silently.
    """
    vocab = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        if token in vocab:
            indexes.append(vocab[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column of token indices terminated by EOS_token.

    :return: long tensor of shape (n_known_tokens + 1, 1) on `device`
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a [source, target] sentence pair into index tensors.

    Uses the fold-1 vocabularies `input_lang1` and `output_lang1`.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (tensorFromSentence(input_lang1, source_sentence),
            tensorFromSentence(output_lang1, target_sentence))

# Probability that train() decodes an example with teacher forcing
# (feeding the reference token instead of the model's own prediction).
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    :param input_tensor: source token indices, shape (input_length, 1)
    :param target_tensor: target token indices, shape (target_length, 1)
    :param max_length: number of rows in the zero-padded encoder-output buffer
    :return: the summed loss divided by the target length
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Encode token by token, storing every step's output in a padded buffer.
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # The decoder starts from SOS and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decided once per example, not per step.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the reference token as the next input.
            decoder_input = target_tensor[di]
            continue

        # Otherwise feed back the model's own best guess, detached from the
        # graph, and stop as soon as it predicts EOS.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as 'Xm Ys' (both parts truncated to integers)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Report the time elapsed since `since` and the estimated time remaining.

    :param since: start timestamp as returned by time.time()
    :param percent: completed fraction of the work; must be non-zero
    :return: string of the form '<elapsed> (- <remaining>)'
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on `n_iters` pairs drawn at random (with replacement) from `train_pairs1`.

    Both models are optimised with plain SGD and NLL loss, one sentence pair
    per step. The average loss is printed every `print_every` steps and
    recorded every `plot_every` steps; the recorded curve is passed to
    showPlot() at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0  # reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training examples are sampled and converted to tensors up front.
    training_pairs = [tensorsFromPair(random.choice(train_pairs1))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[step - 1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot the loss curve `points` on a new figure with y-axis ticks every 0.2."""
    # plt.subplots() creates the figure itself; calling plt.figure() first
    # would leave an extra, empty figure open on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [10]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` with the fold-1 vocabularies.

    :param sentence: normalised source sentence
    :param max_length: size of the encoder-output buffer and the maximum
                       number of decoding steps
    :return: (decoded_words, attentions) where decoded_words ends with
             '<EOS>' when the model predicted EOS, and attentions holds one
             row of attention weights per decoding step
    """
    # torch.no_grad() does not disable dropout, so switch both modules to
    # eval mode while decoding and restore their previous mode afterwards.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang1, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang1.index2word[topi.item()])

                # Feed the chosen word back in as the next decoder input.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

class BeamSearchNode(object):
    """One step of a decoding hypothesis, linked back to its predecessor."""

    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate: decoder hidden state stored with this node
        :param previousNode: parent node, or None for the start node
        :param wordId: token id chosen at this step
        :param logProb: accumulated log-probability of the path so far
        :param length: number of nodes on the path, this one included
        '''
        self.h, self.prevNode = hiddenstate, previousNode
        self.wordid, self.logp, self.leng = wordId, logProb, length

    def eval(self, alpha=1.0):
        """Return the length-normalised log-probability plus `alpha` times a reward (currently 0)."""
        reward = 0  # placeholder for a reward-shaping function
        # The 1e-6 term keeps the divisor non-zero for the start node (length 1).
        steps = float(self.leng - 1 + 1e-6)
        normalised = self.logp / steps
        return normalised + alpha * reward

    def __lt__(self, other):
        """Order nodes by their eval() score."""
        return other.eval() > self.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    """Translate `sentence` with a best-first search expanding `beam_size` words per step.

    Hypotheses live in a priority queue ordered by negated, length-normalised
    log-probability. The search stops at the first popped hypothesis that ends
    in EOS, or once more than 2000 hypotheses have been queued.

    :param max_length: size of the encoder-output buffer
    :param beam_size: number of continuations queued for every expanded node
    :return: (utterance, None) where utterance is the list of words on the
             chosen path, including the leading 'SOS' marker and, when
             reached, the trailing 'EOS' marker

    NOTE(review): hypothesis length is not bounded. With beam_size=1 the queue
    never grows, so the 2000-node cut-off is never reached if the model never
    emits EOS.
    """
    # torch.no_grad() does not disable dropout, so switch both modules to
    # eval mode while decoding and restore their previous mode afterwards.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang1, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            # Number of sentences to generate
            endnodes = []
            number_required = 1

            # starting node - hidden vector, previous node, word id, logp, length
            node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
            nodes = PriorityQueue()

            # start the queue
            nodes.put((-node.eval(), node))
            qsize = 1

            # start the search
            while True:
                # give up when decoding takes too long
                if qsize > 2000: break

                # fetch the best node
                score, n = nodes.get()
                decoder_input = n.wordid
                decoder_hidden = n.h

                if n.wordid.item() == EOS_token and n.prevNode is not None:
                    endnodes.append((score, n))
                    # if we reached the number of sentences required
                    if len(endnodes) >= number_required:
                        break
                    else:
                        continue

                # decode for one step using the decoder
                decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

                # keep the beam_size most likely next words
                log_prob, indexes = torch.topk(decoder_output, beam_size)
                nextnodes = []

                for new_k in range(beam_size):
                    decoded_t = indexes[0][new_k].view(1, -1)
                    log_p = log_prob[0][new_k].item()

                    node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                    nextnodes.append((-node.eval(), node))

                # put them into the queue
                for next_score, next_node in nextnodes:
                    nodes.put((next_score, next_node))

                # one node was popped, len(nextnodes) were pushed
                qsize += len(nextnodes) - 1

            # no finished hypothesis: fall back to the best queued one
            if len(endnodes) == 0:
                endnodes = [nodes.get() for _ in range(number_required)]

            _, n = endnodes[0]
            utterance = [output_lang1.index2word[n.wordid.item()]]

            # back trace to the start node, then reverse into reading order
            while n.prevNode is not None:
                n = n.prevNode
                utterance.append(output_lang1.index2word[n.wordid.item()])

            utterance = utterance[::-1]

        return utterance, None
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` random pairs from `test_pairs1` next to the model's greedy translation.

    Each sample is printed as '>' source, '=' reference, '<' model output.
    """
    for _ in range(n):
        sample = random.choice(test_pairs1)
        source, reference = sample[0], sample[1]
        print('>', source)
        print('=', reference)
        predicted_words, _ = evaluate(encoder, decoder, source)
        print('<', ' '.join(predicted_words))
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    """Compute corpus BLEU-1, BLEU-2 and BLEU-3 of greedy decoding on `test_pairs1`.

    Every test pair is [source sentence, reference translation]; the reference
    is tokenised by splitting on single spaces. Progress is printed every
    1000 sentences.

    :return: [bleu1, bleu2, bleu3]
    """
    references, candidates = [], []
    for count, (source_sent, reference_sent) in enumerate(test_pairs1, 1):
        output_words, _ = evaluate(encoder, decoder, source_sent)
        # evaluate() appends a '<EOS>' marker that never occurs in a
        # reference; drop it so it is not scored as a wrong word.
        candidates.append([word for word in output_words if word != '<EOS>'])
        # corpus_bleu expects a list of references per candidate.
        references.append([reference_sent.split(' ')])
        if count % 1000 == 0:
            print(count)
    # BLEU-n uses uniform weights over the 1..n-gram precisions.
    third = 1.0 / 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(third, third, third))
    return [score1, score2, score3]

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    """Compute corpus BLEU-1, BLEU-2 and BLEU-3 of beam-search decoding on `test_pairs1`.

    Every test pair is [source sentence, reference translation]; the reference
    is tokenised by splitting on single spaces.

    :param beam_size: forwarded to evaluate_beam_search()
    :return: [bleu1, bleu2, bleu3]
    """
    references, candidates = [], []
    for source_sent, reference_sent in test_pairs1:
        output_words, _ = evaluate_beam_search(encoder, decoder, source_sent, beam_size=beam_size)
        # The back-traced path carries the 'SOS' / 'EOS' markers, which never
        # occur in a (lower-cased) reference; drop them before scoring.
        candidates.append([word for word in output_words if word not in ('SOS', 'EOS')])
        # corpus_bleu expects a list of references per candidate.
        references.append([reference_sent.split(' ')])
    # BLEU-n uses uniform weights over the 1..n-gram precisions.
    third = 1.0 / 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(third, third, third))
    return [score1, score2, score3]

In [11]:
# Fold 1: build a fresh encoder / attention-decoder pair sized to the fold-1
# vocabularies and move both to the selected device.
hidden_size = 256
encoder1 = EncoderRNN(input_lang1.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN4(hidden_size, output_lang1.n_words, dropout_p=0.1).to(device)

# 75,000 single-pair training steps, then corpus BLEU on the fold-1 test pairs.
trainIters(encoder1, attn_decoder1, 75000, print_every=1000) #reduce number of epoch
print("Fold 1 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 3s (- 78m 1s) (1000 1%) 6.1003
2m 0s (- 73m 35s) (2000 2%) 5.7335
3m 0s (- 72m 7s) (3000 4%) 5.6152
3m 58s (- 70m 42s) (4000 5%) 5.6211
5m 0s (- 70m 11s) (5000 6%) 5.5981
5m 58s (- 68m 37s) (6000 8%) 5.3672
6m 56s (- 67m 23s) (7000 9%) 5.4511
7m 57s (- 66m 37s) (8000 10%) 5.4655
8m 56s (- 65m 36s) (9000 12%) 5.4167
9m 57s (- 64m 41s) (10000 13%) 5.3526
10m 56s (- 63m 41s) (11000 14%) 5.4261
11m 57s (- 62m 46s) (12000 16%) 5.4372
12m 58s (- 61m 54s) (13000 17%) 5.3913
14m 1s (- 61m 5s) (14000 18%) 5.5450
15m 3s (- 60m 13s) (15000 20%) 5.5362
16m 6s (- 59m 24s) (16000 21%) 5.7007
17m 8s (- 58m 28s) (17000 22%) 5.5740
18m 13s (- 57m 42s) (18000 24%) 5.6719
19m 15s (- 56m 44s) (19000 25%) 5.5782
20m 21s (- 55m 58s) (20000 26%) 5.6486
21m 25s (- 55m 4s) (21000 28%) 5.4895
22m 27s (- 54m 6s) (22000 29%) 5.6012
23m 32s (- 53m 13s) (23000 30%) 5.5649
24m 37s (- 52m 19s) (24000 32%) 5.5409
25m 41s (- 51m 22s) (25000 33%) 5.5699
26m 46s (- 50m 26s) (26000 34%) 5.5490
27m 53s (- 49m 35s) (2700

### Fold 2

In [12]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call."""

    def __init__(self, input_size, hidden_size):
        """
        :param input_size: size of the source vocabulary
        :param hidden_size: width of both the embedding and the GRU state
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the GRU by one step.

        :param input: tensor holding a single token index
        :param hidden: previous hidden state, shape (1, 1, hidden_size)
        :return: (output, hidden), both of shape (1, 1, hidden_size)
        """
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        parameters, so it always matches the weights wherever the module was
        moved, rather than relying on a module-level `device` global.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

class DecoderRNN(nn.Module):
    """Single-layer GRU decoder without attention, producing log-probabilities."""

    def __init__(self, hidden_size, output_size):
        """
        :param hidden_size: width of both the embedding and the GRU state
        :param output_size: size of the target vocabulary
        """
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        :param input: tensor holding the index of the previous output token
        :param hidden: previous hidden state, shape (1, 1, hidden_size)
        :return: (log-probabilities of shape (1, output_size), new hidden state)
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(embedded, hidden)
        return self.softmax(self.out(gru_out[0])), hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        parameters instead of a module-level `device` global.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)
# Number of rows in the encoder-output buffer, i.e. the longest token sequence
# the training / evaluation helpers can hold.
MAX_LENGTH = 235
# Multiplicative attention https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN4(nn.Module):
    """GRU decoder with multiplicative attention over the encoder outputs.

    The attention score of encoder position i is the dot product of
    encoder_outputs[i] with fc(hidden), where `fc` is a bias-free linear map.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """
        :param hidden_size: width of the embedding, the GRU state and the encoder outputs
        :param output_size: size of the target vocabulary
        :param dropout_p: dropout probability applied to the input embedding
        :param max_length: stored on the instance; not used by forward()
        """
        super(AttnDecoderRNN4, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.fc = nn.Linear(hidden_size, hidden_size, bias=False)  # multiplicative attention weights
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step while attending over `encoder_outputs`.

        :param input: tensor holding the index of the previous output token
        :param hidden: previous hidden state, shape (1, 1, hidden_size)
        :param encoder_outputs: matrix of shape (length, hidden_size)
        :return: (log-probabilities (1, output_size), new hidden state,
                  attention weights (1, length))
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Project the hidden state, then score every encoder row against it:
        # (1, length, hidden) x (1, hidden, 1) -> (1, length).
        # NOTE(review): the callers in this notebook pass a zero-padded buffer,
        # so padded rows score 0 and still receive some softmax mass.
        query = self.fc(hidden)
        scores = encoder_outputs.unsqueeze(0).bmm(query.view(1, -1, 1)).view(1, -1)
        attn_weights = F.softmax(scores, dim=1)

        # Weighted sum of the encoder rows: (1, 1, length) x (1, length, hidden).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the device that holds this module's
        parameters instead of a module-level `device` global.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

def indexesFromSentence(lang, sentence):
    """Map each space-separated token of `sentence` to its index in `lang`.

    Tokens that are missing from `lang.word2index` are skipped silently.
    """
    vocab = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        if token in vocab:
            indexes.append(vocab[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column of token indices terminated by EOS_token.

    :return: long tensor of shape (n_known_tokens + 1, 1) on `device`
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a [source, target] sentence pair into index tensors.

    Uses the fold-2 vocabularies `input_lang2` and `output_lang2`.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (tensorFromSentence(input_lang2, source_sentence),
            tensorFromSentence(output_lang2, target_sentence))

# Probability that train() decodes an example with teacher forcing
# (feeding the reference token instead of the model's own prediction).
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    :param input_tensor: source token indices, shape (input_length, 1)
    :param target_tensor: target token indices, shape (target_length, 1)
    :param max_length: number of rows in the zero-padded encoder-output buffer
    :return: the summed loss divided by the target length
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Encode token by token, storing every step's output in a padded buffer.
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # The decoder starts from SOS and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decided once per example, not per step.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the reference token as the next input.
            decoder_input = target_tensor[di]
            continue

        # Otherwise feed back the model's own best guess, detached from the
        # graph, and stop as soon as it predicts EOS.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as 'Xm Ys' (both parts truncated to integers)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Report the time elapsed since `since` and the estimated time remaining.

    :param since: start timestamp as returned by time.time()
    :param percent: completed fraction of the work; must be non-zero
    :return: string of the form '<elapsed> (- <remaining>)'
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on `n_iters` pairs drawn at random (with replacement) from `train_pairs2`.

    Both models are optimised with plain SGD and NLL loss, one sentence pair
    per step. The average loss is printed every `print_every` steps and
    recorded every `plot_every` steps; the recorded curve is passed to
    showPlot() at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0  # reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training examples are sampled and converted to tensors up front.
    training_pairs = [tensorsFromPair(random.choice(train_pairs2))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[step - 1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot the loss curve `points` on a new figure with y-axis ticks every 0.2."""
    # plt.subplots() creates the figure itself; calling plt.figure() first
    # would leave an extra, empty figure open on every call.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [13]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode a translation of ``sentence`` (fold-2 vocabularies).

    :param encoder: encoder module, run one input token at a time.
    :param decoder: attention decoder called as
        ``decoder(input, hidden, encoder_outputs)``.
    :param sentence: source sentence, converted with
        ``tensorFromSentence(input_lang2, ...)``.
    :param max_length: number of rows in the encoder-output buffer and the
        maximum number of decoding steps.
    :return: ``(decoded_words, attentions)``; ``decoded_words`` ends with
        ``'<EOS>'`` when the decoder emitted EOS within ``max_length`` steps,
        and ``attentions`` holds one row of attention weights per step.
    """
    # torch.no_grad() only disables gradient tracking; dropout stays active in
    # training mode. Decode in eval mode so the output is deterministic, then
    # put both modules back into whatever mode they were in.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang2, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            # The decoder starts from the encoder's final hidden state.
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang2.index2word[topi.item()])

                # Feed the chosen token back in as the next decoder input.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

class BeamSearchNode(object):
    """One step of a decoding hypothesis.

    Nodes are chained through ``prevNode`` so a hypothesis can be rebuilt by
    walking from a node back to the root.
    """

    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate: decoder hidden state kept on the node as ``h``
        :param previousNode: parent node (``None`` for the root)
        :param wordId: token id held by this node
        :param logProb: log-probability accumulated along the path to this node
        :param length: number of nodes on the path, this one included
        '''
        self.h, self.prevNode = hiddenstate, previousNode
        self.wordid, self.logp, self.leng = wordId, logProb, length

    def eval(self, alpha=1.0):
        """Return the path score: ``logp`` normalised by ``leng - 1`` plus a shaped reward."""
        # Placeholder for a reward-shaping term; currently always zero.
        shaping_reward = 0
        # The 1e-6 keeps the divisor non-zero for the root node (leng == 1).
        normaliser = float(self.leng - 1 + 1e-6)
        return self.logp / normaliser + alpha * shaping_reward

    def __lt__(self, other):
        """Order nodes by score so tuples containing nodes can be compared on ties."""
        own_score = self.eval()
        other_score = other.eval()
        return own_score < other_score
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    """Decode a translation of ``sentence`` with a priority-queue search (fold-2 vocabularies).

    The search repeatedly pops the best-scoring node (per
    ``BeamSearchNode.eval``), expands it with its ``beam_size`` most likely
    next tokens, and stops at the first popped node that holds EOS, or once
    the queue-size counter exceeds 2000.

    :param encoder: encoder module, run one input token at a time.
    :param decoder: attention decoder called as
        ``decoder(input, hidden, encoder_outputs)``.
    :param sentence: source sentence, converted with
        ``tensorFromSentence(input_lang2, ...)``.
    :param max_length: number of rows in the encoder-output buffer.
    :param beam_size: number of child nodes created per expanded node.
    :return: ``(utterance, None)``; ``utterance`` lists the words from the
        root node (the start token) through the selected node, both included.
    """
    # torch.no_grad() only disables gradient tracking; dropout stays active in
    # training mode. Decode in eval mode, then restore the previous modes.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang2, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            # Number of sentences to generate
            endnodes = []
            number_required = 1

            # starting node -  hidden vector, previous node, word id, logp, length
            node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
            nodes = PriorityQueue()

            # PriorityQueue pops the smallest item first, hence the negated score.
            nodes.put((-node.eval(), node))
            qsize = 1

            # NOTE: path length is not capped by max_length here; only the
            # 2000-node limit below bounds the search.
            while True:
                # give up when decoding takes too long
                if qsize > 2000:
                    break

                # fetch the best node
                score, n = nodes.get()
                decoder_input = n.wordid
                decoder_hidden = n.h

                # The root also carries a token id, so require a parent
                # before treating EOS as the end of a hypothesis.
                if n.wordid.item() == EOS_token and n.prevNode is not None:
                    endnodes.append((score, n))
                    # if we reached maximum # of sentences required
                    if len(endnodes) >= number_required:
                        break
                    else:
                        continue

                # decode for one step using decoder
                decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

                # keep the beam_size most likely continuations of this node
                log_prob, indexes = torch.topk(decoder_output, beam_size)
                nextnodes = []

                for new_k in range(beam_size):
                    decoded_t = indexes[0][new_k].view(1, -1)
                    log_p = log_prob[0][new_k].item()

                    node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                    nextnodes.append((-node.eval(), node))

                # put them into queue
                for scored_node in nextnodes:
                    nodes.put(scored_node)

                # one node was popped, len(nextnodes) were pushed
                qsize += len(nextnodes) - 1

            # No hypothesis reached EOS: fall back to the best node still queued.
            if len(endnodes) == 0:
                endnodes = [nodes.get() for _ in range(number_required)]

            _, n = endnodes[0]
            utterance = [output_lang2.index2word[n.wordid.item()]]

            # back trace
            while n.prevNode is not None:
                n = n.prevNode
                utterance.append(output_lang2.index2word[n.wordid.item()])

            utterance = utterance[::-1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return utterance, None

def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly chosen ``test_pairs2`` entries next to the greedy model output.

    Each sample is printed as three lines (``>`` input, ``=`` reference,
    ``<`` model output) followed by an empty line.
    """
    for _ in range(n):
        source_sentence, reference_sentence = random.choice(test_pairs2)
        print('>', source_sentence)
        print('=', reference_sentence)
        predicted_words = evaluate(encoder, decoder, source_sentence)[0]
        print('<', ' '.join(predicted_words))
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    """Compute corpus-level BLEU-1/2/3 of greedy decoding over ``test_pairs2``.

    A progress counter is printed every 1000 sentences.

    :param encoder: encoder passed through to ``evaluate``.
    :param decoder: decoder passed through to ``evaluate``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for count, (source_sentence, reference_sentence) in enumerate(test_pairs2, start=1):
        output_words, _ = evaluate(encoder, decoder, source_sentence)
        # corpus_bleu takes a list of reference token lists per candidate;
        # there is exactly one reference per sentence here.
        references.append([reference_sentence.split(' ')])
        # NOTE(review): evaluate() appends a '<EOS>' marker that the reference
        # never contains -- confirm whether it should be stripped before scoring.
        candidates.append(output_words)
        if count % 1000 == 0:
            print(count)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # Uniform BLEU-3 weights must sum to 1; (0.33, 0.33, 0.33) only sums to 0.99.
    score3 = corpus_bleu(references, candidates, weights=(1 / 3, 1 / 3, 1 / 3))
    return [score1, score2, score3]

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    """Compute corpus-level BLEU-1/2/3 of ``evaluate_beam_search`` over ``test_pairs2``.

    :param encoder: encoder passed through to ``evaluate_beam_search``.
    :param decoder: decoder passed through to ``evaluate_beam_search``.
    :param beam_size: forwarded as the search's ``beam_size``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for source_sentence, reference_sentence in test_pairs2:
        output_words, _ = evaluate_beam_search(encoder, decoder, source_sentence, beam_size=beam_size)
        # corpus_bleu takes a list of reference token lists per candidate;
        # there is exactly one reference per sentence here.
        references.append([reference_sentence.split(' ')])
        # NOTE(review): the returned utterance includes the start-token word
        # (and the EOS word when reached) -- confirm whether these should be
        # stripped before scoring.
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # Uniform BLEU-3 weights must sum to 1; (0.33, 0.33, 0.33) only sums to 0.99.
    score3 = corpus_bleu(references, candidates, weights=(1 / 3, 1 / 3, 1 / 3))
    return [score1, score2, score3]

In [14]:
# Fold 2 run: build a fresh encoder and attention decoder sized to this fold's
# vocabularies, train them, then print the BLEU scores returned by evaluateBleu.
hidden_size = 256
encoder1 = EncoderRNN(input_lang2.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN4(hidden_size, output_lang2.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=1000)  # 75000 single-pair training iterations (not epochs)
print("Fold 2 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 2s (- 77m 41s) (1000 1%) 6.1116
2m 3s (- 75m 6s) (2000 2%) 5.7738
3m 2s (- 72m 52s) (3000 4%) 5.5165
4m 0s (- 71m 16s) (4000 5%) 5.5829
5m 0s (- 70m 10s) (5000 6%) 5.6764
5m 58s (- 68m 48s) (6000 8%) 5.4336
6m 59s (- 67m 52s) (7000 9%) 5.4825
8m 0s (- 67m 3s) (8000 10%) 5.5177
9m 1s (- 66m 13s) (9000 12%) 5.6157
10m 2s (- 65m 16s) (10000 13%) 5.4786
11m 1s (- 64m 8s) (11000 14%) 5.3637
12m 2s (- 63m 14s) (12000 16%) 5.4183
13m 1s (- 62m 6s) (13000 17%) 5.3865
14m 2s (- 61m 12s) (14000 18%) 5.5318
15m 5s (- 60m 23s) (15000 20%) 5.6171
16m 9s (- 59m 35s) (16000 21%) 5.6892
17m 12s (- 58m 43s) (17000 22%) 5.6252
18m 14s (- 57m 46s) (18000 24%) 5.5546
19m 20s (- 57m 1s) (19000 25%) 5.6417
20m 24s (- 56m 6s) (20000 26%) 5.6185
21m 27s (- 55m 11s) (21000 28%) 5.5627
22m 32s (- 54m 17s) (22000 29%) 5.5831
23m 37s (- 53m 23s) (23000 30%) 5.6024
24m 41s (- 52m 28s) (24000 32%) 5.5204
25m 44s (- 51m 28s) (25000 33%) 5.5363
26m 49s (- 50m 32s) (26000 34%) 5.5490
27m 52s (- 49m 34s) (27000 36%)

### Fold 3

In [15]:
class EncoderRNN(nn.Module):
    """Embedding layer followed by a GRU, applied to one token per call."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, reshape it to (1, 1, -1) and run it through the GRU.

        :return: the ``(output, hidden)`` pair produced by the GRU.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

class DecoderRNN(nn.Module):
    """Attention-free decoder: embedding, ReLU, GRU, then log-softmax over the vocabulary."""

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        :return: ``(log_probs, hidden)`` where ``log_probs`` is the
            log-softmax of the projected GRU output.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        gru_out, hidden = self.gru(F.relu(embedded), hidden)
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)
# Upper bound on sentence length in tokens; used below as the default
# ``max_length`` that sizes the encoder-output buffer in train()/evaluate().
MAX_LENGTH = 235
# Multiplicative attention https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN4(nn.Module):
    """GRU decoder step with multiplicative attention over the encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Multiplicative attention: bias-free projection of the hidden state.
        self.fc = nn.Linear(hidden_size, hidden_size, bias=False)
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        :param input: index of the previous output token.
        :param hidden: current decoder hidden state.
        :param encoder_outputs: 2-D buffer of encoder outputs, one row per
            input position.
        :return: ``(log_probs, hidden, attn_weights)``.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder row against the projected hidden state.
        memory = encoder_outputs.unsqueeze(0)
        query = self.fc(hidden).view(1, -1, 1)
        scores = torch.bmm(memory, query).view(1, -1)
        attn_weights = F.softmax(scores, dim=1)

        # Attention-weighted sum of the encoder rows.
        context = torch.bmm(attn_weights.unsqueeze(0), memory)

        combined = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_out, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

def indexesFromSentence(lang, sentence):
    """Map the space-separated words of ``sentence`` to their indexes in ``lang``.

    Words that are absent from ``lang.word2index`` are skipped.
    """
    vocabulary = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        if token in vocabulary:
            indexes.append(vocabulary[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indexes terminated by EOS."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a sentence pair into ``(input_tensor, target_tensor)`` with the fold-3 vocabularies."""
    source_sentence, target_sentence = pair[0], pair[1]
    return (tensorFromSentence(input_lang3, source_sentence),
            tensorFromSentence(output_lang3, target_sentence))

# Probability that train() feeds the ground-truth target tokens (instead of
# the decoder's own predictions) as decoder inputs for a given training pair.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) tensor pair.

    The input is encoded token by token, then the target is decoded either
    with teacher forcing (chosen with probability ``teacher_forcing_ratio``)
    or by feeding back the decoder's own top prediction, in which case
    decoding stops early once EOS is predicted.

    :return: the summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    input_length, target_length = input_tensor.size(0), target_tensor.size(0)

    # One row per input position; rows past the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Otherwise feed back the top prediction, detached from history
            decoder_input = decoder_output.topk(1)[1].squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    """Format a duration in seconds as a ``'<minutes>m <seconds>s'`` string.

    Both fields are rendered with ``%d``, so a fractional part is truncated
    rather than rounded.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report the elapsed time and an estimate of the time remaining.

    :param since: start timestamp, as returned by ``time.time()``.
    :param percent: fraction of the work completed so far; the total run
        time is extrapolated as ``elapsed / percent``.
    :return: ``'<elapsed> (- <remaining>)'``, both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train ``encoder``/``decoder`` on ``n_iters`` pairs sampled from ``train_pairs3``.

    Each iteration calls ``train`` on one pair drawn with ``random.choice``.
    The mean loss is printed every ``print_every`` iterations and recorded
    every ``plot_every`` iterations; the recorded means are passed to
    ``showPlot`` at the end.

    :param encoder: encoder module optimised with plain SGD.
    :param decoder: decoder module optimised with its own SGD instance.
    :param n_iters: number of single-pair training steps.
    :param print_every: reporting interval, in iterations.
    :param plot_every: interval at which a mean loss is stored for plotting.
    :param learning_rate: learning rate shared by both optimisers.
    """
    started_at = time.time()
    loss_history = []
    running_print_loss = 0  # cleared after every report
    running_plot_loss = 0  # cleared after every stored plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # All training examples are sampled (with replacement) before the loop starts.
    sampled_pairs = []
    for _ in range(n_iters):
        sampled_pairs.append(tensorsFromPair(random.choice(train_pairs3)))
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(sampled_pairs, start=1):
        step_loss = train(input_tensor, target_tensor, encoder,
                          decoder, encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += step_loss
        running_plot_loss += step_loss

        if step % print_every == 0:
            mean_print_loss = running_print_loss / print_every
            running_print_loss = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, mean_print_loss))

        if step % plot_every == 0:
            loss_history.append(running_plot_loss / plot_every)
            running_plot_loss = 0

    showPlot(loss_history)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot a sequence of values with y-axis major ticks every 0.2.

    :param points: sequence of values to draw (``trainIters`` passes its
        per-``plot_every`` mean losses).
    """
    # plt.subplots() creates the figure itself, so no separate plt.figure()
    # call is made: it would only leave an extra empty, never-closed figure.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

In [16]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode a translation of ``sentence`` (fold-3 vocabularies).

    :param encoder: encoder module, run one input token at a time.
    :param decoder: attention decoder called as
        ``decoder(input, hidden, encoder_outputs)``.
    :param sentence: source sentence, converted with
        ``tensorFromSentence(input_lang3, ...)``.
    :param max_length: number of rows in the encoder-output buffer and the
        maximum number of decoding steps.
    :return: ``(decoded_words, attentions)``; ``decoded_words`` ends with
        ``'<EOS>'`` when the decoder emitted EOS within ``max_length`` steps,
        and ``attentions`` holds one row of attention weights per step.
    """
    # torch.no_grad() only disables gradient tracking; dropout stays active in
    # training mode. Decode in eval mode so the output is deterministic, then
    # put both modules back into whatever mode they were in.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang3, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            # The decoder starts from the encoder's final hidden state.
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang3.index2word[topi.item()])

                # Feed the chosen token back in as the next decoder input.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

class BeamSearchNode(object):
    """One step of a decoding hypothesis.

    Nodes are chained through ``prevNode`` so a hypothesis can be rebuilt by
    walking from a node back to the root.
    """

    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate: decoder hidden state kept on the node as ``h``
        :param previousNode: parent node (``None`` for the root)
        :param wordId: token id held by this node
        :param logProb: log-probability accumulated along the path to this node
        :param length: number of nodes on the path, this one included
        '''
        self.h, self.prevNode = hiddenstate, previousNode
        self.wordid, self.logp, self.leng = wordId, logProb, length

    def eval(self, alpha=1.0):
        """Return the path score: ``logp`` normalised by ``leng - 1`` plus a shaped reward."""
        # Placeholder for a reward-shaping term; currently always zero.
        shaping_reward = 0
        # The 1e-6 keeps the divisor non-zero for the root node (leng == 1).
        normaliser = float(self.leng - 1 + 1e-6)
        return self.logp / normaliser + alpha * shaping_reward

    def __lt__(self, other):
        """Order nodes by score so tuples containing nodes can be compared on ties."""
        own_score = self.eval()
        other_score = other.eval()
        return own_score < other_score
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    """Decode a translation of ``sentence`` with a priority-queue search (fold-3 vocabularies).

    The search repeatedly pops the best-scoring node (per
    ``BeamSearchNode.eval``), expands it with its ``beam_size`` most likely
    next tokens, and stops at the first popped node that holds EOS, or once
    the queue-size counter exceeds 2000.

    :param encoder: encoder module, run one input token at a time.
    :param decoder: attention decoder called as
        ``decoder(input, hidden, encoder_outputs)``.
    :param sentence: source sentence, converted with
        ``tensorFromSentence(input_lang3, ...)``.
    :param max_length: number of rows in the encoder-output buffer.
    :param beam_size: number of child nodes created per expanded node.
    :return: ``(utterance, None)``; ``utterance`` lists the words from the
        root node (the start token) through the selected node, both included.
    """
    # torch.no_grad() only disables gradient tracking; dropout stays active in
    # training mode. Decode in eval mode, then restore the previous modes.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang3, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            # Number of sentences to generate
            endnodes = []
            number_required = 1

            # starting node -  hidden vector, previous node, word id, logp, length
            node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
            nodes = PriorityQueue()

            # PriorityQueue pops the smallest item first, hence the negated score.
            nodes.put((-node.eval(), node))
            qsize = 1

            # NOTE: path length is not capped by max_length here; only the
            # 2000-node limit below bounds the search.
            while True:
                # give up when decoding takes too long
                if qsize > 2000:
                    break

                # fetch the best node
                score, n = nodes.get()
                decoder_input = n.wordid
                decoder_hidden = n.h

                # The root also carries a token id, so require a parent
                # before treating EOS as the end of a hypothesis.
                if n.wordid.item() == EOS_token and n.prevNode is not None:
                    endnodes.append((score, n))
                    # if we reached maximum # of sentences required
                    if len(endnodes) >= number_required:
                        break
                    else:
                        continue

                # decode for one step using decoder
                decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

                # keep the beam_size most likely continuations of this node
                log_prob, indexes = torch.topk(decoder_output, beam_size)
                nextnodes = []

                for new_k in range(beam_size):
                    decoded_t = indexes[0][new_k].view(1, -1)
                    log_p = log_prob[0][new_k].item()

                    node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                    nextnodes.append((-node.eval(), node))

                # put them into queue
                for scored_node in nextnodes:
                    nodes.put(scored_node)

                # one node was popped, len(nextnodes) were pushed
                qsize += len(nextnodes) - 1

            # No hypothesis reached EOS: fall back to the best node still queued.
            if len(endnodes) == 0:
                endnodes = [nodes.get() for _ in range(number_required)]

            _, n = endnodes[0]
            utterance = [output_lang3.index2word[n.wordid.item()]]

            # back trace
            while n.prevNode is not None:
                n = n.prevNode
                utterance.append(output_lang3.index2word[n.wordid.item()])

            utterance = utterance[::-1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return utterance, None

def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly chosen ``test_pairs3`` entries next to the greedy model output.

    Each sample is printed as three lines (``>`` input, ``=`` reference,
    ``<`` model output) followed by an empty line.
    """
    for _ in range(n):
        source_sentence, reference_sentence = random.choice(test_pairs3)
        print('>', source_sentence)
        print('=', reference_sentence)
        predicted_words = evaluate(encoder, decoder, source_sentence)[0]
        print('<', ' '.join(predicted_words))
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    """Compute corpus-level BLEU-1/2/3 of greedy decoding over ``test_pairs3``.

    A progress counter is printed every 1000 sentences.

    :param encoder: encoder passed through to ``evaluate``.
    :param decoder: decoder passed through to ``evaluate``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for count, (source_sentence, reference_sentence) in enumerate(test_pairs3, start=1):
        output_words, _ = evaluate(encoder, decoder, source_sentence)
        # corpus_bleu takes a list of reference token lists per candidate;
        # there is exactly one reference per sentence here.
        references.append([reference_sentence.split(' ')])
        # NOTE(review): evaluate() appends a '<EOS>' marker that the reference
        # never contains -- confirm whether it should be stripped before scoring.
        candidates.append(output_words)
        if count % 1000 == 0:
            print(count)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # Uniform BLEU-3 weights must sum to 1; (0.33, 0.33, 0.33) only sums to 0.99.
    score3 = corpus_bleu(references, candidates, weights=(1 / 3, 1 / 3, 1 / 3))
    return [score1, score2, score3]

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    """Compute corpus-level BLEU-1/2/3 of ``evaluate_beam_search`` over ``test_pairs3``.

    :param encoder: encoder passed through to ``evaluate_beam_search``.
    :param decoder: decoder passed through to ``evaluate_beam_search``.
    :param beam_size: forwarded as the search's ``beam_size``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for source_sentence, reference_sentence in test_pairs3:
        output_words, _ = evaluate_beam_search(encoder, decoder, source_sentence, beam_size=beam_size)
        # corpus_bleu takes a list of reference token lists per candidate;
        # there is exactly one reference per sentence here.
        references.append([reference_sentence.split(' ')])
        # NOTE(review): the returned utterance includes the start-token word
        # (and the EOS word when reached) -- confirm whether these should be
        # stripped before scoring.
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # Uniform BLEU-3 weights must sum to 1; (0.33, 0.33, 0.33) only sums to 0.99.
    score3 = corpus_bleu(references, candidates, weights=(1 / 3, 1 / 3, 1 / 3))
    return [score1, score2, score3]

In [17]:
# Fold 3 run: build a fresh encoder and attention decoder sized to this fold's
# vocabularies, train them, then print the BLEU scores returned by evaluateBleu.
hidden_size = 256
encoder1 = EncoderRNN(input_lang3.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN4(hidden_size, output_lang3.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=1000)  # 75000 single-pair training iterations (not epochs)
print("Fold 3 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 4s (- 80m 6s) (1000 1%) 6.1452
2m 4s (- 75m 33s) (2000 2%) 5.6695
3m 3s (- 73m 17s) (3000 4%) 5.5035
4m 1s (- 71m 29s) (4000 5%) 5.5569
5m 0s (- 70m 4s) (5000 6%) 5.3796
5m 59s (- 68m 58s) (6000 8%) 5.3946
7m 0s (- 68m 3s) (7000 9%) 5.3879
8m 1s (- 67m 8s) (8000 10%) 5.4289
9m 0s (- 66m 5s) (9000 12%) 5.3511
10m 2s (- 65m 18s) (10000 13%) 5.4991
11m 3s (- 64m 17s) (11000 14%) 5.5117
12m 4s (- 63m 23s) (12000 16%) 5.4843
13m 6s (- 62m 32s) (13000 17%) 5.4492
14m 8s (- 61m 37s) (14000 18%) 5.4927
15m 12s (- 60m 49s) (15000 20%) 5.5997
16m 16s (- 60m 1s) (16000 21%) 5.6281
17m 21s (- 59m 13s) (17000 22%) 5.6039
18m 26s (- 58m 24s) (18000 24%) 5.6733
19m 30s (- 57m 30s) (19000 25%) 5.5655
20m 35s (- 56m 37s) (20000 26%) 5.6703
21m 39s (- 55m 40s) (21000 28%) 5.5087
22m 44s (- 54m 47s) (22000 29%) 5.5786
23m 47s (- 53m 48s) (23000 30%) 5.5616
24m 52s (- 52m 52s) (24000 32%) 5.5468
25m 56s (- 51m 52s) (25000 33%) 5.5471
27m 0s (- 50m 53s) (26000 34%) 5.5585
28m 5s (- 49m 55s) (27000 36%) 

### Fold 4

In [18]:
class EncoderRNN(nn.Module):
    """Embedding layer followed by a GRU, applied to one token per call."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, reshape it to (1, 1, -1) and run it through the GRU.

        :return: the ``(output, hidden)`` pair produced by the GRU.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

class DecoderRNN(nn.Module):
    """Attention-free decoder: embedding, ReLU, GRU, then log-softmax over the vocabulary."""

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        :return: ``(log_probs, hidden)`` where ``log_probs`` is the
            log-softmax of the projected GRU output.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        gru_out, hidden = self.gru(F.relu(embedded), hidden)
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)
# Upper bound on sentence length in tokens; used below as the default
# ``max_length`` that sizes the encoder-output buffer in train()/evaluate().
MAX_LENGTH = 235
# Multiplicative attention https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN4(nn.Module):
    """GRU decoder step with multiplicative attention over the encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Multiplicative attention: bias-free projection of the hidden state.
        self.fc = nn.Linear(hidden_size, hidden_size, bias=False)
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        :param input: index of the previous output token.
        :param hidden: current decoder hidden state.
        :param encoder_outputs: 2-D buffer of encoder outputs, one row per
            input position.
        :return: ``(log_probs, hidden, attn_weights)``.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder row against the projected hidden state.
        memory = encoder_outputs.unsqueeze(0)
        query = self.fc(hidden).view(1, -1, 1)
        scores = torch.bmm(memory, query).view(1, -1)
        attn_weights = F.softmax(scores, dim=1)

        # Attention-weighted sum of the encoder rows.
        context = torch.bmm(attn_weights.unsqueeze(0), memory)

        combined = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_out, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

def indexesFromSentence(lang, sentence):
    """Map the space-separated words of ``sentence`` to their indexes in ``lang``.

    Words that are absent from ``lang.word2index`` are skipped.
    """
    vocabulary = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        if token in vocabulary:
            indexes.append(vocabulary[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indexes terminated by EOS."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a sentence pair into ``(input_tensor, target_tensor)`` with the fold-4 vocabularies."""
    source_sentence, target_sentence = pair[0], pair[1]
    return (tensorFromSentence(input_lang4, source_sentence),
            tensorFromSentence(output_lang4, target_sentence))

# Probability that train() feeds the ground-truth target tokens (instead of
# the decoder's own predictions) as decoder inputs for a given training pair.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) tensor pair.

    The input is encoded token by token, then the target is decoded either
    with teacher forcing (chosen with probability ``teacher_forcing_ratio``)
    or by feeding back the decoder's own top prediction, in which case
    decoding stops early once EOS is predicted.

    :return: the summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    input_length, target_length = input_tensor.size(0), target_tensor.size(0)

    # One row per input position; rows past the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
        else:
            # Otherwise feed back the top prediction, detached from history
            decoder_input = decoder_output.topk(1)[1].squeeze().detach()
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    """Format a duration in seconds as a ``'<minutes>m <seconds>s'`` string.

    Both fields are rendered with ``%d``, so a fractional part is truncated
    rather than rounded.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report the elapsed time and an estimate of the time remaining.

    :param since: start timestamp, as returned by ``time.time()``.
    :param percent: fraction of the work completed so far; the total run
        time is extrapolated as ``elapsed / percent``.
    :return: ``'<elapsed> (- <remaining>)'``, both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train ``encoder``/``decoder`` on ``n_iters`` pairs sampled from ``train_pairs4``.

    Each iteration calls ``train`` on one pair drawn with ``random.choice``.
    The mean loss is printed every ``print_every`` iterations and recorded
    every ``plot_every`` iterations; the recorded means are passed to
    ``showPlot`` at the end.

    :param encoder: encoder module optimised with plain SGD.
    :param decoder: decoder module optimised with its own SGD instance.
    :param n_iters: number of single-pair training steps.
    :param print_every: reporting interval, in iterations.
    :param plot_every: interval at which a mean loss is stored for plotting.
    :param learning_rate: learning rate shared by both optimisers.
    """
    started_at = time.time()
    loss_history = []
    running_print_loss = 0  # cleared after every report
    running_plot_loss = 0  # cleared after every stored plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # All training examples are sampled (with replacement) before the loop starts.
    sampled_pairs = []
    for _ in range(n_iters):
        sampled_pairs.append(tensorsFromPair(random.choice(train_pairs4)))
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(sampled_pairs, start=1):
        step_loss = train(input_tensor, target_tensor, encoder,
                          decoder, encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += step_loss
        running_plot_loss += step_loss

        if step % print_every == 0:
            mean_print_loss = running_print_loss / print_every
            running_print_loss = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, mean_print_loss))

        if step % plot_every == 0:
            loss_history.append(running_plot_loss / plot_every)
            running_plot_loss = 0

    showPlot(loss_history)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Draw ``points`` as a line plot with y-axis ticks every 0.2.

    :param points: sequence of values to plot (here: averaged training losses).
    """
    # plt.subplots() creates its own figure; a preceding plt.figure() call
    # would only leave an extra, empty figure open.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [19]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode a translation of ``sentence``.

    The sentence is encoded token by token, then the decoder is run for at
    most ``max_length`` steps, always feeding back its most probable token.

    Both modules are switched to evaluation mode for the duration of the call
    (so stochastic layers such as ``nn.Dropout`` are disabled) and their
    previous training/eval mode is restored before returning.

    :param encoder: encoder module exposing ``initHidden()`` and ``hidden_size``.
    :param decoder: attention decoder returning ``(output, hidden, attention)``.
    :param sentence: source sentence, tokens separated by single spaces.
    :param max_length: number of encoder slots and maximum decoding steps.
    :return: ``(decoded_words, decoder_attentions)``; ``decoded_words`` ends
        with ``'<EOS>'`` when the decoder emitted the end token, and
        ``decoder_attentions`` holds one row per executed decoding step.
    """
    # Remember the caller's modes so evaluation leaves no lasting side effect.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang4, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            # The decoder starts from the encoder's final hidden state.
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention
                topv, topi = decoder_output.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang4.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    """Decode a translation of ``sentence`` with a priority-queue beam search.

    The best-scoring open hypothesis is expanded with the decoder's
    ``beam_size`` most probable next tokens until a hypothesis ending in the
    end token is popped, or the search budget is exhausted (in which case the
    best open hypothesis is used).

    Both modules are switched to evaluation mode for the duration of the call
    (so stochastic layers such as ``nn.Dropout`` are disabled) and their
    previous training/eval mode is restored before returning.

    :param encoder: encoder module exposing ``initHidden()`` and ``hidden_size``.
    :param decoder: attention decoder returning ``(output, hidden, attention)``.
    :param sentence: source sentence, tokens separated by single spaces.
    :param max_length: number of encoder output slots.
    :param beam_size: number of successors generated per expanded hypothesis.
    :return: ``(utterance, None)``; ``utterance`` lists the words of the chosen
        path from the start node to the final node, looked up in
        ``output_lang4.index2word`` (so it includes the entry for the start
        token and, when decoding finished, the end token).
    """
    # Remember the caller's modes so evaluation leaves no lasting side effect.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang4, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            # Number of sentence to generate
            endnodes = []
            number_required = 1

            # starting node -  hidden vector, previous node, word id, logp, length
            node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
            nodes = PriorityQueue()

            # start the queue; scores are negated because the queue pops the
            # smallest entry first
            nodes.put((-node.eval(), node))
            qsize = 1

            max_queue_size = 2000
            expansions = 0

            # start beam search
            while True:
                # give up when decoding takes too long. The expansion cap only
                # matters for beam_size == 1, where qsize never grows and the
                # loop would otherwise never end if no end token is produced;
                # for beam_size >= 2 the qsize limit is always reached first.
                if qsize > max_queue_size or expansions > max_queue_size:
                    break

                # fetch the best node
                score, n = nodes.get()
                decoder_input = n.wordid
                decoder_hidden = n.h

                if n.wordid.item() == EOS_token and n.prevNode is not None:
                    endnodes.append((score, n))
                    # if we reached maximum # of sentences required
                    if len(endnodes) >= number_required:
                        break
                    else:
                        continue

                # decode for one step using decoder
                decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
                expansions += 1

                # keep the beam_size most probable successors
                log_prob, indexes = torch.topk(decoder_output, beam_size)
                nextnodes = []

                for new_k in range(beam_size):
                    decoded_t = indexes[0][new_k].view(1, -1)
                    log_p = log_prob[0][new_k].item()

                    node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                    score = -node.eval()
                    nextnodes.append((score, node))

                # put them into queue
                for next_score, next_node in nextnodes:
                    nodes.put((next_score, next_node))

                # one node was popped, len(nextnodes) were pushed
                qsize += len(nextnodes) - 1

            # choose nbest paths, back trace them
            if len(endnodes) == 0:
                endnodes = [nodes.get() for _ in range(number_required)]

            _, n = endnodes[0]
            utterance = []
            utterance.append(output_lang4.index2word[n.wordid.item()])

            # back trace
            while n.prevNode is not None:
                n = n.prevNode
                utterance.append(output_lang4.index2word[n.wordid.item()])

            utterance = utterance[::-1]

        return utterance, None
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

def evaluateRandomly(encoder, decoder, n=10):
    """Print greedy translations for ``n`` random pairs from ``test_pairs4``.

    For each sampled pair the source (``>``), the reference (``=``) and the
    output of ``evaluate`` (``<``) are printed, followed by an empty line.

    :param encoder: encoder module passed on to ``evaluate``.
    :param decoder: decoder module passed on to ``evaluate``.
    :param n: number of pairs to sample (with replacement).
    """
    for _ in range(n):
        sample = random.choice(test_pairs4)
        source_sentence, reference_sentence = sample[0], sample[1]
        print('>', source_sentence)
        print('=', reference_sentence)
        decoded_tokens, _attention = evaluate(encoder, decoder, source_sentence)
        print('<', ' '.join(decoded_tokens))
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    """Compute corpus BLEU-1/2/3 of greedy decoding over ``test_pairs4``.

    Every source sentence is translated with ``evaluate`` and compared with
    its single reference. A progress counter is printed every 1000 sentences.

    :param encoder: encoder module passed on to ``evaluate``.
    :param decoder: decoder module passed on to ``evaluate``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for count, (sent_eng, sent_fre) in enumerate(test_pairs4, start=1):
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        # evaluate() appends a '<EOS>' marker that never occurs in the
        # reference; scoring it would count as one wrong token per sentence.
        if output_words and output_words[-1] == '<EOS>':
            output_words = output_words[:-1]
        # corpus_bleu expects a list of references per candidate.
        references.append([sent_fre.split(' ')])
        candidates.append(output_words)
        if count % 1000 == 0:
            print(count)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # uniform weights that sum to exactly 1 (0.33 * 3 does not)
    score3 = corpus_bleu(references, candidates, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3))
    scores = [score1, score2, score3]
    return scores

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    """Compute corpus BLEU-1/2/3 of beam-search decoding over ``test_pairs4``.

    :param encoder: encoder module passed on to ``evaluate_beam_search``.
    :param decoder: decoder module passed on to ``evaluate_beam_search``.
    :param beam_size: beam width passed on to ``evaluate_beam_search``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for sent_eng, sent_fre in test_pairs4:
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        # The back-traced path starts at the start-token node and normally
        # ends at the end-token node; neither marker occurs in the reference,
        # so drop them before scoring.
        if output_words and output_words[0] == 'SOS':
            output_words = output_words[1:]
        if output_words and output_words[-1] == 'EOS':
            output_words = output_words[:-1]
        # corpus_bleu expects a list of references per candidate.
        references.append([sent_fre.split(' ')])
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # uniform weights that sum to exactly 1 (0.33 * 3 does not)
    score3 = corpus_bleu(references, candidates, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3))
    scores = [score1, score2, score3]
    return scores

In [None]:
# Fold 4: build a fresh encoder/decoder pair, train it, then report BLEU.
hidden_size = 256
encoder1 = EncoderRNN(input_lang4.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN4(hidden_size, output_lang4.n_words, dropout_p=0.1).to(device)

# 75000 single-pair training steps; progress is printed every 1000 steps.
trainIters(encoder1, attn_decoder1, n_iters=75000, print_every=1000)
print("Fold 4 Bleu-1, Bleu-2, Bleu-3 scores are ",
      evaluateBleu(encoder1, attn_decoder1))

### Fold 5

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call."""

    def __init__(self, input_size, hidden_size):
        """
        :param input_size: number of rows in the embedding table
        :param hidden_size: embedding width and GRU hidden width
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input`` and advance the GRU by one step.

        :return: ``(output, hidden)`` as returned by the GRU.
        """
        # The GRU is fed a (1, 1, hidden_size) tensor: one step, batch of one.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

class DecoderRNN(nn.Module):
    """GRU decoder without attention, producing log-probabilities per step."""

    def __init__(self, hidden_size, output_size):
        """
        :param hidden_size: embedding width and GRU hidden width
        :param output_size: number of embedding rows and of output classes
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        :return: ``(log_probs, hidden)`` where ``log_probs`` is the
            log-softmax over the ``output_size`` classes.
        """
        activated = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(activated, hidden)
        # gru_out[0] drops the step dimension before the output projection.
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)
MAX_LENGTH = 235


# Multiplicative attention https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN4(nn.Module):
    """GRU decoder with multiplicative attention over the encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """
        :param hidden_size: embedding width and GRU hidden width
        :param output_size: number of embedding rows and of output classes
        :param dropout_p: dropout probability applied to the embedded input
        :param max_length: stored on the instance as ``self.max_length``
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Bias-free projection of the hidden state used to score encoder rows.
        self.fc = nn.Linear(hidden_size, hidden_size, bias=False)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        :param input: index of the previous output token
        :param hidden: previous decoder hidden state
        :param encoder_outputs: 2-D tensor with one encoder output per row
        :return: ``(log_probs, hidden, attn_weights)``
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder row by its dot product with the projected
        # hidden state, then normalise the scores across the rows.
        query = self.fc(hidden).view(1, -1, 1)
        memory = encoder_outputs.unsqueeze(0)
        scores = torch.bmm(memory, query).view(1, -1)
        attn_weights = F.softmax(scores, dim=1)

        # Attention-weighted sum of the encoder rows.
        context = torch.bmm(attn_weights.unsqueeze(0), memory)

        merged = torch.cat((token_vec[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        gru_out, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

def indexesFromSentence(lang, sentence):
    """Map the space-separated words of ``sentence`` to vocabulary indices.

    Words that are missing from ``lang.word2index`` are silently skipped.

    :param lang: object exposing a ``word2index`` mapping.
    :param sentence: string whose tokens are separated by single spaces.
    :return: list of indices for the known words, in sentence order.
    """
    vocabulary = lang.word2index
    known_ids = []
    for token in sentence.split(' '):
        if token in vocabulary:
            known_ids.append(vocabulary[token])
    return known_ids


def tensorFromSentence(lang, sentence):
    """Convert ``sentence`` into a column tensor of indices ending in EOS.

    :param lang: vocabulary object passed on to ``indexesFromSentence``.
    :param sentence: string whose tokens are separated by single spaces.
    :return: ``torch.long`` tensor of shape ``(n_known_words + 1, 1)`` on
        ``device``; the last entry is ``EOS_token``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a sentence pair into ``(input_tensor, target_tensor)``.

    ``pair[0]`` is indexed with ``input_lang5`` and ``pair[1]`` with
    ``output_lang5``.
    """
    source_text, target_text = pair[0], pair[1]
    return (tensorFromSentence(input_lang5, source_text),
            tensorFromSentence(output_lang5, target_text))

# Probability that a training step feeds the reference tokens to the decoder.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    :param input_tensor: source token indices, one per row.
    :param target_tensor: target token indices, one per row.
    :param encoder: encoder module exposing ``initHidden()`` and ``hidden_size``.
    :param decoder: attention decoder returning ``(output, hidden, attention)``.
    :param encoder_optimizer: optimiser for the encoder parameters.
    :param decoder_optimizer: optimiser for the decoder parameters.
    :param criterion: loss applied to each decoder output and target token.
    :param max_length: number of rows in the encoder-output buffer.
    :return: the summed loss divided by the full target length (also when
        free-running decoding stopped early at EOS).
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Unused rows of the buffer stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    # Decided once per call, not per decoding step.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[di]
            continue

        # Without teacher forcing: use its own prediction as the next input
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``.

    Both fields are rendered with ``%d``, so any fractional part is truncated.

    :param s: duration in seconds (int or float).
    :return: the formatted string, e.g. ``'2m 5s'`` for 125 seconds.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report the elapsed time and a linear estimate of the time remaining.

    :param since: start timestamp, as returned by ``time.time()``.
    :param percent: fraction of the work already completed; it is used as a
        divisor, so it must be non-zero.
    :return: ``'<elapsed> (- <remaining>)'`` with both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration assuming progress is linear in time.
    projected_total = elapsed / percent
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train ``encoder``/``decoder`` for ``n_iters`` single-pair SGD steps.

    Each step uses one pair drawn at random (with replacement) from
    ``train_pairs5``. The running average loss is printed every
    ``print_every`` steps, recorded every ``plot_every`` steps and finally
    handed to ``showPlot``.

    :param encoder: encoder module to optimise.
    :param decoder: decoder module to optimise.
    :param n_iters: number of training steps.
    :param print_every: interval (in steps) between progress lines.
    :param plot_every: interval (in steps) between recorded loss averages.
    :param learning_rate: learning rate for both SGD optimisers.
    """
    started_at = time.time()
    loss_history = []
    running_print_loss = 0  # cleared after every progress line
    running_plot_loss = 0   # cleared after every recorded plot point

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training examples are sampled and converted to tensors up front.
    sampled_pairs = [tensorsFromPair(random.choice(train_pairs5))
                     for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, sampled in enumerate(sampled_pairs, start=1):
        input_tensor = sampled[0]
        target_tensor = sampled[1]

        step_loss = train(input_tensor, target_tensor, encoder,
                          decoder, encoder_optimizer, decoder_optimizer, criterion)
        running_print_loss += step_loss
        running_plot_loss += step_loss

        if step % print_every == 0:
            mean_print_loss = running_print_loss / print_every
            running_print_loss = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, mean_print_loss))

        if step % plot_every == 0:
            loss_history.append(running_plot_loss / plot_every)
            running_plot_loss = 0

    showPlot(loss_history)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Draw ``points`` as a line plot with y-axis ticks every 0.2.

    :param points: sequence of values to plot (here: averaged training losses).
    """
    # plt.subplots() creates its own figure; a preceding plt.figure() call
    # would only leave an extra, empty figure open.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode a translation of ``sentence``.

    The sentence is encoded token by token, then the decoder is run for at
    most ``max_length`` steps, always feeding back its most probable token.

    Both modules are switched to evaluation mode for the duration of the call
    (so stochastic layers such as ``nn.Dropout`` are disabled) and their
    previous training/eval mode is restored before returning.

    :param encoder: encoder module exposing ``initHidden()`` and ``hidden_size``.
    :param decoder: attention decoder returning ``(output, hidden, attention)``.
    :param sentence: source sentence, tokens separated by single spaces.
    :param max_length: number of encoder slots and maximum decoding steps.
    :return: ``(decoded_words, decoder_attentions)``; ``decoded_words`` ends
        with ``'<EOS>'`` when the decoder emitted the end token, and
        ``decoder_attentions`` holds one row per executed decoding step.
    """
    # Remember the caller's modes so evaluation leaves no lasting side effect.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang5, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            # The decoder starts from the encoder's final hidden state.
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention
                topv, topi = decoder_output.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang5.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    """Decode a translation of ``sentence`` with a priority-queue beam search.

    The best-scoring open hypothesis is expanded with the decoder's
    ``beam_size`` most probable next tokens until a hypothesis ending in the
    end token is popped, or the search budget is exhausted (in which case the
    best open hypothesis is used).

    Both modules are switched to evaluation mode for the duration of the call
    (so stochastic layers such as ``nn.Dropout`` are disabled) and their
    previous training/eval mode is restored before returning.

    :param encoder: encoder module exposing ``initHidden()`` and ``hidden_size``.
    :param decoder: attention decoder returning ``(output, hidden, attention)``.
    :param sentence: source sentence, tokens separated by single spaces.
    :param max_length: number of encoder output slots.
    :param beam_size: number of successors generated per expanded hypothesis.
    :return: ``(utterance, None)``; ``utterance`` lists the words of the chosen
        path from the start node to the final node, looked up in
        ``output_lang5.index2word`` (so it includes the entry for the start
        token and, when decoding finished, the end token).
    """
    # Remember the caller's modes so evaluation leaves no lasting side effect.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang5, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            # Number of sentence to generate
            endnodes = []
            number_required = 1

            # starting node -  hidden vector, previous node, word id, logp, length
            node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
            nodes = PriorityQueue()

            # start the queue; scores are negated because the queue pops the
            # smallest entry first
            nodes.put((-node.eval(), node))
            qsize = 1

            max_queue_size = 2000
            expansions = 0

            # start beam search
            while True:
                # give up when decoding takes too long. The expansion cap only
                # matters for beam_size == 1, where qsize never grows and the
                # loop would otherwise never end if no end token is produced;
                # for beam_size >= 2 the qsize limit is always reached first.
                if qsize > max_queue_size or expansions > max_queue_size:
                    break

                # fetch the best node
                score, n = nodes.get()
                decoder_input = n.wordid
                decoder_hidden = n.h

                if n.wordid.item() == EOS_token and n.prevNode is not None:
                    endnodes.append((score, n))
                    # if we reached maximum # of sentences required
                    if len(endnodes) >= number_required:
                        break
                    else:
                        continue

                # decode for one step using decoder
                decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
                expansions += 1

                # keep the beam_size most probable successors
                log_prob, indexes = torch.topk(decoder_output, beam_size)
                nextnodes = []

                for new_k in range(beam_size):
                    decoded_t = indexes[0][new_k].view(1, -1)
                    log_p = log_prob[0][new_k].item()

                    node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                    score = -node.eval()
                    nextnodes.append((score, node))

                # put them into queue
                for next_score, next_node in nextnodes:
                    nodes.put((next_score, next_node))

                # one node was popped, len(nextnodes) were pushed
                qsize += len(nextnodes) - 1

            # choose nbest paths, back trace them
            if len(endnodes) == 0:
                endnodes = [nodes.get() for _ in range(number_required)]

            _, n = endnodes[0]
            utterance = []
            utterance.append(output_lang5.index2word[n.wordid.item()])

            # back trace
            while n.prevNode is not None:
                n = n.prevNode
                utterance.append(output_lang5.index2word[n.wordid.item()])

            utterance = utterance[::-1]

        return utterance, None
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

def evaluateRandomly(encoder, decoder, n=10):
    """Print greedy translations for ``n`` random pairs from ``test_pairs5``.

    For each sampled pair the source (``>``), the reference (``=``) and the
    output of ``evaluate`` (``<``) are printed, followed by an empty line.

    :param encoder: encoder module passed on to ``evaluate``.
    :param decoder: decoder module passed on to ``evaluate``.
    :param n: number of pairs to sample (with replacement).
    """
    for _ in range(n):
        sample = random.choice(test_pairs5)
        source_sentence, reference_sentence = sample[0], sample[1]
        print('>', source_sentence)
        print('=', reference_sentence)
        decoded_tokens, _attention = evaluate(encoder, decoder, source_sentence)
        print('<', ' '.join(decoded_tokens))
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    """Compute corpus BLEU-1/2/3 of greedy decoding over ``test_pairs5``.

    Every source sentence is translated with ``evaluate`` and compared with
    its single reference. A progress counter is printed every 1000 sentences.

    :param encoder: encoder module passed on to ``evaluate``.
    :param decoder: decoder module passed on to ``evaluate``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for count, (sent_eng, sent_fre) in enumerate(test_pairs5, start=1):
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        # evaluate() appends a '<EOS>' marker that never occurs in the
        # reference; scoring it would count as one wrong token per sentence.
        if output_words and output_words[-1] == '<EOS>':
            output_words = output_words[:-1]
        # corpus_bleu expects a list of references per candidate.
        references.append([sent_fre.split(' ')])
        candidates.append(output_words)
        if count % 1000 == 0:
            print(count)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # uniform weights that sum to exactly 1 (0.33 * 3 does not)
    score3 = corpus_bleu(references, candidates, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3))
    scores = [score1, score2, score3]
    return scores

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    """Compute corpus BLEU-1/2/3 of beam-search decoding over ``test_pairs5``.

    :param encoder: encoder module passed on to ``evaluate_beam_search``.
    :param decoder: decoder module passed on to ``evaluate_beam_search``.
    :param beam_size: beam width passed on to ``evaluate_beam_search``.
    :return: ``[bleu1, bleu2, bleu3]``.
    """
    references, candidates = [], []
    for sent_eng, sent_fre in test_pairs5:
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        # The back-traced path starts at the start-token node and normally
        # ends at the end-token node; neither marker occurs in the reference,
        # so drop them before scoring.
        if output_words and output_words[0] == 'SOS':
            output_words = output_words[1:]
        if output_words and output_words[-1] == 'EOS':
            output_words = output_words[:-1]
        # corpus_bleu expects a list of references per candidate.
        references.append([sent_fre.split(' ')])
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    # uniform weights that sum to exactly 1 (0.33 * 3 does not)
    score3 = corpus_bleu(references, candidates, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3))
    scores = [score1, score2, score3]
    return scores

In [None]:
# Fold 5: build a fresh encoder/decoder pair, train it, then report BLEU.
hidden_size = 256
encoder1 = EncoderRNN(input_lang5.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN4(hidden_size, output_lang5.n_words, dropout_p=0.1).to(device)

# 75000 single-pair training steps; progress is printed every 1000 steps.
trainIters(encoder1, attn_decoder1, n_iters=75000, print_every=1000)
print("Fold 5 Bleu-1, Bleu-2, Bleu-3 scores are ",
      evaluateBleu(encoder1, attn_decoder1))

In [None]:
# Print a few sample translations from the fold-5 model.
evaluateRandomly(encoder=encoder1, decoder=attn_decoder1)