In [1]:
import sys,os
if 'google.colab' in sys.modules:
  from google.colab import drive
  drive.mount('/content/gdrive')
  path_to_file = '/content/gdrive/My Drive/AI Sem II/NLP/A2'
  print(path_to_file)
  os.chdir(path_to_file)
  !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/AI Sem II/NLP/A2
/content/gdrive/My Drive/AI Sem II/NLP/A2


In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
data_fr = 'data/training/news-commentary-v9.fr-en.fr'
data_en = 'data/training/news-commentary-v9.fr-en.en'

with open(data_fr, 'rb') as fr: 
  sents_fr = [line.decode("utf-8") for line in fr]      
 # sents_cs = [value for value in sents_cs if value != '']   
with open(data_en, 'rb') as en: 
  sents_en = [line.decode("utf-8") for line in en]
 # sents_en = [value for value in sents_en if value != '']
len(sents_en), len(sents_fr)

(183251, 183251)

In [4]:
#max length of string 
length_fr = [len(i.split()) for i in sents_fr]
max(length_fr)

223

In [5]:
length_en = [len(i.split()) for i in sents_en]
max(length_en)

171

In [6]:
SOS_token = 0
EOS_token = 1

class Lang:
  def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = {0: "SOS", 1: "EOS"}
    self.n_words = 2

  def addSentence(self, sentence):
    
    for word in sentence.split(' '):
      self.addWord(word)
  
  def addWord(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word 
      self.n_words += 1
    else:
      self.word2count[word] += 1

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [7]:
def sent_pairs(lang1=sents_fr, lang2=sents_en):
  pairs = []
  for i, (cs_sent, en_sent) in enumerate(zip(lang1, lang2)):
    #if i < 100:
      pairs.append([cs_sent, en_sent])
   # print(i)
  pairs = [[normalizeString(s) for s in line] for line in pairs]
  input_lang1 = Lang('fr')
  output_lang1 = Lang('en')

  input_lang2 = Lang('fr')
  output_lang2 = Lang('en')

  input_lang3 = Lang('fr')
  output_lang3 = Lang('en')

  input_lang4 = Lang('fr')
  output_lang4 = Lang('en')

  input_lang5 = Lang('fr')
  output_lang5 = Lang('en')

     
  return input_lang1, output_lang1, input_lang2, output_lang2, input_lang3, output_lang3, input_lang4, output_lang4, input_lang5, output_lang5, pairs
 

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs

In [8]:

def prepareData(lang1=sents_fr, lang2=sents_en):
    input_lang1, output_lang1, input_lang2, output_lang2, input_lang3, output_lang3, input_lang4, output_lang4, input_lang5, output_lang5, pairs = sent_pairs(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))

    #Assumption: as the dataset is news commentary on different topics, there is highly unlikely exactly the same sentences 
    # collect test pairs
    num_test = int(len(pairs)*0.2)
    print("Number of test pairs:", num_test)
    random.seed(3)
    random.shuffle(pairs)
    
    #fold 1
    test_pairs1 = pairs[:num_test]
     # collect train pairs
    train_pairs1 = pairs[num_test:]
    print("Number of train pairs:", len(train_pairs1))
    print("Counting words...")

    for pair in train_pairs1:      
      input_lang1.addSentence(pair[0])
      output_lang1.addSentence(pair[1])
    print("Counted words:")
    print(input_lang1.name, input_lang1.n_words)
    print(output_lang1.name, output_lang1.n_words)

    #fold 2
    test_pairs2 = pairs[num_test:num_test*2]
     # collect train pairs
    train_pairs2 = pairs[:num_test]
    for x in pairs[num_test*2:]:
      train_pairs2.append(x)
    print("Number of train pairs:", len(train_pairs2))
    print("Counting words...")


    for pair in train_pairs2:      
      input_lang2.addSentence(pair[0])
      output_lang2.addSentence(pair[1])
    print("Counted words:")
    print(input_lang2.name, input_lang2.n_words)
    print(output_lang2.name, output_lang2.n_words)

    #fold 3
    test_pairs3 = pairs[num_test*2:num_test*3]
     # collect train pairs
    train_pairs3 = pairs[:num_test*2]
    for x in pairs[num_test*3:]:
      train_pairs3.append(x)
    print("Number of train pairs:", len(train_pairs3))
    print("Counting words...")


    for pair in train_pairs3:      
      input_lang3.addSentence(pair[0])
      output_lang3.addSentence(pair[1])
    print("Counted words:")
    print(input_lang3.name, input_lang3.n_words)
    print(output_lang3.name, output_lang3.n_words)


    #fold 4
    test_pairs4 = pairs[num_test*3:num_test*4]
     # collect train pairs
    train_pairs4 = pairs[:num_test*3]
    for x in pairs[num_test*4:]:
      train_pairs4.append(x)
    print("Number of train pairs:", len(train_pairs4))
    print("Counting words...")


    for pair in train_pairs4:      
      input_lang4.addSentence(pair[0])
      output_lang4.addSentence(pair[1])
    print("Counted words:")
    print(input_lang4.name, input_lang4.n_words)
    print(output_lang4.name, output_lang4.n_words)

    #fold 5
    test_pairs5 = pairs[num_test*4:]
     # collect train pairs
    train_pairs5 = pairs[:num_test*4]
    print("Number of train pairs:", len(train_pairs5))
    print("Counting words...")


    for pair in train_pairs5:      
      input_lang5.addSentence(pair[0])
      output_lang5.addSentence(pair[1])
    print("Counted words:")
    print(input_lang5.name, input_lang5.n_words)
    print(output_lang5.name, output_lang5.n_words)


    return (input_lang1, output_lang1, input_lang2, output_lang2, input_lang3, output_lang3, input_lang4, output_lang4,input_lang5, output_lang5,train_pairs1, 
            test_pairs1, train_pairs2, test_pairs2, train_pairs3, test_pairs3,train_pairs4, test_pairs4,train_pairs5, test_pairs5)


(input_lang1, output_lang1, input_lang2, output_lang2, input_lang3, output_lang3, input_lang4, output_lang4, input_lang5, output_lang5, train_pairs1, test_pairs1, train_pairs2,
 test_pairs2,train_pairs3, test_pairs3, train_pairs4, test_pairs4,train_pairs5, test_pairs5) = prepareData(sents_fr, sents_en)
print(random.choice(train_pairs1))
print(random.choice(train_pairs2))
print(random.choice(train_pairs3))
print(random.choice(train_pairs4))
print(random.choice(train_pairs5))

Read 183251 sentence pairs
Number of test pairs: 36650
Number of train pairs: 146601
Counting words...
Counted words:
fr 57432
en 42977
Number of train pairs: 146601
Counting words...
Counted words:
fr 57510
en 42960
Number of train pairs: 146601
Counting words...
Counted words:
fr 57715
en 43042
Number of train pairs: 146601
Counting words...
Counted words:
fr 57623
en 43010
Number of train pairs: 146600
Counting words...
Counted words:
fr 57515
en 42988
['de petits groupes de palestiniens demolissent les vestiges des infrastructures industrielles aneanties par les bombes des blocs de beton qui polluent le paysage sablonneux .', 'small groups of palestinians smash up the remains of gaza s bombed industrial infrastructure the concrete blocks that litter the sandy landscape .']
['les hierarques chiites ayant coutume de releguer l avenement du mahdi a un avenir eloigne le penchant millenariste d ahmadinejad les agacent .', 'for the shia religious hierarchy long accustomed to relegating t

In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
MAX_LENGTH = 240
# additive https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN3(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        #################
        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight = nn.Parameter(torch.FloatTensor(1, hidden_size))
        self.attn = nn.Linear(self.hidden_size , self.max_length) #additive
        
        ######################/
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        #print((embedded[0]*hidden[0]).shape)

        #additive
        x = torch.tanh(self.fc_hidden(hidden[0])+self.fc_encoder(encoder_outputs))
        #print(x.unsqueeze(0).shape,self.weight.unsqueeze(2).shape)
        alignment_scores = torch.bmm(x.unsqueeze(0), self.weight.unsqueeze(2))  

        attn_weights = F.softmax(
            #################################
            alignment_scores.view(1,-1), dim=1) #dot product
            #################################
        #print(attn_weights.shape)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #print(attn_applied.shape)
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)

        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang1, pair[0])
    target_tensor = tensorFromSentence(output_lang1, pair[1])
    return (input_tensor, target_tensor)

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs1))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [10]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang1, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang1.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang1, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        # Number of sentence to generate
        endnodes = []
        number_required = 1
        
        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1
        
        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 2000: break

            # fetch the best node
            score, n = nodes.get()
            decoder_input = n.wordid
            decoder_hidden = n.h

            if n.wordid.item() == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue
            #elif n.leng > max_length:
            #    continue

            # decode for one step using decoder
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(decoder_output, beam_size)
            nextnodes = []

            for new_k in range(beam_size):
                decoded_t = indexes[0][new_k].view(1, -1)
                log_p = log_prob[0][new_k].item()

                node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nextnodes.append((score, node))

            # put them into queue
            for i in range(len(nextnodes)):
                score, nn = nextnodes[i]
                nodes.put((score, nn))
                
            # increase qsize
            qsize += len(nextnodes) - 1
            
        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(number_required)]

        _, n = endnodes[0]
        utterance = []
        utterance.append(output_lang1.index2word[n.wordid.item()])
        
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(output_lang1.index2word[n.wordid.item()])

        utterance = utterance[::-1]
            
    return utterance, None

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(test_pairs1)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    references, candidates = [], []
    i= 0
    for sent_eng, sents_fre in test_pairs1:
        i=i+1
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        references.append(sents_fre)
        candidates.append(output_words)
        if i%1000==0:
          print(i)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    references, candidates = [], []
    for sent_eng, sents_fre in test_pairs1:
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        references.append(sents_fre)
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

In [11]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang1.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN3(hidden_size, output_lang1.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=1000) #reduce number of epoch
print("Fold 1 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 7s (- 82m 41s) (1000 1%) 6.1527
2m 9s (- 78m 31s) (2000 2%) 5.8172
3m 11s (- 76m 41s) (3000 4%) 5.5865
4m 12s (- 74m 39s) (4000 5%) 5.5127
5m 16s (- 73m 49s) (5000 6%) 5.5178
6m 16s (- 72m 7s) (6000 8%) 5.3911
7m 18s (- 70m 55s) (7000 9%) 5.5033
8m 22s (- 70m 9s) (8000 10%) 5.4506
9m 26s (- 69m 14s) (9000 12%) 5.5056
10m 32s (- 68m 29s) (10000 13%) 5.4993
11m 38s (- 67m 41s) (11000 14%) 5.6805
12m 44s (- 66m 54s) (12000 16%) 5.6302
13m 52s (- 66m 11s) (13000 17%) 5.6400
15m 0s (- 65m 23s) (14000 18%) 5.6587
16m 7s (- 64m 31s) (15000 20%) 5.6553
17m 14s (- 63m 36s) (16000 21%) 5.7069
18m 20s (- 62m 36s) (17000 22%) 5.6364
19m 29s (- 61m 43s) (18000 24%) 5.6128
20m 35s (- 60m 42s) (19000 25%) 5.6012
21m 46s (- 59m 52s) (20000 26%) 5.6844
22m 54s (- 58m 53s) (21000 28%) 5.5105
24m 0s (- 57m 50s) (22000 29%) 5.5662
25m 9s (- 56m 52s) (23000 30%) 5.5767
26m 19s (- 55m 55s) (24000 32%) 5.5727
27m 26s (- 54m 53s) (25000 33%) 5.5189
28m 35s (- 53m 52s) (26000 34%) 5.5326
29m 46s (- 52m 55s)

###Fold2

In [12]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
MAX_LENGTH = 240
# additive https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN3(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        #################
        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight = nn.Parameter(torch.FloatTensor(1, hidden_size))
        self.attn = nn.Linear(self.hidden_size , self.max_length) #additive
        
        ######################/
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        #print((embedded[0]*hidden[0]).shape)

        #additive
        x = torch.tanh(self.fc_hidden(hidden[0])+self.fc_encoder(encoder_outputs))
        #print(x.unsqueeze(0).shape, self.weight.unsqueeze(2).shape)
        alignment_scores = torch.bmm(x.unsqueeze(0), self.weight.unsqueeze(2))  

        attn_weights = F.softmax(
            #################################
            alignment_scores.view(1,-1), dim=1) #dot product
            #################################
        #print(attn_weights.shape)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #print(attn_applied.shape)
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)

        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang2, pair[0])
    target_tensor = tensorFromSentence(output_lang2, pair[1])
    return (input_tensor, target_tensor)

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs2))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [13]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang2, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang2.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang2, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        # Number of sentence to generate
        endnodes = []
        number_required = 1
        
        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1
        
        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 2000: break

            # fetch the best node
            score, n = nodes.get()
            decoder_input = n.wordid
            decoder_hidden = n.h

            if n.wordid.item() == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue
            #elif n.leng > max_length:
            #    continue

            # decode for one step using decoder
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(decoder_output, beam_size)
            nextnodes = []

            for new_k in range(beam_size):
                decoded_t = indexes[0][new_k].view(1, -1)
                log_p = log_prob[0][new_k].item()

                node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nextnodes.append((score, node))

            # put them into queue
            for i in range(len(nextnodes)):
                score, nn = nextnodes[i]
                nodes.put((score, nn))
                
            # increase qsize
            qsize += len(nextnodes) - 1
            
        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(number_required)]

        _, n = endnodes[0]
        utterance = []
        utterance.append(output_lang2.index2word[n.wordid.item()])
        
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(output_lang2.index2word[n.wordid.item()])

        utterance = utterance[::-1]
            
    return utterance, None

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(test_pairs2)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    references, candidates = [], []
    i= 0
    for sent_eng, sents_fre in test_pairs2:
        i=i+1
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        references.append(sents_fre)
        candidates.append(output_words)
        if i%1000==0:
          print(i)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    references, candidates = [], []
    for sent_eng, sents_fre in test_pairs2:
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        references.append(sents_fre)
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

In [14]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang2.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN3(hidden_size, output_lang2.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=1000) #reduce number of epoch
print("Fold 2 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 6s (- 82m 27s) (1000 1%) 6.1464
2m 10s (- 79m 28s) (2000 2%) 5.7445
3m 13s (- 77m 13s) (3000 4%) 5.5759
4m 14s (- 75m 16s) (4000 5%) 5.5351
5m 17s (- 74m 2s) (5000 6%) 5.6463
6m 17s (- 72m 21s) (6000 8%) 5.2841
7m 20s (- 71m 15s) (7000 9%) 5.3697
8m 23s (- 70m 17s) (8000 10%) 5.4002
9m 26s (- 69m 15s) (9000 12%) 5.4159
10m 30s (- 68m 19s) (10000 13%) 5.4464
11m 32s (- 67m 11s) (11000 14%) 5.3296
12m 39s (- 66m 25s) (12000 16%) 5.5226
13m 43s (- 65m 28s) (13000 17%) 5.6374
14m 50s (- 64m 39s) (14000 18%) 5.6536
15m 57s (- 63m 49s) (15000 20%) 5.6166
17m 4s (- 62m 59s) (16000 21%) 5.6640
18m 11s (- 62m 3s) (17000 22%) 5.5386
19m 17s (- 61m 6s) (18000 24%) 5.6718
20m 28s (- 60m 21s) (19000 25%) 5.6649
21m 36s (- 59m 24s) (20000 26%) 5.6199
22m 44s (- 58m 27s) (21000 28%) 5.5502
23m 52s (- 57m 31s) (22000 29%) 5.5632
25m 1s (- 56m 35s) (23000 30%) 5.5457
26m 10s (- 55m 36s) (24000 32%) 5.5259
27m 17s (- 54m 34s) (25000 33%) 5.5433
28m 25s (- 53m 33s) (26000 34%) 5.4990
29m 32s (- 52m 31

###Fold 3

In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
MAX_LENGTH = 250
# additive https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN3(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        #################
        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight = nn.Parameter(torch.FloatTensor(1, hidden_size))
        self.attn = nn.Linear(self.hidden_size , self.max_length) #additive
        
        ######################/
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        #print((embedded[0]*hidden[0]).shape)

        #additive
        x = torch.tanh(self.fc_hidden(hidden[0])+self.fc_encoder(encoder_outputs))
        #print(x.unsqueeze(0).shape, self.weight.unsqueeze(2).shape)
        alignment_scores = torch.bmm(x.unsqueeze(0), self.weight.unsqueeze(2))  

        attn_weights = F.softmax(
            #################################
            alignment_scores.view(1,-1), dim=1) #dot product
            #################################
        #print(attn_weights.shape)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #print(attn_applied.shape)
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)

        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang2, pair[0])
    target_tensor = tensorFromSentence(output_lang2, pair[1])
    return (input_tensor, target_tensor)

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs2))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [22]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang3, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang3.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang3, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        # Number of sentence to generate
        endnodes = []
        number_required = 1
        
        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1
        
        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 2000: break

            # fetch the best node
            score, n = nodes.get()
            decoder_input = n.wordid
            decoder_hidden = n.h

            if n.wordid.item() == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue
            #elif n.leng > max_length:
            #    continue

            # decode for one step using decoder
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(decoder_output, beam_size)
            nextnodes = []

            for new_k in range(beam_size):
                decoded_t = indexes[0][new_k].view(1, -1)
                log_p = log_prob[0][new_k].item()

                node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nextnodes.append((score, node))

            # put them into queue
            for i in range(len(nextnodes)):
                score, nn = nextnodes[i]
                nodes.put((score, nn))
                
            # increase qsize
            qsize += len(nextnodes) - 1
            
        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(number_required)]

        _, n = endnodes[0]
        utterance = []
        utterance.append(output_lang3.index2word[n.wordid.item()])
        
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(output_lang3.index2word[n.wordid.item()])

        utterance = utterance[::-1]
            
    return utterance, None

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(test_pairs3)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    references, candidates = [], []
    i= 0
    for sent_eng, sents_fre in test_pairs3:
        i=i+1
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        references.append(sents_fre)
        candidates.append(output_words)
        if i%1000==0:
          print(i)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    references, candidates = [], []
    for sent_eng, sents_fre in test_pairs3:
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        references.append(sents_fre)
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

In [23]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang3.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN3(hidden_size, output_lang3.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=1000) #reduce number of epoch
print("Fold 3 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 7s (- 82m 50s) (1000 1%) 6.0418
2m 6s (- 77m 9s) (2000 2%) 5.6940
3m 7s (- 75m 11s) (3000 4%) 5.4610
4m 9s (- 73m 50s) (4000 5%) 5.5075
5m 12s (- 72m 54s) (5000 6%) 5.4781
6m 14s (- 71m 50s) (6000 8%) 5.4042
7m 17s (- 70m 49s) (7000 9%) 5.4121
8m 20s (- 69m 53s) (8000 10%) 5.4246
9m 24s (- 68m 57s) (9000 12%) 5.5348
10m 29s (- 68m 12s) (10000 13%) 5.5128
11m 34s (- 67m 21s) (11000 14%) 5.5826
12m 41s (- 66m 39s) (12000 16%) 5.7057
13m 47s (- 65m 48s) (13000 17%) 5.7344
14m 56s (- 65m 4s) (14000 18%) 5.7456
16m 4s (- 64m 18s) (15000 20%) 5.6854
17m 12s (- 63m 26s) (16000 21%) 5.6593
18m 21s (- 62m 37s) (17000 22%) 5.6733
19m 29s (- 61m 42s) (18000 24%) 5.5957
20m 38s (- 60m 49s) (19000 25%) 5.7049
21m 46s (- 59m 51s) (20000 26%) 5.6549
22m 53s (- 58m 52s) (21000 28%) 5.5534
24m 3s (- 57m 56s) (22000 29%) 5.6106
25m 11s (- 56m 56s) (23000 30%) 5.5915
26m 19s (- 55m 57s) (24000 32%) 5.5994
27m 30s (- 55m 0s) (25000 33%) 5.6146
28m 36s (- 53m 55s) (26000 34%) 5.5497
29m 46s (- 52m 55s) 

###Fold 4

In [24]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
MAX_LENGTH = 245
# additive https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN3(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        #################
        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight = nn.Parameter(torch.FloatTensor(1, hidden_size))
        self.attn = nn.Linear(self.hidden_size , self.max_length) #additive
        
        ######################/
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        #print((embedded[0]*hidden[0]).shape)

        #additive
        x = torch.tanh(self.fc_hidden(hidden[0])+self.fc_encoder(encoder_outputs))
        #print(x.unsqueeze(0).shape, self.weight.unsqueeze(2).shape)
        alignment_scores = torch.bmm(x.unsqueeze(0), self.weight.unsqueeze(2))  

        attn_weights = F.softmax(
            #################################
            alignment_scores.view(1,-1), dim=1) #dot product
            #################################
        #print(attn_weights.shape)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #print(attn_applied.shape)
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)

        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang4, pair[0])
    target_tensor = tensorFromSentence(output_lang4, pair[1])
    return (input_tensor, target_tensor)

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs4))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [25]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang4, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang4.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang4, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        # Number of sentence to generate
        endnodes = []
        number_required = 1
        
        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1
        
        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 2000: break

            # fetch the best node
            score, n = nodes.get()
            decoder_input = n.wordid
            decoder_hidden = n.h

            if n.wordid.item() == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue
            #elif n.leng > max_length:
            #    continue

            # decode for one step using decoder
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(decoder_output, beam_size)
            nextnodes = []

            for new_k in range(beam_size):
                decoded_t = indexes[0][new_k].view(1, -1)
                log_p = log_prob[0][new_k].item()

                node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nextnodes.append((score, node))

            # put them into queue
            for i in range(len(nextnodes)):
                score, nn = nextnodes[i]
                nodes.put((score, nn))
                
            # increase qsize
            qsize += len(nextnodes) - 1
            
        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(number_required)]

        _, n = endnodes[0]
        utterance = []
        utterance.append(output_lang4.index2word[n.wordid.item()])
        
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(output_lang4.index2word[n.wordid.item()])

        utterance = utterance[::-1]
            
    return utterance, None

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(test_pairs4)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    references, candidates = [], []
    i= 0
    for sent_eng, sents_fre in test_pairs4:
        i=i+1
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        references.append(sents_fre)
        candidates.append(output_words)
        if i%1000==0:
          print(i)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    references, candidates = [], []
    for sent_eng, sents_fre in test_pairs4:
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        references.append(sents_fre)
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

In [26]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang4.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN3(hidden_size, output_lang4.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=1000) #reduce number of epoch
print("Fold 4 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 7s (- 82m 45s) (1000 1%) 6.1393
2m 10s (- 79m 22s) (2000 2%) 5.7469
3m 11s (- 76m 41s) (3000 4%) 5.5859
4m 13s (- 74m 58s) (4000 5%) 5.5465
5m 15s (- 73m 35s) (5000 6%) 5.4684
6m 18s (- 72m 32s) (6000 8%) 5.5268
7m 21s (- 71m 32s) (7000 9%) 5.4338
8m 25s (- 70m 36s) (8000 10%) 5.4200
9m 31s (- 69m 51s) (9000 12%) 5.5425
10m 37s (- 69m 6s) (10000 13%) 5.5499
11m 42s (- 68m 4s) (11000 14%) 5.6016
12m 47s (- 67m 8s) (12000 16%) 5.5551
13m 51s (- 66m 6s) (13000 17%) 5.5703
14m 57s (- 65m 11s) (14000 18%) 5.6237
16m 4s (- 64m 19s) (15000 20%) 5.5930
17m 11s (- 63m 23s) (16000 21%) 5.5931
18m 17s (- 62m 23s) (17000 22%) 5.6432
19m 24s (- 61m 28s) (18000 24%) 5.6535
20m 32s (- 60m 31s) (19000 25%) 5.5970
21m 39s (- 59m 33s) (20000 26%) 5.5842
22m 47s (- 58m 37s) (21000 28%) 5.6004
23m 53s (- 57m 34s) (22000 29%) 5.4872
25m 0s (- 56m 33s) (23000 30%) 5.5750
26m 6s (- 55m 29s) (24000 32%) 5.5237
27m 13s (- 54m 27s) (25000 33%) 5.5067
28m 21s (- 53m 25s) (26000 34%) 5.4699
29m 28s (- 52m 23s)

###Fold 5

In [33]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
MAX_LENGTH = 250
# additive https://blog.floydhub.com/attention-mechanism/
class AttnDecoderRNN3(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN3, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        #################
        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight = nn.Parameter(torch.FloatTensor(1, hidden_size))
        self.attn = nn.Linear(self.hidden_size , self.max_length) #additive
        
        ######################/
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        #print((embedded[0]*hidden[0]).shape)

        #additive
        x = torch.tanh(self.fc_hidden(hidden[0])+self.fc_encoder(encoder_outputs))
        #print(x.unsqueeze(0).shape, self.weight.unsqueeze(2).shape)
        alignment_scores = torch.bmm(x.unsqueeze(0), self.weight.unsqueeze(2))  

        attn_weights = F.softmax(
            #################################
            alignment_scores.view(1,-1), dim=1) #dot product
            #################################
        #print(attn_weights.shape)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #print(attn_applied.shape)
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)

        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang5, pair[0])
    target_tensor = tensorFromSentence(output_lang5, pair[1])
    return (input_tensor, target_tensor)

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs5))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [34]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang5, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang5.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang5, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        # Number of sentence to generate
        endnodes = []
        number_required = 1
        
        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1
        
        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 2000: break

            # fetch the best node
            score, n = nodes.get()
            decoder_input = n.wordid
            decoder_hidden = n.h

            if n.wordid.item() == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue
            #elif n.leng > max_length:
            #    continue

            # decode for one step using decoder
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(decoder_output, beam_size)
            nextnodes = []

            for new_k in range(beam_size):
                decoded_t = indexes[0][new_k].view(1, -1)
                log_p = log_prob[0][new_k].item()

                node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nextnodes.append((score, node))

            # put them into queue
            for i in range(len(nextnodes)):
                score, nn = nextnodes[i]
                nodes.put((score, nn))
                
            # increase qsize
            qsize += len(nextnodes) - 1
            
        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(number_required)]

        _, n = endnodes[0]
        utterance = []
        utterance.append(output_lang5.index2word[n.wordid.item()])
        
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(output_lang5.index2word[n.wordid.item()])

        utterance = utterance[::-1]
            
    return utterance, None

def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(test_pairs5)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    references, candidates = [], []
    i= 0
    for sent_eng, sents_fre in test_pairs5:
        i=i+1
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        references.append(sents_fre)
        candidates.append(output_words)
        if i%1000==0:
          print(i)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    references, candidates = [], []
    for sent_eng, sents_fre in test_pairs5:
        sents_fre = [sent_fre.split(' ') for sent_fre in [sents_fre]]
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        references.append(sents_fre)
        candidates.append(output_words)
    #return bleu 1, bleu 2, bleu 3
    score1 = corpus_bleu(references, candidates, weights=(1.0, 0, 0))
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0))
    score3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33))
    #print (references,  candidates)
    scores = [score1, score2, score3]
    return scores

In [35]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang5.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN3(hidden_size, output_lang5.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=1000) #reduce number of epoch
print("Fold 5 Bleu-1, Bleu-2, Bleu-3 scores are ",evaluateBleu(encoder1, attn_decoder1))

1m 8s (- 84m 58s) (1000 1%) 6.1356
2m 8s (- 77m 54s) (2000 2%) 5.6153
3m 9s (- 75m 52s) (3000 4%) 5.5034
4m 12s (- 74m 49s) (4000 5%) 5.6018
5m 15s (- 73m 41s) (5000 6%) 5.4943
6m 17s (- 72m 20s) (6000 8%) 5.4132
7m 20s (- 71m 21s) (7000 9%) 5.4318
8m 24s (- 70m 24s) (8000 10%) 5.3664
9m 28s (- 69m 29s) (9000 12%) 5.4825
10m 32s (- 68m 33s) (10000 13%) 5.5454
11m 38s (- 67m 42s) (11000 14%) 5.5579
12m 45s (- 66m 58s) (12000 16%) 5.6829
13m 51s (- 66m 5s) (13000 17%) 5.6837
14m 59s (- 65m 18s) (14000 18%) 5.6726
16m 7s (- 64m 28s) (15000 20%) 5.5737
17m 14s (- 63m 36s) (16000 21%) 5.6573
18m 21s (- 62m 36s) (17000 22%) 5.6135
19m 26s (- 61m 32s) (18000 24%) 5.5645
20m 31s (- 60m 29s) (19000 25%) 5.5632
21m 39s (- 59m 33s) (20000 26%) 5.5746
22m 48s (- 58m 37s) (21000 28%) 5.6214
23m 55s (- 57m 37s) (22000 29%) 5.5937
25m 2s (- 56m 37s) (23000 30%) 5.4637
26m 12s (- 55m 41s) (24000 32%) 5.6242
27m 19s (- 54m 39s) (25000 33%) 5.5134
28m 28s (- 53m 39s) (26000 34%) 5.5528
29m 35s (- 52m 36

In [36]:
evaluateRandomly(encoder1, attn_decoder1)

> l un des meilleurs aspects et l un des plus vivaces de la revolution orange est qu elle a donne les pleins pouvoirs democratiques aux citoyens .
= the best and still most living thing about our orange revolution was the democratic empowerment of our people .
< one of the the the and the the the and the the the the the the the . <EOS>

> l incapacite de l europe a jouer un role dans le processus de paix ne provient pas de ses supposes prejuges contre les israeliens mais du fait que l ue n est pas un etat . les roles que joue un etat ne lui sont pas attribues .
= europe s failure to play a role in resolving this conflict does not result from its supposed anti israeli views but from the fact that the eu is not a state .
< europe s role in the middle east it is not a europe but of the the the a europe of the the the the role of in . <EOS>

> beaucoup se rappellent la strategie envisagee par l ancien premier ministre israelien yitzhak shamir j aurais entame des negociations sur l autonomi