In [1]:
%matplotlib inline

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Loading data files
==================



We'll need a unique index per word to use as the inputs and targets of
the networks later. To keep track of all this we will use a helper class
called ``Lang`` which has word → index (``word2index``) and index → word
(``index2word``) dictionaries, as well as a count of each word
``word2count`` to use to later replace rare words.




In [2]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

The files are all in Unicode, to simplify we will turn Unicode
characters to ASCII, make everything lowercase, and trim most
punctuation.




In [3]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

To read the data file we will split the file into lines, and then split
lines into pairs. The files are all English → Other Language, so if we
want to translate from Other Language → English I added the ``reverse``
flag to reverse the pairs.




In [4]:
def readLangs(lang1, lang2, lang3, lang4, reverse=False):
    print("Reading lines...")

    source_lang = open(f'training/news-commentary-v9.%s-%s.%s' %(lang1, lang2, lang3), encoding='utf-8',mode='r',newline='').read().strip().split('\n')
    source_lang = [normalizeString(x) for x in source_lang]
    print(len(source_lang))
    target_lang = open(f'training/news-commentary-v9.%s-%s.%s' %(lang1, lang2, lang4), encoding='utf-8',mode='r',newline='').read().strip().split('\n')
    target_lang = [normalizeString(x) for x in target_lang]
    print(len(target_lang))
    pairs = []
    pairs = [[x,y] for x, y in zip(source_lang, target_lang)]
    print(len(pairs))

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
#a, b, c = readLangs('ru', 'en', 'ru' ,'en')

In [7]:
#c

In [5]:
a, b, c = readLangs('cs', 'en', 'cs' ,'en', False)

Reading lines...
146549
146549
146549


Since there are a *lot* of example sentences and we want to train
something quickly, we'll trim the data set to only relatively short and
simple sentences. Here the maximum length is 10 words (that includes
ending punctuation) and we're filtering to sentences that translate to
the form "I am" or "He is" etc. (accounting for apostrophes replaced
earlier).




In [8]:
MAX_LENGTH = 20

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH #and \
        #p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs




In [9]:
def prepareData(lang1, lang2, lang3, lang4, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, lang3, lang4, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    
    # collect test pairs
    num_test = int(len(pairs)*0.2)
    print("Number of test pairs:", num_test)
    random.Random(1).shuffle(pairs)
    test_pairs = pairs[:num_test]
    set_test_eng = set([sent_eng for sent_eng, _ in test_pairs])
    
    test_pair_dict = {}
    for sent_eng, sent_fre in pairs:
        if sent_eng not in set_test_eng:
            continue 
        elif sent_eng not in test_pair_dict:
            test_pair_dict[sent_eng] = set([sent_fre])
        else:
            test_pair_dict[sent_eng].add(sent_fre)
    test_pairs = [(sent_eng, list(test_pair_dict[sent_eng])) for sent_eng in test_pair_dict]
    print("Number of test cases (sent + list):", len(test_pairs))
    
    # collect train pairs
    train_pairs = [(sent_eng, sent_fre) for sent_eng, sent_fre in pairs[num_test:] if sent_eng not in set_test_eng]
    print("Number of train pairs:", len(train_pairs))
    
    print("Counting words...")
    for pair in train_pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, train_pairs, test_pairs


input_lang, output_lang, train_pairs, test_pairs = prepareData('cs', 'en', 'cs' ,'en', False)
print(random.choice(train_pairs))

Reading lines...
146549
146549
146549
Read 146549 sentence pairs
Trimmed to 61714 sentence pairs
Number of test pairs: 12342
Number of test cases (sent + list): 12285
Number of train pairs: 36792
Counting words...
Counted words:
cs 48019
en 20518
('v pekingu dochazelo k umrtim vetsinou v oblastech se stromy a jezery .', 'in beijing deaths occurred mostly in areas wuth trees and lakes .')


In [10]:
print(test_pairs)



In [11]:
print(len(set([sent for sent, _ in train_pairs])))
print(len(set([sent for _, sent in train_pairs])))
print(len(set([sent for sent, _ in test_pairs])))
#print(len(set([sent for _, sent in test_pairs])))

36737
36616
12285


Pre-train word embedding layer
-----------







For the encoder part
-----------







In [12]:
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
input_pretrain_lang = open(f'emb_training/news-commentary-v8.cs' , encoding='utf-8',mode='r',newline='').read().strip().split('\n')
input_pretrain_lang = [normalizeString(x) for x in input_pretrain_lang]
print(len(input_pretrain_lang))

182827


In [15]:
input_pretrain_lang

['zlato za dolaru ?',
 'san francisco vest racionalni rozhovor o hodnote zlata nikdy nebylo snadne .',
 'v posledni dobe kdy se ceny zlata zvedly o vice nez za deset let je to jeste tezsi .',
 'loni v prosinci napsali kolegove ekonomove martin feldstein a nouriel roubini komentare v nichz odvazne zpochybnili byci naladu na trhu a rozumne poukazali na rizika spojena se zlatem .',
 'a co se nestalo ?',
 'od zverejneni jejich clanku se cena zlata vysplhala jeste vyse a nedavno dokonce dosahla rekordnich dolaru .',
 'uz loni v prosinci pritom mnoho zlatych nadsencu tvrdilo ze cena teto komodity nevyhnutelne smeruje k hranici dolaru .',
 'a nekteri z nich osmeleni jeho setrvalym zhodnocovanim dnes naznacuji ze by zlato mohlo stat jeste vice .',
 'jeden uspesny investor do zlata mi nedavno vysvetloval ze ceny akcii vice nez deset let skomiraly a vzpamatovaly se az na pocatku . let kdy dow jonesuv index prekrocil hranici bodu .',
 'od te doby se index vysplhal nad bodu .',
 'neni tedy mozne z

In [16]:
input_tokenized = []
for a in input_pretrain_lang:
    input_tokenized.append(word_tokenize(a))

In [17]:
input_tokenized

[['zlato', 'za', 'dolaru', '?'],
 ['san',
  'francisco',
  'vest',
  'racionalni',
  'rozhovor',
  'o',
  'hodnote',
  'zlata',
  'nikdy',
  'nebylo',
  'snadne',
  '.'],
 ['v',
  'posledni',
  'dobe',
  'kdy',
  'se',
  'ceny',
  'zlata',
  'zvedly',
  'o',
  'vice',
  'nez',
  'za',
  'deset',
  'let',
  'je',
  'to',
  'jeste',
  'tezsi',
  '.'],
 ['loni',
  'v',
  'prosinci',
  'napsali',
  'kolegove',
  'ekonomove',
  'martin',
  'feldstein',
  'a',
  'nouriel',
  'roubini',
  'komentare',
  'v',
  'nichz',
  'odvazne',
  'zpochybnili',
  'byci',
  'naladu',
  'na',
  'trhu',
  'a',
  'rozumne',
  'poukazali',
  'na',
  'rizika',
  'spojena',
  'se',
  'zlatem',
  '.'],
 ['a', 'co', 'se', 'nestalo', '?'],
 ['od',
  'zverejneni',
  'jejich',
  'clanku',
  'se',
  'cena',
  'zlata',
  'vysplhala',
  'jeste',
  'vyse',
  'a',
  'nedavno',
  'dokonce',
  'dosahla',
  'rekordnich',
  'dolaru',
  '.'],
 ['uz',
  'loni',
  'v',
  'prosinci',
  'pritom',
  'mnoho',
  'zlatych',
  'nadsenc

In [18]:
input_pretrain_model = Word2Vec(input_tokenized, min_count=1, workers = 6)

In [19]:
print(input_pretrain_model)

Word2Vec(vocab=140178, size=100, alpha=0.025)


In [20]:
#Access model vocab
input_tokenized_vocab = list(input_pretrain_model.wv.vocab)
print(input_tokenized_vocab)



In [21]:
input_matrix_len = input_lang.n_words
encoder_weights_matrix = np.zeros((input_matrix_len, 100))
#Random dense vector for <SOS> and <EOS>
encoder_weights_matrix[0] = np.random.normal(scale=0.6, size=(100, ))
encoder_weights_matrix[1] = np.random.normal(scale=0.6, size=(100, ))

for word, index in input_lang.word2index.items():
    if word in input_tokenized_vocab:
        encoder_weights_matrix[index] = input_pretrain_model[word]
    else:
        encoder_weights_matrix[index] = np.random.normal(scale=0.6, size=(100, ))

  encoder_weights_matrix[index] = input_pretrain_model[word]


In [22]:
encoder_weights_matrix = torch.from_numpy(encoder_weights_matrix)

In [23]:
type(encoder_weights_matrix)

torch.Tensor

In [24]:
encoder_weights_matrix.shape

torch.Size([48019, 100])

In [25]:
a, b = encoder_weights_matrix.shape

In [26]:
def create_encoder_emb_layer(weights_matrix, trainable):
    encoder_num_embeddings, encoder_embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(encoder_num_embeddings, encoder_embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    emb_layer.weight.requires_grad = trainable

    return emb_layer, encoder_num_embeddings, encoder_embedding_dim

For the decoder part
-----------







In [27]:
output_pretrain_lang = open(f'emb_training/news-commentary-v8.en' , encoding='utf-8',mode='r',newline='').read().strip().split('\n')
output_pretrain_lang = [normalizeString(x) for x in output_pretrain_lang]
print(len(output_pretrain_lang))

304174


In [28]:
output_pretrain_lang

[' gold ?',
 'san francisco it has never been easy to have a rational conversation about the value of gold .',
 'lately with gold prices up more than over the last decade it is harder than ever .',
 'just last december fellow economists martin feldstein and nouriel roubini each penned op eds bravely questioning bullish market sentiment sensibly pointing out gold s risks .',
 'wouldn t you know it ?',
 'since their articles appeared the price of gold has moved up still further .',
 'gold prices even hit a record high recently .',
 'last december many gold bugs were arguing that the price was inevitably headed for .',
 'now emboldened by continuing appreciation some are suggesting that gold could be headed even higher than that .',
 'one successful gold investor recently explained to me that stock prices languished for a more than a decade before the dow jones index crossed the mark in the early s .',
 'since then the index has climbed above .',
 'now that gold has crossed the magic barr

In [29]:
output_tokenized = []
for a in output_pretrain_lang:
    output_tokenized.append(word_tokenize(a))

In [30]:
output_tokenized

[['gold', '?'],
 ['san',
  'francisco',
  'it',
  'has',
  'never',
  'been',
  'easy',
  'to',
  'have',
  'a',
  'rational',
  'conversation',
  'about',
  'the',
  'value',
  'of',
  'gold',
  '.'],
 ['lately',
  'with',
  'gold',
  'prices',
  'up',
  'more',
  'than',
  'over',
  'the',
  'last',
  'decade',
  'it',
  'is',
  'harder',
  'than',
  'ever',
  '.'],
 ['just',
  'last',
  'december',
  'fellow',
  'economists',
  'martin',
  'feldstein',
  'and',
  'nouriel',
  'roubini',
  'each',
  'penned',
  'op',
  'eds',
  'bravely',
  'questioning',
  'bullish',
  'market',
  'sentiment',
  'sensibly',
  'pointing',
  'out',
  'gold',
  's',
  'risks',
  '.'],
 ['wouldn', 't', 'you', 'know', 'it', '?'],
 ['since',
  'their',
  'articles',
  'appeared',
  'the',
  'price',
  'of',
  'gold',
  'has',
  'moved',
  'up',
  'still',
  'further',
  '.'],
 ['gold', 'prices', 'even', 'hit', 'a', 'record', 'high', 'recently', '.'],
 ['last',
  'december',
  'many',
  'gold',
  'bugs',
 

In [31]:
output_pretrain_model = Word2Vec(output_tokenized, min_count=1, workers = 6)

In [32]:
print(output_pretrain_model)

Word2Vec(vocab=55983, size=100, alpha=0.025)


In [33]:
#Access model vocab
output_tokenized_vocab = list(output_pretrain_model.wv.vocab)
print(output_tokenized_vocab)



In [34]:
output_matrix_len = output_lang.n_words
decoder_weights_matrix = np.zeros((output_matrix_len, 100))
#Random dense vector for <SOS> and <EOS>
decoder_weights_matrix[0] = np.random.normal(scale=0.6, size=(100, ))
decoder_weights_matrix[1] = np.random.normal(scale=0.6, size=(100, ))

for word, index in output_lang.word2index.items():
    if word in output_tokenized_vocab:
        decoder_weights_matrix[index] = output_pretrain_model[word]
    else:
        decoder_weights_matrix[index] = np.random.normal(scale=0.6, size=(100, ))

  decoder_weights_matrix[index] = output_pretrain_model[word]


In [35]:
decoder_weights_matrix = torch.from_numpy(decoder_weights_matrix)

In [36]:
decoder_weights_matrix.shape

torch.Size([20518, 100])

In [37]:
def create_decoder_emb_layer(weights_matrix, trainable):
    decoder_num_embeddings, decoder_embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(decoder_num_embeddings, decoder_embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    emb_layer.weight.requires_grad = trainable

    return emb_layer, decoder_num_embeddings, decoder_embedding_dim

The Seq2Seq Model
=================

A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.





The Encoder
-----------

The encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.





In [38]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, weights_matrix):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding, encoder_num_embeddings, encoder_embedding_dim = create_encoder_emb_layer(weights_matrix, True)
        self.gru = nn.GRU(encoder_embedding_dim, hidden_size)
        #self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

The Decoder
-----------

The decoder is another RNN that takes the encoder output vector(s) and
outputs a sequence of words to create the translation.




Simple Decoder
^^^^^^^^^^^^^^

In the simplest seq2seq decoder we use only last output of the encoder.
This last output is sometimes called the *context vector* as it encodes
context from the entire sequence. This context vector is used as the
initial hidden state of the decoder.

At every step of decoding, the decoder is given an input token and
hidden state. The initial input token is the start-of-string ``<SOS>``
token, and the first hidden state is the context vector (the encoder's
last hidden state).





In [39]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, weights_matrix):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        #self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding, decoder_num_embeddings, decoder_embedding_dim = create_decoder_emb_layer(weights_matrix, True)
        self.gru = nn.GRU(decoder_embedding_dim, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

I encourage you to train and observe the results of this model, but to
save space we'll be going straight for the gold and introducing the
Attention Mechanism.




Attention Decoder
^^^^^^^^^^^^^^^^^

If only the context vector is passed betweeen the encoder and decoder,
that single vector carries the burden of encoding the entire sentence.

Attention allows the decoder network to "focus" on a different part of
the encoder's outputs for every step of the decoder's own outputs. First
we calculate a set of *attention weights*. These will be multiplied by
the encoder output vectors to create a weighted combination. The result
(called ``attn_applied`` in the code) should contain information about
that specific part of the input sequence, and thus help the decoder
choose the right output words.

Calculating the attention weights is done with another feed-forward
layer ``attn``, using the decoder's input and hidden state as inputs.
Because there are sentences of all sizes in the training data, to
actually create and train this layer we have to choose a maximum
sentence length (input length, for encoder outputs) that it can apply
to. Sentences of the maximum length will use all the attention weights,
while shorter sentences will only use the first few.




In [40]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, weights_matrix, dropout_p=0.1, max_length=MAX_LENGTH, ):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        #self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.embedding, decoder_num_embeddings, decoder_embedding_dim = create_decoder_emb_layer(weights_matrix, True)
        self.attn = nn.Linear(self.hidden_size + decoder_embedding_dim, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size + decoder_embedding_dim, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [41]:
class MultiplicativeAttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, weights_matrix, dropout_p=0.1, max_length=MAX_LENGTH):
        super(MultiplicativeAttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.W = torch.nn.Parameter(torch.FloatTensor(
            hidden_size, hidden_size).uniform_(-0.1, 0.1))
    
        #self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.embedding, decoder_num_embeddings, decoder_embedding_dim = create_decoder_emb_layer(weights_matrix, False)
        #self.attn = nn.Linear(self.hidden_size + decoder_embedding_dim, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size + decoder_embedding_dim, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        
        weights = torch.matmul(hidden[0], self.W)
        weights = torch.matmul(weights, encoder_outputs.T)
        attention_scores = weights/np.sqrt(hidden_size)
        attn_weights = F.softmax(attention_scores)

        #attn_weights = F.softmax(
            #self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [42]:
class AdditiveAttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, weights_matrix, attention_dim, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AdditiveAttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.v = torch.nn.Parameter(torch.FloatTensor(attention_dim).uniform_(-0.1, 0.1))
        self.W_1 = torch.nn.Linear(hidden_size, attention_dim)
        self.W_2 = torch.nn.Linear(hidden_size, attention_dim)
    
        #self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.embedding, decoder_num_embeddings, decoder_embedding_dim = create_decoder_emb_layer(weights_matrix, False)
        #self.attn = nn.Linear(self.hidden_size + decoder_embedding_dim, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size + decoder_embedding_dim, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        hiddens = hidden[0].repeat(encoder_outputs.size(0), 1)
        weights = self.W_1(hiddens) + self.W_2(encoder_outputs)
        attention_scores = torch.matmul(torch.tanh(weights),self.v)
        attn_weights = F.softmax(attention_scores)

        #attn_weights = F.softmax(
            #self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.matmul(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

<div class="alert alert-info"><h4>Note</h4><p>There are other forms of attention that work around the length
  limitation by using a relative position approach. Read about "local
  attention" in `Effective Approaches to Attention-based Neural Machine
  Translation <https://arxiv.org/abs/1508.04025>`__.</p></div>

Training
========

Preparing Training Data
-----------------------

To train, for each pair we will need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we will append the
EOS token to both sequences.




In [43]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ') if word in lang.word2index]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

"Teacher forcing" is the concept of using the real target outputs as
each next input, instead of using the decoder's guess as the next input.
Using teacher forcing causes it to converge faster but `when the trained
network is exploited, it may exhibit
instability <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.378.4095&rep=rep1&type=pdf>`__.

You can observe outputs of teacher-forced networks that read with
coherent grammar but wander far from the correct translation -
intuitively it has learned to represent the output grammar and can "pick
up" the meaning once the teacher tells it the first few words, but it
has not properly learned how to create the sentence from the translation
in the first place.

Because of the freedom PyTorch's autograd gives us, we can randomly
choose to use teacher forcing or not with a simple if statement. Turn
``teacher_forcing_ratio`` up to use more of it.




In [44]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

This is a helper function to print time elapsed and estimated time
remaining given the current time and progress %.




In [45]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.




In [46]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.




In [47]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [48]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [49]:
class BeamSearchNode(object):
    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length

    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward

        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
    
    def __lt__(self, other):
        return self.eval() < other.eval()
    
    
from queue import PriorityQueue

def evaluate_beam_search(encoder, decoder, sentence, max_length=MAX_LENGTH, beam_size=2):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        # Number of sentence to generate
        endnodes = []
        number_required = 1
        
        # starting node -  hidden vector, previous node, word id, logp, length
        node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
        nodes = PriorityQueue()

        # start the queue
        nodes.put((-node.eval(), node))
        qsize = 1
        
        # start beam search
        while True:
            # give up when decoding takes too long
            if qsize > 2000: break

            # fetch the best node
            score, n = nodes.get()
            decoder_input = n.wordid
            decoder_hidden = n.h

            if n.wordid.item() == EOS_token and n.prevNode != None:
                endnodes.append((score, n))
                # if we reached maximum # of sentences required
                if len(endnodes) >= number_required:
                    break
                else:
                    continue
            #elif n.leng > max_length:
                #continue

            # decode for one step using decoder
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)

            # PUT HERE REAL BEAM SEARCH OF TOP
            log_prob, indexes = torch.topk(decoder_output, beam_size)

            for new_k in range(beam_size):
                decoded_t = indexes[0][new_k].view(1, -1)
                log_p = log_prob[0][new_k].item()

                node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
                score = -node.eval()
                nodes.put((score,node))
                qsize += 1
            
        # choose nbest paths, back trace them
        if len(endnodes) == 0:
            endnodes = [nodes.get() for _ in range(number_required)]

        _, n = endnodes[0]
        
        # back trace
        utterance = [output_lang.index2word[n.wordid.item()]]
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(output_lang.index2word[n.wordid.item()])

        utterance = utterance[::-1]
            
    return utterance, None

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [50]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(train_pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [51]:
from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu(encoder, decoder):
    start = time.time()
    references, candidates = [], []
    for sent_eng, sents_fre in test_pairs:
        sents_fre = [sent_fre.split(' ') for sent_fre in sents_fre]
        output_words, _ = evaluate(encoder, decoder, sent_eng)
        references.append(sents_fre)
        candidates.append(output_words)
    score1 = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
    print("The BLEU-1 score is %0.10f." %score1)
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5))
    print("The BLEU-2 score is %0.10f." %score2)
    score3 = corpus_bleu(references, candidates, weights=(1./3., 1./3., 1./3.))
    print("The BLEU-3 score is %0.10f." %score3)
    time_elapse = (time.time() - start)/60
    print("It takes %f mins to complete."  %time_elapse)

In [52]:
from nltk.translate.bleu_score import corpus_bleu

def evaluateBleu_beam_search(encoder, decoder, beam_size):
    start = time.time()
    references, candidates = [], []
    for sent_eng, sents_fre in test_pairs:
        sents_fre = [sent_fre.split(' ') for sent_fre in sents_fre]
        output_words, _ = evaluate_beam_search(encoder, decoder, sent_eng, beam_size=beam_size)
        references.append(sents_fre)
        candidates.append(output_words)
    score1 = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
    print("The BLEU-1 score is %0.10f." %score1)
    score2 = corpus_bleu(references, candidates, weights=(0.5, 0.5))
    print("The BLEU-2 score is %0.10f." %score2)
    score3 = corpus_bleu(references, candidates, weights=(1./3., 1./3., 1./3.))
    print("The BLEU-3 score is %0.10f." %score3)
    time_elapse = (time.time() - start)/60
    print("It takes %f mins to complete."  %time_elapse)

Training and Evaluating
=======================

With all these helper functions in place (it looks like extra work, but
it makes it easier to run multiple experiments) we can actually
initialize a network and start training.

Remember that the input sentences were heavily filtered. For this small
dataset we can use relatively small networks of 256 hidden nodes and a
single GRU layer. After about 40 minutes on a MacBook CPU we'll get some
reasonable results.

.. Note::
   If you run this notebook you can train, interrupt the kernel,
   evaluate, and continue training later. Comment out the lines where the
   encoder and decoder are initialized and run ``trainIters`` again.




In [53]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size, encoder_weights_matrix).to(device)
#attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, decoder_weights_matrix, dropout_p=0.1, ).to(device)
attn_decoder1 = MultiplicativeAttnDecoderRNN(hidden_size, output_lang.n_words, decoder_weights_matrix, dropout_p=0.1).to(device)
#attn_decoder1 = AdditiveAttnDecoderRNN(hidden_size, output_lang.n_words, decoder_weights_matrix, attention_dim=256, dropout_p=0.1).to(device)
trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

  attn_weights = F.softmax(attention_scores)


2m 19s (- 32m 37s) (5000 6%) 5.7325
4m 31s (- 29m 25s) (10000 13%) 5.5605
6m 44s (- 26m 56s) (15000 20%) 5.4464
8m 56s (- 24m 35s) (20000 26%) 5.2881
11m 10s (- 22m 21s) (25000 33%) 5.1720
13m 27s (- 20m 11s) (30000 40%) 5.0929
15m 41s (- 17m 56s) (35000 46%) 4.9975
17m 57s (- 15m 42s) (40000 53%) 4.9264
20m 5s (- 13m 23s) (45000 60%) 4.8835
22m 11s (- 11m 5s) (50000 66%) 4.8130
24m 18s (- 8m 50s) (55000 73%) 4.7629
26m 31s (- 6m 37s) (60000 80%) 4.7829
28m 39s (- 4m 24s) (65000 86%) 4.7292
30m 45s (- 2m 11s) (70000 93%) 4.7097
32m 53s (- 0m 0s) (75000 100%) 4.6761


In [54]:
evaluateRandomly(encoder1, attn_decoder1)

> od te doby probehly uz troje volby .
= that was three elections ago .
< since has the the since the . <EOS>

> takovy uspech by nas mel povzbudit .
= this kind of success should encourage us .
< this change should be to . . <EOS>

> dobre vedeni ma situacni charakter .
= good leadership is situational .
< the democratic has has to . . <EOS>

> jinymi slovy v centru pozornosti medii si vedle hiv zaslouzi stat i tuberkuloza .
= that must mean sharing hiv s more notorious spotlight .
< in other words to the more in . . <EOS>

> na zadnou z techto otazek ale neni snadne odpovedet .
= to all of these questions there are no simple answers .
< none of these these questions are not to be . <EOS>

> prebytky se vyuzivaji ke splaceni dluhu zatimco deficity financuji pujcky .
= surpluses are applied to repaying debt and borrowing finances deficits .
< banks are to to to to to to debt to . . <EOS>

> co vsak pojem genocida zahrnuje ?
= but what constitutes genocide ?
< but what does not ? ? ? <E

  attn_weights = F.softmax(attention_scores)


In [55]:
evaluateBleu(encoder1, attn_decoder1)

  attn_weights = F.softmax(attention_scores)


The BLEU-1 score is 0.2479569668.
The BLEU-2 score is 0.0834511846.
The BLEU-3 score is 0.0307925547.
It takes 2.406421 mins to complete.


In [56]:
evaluateBleu_beam_search(encoder1, attn_decoder1, 10)

  attn_weights = F.softmax(attention_scores)


The BLEU-1 score is 0.2422807174.
The BLEU-2 score is 0.0855736918.
The BLEU-3 score is 0.0333766309.
It takes 15.279147 mins to complete.


Saving the model
---------------------






In [57]:
#torch.save(encoder1, "first_first_encoder.pth")

In [58]:
#torch.save(attn_decoder1, "first_first_attn_decoder.pth")

In [59]:
#torch.save(encoder1.state_dict(), "first_first_encoder_dict.pth")

In [60]:
#torch.save(attn_decoder1.state_dict(), "first_first_attn_decoder_dict.pth")

Visualizing Attention
---------------------

A useful property of the attention mechanism is its highly interpretable
outputs. Because it is used to weight specific encoder outputs of the
input sequence, we can imagine looking where the network is focused most
at each time step.

You could simply run ``plt.matshow(attentions)`` to see attention output
displayed as a matrix, with the columns being input steps and rows being
output steps:




In [61]:
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")
plt.matshow(attentions.numpy())

  attn_weights = F.softmax(attention_scores)


<matplotlib.image.AxesImage at 0x2089c2ef6d0>

For a better viewing experience we will do the extra work of adding axes
and labels:




In [62]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)


evaluateAndShowAttention("elle a cinq ans de moins que moi .")

evaluateAndShowAttention("elle est trop petit .")

evaluateAndShowAttention("je ne crains pas de mourir .")

evaluateAndShowAttention("c est un jeune directeur plein de talent .")

  attn_weights = F.softmax(attention_scores)
  ax.set_xticklabels([''] + input_sentence.split(' ') +
  ax.set_yticklabels([''] + output_words)
  plt.show()


input = elle a cinq ans de moins que moi .
output = and the <EOS>
input = elle est trop petit .
output = the . <EOS>
input = je ne crains pas de mourir .
output = is not is <EOS>
input = c est un jeune directeur plein de talent .
output = this year of . . <EOS>
