In [1]:
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

## Loading data files

The data for this project is a set of many thousands of English to Ukrainian translation pairs from http://www.manythings.org/anki/



In [2]:
# let's download the data set (-c continues a partially downloaded file)
!wget -c "http://www.manythings.org/anki/ukr-eng.zip"

--2018-01-07 02:16:00--  http://www.manythings.org/anki/ukr-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.109.196, 104.24.108.196
Connecting to www.manythings.org (www.manythings.org)|104.24.109.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1205944 (1.1M) [application/zip]
Saving to: 'ukr-eng.zip'


2018-01-07 02:16:02 (972 KB/s) - 'ukr-eng.zip' saved [1205944/1205944]



In [3]:
# check it was downloaded
!ls -la

total 4232
drwxr-xr-x   5 webdizz  staff      160 Jan  7 02:16 [34m.[m[m
drwxr-xr-x  12 webdizz  staff      384 Jan  6 13:26 [34m..[m[m
drwxr-xr-x   3 webdizz  staff       96 Jan  6 13:34 [34m.ipynb_checkpoints[m[m
-rw-r--r--   1 webdizz  staff    30647 Jan  7 02:14 machine-translation.ipynb
-rw-r--r--@  1 webdizz  staff  1205944 Oct 30 10:14 ukr-eng.zip


In [4]:
# extract dataset (-o overwrites already extracted files without prompting)
!unzip -o ukr-eng.zip
!ls -la

Archive:  ukr-eng.zip
  inflating: _about.txt              
  inflating: ukr.txt                 
total 12656
drwxr-xr-x   7 webdizz  staff      224 Jan  7 02:16 [34m.[m[m
drwxr-xr-x  12 webdizz  staff      384 Jan  6 13:26 [34m..[m[m
drwxr-xr-x   3 webdizz  staff       96 Jan  6 13:34 [34m.ipynb_checkpoints[m[m
-rw-r--r--   1 webdizz  staff     1441 Oct 30 17:14 _about.txt
-rw-r--r--   1 webdizz  staff    30647 Jan  7 02:14 machine-translation.ipynb
-rw-r--r--@  1 webdizz  staff  1205944 Oct 30 10:14 ukr-eng.zip
-rw-r--r--   1 webdizz  staff  4308132 Oct 30 17:14 ukr.txt


## Indexing words

We'll need a unique index per word to use as the inputs and targets of the network later. To keep track of all this we'll create a wrapper class called `Language`, with utility methods that maintain the **word->index** and **index->word** associations as well as a count of each word, which will be useful later to replace rare words.

In [5]:
SOS_token = 0
EOS_token = 1


class Language:
    """Vocabulary of a single language.

    Keeps the word->index and index->word lookup tables, a per-word
    occurrence counter and the length (in words) of the longest sentence
    indexed so far. Indexes 0 and 1 are pre-assigned to the 'SOS' and 'EOS'
    markers, so real words are numbered from 2 upwards.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled `name`."""
        self.name = name
        self.n_words = 2  # 'SOS' and 'EOS' are already present
        self.index2word = {0: 'SOS', 1: 'EOS'}
        self.word2index = {}
        self.word2count = {}
        self.max_sentence_length = 0

    def index_word(self, word):
        """Count one occurrence of `word`, giving it a fresh index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

    def index_words(self, sentence):
        """Index every space-separated token of `sentence`.

        Also records the sentence length so the longest one seen is known
        later when sizing tensors.
        """
        tokens = sentence.split(' ')
        for token in tokens:
            self.index_word(token)

        self.max_sentence_length = max(self.max_sentence_length, len(tokens))

## Reading and decoding files

The files are all in Unicode. To simplify, we will strip combining accent marks, make everything lowercase and remove punctuation.

In [6]:
def unicode_to_ascii(s):
    """Return `s` with all combining marks removed.

    The string is decomposed with NFD and every character of Unicode
    category 'Mn' (non-spacing mark) is dropped. Despite the name, base
    characters outside ASCII are kept; only the marks are stripped.
    NOTE(review): this also turns Cyrillic 'й' into 'и' (the breve is a
    combining mark), as visible in the sample output 'мокрии' -- confirm
    that this loss is acceptable for Ukrainian.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')


def normalize(s):
    """Lowercase `s`, strip accents and punctuation, and tidy whitespace.

    Removing a punctuation character that was surrounded by spaces
    (e.g. "wait - what") used to leave a double space behind, which later
    became an empty-string "word" when the sentence was split on ' '.
    Whitespace runs are therefore collapsed to a single space.

    Returns:
        The normalized sentence with single spaces between words and no
        leading/trailing whitespace.
    """
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([-;,.!?])", r"", s)
    # collapse the gaps left behind by removed punctuation
    s = re.sub(r"\s+", " ", s)
    return s.strip()

We need to read the file line by line and then split the lines into pairs.

In [7]:
def read_langs(lang_from, lang_to):
    """Read 'ukr.txt' and return two empty vocabularies plus the sentence pairs.

    Args:
        lang_from: name given to the input `Language`.
        lang_to: name given to the output `Language`.

    Returns:
        (input_lang, output_lang, pairs) where `pairs` is a list of
        [source, target] lists of normalized sentences. The vocabularies
        are returned empty; indexing is done by the caller.
    """
    print("Reading language lines...")

    # read file while splitting line by line; the context manager closes the
    # handle, and the explicit encoding keeps the read independent of the
    # platform default (the data is Unicode text)
    with open('ukr.txt', encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # split every line into pairs and normalize; only the first two
    # tab-separated fields are used downstream (pair[0], pair[1]), so any
    # additional columns are dropped
    pairs = [[normalize(s) for s in l.split('\t')[:2]] for l in lines]

    # prepare output
    input_lang = Language(lang_from)
    output_lang = Language(lang_to)
    return input_lang, output_lang, pairs


def prepare_data(lang_from, lang_to):
    """Load the sentence pairs and build both vocabularies from them.

    Returns:
        (input_lang, output_lang, pairs): the populated source and target
        `Language` objects and the list of sentence pairs they were built from.
    """
    source_lang, target_lang, sentence_pairs = read_langs(lang_from, lang_to)
    print("Read %s sentence pairs" % len(sentence_pairs))

    print("Indexing words...")
    for sentence_pair in sentence_pairs:
        # element 0 feeds the source vocabulary, element 1 the target one
        source_lang.index_words(sentence_pair[0])
        target_lang.index_words(sentence_pair[1])

    return source_lang, target_lang, sentence_pairs

In [8]:
# load and prepare data
# `input_lang` is created with the name 'en' and `output_lang` with 'ua';
# `pairs` holds the normalized sentence pairs both vocabularies were built from.
input_lang, output_lang, pairs = prepare_data('en', 'ua')

# sanity check: show one random pair and a few vocabulary statistics
print(random.choice(pairs))
print("Max sentence length %d for en" % input_lang.max_sentence_length)
print("Max sentence length %d for ua" % output_lang.max_sentence_length)
print('Input language number of words', input_lang.n_words)

Reading language lines...
Read 54635 sentence pairs
Indexing words...
["i'm wet", 'я мокрии']
Max sentence length 32 for en
Max sentence length 25 for ua
Input language number of words 8060


## Turning training data to Tensors/Variables

To train we need to turn sentences into something the neural network can understand, which of course means numbers.
Each sentence will be split into words and turned into a `Tensor`, where each word is replaced with its index (from the `Language` indexes made earlier). While creating these tensors we will also append the `EOS` token to signal that the sentence is over.

![sentence as word index representation](https://camo.githubusercontent.com/f6702e41fb7582581c82be688c791416de11f761/68747470733a2f2f692e696d6775722e636f6d2f4c7a6f637047482e706e67)

Trainable [PyTorch](http://pytorch.org/) modules take `Variable` as input, rather than plain `Tensor`s. A `Variable` is basically a `Tensor` that is able to keep track of the graph state, which is what makes `autograd` (automatic calculation of backwards gradients for backpropagation to work) possible.

In [9]:
# map every space-separated word of the sentence to its vocabulary index
def indexes_from_sentence(lang, sentence):
    """Return the list of `lang.word2index` entries for the words of `sentence`.

    Raises:
        KeyError: if a word is not present in `lang.word2index`.
    """
    tokens = sentence.split(' ')
    lookup = lang.word2index
    return [lookup[token] for token in tokens]


def variable_from_sentence(lang, sentence):
    """Encode `sentence` as a flat `Variable` of word indexes ending in EOS."""
    token_ids = indexes_from_sentence(lang, sentence)
    token_ids.append(EOS_token)
    # built as a 1 x n tensor and flattened to n entries
    as_tensor = torch.LongTensor([token_ids])
    return Variable(as_tensor).view(-1)


def variables_from_pair(input_lang, output_lang, pair):
    """Encode a sentence pair as an (input, output) tuple of index Variables.

    `pair[0]` is encoded with `input_lang` and `pair[1]` with `output_lang`.
    """
    source_sentence = pair[0]
    source_variable = variable_from_sentence(input_lang, source_sentence)
    target_sentence = pair[1]
    target_variable = variable_from_sentence(output_lang, target_sentence)
    return source_variable, target_variable

In [10]:
# show the index tensors produced for one example pair (the pair at index 30000)
variables_from_pair(input_lang, output_lang, pairs[30000])

(Variable containing:
    44
  1490
    96
   731
  1792
     1
 [torch.LongTensor of size 6], Variable containing:
     81
     12
    808
     70
  12130
      1
 [torch.LongTensor of size 6])

# Building models

## The Encoder

The `Encoder` of a **seq2seq** network is a **RNN** that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word.

![Encoder diagram](https://github.com/spro/practical-pytorch/raw/89d8ad57f9570927ae869ec2cc6d90e9a7b38bb5/seq2seq-translation/images/encoder-network.png "tooltip")

In [11]:
class EncoderRNN(nn.Module):
    """Sentence encoder: an embedding layer followed by a (multi-layer) GRU.

    Args:
        input_size: number of rows in the embedding table.
        hidden_size: width of both the embeddings and the GRU state.
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, n_layers=1):
        super().__init__()

        # hyperparameters, kept for later inspection
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

    def forward(self, input_words, hidden_state):
        """Run a whole sentence (one word index per entry) through the GRU.

        Returns:
            The (output, hidden) pair produced by the GRU.
        """
        n_words = len(input_words)
        # reshape to seq_len x batch(=1) x features, as fed to the GRU
        gru_input = self.embedding(input_words).view(n_words, 1, -1)
        return self.gru(gru_input, hidden_state)

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape n_layers x 1 x hidden_size."""
        return Variable(torch.zeros(self.n_layers, 1, self.hidden_size))

## Attention Decoder

In summary, our `Decoder` should consist of four main parts - an embedding layer turning an input word into a vector; a layer to calculate the attention energy per encoder output; an RNN layer; and an output layer.

The `Decoder`'s inputs are the last RNN hidden state, the last output and all encoder outputs.

Let's create an `Attention` model following [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025) by Luong et al., which describes a few more attention models that offer improvements and simplifications. They describe a few "global attention" models, the difference between them being the way attention scores are calculated.

The general form of the attention calculation relies on the target (decoder) side hidden state and corresponding source (encoder) side state, normalized over all states to get values summing to 1 e.g. by application of `softmax` function.

The specific `score` function that compares two states is either `dot`, a simple dot product between the states; `general`, a dot product between the decoder hidden state and a linear transform of the encoder state; or `concat`, a dot product between a new parameter and a linear transform of the states concatenated together.

![Effective Approaches to Attention-based Neural Machine Translation](https://render.githubusercontent.com/render/math?math=score%28h_t%2C%20%5Cbar%20h_s%29%20%3D%0A%5Cbegin%7Bcases%7D%0Ah_t%20%5E%5Ctop%20%5Cbar%20h_s%20%26amp%3B%20dot%20%5C%5C%0Ah_t%20%5E%5Ctop%20%5Ctextbf%7BW%7D_a%20%5Cbar%20h_s%20%26amp%3B%20general%20%5C%5C%0Av_a%20%5E%5Ctop%20%5Ctextbf%7BW%7D_a%20%5B%20h_t%20%3B%20%5Cbar%20h_s%20%5D%20%26amp%3B%20concat%0A%5Cend%7Bcases%7D&mode=display)

The modular definition of the scoring functions gives us an opportunity to build a specific attention module that can switch between the different score methods. The input to this module is always the hidden state (of the decoder RNN) and the set of encoder outputs.

In [12]:
MAX_SENT_LENGTH = 32


class Attention(nn.Module):
    """Global attention with a selectable score function.

    Args:
        method: one of 'dot', 'general' or 'concat' (see `score`).
        hidden_size: width of the decoder state and of each encoder output.
        max_len: accepted for interface compatibility; not used.

    Raises:
        ValueError: if `method` is not a supported score function. Previously
            an unknown method was accepted here and only failed inside
            `score` with an UnboundLocalError.
    """

    METHODS = ('dot', 'general', 'concat')

    def __init__(self, method, hidden_size, max_len=MAX_SENT_LENGTH):
        super(Attention, self).__init__()

        if method not in self.METHODS:
            raise ValueError("unknown attention method %r, expected one of %s"
                             % (method, ', '.join(self.METHODS)))

        # define hyperparameters
        self.method = method
        self.hidden_size = hidden_size

        # define architecture depending on the method
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(1, n) alone is uninitialised memory, so fill it
            # explicitly with small uniform values
            bound = 1.0 / math.sqrt(hidden_size)
            self.other = nn.Parameter(
                torch.FloatTensor(1, hidden_size).uniform_(-bound, bound))

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights of shape 1 x 1 x seq_len.

        Args:
            hidden: current decoder state; the decoder in this notebook passes
                `rnn_output.squeeze(0)`.
            encoder_outputs: sequence of encoder outputs, indexed along its
                first dimension (one entry per input position).
        """
        seq_len = len(encoder_outputs)

        # one scalar energy per encoder output, stacked into a seq_len vector
        # (no pre-allocated buffer written in place)
        energies = [self.score(hidden, encoder_outputs[i])
                    for i in range(seq_len)]
        attn_energies = torch.stack(energies).view(-1)

        # normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies, dim=0).unsqueeze(0).unsqueeze(0)

    @staticmethod
    def _flat_dot(a, b):
        """Dot product of two tensors after flattening both to 1-D.

        The operands arrive as 1 x hidden_size tensors; flattening keeps the
        call valid where `dot` only accepts 1-D inputs.
        """
        return a.contiguous().view(-1).dot(b.contiguous().view(-1))

    def score(self, hidden, encoder_output):
        """Energy between the decoder state and a single encoder output.

        'dot' is hidden . encoder_output; 'general' is hidden . W(encoder_output);
        'concat' is other . W([hidden; encoder_output]).
        """
        if self.method == 'dot':
            return self._flat_dot(hidden, encoder_output)

        if self.method == 'general':
            return self._flat_dot(hidden, self.attn(encoder_output))

        # 'concat' (the constructor rejects anything else)
        energy = self.attn(torch.cat((hidden, encoder_output), 1))
        return self._flat_dot(self.other, energy)

Now we can build a decoder that plugs this `Attention` module in after RNN to calculate attention weights, and apply those weights to the encoder outputs to get a context vector.

In [13]:
class AttentionDecoderRNN(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        attn_model: attention score method handed to `Attention`, or 'none'
            to run without attention.
        hidden_size: width of the embeddings, GRU state and context vector.
        output_size: size of the output vocabulary.
        n_layers: number of stacked GRU layers.
        dropout_p: dropout passed to the GRU.
    """

    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(AttentionDecoderRNN, self).__init__()

        # define network hyperparameters
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        # define network architecture
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size,
                          n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size * 2, output_size)

        # choose attention model; always define the attribute so that
        # attn_model='none' no longer leaves `forward` without `self.attn`
        if attn_model != 'none':
            self.attn = Attention(attn_model, hidden_size)
        else:
            self.attn = None

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        """Decode one step.

        Args:
            word_input: index of the current input word (the previous output word).
            last_context: context vector from the previous step, 1 x hidden_size.
            last_hidden: previous GRU hidden state.
            encoder_outputs: all encoder outputs, seq_len x 1 x hidden_size.

        Returns:
            (output, context, hidden, attn_weights): log-probabilities over the
            output vocabulary (1 x output_size), the new context vector
            (1 x hidden_size), the new GRU hidden state and the attention
            weights (1 x 1 x seq_len). Without attention the weights are all
            zero, which makes the context vector zero as well.
        """
        # note: we run this one step at a time
        # get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, 1, -1)  # S=1 x B x N

        # combine embedded input word and last context, run through RNN
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
        rnn_output, hidden = self.gru(rnn_input, last_hidden)
        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N

        # calculate attention from current RNN state and all encoder outputs
        if self.attn is not None:
            attn_weights = self.attn(rnn_output, encoder_outputs)
        else:
            attn_weights = Variable(torch.zeros(1, 1, len(encoder_outputs)))

        # apply the weights to the encoder outputs to get the context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # B x 1 x N
        context = context.squeeze(1)  # B x S=1 x N -> B x N

        # final output layer (next word prediction) using the RNN output and context vector
        ctx_out_cat = torch.cat((rnn_output, context), 1)
        output = F.log_softmax(self.out(ctx_out_cat), dim=1)

        # return final output, context, hidden state, and attention weights (for visualization)
        return output, context, hidden, attn_weights

## Testing models

To make sure the `Encoder` and `Decoder` models are working (and work together) we'll do a quick test with fake word inputs.

In [14]:
# Smoke test: push three fake word indexes through small, untrained models.
# NOTE(review): `random_pair` / `training_pair` are only referenced by the
# commented-out alternative input below.
random_pair = random.choice(pairs)
training_pair = variables_from_pair(input_lang, output_lang, random_pair)

# tiny test configuration: 2 GRU layers, hidden size 50, vocabulary of 10
t_n_layers = 2
t_hidden_size = 50

encoder_test = EncoderRNN(10, t_hidden_size, t_n_layers)
decoder_test = AttentionDecoderRNN('general', t_hidden_size, 10, t_n_layers)

print(encoder_test)
print(decoder_test)

# encode the fake three-word sentence [1, 2, 3]
encoder_hidden = encoder_test.init_hidden()
word_input = Variable(torch.LongTensor([1, 2, 3]))
# word_input = training_pair[0]

encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

# decoder starts from the encoder's final hidden state and a zero context vector
word_inputs = Variable(torch.LongTensor([1, 2, 3]))
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))

# decode one word at a time, feeding context and hidden state back in
for i in range(3):
    decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(
        word_inputs[i], decoder_context, decoder_hidden, encoder_outputs)
    print('\nDecoded output=> {} hidden state=> \n{} \nattention=> {}'.format(
        decoder_output.data, decoder_hidden.size(), decoder_attn.data))
    # keep this step's attention weights (one row per decoding step)
    decoder_attns[0, i] = decoder_attn.squeeze(0).data

EncoderRNN(
  (embedding): Embedding(10, 50)
  (gru): GRU(50, 50, num_layers=2)
)
AttentionDecoderRNN(
  (embedding): Embedding(10, 50)
  (gru): GRU(100, 50, num_layers=2, dropout=0.1)
  (out): Linear(in_features=100, out_features=10)
  (attn): Attention(
    (attn): Linear(in_features=50, out_features=50)
  )
)

Decoded output=> 
-2.2149 -2.3126 -2.1680 -2.2941 -2.2682 -2.3453 -2.2841 -2.4082 -2.2984 -2.4652
[torch.FloatTensor of size 1x10]
 hidden state=> 
torch.Size([2, 1, 50]) 
attention=> 
(0 ,.,.) = 
  0.3438  0.3325  0.3237
[torch.FloatTensor of size 1x1x3]


Decoded output=> 
-2.1976 -2.2813 -2.0996 -2.3142 -2.2876 -2.3663 -2.3250 -2.4028 -2.3096 -2.4943
[torch.FloatTensor of size 1x10]
 hidden state=> 
torch.Size([2, 1, 50]) 
attention=> 
(0 ,.,.) = 
  0.3476  0.3267  0.3257
[torch.FloatTensor of size 1x1x3]


Decoded output=> 
-2.1959 -2.3061 -2.1015 -2.3447 -2.2868 -2.3615 -2.2871 -2.3750 -2.3262 -2.4910
[torch.FloatTensor of size 1x10]
 hidden state=> 
torch.Size([2, 1, 50]

# Train and Test

## Training approach


To train we first run the input sentence through the encoder word by word, and keep track of every output and the latest hidden state. Next the decoder is given the last hidden state of the encoder as its first hidden state, and the <SOS> token as its first input.
From there we iterate to predict a next token from the decoder.

In [15]:
teacher_forcing_ratio = 0.5
clip = 0.5


def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_sent_len=MAX_SENT_LENGTH):
    """Run one training step on a single sentence pair.

    Encodes `input_variable`, decodes word by word against `target_variable`
    (with teacher forcing chosen at random according to the module-level
    `teacher_forcing_ratio`), back-propagates the summed loss, clips the
    gradient norms to `clip` and steps both optimizers.

    Args:
        input_variable: 1-D tensor of source word indexes.
        target_variable: 1-D tensor of target word indexes.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss applied to each decoder output and target index.
        max_sent_len: accepted for interface compatibility; not used.

    Returns:
        The summed loss divided by the target length, as a Python float.
        The divisor is the full target length even when decoding stopped
        early on a predicted EOS.
    """
    # zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0  # added onto for each word

    target_len = target_variable.size()[0]

    # run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

    # prepare input and output variables for decoder
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    # use last hidden state from encoder to start decoder
    decoder_hidden = encoder_hidden

    # choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_len):
        decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_context, decoder_hidden, encoder_outputs)

        # view(1) keeps the target a 1-element batch whether indexing a 1-D
        # tensor yields a 1-element or a 0-dimensional result
        target = target_variable[di].view(1)
        loss += criterion(decoder_output, target)

        if use_teacher_forcing:
            # teacher forcing: use the ground-truth target as the next input
            decoder_input = target
            continue

        # without teacher forcing: use the network's own prediction as the
        # next input -- the most likely word index (highest value)
        top_value, top_index = decoder_output.data.topk(1)
        ni = int(top_index[0][0])
        decoder_input = Variable(torch.LongTensor([[ni]]))

        if ni == EOS_token:
            break

    # backpropagation
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()

    # `loss.data[0]` only works while the loss is a 1-element tensor; flatten
    # first so a 0-dimensional loss is handled as well
    return float(loss.data.view(-1)[0]) / target_len

Finally helper functions to print time elapsed and estimated time remaining, given the current time and progress

In [16]:
def as_minutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def time_since(since, percent):
    """Return 'elapsed (- remaining)' for a job started at `since`.

    `percent` is the completed fraction of the job; the remaining time is
    extrapolated linearly from the time elapsed so far.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (as_minutes(elapsed), as_minutes(remaining))

## Run training

With everything in place we can actually initialize a network and start training.
To start we initialize models, optimizers, and a loss function (criterion).

In [38]:
# model hyperparameters
attn_model = 'general'
hidden_size = 500
n_layers = 2
dropout_p = 0.05

# initialize models; the vocabulary sizes come from the two Language objects
encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers)
decoder = AttentionDecoderRNN(
    attn_model, hidden_size, output_lang.n_words, n_layers, dropout_p=dropout_p)

# initialize optimizers and criterion
# NLLLoss pairs with the log_softmax output of the decoder
lr = 0.0001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=lr)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=lr)
criterion = nn.NLLLoss()

Then setup variables for plotting and tracking progress

In [39]:
# configuring training
n_epochs = 20
n_iterations = len(pairs)
plot_every = 1000
print_every = 500
save_model_every = 1

# keep track of time elapsed and running averages
plot_losses = []
print_loss_total = 0  # reset every print_every
plot_loss_total = 0  # reset every plot_every

To actually train, we call the `train` function many times, printing a summary as we go.

*Note*: If you run this notebook you can train, interrupt the kernel, evaluate, and continue training later. You can comment out the lines above where the encoder and decoder are initialized (so they aren't reset) or simply run the notebook starting from the following cell.

In [40]:
# let's begin
start = time.time()
print('Is about to start training...')
trained_model_path = 'nmt-at-epoch-{}.pt'

step = 1
for epoch in range(1, n_epochs + 1):
    for it in range(n_iterations):
        # get training data for this cycle
        training_pair = variables_from_pair(input_lang, output_lang, pairs[it])
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        # run the train function
        loss = train(input_variable, target_variable, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)

        # keep track loss
        print_loss_total += loss
        plot_loss_total += loss

        if it == 0:
            continue

        if it % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print_summary = '%s (e: %d %d %d%%) %.4f' % (time_since(
                start, step / (n_epochs * n_iterations)), step, epoch, (step / (n_epochs * n_iterations)) * 100, print_loss_avg)
            print(print_summary)

        if it % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_loss_total = 0
            plot_losses.append(plot_loss_avg)
        step += 1

    if epoch % save_model_every == 0:
        torch.save({
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'encoder_optimizer': encoder_optimizer.state_dict(),
            'decoder_optimizer': decoder_optimizer.state_dict()
        },
            trained_model_path.format(epoch))

Is about to start training...
0m 1s (- 27146m 14s) (e: 1 1 0%) 0.0198
10m 53s (- 23740m 11s) (e: 501 1 0%) 4.9986
22m 41s (- 24751m 39s) (e: 1001 1 0%) 4.7145
35m 13s (- 25604m 30s) (e: 1501 1 0%) 4.6050
47m 46s (- 26038m 13s) (e: 2001 1 0%) 4.7562
60m 24s (- 26333m 5s) (e: 2501 1 0%) 4.7161
73m 4s (- 26536m 38s) (e: 3001 1 0%) 4.3892
85m 42s (- 26665m 55s) (e: 3501 1 0%) 4.4740
98m 33s (- 26816m 13s) (e: 4001 1 0%) 4.3423
111m 25s (- 26937m 59s) (e: 4501 1 0%) 4.6449
122m 51s (- 26720m 7s) (e: 5001 1 0%) 4.3042
133m 48s (- 26445m 19s) (e: 5501 1 0%) 4.3957
144m 56s (- 26246m 45s) (e: 6001 1 0%) 4.2375
156m 12s (- 26100m 11s) (e: 6501 1 0%) 4.3387
167m 37s (- 25994m 24s) (e: 7001 1 0%) 4.3639
179m 22s (- 25949m 48s) (e: 7501 1 0%) 4.4329
190m 48s (- 25867m 18s) (e: 8001 1 0%) 4.1573
202m 36s (- 25840m 12s) (e: 8501 1 0%) 4.4647
215m 14s (- 25914m 46s) (e: 9001 1 0%) 4.3177
227m 21s (- 25920m 48s) (e: 9501 1 0%) 4.3925
239m 28s (- 25925m 52s) (e: 10001 1 0%) 4.3654
252m 4s (- 25978m 22s

KeyboardInterrupt: 

## Plotting training loss

Plotting is done with `matplotlib`, using the array `plot_losses` that was created while training.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

%matplotlib inline


def draw_loss_plot(points):
    """Plot the recorded average losses as a line chart.

    Args:
        points: sequence of loss values, one per plotting interval.
    """
    # plt.subplots() already creates the figure; the extra plt.figure() call
    # that used to precede it left a stray empty figure behind
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)  # put ticks at regular intervals
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)


draw_loss_plot(plot_losses)

## Evaluating the network

Evaluation is mostly the same as training, but there are no targets. Instead we always feed the decoder's predictions back to itself.
Every time it predicts a word, we add it to the output string. If it predicts the EOS token we stop there. We also store the decoder's attention outputs for each step to display later.

In [None]:
def evaluate(sentence, max_sent_len=MAX_SENT_LENGTH):
    """Translate `sentence` with the global `encoder` / `decoder`.

    Decoding is greedy: at every step the most likely word is emitted and fed
    back as the next input, until EOS is predicted or `max_sent_len` words
    have been produced.

    Args:
        sentence: normalized source sentence; every word must be known to
            `input_lang`.
        max_sent_len: maximum number of decoding steps.

    Returns:
        (decoded_words, decoder_attentions): the list of output words
        ('<EOS>' is appended when it was predicted) and a tensor holding one
        row of attention weights per decoding step, with one column per
        encoder output.
    """
    input_variable = variable_from_sentence(input_lang, sentence)

    # switch to inference mode so dropout is inactive, and restore the
    # previous mode afterwards so training can continue after evaluating
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        # run through encoder
        encoder_hidden = encoder.init_hidden()
        encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)

        # create starting vectors for decoder
        decoder_input = Variable(torch.LongTensor([[SOS_token]]))  # SOS
        decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
        decoder_hidden = encoder_hidden

        decoded_words = []
        # one column per encoder output: a max_sent_len-wide buffer is too
        # narrow for an input of max_sent_len words plus the EOS token
        decoder_attentions = torch.zeros(max_sent_len, len(encoder_outputs))
        steps_done = 0

        # run through decoder
        for di in range(max_sent_len):
            # feed the previous prediction (initially SOS); this used to pass
            # `decoder_output`, which is not defined before the first call
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.squeeze(0).squeeze(0).data
            steps_done = di + 1

            top_val, top_idx = decoder_output.data.topk(1)
            ni = int(top_idx[0][0])
            if ni == EOS_token:
                decoded_words.append('<EOS>')
                break

            # index2word is a dict, so it is indexed, not called
            decoded_words.append(output_lang.index2word[ni])
            decoder_input = Variable(torch.LongTensor([[ni]]))
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return decoded_words, decoder_attentions[:steps_done]

We can evaluate random sentences from the training set and print out the input, target, and output to make some subjective quality judgements

In [None]:
def evaluate_randomly():
    """Translate one randomly chosen pair and print source, target and output.

    The lines are prefixed with '>' (input), '=' (reference translation) and
    '<' (model output), followed by an empty line.
    """
    sample = random.choice(pairs)
    predicted_words, _attention = evaluate(sample[0])

    report = (
        ('>', sample[0]),
        ('=', sample[1]),
        ('<', ' '.join(predicted_words)),
    )
    for marker, text in report:
        print(marker, text)
    print('')

In [None]:
# translate one random training pair and print input, reference and model output
evaluate_randomly()

In [30]:
# NOTE(review): scratch cell -- prints the elements of a small list and is not
# referenced by the visible code elsewhere; looks like a leftover experiment, confirm
# before removing.
a = [1, 2, 3]
for i in range(len(a)):
    print(a[i])

1
2
3
