In [1]:
%matplotlib inline

## SCAN Add-Prim JUMP Experiment
*************************************************************

Reference: http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html


**Requirements**

* Python 3.6
* PyTorch 0.4

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pickle
import os
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
print("Device is using", device)

Device is using cpu


Loading data files
==================

In [4]:
SOS_token = 0
EOS_token = 1
TASK_NAME = "addprim-jump"


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

To read the data file we will split the file into lines, and then split
lines into pairs. 



In [5]:
def readLangs(lang1, lang2, reverse=False, trainOrtest='train'):
    print("Reading lines...")

    # Read the file and split into lines        
    lines = open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/processed/{}-{}_{}-{}.txt'.\
                 format(trainOrtest, TASK_NAME, lang1, lang2), encoding='utf-8').\
                 read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[s for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 50
# PRED_LENGTH = 50

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs




In [7]:
def prepareData(lang1, lang2, reverse=False, dataFrom='train'):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse=False, trainOrtest=dataFrom)
    print("Read %s sentence pairs" % len(pairs))
#     pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('in', 'out', True)
print(random.choice(pairs))

Reading lines...
Read 37046 sentence pairs
Trimmed to 37046 sentence pairs
Counting words...
Counted words:
in 15
out 8
['run opposite right and turn around left thrice', 'I_TURN_RIGHT I_TURN_RIGHT I_RUN I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT']


Model
=================

The model we are using is a GRU encoder-decoder seq2seq model with attention mechanism. In order to solve the zero-shot generalization task, we embed the encoder networks with pre-trained embeddings, from GloVe and Google Word2Vec.  

In [8]:
EMBEDDEING_SOURCE = 'google'
hidden_size = 300

if EMBEDDEING_SOURCE == 'google':
    with open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_GoogleNews300Negative.pkl', 'rb') as handle:
        b = pickle.load(handle)
else:
    with open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_raw{}d.pkl'.format(hidden_size), 'rb') as handle:
        b = pickle.load(handle)

pretrained_emb = np.zeros((input_lang.n_words, hidden_size))
for k, v in input_lang.index2word.items():
    if v == 'SOS':
        pretrained_emb[k] = np.zeros(hidden_size)
    elif (v == 'EOS') and (EMBEDDEING_SOURCE != 'google'):
        pretrained_emb[k] = b['.']
    elif (v == 'and') and (EMBEDDEING_SOURCE == 'google'):
        pretrained_emb[k] = b['AND']
    else:
        pretrained_emb[k] = b[v]

The Encoder
-----------

The encoder of this seq2seq network is a GRU netword. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.




In [9]:
EMBEDDEING_PRETRAINED = True
WEIGHT_UPDATE = False

MODEL_VERSION = 'T0.4_gg300'

In [10]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        if EMBEDDEING_PRETRAINED:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
            self.embedding.weight.requires_grad = WEIGHT_UPDATE
        
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

The Decoder
-----------

The decoder is a GRU network with attention mechanism that takes the last output of the encoder and
outputs a sequence of words to create the translation.

First we calculate a set of *attention weights*. These will be multiplied by
the encoder output vectors to create a weighted combination. The result
(called ``attn_applied`` in the code) should contain information about
that specific part of the input sequence, and thus help the decoder
choose the right output words.

Calculating the attention weights is done with another feed-forward
layer ``attn``, using the decoder's input and hidden state as inputs.
Because there are sentences of all sizes in the training data, to
actually create and train this layer we have to choose a maximum
sentence length (input length, for encoder outputs) that it can apply
to. Sentences of the maximum length will use all the attention weights,
while shorter sentences will only use the first few.




In [11]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

Training
========

Preparing Training Data
-----------------------

To train, for each pair we need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we append the
EOS token to both sequences.




In [12]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

We use teacher forcing to help converge faster with a delay fashion.




In [13]:
teacher_forcing_ratio = 0.8


def train(input_tensor, target_tensor, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, 
          max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

Helper function for timing




In [14]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

### Training interation

In [15]:
def trainIters(encoder, decoder, n_iters, print_every=1000, eval_every=1000, learning_rate=0.001):
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    if os.path.exists("saved_models/encoder_" + MODEL_VERSION):
        encoder = torch.load("saved_models/encoder_" + MODEL_VERSION)
        decoder = torch.load("saved_models/decoder_" + MODEL_VERSION)
        
    best_test_acc = evaluateAccuracy(encoder, decoder, 500)
    print("Best evaluation accuracy: {0:.2f}%".format(best_test_acc * 100))

    parameters = filter(lambda p: p.requires_grad, encoder.parameters())
        
    encoder_optimizer = optim.Adam(parameters, lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg), end=' ')
            
            if iter % eval_every == 0:
                test_acc = evaluateAccuracy(encoder, decoder, 200)
                print('{0:.2f}%'.format(test_acc * 100))
                
                if test_acc > best_test_acc:
                    with open("saved_models/encoder_" + MODEL_VERSION, "wb") as f:
                        torch.save(encoder, f)
                    with open("saved_models/decoder_" + MODEL_VERSION, "wb") as f:
                        torch.save(decoder, f)
                    print("New best test accuracy! Model Updated!")
                    best_test_acc = test_acc
#                 elif test_acc < best_test_acc - 0.001:
#                     encoder = torch.load("saved_models/encoder_" + MODEL_VERSION)
#                     decoder = torch.load("saved_models/decoder_" + MODEL_VERSION)
                    
            else:
                print('')

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [16]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [17]:
input_lang, output_lang, pairs_eval = prepareData('in', 'out', True, dataFrom='test')
print(random.choice(pairs_eval))

Reading lines...
Read 15412 sentence pairs
Trimmed to 15412 sentence pairs
Counting words...
Counted words:
in 15
out 8
['jump around right and walk opposite right thrice', 'I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_TURN_RIGHT I_WALK I_TURN_RIGHT I_TURN_RIGHT I_WALK I_TURN_RIGHT I_TURN_RIGHT I_WALK']


In [18]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs_eval)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [19]:
def evaluateAccuracy(encoder, decoder, n=10):
    ACCs = []
    for i in range(n):
        pair = random.choice(pairs_eval)
        output_words, _ = evaluate(encoder, decoder, pair[0])
        
        if output_words[-1] == '<EOS>':
            output_words = output_words[:-1]
        output_sentence = ' '.join(output_words)
        
        if output_sentence == pair[1]:
            ACCs.append(1)
        else:
            ACCs.append(0)
    return np.array(ACCs).mean()

Training and Evaluating
=======================


The model is initially trained with a higher teacher aid, and relatively large learning rate. Both teacher forcing effect and the learning rate decay over iterations when the model approaches the optimum.  

#### The model achieves 97% accuracy rate for the best test sample evaluation, and is 94% correct on average for the testset.

In [20]:
teacher_forcing_ratio = 0.8

encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 5000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 0.00%
0m 10s (- 17m 53s) (50 1%) 1.8409 
0m 16s (- 13m 17s) (100 2%) 1.3678 
0m 20s (- 11m 13s) (150 3%) 1.1563 
0m 25s (- 10m 14s) (200 4%) 1.4303 
0m 30s (- 9m 34s) (250 5%) 1.3190 
0m 34s (- 9m 3s) (300 6%) 1.1561 
0m 39s (- 8m 44s) (350 7%) 1.0983 
0m 43s (- 8m 16s) (400 8%) 1.0989 
0m 47s (- 7m 57s) (450 9%) 1.1244 
0m 50s (- 7m 38s) (500 10%) 1.0068 0.00%
1m 1s (- 8m 18s) (550 11%) 1.0164 
1m 6s (- 8m 7s) (600 12%) 0.9592 
1m 11s (- 7m 59s) (650 13%) 0.9208 
1m 18s (- 8m 3s) (700 14%) 0.8341 
1m 23s (- 7m 54s) (750 15%) 1.0649 
1m 29s (- 7m 50s) (800 16%) 0.9913 
1m 35s (- 7m 46s) (850 17%) 0.9482 
1m 42s (- 7m 45s) (900 18%) 0.8757 
1m 46s (- 7m 35s) (950 19%) 0.7944 
1m 51s (- 7m 27s) (1000 20%) 0.7529 0.50%
New best test accuracy! Model Updated!
2m 2s (- 7m 40s) (1050 21%) 0.9433 
2m 7s (- 7m 31s) (1100 22%) 0.8243 
2m 12s (- 7m 23s) (1150 23%) 0.8185 
2m 17s (- 7m 14s) (1200 24%) 0.8771 
2m 22s (- 7m 8s) (1250 25%) 0.8414 
2m 26s (- 6m 58s) (1300 26%

In [22]:
teacher_forcing_ratio = 0.5
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 18.00%
0m 16s (- 5m 11s) (50 5%) 0.4941 
0m 20s (- 3m 8s) (100 10%) 0.4029 
0m 24s (- 2m 17s) (150 15%) 0.4301 
0m 27s (- 1m 51s) (200 20%) 0.5234 
0m 31s (- 1m 35s) (250 25%) 0.3886 
0m 35s (- 1m 23s) (300 30%) 0.4076 
0m 39s (- 1m 13s) (350 35%) 0.4116 
0m 43s (- 1m 5s) (400 40%) 0.3716 
0m 47s (- 0m 57s) (450 45%) 0.3658 
0m 50s (- 0m 50s) (500 50%) 0.2636 23.00%
New best test accuracy! Model Updated!
0m 57s (- 0m 46s) (550 55%) 0.3923 
0m 59s (- 0m 39s) (600 60%) 0.3438 
1m 3s (- 0m 34s) (650 65%) 0.3711 
1m 7s (- 0m 28s) (700 70%) 0.3209 
1m 11s (- 0m 23s) (750 75%) 0.4423 
1m 15s (- 0m 18s) (800 80%) 0.4426 
1m 18s (- 0m 13s) (850 85%) 0.4209 
1m 22s (- 0m 9s) (900 90%) 0.3264 
1m 25s (- 0m 4s) (950 95%) 0.2829 
1m 28s (- 0m 0s) (1000 100%) 0.3117 19.50%


In [23]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 16.40%
0m 13s (- 4m 17s) (50 5%) 0.3768 
0m 17s (- 2m 36s) (100 10%) 0.3642 
0m 20s (- 1m 58s) (150 15%) 0.2782 
0m 24s (- 1m 39s) (200 20%) 0.3501 
0m 28s (- 1m 25s) (250 25%) 0.4310 
0m 31s (- 1m 13s) (300 30%) 0.3726 
0m 34s (- 1m 4s) (350 35%) 0.3641 
0m 37s (- 0m 56s) (400 40%) 0.4038 
0m 41s (- 0m 50s) (450 45%) 0.3018 
0m 45s (- 0m 45s) (500 50%) 0.3922 13.00%
0m 53s (- 0m 43s) (550 55%) 0.4571 
0m 57s (- 0m 38s) (600 60%) 0.2379 
1m 1s (- 0m 33s) (650 65%) 0.3253 
1m 5s (- 0m 27s) (700 70%) 0.3545 
1m 9s (- 0m 23s) (750 75%) 0.3274 
1m 14s (- 0m 18s) (800 80%) 0.3917 
1m 18s (- 0m 13s) (850 85%) 0.3644 
1m 23s (- 0m 9s) (900 90%) 0.3851 
1m 28s (- 0m 4s) (950 95%) 0.4378 
1m 32s (- 0m 0s) (1000 100%) 0.3949 20.50%
New best test accuracy! Model Updated!


In [25]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 20.40%
0m 15s (- 4m 52s) (50 5%) 0.3883 
0m 19s (- 2m 56s) (100 10%) 0.3780 
0m 23s (- 2m 10s) (150 15%) 0.4593 
0m 26s (- 1m 46s) (200 20%) 0.3323 
0m 30s (- 1m 32s) (250 25%) 0.3254 
0m 34s (- 1m 20s) (300 30%) 0.3944 
0m 38s (- 1m 11s) (350 35%) 0.4481 
0m 42s (- 1m 3s) (400 40%) 0.4148 
0m 46s (- 0m 57s) (450 45%) 0.3838 
0m 50s (- 0m 50s) (500 50%) 0.3132 22.00%
New best test accuracy! Model Updated!
0m 58s (- 0m 47s) (550 55%) 0.2395 
1m 2s (- 0m 41s) (600 60%) 0.3002 
1m 5s (- 0m 35s) (650 65%) 0.2523 
1m 10s (- 0m 30s) (700 70%) 0.3210 
1m 14s (- 0m 24s) (750 75%) 0.2427 
1m 17s (- 0m 19s) (800 80%) 0.3313 
1m 21s (- 0m 14s) (850 85%) 0.3432 
1m 25s (- 0m 9s) (900 90%) 0.3127 
1m 29s (- 0m 4s) (950 95%) 0.2886 
1m 33s (- 0m 0s) (1000 100%) 0.2132 23.50%
New best test accuracy! Model Updated!


In [26]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 24.00%
0m 14s (- 4m 35s) (50 5%) 0.2055 
0m 18s (- 2m 47s) (100 10%) 0.2915 
0m 22s (- 2m 8s) (150 15%) 0.3316 
0m 26s (- 1m 47s) (200 20%) 0.3369 
0m 31s (- 1m 33s) (250 25%) 0.3257 
0m 35s (- 1m 22s) (300 30%) 0.3602 
0m 38s (- 1m 11s) (350 35%) 0.2130 
0m 42s (- 1m 4s) (400 40%) 0.2990 
0m 46s (- 0m 56s) (450 45%) 0.3939 
0m 50s (- 0m 50s) (500 50%) 0.3390 25.50%
New best test accuracy! Model Updated!
1m 0s (- 0m 49s) (550 55%) 0.3873 
1m 4s (- 0m 42s) (600 60%) 0.2341 
1m 7s (- 0m 36s) (650 65%) 0.3750 
1m 11s (- 0m 30s) (700 70%) 0.3211 
1m 15s (- 0m 25s) (750 75%) 0.3049 
1m 19s (- 0m 19s) (800 80%) 0.2484 
1m 24s (- 0m 14s) (850 85%) 0.2174 
1m 28s (- 0m 9s) (900 90%) 0.2939 
1m 32s (- 0m 4s) (950 95%) 0.2548 
1m 36s (- 0m 0s) (1000 100%) 0.2174 30.00%
New best test accuracy! Model Updated!


In [27]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 27.40%
0m 14s (- 4m 28s) (50 5%) 0.2138 
0m 18s (- 2m 43s) (100 10%) 0.3305 
0m 22s (- 2m 6s) (150 15%) 0.1774 
0m 26s (- 1m 44s) (200 20%) 0.2059 
0m 29s (- 1m 29s) (250 25%) 0.1545 
0m 34s (- 1m 19s) (300 30%) 0.3385 
0m 38s (- 1m 11s) (350 35%) 0.1867 
0m 42s (- 1m 4s) (400 40%) 0.2761 
0m 47s (- 0m 58s) (450 45%) 0.2276 
0m 51s (- 0m 51s) (500 50%) 0.2262 39.50%
New best test accuracy! Model Updated!
0m 59s (- 0m 48s) (550 55%) 0.2733 
1m 3s (- 0m 42s) (600 60%) 0.2091 
1m 7s (- 0m 36s) (650 65%) 0.2506 
1m 10s (- 0m 30s) (700 70%) 0.2209 
1m 14s (- 0m 24s) (750 75%) 0.2089 
1m 17s (- 0m 19s) (800 80%) 0.1850 
1m 21s (- 0m 14s) (850 85%) 0.1786 
1m 25s (- 0m 9s) (900 90%) 0.2017 
1m 29s (- 0m 4s) (950 95%) 0.1565 
1m 33s (- 0m 0s) (1000 100%) 0.1601 45.00%
New best test accuracy! Model Updated!


In [28]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 46.80%
0m 13s (- 4m 8s) (50 5%) 0.2971 
0m 16s (- 2m 25s) (100 10%) 0.1704 
0m 19s (- 1m 53s) (150 15%) 0.2576 
0m 23s (- 1m 35s) (200 20%) 0.0989 
0m 27s (- 1m 23s) (250 25%) 0.2374 
0m 32s (- 1m 16s) (300 30%) 0.1630 
0m 38s (- 1m 10s) (350 35%) 0.2024 
0m 43s (- 1m 4s) (400 40%) 0.1583 
0m 46s (- 0m 56s) (450 45%) 0.1418 
0m 50s (- 0m 50s) (500 50%) 0.1388 52.50%
New best test accuracy! Model Updated!
1m 2s (- 0m 51s) (550 55%) 0.2873 
1m 7s (- 0m 45s) (600 60%) 0.1489 
1m 11s (- 0m 38s) (650 65%) 0.2218 
1m 16s (- 0m 32s) (700 70%) 0.1651 
1m 20s (- 0m 26s) (750 75%) 0.1274 
1m 24s (- 0m 21s) (800 80%) 0.2083 
1m 28s (- 0m 15s) (850 85%) 0.2011 
1m 33s (- 0m 10s) (900 90%) 0.1716 
1m 37s (- 0m 5s) (950 95%) 0.1793 
1m 41s (- 0m 0s) (1000 100%) 0.1482 48.50%


In [29]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 51.40%
0m 13s (- 4m 9s) (50 5%) 0.1805 
0m 16s (- 2m 31s) (100 10%) 0.1852 
0m 20s (- 1m 58s) (150 15%) 0.1226 
0m 24s (- 1m 39s) (200 20%) 0.1361 
0m 29s (- 1m 27s) (250 25%) 0.1556 
0m 32s (- 1m 16s) (300 30%) 0.1276 
0m 36s (- 1m 8s) (350 35%) 0.1370 
0m 40s (- 1m 1s) (400 40%) 0.1657 
0m 44s (- 0m 54s) (450 45%) 0.0982 
0m 48s (- 0m 48s) (500 50%) 0.0849 47.50%
0m 55s (- 0m 45s) (550 55%) 0.1171 
0m 59s (- 0m 39s) (600 60%) 0.0808 
1m 3s (- 0m 34s) (650 65%) 0.1705 
1m 7s (- 0m 28s) (700 70%) 0.1722 
1m 10s (- 0m 23s) (750 75%) 0.0980 
1m 14s (- 0m 18s) (800 80%) 0.1280 
1m 18s (- 0m 13s) (850 85%) 0.1184 
1m 21s (- 0m 9s) (900 90%) 0.1153 
1m 25s (- 0m 4s) (950 95%) 0.1021 
1m 29s (- 0m 0s) (1000 100%) 0.1574 69.00%
New best test accuracy! Model Updated!


In [30]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 60.80%
0m 11s (- 3m 39s) (50 5%) 0.0860 
0m 15s (- 2m 23s) (100 10%) 0.1414 
0m 20s (- 1m 56s) (150 15%) 0.0955 
0m 26s (- 1m 44s) (200 20%) 0.0981 
0m 29s (- 1m 29s) (250 25%) 0.0834 
0m 34s (- 1m 19s) (300 30%) 0.0793 
0m 38s (- 1m 11s) (350 35%) 0.1010 
0m 42s (- 1m 3s) (400 40%) 0.0812 
0m 47s (- 0m 57s) (450 45%) 0.1487 
0m 51s (- 0m 51s) (500 50%) 0.1070 56.50%
0m 58s (- 0m 47s) (550 55%) 0.1221 
1m 2s (- 0m 41s) (600 60%) 0.1193 
1m 7s (- 0m 36s) (650 65%) 0.0785 
1m 13s (- 0m 31s) (700 70%) 0.0816 
1m 17s (- 0m 25s) (750 75%) 0.0872 
1m 21s (- 0m 20s) (800 80%) 0.1353 
1m 27s (- 0m 15s) (850 85%) 0.1284 
1m 32s (- 0m 10s) (900 90%) 0.1308 
1m 36s (- 0m 5s) (950 95%) 0.0787 
1m 40s (- 0m 0s) (1000 100%) 0.0898 68.00%
New best test accuracy! Model Updated!


In [31]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 65.60%
0m 13s (- 4m 14s) (50 5%) 0.0937 
0m 18s (- 2m 45s) (100 10%) 0.1445 
0m 22s (- 2m 4s) (150 15%) 0.1132 
0m 26s (- 1m 44s) (200 20%) 0.0943 
0m 30s (- 1m 30s) (250 25%) 0.0708 
0m 33s (- 1m 18s) (300 30%) 0.1276 
0m 37s (- 1m 10s) (350 35%) 0.0563 
0m 41s (- 1m 1s) (400 40%) 0.0629 
0m 45s (- 0m 55s) (450 45%) 0.0756 
0m 49s (- 0m 49s) (500 50%) 0.0897 69.00%
New best test accuracy! Model Updated!
0m 56s (- 0m 46s) (550 55%) 0.0810 
1m 0s (- 0m 40s) (600 60%) 0.0591 
1m 4s (- 0m 34s) (650 65%) 0.0702 
1m 8s (- 0m 29s) (700 70%) 0.0613 
1m 12s (- 0m 24s) (750 75%) 0.0727 
1m 16s (- 0m 19s) (800 80%) 0.0796 
1m 19s (- 0m 14s) (850 85%) 0.1010 
1m 23s (- 0m 9s) (900 90%) 0.0593 
1m 27s (- 0m 4s) (950 95%) 0.0596 
1m 31s (- 0m 0s) (1000 100%) 0.0536 66.50%


In [33]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 68.60%
0m 14s (- 4m 38s) (50 5%) 0.0849 
0m 17s (- 2m 40s) (100 10%) 0.0758 
0m 21s (- 2m 4s) (150 15%) 0.0879 
0m 25s (- 1m 42s) (200 20%) 0.0722 
0m 29s (- 1m 28s) (250 25%) 0.0556 
0m 34s (- 1m 20s) (300 30%) 0.0585 
0m 39s (- 1m 12s) (350 35%) 0.0524 
0m 43s (- 1m 5s) (400 40%) 0.1150 
0m 47s (- 0m 58s) (450 45%) 0.0724 
0m 51s (- 0m 51s) (500 50%) 0.0517 68.50%
0m 59s (- 0m 48s) (550 55%) 0.1545 
1m 3s (- 0m 42s) (600 60%) 0.1066 
1m 7s (- 0m 36s) (650 65%) 0.0618 
1m 11s (- 0m 30s) (700 70%) 0.0642 
1m 15s (- 0m 25s) (750 75%) 0.0864 
1m 19s (- 0m 19s) (800 80%) 0.0425 
1m 23s (- 0m 14s) (850 85%) 0.0785 
1m 27s (- 0m 9s) (900 90%) 0.0559 
1m 31s (- 0m 4s) (950 95%) 0.0853 
1m 35s (- 0m 0s) (1000 100%) 0.0525 71.00%
New best test accuracy! Model Updated!


In [34]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 70.20%
0m 13s (- 4m 24s) (50 5%) 0.0530 
0m 17s (- 2m 38s) (100 10%) 0.0907 
0m 21s (- 2m 1s) (150 15%) 0.0591 
0m 25s (- 1m 43s) (200 20%) 0.1120 
0m 29s (- 1m 28s) (250 25%) 0.0422 
0m 33s (- 1m 18s) (300 30%) 0.0514 
0m 37s (- 1m 9s) (350 35%) 0.0513 
0m 41s (- 1m 2s) (400 40%) 0.1138 
0m 45s (- 0m 56s) (450 45%) 0.0414 
0m 52s (- 0m 52s) (500 50%) 0.0438 78.00%
New best test accuracy! Model Updated!
1m 1s (- 0m 50s) (550 55%) 0.0459 
1m 6s (- 0m 44s) (600 60%) 0.1263 
1m 10s (- 0m 37s) (650 65%) 0.0487 
1m 14s (- 0m 31s) (700 70%) 0.0616 
1m 19s (- 0m 26s) (750 75%) 0.0501 
1m 24s (- 0m 21s) (800 80%) 0.0688 
1m 28s (- 0m 15s) (850 85%) 0.0602 
1m 33s (- 0m 10s) (900 90%) 0.1060 
1m 37s (- 0m 5s) (950 95%) 0.0538 
1m 41s (- 0m 0s) (1000 100%) 0.0445 73.50%


In [36]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 74.80%
0m 13s (- 4m 12s) (50 5%) 0.0505 
0m 17s (- 2m 35s) (100 10%) 0.0744 
0m 21s (- 2m 1s) (150 15%) 0.0636 
0m 25s (- 1m 41s) (200 20%) 0.0548 
0m 29s (- 1m 27s) (250 25%) 0.0513 
0m 33s (- 1m 17s) (300 30%) 0.0540 
0m 37s (- 1m 9s) (350 35%) 0.0458 
0m 41s (- 1m 2s) (400 40%) 0.0362 
0m 45s (- 0m 55s) (450 45%) 0.0345 
0m 49s (- 0m 49s) (500 50%) 0.0548 77.00%
New best test accuracy! Model Updated!
0m 57s (- 0m 46s) (550 55%) 0.0518 
1m 1s (- 0m 40s) (600 60%) 0.0460 
1m 6s (- 0m 35s) (650 65%) 0.0567 
1m 11s (- 0m 30s) (700 70%) 0.0495 
1m 16s (- 0m 25s) (750 75%) 0.0334 
1m 21s (- 0m 20s) (800 80%) 0.0573 
1m 25s (- 0m 15s) (850 85%) 0.0735 
1m 30s (- 0m 10s) (900 90%) 0.0422 
1m 33s (- 0m 4s) (950 95%) 0.0425 
1m 37s (- 0m 0s) (1000 100%) 0.0292 81.50%
New best test accuracy! Model Updated!


In [41]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 76.60%
0m 11s (- 3m 41s) (50 5%) 0.0727 
0m 16s (- 2m 25s) (100 10%) 0.1336 
0m 19s (- 1m 48s) (150 15%) 0.0386 
0m 22s (- 1m 29s) (200 20%) 0.0393 
0m 25s (- 1m 16s) (250 25%) 0.0275 
0m 29s (- 1m 7s) (300 30%) 0.0736 
0m 32s (- 1m 0s) (350 35%) 0.0563 
0m 35s (- 0m 53s) (400 40%) 0.0417 
0m 40s (- 0m 48s) (450 45%) 0.0793 
0m 44s (- 0m 44s) (500 50%) 0.0277 74.50%
0m 51s (- 0m 42s) (550 55%) 0.0318 
0m 55s (- 0m 36s) (600 60%) 0.0476 
0m 58s (- 0m 31s) (650 65%) 0.0403 
1m 1s (- 0m 26s) (700 70%) 0.0817 
1m 5s (- 0m 21s) (750 75%) 0.0423 
1m 9s (- 0m 17s) (800 80%) 0.1327 
1m 14s (- 0m 13s) (850 85%) 0.0357 
1m 17s (- 0m 8s) (900 90%) 0.0563 
1m 21s (- 0m 4s) (950 95%) 0.0510 
1m 27s (- 0m 0s) (1000 100%) 0.0600 82.50%
New best test accuracy! Model Updated!


In [42]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 79.60%
0m 16s (- 5m 7s) (50 5%) 0.0429 
0m 20s (- 3m 2s) (100 10%) 0.0530 
0m 24s (- 2m 19s) (150 15%) 0.0408 
0m 28s (- 1m 55s) (200 20%) 0.0511 
0m 33s (- 1m 40s) (250 25%) 0.0575 
0m 37s (- 1m 27s) (300 30%) 0.0441 
0m 41s (- 1m 17s) (350 35%) 0.0480 
0m 46s (- 1m 9s) (400 40%) 0.0342 
0m 50s (- 1m 1s) (450 45%) 0.0501 
0m 53s (- 0m 53s) (500 50%) 0.0641 78.00%
1m 1s (- 0m 50s) (550 55%) 0.0650 
1m 5s (- 0m 43s) (600 60%) 0.0786 
1m 9s (- 0m 37s) (650 65%) 0.0369 
1m 13s (- 0m 31s) (700 70%) 0.0408 
1m 17s (- 0m 25s) (750 75%) 0.0369 
1m 21s (- 0m 20s) (800 80%) 0.0800 
1m 25s (- 0m 15s) (850 85%) 0.0403 
1m 29s (- 0m 9s) (900 90%) 0.0319 
1m 33s (- 0m 4s) (950 95%) 0.0458 
1m 37s (- 0m 0s) (1000 100%) 0.0421 80.50%
New best test accuracy! Model Updated!


In [46]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 78.60%
0m 16s (- 5m 18s) (50 5%) 0.0785 
0m 20s (- 3m 7s) (100 10%) 0.0238 
0m 26s (- 2m 27s) (150 15%) 0.0617 
0m 31s (- 2m 4s) (200 20%) 0.0397 
0m 35s (- 1m 47s) (250 25%) 0.0374 
0m 40s (- 1m 34s) (300 30%) 0.0416 
0m 44s (- 1m 23s) (350 35%) 0.0544 
0m 51s (- 1m 16s) (400 40%) 0.0360 
0m 55s (- 1m 8s) (450 45%) 0.0401 
1m 1s (- 1m 1s) (500 50%) 0.0513 86.00%
New best test accuracy! Model Updated!
1m 13s (- 1m 0s) (550 55%) 0.0346 
1m 21s (- 0m 54s) (600 60%) 0.0399 
1m 26s (- 0m 46s) (650 65%) 0.0295 
1m 32s (- 0m 39s) (700 70%) 0.0319 
1m 40s (- 0m 33s) (750 75%) 0.0415 
1m 46s (- 0m 26s) (800 80%) 0.0389 
1m 50s (- 0m 19s) (850 85%) 0.0583 
1m 54s (- 0m 12s) (900 90%) 0.0611 
1m 59s (- 0m 6s) (950 95%) 0.0272 
2m 4s (- 0m 0s) (1000 100%) 0.0366 83.00%


In [48]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 83.80%
0m 21s (- 6m 45s) (50 5%) 0.0402 
0m 25s (- 3m 52s) (100 10%) 0.0407 
0m 30s (- 2m 50s) (150 15%) 0.0595 
0m 34s (- 2m 17s) (200 20%) 0.0267 
0m 38s (- 1m 56s) (250 25%) 0.0465 
0m 42s (- 1m 39s) (300 30%) 0.0226 
0m 47s (- 1m 27s) (350 35%) 0.0381 
0m 51s (- 1m 16s) (400 40%) 0.0456 
0m 55s (- 1m 7s) (450 45%) 0.0304 
0m 59s (- 0m 59s) (500 50%) 0.0757 85.50%
New best test accuracy! Model Updated!
1m 6s (- 0m 54s) (550 55%) 0.0267 
1m 10s (- 0m 47s) (600 60%) 0.0794 
1m 15s (- 0m 40s) (650 65%) 0.0294 
1m 19s (- 0m 33s) (700 70%) 0.0502 
1m 22s (- 0m 27s) (750 75%) 0.0293 
1m 27s (- 0m 21s) (800 80%) 0.0364 
1m 30s (- 0m 16s) (850 85%) 0.0665 
1m 35s (- 0m 10s) (900 90%) 0.0335 
1m 39s (- 0m 5s) (950 95%) 0.0295 
1m 43s (- 0m 0s) (1000 100%) 0.0375 81.00%


In [49]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 82.00%
0m 14s (- 4m 26s) (50 5%) 0.1268 
0m 18s (- 2m 42s) (100 10%) 0.0295 
0m 22s (- 2m 6s) (150 15%) 0.0238 
0m 27s (- 1m 48s) (200 20%) 0.0579 
0m 31s (- 1m 34s) (250 25%) 0.0842 
0m 35s (- 1m 22s) (300 30%) 0.0282 
0m 40s (- 1m 15s) (350 35%) 0.0415 
0m 44s (- 1m 6s) (400 40%) 0.0238 
0m 49s (- 0m 59s) (450 45%) 0.0363 
0m 54s (- 0m 54s) (500 50%) 0.0586 82.50%
New best test accuracy! Model Updated!
1m 3s (- 0m 52s) (550 55%) 0.0325 
1m 8s (- 0m 45s) (600 60%) 0.0239 
1m 12s (- 0m 39s) (650 65%) 0.0347 
1m 18s (- 0m 33s) (700 70%) 0.0247 
1m 22s (- 0m 27s) (750 75%) 0.0282 
1m 26s (- 0m 21s) (800 80%) 0.0260 
1m 33s (- 0m 16s) (850 85%) 0.0479 
1m 40s (- 0m 11s) (900 90%) 0.0176 
1m 46s (- 0m 5s) (950 95%) 0.0336 
1m 53s (- 0m 0s) (1000 100%) 0.0591 81.00%


In [50]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 82.00%
0m 12s (- 4m 6s) (50 5%) 0.0808 
0m 17s (- 2m 36s) (100 10%) 0.0683 
0m 21s (- 2m 2s) (150 15%) 0.0500 
0m 25s (- 1m 41s) (200 20%) 0.0450 
0m 29s (- 1m 27s) (250 25%) 0.0365 
0m 32s (- 1m 16s) (300 30%) 0.0239 
0m 37s (- 1m 8s) (350 35%) 0.0289 
0m 40s (- 1m 1s) (400 40%) 0.0237 
0m 44s (- 0m 54s) (450 45%) 0.0283 
0m 48s (- 0m 48s) (500 50%) 0.0312 82.00%
0m 58s (- 0m 47s) (550 55%) 0.0311 
1m 2s (- 0m 41s) (600 60%) 0.0495 
1m 6s (- 0m 35s) (650 65%) 0.0378 
1m 10s (- 0m 30s) (700 70%) 0.0448 
1m 14s (- 0m 24s) (750 75%) 0.0292 
1m 17s (- 0m 19s) (800 80%) 0.0365 
1m 21s (- 0m 14s) (850 85%) 0.0262 
1m 25s (- 0m 9s) (900 90%) 0.0560 
1m 28s (- 0m 4s) (950 95%) 0.0205 
1m 32s (- 0m 0s) (1000 100%) 0.0228 87.50%
New best test accuracy! Model Updated!


In [51]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 86.00%
0m 16s (- 5m 5s) (50 5%) 0.0225 
0m 20s (- 3m 0s) (100 10%) 0.0175 
0m 24s (- 2m 17s) (150 15%) 0.0207 
0m 28s (- 1m 52s) (200 20%) 0.0356 
0m 31s (- 1m 35s) (250 25%) 0.0299 
0m 36s (- 1m 24s) (300 30%) 0.0298 
0m 39s (- 1m 13s) (350 35%) 0.0240 
0m 43s (- 1m 5s) (400 40%) 0.0254 
0m 47s (- 0m 58s) (450 45%) 0.0209 
0m 51s (- 0m 51s) (500 50%) 0.0398 86.00%
0m 58s (- 0m 48s) (550 55%) 0.0409 
1m 3s (- 0m 42s) (600 60%) 0.0250 
1m 6s (- 0m 35s) (650 65%) 0.0155 
1m 11s (- 0m 30s) (700 70%) 0.0273 
1m 14s (- 0m 24s) (750 75%) 0.0146 
1m 18s (- 0m 19s) (800 80%) 0.0774 
1m 22s (- 0m 14s) (850 85%) 0.0241 
1m 26s (- 0m 9s) (900 90%) 0.0188 
1m 30s (- 0m 4s) (950 95%) 0.0296 
1m 34s (- 0m 0s) (1000 100%) 0.0267 89.00%
New best test accuracy! Model Updated!


In [52]:
trainIters(encoder1, attn_decoder1, 2000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 88.20%
0m 9s (- 6m 13s) (50 2%) 0.0208 
0m 12s (- 4m 1s) (100 5%) 0.0207 
0m 15s (- 3m 16s) (150 7%) 0.0418 
0m 19s (- 2m 55s) (200 10%) 0.0258 
0m 22s (- 2m 38s) (250 12%) 0.0207 
0m 26s (- 2m 27s) (300 15%) 0.0243 
0m 29s (- 2m 19s) (350 17%) 0.0228 
0m 32s (- 2m 10s) (400 20%) 0.0336 
0m 36s (- 2m 4s) (450 22%) 0.0235 
0m 39s (- 1m 57s) (500 25%) 0.0178 90.00%
New best test accuracy! Model Updated!
0m 45s (- 2m 0s) (550 27%) 0.0231 
0m 49s (- 1m 54s) (600 30%) 0.0249 
0m 52s (- 1m 49s) (650 32%) 0.0237 
0m 55s (- 1m 43s) (700 35%) 0.0172 
0m 59s (- 1m 38s) (750 37%) 0.0192 
1m 2s (- 1m 33s) (800 40%) 0.0227 
1m 5s (- 1m 29s) (850 42%) 0.0402 
1m 8s (- 1m 24s) (900 45%) 0.0405 
1m 12s (- 1m 19s) (950 47%) 0.0162 
1m 15s (- 1m 15s) (1000 50%) 0.0371 90.50%
New best test accuracy! Model Updated!
1m 21s (- 1m 13s) (1050 52%) 0.0127 
1m 25s (- 1m 10s) (1100 55%) 0.0229 
1m 30s (- 1m 6s) (1150 57%) 0.0210 
1m 34s (- 1m 2s) (1200 60%) 0.0183 
1m 37s (- 0m 58s) (12

In [53]:
trainIters(encoder1, attn_decoder1, 2000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 89.80%
0m 11s (- 7m 41s) (50 2%) 0.0396 
0m 15s (- 4m 50s) (100 5%) 0.0222 
0m 19s (- 4m 5s) (150 7%) 0.0224 
0m 25s (- 3m 45s) (200 10%) 0.0236 
0m 28s (- 3m 22s) (250 12%) 0.0197 
0m 32s (- 3m 6s) (300 15%) 0.0321 
0m 37s (- 2m 56s) (350 17%) 0.0353 
0m 40s (- 2m 41s) (400 20%) 0.0191 
0m 43s (- 2m 29s) (450 22%) 0.0216 
0m 46s (- 2m 20s) (500 25%) 0.0229 91.50%
New best test accuracy! Model Updated!
0m 52s (- 2m 18s) (550 27%) 0.0504 
0m 57s (- 2m 13s) (600 30%) 0.0213 
1m 0s (- 2m 6s) (650 32%) 0.0179 
1m 6s (- 2m 3s) (700 35%) 0.0228 
1m 9s (- 1m 56s) (750 37%) 0.0154 
1m 13s (- 1m 49s) (800 40%) 0.0214 
1m 16s (- 1m 44s) (850 42%) 0.0194 
1m 20s (- 1m 38s) (900 45%) 0.0263 
1m 24s (- 1m 33s) (950 47%) 0.0269 
1m 27s (- 1m 27s) (1000 50%) 0.0447 88.50%
1m 35s (- 1m 26s) (1050 52%) 0.0626 
1m 39s (- 1m 21s) (1100 55%) 0.0392 
1m 44s (- 1m 17s) (1150 57%) 0.0257 
1m 48s (- 1m 12s) (1200 60%) 0.0162 
1m 52s (- 1m 7s) (1250 62%) 0.0170 
1m 56s (- 1m 2s) (1300

In [54]:
trainIters(encoder1, attn_decoder1, 2000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 89.00%
0m 15s (- 10m 20s) (50 2%) 0.0208 
0m 19s (- 6m 6s) (100 5%) 0.0496 
0m 23s (- 4m 45s) (150 7%) 0.0340 
0m 27s (- 4m 6s) (200 10%) 0.0233 
0m 30s (- 3m 34s) (250 12%) 0.0178 
0m 34s (- 3m 16s) (300 15%) 0.0172 
0m 38s (- 3m 1s) (350 17%) 0.0183 
0m 43s (- 2m 54s) (400 20%) 0.0297 
0m 48s (- 2m 46s) (450 22%) 0.0119 
0m 52s (- 2m 37s) (500 25%) 0.0198 90.50%
New best test accuracy! Model Updated!
0m 58s (- 2m 35s) (550 27%) 0.0351 
1m 3s (- 2m 27s) (600 30%) 0.0362 
1m 7s (- 2m 20s) (650 32%) 0.0188 
1m 11s (- 2m 12s) (700 35%) 0.0201 
1m 14s (- 2m 4s) (750 37%) 0.0421 
1m 18s (- 1m 57s) (800 40%) 0.0201 
1m 22s (- 1m 51s) (850 42%) 0.0149 
1m 25s (- 1m 44s) (900 45%) 0.0253 
1m 29s (- 1m 38s) (950 47%) 0.0337 
1m 32s (- 1m 32s) (1000 50%) 0.0187 87.50%
1m 39s (- 1m 30s) (1050 52%) 0.0379 
1m 42s (- 1m 24s) (1100 55%) 0.0444 
1m 46s (- 1m 18s) (1150 57%) 0.0196 
1m 50s (- 1m 13s) (1200 60%) 0.0408 
1m 55s (- 1m 9s) (1250 62%) 0.0203 
2m 0s (- 1m 4s) (130

In [55]:
trainIters(encoder1, attn_decoder1, 2000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 92.40%
0m 11s (- 7m 19s) (50 2%) 0.0162 
0m 14s (- 4m 41s) (100 5%) 0.0170 
0m 18s (- 3m 43s) (150 7%) 0.0167 
0m 21s (- 3m 12s) (200 10%) 0.0498 
0m 25s (- 2m 57s) (250 12%) 0.0124 
0m 28s (- 2m 43s) (300 15%) 0.0160 
0m 31s (- 2m 30s) (350 17%) 0.0384 
0m 35s (- 2m 22s) (400 20%) 0.0159 
0m 38s (- 2m 13s) (450 22%) 0.0199 
0m 42s (- 2m 6s) (500 25%) 0.0156 94.00%
New best test accuracy! Model Updated!
0m 48s (- 2m 7s) (550 27%) 0.0168 
0m 51s (- 2m 0s) (600 30%) 0.0208 
0m 55s (- 1m 54s) (650 32%) 0.0228 
0m 58s (- 1m 49s) (700 35%) 0.0168 
1m 2s (- 1m 43s) (750 37%) 0.0236 
1m 5s (- 1m 38s) (800 40%) 0.0199 
1m 8s (- 1m 32s) (850 42%) 0.0167 
1m 12s (- 1m 28s) (900 45%) 0.0217 
1m 15s (- 1m 23s) (950 47%) 0.0180 
1m 18s (- 1m 18s) (1000 50%) 0.0146 91.00%
1m 24s (- 1m 16s) (1050 52%) 0.0165 
1m 29s (- 1m 13s) (1100 55%) 0.0216 
1m 33s (- 1m 9s) (1150 57%) 0.0280 
1m 37s (- 1m 4s) (1200 60%) 0.0634 
1m 40s (- 1m 0s) (1250 62%) 0.0572 
1m 43s (- 0m 55s) (1300

---

### Samples Evaluation

In [58]:
if os.path.exists("saved_models/encoder_" + MODEL_VERSION):
    encoder2 = torch.load("saved_models/encoder_" + MODEL_VERSION)
    decoder2 = torch.load("saved_models/decoder_" + MODEL_VERSION)
evaluateAccuracy(encoder2, decoder2, n=2000)

0.90400000000000003

In [59]:
evaluateRandomly(encoder2, decoder2)

> walk around right after jump opposite left thrice
= I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK
< I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK <EOS>

> jump opposite left twice and look left
= I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_LOOK
< I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_LOOK <EOS>

> turn around left twice after jump
= I_JUMP I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT
< I_JUMP I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT <EOS>

> look around left thrice and jump opposite left twice
= I_TURN_LEFT I_LOOK I_TURN_LEFT I_LOOK I_TURN_LEFT I_LOOK I_TURN_LEF

---