In [1]:
%matplotlib inline

## SCAN Add-Prim JUMP Experiment
*************************************************************

Reference: http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html


**Requirements**

* Python 3.6
* PyTorch 0.4

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pickle
import os
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
print("Device is using", device)

Device is using cpu


Loading data files
==================

In [4]:
# Reserved vocabulary indices: Lang.index2word is pre-populated with 0 -> "SOS" and 1 -> "EOS".
SOS_token = 0
EOS_token = 1
# Task identifier that readLangs interpolates into the data file name.
TASK_NAME = "addprim-jump"


class Lang:
    """Vocabulary for one side of the data (word <-> index maps plus word counts).

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers; they appear in
    ``index2word`` only, so real words are numbered from 2 upwards in order of
    first appearance.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Next free index; starts at 2 because SOS and EOS are already counted.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of a known word, or assign the next index to a new one."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

To read the data file we will split the file into lines, and then split
lines into pairs. 



In [5]:
def readLangs(lang1, lang2, reverse=False, trainOrtest='train',
              data_dir='/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/processed'):
    """Read one tab-separated data file and create empty vocabularies for it.

    The file read is ``<data_dir>/<trainOrtest>-<TASK_NAME>_<lang1>-<lang2>.txt``.
    Each line is split on tab characters into a list of sentences (a pair).

    Args:
        lang1: Name of the first column; also part of the file name.
        lang2: Name of the second column; also part of the file name.
        reverse: When True, every pair is reversed and the vocabulary names
            are swapped. The file name is not affected.
        trainOrtest: File-name prefix selecting the split (e.g. 'train', 'test').
        data_dir: Directory containing the processed data files. Defaults to
            the location the notebook was originally run from; pass a
            different directory to run elsewhere.

    Returns:
        (input_lang, output_lang, pairs): two empty ``Lang`` objects and the
        list of pairs. The vocabularies are not populated here.
    """
    print("Reading lines...")

    file_name = '{}-{}_{}-{}.txt'.format(trainOrtest, TASK_NAME, lang1, lang2)

    # Read the file and split into lines. The context manager closes the
    # handle deterministically instead of leaving it to garbage collection.
    with open(os.path.join(data_dir, file_name), encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its tab-separated sentences.
    pairs = [l.split('\t') for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
# Exclusive upper bound on the number of space-separated tokens per sentence.
MAX_LENGTH = 50


def filterPair(p):
    """Return True when both sentences of pair ``p`` have fewer than MAX_LENGTH tokens."""
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    """Keep only the pairs accepted by ``filterPair``, preserving their order."""
    return list(filter(filterPair, pairs))

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Optionally filter pairs by length (this step is currently commented out in `prepareData`, and no text normalization is applied)
-  Make word lists from sentences in pairs




In [7]:
def prepareData(lang1, lang2, reverse=False, dataFrom='train'):
    """Load one data split and build both vocabularies from its pairs.

    Args:
        lang1: Name of the first column (forwarded to ``readLangs``).
        lang2: Name of the second column (forwarded to ``readLangs``).
        reverse: Accepted for signature compatibility only; see the note below.
        dataFrom: Split to read, forwarded as ``trainOrtest``.

    Returns:
        (input_lang, output_lang, pairs) with both ``Lang`` objects populated.
    """
    # NOTE(review): `reverse` is not forwarded -- readLangs is always called
    # with reverse=False, so the pairs are never reversed here. Existing
    # callers pass True and depend on it being ignored.
    source_vocab, target_vocab, sentence_pairs = readLangs(
        lang1, lang2, reverse=False, trainOrtest=dataFrom)

    n_pairs = len(sentence_pairs)
    print("Read %s sentence pairs" % n_pairs)
    # Length filtering (filterPairs) is disabled, so the "trimmed" count is
    # the same as the raw count.
    print("Trimmed to %s sentence pairs" % n_pairs)

    print("Counting words...")
    for sentence_pair in sentence_pairs:
        source_vocab.addSentence(sentence_pair[0])
        target_vocab.addSentence(sentence_pair[1])

    print("Counted words:")
    for vocab in (source_vocab, target_vocab):
        print(vocab.name, vocab.n_words)

    return source_vocab, target_vocab, sentence_pairs


# Load the training split and build the input/output vocabularies from it.
# The third positional argument is `reverse`; prepareData does not forward it
# to readLangs, so the pairs stay in (command, action sequence) order.
input_lang, output_lang, pairs = prepareData('in', 'out', True)
# Show one random pair as a sanity check.
print(random.choice(pairs))

Reading lines...
Read 37046 sentence pairs
Trimmed to 37046 sentence pairs
Counting words...
Counted words:
in 15
out 8
['turn around right twice after jump left', 'I_TURN_LEFT I_JUMP I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT']


Model
=================

The model is a GRU encoder-decoder seq2seq network with an attention mechanism. To tackle the zero-shot generalization task, the encoder's embedding layer is initialized with pre-trained word embeddings, taken from either GloVe or Google Word2Vec.

In [8]:
# Which pre-trained embedding file to load: 'google' selects the GoogleNews
# pickle, any other value selects the `embedding_raw{hidden_size}d.pkl` file.
EMBEDDEING_SOURCE = 'glove'
# Used both as the embedding width of `pretrained_emb` and as the model's hidden size.
hidden_size = 200

# NOTE(review): the paths below are absolute and machine-specific, and
# pickle.load can execute arbitrary code -- only load trusted files.
# NOTE(review): the 'google' file name suggests 300-d vectors, which would not
# fit rows of width hidden_size = 200 -- confirm before switching sources.
if EMBEDDEING_SOURCE == 'google':
    with open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_GoogleNews300Negative.pkl', 'rb') as handle:
        b = pickle.load(handle)
else:
    with open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_raw{}d.pkl'.format(hidden_size), 'rb') as handle:
        b = pickle.load(handle)

# One row per vocabulary index of the current `input_lang`; row k holds the
# vector looked up in `b` for the word input_lang.index2word[k].
pretrained_emb = np.zeros((input_lang.n_words, hidden_size))
for k, v in input_lang.index2word.items():
    if v == 'SOS':
        # SOS keeps an all-zero vector.
        pretrained_emb[k] = np.zeros(hidden_size)
    elif (v == 'EOS') and (EMBEDDEING_SOURCE != 'google'):
        # For non-google sources, EOS reuses the vector stored under '.'.
        pretrained_emb[k] = b['.']
    elif (v == 'and') and (EMBEDDEING_SOURCE == 'google'):
        # For the google source, 'and' is looked up under the key 'AND'.
        pretrained_emb[k] = b['AND']
    else:
        # Every other word is looked up verbatim; a missing key raises KeyError.
        pretrained_emb[k] = b[v]

The Encoder
-----------

The encoder of this seq2seq network is a GRU network. For every input word, the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.




In [10]:
# When True, EncoderRNN copies `pretrained_emb` into its embedding layer.
EMBEDDEING_PRETRAINED = True
# Value assigned to the encoder embedding's `requires_grad` when pre-trained
# weights are used (False keeps the embedding frozen during training).
WEIGHT_UPDATE = False

# Suffix of the checkpoint file names used by trainIters under saved_models/.
MODEL_VERSION = 'T0.4_glv200'

In [11]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call.

    When the module-level flag ``EMBEDDEING_PRETRAINED`` is set, the embedding
    table is overwritten with ``pretrained_emb`` and its ``requires_grad`` is
    set to ``WEIGHT_UPDATE``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        if EMBEDDEING_PRETRAINED:
            pretrained_weights = torch.from_numpy(pretrained_emb)
            self.embedding.weight.data.copy_(pretrained_weights)
            self.embedding.weight.requires_grad = WEIGHT_UPDATE

        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token, reshape to (1, 1, -1), and advance the GRU by one step.

        Returns the GRU's ``(output, hidden)`` pair.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

The Decoder
-----------

The decoder is a GRU network with attention mechanism that takes the last output of the encoder and
outputs a sequence of words to create the translation.

First we calculate a set of *attention weights*. These will be multiplied by
the encoder output vectors to create a weighted combination. The result
(called ``attn_applied`` in the code) should contain information about
that specific part of the input sequence, and thus help the decoder
choose the right output words.

Calculating the attention weights is done with another feed-forward
layer ``attn``, using the decoder's input and hidden state as inputs.
Because there are sentences of all sizes in the training data, to
actually create and train this layer we have to choose a maximum
sentence length (input length, for encoder outputs) that it can apply
to. Sentences of the maximum length will use all the attention weights,
while shorter sentences will only use the first few.




In [12]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-size buffer of encoder outputs.

    The attention layer emits ``max_length`` weights per step, so the
    ``encoder_outputs`` passed to ``forward`` must have ``max_length`` rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Submodules are created in the same order as before so that
        # parameter initialisation and registration order are unchanged.
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            (log_probs, hidden, attn_weights): log-softmax scores over the
            output vocabulary, the updated GRU hidden state, and the softmax
            attention weights computed for this step.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention weights are computed from the current input embedding
        # concatenated with the previous hidden state.
        attn_logits = self.attn(torch.cat((embedded[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_logits, dim=1)

        # Weighted combination of the encoder outputs.
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], attn_applied[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

Training
========

Preparing Training Data
-----------------------

To train, for each pair we need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we append the
EOS token to both sequences.




In [13]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of ``sentence`` to its index in ``lang``.

    Raises KeyError for a token that is missing from ``lang.word2index``.
    """
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending with EOS_token.

    The result is a ``torch.long`` tensor reshaped to (-1, 1) on ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Encode pair[0] with ``input_lang`` and pair[1] with ``output_lang``.

    Returns an (input_tensor, target_tensor) tuple.
    """
    return (tensorFromSentence(input_lang, pair[0]),
            tensorFromSentence(output_lang, pair[1]))

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

We use teacher forcing to help the model converge faster, and decay the teacher-forcing ratio as training progresses.




In [14]:
# Probability that train() feeds the ground-truth token (rather than the
# decoder's own prediction) as the next decoder input for a whole sequence.
teacher_forcing_ratio = 0.8


def train(input_tensor, target_tensor, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion,
          max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    The input is encoded token by token. The decoder then starts from SOS with
    the encoder's final hidden state. With probability
    ``teacher_forcing_ratio`` the whole sequence is decoded with teacher
    forcing; otherwise the decoder is fed its own top-1 prediction and
    decoding stops early once it predicts EOS.

    Returns:
        The summed loss divided by the target length (also when decoding
        stopped early).
    """
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    n_input = input_tensor.size(0)
    n_target = target_tensor.size(0)

    # Encode; rows beyond the input length stay zero.
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for pos in range(n_input):
        step_output, encoder_hidden = encoder(input_tensor[pos], encoder_hidden)
        encoder_outputs[pos] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # One draw decides the feeding strategy for the entire sequence.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(n_target):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[step])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[step]
            continue

        # Free running: feed back the top-1 prediction, detached from history.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / n_target

Helper function for timing




In [15]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for work started at ``since``.

    ``percent`` is the completed fraction; the remaining time is extrapolated
    linearly as elapsed / percent - elapsed.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

### Training iteration

In [16]:
def trainIters(encoder, decoder, n_iters, print_every=1000, eval_every=1000, learning_rate=0.001):
    """Train for ``n_iters`` randomly sampled pairs, checkpointing the best model.

    If checkpoints for ``MODEL_VERSION`` exist under ``saved_models/``, their
    weights are loaded into the ``encoder`` and ``decoder`` objects passed in,
    so the caller's modules are the ones that get trained. A baseline accuracy
    is measured first; during training, whenever a periodic evaluation beats
    the best accuracy seen so far, both modules are saved.

    Args:
        encoder: Encoder module to train (updated in place).
        decoder: Decoder module to train (updated in place).
        n_iters: Number of single-pair training steps.
        print_every: Report the average loss every this many steps.
        eval_every: Evaluate every this many steps. The check only happens on
            steps that are also multiples of ``print_every``.
        learning_rate: Learning rate for both Adam optimisers.
    """
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    encoder_path = "saved_models/encoder_" + MODEL_VERSION
    decoder_path = "saved_models/decoder_" + MODEL_VERSION

    # Resume from the best checkpoint only when BOTH files are present. The
    # weights are copied into the passed-in modules instead of rebinding the
    # local names, so the caller's objects stay in sync with what is trained.
    # map_location lets a checkpoint saved on another device be restored here.
    if os.path.exists(encoder_path) and os.path.exists(decoder_path):
        encoder.load_state_dict(
            torch.load(encoder_path, map_location=device).state_dict())
        decoder.load_state_dict(
            torch.load(decoder_path, map_location=device).state_dict())

    best_test_acc = evaluateAccuracy(encoder, decoder, 500)
    print("Best evaluation accuracy: {0:.2f}%".format(best_test_acc * 100))

    # Frozen parameters (requires_grad == False) are kept out of the optimiser.
    trainable_encoder_params = [p for p in encoder.parameters() if p.requires_grad]

    encoder_optimizer = optim.Adam(trainable_encoder_params, lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    # `step` rather than `iter`, which would shadow the builtin.
    for step in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[step - 1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg), end=' ')

            if step % eval_every == 0:
                test_acc = evaluateAccuracy(encoder, decoder, 200)
                print('{0:.2f}%'.format(test_acc * 100))

                if test_acc > best_test_acc:
                    # Create the checkpoint directory on first use so saving
                    # does not fail on a fresh checkout.
                    os.makedirs("saved_models", exist_ok=True)
                    with open(encoder_path, "wb") as f:
                        torch.save(encoder, f)
                    with open(decoder_path, "wb") as f:
                        torch.save(decoder, f)
                    print("New best test accuracy! Model Updated!")
                    best_test_acc = test_acc

            else:
                print('')

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [17]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` and return the predicted words and attentions.

    Both modules are switched to eval mode for the duration of the call so
    that dropout is disabled; ``torch.no_grad()`` alone only turns off
    gradient tracking. Their previous train/eval modes are restored
    afterwards, so calling this in the middle of training is safe.

    Args:
        encoder: Encoder module.
        decoder: Attention decoder module.
        sentence: Input sentence; its tokens must exist in ``input_lang``.
        max_length: Size of the encoder-output buffer and the maximum number
            of decoding steps.

    Returns:
        (decoded_words, decoder_attentions): the predicted words, ending with
        '<EOS>' if the decoder emitted EOS within ``max_length`` steps, and
        one row of attention weights per decoding step taken.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                # Feed the greedy prediction back in as the next input.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Restore whatever mode the caller had, even if decoding raised.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

We can evaluate random sentences from the test set and print out the
input, target, and output to make some subjective quality judgements:




In [18]:
# Load the test split WITHOUT overwriting the global `input_lang` /
# `output_lang`. Lang assigns indices in order of first appearance, and
# `pretrained_emb` was built row-by-row from the training vocabulary, so a
# vocabulary rebuilt from the test file could map words to different rows.
# The test vocabularies are kept under separate names for inspection only.
# NOTE(review): this assumes every test word also occurs in the training
# split; otherwise evaluate() raises KeyError -- confirm for other tasks.
test_input_lang, test_output_lang, pairs_eval = prepareData('in', 'out', True, dataFrom='test')
print(random.choice(pairs_eval))

Reading lines...
Read 15412 sentence pairs
Trimmed to 15412 sentence pairs
Counting words...
Counted words:
in 15
out 8
['turn opposite left thrice after jump opposite left', 'I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT']


In [19]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print input ('>'), target ('=') and prediction ('<') for ``n`` random eval pairs."""
    for _ in range(n):
        sample = random.choice(pairs_eval)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words = evaluate(encoder, decoder, sample[0])[0]
        print('<', ' '.join(predicted_words))
        print('')

In [20]:
def evaluateAccuracy(encoder, decoder, n=10):
    """Estimate exact-match accuracy on ``n`` pairs drawn at random from ``pairs_eval``.

    Pairs are drawn with ``random.choice``, so the same pair may be drawn more
    than once. A prediction counts as correct when, after removing a trailing
    '<EOS>', its space-joined words equal the target sentence.

    Returns:
        The mean of the 0/1 outcomes.
    """
    outcomes = []
    for _ in range(n):
        sample = random.choice(pairs_eval)
        predicted = evaluate(encoder, decoder, sample[0])[0]

        if predicted[-1] == '<EOS>':
            predicted = predicted[:-1]

        outcomes.append(1 if ' '.join(predicted) == sample[1] else 0)
    return np.array(outcomes).mean()

Training and Evaluating
=======================


The model is initially trained with a higher teacher aid, and relatively large learning rate. Both teacher forcing effect and the learning rate decay over iterations when the model approaches the optimum.  

#### The model achieves 97% accuracy rate for the best test sample evaluation, and is 94% correct on average for the testset.

In [21]:
# Initial phase: high teacher-forcing ratio (module-level global read by train()).
teacher_forcing_ratio = 0.8

# Build a fresh encoder/decoder pair sized to the current vocabularies and
# move both to the selected device.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 5000 training steps: loss reported every 50 steps, accuracy evaluated every 500.
trainIters(encoder1, attn_decoder1, 5000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 0.00%
0m 23s (- 39m 25s) (50 1%) 1.8528 
0m 30s (- 24m 33s) (100 2%) 1.4852 
0m 36s (- 19m 38s) (150 3%) 1.2120 
0m 39s (- 15m 44s) (200 4%) 1.3494 
0m 41s (- 13m 17s) (250 5%) 1.2040 
0m 44s (- 11m 37s) (300 6%) 1.2234 
0m 47s (- 10m 32s) (350 7%) 0.9504 
0m 50s (- 9m 40s) (400 8%) 1.1201 
0m 53s (- 9m 1s) (450 9%) 1.0016 
0m 56s (- 8m 28s) (500 10%) 1.0039 0.50%
New best test accuracy! Model Updated!
1m 6s (- 8m 58s) (550 11%) 0.9534 
1m 9s (- 8m 30s) (600 12%) 0.8782 
1m 12s (- 8m 3s) (650 13%) 1.1839 
1m 15s (- 7m 42s) (700 14%) 0.9678 
1m 18s (- 7m 22s) (750 15%) 0.8661 
1m 20s (- 7m 2s) (800 16%) 0.9400 
1m 23s (- 6m 45s) (850 17%) 0.7798 
1m 25s (- 6m 30s) (900 18%) 0.7225 
1m 28s (- 6m 18s) (950 19%) 0.6772 
1m 31s (- 6m 7s) (1000 20%) 0.6705 0.00%
1m 38s (- 6m 10s) (1050 21%) 0.7176 
1m 40s (- 5m 57s) (1100 22%) 0.6323 
1m 44s (- 5m 49s) (1150 23%) 0.7206 
1m 47s (- 5m 38s) (1200 24%) 0.6546 
1m 50s (- 5m 30s) (1250 25%) 0.7597 
1m 52s (- 5m 21s) (130

In [22]:
# Second phase: lower the teacher-forcing ratio from 0.8 to 0.5. The global is
# read by train(), so this also applies to every later trainIters call.
teacher_forcing_ratio = 0.5
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 13.00%
0m 10s (- 3m 27s) (50 5%) 0.4327 
0m 13s (- 2m 1s) (100 10%) 0.4090 
0m 15s (- 1m 29s) (150 15%) 0.3706 
0m 18s (- 1m 13s) (200 20%) 0.3954 
0m 20s (- 1m 2s) (250 25%) 0.3077 
0m 23s (- 0m 53s) (300 30%) 0.3458 
0m 25s (- 0m 47s) (350 35%) 0.4096 
0m 27s (- 0m 41s) (400 40%) 0.2793 
0m 29s (- 0m 36s) (450 45%) 0.3734 
0m 31s (- 0m 31s) (500 50%) 0.4641 13.50%
New best test accuracy! Model Updated!
0m 36s (- 0m 29s) (550 55%) 0.3641 
0m 38s (- 0m 25s) (600 60%) 0.4805 
0m 40s (- 0m 21s) (650 65%) 0.4370 
0m 42s (- 0m 18s) (700 70%) 0.3724 
0m 44s (- 0m 14s) (750 75%) 0.3727 
0m 46s (- 0m 11s) (800 80%) 0.4332 
0m 49s (- 0m 8s) (850 85%) 0.3828 
0m 51s (- 0m 5s) (900 90%) 0.4553 
0m 53s (- 0m 2s) (950 95%) 0.3721 
0m 55s (- 0m 0s) (1000 100%) 0.3132 18.50%
New best test accuracy! Model Updated!


In [23]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 22.80%
0m 10s (- 3m 26s) (50 5%) 0.3685 
0m 13s (- 2m 3s) (100 10%) 0.3895 
0m 16s (- 1m 31s) (150 15%) 0.3307 
0m 18s (- 1m 14s) (200 20%) 0.4044 
0m 21s (- 1m 3s) (250 25%) 0.4664 
0m 24s (- 0m 56s) (300 30%) 0.3737 
0m 26s (- 0m 48s) (350 35%) 0.3540 
0m 29s (- 0m 44s) (400 40%) 0.4336 
0m 31s (- 0m 38s) (450 45%) 0.2859 
0m 34s (- 0m 34s) (500 50%) 0.3185 23.50%
New best test accuracy! Model Updated!
0m 39s (- 0m 32s) (550 55%) 0.2757 
0m 42s (- 0m 28s) (600 60%) 0.2597 
0m 45s (- 0m 24s) (650 65%) 0.3287 
0m 48s (- 0m 20s) (700 70%) 0.2763 
0m 51s (- 0m 17s) (750 75%) 0.3816 
0m 54s (- 0m 13s) (800 80%) 0.4050 
0m 56s (- 0m 10s) (850 85%) 0.4415 
0m 59s (- 0m 6s) (900 90%) 0.4049 
1m 2s (- 0m 3s) (950 95%) 0.3234 
1m 4s (- 0m 0s) (1000 100%) 0.2589 26.00%
New best test accuracy! Model Updated!


In [24]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 26.00%
0m 9s (- 3m 8s) (50 5%) 0.2597 
0m 12s (- 1m 53s) (100 10%) 0.3489 
0m 15s (- 1m 25s) (150 15%) 0.3552 
0m 17s (- 1m 9s) (200 20%) 0.3550 
0m 19s (- 0m 58s) (250 25%) 0.3171 
0m 22s (- 0m 51s) (300 30%) 0.3217 
0m 24s (- 0m 45s) (350 35%) 0.3326 
0m 26s (- 0m 40s) (400 40%) 0.3178 
0m 29s (- 0m 35s) (450 45%) 0.3383 
0m 31s (- 0m 31s) (500 50%) 0.3638 19.00%
0m 38s (- 0m 31s) (550 55%) 0.2803 
0m 42s (- 0m 28s) (600 60%) 0.2171 
0m 45s (- 0m 24s) (650 65%) 0.2560 
0m 47s (- 0m 20s) (700 70%) 0.3973 
0m 50s (- 0m 16s) (750 75%) 0.3027 
0m 52s (- 0m 13s) (800 80%) 0.2712 
0m 54s (- 0m 9s) (850 85%) 0.2445 
0m 56s (- 0m 6s) (900 90%) 0.2572 
0m 58s (- 0m 3s) (950 95%) 0.2502 
1m 1s (- 0m 0s) (1000 100%) 0.3007 28.50%
New best test accuracy! Model Updated!


In [25]:
# Learning-rate decay: from here on the runs use 0.0005 instead of 0.001.
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 24.20%
0m 8s (- 2m 33s) (50 5%) 0.2736 
0m 10s (- 1m 31s) (100 10%) 0.2354 
0m 12s (- 1m 8s) (150 15%) 0.2280 
0m 14s (- 0m 56s) (200 20%) 0.2210 
0m 15s (- 0m 47s) (250 25%) 0.1877 
0m 18s (- 0m 42s) (300 30%) 0.2043 
0m 20s (- 0m 37s) (350 35%) 0.2662 
0m 22s (- 0m 33s) (400 40%) 0.1662 
0m 24s (- 0m 29s) (450 45%) 0.2147 
0m 26s (- 0m 26s) (500 50%) 0.2646 37.00%
New best test accuracy! Model Updated!
0m 30s (- 0m 25s) (550 55%) 0.1655 
0m 33s (- 0m 22s) (600 60%) 0.2257 
0m 36s (- 0m 19s) (650 65%) 0.1719 
0m 39s (- 0m 16s) (700 70%) 0.1569 
0m 42s (- 0m 14s) (750 75%) 0.1472 
0m 44s (- 0m 11s) (800 80%) 0.1904 
0m 47s (- 0m 8s) (850 85%) 0.1280 
0m 49s (- 0m 5s) (900 90%) 0.1214 
0m 52s (- 0m 2s) (950 95%) 0.1518 
0m 54s (- 0m 0s) (1000 100%) 0.2459 50.00%
New best test accuracy! Model Updated!


In [26]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 50.60%
0m 9s (- 3m 1s) (50 5%) 0.2201 
0m 12s (- 1m 53s) (100 10%) 0.1564 
0m 15s (- 1m 27s) (150 15%) 0.1260 
0m 18s (- 1m 13s) (200 20%) 0.2432 
0m 21s (- 1m 3s) (250 25%) 0.1582 
0m 23s (- 0m 55s) (300 30%) 0.2006 
0m 25s (- 0m 47s) (350 35%) 0.1389 
0m 28s (- 0m 42s) (400 40%) 0.1476 
0m 31s (- 0m 37s) (450 45%) 0.2308 
0m 33s (- 0m 33s) (500 50%) 0.1481 54.50%
New best test accuracy! Model Updated!
0m 38s (- 0m 31s) (550 55%) 0.1340 
0m 41s (- 0m 27s) (600 60%) 0.1744 
0m 43s (- 0m 23s) (650 65%) 0.1972 
0m 45s (- 0m 19s) (700 70%) 0.1305 
0m 48s (- 0m 16s) (750 75%) 0.1683 
0m 50s (- 0m 12s) (800 80%) 0.1459 
0m 53s (- 0m 9s) (850 85%) 0.0958 
0m 55s (- 0m 6s) (900 90%) 0.0811 
0m 58s (- 0m 3s) (950 95%) 0.1281 
1m 0s (- 0m 0s) (1000 100%) 0.1597 58.50%
New best test accuracy! Model Updated!


In [27]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 55.20%
0m 10s (- 3m 27s) (50 5%) 0.1399 
0m 13s (- 2m 4s) (100 10%) 0.1119 
0m 16s (- 1m 31s) (150 15%) 0.1526 
0m 18s (- 1m 13s) (200 20%) 0.1238 
0m 20s (- 1m 2s) (250 25%) 0.1109 
0m 22s (- 0m 52s) (300 30%) 0.1096 
0m 24s (- 0m 45s) (350 35%) 0.1343 
0m 26s (- 0m 39s) (400 40%) 0.1570 
0m 28s (- 0m 35s) (450 45%) 0.1851 
0m 30s (- 0m 30s) (500 50%) 0.1958 53.00%
0m 36s (- 0m 29s) (550 55%) 0.0906 
0m 39s (- 0m 26s) (600 60%) 0.1460 
0m 41s (- 0m 22s) (650 65%) 0.1232 
0m 44s (- 0m 18s) (700 70%) 0.1376 
0m 46s (- 0m 15s) (750 75%) 0.1311 
0m 49s (- 0m 12s) (800 80%) 0.1192 
0m 52s (- 0m 9s) (850 85%) 0.1359 
0m 54s (- 0m 6s) (900 90%) 0.1289 
0m 56s (- 0m 2s) (950 95%) 0.1102 
0m 59s (- 0m 0s) (1000 100%) 0.1914 56.00%
New best test accuracy! Model Updated!


In [28]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 55.20%
0m 11s (- 3m 39s) (50 5%) 0.0987 
0m 13s (- 2m 5s) (100 10%) 0.0995 
0m 16s (- 1m 32s) (150 15%) 0.0885 
0m 18s (- 1m 15s) (200 20%) 0.1138 
0m 21s (- 1m 4s) (250 25%) 0.1024 
0m 24s (- 0m 56s) (300 30%) 0.1225 
0m 26s (- 0m 49s) (350 35%) 0.1334 
0m 28s (- 0m 42s) (400 40%) 0.2292 
0m 31s (- 0m 38s) (450 45%) 0.1512 
0m 33s (- 0m 33s) (500 50%) 0.1226 58.00%
New best test accuracy! Model Updated!
0m 38s (- 0m 31s) (550 55%) 0.1112 
0m 41s (- 0m 27s) (600 60%) 0.1707 
0m 45s (- 0m 24s) (650 65%) 0.0874 
0m 48s (- 0m 20s) (700 70%) 0.0974 
0m 51s (- 0m 17s) (750 75%) 0.0835 
0m 53s (- 0m 13s) (800 80%) 0.1868 
0m 56s (- 0m 9s) (850 85%) 0.1516 
0m 58s (- 0m 6s) (900 90%) 0.1065 
1m 1s (- 0m 3s) (950 95%) 0.1120 
1m 3s (- 0m 0s) (1000 100%) 0.1012 62.00%
New best test accuracy! Model Updated!


In [29]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 59.20%
0m 9s (- 2m 56s) (50 5%) 0.0926 
0m 11s (- 1m 47s) (100 10%) 0.1560 
0m 14s (- 1m 20s) (150 15%) 0.1176 
0m 17s (- 1m 9s) (200 20%) 0.1497 
0m 20s (- 1m 0s) (250 25%) 0.0919 
0m 22s (- 0m 53s) (300 30%) 0.1208 
0m 25s (- 0m 47s) (350 35%) 0.0871 
0m 27s (- 0m 41s) (400 40%) 0.0994 
0m 30s (- 0m 37s) (450 45%) 0.0788 
0m 32s (- 0m 32s) (500 50%) 0.0960 70.00%
New best test accuracy! Model Updated!
0m 38s (- 0m 31s) (550 55%) 0.1235 
0m 40s (- 0m 27s) (600 60%) 0.0853 
0m 43s (- 0m 23s) (650 65%) 0.0938 
0m 46s (- 0m 19s) (700 70%) 0.2044 
0m 48s (- 0m 16s) (750 75%) 0.1214 
0m 50s (- 0m 12s) (800 80%) 0.1076 
0m 52s (- 0m 9s) (850 85%) 0.1148 
0m 54s (- 0m 6s) (900 90%) 0.0723 
0m 57s (- 0m 3s) (950 95%) 0.0979 
0m 59s (- 0m 0s) (1000 100%) 0.1076 71.00%
New best test accuracy! Model Updated!


In [30]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 64.80%
0m 7s (- 2m 29s) (50 5%) 0.0884 
0m 10s (- 1m 33s) (100 10%) 0.0809 
0m 13s (- 1m 13s) (150 15%) 0.0707 
0m 15s (- 1m 2s) (200 20%) 0.1138 
0m 18s (- 0m 54s) (250 25%) 0.0956 
0m 20s (- 0m 47s) (300 30%) 0.1063 
0m 22s (- 0m 41s) (350 35%) 0.0559 
0m 24s (- 0m 37s) (400 40%) 0.0657 
0m 26s (- 0m 32s) (450 45%) 0.1779 
0m 29s (- 0m 29s) (500 50%) 0.1235 64.00%
0m 33s (- 0m 27s) (550 55%) 0.0665 
0m 35s (- 0m 23s) (600 60%) 0.0625 
0m 38s (- 0m 20s) (650 65%) 0.0727 
0m 40s (- 0m 17s) (700 70%) 0.0523 
0m 42s (- 0m 14s) (750 75%) 0.0692 
0m 44s (- 0m 11s) (800 80%) 0.0613 
0m 46s (- 0m 8s) (850 85%) 0.0567 
0m 48s (- 0m 5s) (900 90%) 0.0630 
0m 50s (- 0m 2s) (950 95%) 0.0723 
0m 52s (- 0m 0s) (1000 100%) 0.0678 73.50%
New best test accuracy! Model Updated!


In [31]:
# Learning-rate decay: from here on the runs use 0.0001 instead of 0.0005.
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 67.40%
0m 9s (- 3m 5s) (50 5%) 0.0534 
0m 12s (- 1m 48s) (100 10%) 0.0483 
0m 14s (- 1m 21s) (150 15%) 0.0469 
0m 16s (- 1m 6s) (200 20%) 0.0351 
0m 19s (- 0m 57s) (250 25%) 0.0483 
0m 21s (- 0m 50s) (300 30%) 0.0464 
0m 24s (- 0m 45s) (350 35%) 0.0702 
0m 26s (- 0m 40s) (400 40%) 0.0747 
0m 29s (- 0m 35s) (450 45%) 0.1533 
0m 32s (- 0m 32s) (500 50%) 0.0419 82.00%
New best test accuracy! Model Updated!
0m 38s (- 0m 31s) (550 55%) 0.0500 
0m 41s (- 0m 27s) (600 60%) 0.0427 
0m 43s (- 0m 23s) (650 65%) 0.0763 
0m 46s (- 0m 19s) (700 70%) 0.0511 
0m 49s (- 0m 16s) (750 75%) 0.0573 
0m 51s (- 0m 12s) (800 80%) 0.0485 
0m 53s (- 0m 9s) (850 85%) 0.0375 
0m 55s (- 0m 6s) (900 90%) 0.0679 
0m 57s (- 0m 3s) (950 95%) 0.0572 
1m 0s (- 0m 0s) (1000 100%) 0.0566 81.00%


In [32]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 79.00%
0m 8s (- 2m 50s) (50 5%) 0.0559 
0m 11s (- 1m 41s) (100 10%) 0.0360 
0m 13s (- 1m 16s) (150 15%) 0.0537 
0m 16s (- 1m 4s) (200 20%) 0.0376 
0m 18s (- 0m 56s) (250 25%) 0.0370 
0m 20s (- 0m 48s) (300 30%) 0.0255 
0m 23s (- 0m 42s) (350 35%) 0.0454 
0m 25s (- 0m 37s) (400 40%) 0.0286 
0m 28s (- 0m 34s) (450 45%) 0.0535 
0m 30s (- 0m 30s) (500 50%) 0.0490 78.00%
0m 35s (- 0m 28s) (550 55%) 0.0605 
0m 37s (- 0m 25s) (600 60%) 0.0356 
0m 40s (- 0m 21s) (650 65%) 0.0420 
0m 42s (- 0m 18s) (700 70%) 0.0317 
0m 45s (- 0m 15s) (750 75%) 0.0408 
0m 47s (- 0m 11s) (800 80%) 0.1052 
0m 50s (- 0m 8s) (850 85%) 0.0390 
0m 53s (- 0m 5s) (900 90%) 0.0371 
0m 55s (- 0m 2s) (950 95%) 0.0381 
0m 58s (- 0m 0s) (1000 100%) 0.0383 85.00%
New best test accuracy! Model Updated!


In [33]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 83.40%
0m 10s (- 3m 27s) (50 5%) 0.0358 
0m 13s (- 2m 1s) (100 10%) 0.0549 
0m 16s (- 1m 33s) (150 15%) 0.0409 
0m 18s (- 1m 15s) (200 20%) 0.0284 
0m 21s (- 1m 5s) (250 25%) 0.0608 
0m 24s (- 0m 57s) (300 30%) 0.0460 
0m 27s (- 0m 50s) (350 35%) 0.0408 
0m 29s (- 0m 44s) (400 40%) 0.0392 
0m 31s (- 0m 38s) (450 45%) 0.0369 
0m 34s (- 0m 34s) (500 50%) 0.0242 85.50%
New best test accuracy! Model Updated!
0m 38s (- 0m 31s) (550 55%) 0.0257 
0m 40s (- 0m 26s) (600 60%) 0.0380 
0m 42s (- 0m 22s) (650 65%) 0.0482 
0m 44s (- 0m 19s) (700 70%) 0.1251 
0m 47s (- 0m 15s) (750 75%) 0.0346 
0m 49s (- 0m 12s) (800 80%) 0.0241 
0m 51s (- 0m 9s) (850 85%) 0.0430 
0m 53s (- 0m 5s) (900 90%) 0.0317 
0m 55s (- 0m 2s) (950 95%) 0.0807 
0m 57s (- 0m 0s) (1000 100%) 0.0241 88.00%
New best test accuracy! Model Updated!


In [34]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 85.60%
0m 9s (- 2m 58s) (50 5%) 0.0235 
0m 11s (- 1m 44s) (100 10%) 0.0331 
0m 13s (- 1m 18s) (150 15%) 0.0393 
0m 16s (- 1m 5s) (200 20%) 0.0192 
0m 18s (- 0m 56s) (250 25%) 0.0220 
0m 21s (- 0m 49s) (300 30%) 0.0298 
0m 23s (- 0m 44s) (350 35%) 0.0320 
0m 26s (- 0m 39s) (400 40%) 0.0253 
0m 28s (- 0m 35s) (450 45%) 0.0495 
0m 31s (- 0m 31s) (500 50%) 0.0495 85.50%
0m 37s (- 0m 30s) (550 55%) 0.0305 
0m 39s (- 0m 26s) (600 60%) 0.0196 
0m 42s (- 0m 22s) (650 65%) 0.0208 
0m 45s (- 0m 19s) (700 70%) 0.0160 
0m 47s (- 0m 15s) (750 75%) 0.0243 
0m 49s (- 0m 12s) (800 80%) 0.0239 
0m 52s (- 0m 9s) (850 85%) 0.0199 
0m 55s (- 0m 6s) (900 90%) 0.0230 
0m 57s (- 0m 3s) (950 95%) 0.0191 
0m 59s (- 0m 0s) (1000 100%) 0.0476 91.50%
New best test accuracy! Model Updated!


In [35]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 88.80%
0m 10s (- 3m 15s) (50 5%) 0.0397 
0m 13s (- 1m 58s) (100 10%) 0.0334 
0m 15s (- 1m 29s) (150 15%) 0.0328 
0m 18s (- 1m 13s) (200 20%) 0.0334 
0m 20s (- 1m 2s) (250 25%) 0.0386 
0m 22s (- 0m 53s) (300 30%) 0.0276 
0m 24s (- 0m 46s) (350 35%) 0.0593 
0m 26s (- 0m 40s) (400 40%) 0.0178 
0m 28s (- 0m 35s) (450 45%) 0.0197 
0m 31s (- 0m 31s) (500 50%) 0.0327 90.00%
New best test accuracy! Model Updated!
0m 37s (- 0m 30s) (550 55%) 0.0364 
0m 40s (- 0m 26s) (600 60%) 0.0392 
0m 42s (- 0m 22s) (650 65%) 0.0172 
0m 45s (- 0m 19s) (700 70%) 0.0313 
0m 47s (- 0m 15s) (750 75%) 0.0150 
0m 50s (- 0m 12s) (800 80%) 0.0286 
0m 52s (- 0m 9s) (850 85%) 0.0199 
0m 55s (- 0m 6s) (900 90%) 0.0354 
0m 59s (- 0m 3s) (950 95%) 0.0378 
1m 1s (- 0m 0s) (1000 100%) 0.0298 89.00%


In [37]:
# Keep training encoder1/attn_decoder1: 1000 more iterations at learning
# rate 1e-4, with a progress line every 50 iterations and an accuracy
# evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 89.40%
0m 10s (- 3m 13s) (50 5%) 0.0194 
0m 12s (- 1m 52s) (100 10%) 0.0284 
0m 14s (- 1m 23s) (150 15%) 0.0219 
0m 16s (- 1m 7s) (200 20%) 0.0568 
0m 19s (- 0m 57s) (250 25%) 0.0324 
0m 21s (- 0m 50s) (300 30%) 0.0176 
0m 24s (- 0m 44s) (350 35%) 0.0290 
0m 26s (- 0m 39s) (400 40%) 0.0186 
0m 29s (- 0m 35s) (450 45%) 0.0202 
0m 31s (- 0m 31s) (500 50%) 0.0247 87.00%
0m 36s (- 0m 30s) (550 55%) 0.0392 
0m 39s (- 0m 26s) (600 60%) 0.0209 
0m 41s (- 0m 22s) (650 65%) 0.0214 
0m 44s (- 0m 19s) (700 70%) 0.0190 
0m 47s (- 0m 15s) (750 75%) 0.0319 
0m 49s (- 0m 12s) (800 80%) 0.0319 
0m 51s (- 0m 9s) (850 85%) 0.0219 
0m 54s (- 0m 6s) (900 90%) 0.0463 
0m 57s (- 0m 3s) (950 95%) 0.0174 
0m 59s (- 0m 0s) (1000 100%) 0.0133 90.00%
New best test accuracy! Model Updated!


In [38]:
# Further 1000 iterations on the same encoder/decoder at learning rate 1e-4
# (progress every 50 iterations, accuracy evaluation every 500).
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 91.20%
0m 9s (- 2m 59s) (50 5%) 0.0243 
0m 12s (- 1m 51s) (100 10%) 0.0248 
0m 14s (- 1m 24s) (150 15%) 0.0177 
0m 17s (- 1m 10s) (200 20%) 0.0416 
0m 20s (- 1m 0s) (250 25%) 0.0593 
0m 22s (- 0m 52s) (300 30%) 0.0275 
0m 25s (- 0m 46s) (350 35%) 0.0144 
0m 27s (- 0m 41s) (400 40%) 0.0148 
0m 30s (- 0m 36s) (450 45%) 0.0429 
0m 32s (- 0m 32s) (500 50%) 0.0185 95.50%
New best test accuracy! Model Updated!
0m 37s (- 0m 30s) (550 55%) 0.0421 
0m 40s (- 0m 26s) (600 60%) 0.0195 
0m 42s (- 0m 22s) (650 65%) 0.0132 
0m 45s (- 0m 19s) (700 70%) 0.0134 
0m 47s (- 0m 15s) (750 75%) 0.0324 
0m 50s (- 0m 12s) (800 80%) 0.0224 
0m 52s (- 0m 9s) (850 85%) 0.0158 
0m 55s (- 0m 6s) (900 90%) 0.0210 
0m 57s (- 0m 3s) (950 95%) 0.0174 
1m 0s (- 0m 0s) (1000 100%) 0.0249 93.50%


In [39]:
# Repeat the 1000-iteration training round at learning rate 1e-4 on the same
# models; progress is printed every 50 iterations, accuracy every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 92.40%
0m 8s (- 2m 47s) (50 5%) 0.0242 
0m 10s (- 1m 38s) (100 10%) 0.0401 
0m 12s (- 1m 13s) (150 15%) 0.0147 
0m 14s (- 0m 59s) (200 20%) 0.0188 
0m 17s (- 0m 51s) (250 25%) 0.0419 
0m 18s (- 0m 44s) (300 30%) 0.0489 
0m 20s (- 0m 38s) (350 35%) 0.0143 
0m 22s (- 0m 34s) (400 40%) 0.0147 
0m 24s (- 0m 30s) (450 45%) 0.0111 
0m 26s (- 0m 26s) (500 50%) 0.0294 91.00%
0m 30s (- 0m 24s) (550 55%) 0.0146 
0m 32s (- 0m 21s) (600 60%) 0.0185 
0m 34s (- 0m 18s) (650 65%) 0.0262 
0m 36s (- 0m 15s) (700 70%) 0.0133 
0m 38s (- 0m 12s) (750 75%) 0.0144 
0m 40s (- 0m 10s) (800 80%) 0.0152 
0m 42s (- 0m 7s) (850 85%) 0.0105 
0m 45s (- 0m 5s) (900 90%) 0.0194 
0m 47s (- 0m 2s) (950 95%) 0.0250 
0m 49s (- 0m 0s) (1000 100%) 0.0153 93.50%
New best test accuracy! Model Updated!


In [40]:
# One more 1000-iteration round for encoder1/attn_decoder1 at learning rate
# 1e-4, logging every 50 iterations and evaluating accuracy every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 91.40%
0m 7s (- 2m 27s) (50 5%) 0.0208 
0m 10s (- 1m 31s) (100 10%) 0.0221 
0m 13s (- 1m 15s) (150 15%) 0.0321 
0m 15s (- 1m 2s) (200 20%) 0.0169 
0m 17s (- 0m 53s) (250 25%) 0.0162 
0m 20s (- 0m 47s) (300 30%) 0.0146 
0m 22s (- 0m 42s) (350 35%) 0.0202 
0m 25s (- 0m 38s) (400 40%) 0.0212 
0m 27s (- 0m 34s) (450 45%) 0.0130 
0m 30s (- 0m 30s) (500 50%) 0.0176 91.50%
New best test accuracy! Model Updated!
0m 34s (- 0m 28s) (550 55%) 0.0129 
0m 37s (- 0m 25s) (600 60%) 0.0141 
0m 40s (- 0m 21s) (650 65%) 0.0678 
0m 43s (- 0m 18s) (700 70%) 0.0161 
0m 45s (- 0m 15s) (750 75%) 0.0125 
0m 48s (- 0m 12s) (800 80%) 0.0134 
0m 50s (- 0m 8s) (850 85%) 0.0282 
0m 53s (- 0m 5s) (900 90%) 0.0157 
0m 55s (- 0m 2s) (950 95%) 0.0178 
0m 58s (- 0m 0s) (1000 100%) 0.0153 91.50%


In [41]:
# Last round at learning rate 1e-4 before the rate is lowered: 1000 more
# iterations on the same models, progress every 50, evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 91.60%
0m 7s (- 2m 27s) (50 5%) 0.0161 
0m 10s (- 1m 38s) (100 10%) 0.0314 
0m 13s (- 1m 17s) (150 15%) 0.0167 
0m 16s (- 1m 5s) (200 20%) 0.0124 
0m 18s (- 0m 55s) (250 25%) 0.0175 
0m 21s (- 0m 49s) (300 30%) 0.0143 
0m 23s (- 0m 44s) (350 35%) 0.0215 
0m 26s (- 0m 40s) (400 40%) 0.0113 
0m 29s (- 0m 36s) (450 45%) 0.0171 
0m 31s (- 0m 31s) (500 50%) 0.0187 93.00%
New best test accuracy! Model Updated!
0m 36s (- 0m 29s) (550 55%) 0.0128 
0m 38s (- 0m 25s) (600 60%) 0.0096 
0m 40s (- 0m 21s) (650 65%) 0.0132 
0m 42s (- 0m 18s) (700 70%) 0.0283 
0m 44s (- 0m 14s) (750 75%) 0.0140 
0m 46s (- 0m 11s) (800 80%) 0.0504 
0m 48s (- 0m 8s) (850 85%) 0.0152 
0m 50s (- 0m 5s) (900 90%) 0.0116 
0m 52s (- 0m 2s) (950 95%) 0.0206 
0m 55s (- 0m 0s) (1000 100%) 0.0131 93.50%
New best test accuracy! Model Updated!


In [42]:
# Fine-tune with the learning rate halved to 5e-5: 1000 iterations on the
# same encoder/decoder, progress every 50 iterations, evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=5e-5,
)

Best evaluation accuracy: 90.60%
0m 7s (- 2m 31s) (50 5%) 0.0249 
0m 9s (- 1m 29s) (100 10%) 0.0148 
0m 12s (- 1m 9s) (150 15%) 0.0116 
0m 14s (- 0m 57s) (200 20%) 0.0127 
0m 16s (- 0m 50s) (250 25%) 0.0144 
0m 19s (- 0m 44s) (300 30%) 0.0174 
0m 21s (- 0m 39s) (350 35%) 0.0347 
0m 22s (- 0m 34s) (400 40%) 0.0130 
0m 24s (- 0m 30s) (450 45%) 0.0113 
0m 27s (- 0m 27s) (500 50%) 0.0139 95.50%
New best test accuracy! Model Updated!
0m 31s (- 0m 26s) (550 55%) 0.0107 
0m 34s (- 0m 22s) (600 60%) 0.0099 
0m 36s (- 0m 19s) (650 65%) 0.0097 
0m 38s (- 0m 16s) (700 70%) 0.0107 
0m 41s (- 0m 13s) (750 75%) 0.0067 
0m 43s (- 0m 10s) (800 80%) 0.0328 
0m 46s (- 0m 8s) (850 85%) 0.0084 
0m 49s (- 0m 5s) (900 90%) 0.0256 
0m 51s (- 0m 2s) (950 95%) 0.0117 
0m 54s (- 0m 0s) (1000 100%) 0.0162 92.50%


In [43]:
# Second fine-tuning round at learning rate 5e-5 (1000 iterations, progress
# every 50 iterations, accuracy evaluation every 500).
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=5e-5,
)

Best evaluation accuracy: 94.40%
0m 9s (- 2m 52s) (50 5%) 0.0118 
0m 11s (- 1m 45s) (100 10%) 0.0074 
0m 14s (- 1m 20s) (150 15%) 0.0108 
0m 16s (- 1m 4s) (200 20%) 0.0101 
0m 18s (- 0m 54s) (250 25%) 0.0078 
0m 21s (- 0m 49s) (300 30%) 0.0143 
0m 24s (- 0m 44s) (350 35%) 0.0102 
0m 27s (- 0m 41s) (400 40%) 0.0236 
0m 29s (- 0m 36s) (450 45%) 0.0157 
0m 32s (- 0m 32s) (500 50%) 0.0068 91.50%
0m 37s (- 0m 30s) (550 55%) 0.0145 
0m 39s (- 0m 26s) (600 60%) 0.0080 
0m 42s (- 0m 22s) (650 65%) 0.0066 
0m 44s (- 0m 19s) (700 70%) 0.0126 
0m 47s (- 0m 15s) (750 75%) 0.0080 
0m 49s (- 0m 12s) (800 80%) 0.0131 
0m 51s (- 0m 9s) (850 85%) 0.0190 
0m 53s (- 0m 5s) (900 90%) 0.0084 
0m 55s (- 0m 2s) (950 95%) 0.0106 
0m 57s (- 0m 0s) (1000 100%) 0.0099 95.50%
New best test accuracy! Model Updated!


In [44]:
# Drop the learning rate to 1e-5 and run a longer 2000-iteration round on the
# same models; progress is logged every 50 iterations, accuracy every 500.
trainIters(
    encoder1,
    attn_decoder1,
    2000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 95.60%
0m 8s (- 5m 20s) (50 2%) 0.0073 
0m 11s (- 3m 34s) (100 5%) 0.0161 
0m 13s (- 2m 51s) (150 7%) 0.0071 
0m 16s (- 2m 26s) (200 10%) 0.0127 
0m 18s (- 2m 12s) (250 12%) 0.0076 
0m 21s (- 2m 3s) (300 15%) 0.0246 
0m 24s (- 1m 55s) (350 17%) 0.0115 
0m 27s (- 1m 49s) (400 20%) 0.0067 
0m 30s (- 1m 43s) (450 22%) 0.0080 
0m 32s (- 1m 37s) (500 25%) 0.0075 97.50%
New best test accuracy! Model Updated!
0m 38s (- 1m 40s) (550 27%) 0.0315 
0m 40s (- 1m 34s) (600 30%) 0.0077 
0m 42s (- 1m 28s) (650 32%) 0.0108 
0m 44s (- 1m 22s) (700 35%) 0.0109 
0m 46s (- 1m 18s) (750 37%) 0.0088 
0m 49s (- 1m 13s) (800 40%) 0.0051 
0m 51s (- 1m 9s) (850 42%) 0.0093 
0m 53s (- 1m 5s) (900 45%) 0.0068 
0m 55s (- 1m 1s) (950 47%) 0.0092 
0m 57s (- 0m 57s) (1000 50%) 0.0063 92.00%
1m 1s (- 0m 56s) (1050 52%) 0.0146 
1m 4s (- 0m 52s) (1100 55%) 0.0251 
1m 6s (- 0m 49s) (1150 57%) 0.0085 
1m 8s (- 0m 45s) (1200 60%) 0.0092 
1m 10s (- 0m 42s) (1250 62%) 0.0223 
1m 13s (- 0m 39s) (1300

In [45]:
# Second 2000-iteration round at learning rate 1e-5 on encoder1/attn_decoder1
# (progress every 50 iterations, accuracy evaluation every 500).
trainIters(
    encoder1,
    attn_decoder1,
    2000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 96.40%
0m 7s (- 5m 5s) (50 2%) 0.0103 
0m 11s (- 3m 38s) (100 5%) 0.0182 
0m 14s (- 2m 58s) (150 7%) 0.0097 
0m 17s (- 2m 34s) (200 10%) 0.0110 
0m 19s (- 2m 18s) (250 12%) 0.0063 
0m 22s (- 2m 6s) (300 15%) 0.0095 
0m 24s (- 1m 56s) (350 17%) 0.0411 
0m 27s (- 1m 50s) (400 20%) 0.0094 
0m 31s (- 1m 47s) (450 22%) 0.0100 
0m 33s (- 1m 41s) (500 25%) 0.0119 97.50%
New best test accuracy! Model Updated!
0m 38s (- 1m 41s) (550 27%) 0.0318 
0m 41s (- 1m 36s) (600 30%) 0.0082 
0m 44s (- 1m 31s) (650 32%) 0.0112 
0m 47s (- 1m 27s) (700 35%) 0.0076 
0m 49s (- 1m 22s) (750 37%) 0.0187 
0m 52s (- 1m 18s) (800 40%) 0.0098 
0m 55s (- 1m 14s) (850 42%) 0.0075 
0m 57s (- 1m 10s) (900 45%) 0.0069 
1m 0s (- 1m 6s) (950 47%) 0.0203 
1m 2s (- 1m 2s) (1000 50%) 0.0063 98.00%
New best test accuracy! Model Updated!
1m 8s (- 1m 1s) (1050 52%) 0.0100 
1m 10s (- 0m 57s) (1100 55%) 0.0070 
1m 13s (- 0m 53s) (1150 57%) 0.0094 
1m 15s (- 0m 50s) (1200 60%) 0.0120 
1m 18s (- 0m 47s) (12

In [46]:
# Back to 1000-iteration rounds, still at learning rate 1e-5, on the same
# models; progress every 50 iterations, accuracy evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 96.80%
0m 9s (- 2m 56s) (50 5%) 0.0053 
0m 11s (- 1m 42s) (100 10%) 0.0083 
0m 13s (- 1m 17s) (150 15%) 0.0063 
0m 16s (- 1m 4s) (200 20%) 0.0076 
0m 18s (- 0m 55s) (250 25%) 0.0145 
0m 20s (- 0m 48s) (300 30%) 0.0112 
0m 23s (- 0m 43s) (350 35%) 0.0203 
0m 26s (- 0m 39s) (400 40%) 0.0100 
0m 28s (- 0m 34s) (450 45%) 0.0070 
0m 31s (- 0m 31s) (500 50%) 0.0214 96.00%
0m 37s (- 0m 30s) (550 55%) 0.0313 
0m 39s (- 0m 26s) (600 60%) 0.0215 
0m 42s (- 0m 22s) (650 65%) 0.0090 
0m 45s (- 0m 19s) (700 70%) 0.0078 
0m 47s (- 0m 15s) (750 75%) 0.0093 
0m 50s (- 0m 12s) (800 80%) 0.0079 
0m 52s (- 0m 9s) (850 85%) 0.0096 
0m 55s (- 0m 6s) (900 90%) 0.0066 
0m 57s (- 0m 3s) (950 95%) 0.0092 
0m 59s (- 0m 0s) (1000 100%) 0.0072 97.00%
New best test accuracy! Model Updated!


In [49]:
# Final training round shown here: 1000 iterations at learning rate 1e-5 on
# the same encoder/decoder, logging every 50 and evaluating every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 97.80%
0m 9s (- 2m 54s) (50 5%) 0.0246 
0m 11s (- 1m 42s) (100 10%) 0.0068 
0m 13s (- 1m 18s) (150 15%) 0.0063 
0m 15s (- 1m 3s) (200 20%) 0.0108 
0m 17s (- 0m 53s) (250 25%) 0.0346 
0m 19s (- 0m 46s) (300 30%) 0.0070 
0m 21s (- 0m 40s) (350 35%) 0.0072 
0m 23s (- 0m 35s) (400 40%) 0.0072 
0m 25s (- 0m 31s) (450 45%) 0.0402 
0m 28s (- 0m 28s) (500 50%) 0.0103 97.00%
0m 32s (- 0m 26s) (550 55%) 0.0067 
0m 34s (- 0m 23s) (600 60%) 0.0100 
0m 36s (- 0m 19s) (650 65%) 0.0078 
0m 38s (- 0m 16s) (700 70%) 0.0085 
0m 40s (- 0m 13s) (750 75%) 0.0072 
0m 42s (- 0m 10s) (800 80%) 0.0151 
0m 44s (- 0m 7s) (850 85%) 0.0101 
0m 46s (- 0m 5s) (900 90%) 0.0067 
0m 48s (- 0m 2s) (950 95%) 0.0104 
0m 50s (- 0m 0s) (1000 100%) 0.0107 97.00%


---

### Samples Evaluation

In [50]:
# Reload the saved encoder/decoder from disk and measure accuracy on 2000
# samples. (Presumably these are the checkpoints written when training prints
# "Model Updated!" -- confirm against trainIters.)
encoder_path = "saved_models/encoder_" + MODEL_VERSION
decoder_path = "saved_models/decoder_" + MODEL_VERSION

# Both files are required. Previously only the encoder was checked, and a
# missing file left encoder2/decoder2 undefined (or silently stale from an
# earlier kernel state), so the evaluation below could run on the wrong models.
missing_paths = [p for p in (encoder_path, decoder_path) if not os.path.exists(p)]
if missing_paths:
    raise FileNotFoundError(
        "Saved model file(s) not found: " + ", ".join(missing_paths)
    )

# map_location remaps stored tensors onto the active device, so a checkpoint
# saved on a GPU machine still loads on a CPU-only one.
encoder2 = torch.load(encoder_path, map_location=device)
decoder2 = torch.load(decoder_path, map_location=device)

# Keep this as the last expression so the notebook displays the accuracy.
evaluateAccuracy(encoder2, decoder2, n=2000)

0.96999999999999997

In [51]:
# Qualitative check of the reloaded models on sample pairs. In the output
# below, '>' is the input command, '=' appears to be the reference action
# sequence, and '<' the model's prediction (ending in <EOS>).
evaluateRandomly(encoder2, decoder2)

> jump around right twice and look around right twice
= I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK
< I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK <EOS>

> jump thrice and walk left thrice
= I_JUMP I_JUMP I_JUMP I_TURN_LEFT I_WALK I_TURN_LEFT I_WALK I_TURN_LEFT I_WALK
< I_JUMP I_JUMP I_JUMP I_TURN_LEFT I_WALK I_TURN_LEFT I_WALK I_TURN_LEFT I_WALK <EOS>

> jump opposite left thrice after look right thrice
= I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK

---