In [1]:
%matplotlib inline

## SCAN Add-Prim JUMP Experiment
*************************************************************

Reference: http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html


**Requirements**

* Python 3.6
* PyTorch 0.4

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pickle
import os
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
print("Device is using", device)

Device is using cpu


Loading data files
==================

In [4]:
SOS_token = 0
EOS_token = 1
TASK_NAME = "addprim-jump"


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

To read the data file we will split the file into lines, and then split
lines into pairs. 



In [5]:
def readLangs(lang1, lang2, reverse=False, trainOrtest='train'):
    print("Reading lines...")

    # Read the file and split into lines        
    lines = open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/processed/{}-{}_{}-{}.txt'.\
                 format(trainOrtest, TASK_NAME, lang1, lang2), encoding='utf-8').\
                 read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[s for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 50
# PRED_LENGTH = 50

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs




In [7]:
def prepareData(lang1, lang2, reverse=False, dataFrom='train'):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse=False, trainOrtest=dataFrom)
    print("Read %s sentence pairs" % len(pairs))
#     pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('in', 'out', True)
print(random.choice(pairs))

Reading lines...
Read 37046 sentence pairs
Trimmed to 37046 sentence pairs
Counting words...
Counted words:
in 15
out 8
['turn around right and turn opposite left', 'I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_RIGHT I_TURN_LEFT I_TURN_LEFT']


Model
=================

The model we are using is a GRU encoder-decoder seq2seq model with attention mechanism. In order to solve the zero-shot generalization task, we embed the encoder networks with pre-trained embeddings, from GloVe and Google Word2Vec.  

In [8]:
EMBEDDEING_SOURCE = 'glove'
hidden_size = 100

if EMBEDDEING_SOURCE == 'google':
    with open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_GoogleNews300Negative.pkl', 'rb') as handle:
        b = pickle.load(handle)
else:
    with open('/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_raw{}d.pkl'.format(hidden_size), 'rb') as handle:
        b = pickle.load(handle)

pretrained_emb = np.zeros((input_lang.n_words, hidden_size))
for k, v in input_lang.index2word.items():
    if v == 'SOS':
        pretrained_emb[k] = np.zeros(hidden_size)
    elif (v == 'EOS') and (EMBEDDEING_SOURCE != 'google'):
        pretrained_emb[k] = b['.']
    elif (v == 'and') and (EMBEDDEING_SOURCE == 'google'):
        pretrained_emb[k] = b['AND']
    else:
        pretrained_emb[k] = b[v]

The Encoder
-----------

The encoder of this seq2seq network is a GRU netword. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.




In [11]:
EMBEDDEING_PRETRAINED = True
WEIGHT_UPDATE = False

MODEL_VERSION = 'T0.4_glv100'

In [12]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        if EMBEDDEING_PRETRAINED:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
            self.embedding.weight.requires_grad = WEIGHT_UPDATE
        
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

The Decoder
-----------

The decoder is a GRU network with attention mechanism that takes the last output of the encoder and
outputs a sequence of words to create the translation.

First we calculate a set of *attention weights*. These will be multiplied by
the encoder output vectors to create a weighted combination. The result
(called ``attn_applied`` in the code) should contain information about
that specific part of the input sequence, and thus help the decoder
choose the right output words.

Calculating the attention weights is done with another feed-forward
layer ``attn``, using the decoder's input and hidden state as inputs.
Because there are sentences of all sizes in the training data, to
actually create and train this layer we have to choose a maximum
sentence length (input length, for encoder outputs) that it can apply
to. Sentences of the maximum length will use all the attention weights,
while shorter sentences will only use the first few.




In [13]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

Training
========

Preparing Training Data
-----------------------

To train, for each pair we need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we append the
EOS token to both sequences.




In [14]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

We use teacher forcing to help converge faster with a delay fashion.




In [15]:
teacher_forcing_ratio = 0.8


def train(input_tensor, target_tensor, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, 
          max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

Helper function for timing




In [16]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

### Training interation

In [17]:
def trainIters(encoder, decoder, n_iters, print_every=1000, eval_every=1000, learning_rate=0.001):
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    if os.path.exists("saved_models/encoder_" + MODEL_VERSION):
        encoder = torch.load("saved_models/encoder_" + MODEL_VERSION)
        decoder = torch.load("saved_models/decoder_" + MODEL_VERSION)
        
    best_test_acc = evaluateAccuracy(encoder, decoder, 500)
    print("Best evaluation accuracy: {0:.2f}%".format(best_test_acc * 100))

    parameters = filter(lambda p: p.requires_grad, encoder.parameters())
        
    encoder_optimizer = optim.Adam(parameters, lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg), end=' ')
            
            if iter % eval_every == 0:
                test_acc = evaluateAccuracy(encoder, decoder, 200)
                print('{0:.2f}%'.format(test_acc * 100))
                
                if test_acc > best_test_acc:
                    with open("saved_models/encoder_" + MODEL_VERSION, "wb") as f:
                        torch.save(encoder, f)
                    with open("saved_models/decoder_" + MODEL_VERSION, "wb") as f:
                        torch.save(decoder, f)
                    print("New best test accuracy! Model Updated!")
                    best_test_acc = test_acc
#                 elif test_acc < best_test_acc - 0.001:
#                     encoder = torch.load("saved_models/encoder_" + MODEL_VERSION)
#                     decoder = torch.load("saved_models/decoder_" + MODEL_VERSION)
                    
            else:
                print('')

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [18]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [19]:
input_lang, output_lang, pairs_eval = prepareData('in', 'out', True, dataFrom='test')
print(random.choice(pairs_eval))

Reading lines...
Read 15412 sentence pairs
Trimmed to 15412 sentence pairs
Counting words...
Counted words:
in 15
out 8
['look left twice and jump around left', 'I_TURN_LEFT I_LOOK I_TURN_LEFT I_LOOK I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP']


In [20]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs_eval)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [21]:
def evaluateAccuracy(encoder, decoder, n=10):
    ACCs = []
    for i in range(n):
        pair = random.choice(pairs_eval)
        output_words, _ = evaluate(encoder, decoder, pair[0])
        
        if output_words[-1] == '<EOS>':
            output_words = output_words[:-1]
        output_sentence = ' '.join(output_words)
        
        if output_sentence == pair[1]:
            ACCs.append(1)
        else:
            ACCs.append(0)
    return np.array(ACCs).mean()

Training and Evaluating
=======================


The model is initially trained with a higher teacher aid, and relatively large learning rate. Both teacher forcing effect and the learning rate decay over iterations when the model approaches the optimum.  

#### The model achieves 97% accuracy rate for the best test sample evaluation, and is 94% correct on average for the testset.

In [22]:
teacher_forcing_ratio = 0.8

encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 5000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 0.00%
0m 4s (- 7m 16s) (50 1%) 1.8580 
0m 6s (- 4m 55s) (100 2%) 1.6051 
0m 7s (- 4m 5s) (150 3%) 1.4842 
0m 9s (- 3m 39s) (200 4%) 1.3071 
0m 10s (- 3m 26s) (250 5%) 1.3116 
0m 12s (- 3m 18s) (300 6%) 1.2965 
0m 14s (- 3m 13s) (350 7%) 1.0414 
0m 16s (- 3m 4s) (400 8%) 1.1737 
0m 18s (- 3m 5s) (450 9%) 1.1512 
0m 20s (- 3m 0s) (500 10%) 1.2825 0.00%
0m 28s (- 3m 49s) (550 11%) 1.2795 
0m 30s (- 3m 42s) (600 12%) 1.0927 
0m 31s (- 3m 34s) (650 13%) 1.1269 
0m 33s (- 3m 24s) (700 14%) 0.9627 
0m 35s (- 3m 18s) (750 15%) 1.0182 
0m 36s (- 3m 12s) (800 16%) 0.9904 
0m 38s (- 3m 7s) (850 17%) 0.9553 
0m 40s (- 3m 2s) (900 18%) 0.9635 
0m 41s (- 2m 59s) (950 19%) 0.9128 
0m 43s (- 2m 53s) (1000 20%) 0.9097 0.00%
0m 50s (- 3m 11s) (1050 21%) 0.8488 
0m 52s (- 3m 6s) (1100 22%) 0.8443 
0m 54s (- 3m 1s) (1150 23%) 0.8983 
0m 56s (- 2m 58s) (1200 24%) 0.8404 
0m 58s (- 2m 55s) (1250 25%) 0.7303 
1m 0s (- 2m 52s) (1300 26%) 0.8454 
1m 2s (- 2m 48s) (1350 27%) 0.8392 
1m

In [23]:
teacher_forcing_ratio = 0.5
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 5.20%
0m 7s (- 2m 24s) (50 5%) 0.5383 
0m 9s (- 1m 23s) (100 10%) 0.6090 
0m 10s (- 1m 2s) (150 15%) 0.6029 
0m 12s (- 0m 49s) (200 20%) 0.5605 
0m 13s (- 0m 41s) (250 25%) 0.6172 
0m 15s (- 0m 35s) (300 30%) 0.5836 
0m 17s (- 0m 32s) (350 35%) 0.5007 
0m 19s (- 0m 28s) (400 40%) 0.5395 
0m 20s (- 0m 25s) (450 45%) 0.4296 
0m 22s (- 0m 22s) (500 50%) 0.4641 7.00%
New best test accuracy! Model Updated!
0m 26s (- 0m 21s) (550 55%) 0.5286 
0m 28s (- 0m 18s) (600 60%) 0.4023 
0m 29s (- 0m 15s) (650 65%) 0.4764 
0m 30s (- 0m 13s) (700 70%) 0.5611 
0m 31s (- 0m 10s) (750 75%) 0.6120 
0m 33s (- 0m 8s) (800 80%) 0.5463 
0m 34s (- 0m 6s) (850 85%) 0.4433 
0m 35s (- 0m 3s) (900 90%) 0.4698 
0m 36s (- 0m 1s) (950 95%) 0.5156 
0m 38s (- 0m 0s) (1000 100%) 0.3970 9.00%
New best test accuracy! Model Updated!


In [24]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 7.60%
0m 6s (- 2m 9s) (50 5%) 0.4757 
0m 8s (- 1m 13s) (100 10%) 0.4615 
0m 9s (- 0m 55s) (150 15%) 0.5772 
0m 11s (- 0m 44s) (200 20%) 0.5126 
0m 12s (- 0m 36s) (250 25%) 0.3797 
0m 13s (- 0m 31s) (300 30%) 0.5019 
0m 14s (- 0m 27s) (350 35%) 0.4571 
0m 16s (- 0m 24s) (400 40%) 0.4054 
0m 17s (- 0m 20s) (450 45%) 0.4459 
0m 18s (- 0m 18s) (500 50%) 0.3970 8.50%
New best test accuracy! Model Updated!
0m 21s (- 0m 17s) (550 55%) 0.4339 
0m 23s (- 0m 15s) (600 60%) 0.4655 
0m 24s (- 0m 13s) (650 65%) 0.3987 
0m 25s (- 0m 10s) (700 70%) 0.4732 
0m 26s (- 0m 8s) (750 75%) 0.5640 
0m 27s (- 0m 6s) (800 80%) 0.4725 
0m 29s (- 0m 5s) (850 85%) 0.4714 
0m 30s (- 0m 3s) (900 90%) 0.4647 
0m 31s (- 0m 1s) (950 95%) 0.4640 
0m 33s (- 0m 0s) (1000 100%) 0.4241 7.50%


In [25]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 10.40%
0m 7s (- 2m 20s) (50 5%) 0.4427 
0m 8s (- 1m 17s) (100 10%) 0.5037 
0m 9s (- 0m 55s) (150 15%) 0.4901 
0m 10s (- 0m 43s) (200 20%) 0.4545 
0m 12s (- 0m 37s) (250 25%) 0.4620 
0m 13s (- 0m 32s) (300 30%) 0.4927 
0m 15s (- 0m 28s) (350 35%) 0.3937 
0m 16s (- 0m 24s) (400 40%) 0.4439 
0m 17s (- 0m 21s) (450 45%) 0.5226 
0m 18s (- 0m 18s) (500 50%) 0.3739 5.50%
0m 22s (- 0m 18s) (550 55%) 0.4772 
0m 24s (- 0m 16s) (600 60%) 0.5234 
0m 25s (- 0m 13s) (650 65%) 0.4979 
0m 26s (- 0m 11s) (700 70%) 0.3922 
0m 28s (- 0m 9s) (750 75%) 0.4187 
0m 29s (- 0m 7s) (800 80%) 0.5085 
0m 30s (- 0m 5s) (850 85%) 0.4024 
0m 31s (- 0m 3s) (900 90%) 0.4515 
0m 33s (- 0m 1s) (950 95%) 0.4398 
0m 34s (- 0m 0s) (1000 100%) 0.3864 12.00%
New best test accuracy! Model Updated!


In [30]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 14.20%
0m 6s (- 2m 4s) (50 5%) 0.3915 
0m 7s (- 1m 10s) (100 10%) 0.3490 
0m 9s (- 0m 51s) (150 15%) 0.4112 
0m 10s (- 0m 41s) (200 20%) 0.3682 
0m 11s (- 0m 35s) (250 25%) 0.3475 
0m 12s (- 0m 30s) (300 30%) 0.3576 
0m 14s (- 0m 26s) (350 35%) 0.3858 
0m 15s (- 0m 23s) (400 40%) 0.3455 
0m 17s (- 0m 21s) (450 45%) 0.3492 
0m 18s (- 0m 18s) (500 50%) 0.3704 13.50%
0m 21s (- 0m 17s) (550 55%) 0.2993 
0m 22s (- 0m 15s) (600 60%) 0.3477 
0m 24s (- 0m 13s) (650 65%) 0.4200 
0m 25s (- 0m 11s) (700 70%) 0.4100 
0m 26s (- 0m 8s) (750 75%) 0.3797 
0m 28s (- 0m 7s) (800 80%) 0.4026 
0m 29s (- 0m 5s) (850 85%) 0.4233 
0m 30s (- 0m 3s) (900 90%) 0.3627 
0m 31s (- 0m 1s) (950 95%) 0.3189 
0m 33s (- 0m 0s) (1000 100%) 0.3310 18.00%
New best test accuracy! Model Updated!


In [31]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 18.00%
0m 6s (- 2m 5s) (50 5%) 0.3199 
0m 8s (- 1m 13s) (100 10%) 0.3287 
0m 9s (- 0m 54s) (150 15%) 0.3888 
0m 10s (- 0m 43s) (200 20%) 0.4593 
0m 11s (- 0m 35s) (250 25%) 0.4251 
0m 13s (- 0m 30s) (300 30%) 0.3593 
0m 14s (- 0m 27s) (350 35%) 0.2973 
0m 16s (- 0m 24s) (400 40%) 0.3846 
0m 18s (- 0m 22s) (450 45%) 0.3521 
0m 19s (- 0m 19s) (500 50%) 0.2842 20.00%
New best test accuracy! Model Updated!
0m 23s (- 0m 18s) (550 55%) 0.4198 
0m 24s (- 0m 16s) (600 60%) 0.3272 
0m 25s (- 0m 13s) (650 65%) 0.3464 
0m 27s (- 0m 11s) (700 70%) 0.3055 
0m 28s (- 0m 9s) (750 75%) 0.3557 
0m 29s (- 0m 7s) (800 80%) 0.3601 
0m 30s (- 0m 5s) (850 85%) 0.3569 
0m 32s (- 0m 3s) (900 90%) 0.3738 
0m 33s (- 0m 1s) (950 95%) 0.3838 
0m 34s (- 0m 0s) (1000 100%) 0.3139 21.00%
New best test accuracy! Model Updated!


In [32]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 17.60%
0m 6s (- 1m 57s) (50 5%) 0.3104 
0m 7s (- 1m 8s) (100 10%) 0.2667 
0m 8s (- 0m 50s) (150 15%) 0.2784 
0m 10s (- 0m 40s) (200 20%) 0.3867 
0m 11s (- 0m 34s) (250 25%) 0.2541 
0m 12s (- 0m 29s) (300 30%) 0.2868 
0m 14s (- 0m 26s) (350 35%) 0.3123 
0m 15s (- 0m 22s) (400 40%) 0.3285 
0m 16s (- 0m 20s) (450 45%) 0.2926 
0m 17s (- 0m 17s) (500 50%) 0.3668 15.50%
0m 21s (- 0m 17s) (550 55%) 0.3510 
0m 22s (- 0m 14s) (600 60%) 0.2983 
0m 23s (- 0m 12s) (650 65%) 0.3660 
0m 24s (- 0m 10s) (700 70%) 0.3188 
0m 26s (- 0m 8s) (750 75%) 0.3105 
0m 27s (- 0m 6s) (800 80%) 0.2514 
0m 29s (- 0m 5s) (850 85%) 0.3052 
0m 30s (- 0m 3s) (900 90%) 0.3243 
0m 32s (- 0m 1s) (950 95%) 0.2222 
0m 34s (- 0m 0s) (1000 100%) 0.2810 21.00%
New best test accuracy! Model Updated!


In [33]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 23.60%
0m 6s (- 2m 3s) (50 5%) 0.3969 
0m 8s (- 1m 12s) (100 10%) 0.3800 
0m 9s (- 0m 55s) (150 15%) 0.2687 
0m 11s (- 0m 44s) (200 20%) 0.2581 
0m 13s (- 0m 39s) (250 25%) 0.2519 
0m 15s (- 0m 35s) (300 30%) 0.2826 
0m 17s (- 0m 31s) (350 35%) 0.3072 
0m 18s (- 0m 28s) (400 40%) 0.2636 
0m 20s (- 0m 24s) (450 45%) 0.4020 
0m 21s (- 0m 21s) (500 50%) 0.2641 27.00%
New best test accuracy! Model Updated!
0m 25s (- 0m 20s) (550 55%) 0.2587 
0m 26s (- 0m 17s) (600 60%) 0.3166 
0m 27s (- 0m 14s) (650 65%) 0.2867 
0m 28s (- 0m 12s) (700 70%) 0.2363 
0m 29s (- 0m 9s) (750 75%) 0.2565 
0m 31s (- 0m 7s) (800 80%) 0.2618 
0m 32s (- 0m 5s) (850 85%) 0.3527 
0m 33s (- 0m 3s) (900 90%) 0.2961 
0m 34s (- 0m 1s) (950 95%) 0.4262 
0m 36s (- 0m 0s) (1000 100%) 0.3297 30.00%
New best test accuracy! Model Updated!


In [34]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 23.40%
0m 6s (- 2m 12s) (50 5%) 0.2943 
0m 8s (- 1m 15s) (100 10%) 0.2845 
0m 9s (- 0m 54s) (150 15%) 0.2690 
0m 10s (- 0m 43s) (200 20%) 0.3612 
0m 12s (- 0m 36s) (250 25%) 0.3019 
0m 13s (- 0m 31s) (300 30%) 0.2435 
0m 14s (- 0m 27s) (350 35%) 0.2649 
0m 16s (- 0m 24s) (400 40%) 0.3982 
0m 17s (- 0m 21s) (450 45%) 0.3596 
0m 18s (- 0m 18s) (500 50%) 0.2912 27.00%
New best test accuracy! Model Updated!
0m 22s (- 0m 18s) (550 55%) 0.3309 
0m 23s (- 0m 15s) (600 60%) 0.2302 
0m 24s (- 0m 13s) (650 65%) 0.2767 
0m 26s (- 0m 11s) (700 70%) 0.2093 
0m 27s (- 0m 9s) (750 75%) 0.2868 
0m 28s (- 0m 7s) (800 80%) 0.3753 
0m 29s (- 0m 5s) (850 85%) 0.2151 
0m 30s (- 0m 3s) (900 90%) 0.2079 
0m 32s (- 0m 1s) (950 95%) 0.2660 
0m 34s (- 0m 0s) (1000 100%) 0.3611 34.50%
New best test accuracy! Model Updated!


In [35]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 32.20%
0m 8s (- 2m 40s) (50 5%) 0.2810 
0m 10s (- 1m 30s) (100 10%) 0.2241 
0m 11s (- 1m 6s) (150 15%) 0.2429 
0m 13s (- 0m 52s) (200 20%) 0.2949 
0m 14s (- 0m 43s) (250 25%) 0.2248 
0m 15s (- 0m 37s) (300 30%) 0.3535 
0m 17s (- 0m 32s) (350 35%) 0.2236 
0m 19s (- 0m 29s) (400 40%) 0.2602 
0m 21s (- 0m 26s) (450 45%) 0.3283 
0m 22s (- 0m 22s) (500 50%) 0.2657 30.50%
0m 27s (- 0m 22s) (550 55%) 0.2086 
0m 28s (- 0m 19s) (600 60%) 0.3000 
0m 30s (- 0m 16s) (650 65%) 0.3118 
0m 32s (- 0m 14s) (700 70%) 0.3167 
0m 34s (- 0m 11s) (750 75%) 0.2624 
0m 36s (- 0m 9s) (800 80%) 0.2352 
0m 37s (- 0m 6s) (850 85%) 0.2575 
0m 39s (- 0m 4s) (900 90%) 0.2056 
0m 41s (- 0m 2s) (950 95%) 0.2061 
0m 42s (- 0m 0s) (1000 100%) 0.2117 35.50%
New best test accuracy! Model Updated!


In [36]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 36.60%
0m 5s (- 1m 52s) (50 5%) 0.2936 
0m 7s (- 1m 4s) (100 10%) 0.2525 
0m 8s (- 0m 48s) (150 15%) 0.2295 
0m 9s (- 0m 39s) (200 20%) 0.2599 
0m 11s (- 0m 34s) (250 25%) 0.2990 
0m 12s (- 0m 29s) (300 30%) 0.2469 
0m 14s (- 0m 26s) (350 35%) 0.2587 
0m 15s (- 0m 23s) (400 40%) 0.2105 
0m 16s (- 0m 20s) (450 45%) 0.3169 
0m 18s (- 0m 18s) (500 50%) 0.1918 35.00%
0m 21s (- 0m 17s) (550 55%) 0.1580 
0m 22s (- 0m 14s) (600 60%) 0.2294 
0m 23s (- 0m 12s) (650 65%) 0.1708 
0m 24s (- 0m 10s) (700 70%) 0.2242 
0m 26s (- 0m 8s) (750 75%) 0.1808 
0m 27s (- 0m 6s) (800 80%) 0.2270 
0m 28s (- 0m 5s) (850 85%) 0.2014 
0m 29s (- 0m 3s) (900 90%) 0.2031 
0m 31s (- 0m 1s) (950 95%) 0.2194 
0m 32s (- 0m 0s) (1000 100%) 0.2527 42.00%
New best test accuracy! Model Updated!


In [38]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 39.00%
0m 8s (- 2m 34s) (50 5%) 0.2175 
0m 9s (- 1m 27s) (100 10%) 0.2281 
0m 11s (- 1m 7s) (150 15%) 0.2322 
0m 13s (- 0m 53s) (200 20%) 0.2345 
0m 15s (- 0m 45s) (250 25%) 0.2491 
0m 17s (- 0m 39s) (300 30%) 0.1614 
0m 18s (- 0m 34s) (350 35%) 0.1876 
0m 20s (- 0m 30s) (400 40%) 0.2191 
0m 22s (- 0m 26s) (450 45%) 0.1696 
0m 23s (- 0m 23s) (500 50%) 0.2750 42.00%
New best test accuracy! Model Updated!
0m 27s (- 0m 22s) (550 55%) 0.1913 
0m 29s (- 0m 19s) (600 60%) 0.1751 
0m 30s (- 0m 16s) (650 65%) 0.2070 
0m 32s (- 0m 13s) (700 70%) 0.2574 
0m 34s (- 0m 11s) (750 75%) 0.2228 
0m 35s (- 0m 8s) (800 80%) 0.2495 
0m 37s (- 0m 6s) (850 85%) 0.2237 
0m 39s (- 0m 4s) (900 90%) 0.1570 
0m 40s (- 0m 2s) (950 95%) 0.2232 
0m 42s (- 0m 0s) (1000 100%) 0.1791 39.50%


In [39]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 38.80%
0m 7s (- 2m 25s) (50 5%) 0.2002 
0m 9s (- 1m 22s) (100 10%) 0.2120 
0m 10s (- 1m 1s) (150 15%) 0.2259 
0m 12s (- 0m 49s) (200 20%) 0.2215 
0m 14s (- 0m 42s) (250 25%) 0.1665 
0m 15s (- 0m 36s) (300 30%) 0.1648 
0m 17s (- 0m 31s) (350 35%) 0.2719 
0m 18s (- 0m 28s) (400 40%) 0.1703 
0m 20s (- 0m 25s) (450 45%) 0.1945 
0m 22s (- 0m 22s) (500 50%) 0.2626 41.00%
New best test accuracy! Model Updated!
0m 25s (- 0m 21s) (550 55%) 0.2261 
0m 27s (- 0m 18s) (600 60%) 0.2863 
0m 28s (- 0m 15s) (650 65%) 0.1968 
0m 30s (- 0m 13s) (700 70%) 0.2235 
0m 32s (- 0m 10s) (750 75%) 0.2144 
0m 34s (- 0m 8s) (800 80%) 0.1473 
0m 35s (- 0m 6s) (850 85%) 0.1795 
0m 37s (- 0m 4s) (900 90%) 0.1899 
0m 39s (- 0m 2s) (950 95%) 0.2427 
0m 41s (- 0m 0s) (1000 100%) 0.2064 44.50%
New best test accuracy! Model Updated!


In [40]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 46.40%
0m 7s (- 2m 25s) (50 5%) 0.2124 
0m 9s (- 1m 24s) (100 10%) 0.2538 
0m 11s (- 1m 2s) (150 15%) 0.1588 
0m 12s (- 0m 50s) (200 20%) 0.2729 
0m 14s (- 0m 43s) (250 25%) 0.1815 
0m 15s (- 0m 36s) (300 30%) 0.1614 
0m 16s (- 0m 31s) (350 35%) 0.1456 
0m 18s (- 0m 27s) (400 40%) 0.1608 
0m 19s (- 0m 24s) (450 45%) 0.2019 
0m 21s (- 0m 21s) (500 50%) 0.2446 55.00%
New best test accuracy! Model Updated!
0m 25s (- 0m 20s) (550 55%) 0.1920 
0m 27s (- 0m 18s) (600 60%) 0.3419 
0m 28s (- 0m 15s) (650 65%) 0.2084 
0m 30s (- 0m 13s) (700 70%) 0.2218 
0m 32s (- 0m 10s) (750 75%) 0.1660 
0m 33s (- 0m 8s) (800 80%) 0.2287 
0m 35s (- 0m 6s) (850 85%) 0.1556 
0m 36s (- 0m 4s) (900 90%) 0.2366 
0m 38s (- 0m 2s) (950 95%) 0.2299 
0m 39s (- 0m 0s) (1000 100%) 0.1696 51.00%


In [41]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 46.60%
0m 7s (- 2m 18s) (50 5%) 0.2130 
0m 8s (- 1m 20s) (100 10%) 0.1342 
0m 10s (- 0m 58s) (150 15%) 0.3052 
0m 11s (- 0m 46s) (200 20%) 0.1433 
0m 12s (- 0m 38s) (250 25%) 0.1490 
0m 14s (- 0m 34s) (300 30%) 0.1744 
0m 16s (- 0m 29s) (350 35%) 0.2392 
0m 17s (- 0m 26s) (400 40%) 0.1450 
0m 18s (- 0m 22s) (450 45%) 0.1357 
0m 20s (- 0m 20s) (500 50%) 0.2777 53.00%
New best test accuracy! Model Updated!
0m 24s (- 0m 19s) (550 55%) 0.2106 
0m 25s (- 0m 17s) (600 60%) 0.1826 
0m 27s (- 0m 14s) (650 65%) 0.1605 
0m 29s (- 0m 12s) (700 70%) 0.1257 
0m 31s (- 0m 10s) (750 75%) 0.1791 
0m 32s (- 0m 8s) (800 80%) 0.1490 
0m 34s (- 0m 6s) (850 85%) 0.1264 
0m 36s (- 0m 4s) (900 90%) 0.1492 
0m 37s (- 0m 1s) (950 95%) 0.1262 
0m 39s (- 0m 0s) (1000 100%) 0.2390 46.00%


In [42]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 49.40%
0m 6s (- 2m 10s) (50 5%) 0.1686 
0m 8s (- 1m 15s) (100 10%) 0.1646 
0m 9s (- 0m 56s) (150 15%) 0.2086 
0m 11s (- 0m 45s) (200 20%) 0.2002 
0m 13s (- 0m 39s) (250 25%) 0.2141 
0m 14s (- 0m 33s) (300 30%) 0.1446 
0m 15s (- 0m 29s) (350 35%) 0.2058 
0m 17s (- 0m 26s) (400 40%) 0.1376 
0m 19s (- 0m 23s) (450 45%) 0.1801 
0m 20s (- 0m 20s) (500 50%) 0.1411 46.50%
0m 24s (- 0m 19s) (550 55%) 0.2378 
0m 26s (- 0m 17s) (600 60%) 0.1953 
0m 27s (- 0m 14s) (650 65%) 0.1446 
0m 29s (- 0m 12s) (700 70%) 0.1643 
0m 30s (- 0m 10s) (750 75%) 0.1611 
0m 32s (- 0m 8s) (800 80%) 0.1360 
0m 33s (- 0m 5s) (850 85%) 0.2002 
0m 35s (- 0m 3s) (900 90%) 0.1191 
0m 36s (- 0m 1s) (950 95%) 0.1565 
0m 38s (- 0m 0s) (1000 100%) 0.2138 49.50%
New best test accuracy! Model Updated!


In [43]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 46.80%
0m 6s (- 2m 5s) (50 5%) 0.1598 
0m 7s (- 1m 10s) (100 10%) 0.1711 
0m 9s (- 0m 52s) (150 15%) 0.1739 
0m 10s (- 0m 41s) (200 20%) 0.2292 
0m 11s (- 0m 35s) (250 25%) 0.1600 
0m 13s (- 0m 31s) (300 30%) 0.1859 
0m 14s (- 0m 27s) (350 35%) 0.1444 
0m 15s (- 0m 23s) (400 40%) 0.1297 
0m 17s (- 0m 20s) (450 45%) 0.2255 
0m 18s (- 0m 18s) (500 50%) 0.1686 53.00%
New best test accuracy! Model Updated!
0m 22s (- 0m 18s) (550 55%) 0.3106 
0m 24s (- 0m 16s) (600 60%) 0.1345 
0m 26s (- 0m 14s) (650 65%) 0.1987 
0m 27s (- 0m 11s) (700 70%) 0.1157 
0m 29s (- 0m 9s) (750 75%) 0.2089 
0m 30s (- 0m 7s) (800 80%) 0.1541 
0m 32s (- 0m 5s) (850 85%) 0.2308 
0m 34s (- 0m 3s) (900 90%) 0.1932 
0m 35s (- 0m 1s) (950 95%) 0.1561 
0m 37s (- 0m 0s) (1000 100%) 0.1167 58.50%
New best test accuracy! Model Updated!


In [46]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 52.60%
0m 6s (- 2m 3s) (50 5%) 0.2108 
0m 7s (- 1m 11s) (100 10%) 0.2532 
0m 9s (- 0m 52s) (150 15%) 0.1391 
0m 10s (- 0m 43s) (200 20%) 0.1214 
0m 12s (- 0m 36s) (250 25%) 0.1337 
0m 13s (- 0m 32s) (300 30%) 0.1445 
0m 15s (- 0m 28s) (350 35%) 0.1393 
0m 17s (- 0m 25s) (400 40%) 0.1671 
0m 18s (- 0m 22s) (450 45%) 0.1361 
0m 20s (- 0m 20s) (500 50%) 0.1671 54.50%
New best test accuracy! Model Updated!
0m 24s (- 0m 19s) (550 55%) 0.2198 
0m 25s (- 0m 17s) (600 60%) 0.2037 
0m 27s (- 0m 14s) (650 65%) 0.1366 
0m 29s (- 0m 12s) (700 70%) 0.1753 
0m 30s (- 0m 10s) (750 75%) 0.1868 
0m 32s (- 0m 8s) (800 80%) 0.1816 
0m 33s (- 0m 5s) (850 85%) 0.1395 
0m 35s (- 0m 3s) (900 90%) 0.1649 
0m 37s (- 0m 1s) (950 95%) 0.1438 
0m 38s (- 0m 0s) (1000 100%) 0.2649 43.50%


In [47]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 53.20%
0m 7s (- 2m 13s) (50 5%) 0.1740 
0m 8s (- 1m 16s) (100 10%) 0.2546 
0m 9s (- 0m 56s) (150 15%) 0.1654 
0m 11s (- 0m 45s) (200 20%) 0.1917 
0m 12s (- 0m 38s) (250 25%) 0.1508 
0m 14s (- 0m 33s) (300 30%) 0.1635 
0m 15s (- 0m 28s) (350 35%) 0.1310 
0m 17s (- 0m 25s) (400 40%) 0.1535 
0m 18s (- 0m 22s) (450 45%) 0.1323 
0m 19s (- 0m 19s) (500 50%) 0.1954 45.00%
0m 23s (- 0m 19s) (550 55%) 0.1910 
0m 24s (- 0m 16s) (600 60%) 0.1677 
0m 25s (- 0m 13s) (650 65%) 0.1771 
0m 27s (- 0m 11s) (700 70%) 0.3031 
0m 28s (- 0m 9s) (750 75%) 0.1073 
0m 30s (- 0m 7s) (800 80%) 0.1413 
0m 32s (- 0m 5s) (850 85%) 0.3192 
0m 34s (- 0m 3s) (900 90%) 0.1242 
0m 35s (- 0m 1s) (950 95%) 0.1387 
0m 37s (- 0m 0s) (1000 100%) 0.1304 56.00%
New best test accuracy! Model Updated!


In [48]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 56.00%
0m 7s (- 2m 18s) (50 5%) 0.2061 
0m 8s (- 1m 19s) (100 10%) 0.1653 
0m 10s (- 0m 58s) (150 15%) 0.2188 
0m 11s (- 0m 47s) (200 20%) 0.1021 
0m 13s (- 0m 40s) (250 25%) 0.2094 
0m 14s (- 0m 34s) (300 30%) 0.1334 
0m 16s (- 0m 30s) (350 35%) 0.1180 
0m 17s (- 0m 26s) (400 40%) 0.0960 
0m 19s (- 0m 23s) (450 45%) 0.1640 
0m 21s (- 0m 21s) (500 50%) 0.1659 54.00%
0m 24s (- 0m 20s) (550 55%) 0.1184 
0m 26s (- 0m 17s) (600 60%) 0.1947 
0m 28s (- 0m 15s) (650 65%) 0.1476 
0m 29s (- 0m 12s) (700 70%) 0.1473 
0m 31s (- 0m 10s) (750 75%) 0.1517 
0m 32s (- 0m 8s) (800 80%) 0.1515 
0m 34s (- 0m 6s) (850 85%) 0.1147 
0m 35s (- 0m 3s) (900 90%) 0.1350 
0m 37s (- 0m 1s) (950 95%) 0.1244 
0m 39s (- 0m 0s) (1000 100%) 0.2621 53.50%


In [49]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 53.60%
0m 7s (- 2m 13s) (50 5%) 0.1741 
0m 8s (- 1m 16s) (100 10%) 0.1457 
0m 10s (- 0m 57s) (150 15%) 0.1384 
0m 11s (- 0m 47s) (200 20%) 0.1708 
0m 13s (- 0m 40s) (250 25%) 0.2015 
0m 15s (- 0m 35s) (300 30%) 0.1181 
0m 16s (- 0m 30s) (350 35%) 0.1169 
0m 18s (- 0m 27s) (400 40%) 0.1192 
0m 19s (- 0m 24s) (450 45%) 0.1581 
0m 21s (- 0m 21s) (500 50%) 0.1294 55.50%
New best test accuracy! Model Updated!
0m 25s (- 0m 21s) (550 55%) 0.3228 
0m 26s (- 0m 17s) (600 60%) 0.1564 
0m 28s (- 0m 15s) (650 65%) 0.1851 
0m 30s (- 0m 13s) (700 70%) 0.1400 
0m 32s (- 0m 10s) (750 75%) 0.1157 
0m 33s (- 0m 8s) (800 80%) 0.0919 
0m 35s (- 0m 6s) (850 85%) 0.1791 
0m 36s (- 0m 4s) (900 90%) 0.1821 
0m 38s (- 0m 2s) (950 95%) 0.1660 
0m 39s (- 0m 0s) (1000 100%) 0.1515 57.50%
New best test accuracy! Model Updated!


In [51]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.00005)

Best evaluation accuracy: 50.60%
0m 6s (- 1m 56s) (50 5%) 0.1681 
0m 7s (- 1m 8s) (100 10%) 0.1616 
0m 8s (- 0m 50s) (150 15%) 0.1589 
0m 10s (- 0m 41s) (200 20%) 0.1567 
0m 11s (- 0m 34s) (250 25%) 0.1570 
0m 12s (- 0m 29s) (300 30%) 0.1293 
0m 14s (- 0m 26s) (350 35%) 0.1187 
0m 15s (- 0m 23s) (400 40%) 0.1470 
0m 17s (- 0m 20s) (450 45%) 0.1623 
0m 18s (- 0m 18s) (500 50%) 0.1285 66.00%
New best test accuracy! Model Updated!
0m 21s (- 0m 17s) (550 55%) 0.1614 
0m 22s (- 0m 15s) (600 60%) 0.1500 
0m 24s (- 0m 13s) (650 65%) 0.1220 
0m 25s (- 0m 10s) (700 70%) 0.2418 
0m 26s (- 0m 8s) (750 75%) 0.1321 
0m 27s (- 0m 6s) (800 80%) 0.1227 
0m 29s (- 0m 5s) (850 85%) 0.1624 
0m 30s (- 0m 3s) (900 90%) 0.1249 
0m 31s (- 0m 1s) (950 95%) 0.1230 
0m 33s (- 0m 0s) (1000 100%) 0.2338 56.50%


In [53]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.00005)

Best evaluation accuracy: 53.80%
0m 5s (- 1m 46s) (50 5%) 0.1352 
0m 6s (- 1m 2s) (100 10%) 0.1649 
0m 8s (- 0m 46s) (150 15%) 0.1558 
0m 9s (- 0m 38s) (200 20%) 0.1909 
0m 10s (- 0m 32s) (250 25%) 0.1292 
0m 11s (- 0m 27s) (300 30%) 0.1942 
0m 13s (- 0m 24s) (350 35%) 0.1828 
0m 14s (- 0m 21s) (400 40%) 0.1905 
0m 15s (- 0m 19s) (450 45%) 0.2455 
0m 17s (- 0m 17s) (500 50%) 0.1309 59.50%
New best test accuracy! Model Updated!
0m 20s (- 0m 16s) (550 55%) 0.1037 
0m 21s (- 0m 14s) (600 60%) 0.1831 
0m 23s (- 0m 12s) (650 65%) 0.1317 
0m 24s (- 0m 10s) (700 70%) 0.1958 
0m 25s (- 0m 8s) (750 75%) 0.1156 
0m 26s (- 0m 6s) (800 80%) 0.1072 
0m 27s (- 0m 4s) (850 85%) 0.1100 
0m 29s (- 0m 3s) (900 90%) 0.1209 
0m 30s (- 0m 1s) (950 95%) 0.1274 
0m 31s (- 0m 0s) (1000 100%) 0.1627 59.50%


In [54]:
trainIters(encoder1, attn_decoder1, 2000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 58.00%
0m 6s (- 4m 30s) (50 2%) 0.1182 
0m 8s (- 2m 38s) (100 5%) 0.0996 
0m 9s (- 1m 58s) (150 7%) 0.1333 
0m 11s (- 1m 40s) (200 10%) 0.0956 
0m 12s (- 1m 28s) (250 12%) 0.1649 
0m 14s (- 1m 19s) (300 15%) 0.1581 
0m 15s (- 1m 13s) (350 17%) 0.1896 
0m 17s (- 1m 8s) (400 20%) 0.1788 
0m 18s (- 1m 4s) (450 22%) 0.1043 
0m 20s (- 1m 0s) (500 25%) 0.1785 64.50%
New best test accuracy! Model Updated!
0m 24s (- 1m 4s) (550 27%) 0.1784 
0m 27s (- 1m 3s) (600 30%) 0.1202 
0m 28s (- 0m 59s) (650 32%) 0.1119 
0m 30s (- 0m 56s) (700 35%) 0.1697 
0m 31s (- 0m 53s) (750 37%) 0.1251 
0m 33s (- 0m 50s) (800 40%) 0.1143 
0m 35s (- 0m 47s) (850 42%) 0.1176 
0m 36s (- 0m 45s) (900 45%) 0.1110 
0m 38s (- 0m 42s) (950 47%) 0.1630 
0m 39s (- 0m 39s) (1000 50%) 0.1164 58.00%
0m 43s (- 0m 39s) (1050 52%) 0.1215 
0m 45s (- 0m 37s) (1100 55%) 0.1304 
0m 47s (- 0m 34s) (1150 57%) 0.1700 
0m 48s (- 0m 32s) (1200 60%) 0.1142 
0m 50s (- 0m 30s) (1250 62%) 0.1691 
0m 51s (- 0m 27s) (130

In [55]:
trainIters(encoder1, attn_decoder1, 2000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 59.60%
0m 7s (- 4m 40s) (50 2%) 0.1336 
0m 8s (- 2m 47s) (100 5%) 0.1352 
0m 10s (- 2m 8s) (150 7%) 0.2674 
0m 12s (- 1m 48s) (200 10%) 0.1543 
0m 13s (- 1m 35s) (250 12%) 0.1080 
0m 15s (- 1m 26s) (300 15%) 0.1191 
0m 16s (- 1m 19s) (350 17%) 0.1157 
0m 18s (- 1m 14s) (400 20%) 0.1292 
0m 20s (- 1m 9s) (450 22%) 0.1310 
0m 21s (- 1m 5s) (500 25%) 0.1737 64.50%
New best test accuracy! Model Updated!
0m 25s (- 1m 8s) (550 27%) 0.1331 
0m 27s (- 1m 4s) (600 30%) 0.0967 
0m 29s (- 1m 0s) (650 32%) 0.1792 
0m 30s (- 0m 57s) (700 35%) 0.1233 
0m 32s (- 0m 53s) (750 37%) 0.1973 
0m 33s (- 0m 50s) (800 40%) 0.1209 
0m 35s (- 0m 47s) (850 42%) 0.1149 
0m 36s (- 0m 45s) (900 45%) 0.1116 
0m 38s (- 0m 42s) (950 47%) 0.1702 
0m 40s (- 0m 40s) (1000 50%) 0.1111 61.00%
0m 43s (- 0m 39s) (1050 52%) 0.2020 
0m 45s (- 0m 37s) (1100 55%) 0.1107 
0m 47s (- 0m 34s) (1150 57%) 0.1091 
0m 48s (- 0m 32s) (1200 60%) 0.1524 
0m 49s (- 0m 29s) (1250 62%) 0.1989 
0m 51s (- 0m 27s) (130

In [56]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 60.20%
0m 7s (- 2m 16s) (50 5%) 0.1058 
0m 8s (- 1m 19s) (100 10%) 0.1121 
0m 9s (- 0m 56s) (150 15%) 0.1149 
0m 11s (- 0m 44s) (200 20%) 0.2265 
0m 12s (- 0m 36s) (250 25%) 0.1737 
0m 13s (- 0m 31s) (300 30%) 0.1746 
0m 14s (- 0m 27s) (350 35%) 0.1108 
0m 16s (- 0m 24s) (400 40%) 0.1897 
0m 17s (- 0m 21s) (450 45%) 0.1772 
0m 19s (- 0m 19s) (500 50%) 0.1176 58.50%
0m 23s (- 0m 19s) (550 55%) 0.1222 
0m 25s (- 0m 17s) (600 60%) 0.1084 
0m 27s (- 0m 14s) (650 65%) 0.1299 
0m 29s (- 0m 12s) (700 70%) 0.1034 
0m 30s (- 0m 10s) (750 75%) 0.1028 
0m 32s (- 0m 8s) (800 80%) 0.1743 
0m 34s (- 0m 6s) (850 85%) 0.1880 
0m 35s (- 0m 3s) (900 90%) 0.1671 
0m 37s (- 0m 1s) (950 95%) 0.2005 
0m 38s (- 0m 0s) (1000 100%) 0.1170 64.00%
New best test accuracy! Model Updated!


In [57]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 59.20%
0m 8s (- 2m 44s) (50 5%) 0.1224 
0m 10s (- 1m 35s) (100 10%) 0.1838 
0m 12s (- 1m 10s) (150 15%) 0.1206 
0m 14s (- 0m 56s) (200 20%) 0.1294 
0m 15s (- 0m 46s) (250 25%) 0.1856 
0m 17s (- 0m 39s) (300 30%) 0.1204 
0m 18s (- 0m 35s) (350 35%) 0.1005 
0m 20s (- 0m 30s) (400 40%) 0.1113 
0m 22s (- 0m 27s) (450 45%) 0.1546 
0m 23s (- 0m 23s) (500 50%) 0.1693 59.00%
0m 27s (- 0m 22s) (550 55%) 0.1748 
0m 29s (- 0m 19s) (600 60%) 0.1348 
0m 31s (- 0m 16s) (650 65%) 0.1328 
0m 32s (- 0m 14s) (700 70%) 0.1601 
0m 34s (- 0m 11s) (750 75%) 0.1406 
0m 36s (- 0m 9s) (800 80%) 0.1232 
0m 37s (- 0m 6s) (850 85%) 0.1172 
0m 39s (- 0m 4s) (900 90%) 0.1395 
0m 41s (- 0m 2s) (950 95%) 0.1683 
0m 42s (- 0m 0s) (1000 100%) 0.1467 61.00%
New best test accuracy! Model Updated!


In [58]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 63.00%
0m 8s (- 2m 42s) (50 5%) 0.1006 
0m 10s (- 1m 31s) (100 10%) 0.1370 
0m 11s (- 1m 5s) (150 15%) 0.1563 
0m 13s (- 0m 53s) (200 20%) 0.2312 
0m 14s (- 0m 43s) (250 25%) 0.1033 
0m 15s (- 0m 37s) (300 30%) 0.0932 
0m 17s (- 0m 31s) (350 35%) 0.1271 
0m 18s (- 0m 27s) (400 40%) 0.1340 
0m 19s (- 0m 23s) (450 45%) 0.1161 
0m 21s (- 0m 21s) (500 50%) 0.1882 53.00%
0m 25s (- 0m 20s) (550 55%) 0.1273 
0m 27s (- 0m 18s) (600 60%) 0.1173 
0m 29s (- 0m 15s) (650 65%) 0.1819 
0m 30s (- 0m 13s) (700 70%) 0.1155 
0m 32s (- 0m 10s) (750 75%) 0.1126 
0m 34s (- 0m 8s) (800 80%) 0.1233 
0m 35s (- 0m 6s) (850 85%) 0.1353 
0m 37s (- 0m 4s) (900 90%) 0.1156 
0m 39s (- 0m 2s) (950 95%) 0.0980 
0m 41s (- 0m 0s) (1000 100%) 0.1257 64.50%
New best test accuracy! Model Updated!


In [60]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.00001)

Best evaluation accuracy: 56.20%
0m 7s (- 2m 19s) (50 5%) 0.1848 
0m 8s (- 1m 20s) (100 10%) 0.1149 
0m 10s (- 1m 1s) (150 15%) 0.1831 
0m 12s (- 0m 49s) (200 20%) 0.1511 
0m 13s (- 0m 41s) (250 25%) 0.1180 
0m 15s (- 0m 35s) (300 30%) 0.1304 
0m 16s (- 0m 30s) (350 35%) 0.1368 
0m 18s (- 0m 27s) (400 40%) 0.1210 
0m 19s (- 0m 24s) (450 45%) 0.2154 
0m 21s (- 0m 21s) (500 50%) 0.1164 56.50%
New best test accuracy! Model Updated!
0m 24s (- 0m 20s) (550 55%) 0.0987 
0m 26s (- 0m 17s) (600 60%) 0.2404 
0m 27s (- 0m 14s) (650 65%) 0.1223 
0m 29s (- 0m 12s) (700 70%) 0.1378 
0m 30s (- 0m 10s) (750 75%) 0.1087 
0m 32s (- 0m 8s) (800 80%) 0.1716 
0m 34s (- 0m 6s) (850 85%) 0.1426 
0m 35s (- 0m 3s) (900 90%) 0.1665 
0m 37s (- 0m 1s) (950 95%) 0.1370 
0m 38s (- 0m 0s) (1000 100%) 0.1409 59.50%
New best test accuracy! Model Updated!


---

### Samples Evaluation

In [61]:
if os.path.exists("saved_models/encoder_" + MODEL_VERSION):
    encoder2 = torch.load("saved_models/encoder_" + MODEL_VERSION)
    decoder2 = torch.load("saved_models/decoder_" + MODEL_VERSION)
evaluateAccuracy(encoder2, decoder2, n=2000)

0.60150000000000003

In [62]:
evaluateRandomly(encoder2, decoder2)

> look thrice and jump around right
= I_LOOK I_LOOK I_LOOK I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP
< I_LOOK I_LOOK I_LOOK I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP <EOS>

> jump and turn right twice
= I_JUMP I_TURN_RIGHT I_TURN_RIGHT
< I_JUMP I_TURN_RIGHT I_TURN_RIGHT <EOS>

> jump right twice and walk right twice
= I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK
< I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK <EOS>

> jump right thrice and jump right twice
= I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP
< I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP <EOS>

> jump opposite left twice after run left thrice
= I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP
< I_TURN_LEFT I_

---