In [1]:
%matplotlib inline

## SCAN Add-Prim JUMP Experiment
*************************************************************

Reference: http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html


**Requirements**

* Python 3.6
* PyTorch 0.4

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pickle
import os
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
print("Device is using", device)

Device is using cpu


Loading data files
==================

In [4]:
# Reserved vocabulary indices for the start- and end-of-sequence markers.
SOS_token = 0
EOS_token = 1
# Task identifier; it is interpolated into the data file names by readLangs.
TASK_NAME = "addprim-jump"


class Lang:
    """Vocabulary for one side of the data: word <-> index maps plus word counts.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers in
    ``index2word``; the markers are not entered in ``word2index``.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of ``word``, giving it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

To read the data file we will split the file into lines, and then split
lines into pairs. 



In [5]:
def readLangs(lang1, lang2, reverse=False, trainOrtest='train',
              data_dir='/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/processed'):
    """Read one tab-separated data split and create empty vocabularies for it.

    The file read is ``<data_dir>/<trainOrtest>-<TASK_NAME>_<lang1>-<lang2>.txt``.
    Each line is split on tab characters into a list of sentences; no text
    normalisation is applied.

    Args:
        lang1: name of the language in the first column.
        lang2: name of the language in the second column.
        reverse: when True, each pair is reversed and the roles of the two
            languages are swapped.
        trainOrtest: split prefix used in the file name (e.g. 'train', 'test').
        data_dir: directory containing the processed data files. Defaults to
            the location the notebook was originally written against; pass a
            different directory to run it elsewhere.

    Returns:
        (input_lang, output_lang, pairs) where the two ``Lang`` objects are
        still empty and ``pairs`` is a list of lists of sentences.
    """
    print("Reading lines...")

    file_name = '{}-{}_{}-{}.txt'.format(trainOrtest, TASK_NAME, lang1, lang2)
    # A context manager closes the file even if reading fails.
    with open(os.path.join(data_dir, file_name), encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its tab-separated sentences
    pairs = [line.split('\t') for line in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 50
# PRED_LENGTH = 50

def filterPair(p):
    """Tell whether both sentences of pair ``p`` have fewer than MAX_LENGTH words."""
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    """Keep, in order, only the pairs accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Optionally filter pairs by length (`filterPairs`; currently disabled — no text normalization is applied)
-  Make word lists from sentences in pairs




In [7]:
def prepareData(lang1, lang2, reverse=False, dataFrom='train'):
    """Load one data split and build both vocabularies from its sentence pairs.

    Args:
        lang1: name of the first-column language.
        lang2: name of the second-column language.
        reverse: accepted for API compatibility.
            NOTE(review): this value is not forwarded -- readLangs is always
            called with reverse=False, so passing True has no effect.
        dataFrom: split to read; passed to readLangs as ``trainOrtest``.

    Returns:
        (input_lang, output_lang, pairs) with the vocabularies populated from
        the first and second element of every pair respectively.
    """
    input_lang, output_lang, pairs = readLangs(
        lang1, lang2, reverse=False, trainOrtest=dataFrom)
    print("Read %s sentence pairs" % len(pairs))
    # Length filtering (filterPairs) is disabled, so the count below is the
    # same as the one printed above.
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for entry in pairs:
        input_lang.addSentence(entry[0])
        output_lang.addSentence(entry[1])

    print("Counted words:")
    for vocab in (input_lang, output_lang):
        print(vocab.name, vocab.n_words)
    return input_lang, output_lang, pairs


# Build the training vocabularies and pairs, then show one random example.
# The third positional argument is `reverse` (see the note in prepareData).
input_lang, output_lang, pairs = prepareData('in', 'out', True)
print(random.choice(pairs))

Reading lines...
Read 37046 sentence pairs
Trimmed to 37046 sentence pairs
Counting words...
Counted words:
in 15
out 8
['jump thrice after jump right thrice', 'I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_JUMP I_JUMP I_JUMP']


Model
=================

The model we are using is a GRU encoder-decoder seq2seq model with an attention mechanism. To tackle the zero-shot generalization task, we initialize the encoder's embedding layer with pre-trained word embeddings from either GloVe or Google Word2Vec.

In [8]:
EMBEDDEING_SOURCE = 'glove'
hidden_size = 50

# Choose the pickle holding the word -> vector table for the selected source.
if EMBEDDEING_SOURCE == 'google':
    emb_path = '/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_GoogleNews300Negative.pkl'
else:
    emb_path = '/Users/Viola/CDS/AAI/Project/SCAN-Learn/data/emb_pretrained/embedding_raw{}d.pkl'.format(hidden_size)

# NOTE: pickle.load can execute code stored in the file; only load trusted files.
with open(emb_path, 'rb') as handle:
    b = pickle.load(handle)

# One row per input-vocabulary index; rows start as zeros.
pretrained_emb = np.zeros((input_lang.n_words, hidden_size))
use_google = EMBEDDEING_SOURCE == 'google'
for index, word in input_lang.index2word.items():
    if word == 'SOS':
        # The start marker keeps its all-zero row.
        continue

    if word == 'EOS' and not use_google:
        key = '.'    # non-Google tables: the '.' vector stands in for EOS
    elif word == 'and' and use_google:
        key = 'AND'  # Google table: 'and' is looked up under 'AND'
    else:
        key = word
    pretrained_emb[index] = b[key]

The Encoder
-----------

The encoder of this seq2seq network is a GRU network. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.




In [10]:
# When True, EncoderRNN copies `pretrained_emb` into its embedding layer.
EMBEDDEING_PRETRAINED = True
# Value assigned to `requires_grad` of the encoder embedding weights when the
# pretrained embeddings are used (False keeps them frozen during training).
WEIGHT_UPDATE = False

# Suffix of the encoder/decoder checkpoint file names under saved_models/.
MODEL_VERSION = 'T0.4_glv50'

In [11]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call.

    When ``EMBEDDEING_PRETRAINED`` is set, the embedding table is filled from
    the module-level ``pretrained_emb`` array and its ``requires_grad`` flag is
    set to ``WEIGHT_UPDATE``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        if EMBEDDEING_PRETRAINED:
            pretrained_weights = torch.from_numpy(pretrained_emb)
            self.embedding.weight.data.copy_(pretrained_weights)
            self.embedding.weight.requires_grad = WEIGHT_UPDATE

        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, reshape it to (1, 1, -1) and run one GRU step.

        Returns the GRU's ``(output, hidden)`` pair.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

The Decoder
-----------

The decoder is a GRU network with attention mechanism that takes the last output of the encoder and
outputs a sequence of words to create the translation.

First we calculate a set of *attention weights*. These will be multiplied by
the encoder output vectors to create a weighted combination. The result
(called ``attn_applied`` in the code) should contain information about
that specific part of the input sequence, and thus help the decoder
choose the right output words.

Calculating the attention weights is done with another feed-forward
layer ``attn``, using the decoder's input and hidden state as inputs.
Because there are sentences of all sizes in the training data, to
actually create and train this layer we have to choose a maximum
sentence length (input length, for encoder outputs) that it can apply
to. Sentences of the maximum length will use all the attention weights,
while shorter sentences will only use the first few.




In [12]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    The attention layer emits ``max_length`` weights per step, so
    ``encoder_outputs`` passed to ``forward`` needs ``max_length`` rows for the
    batched matrix product to line up (``train``/``evaluate`` allocate it so).
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Layers are created in a fixed order so that parameter initialisation
        # draws from the random generator in the same sequence.
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            (log-probabilities over the output vocabulary, new hidden state,
            attention weights used for this step).
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention scores come from the current input embedding and hidden state.
        attn_input = torch.cat((token_vec[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_input), dim=1)

        # Weighted combination of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((token_vec[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

Training
========

Preparing Training Data
-----------------------

To train, for each pair we need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we append the
EOS token to both sequences.




In [13]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated word of ``sentence`` to its index in ``lang``.

    Raises KeyError for a word missing from ``lang.word2index``.
    """
    lookup = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending with EOS_token.

    The result has dtype ``torch.long``, lives on ``device`` and is shaped
    (number_of_words + 1, 1).
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Turn a (source, target) sentence pair into an (input_tensor, target_tensor) tuple.

    The source is encoded with the module-level ``input_lang`` and the target
    with ``output_lang``.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (tensorFromSentence(input_lang, source_sentence),
            tensorFromSentence(output_lang, target_sentence))

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

We use teacher forcing to help the model converge faster, and lower the teacher-forcing ratio in later training runs.




In [14]:
# Probability that a training step feeds the ground-truth token (rather than
# the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.8


def train(input_tensor, target_tensor, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, 
          max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    The encoder reads the input one token at a time; its per-step outputs are
    stored in a (max_length, hidden_size) buffer for the decoder's attention.
    The decoder starts from SOS_token and the encoder's final hidden state.
    With probability ``teacher_forcing_ratio`` the target token is fed as the
    next decoder input; otherwise the decoder's top prediction is fed back and
    decoding stops once it predicts EOS_token.

    Returns:
        The summed per-step loss divided by the full target length (also when
        decoding stopped early).
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Rows beyond input_length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for position in range(input_length):
        step_output, encoder_hidden = encoder(input_tensor[position], encoder_hidden)
        encoder_outputs[position] = step_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[step])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[step]
            continue

        # Otherwise feed back the model's own best guess, detached from history
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

Helper function for timing




In [15]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as '<minutes>m <seconds>s' (both truncated to ints)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed; must be non-zero because the
            elapsed time is divided by it.

    Returns:
        '<elapsed> (- <remaining>)', both parts formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

### Training iteration

In [16]:
def trainIters(encoder, decoder, n_iters, print_every=1000, eval_every=1000, learning_rate=0.001):
    """Train ``encoder``/``decoder`` for ``n_iters`` single-pair steps with checkpointing.

    If both checkpoint files for ``MODEL_VERSION`` exist under ``saved_models/``
    their weights are loaded *into the modules passed in*, so the caller's
    objects are the ones that get trained (previously the loaded copies
    silently replaced them inside this function and the caller's models were
    left untouched). Training pairs are drawn at random from the module-level
    ``pairs``. The running average loss is printed every ``print_every`` steps;
    at those steps that are also multiples of ``eval_every`` the accuracy on
    200 random evaluation pairs is printed and, when it beats the best
    accuracy seen so far, both modules are saved.

    Args:
        encoder: encoder module; trained in place.
        decoder: decoder module; trained in place.
        n_iters: number of training steps.
        print_every: interval (in steps) between loss reports.
        eval_every: interval (in steps) between accuracy evaluations; only
            checked on steps where the loss is reported.
        learning_rate: learning rate for both Adam optimisers.
    """
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    checkpoint_dir = "saved_models"
    encoder_path = "saved_models/encoder_" + MODEL_VERSION
    decoder_path = "saved_models/decoder_" + MODEL_VERSION

    if os.path.exists(encoder_path) and os.path.exists(decoder_path):
        # Checkpoints hold whole pickled modules. Copy their weights into the
        # caller's modules instead of rebinding the local names, and map the
        # tensors onto the current device so a checkpoint written on another
        # device still loads.
        encoder.load_state_dict(
            torch.load(encoder_path, map_location=device).state_dict())
        decoder.load_state_dict(
            torch.load(decoder_path, map_location=device).state_dict())

    best_test_acc = evaluateAccuracy(encoder, decoder, 500)
    print("Best evaluation accuracy: {0:.2f}%".format(best_test_acc * 100))

    # Frozen parameters (e.g. pretrained embeddings) are not given to the optimiser.
    trainable_encoder_params = [p for p in encoder.parameters() if p.requires_grad]

    encoder_optimizer = optim.Adam(trainable_encoder_params, lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[step - 1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg), end=' ')

            if step % eval_every == 0:
                test_acc = evaluateAccuracy(encoder, decoder, 200)
                print('{0:.2f}%'.format(test_acc * 100))

                if test_acc > best_test_acc:
                    # Create the checkpoint directory on first use so that
                    # saving does not fail on a fresh checkout.
                    os.makedirs(checkpoint_dir, exist_ok=True)
                    with open(encoder_path, "wb") as f:
                        torch.save(encoder, f)
                    with open(decoder_path, "wb") as f:
                        torch.save(decoder, f)
                    print("New best test accuracy! Model Updated!")
                    best_test_acc = test_acc

            else:
                print('')

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [17]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode ``sentence`` and return the predicted words and attentions.

    Both modules are switched to evaluation mode for the duration of the call
    so that the decoder's dropout layer is inactive and predictions are
    deterministic; their previous training/eval modes are restored afterwards.

    Args:
        encoder: encoder module.
        decoder: attention decoder module.
        sentence: input sentence made of words known to ``input_lang``.
        max_length: size of the encoder-output buffer and the maximum number of
            decoding steps.

    Returns:
        (decoded_words, attentions): the predicted output words, ending with
        '<EOS>' if the decoder emitted EOS_token within ``max_length`` steps,
        and the attention weights of the decoding steps that were run.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                # Feed the prediction back as the next decoder input
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Put the modules back in whatever mode the caller had them in, so
        # evaluating in the middle of training does not disable dropout there.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

We can evaluate random sentences from the test set and print out the
input, target, and output to make some subjective quality judgements:




In [18]:
# Load the test pairs WITHOUT replacing the training vocabularies: word
# indices are assigned in order of first appearance, and `pretrained_emb` was
# built from the training `input_lang`, so rebinding `input_lang`/`output_lang`
# to vocabularies built from the test file could pair tokens with the wrong
# embedding rows.
test_input_lang, test_output_lang, pairs_eval = prepareData('in', 'out', True, dataFrom='test')

# Every test token must already be known to the training vocabularies.
unseen_input_words = set(test_input_lang.word2index) - set(input_lang.word2index)
unseen_output_words = set(test_output_lang.word2index) - set(output_lang.word2index)
if unseen_input_words or unseen_output_words:
    raise ValueError("Test data contains tokens missing from the training vocabulary: "
                     "{} / {}".format(sorted(unseen_input_words), sorted(unseen_output_words)))

print(random.choice(pairs_eval))

Reading lines...
Read 15412 sentence pairs
Trimmed to 15412 sentence pairs
Counting words...
Counted words:
in 15
out 8
['turn opposite left and jump around left twice', 'I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP I_TURN_LEFT I_JUMP']


In [19]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print input ('>'), target ('=') and prediction ('<') for ``n`` random pairs of ``pairs_eval``."""
    for _ in range(n):
        sample = random.choice(pairs_eval)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words, _attentions = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(predicted_words))
        print('')

In [20]:
def evaluateAccuracy(encoder, decoder, n=10):
    """Estimate exact-match accuracy on ``n`` pairs drawn at random from ``pairs_eval``.

    A prediction counts as correct when its words, with a trailing '<EOS>'
    removed and joined by single spaces, equal the target sentence.

    Returns:
        The mean of the 0/1 hit indicators.
    """
    hits = []
    for _ in range(n):
        sample = random.choice(pairs_eval)
        predicted, _attentions = evaluate(encoder, decoder, sample[0])

        if predicted[-1] == '<EOS>':
            predicted = predicted[:-1]

        hits.append(1 if ' '.join(predicted) == sample[1] else 0)
    return np.array(hits).mean()

Training and Evaluating
=======================


The model is initially trained with a high teacher-forcing ratio and a relatively large learning rate. Both the teacher-forcing ratio and the learning rate are lowered in later training runs as the model approaches the optimum.

#### The model achieves 97% accuracy rate for the best test sample evaluation, and is 94% correct on average for the testset.

In [21]:
# Start with mostly teacher-forced decoding; `train` reads this module-level value.
teacher_forcing_ratio = 0.8

# Fresh encoder/decoder sized to the input/output vocabularies, moved to `device`.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 5000 single-pair steps: loss printed every 50 steps, accuracy sampled every 500.
trainIters(encoder1, attn_decoder1, 5000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 0.00%
0m 15s (- 26m 1s) (50 1%) 1.9284 
0m 17s (- 14m 10s) (100 2%) 1.7474 
0m 19s (- 10m 22s) (150 3%) 1.6616 
0m 20s (- 8m 22s) (200 4%) 1.5682 
0m 22s (- 7m 6s) (250 5%) 1.4619 
0m 23s (- 6m 15s) (300 6%) 1.3767 
0m 25s (- 5m 38s) (350 7%) 1.3701 
0m 26s (- 5m 10s) (400 8%) 1.2315 
0m 28s (- 4m 47s) (450 9%) 1.2413 
0m 30s (- 4m 34s) (500 10%) 1.0623 0.00%
0m 39s (- 5m 16s) (550 11%) 1.3117 
0m 40s (- 4m 59s) (600 12%) 1.2106 
0m 42s (- 4m 43s) (650 13%) 1.1701 
0m 44s (- 4m 30s) (700 14%) 1.1185 
0m 45s (- 4m 17s) (750 15%) 1.1089 
0m 47s (- 4m 8s) (800 16%) 1.0465 
0m 49s (- 4m 0s) (850 17%) 1.1710 
0m 51s (- 3m 53s) (900 18%) 1.1640 
0m 52s (- 3m 45s) (950 19%) 1.0087 
0m 54s (- 3m 37s) (1000 20%) 1.0207 0.00%
1m 1s (- 3m 52s) (1050 21%) 1.0772 
1m 3s (- 3m 44s) (1100 22%) 0.8944 
1m 4s (- 3m 37s) (1150 23%) 0.9568 
1m 6s (- 3m 30s) (1200 24%) 1.1029 
1m 8s (- 3m 24s) (1250 25%) 0.9491 
1m 10s (- 3m 21s) (1300 26%) 1.0088 
1m 13s (- 3m 17s) (1350 27%) 1.

In [22]:
# Lower the teacher-forcing probability for this and the following training runs.
teacher_forcing_ratio = 0.5
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 0.60%
0m 10s (- 3m 19s) (50 5%) 0.6257 
0m 11s (- 1m 47s) (100 10%) 0.7002 
0m 13s (- 1m 16s) (150 15%) 0.6019 
0m 15s (- 1m 0s) (200 20%) 0.7238 
0m 16s (- 0m 49s) (250 25%) 0.6301 
0m 18s (- 0m 42s) (300 30%) 0.6415 
0m 19s (- 0m 36s) (350 35%) 0.7643 
0m 21s (- 0m 31s) (400 40%) 0.6502 
0m 22s (- 0m 27s) (450 45%) 0.6610 
0m 23s (- 0m 23s) (500 50%) 0.5823 1.00%
New best test accuracy! Model Updated!
0m 28s (- 0m 23s) (550 55%) 0.6141 
0m 30s (- 0m 20s) (600 60%) 0.5827 
0m 31s (- 0m 17s) (650 65%) 0.6119 
0m 33s (- 0m 14s) (700 70%) 0.7216 
0m 34s (- 0m 11s) (750 75%) 0.5672 
0m 36s (- 0m 9s) (800 80%) 0.6139 
0m 37s (- 0m 6s) (850 85%) 0.6459 
0m 38s (- 0m 4s) (900 90%) 0.5812 
0m 40s (- 0m 2s) (950 95%) 0.5373 
0m 41s (- 0m 0s) (1000 100%) 0.6255 2.00%
New best test accuracy! Model Updated!


In [23]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 2.40%
0m 5s (- 1m 42s) (50 5%) 0.5556 
0m 6s (- 0m 56s) (100 10%) 0.5471 
0m 7s (- 0m 41s) (150 15%) 0.5851 
0m 8s (- 0m 33s) (200 20%) 0.7022 
0m 9s (- 0m 28s) (250 25%) 0.6876 
0m 10s (- 0m 24s) (300 30%) 0.5488 
0m 11s (- 0m 20s) (350 35%) 0.5184 
0m 12s (- 0m 18s) (400 40%) 0.5934 
0m 13s (- 0m 15s) (450 45%) 0.6456 
0m 14s (- 0m 14s) (500 50%) 0.5957 1.50%
0m 17s (- 0m 14s) (550 55%) 0.5451 
0m 18s (- 0m 12s) (600 60%) 0.6204 
0m 20s (- 0m 10s) (650 65%) 0.5905 
0m 21s (- 0m 9s) (700 70%) 0.4906 
0m 23s (- 0m 7s) (750 75%) 0.5889 
0m 24s (- 0m 6s) (800 80%) 0.6090 
0m 26s (- 0m 4s) (850 85%) 0.5920 
0m 28s (- 0m 3s) (900 90%) 0.6193 
0m 29s (- 0m 1s) (950 95%) 0.6804 
0m 30s (- 0m 0s) (1000 100%) 0.5919 3.00%
New best test accuracy! Model Updated!


In [24]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 4.20%
0m 8s (- 2m 44s) (50 5%) 0.5123 
0m 10s (- 1m 30s) (100 10%) 0.5919 
0m 11s (- 1m 6s) (150 15%) 0.5420 
0m 13s (- 0m 53s) (200 20%) 0.5056 
0m 14s (- 0m 43s) (250 25%) 0.6662 
0m 16s (- 0m 37s) (300 30%) 0.6343 
0m 17s (- 0m 32s) (350 35%) 0.6375 
0m 19s (- 0m 28s) (400 40%) 0.5405 
0m 20s (- 0m 25s) (450 45%) 0.5012 
0m 22s (- 0m 22s) (500 50%) 0.5384 8.00%
New best test accuracy! Model Updated!
0m 26s (- 0m 21s) (550 55%) 0.4594 
0m 28s (- 0m 18s) (600 60%) 0.4834 
0m 29s (- 0m 15s) (650 65%) 0.4953 
0m 31s (- 0m 13s) (700 70%) 0.5596 
0m 32s (- 0m 10s) (750 75%) 0.5847 
0m 34s (- 0m 8s) (800 80%) 0.5266 
0m 35s (- 0m 6s) (850 85%) 0.5564 
0m 36s (- 0m 4s) (900 90%) 0.4839 
0m 38s (- 0m 2s) (950 95%) 0.5087 
0m 40s (- 0m 0s) (1000 100%) 0.5528 7.00%


In [26]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.001)

Best evaluation accuracy: 5.80%
0m 6s (- 1m 54s) (50 5%) 0.4246 
0m 6s (- 1m 2s) (100 10%) 0.4152 
0m 7s (- 0m 45s) (150 15%) 0.4982 
0m 9s (- 0m 36s) (200 20%) 0.5420 
0m 9s (- 0m 29s) (250 25%) 0.4766 
0m 10s (- 0m 25s) (300 30%) 0.5508 
0m 11s (- 0m 21s) (350 35%) 0.4823 
0m 12s (- 0m 18s) (400 40%) 0.4037 
0m 13s (- 0m 16s) (450 45%) 0.4792 
0m 14s (- 0m 14s) (500 50%) 0.5292 4.00%
0m 17s (- 0m 14s) (550 55%) 0.5050 
0m 18s (- 0m 12s) (600 60%) 0.3935 
0m 19s (- 0m 10s) (650 65%) 0.4860 
0m 20s (- 0m 8s) (700 70%) 0.4790 
0m 21s (- 0m 7s) (750 75%) 0.4174 
0m 22s (- 0m 5s) (800 80%) 0.4228 
0m 23s (- 0m 4s) (850 85%) 0.5064 
0m 24s (- 0m 2s) (900 90%) 0.4953 
0m 25s (- 0m 1s) (950 95%) 0.4291 
0m 26s (- 0m 0s) (1000 100%) 0.3717 8.00%
New best test accuracy! Model Updated!


In [27]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 7.20%
0m 6s (- 1m 54s) (50 5%) 0.4579 
0m 6s (- 1m 1s) (100 10%) 0.4575 
0m 7s (- 0m 44s) (150 15%) 0.4038 
0m 8s (- 0m 35s) (200 20%) 0.4190 
0m 9s (- 0m 28s) (250 25%) 0.4342 
0m 10s (- 0m 24s) (300 30%) 0.3723 
0m 12s (- 0m 22s) (350 35%) 0.4845 
0m 13s (- 0m 19s) (400 40%) 0.4840 
0m 14s (- 0m 17s) (450 45%) 0.4308 
0m 15s (- 0m 15s) (500 50%) 0.3744 5.50%
0m 18s (- 0m 14s) (550 55%) 0.5235 
0m 19s (- 0m 12s) (600 60%) 0.4299 
0m 20s (- 0m 10s) (650 65%) 0.4329 
0m 21s (- 0m 9s) (700 70%) 0.4555 
0m 22s (- 0m 7s) (750 75%) 0.4128 
0m 23s (- 0m 5s) (800 80%) 0.3845 
0m 25s (- 0m 4s) (850 85%) 0.4594 
0m 26s (- 0m 2s) (900 90%) 0.4676 
0m 27s (- 0m 1s) (950 95%) 0.3726 
0m 28s (- 0m 0s) (1000 100%) 0.3975 10.50%
New best test accuracy! Model Updated!


In [28]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0005)

Best evaluation accuracy: 9.20%
0m 6s (- 2m 11s) (50 5%) 0.4176 
0m 8s (- 1m 12s) (100 10%) 0.3750 
0m 8s (- 0m 50s) (150 15%) 0.5005 
0m 9s (- 0m 39s) (200 20%) 0.4292 
0m 10s (- 0m 32s) (250 25%) 0.4185 
0m 12s (- 0m 28s) (300 30%) 0.4218 
0m 13s (- 0m 24s) (350 35%) 0.3495 
0m 14s (- 0m 21s) (400 40%) 0.3991 
0m 15s (- 0m 19s) (450 45%) 0.3999 
0m 16s (- 0m 16s) (500 50%) 0.4338 10.00%
New best test accuracy! Model Updated!
0m 19s (- 0m 16s) (550 55%) 0.3504 
0m 21s (- 0m 14s) (600 60%) 0.4174 
0m 22s (- 0m 11s) (650 65%) 0.4201 
0m 23s (- 0m 10s) (700 70%) 0.3734 
0m 24s (- 0m 8s) (750 75%) 0.4381 
0m 25s (- 0m 6s) (800 80%) 0.3593 
0m 26s (- 0m 4s) (850 85%) 0.5190 
0m 28s (- 0m 3s) (900 90%) 0.4696 
0m 29s (- 0m 1s) (950 95%) 0.4379 
0m 30s (- 0m 0s) (1000 100%) 0.3647 11.00%
New best test accuracy! Model Updated!


In [29]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 9.00%
0m 5s (- 1m 46s) (50 5%) 0.3681 
0m 6s (- 1m 0s) (100 10%) 0.3404 
0m 7s (- 0m 43s) (150 15%) 0.3969 
0m 8s (- 0m 35s) (200 20%) 0.4362 
0m 9s (- 0m 29s) (250 25%) 0.3245 
0m 11s (- 0m 25s) (300 30%) 0.3340 
0m 12s (- 0m 22s) (350 35%) 0.4027 
0m 13s (- 0m 19s) (400 40%) 0.3720 
0m 14s (- 0m 17s) (450 45%) 0.2879 
0m 15s (- 0m 15s) (500 50%) 0.3876 22.50%
New best test accuracy! Model Updated!
0m 18s (- 0m 14s) (550 55%) 0.4775 
0m 19s (- 0m 12s) (600 60%) 0.3579 
0m 20s (- 0m 10s) (650 65%) 0.4076 
0m 21s (- 0m 9s) (700 70%) 0.3535 
0m 22s (- 0m 7s) (750 75%) 0.3926 
0m 23s (- 0m 5s) (800 80%) 0.2908 
0m 24s (- 0m 4s) (850 85%) 0.3936 
0m 25s (- 0m 2s) (900 90%) 0.3876 
0m 26s (- 0m 1s) (950 95%) 0.3626 
0m 27s (- 0m 0s) (1000 100%) 0.3500 19.00%


In [30]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 13.20%
0m 6s (- 1m 56s) (50 5%) 0.3677 
0m 6s (- 1m 2s) (100 10%) 0.3660 
0m 7s (- 0m 44s) (150 15%) 0.4475 
0m 8s (- 0m 34s) (200 20%) 0.3972 
0m 9s (- 0m 29s) (250 25%) 0.3881 
0m 10s (- 0m 25s) (300 30%) 0.3622 
0m 12s (- 0m 22s) (350 35%) 0.3845 
0m 13s (- 0m 19s) (400 40%) 0.3785 
0m 13s (- 0m 17s) (450 45%) 0.3637 
0m 15s (- 0m 15s) (500 50%) 0.3296 19.00%
New best test accuracy! Model Updated!
0m 18s (- 0m 14s) (550 55%) 0.2972 
0m 19s (- 0m 12s) (600 60%) 0.2873 
0m 20s (- 0m 11s) (650 65%) 0.2903 
0m 21s (- 0m 9s) (700 70%) 0.3328 
0m 22s (- 0m 7s) (750 75%) 0.3653 
0m 23s (- 0m 5s) (800 80%) 0.3441 
0m 24s (- 0m 4s) (850 85%) 0.3209 
0m 25s (- 0m 2s) (900 90%) 0.3516 
0m 26s (- 0m 1s) (950 95%) 0.3505 
0m 27s (- 0m 0s) (1000 100%) 0.3819 15.00%


In [31]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 14.80%
0m 5s (- 1m 50s) (50 5%) 0.3785 
0m 6s (- 1m 1s) (100 10%) 0.4159 
0m 8s (- 0m 46s) (150 15%) 0.3920 
0m 9s (- 0m 37s) (200 20%) 0.3423 
0m 10s (- 0m 32s) (250 25%) 0.3268 
0m 11s (- 0m 27s) (300 30%) 0.3301 
0m 13s (- 0m 24s) (350 35%) 0.3614 
0m 14s (- 0m 21s) (400 40%) 0.5037 
0m 15s (- 0m 19s) (450 45%) 0.3353 
0m 17s (- 0m 17s) (500 50%) 0.3746 13.00%
0m 22s (- 0m 18s) (550 55%) 0.3070 
0m 23s (- 0m 15s) (600 60%) 0.3506 
0m 25s (- 0m 13s) (650 65%) 0.3548 
0m 26s (- 0m 11s) (700 70%) 0.3524 
0m 27s (- 0m 9s) (750 75%) 0.3401 
0m 29s (- 0m 7s) (800 80%) 0.3377 
0m 31s (- 0m 5s) (850 85%) 0.3205 
0m 32s (- 0m 3s) (900 90%) 0.4070 
0m 33s (- 0m 1s) (950 95%) 0.4408 
0m 35s (- 0m 0s) (1000 100%) 0.3131 17.00%
New best test accuracy! Model Updated!


In [32]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 18.80%
0m 7s (- 2m 31s) (50 5%) 0.2603 
0m 9s (- 1m 26s) (100 10%) 0.3881 
0m 10s (- 1m 2s) (150 15%) 0.3755 
0m 12s (- 0m 48s) (200 20%) 0.3415 
0m 13s (- 0m 40s) (250 25%) 0.3404 
0m 14s (- 0m 34s) (300 30%) 0.3882 
0m 16s (- 0m 30s) (350 35%) 0.3955 
0m 17s (- 0m 26s) (400 40%) 0.3331 
0m 19s (- 0m 24s) (450 45%) 0.3724 
0m 21s (- 0m 21s) (500 50%) 0.2954 19.50%
New best test accuracy! Model Updated!
0m 24s (- 0m 20s) (550 55%) 0.3652 
0m 26s (- 0m 17s) (600 60%) 0.4008 
0m 28s (- 0m 15s) (650 65%) 0.3297 
0m 29s (- 0m 12s) (700 70%) 0.3229 
0m 31s (- 0m 10s) (750 75%) 0.3197 
0m 32s (- 0m 8s) (800 80%) 0.3918 
0m 34s (- 0m 6s) (850 85%) 0.3950 
0m 35s (- 0m 3s) (900 90%) 0.3361 
0m 36s (- 0m 1s) (950 95%) 0.3172 
0m 38s (- 0m 0s) (1000 100%) 0.3350 17.50%


In [33]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 15.60%
0m 5s (- 1m 43s) (50 5%) 0.3388 
0m 6s (- 0m 58s) (100 10%) 0.3973 
0m 7s (- 0m 42s) (150 15%) 0.3323 
0m 8s (- 0m 35s) (200 20%) 0.3246 
0m 10s (- 0m 30s) (250 25%) 0.4174 
0m 11s (- 0m 27s) (300 30%) 0.3779 
0m 13s (- 0m 24s) (350 35%) 0.3093 
0m 14s (- 0m 21s) (400 40%) 0.3645 
0m 15s (- 0m 19s) (450 45%) 0.3071 
0m 16s (- 0m 16s) (500 50%) 0.3279 17.50%
New best test accuracy! Model Updated!
0m 20s (- 0m 16s) (550 55%) 0.3580 
0m 21s (- 0m 14s) (600 60%) 0.3908 
0m 22s (- 0m 12s) (650 65%) 0.3340 
0m 23s (- 0m 10s) (700 70%) 0.3336 
0m 24s (- 0m 8s) (750 75%) 0.3402 
0m 26s (- 0m 6s) (800 80%) 0.2863 
0m 27s (- 0m 4s) (850 85%) 0.3023 
0m 28s (- 0m 3s) (900 90%) 0.4773 
0m 30s (- 0m 1s) (950 95%) 0.3348 
0m 31s (- 0m 0s) (1000 100%) 0.3915 21.00%
New best test accuracy! Model Updated!


In [34]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 19.00%
0m 5s (- 1m 50s) (50 5%) 0.3824 
0m 7s (- 1m 3s) (100 10%) 0.3377 
0m 8s (- 0m 47s) (150 15%) 0.3240 
0m 9s (- 0m 38s) (200 20%) 0.3259 
0m 11s (- 0m 33s) (250 25%) 0.3816 
0m 12s (- 0m 28s) (300 30%) 0.3572 
0m 13s (- 0m 25s) (350 35%) 0.4397 
0m 14s (- 0m 22s) (400 40%) 0.3530 
0m 16s (- 0m 19s) (450 45%) 0.3900 
0m 17s (- 0m 17s) (500 50%) 0.3139 19.50%
New best test accuracy! Model Updated!
0m 20s (- 0m 16s) (550 55%) 0.3662 
0m 21s (- 0m 14s) (600 60%) 0.3256 
0m 22s (- 0m 12s) (650 65%) 0.2986 
0m 23s (- 0m 10s) (700 70%) 0.3300 
0m 24s (- 0m 8s) (750 75%) 0.3259 
0m 26s (- 0m 6s) (800 80%) 0.3206 
0m 26s (- 0m 4s) (850 85%) 0.2795 
0m 28s (- 0m 3s) (900 90%) 0.4466 
0m 28s (- 0m 1s) (950 95%) 0.3099 
0m 29s (- 0m 0s) (1000 100%) 0.3596 16.00%


In [35]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 17.20%
0m 5s (- 1m 49s) (50 5%) 0.2635 
0m 6s (- 1m 1s) (100 10%) 0.4356 
0m 8s (- 0m 46s) (150 15%) 0.3473 
0m 9s (- 0m 38s) (200 20%) 0.3136 
0m 10s (- 0m 31s) (250 25%) 0.3146 
0m 11s (- 0m 27s) (300 30%) 0.3910 
0m 12s (- 0m 23s) (350 35%) 0.3204 
0m 13s (- 0m 20s) (400 40%) 0.3329 
0m 14s (- 0m 18s) (450 45%) 0.3204 
0m 15s (- 0m 15s) (500 50%) 0.3076 21.50%
New best test accuracy! Model Updated!
0m 18s (- 0m 15s) (550 55%) 0.2833 
0m 20s (- 0m 13s) (600 60%) 0.3141 
0m 21s (- 0m 11s) (650 65%) 0.3148 
0m 22s (- 0m 9s) (700 70%) 0.2673 
0m 23s (- 0m 7s) (750 75%) 0.3046 
0m 24s (- 0m 6s) (800 80%) 0.3172 
0m 25s (- 0m 4s) (850 85%) 0.4159 
0m 26s (- 0m 2s) (900 90%) 0.2896 
0m 28s (- 0m 1s) (950 95%) 0.3489 
0m 29s (- 0m 0s) (1000 100%) 0.3206 15.00%


In [36]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 17.80%
0m 6s (- 1m 59s) (50 5%) 0.3584 
0m 7s (- 1m 7s) (100 10%) 0.3784 
0m 8s (- 0m 48s) (150 15%) 0.4138 
0m 9s (- 0m 38s) (200 20%) 0.3838 
0m 10s (- 0m 32s) (250 25%) 0.3354 
0m 11s (- 0m 27s) (300 30%) 0.3232 
0m 13s (- 0m 24s) (350 35%) 0.3491 
0m 14s (- 0m 21s) (400 40%) 0.4598 
0m 15s (- 0m 19s) (450 45%) 0.3980 
0m 16s (- 0m 16s) (500 50%) 0.3701 18.50%
New best test accuracy! Model Updated!
0m 20s (- 0m 16s) (550 55%) 0.3356 
0m 21s (- 0m 14s) (600 60%) 0.3537 
0m 22s (- 0m 11s) (650 65%) 0.2898 
0m 23s (- 0m 10s) (700 70%) 0.3349 
0m 24s (- 0m 8s) (750 75%) 0.3355 
0m 25s (- 0m 6s) (800 80%) 0.3462 
0m 26s (- 0m 4s) (850 85%) 0.3630 
0m 27s (- 0m 3s) (900 90%) 0.3862 
0m 28s (- 0m 1s) (950 95%) 0.3097 
0m 29s (- 0m 0s) (1000 100%) 0.4064 18.00%


In [37]:
trainIters(encoder1, attn_decoder1, 1000, print_every=50, eval_every=500, learning_rate=0.0001)

Best evaluation accuracy: 16.00%
0m 6s (- 1m 56s) (50 5%) 0.3110 
0m 7s (- 1m 4s) (100 10%) 0.3399 
0m 8s (- 0m 46s) (150 15%) 0.3054 
0m 9s (- 0m 37s) (200 20%) 0.3144 
0m 10s (- 0m 31s) (250 25%) 0.4007 
0m 11s (- 0m 26s) (300 30%) 0.3191 
0m 11s (- 0m 22s) (350 35%) 0.3071 
0m 12s (- 0m 19s) (400 40%) 0.3812 
0m 13s (- 0m 16s) (450 45%) 0.3225 
0m 14s (- 0m 14s) (500 50%) 0.3219 20.00%
New best test accuracy! Model Updated!
0m 16s (- 0m 13s) (550 55%) 0.3160 
0m 18s (- 0m 12s) (600 60%) 0.3214 
0m 19s (- 0m 10s) (650 65%) 0.4255 
0m 20s (- 0m 8s) (700 70%) 0.3353 
0m 21s (- 0m 7s) (750 75%) 0.3410 
0m 21s (- 0m 5s) (800 80%) 0.3159 
0m 22s (- 0m 4s) (850 85%) 0.3694 
0m 23s (- 0m 2s) (900 90%) 0.3775 
0m 24s (- 0m 1s) (950 95%) 0.2476 
0m 25s (- 0m 0s) (1000 100%) 0.3279 16.00%


In [38]:
# One more 1000-iteration round with unchanged settings: lr=1e-4,
# print_every=50, eval_every=500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 17.40%
0m 5s (- 1m 38s) (50 5%) 0.3380 
0m 6s (- 0m 54s) (100 10%) 0.3351 
0m 6s (- 0m 38s) (150 15%) 0.2962 
0m 7s (- 0m 31s) (200 20%) 0.3398 
0m 8s (- 0m 26s) (250 25%) 0.3038 
0m 9s (- 0m 22s) (300 30%) 0.4005 
0m 10s (- 0m 19s) (350 35%) 0.3062 
0m 11s (- 0m 16s) (400 40%) 0.3372 
0m 12s (- 0m 14s) (450 45%) 0.3962 
0m 13s (- 0m 13s) (500 50%) 0.3373 18.50%
New best test accuracy! Model Updated!
0m 15s (- 0m 12s) (550 55%) 0.3230 
0m 16s (- 0m 10s) (600 60%) 0.3596 
0m 17s (- 0m 9s) (650 65%) 0.3462 
0m 18s (- 0m 7s) (700 70%) 0.2856 
0m 18s (- 0m 6s) (750 75%) 0.3312 
0m 19s (- 0m 4s) (800 80%) 0.3491 
0m 20s (- 0m 3s) (850 85%) 0.3733 
0m 21s (- 0m 2s) (900 90%) 0.3207 
0m 22s (- 0m 1s) (950 95%) 0.4276 
0m 23s (- 0m 0s) (1000 100%) 0.3381 18.00%


In [39]:
# Further training of encoder1 / attn_decoder1: 1000 iterations, lr=1e-4,
# with a progress line every 50 iterations and evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 19.00%
0m 5s (- 1m 45s) (50 5%) 0.3035 
0m 6s (- 0m 57s) (100 10%) 0.3249 
0m 7s (- 0m 42s) (150 15%) 0.3026 
0m 8s (- 0m 33s) (200 20%) 0.3037 
0m 9s (- 0m 27s) (250 25%) 0.3133 
0m 10s (- 0m 23s) (300 30%) 0.2588 
0m 11s (- 0m 20s) (350 35%) 0.2949 
0m 12s (- 0m 18s) (400 40%) 0.3268 
0m 13s (- 0m 16s) (450 45%) 0.3512 
0m 14s (- 0m 14s) (500 50%) 0.3235 16.50%
0m 17s (- 0m 13s) (550 55%) 0.4115 
0m 18s (- 0m 12s) (600 60%) 0.3200 
0m 20s (- 0m 11s) (650 65%) 0.3084 
0m 23s (- 0m 9s) (700 70%) 0.3150 
0m 26s (- 0m 8s) (750 75%) 0.3503 
0m 28s (- 0m 7s) (800 80%) 0.3480 
0m 30s (- 0m 5s) (850 85%) 0.3196 
0m 31s (- 0m 3s) (900 90%) 0.2566 
0m 32s (- 0m 1s) (950 95%) 0.3322 
0m 33s (- 0m 0s) (1000 100%) 0.2847 19.00%


In [40]:
# Last round at lr=1e-4 before the learning rate is lowered: 1000 iterations,
# progress every 50, accuracy evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-4,
)

Best evaluation accuracy: 17.20%
0m 5s (- 1m 49s) (50 5%) 0.3437 
0m 7s (- 1m 3s) (100 10%) 0.3441 
0m 8s (- 0m 46s) (150 15%) 0.2922 
0m 9s (- 0m 36s) (200 20%) 0.3489 
0m 9s (- 0m 29s) (250 25%) 0.2643 
0m 10s (- 0m 25s) (300 30%) 0.3513 
0m 11s (- 0m 22s) (350 35%) 0.3026 
0m 13s (- 0m 19s) (400 40%) 0.3472 
0m 14s (- 0m 17s) (450 45%) 0.3261 
0m 15s (- 0m 15s) (500 50%) 0.3724 20.00%
New best test accuracy! Model Updated!
0m 19s (- 0m 15s) (550 55%) 0.3424 
0m 20s (- 0m 13s) (600 60%) 0.3827 
0m 21s (- 0m 11s) (650 65%) 0.3623 
0m 22s (- 0m 9s) (700 70%) 0.3152 
0m 24s (- 0m 8s) (750 75%) 0.3183 
0m 25s (- 0m 6s) (800 80%) 0.3099 
0m 26s (- 0m 4s) (850 85%) 0.3274 
0m 27s (- 0m 3s) (900 90%) 0.3796 
0m 28s (- 0m 1s) (950 95%) 0.3272 
0m 30s (- 0m 0s) (1000 100%) 0.2948 23.50%
New best test accuracy! Model Updated!


In [41]:
# Fine-tuning round: the learning rate drops tenfold to 1e-5 here; still
# 1000 iterations, progress every 50, accuracy evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 19.40%
0m 5s (- 1m 44s) (50 5%) 0.3427 
0m 6s (- 0m 56s) (100 10%) 0.3746 
0m 7s (- 0m 40s) (150 15%) 0.3153 
0m 8s (- 0m 33s) (200 20%) 0.4292 
0m 9s (- 0m 27s) (250 25%) 0.2907 
0m 9s (- 0m 23s) (300 30%) 0.2807 
0m 10s (- 0m 20s) (350 35%) 0.3428 
0m 11s (- 0m 17s) (400 40%) 0.3189 
0m 12s (- 0m 15s) (450 45%) 0.3285 
0m 13s (- 0m 13s) (500 50%) 0.3642 17.50%
0m 16s (- 0m 13s) (550 55%) 0.3492 
0m 16s (- 0m 11s) (600 60%) 0.3662 
0m 17s (- 0m 9s) (650 65%) 0.4229 
0m 18s (- 0m 7s) (700 70%) 0.2988 
0m 19s (- 0m 6s) (750 75%) 0.3677 
0m 20s (- 0m 5s) (800 80%) 0.3462 
0m 21s (- 0m 3s) (850 85%) 0.3012 
0m 22s (- 0m 2s) (900 90%) 0.3678 
0m 22s (- 0m 1s) (950 95%) 0.2642 
0m 23s (- 0m 0s) (1000 100%) 0.3176 23.50%
New best test accuracy! Model Updated!


In [42]:
# Longer fine-tuning round: 2000 iterations at lr=1e-5, progress printed
# every 50 iterations and accuracy evaluated every 500.
trainIters(
    encoder1,
    attn_decoder1,
    2000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 20.60%
0m 5s (- 3m 22s) (50 2%) 0.3225 
0m 6s (- 1m 55s) (100 5%) 0.3088 
0m 6s (- 1m 26s) (150 7%) 0.3385 
0m 8s (- 1m 12s) (200 10%) 0.3139 
0m 9s (- 1m 3s) (250 12%) 0.3115 
0m 9s (- 0m 56s) (300 15%) 0.2409 
0m 10s (- 0m 51s) (350 17%) 0.2689 
0m 11s (- 0m 47s) (400 20%) 0.3391 
0m 12s (- 0m 43s) (450 22%) 0.3198 
0m 13s (- 0m 40s) (500 25%) 0.2449 25.00%
New best test accuracy! Model Updated!
0m 16s (- 0m 42s) (550 27%) 0.3855 
0m 16s (- 0m 39s) (600 30%) 0.3554 
0m 17s (- 0m 36s) (650 32%) 0.2855 
0m 18s (- 0m 34s) (700 35%) 0.3224 
0m 19s (- 0m 32s) (750 37%) 0.3880 
0m 20s (- 0m 30s) (800 40%) 0.2774 
0m 21s (- 0m 28s) (850 42%) 0.3161 
0m 22s (- 0m 27s) (900 45%) 0.3271 
0m 23s (- 0m 25s) (950 47%) 0.3311 
0m 23s (- 0m 23s) (1000 50%) 0.2949 22.00%
0m 26s (- 0m 23s) (1050 52%) 0.3431 
0m 27s (- 0m 22s) (1100 55%) 0.3931 
0m 28s (- 0m 20s) (1150 57%) 0.3510 
0m 29s (- 0m 19s) (1200 60%) 0.2847 
0m 30s (- 0m 18s) (1250 62%) 0.2829 
0m 31s (- 0m 16s) (13

In [46]:
# Second 2000-iteration fine-tuning round on the same models (lr=1e-5,
# print_every=50, eval_every=500).
trainIters(
    encoder1,
    attn_decoder1,
    2000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 25.20%
0m 4s (- 3m 14s) (50 2%) 0.3846 
0m 6s (- 1m 54s) (100 5%) 0.3449 
0m 7s (- 1m 26s) (150 7%) 0.3311 
0m 7s (- 1m 11s) (200 10%) 0.3377 
0m 8s (- 1m 0s) (250 12%) 0.2966 
0m 9s (- 0m 53s) (300 15%) 0.2885 
0m 10s (- 0m 49s) (350 17%) 0.3440 
0m 11s (- 0m 46s) (400 20%) 0.3012 
0m 12s (- 0m 42s) (450 22%) 0.3777 
0m 13s (- 0m 40s) (500 25%) 0.2823 25.50%
New best test accuracy! Model Updated!
0m 16s (- 0m 42s) (550 27%) 0.2853 
0m 17s (- 0m 40s) (600 30%) 0.2901 
0m 18s (- 0m 37s) (650 32%) 0.2694 
0m 19s (- 0m 35s) (700 35%) 0.2646 
0m 20s (- 0m 33s) (750 37%) 0.3171 
0m 20s (- 0m 31s) (800 40%) 0.3614 
0m 21s (- 0m 29s) (850 42%) 0.3456 
0m 22s (- 0m 27s) (900 45%) 0.3109 
0m 23s (- 0m 25s) (950 47%) 0.3272 
0m 24s (- 0m 24s) (1000 50%) 0.2835 19.50%
0m 26s (- 0m 23s) (1050 52%) 0.3743 
0m 27s (- 0m 22s) (1100 55%) 0.3170 
0m 28s (- 0m 20s) (1150 57%) 0.3413 
0m 29s (- 0m 19s) (1200 60%) 0.2654 
0m 30s (- 0m 18s) (1250 62%) 0.2930 
0m 30s (- 0m 16s) (13

In [47]:
# Back to 1000-iteration rounds at lr=1e-5; progress every 50 iterations,
# accuracy evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 19.40%
0m 6s (- 2m 11s) (50 5%) 0.3655 
0m 7s (- 1m 10s) (100 10%) 0.3454 
0m 8s (- 0m 49s) (150 15%) 0.2668 
0m 9s (- 0m 37s) (200 20%) 0.2534 
0m 10s (- 0m 31s) (250 25%) 0.2702 
0m 11s (- 0m 26s) (300 30%) 0.3126 
0m 12s (- 0m 23s) (350 35%) 0.2670 
0m 13s (- 0m 20s) (400 40%) 0.3051 
0m 14s (- 0m 17s) (450 45%) 0.2991 
0m 15s (- 0m 15s) (500 50%) 0.3651 21.50%
New best test accuracy! Model Updated!
0m 18s (- 0m 14s) (550 55%) 0.3474 
0m 19s (- 0m 12s) (600 60%) 0.3454 
0m 20s (- 0m 11s) (650 65%) 0.2965 
0m 21s (- 0m 9s) (700 70%) 0.3419 
0m 22s (- 0m 7s) (750 75%) 0.3141 
0m 23s (- 0m 5s) (800 80%) 0.3758 
0m 24s (- 0m 4s) (850 85%) 0.3529 
0m 25s (- 0m 2s) (900 90%) 0.3056 
0m 26s (- 0m 1s) (950 95%) 0.3208 
0m 26s (- 0m 0s) (1000 100%) 0.3307 16.50%


In [48]:
# Another 1000 iterations of fine-tuning at lr=1e-5 on encoder1 /
# attn_decoder1 (print_every=50, eval_every=500).
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 20.80%
0m 4s (- 1m 32s) (50 5%) 0.3146 
0m 5s (- 0m 50s) (100 10%) 0.2733 
0m 6s (- 0m 36s) (150 15%) 0.3876 
0m 7s (- 0m 29s) (200 20%) 0.3646 
0m 8s (- 0m 24s) (250 25%) 0.3722 
0m 9s (- 0m 21s) (300 30%) 0.3511 
0m 10s (- 0m 18s) (350 35%) 0.2987 
0m 11s (- 0m 16s) (400 40%) 0.3165 
0m 12s (- 0m 14s) (450 45%) 0.2660 
0m 13s (- 0m 13s) (500 50%) 0.3819 25.50%
New best test accuracy! Model Updated!
0m 15s (- 0m 12s) (550 55%) 0.3084 
0m 16s (- 0m 10s) (600 60%) 0.2547 
0m 17s (- 0m 9s) (650 65%) 0.3697 
0m 17s (- 0m 7s) (700 70%) 0.3418 
0m 18s (- 0m 6s) (750 75%) 0.2785 
0m 19s (- 0m 4s) (800 80%) 0.3691 
0m 20s (- 0m 3s) (850 85%) 0.3179 
0m 21s (- 0m 2s) (900 90%) 0.3320 
0m 22s (- 0m 1s) (950 95%) 0.2796 
0m 23s (- 0m 0s) (1000 100%) 0.3399 16.50%


In [49]:
# Final training round in this notebook section: 1000 iterations at lr=1e-5,
# progress every 50 iterations, accuracy evaluation every 500.
trainIters(
    encoder1,
    attn_decoder1,
    1000,
    print_every=50,
    eval_every=500,
    learning_rate=1e-5,
)

Best evaluation accuracy: 21.20%
0m 4s (- 1m 22s) (50 5%) 0.3552 
0m 5s (- 0m 45s) (100 10%) 0.3284 
0m 5s (- 0m 33s) (150 15%) 0.3918 
0m 6s (- 0m 26s) (200 20%) 0.3271 
0m 7s (- 0m 22s) (250 25%) 0.3019 
0m 8s (- 0m 19s) (300 30%) 0.3032 
0m 9s (- 0m 17s) (350 35%) 0.3509 
0m 10s (- 0m 15s) (400 40%) 0.3662 
0m 11s (- 0m 13s) (450 45%) 0.2358 
0m 12s (- 0m 12s) (500 50%) 0.2846 24.50%
New best test accuracy! Model Updated!
0m 14s (- 0m 11s) (550 55%) 0.2712 
0m 15s (- 0m 10s) (600 60%) 0.3173 
0m 16s (- 0m 8s) (650 65%) 0.2524 
0m 17s (- 0m 7s) (700 70%) 0.3117 
0m 17s (- 0m 5s) (750 75%) 0.3238 
0m 18s (- 0m 4s) (800 80%) 0.3463 
0m 19s (- 0m 3s) (850 85%) 0.3174 
0m 20s (- 0m 2s) (900 90%) 0.2893 
0m 21s (- 0m 1s) (950 95%) 0.2807 
0m 22s (- 0m 0s) (1000 100%) 0.4004 21.50%


---

### Samples Evaluation

In [50]:
# Load the checkpointed encoder/decoder for MODEL_VERSION and report accuracy.
encoder_path = "saved_models/encoder_" + MODEL_VERSION
decoder_path = "saved_models/decoder_" + MODEL_VERSION

# Both files are loaded below, so both must be present -- checking only the
# encoder would let a missing decoder file blow up inside torch.load.
if os.path.exists(encoder_path) and os.path.exists(decoder_path):
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code -- only load checkpoints produced by this notebook.
    # map_location=device lets a checkpoint saved on GPU load on a CPU-only
    # machine (and vice versa).
    encoder2 = torch.load(encoder_path, map_location=device)
    decoder2 = torch.load(decoder_path, map_location=device)
else:
    # Without this branch encoder2/decoder2 would be unbound on a fresh kernel
    # (NameError) or silently stale from an earlier run. Fall back to the
    # models currently held in memory and say so explicitly.
    print("No saved checkpoint found for " + MODEL_VERSION +
          "; evaluating the in-memory encoder1/attn_decoder1 instead.")
    encoder2, decoder2 = encoder1, attn_decoder1

# Last expression of the cell, so the returned accuracy is displayed.
# n=2000 is presumably the number of evaluation samples -- confirm against
# evaluateAccuracy's definition.
evaluateAccuracy(encoder2, decoder2, n=2000)

0.20799999999999999

In [51]:
# Qualitative spot check of the models held in encoder2/decoder2. In the
# output below, each sample shows ">" for the input command, "=" for the
# reference action sequence and "<" for the model's prediction.
evaluateRandomly(encoder2, decoder2)

> run right thrice and jump opposite left
= I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN I_TURN_LEFT I_TURN_LEFT I_JUMP
< I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN I_TURN_LEFT I_TURN_LEFT I_JUMP <EOS>

> jump around right and look opposite right
= I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_TURN_RIGHT I_LOOK
< I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_LOOK <EOS>

> turn around left twice after jump opposite left twice
= I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT
< I_TURN_LEFT I_TURN_LEFT I_JUMP I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT <EOS>

> walk opposite right twice after jump opposite right twice
= I_TURN_RIGHT I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_TURN_

---