In [1]:
import pickle as pkl
import numpy as np
import gzip
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time

from utils import asMinutes, timeSince, load_zipped_pickle, corpus_bleu, directories
from langUtils import loadLangPairs, langDataset, langCollateFn, initHybridEmbeddings, tensorToList

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Import Dictionaries and Data

In [2]:
def load_zipped_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        loaded_object = pkl.load(f)
        return loaded_object

In [3]:
# Print sentence given numbers
def ids2sentence(sentence, dictionary):
    return ' '.join([dictionary[i] for i in sentence])
#ids2sentence(en_train_num[0], id2word_en_dic)

def add_symbol(id2word_dic, word2id_dic):
    symbols = ['<pad>', '<unk>', '<sos>', '<eos>']
    for i, symbol in enumerate(symbols):
        id2word_dic[i] = symbol
        word2id_dic[symbol] = i
    return id2word_dic, word2id_dic

In [4]:
# id2word_vi_dic = load_zipped_pickle("../embeddings/id2word_vi_dic.p")
# word2id_vi_dic = load_zipped_pickle("../embeddings/word2id_vi_dic.p")

# id2word_en_dic = load_zipped_pickle("../embeddings/id2word_en_dic.p")
# word2id_en_dic = load_zipped_pickle("../embeddings/word2id_en_dic.p")

# id2word_vi_dic, word2id_vi_dic = add_symbol(id2word_vi_dic, word2id_vi_dic)
# id2word_en_dic, word2id_en_dic = add_symbol(id2word_en_dic, word2id_en_dic)

# vi_train = load_zipped_pickle("../data/vi-en-tokens/train_vi_tok.p")
# en_train = load_zipped_pickle("../data/vi-en-tokens/train_en_tok.p") # Already Processed for symbols

# vi_train_num = load_zipped_pickle("../data/vi-en-tokens/train_vi_tok_num.p")
# en_train_num = load_zipped_pickle("../data/vi-en-tokens/train_en_tok_num.p") # Already Processed for symbols

In [5]:
vi, en = loadLangPairs("vi")
BATCH_SIZE = 32
train_dataset = langDataset([(vi.train_num[i], en.train_num[i]) for i in range(len(vi.train_num)) if (2 < len(vi.train[i]) < vi.max_length) & (2 < len(en.train[i]) < en.max_length)])
overfit_dataset = langDataset([(vi.train_num[i], en.train_num[i]) for i in range(32)])
overfit_loader = torch.utils.data.DataLoader(dataset=overfit_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)
dev_dataset = langDataset([(vi.dev_num[i], en.dev_num[i]) for i in range(len(vi.dev_num)) if (2 < len(vi.dev[i]) < vi.max_length) & (2 < len(en.dev[i]) < en.max_length)])
dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)

In [6]:
SPECIAL_SYMBOLS_ID = PAD_ID, UNK_ID, SOS_ID, EOS_ID = 0, 1, 2, 3

## Padding Data

### Sort by input data length

In [7]:
# def sort_by_length(data_input, target_data):
#     input_size = [len(data) for data in data_input]
#     size_index = np.argsort(input_size)
#     return list(np.array(data_input)[size_index]), list(np.array(target_data)[size_index])

# vi_train_num, en_train_num = sort_by_length(vi_train_num, en_train_num)

### Padding Data given batch size

In [8]:
def pad(data, length):
    for i, line in enumerate(data):
        if len(line) < length:
            for i in range(len(line), length):
                line.append(0)
        else:
            data[i] = line[0:length]
    return data

# Return the batch data and target
def get_batch(i, batch_size, train_data, train_target):
    if i * batch_size > len(train_data):
        raise Exception('Incorrect batch index')
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_data = list(np.array(train_data)[start_idx:end_idx])
    batch_target = list(np.array(train_target)[start_idx:end_idx])
    batch_data = pad(batch_data, len(batch_data[batch_size - 1]))
    max_target = max([len(data) for data in batch_data])
    batch_target = pad(batch_target, max_target)
    return batch_data, batch_target

# get_batch(5, 64, vi_train_num, en_train_num)

## Models

### Encoder

In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_size, raw_emb, learn_ids):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        # input_size: input dictionary size
        self.embedding = initHybridEmbeddings(raw_emb, learn_ids)
#         self.embedding = nn.Embedding(input_size, hidden_size)
        self.num_layers = num_layers
        self.gru = nn.GRU(self.hidden_size, 
                          hidden_size, 
                          num_layers= num_layers, 
                          batch_first = True) # BATCH FIRST

    def forward(self, encoder_input, hidden_input):
        # encoder_input: batch * 1 (for 1 word each time)
        embedded_input = self.embedding(encoder_input)
        # embedded_input: batch * 1 * emb_dim
        # hidden_input: batch * 1(layer) * hidden_size
        output, hidden = self.gru(embedded_input, hidden_input)
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=device)

### Decoder

In [10]:
class AttentionDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers, max_length, batch_size, raw_emb, learn_ids, dropout_p=0.1):
        super(AttentionDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        # Max length for a sentence
        self.max_length = max_length
        self.num_layers = num_layers
        
#         self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.embedding = initHybridEmbeddings(raw_emb, learn_ids)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          num_layers= num_layers, 
                          batch_first = True) # BATCH_FRIST)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, decoder_input, hidden_input, encoder_hiddens):
        # hidden_input: 1 * batch * hidden_size
        hidden_input = hidden_input.squeeze(0)
        # decoder_input: batch * 1
        embedded_input = self.embedding(decoder_input)
        # embedded_input: batch * 1 * embed_size
        embedded_input = self.dropout(embedded_input).squeeze(1)
        
        # embedded_input: batch * embed_size
        # hidden_input: batch * hidden_size 
        # (Use input and newest hidden to decide which encoder hidden is important)
        attn_weights = F.softmax(self.attn(torch.cat((embedded_input, hidden_input), 1)), dim=1).unsqueeze(1)
        # encoder_output: max_length * batch * encoder_hidden_size
        encoder_hiddens_t = encoder_hiddens.transpose(0, 1)
        # attn_weights: batch * 1 * max_length(theoretical)
        cropped_attn_weights = attn_weights[:, :, :encoder_hiddens_t.shape[1]]
        # cropped_attn_weights: batch * 1 * max_length(actual)
        # encoder_hiddens_t: batch * max_length(actual) * encoder_hidden_size
        ## 
        attn_applied = torch.bmm(cropped_attn_weights, encoder_hiddens_t).squeeze(1)
        
        # embedded_input: batch * embed_size
        # attn_applied: batch * encoder_hidden_size
        output = torch.cat((embedded_input, attn_applied), 1)
        output = self.attn_combine(output)
        
        # output: batch * hidden_size
        gru_input = F.relu(output).unsqueeze(1)
        # hidden_input: batch * hidden_size
        hidden_input = hidden_input.unsqueeze(0)
        # gru_input: batch * 1 * hidden_size
        # hidden_input: 1 * batch * hidden_size
        output, hidden = self.gru(gru_input, hidden_input)
        output = self.out(output)
        #output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(self.num_layers,  self.batch_size, self.hidden_size, device=device)

## Training

In [11]:
def trainAttention(inp, output, out_max, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size):
    total_avg_loss = 0
    loss = 0
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_len = inp.shape[1]
    encoder_outputs = torch.zeros(input_len, batch_size, 1, HIDDEN_SIZE, device=device)
    encoder_hiddens = torch.zeros(input_len, 1, batch_size, HIDDEN_SIZE, device=device)
    # Encode
    for ec_idx in range(input_len):
        # input batch_size * 1
        encoder_output, encoder_hidden = encoder(inp[:, ec_idx].unsqueeze(1), encoder_hidden)
        encoder_outputs[ec_idx] = encoder_output
        encoder_hiddens[ec_idx] = encoder_hidden

    # Decode
    decoder_input = torch.tensor([SOS_ID] * batch_size, device=device)
    decoder_hidden = encoder_hidden

    # Always use Teacher Forcing
    for dc_idx in range(out_max):
        decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input.unsqueeze(1), decoder_hidden, encoder_hiddens.squeeze(1))
        decoder_output = decoder_output.squeeze(1).to(device) # get rid of the seq dimention
        loss += criterion(decoder_output, output[:, dc_idx])
        decoder_input = output[:, dc_idx]

        ## Print Value
#         sample_sentence.append(torch.argmax(decoder_output[0]).item())

    loss.backward()
    total_avg_loss += loss.item() / out_max

    encoder_optimizer.step()
    decoder_optimizer.step()

    ## Print Value
#     print("Predict: ", ids2sentence(sample_sentence, en.id2word))
#     print("Actual: ", ids2sentence(output[0].cpu().numpy(), en.id2word))
        
    return total_avg_loss
#     return 0

In [20]:
def bleuEvalAttention(encoder, decoder, data_loader, batch_size):
    with torch.no_grad():
        true_outputs = []
        decoder_outputs = []
        for i, (inp, inp_lens, output, out_mask, out_max) in enumerate(data_loader):
            if i * batch_size >= 10000 or len(inp[0]) != batch_size:
                continue
            inp = inp.transpose(0,1).to(device)
            output = output.transpose(0,1).to(device)
            true_outputs.append([[str(tok.item()) for tok in out if tok != 0] for out in output])
            encoder_hidden = encoder.initHidden()
            input_len = inp.shape[1]
            encoder_outputs = torch.zeros(input_len, batch_size, 1, HIDDEN_SIZE, device=device)
            encoder_hiddens = torch.zeros(input_len, 1, batch_size, HIDDEN_SIZE, device=device)

            # Encode
            for ec_idx in range(input_len):
                # input batch_size * 1
                encoder_output, encoder_hidden = encoder(inp[:, ec_idx].unsqueeze(1), encoder_hidden)
                encoder_outputs[ec_idx] = encoder_output
                encoder_hiddens[ec_idx] = encoder_hidden

            # Decode
            decoder_input = torch.tensor([SOS_ID] * batch_size, device=device)
            decoder_hidden = encoder_hidden

            # Greedy
            for dc_idx in range(out_max):
                decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input.unsqueeze(1), decoder_hidden, encoder_hiddens.squeeze(1))
                decoder_output = decoder_output.squeeze(1).to(device) # get rid of the seq dimention
                topv, topi = decoder_output.topk(1)
                decoder_input = torch.LongTensor([topi[i][0] for i in range(inp.size(0))]).to(device)
                ## Print Value
                decoder_outputs.append(list(decoder_input.cpu().numpy()))
            ## Print Value
        predict = []
        for seq in np.array(decoder_outputs).T.astype(str):
            seq_toks = []
            for tok in seq:
                seq_toks.append(tok)
                if tok == '3':
                    break
            predict.append(seq_toks)
        print(np.shape(predict))
        print(np.shape(decoder_outputs))
        print(np.shape(true_outputs))
#         print('Sample True: ', ' '.join([en.id2word[int(i)] for i in true_outputs[0][0]]))
#         print('Sample Predicted: ', ' '.join([en.id2word[int(i)] for i in predict[0]]))
#         for seq in predict:
#             print('Sample Predicted: ', ' '.join([en.id2word[int(i)] for i in seq]))
        bleu_score = corpus_bleu(predict, true_outputs, 4)
        return bleu_score

In [13]:
def fitAttention(train_loader, dev_loader, encoder, decoder, encoder_opt, decoder_opt, criterion, batch_size, epochs, print_every):
    start = time.time()
    print('Initializing Model Training + Eval...')
    losses = []
    train_scores = []
    dev_scores = []
    for epoch in range(epochs):
        loss = 0
        for i, (inp, inp_lens, output, out_mask, out_max) in enumerate(train_loader):
            if (len(inp[0]) != batch_size):
                continue
            inp.transpose_(0,1)
            output.transpose_(0,1)
            inp = inp.to(device)
            output = output.to(device)
            loss += trainAttention(inp, output, out_max, encoder, decoder, encoder_opt, decoder_opt, criterion, batch_size)
            if i % print_every == 0 and i > 0:
                losses.append(loss/i)
                print("Time Elapsed: {} | Loss: {:.4}".format(asMinutes(time.time() - start),
                                                                                loss/i))
                pkl.dump(encoder, open("./vi-g-attn-encoder-sgd0.01.p", "wb"))
                pkl.dump(decoder, open("./vi-g-attn-decoder-sgd0.01.p", "wb"))
        train_score = bleuEvalAttention(encoder, decoder, train_loader, batch_size)
#         dev_score = bleuEvalATtention(encoder, decoder, dev_loader, batch_size)
        train_scores.append(train_score)
#         dev_scores.append(dev_score)
        print("Epoch: {} | Time Elapsed: {} | Loss: {:.4} | Train BLEU: {:.4}".format(epoch + 1, 
                                                                                                        asMinutes(time.time() - start),
                                                                                                        loss/len(train_loader), 
                                                                                                        train_score))
#                                                                                                         dev_score))

In [14]:
# dic_size_vi = len(id2word_vi_dic.keys())
# dic_size_en = len(id2word_en_dic.keys())
HIDDEN_SIZE = 300
LEARNING_RATE = 0.01
MAX_LENGTH = 100
## Add ignore index
criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)

encoder = EncoderRNN(input_size = vi.n_words, hidden_size = HIDDEN_SIZE, num_layers = 1, batch_size = BATCH_SIZE, raw_emb=vi.emb, learn_ids=vi.learn_ids).to(device)
# decoder = DecoderRNN(hidden_size = HIDDEN_SIZE, output_size = en.n_words, num_layers = 1, batch_size = BATCH_SIZE, raw_emb=en.emb, learn_ids=en.learn_ids).to(device)
decoder = AttentionDecoderRNN(hidden_size = HIDDEN_SIZE, output_size = en.n_words, num_layers = 1, max_length = MAX_LENGTH, batch_size = BATCH_SIZE, raw_emb = en.emb, learn_ids = en.learn_ids, dropout_p=0.1).to(device)

encoder_optimizer = optim.SGD(encoder.parameters(), lr=LEARNING_RATE)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=LEARNING_RATE)

In [26]:
encoder = pkl.load(open('vi-g-attn-encoder-sgd0.01.p', 'rb'))
decoder = pkl.load(open('vi-g-attn-decoder-sgd0.01.p', 'rb'))

In [18]:
test_dataset = langDataset([(vi.test_num[i], en.train_num[i]) for i in range(len(vi.test_num)) if (2 < len(vi.test[i]) < vi.max_length) & (2 < len(en.test[i]) < en.max_length)])
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)


In [None]:
bleuEvalAttention(encoder, decoder, train_loader, BATCH_SIZE)



In [None]:
fitAttention(train_loader, dev_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, BATCH_SIZE, 100, 300)

Initializing Model Training + Eval...
Time Elapsed: 6m 29s | Loss: 6.931
Time Elapsed: 15m 17s | Loss: 6.272
Time Elapsed: 24m 2s | Loss: 5.961
Time Elapsed: 32m 49s | Loss: 5.763
Time Elapsed: 41m 39s | Loss: 5.62
Time Elapsed: 50m 31s | Loss: 5.507
Time Elapsed: 59m 14s | Loss: 5.42
Time Elapsed: 68m 5s | Loss: 5.345
Time Elapsed: 76m 49s | Loss: 5.279
Time Elapsed: 85m 31s | Loss: 5.227
Time Elapsed: 89m 40s | Loss: 5.181
Epoch: 1 | Time Elapsed: 103m 54s | Loss: 5.161 | Train BLEU: 0.1724
Time Elapsed: 112m 39s | Loss: 4.663
Time Elapsed: 121m 27s | Loss: 4.642
Time Elapsed: 130m 12s | Loss: 4.628
Time Elapsed: 139m 0s | Loss: 4.611
Time Elapsed: 147m 49s | Loss: 4.599
Time Elapsed: 156m 32s | Loss: 4.586
Time Elapsed: 165m 15s | Loss: 4.573
Time Elapsed: 172m 35s | Loss: 4.561
Time Elapsed: 176m 38s | Loss: 4.551
Time Elapsed: 181m 49s | Loss: 4.539
Time Elapsed: 190m 33s | Loss: 4.527
Epoch: 2 | Time Elapsed: 210m 49s | Loss: 4.521 | Train BLEU: 1.213
Time Elapsed: 219m 39s | Los

In [16]:
# vi - en non-hybrid embeddings
fit(train_loader, dev_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, BATCH_SIZE, 10, 300)

Initializing Model Training + Eval...
Time Elapsed: 4m 13s | Loss: 5.741
Time Elapsed: 8m 16s | Loss: 5.55
Time Elapsed: 12m 10s | Loss: 5.456
Time Elapsed: 16m 24s | Loss: 5.368
Time Elapsed: 20m 26s | Loss: 5.299
Time Elapsed: 24m 18s | Loss: 5.24
Time Elapsed: 28m 30s | Loss: 5.197
Time Elapsed: 32m 36s | Loss: 5.156
Time Elapsed: 36m 25s | Loss: 5.124
Time Elapsed: 40m 37s | Loss: 5.094
Time Elapsed: 44m 45s | Loss: 5.067
Epoch: 1 | Time Elapsed: 46m 32s | Loss: 5.054 | Train BLEU: 1.563 | Dev BLEU: 1.069
Time Elapsed: 50m 37s | Loss: 4.608
Time Elapsed: 54m 48s | Loss: 4.597
Time Elapsed: 58m 31s | Loss: 4.575
Time Elapsed: 62m 43s | Loss: 4.55
Time Elapsed: 66m 54s | Loss: 4.54
Time Elapsed: 70m 39s | Loss: 4.529
Time Elapsed: 74m 51s | Loss: 4.518
Time Elapsed: 79m 3s | Loss: 4.51
Time Elapsed: 82m 48s | Loss: 4.501
Time Elapsed: 86m 58s | Loss: 4.493
Time Elapsed: 91m 10s | Loss: 4.486
Epoch: 2 | Time Elapsed: 93m 10s | Loss: 4.481 | Train BLEU: 2.315 | Dev BLEU: 2.702
Time Ela

In [14]:
# pkl.dump(encoder, open("./hybrid-vi-encoder.p", "wb"))
pkl.dump("hi", open("./test.p", "wb"))

In [27]:
len(overfit_dataset)

2

In [91]:
bleuEval(encoder, decoder, train_loader, BATCH_SIZE)

0
1


KeyboardInterrupt: 

In [47]:
with open('./vi-encoder.p', 'rb') as pickle_file:
    baseline_enc = pkl.load(pickle_file)
with open('./vi-decoder.p', 'rb') as pickle_file:
    baseline_dec = pkl.load(pickle_file)


In [92]:
bleuEval(baseline_enc, baseline_dec, overfit_loader, BATCH_SIZE)

0




Sample True:  <sos> in 4 minutes , atmospheric chemist <unk> pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule . <eos>
Sample Predicted:  <sos> <unk> <unk> : the frequent gradients , the frequent revolution -- the themes of the world . <eos>
Sample Predicted:  <sos> we have to introduce the <unk> <unk> and the <unk> . <eos>
Sample Predicted:  <sos> so we went to the <unk> , and we were buddhist <unk> . <eos>
Sample Predicted:  <sos> i went to the other problems , and i m able to solve the other problems . <eos>
Sample Predicted:  <sos> <unk> <unk> , the <unk> , the <unk> , the <unk> , the <unk> . <eos>
Sample Predicted:  <sos> we re able to do the biggest challenge , we re able to do the biggest challenge . <eos>
Sample Predicted:  <sos> i m sure that , i m sure that i m going to see the <unk> . <eos>
Sa

1.4825853903799304

In [44]:
encoder.gru.batch_fi

128

In [40]:
%debug

> [0;32m/opt/anaconda3/lib/python3.7/site-packages/torch/nn/modules/rnn.py[0m(143)[0;36mcheck_hidden_size[0;34m()[0m
[0;32m    141 [0;31m        [0;32mdef[0m [0mcheck_hidden_size[0m[0;34m([0m[0mhx[0m[0;34m,[0m [0mexpected_hidden_size[0m[0;34m,[0m [0mmsg[0m[0;34m=[0m[0;34m'Expected hidden size {}, got {}'[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m    142 [0;31m            [0;32mif[0m [0mtuple[0m[0;34m([0m[0mhx[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m)[0m[0;34m)[0m [0;34m!=[0m [0mexpected_hidden_size[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m--> 143 [0;31m                [0;32mraise[0m [0mRuntimeError[0m[0;34m([0m[0mmsg[0m[0;34m.[0m[0mformat[0m[0;34m([0m[0mexpected_hidden_size[0m[0;34m,[0m [0mtuple[0m[0;34m([0m[0mhx[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    144 [0;31m[0;34m[0m[0m
[0m[0;32m    145 [0;31m        [0;32mif[0m 

ipdb>  encoder


*** NameError: name 'encoder' is not defined


ipdb>  exit


In [34]:
%debug

> [0;32m<ipython-input-31-f7ca4d53521a>[0m(15)[0;36mtrain[0;34m()[0m
[0;32m     13 [0;31m        [0;31m# input batch_size * 1[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     14 [0;31m        [0mencoder_output[0m[0;34m,[0m [0mencoder_hidden[0m [0;34m=[0m [0mencoder[0m[0;34m([0m[0minp[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0mec_idx[0m[0;34m][0m[0;34m.[0m[0munsqueeze[0m[0;34m([0m[0;36m1[0m[0;34m)[0m[0;34m,[0m [0mencoder_hidden[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 15 [0;31m        [0mencoder_outputs[0m[0;34m[[0m[0mec_idx[0m[0;34m][0m [0;34m=[0m [0mencoder_output[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m        [0mencoder_hiddens[0m[0;34m[[0m[0mec_idx[0m[0;34m][0m [0;34m=[0m [0mencoder_hidden[0m[0;34m[0m[0m
[0m[0;32m     17 [0;31m[0;34m[0m[0m
[0m


ipdb>  enoder_outputs


*** NameError: name 'enoder_outputs' is not defined


ipdb>  encoder_outputs


tensor([[[[ 0.0170,  0.1717,  0.3893,  ...,  0.0381, -0.1810, -0.2559]],

         [[ 0.0170,  0.1717,  0.3893,  ...,  0.0381, -0.1810, -0.2559]],

         [[ 0.0170,  0.1717,  0.3893,  ...,  0.0381, -0.1810, -0.2559]],

         ...,

         [[ 0.0170,  0.1717,  0.3893,  ...,  0.0381, -0.1810, -0.2559]],

         [[ 0.0170,  0.1717,  0.3893,  ...,  0.0381, -0.1810, -0.2559]],

         [[ 0.0170,  0.1717,  0.3893,  ...,  0.0381, -0.1810, -0.2559]]],


        [[[-0.0219, -0.0105, -0.2507,  ...,  0.1992, -0.4429, -0.4246]],

         [[-0.0108,  0.4253,  0.3838,  ..., -0.2613,  0.1234, -0.1874]],

         [[-0.2046, -0.4536,  0.5676,  ...,  0.0421,  0.3484, -0.3940]],

         ...,

         [[-0.0036, -0.4598,  0.2498,  ..., -0.1157,  0.1377,  0.2163]],

         [[-0.0036, -0.4598,  0.2498,  ..., -0.1157,  0.1377,  0.2163]],

         [[-0.0856,  0.0131,  0.2655,  ...,  0.2046, -0.2525, -0.0867]]],


        [[[ 0.4918, -0.1363,  0.1370,  ...,  0.0146, -0.2607, -0.5579]],

    

ipdb>  len(encoder_outputs)


23


ipdb>  encoder_outputs[23]


*** IndexError: index 23 is out of bounds for dimension 0 with size 23


ipdb>  exit


In [12]:
# fit(overfit_vi_train, overfit_en_train, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, BATCH_SIZE, 100)

Predict:  ['<sos>', '<sos>', '<sos>', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  8.810749053955078
Predict:  ['<sos>', '<sos>', 'the', 'the', 'the', 'two', 'the', 'the', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  4.840537643432617
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'the', 'the', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  1.9151222229003906
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  0.9088132858276368
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  0.5086045742034913
P

KeyboardInterrupt: 

In [46]:
# for i in range(1000):
#     train(overfit_vi_train, overfit_en_train, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, BATCH_SIZE)

Training Loss:  8.852307891845703
Predict:  ['<sos>', '<sos>', 'the', 'just', 'these', 'the', 'notes', 'the', 'the', 'the']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  4.9451759338378904
Predict:  ['<sos>', '<sos>', 'coming', 'just', 'these', 'notes', 'notes', 'the', 'the', 'the']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  1.9719776153564452
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'notes', 'the', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  0.9234449386596679
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  0.5161348819732666
Predict:  ['<sos>', 'it', 's', 'just', 'thes

KeyboardInterrupt: 

In [None]:
# def train(train_input, train_target, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size):
#     # Batch
#     total_avg_loss = 0
#     for i in range(len(train_input) // batch_size):
#         loss = 0
#         encoder_hidden = encoder.initHidden()
        
#         batch = get_batch(i, batch_size, train_input, train_target)
#         # size batch_size * seq_length
#         batch_input = torch.tensor(batch[0], device=device)
#         batch_target = torch.tensor(batch[1], device=device)
#         input_length = batch_input.shape[1] ## should be seq length
#         target_length = batch_target.shape[1]
#         print(input_length, target_length)

#         encoder_optimizer.zero_grad()
#         decoder_optimizer.zero_grad()
        
#         encoder_outputs = torch.zeros(input_length, batch_size, 1, 256, device=device)
#         encoder_hiddens = torch.zeros(input_length, 1, batch_size, 256, device=device)
        
#         # Encode
#         for ec_idx in range(input_length):
#             # input batch_size * 1
#             encoder_output, encoder_hidden = encoder(batch_input[:, ec_idx].unsqueeze(1), encoder_hidden)
#             encoder_outputs[ec_idx] = encoder_output
#             encoder_hiddens[ec_idx] = encoder_hidden
        
#         # Decode
#         decoder_input = torch.tensor([2] * batch_size, device=device) # SOS token 2
#         decoder_hidden = encoder_hidden
        
#         ## Print Value
#         sample_sentence = []
        
#         # Always use Teacher Forcing
#         for dc_idx in range(target_length):
#             decoder_output, decoder_hidden = decoder(decoder_input.unsqueeze(1), decoder_hidden)
#             decoder_output = decoder_output.squeeze(1).to(device) # get rid of the seq dimention
#             loss += criterion(decoder_output, batch_target[:, dc_idx])
#             decoder_input = batch_target[:, dc_idx]
            
#             ## Print Value
#             sample_sentence.append(torch.argmax(decoder_output[0]).item())
            
#         loss.backward()
#         total_avg_loss += loss.item() / target_length
        
#         encoder_optimizer.step()
#         decoder_optimizer.step()
        
# #         print('Training Loss: ', loss.item() / target_length)
        
#         ## Print Value
#         print("Predict: ", ids2sentence(sample_sentence, id2word_en_dic))
#         print("Actual: ", ids2sentence(batch_target[0].cpu().numpy(), id2word_en_dic))
        
#     return total_avg_loss