In [19]:
import BasicModel
import pickle as pkl
import numpy as np
import gzip
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time

from utils import asMinutes, timeSince, load_zipped_pickle, corpus_bleu, directories
from langUtils import loadLangPairs, langDataset, langCollateFn, initHybridEmbeddings, tensorToList

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

zh, en = loadLangPairs("zh")
BATCH_SIZE = 32
train_dataset = langDataset([(zh.train_num[i], en.train_num[i]) for i in range(len(zh.train_num)) if (len(zh.train[i]) < zh.max_length) & (len(en.train[i]) < en.max_length)])
# overfit_dataset = langDataset([(zh.train_num[i], en.train_num[i]) for i in range(32)])
# overfit_loader = torch.utils.data.DataLoader(dataset=overfit_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=langCollateFn,
#                                            shuffle=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)
dev_dataset = langDataset([(zh.dev_num[i], en.dev_num[i]) for i in range(len(zh.dev_num)) if (len(zh.dev[i]) < zh.max_length) & (len(en.dev[i]) < en.max_length)])
dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)

In [20]:
SPECIAL_SYMBOLS_ID = PAD_ID, UNK_ID, SOS_ID, EOS_ID = 0, 1, 2, 3

In [21]:
HIDDEN_SIZE = 128
learning_rate = 0.01

## Add ignore index
zh_criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)

zh_encoder = BasicModel.EncoderRNN(input_size = zh.n_words, hidden_size = HIDDEN_SIZE, num_layers = 1, batch_size = BATCH_SIZE, raw_emb=zh.emb, learn_ids=zh.learn_ids).to(device)
zh_decoder = BasicModel.DecoderRNN(hidden_size = HIDDEN_SIZE, output_size = en.n_words, num_layers = 1, batch_size = BATCH_SIZE, raw_emb=en.emb, learn_ids=en.learn_ids).to(device)

zh_encoder_optimizer = optim.SGD(zh_encoder.parameters(), lr=learning_rate)
zh_decoder_optimizer = optim.SGD(zh_decoder.parameters(), lr=learning_rate)

In [22]:
def fit(train_loader, dev_loader, encoder, decoder, encoder_opt, decoder_opt, criterion, batch_size, epochs, print_every, hidden_size):
    start = time.time()
    print('Initializing Model Training + Eval...')
    losses = []
    train_scores = []
    dev_scores = []
    for epoch in range(epochs):
        loss = 0
        for i, (inp, inp_lens, output, out_mask, out_max) in enumerate(train_loader):
            if (len(inp[0]) != batch_size):
                continue
            inp.transpose_(0,1)
            output.transpose_(0,1)
            inp = inp.to(device)
            output = output.to(device)
            loss += BasicModel.train(inp, output, out_max, encoder, decoder, encoder_opt, decoder_opt, criterion, batch_size, hidden_size)
            if i % print_every == 0 and i > 0:
                losses.append(loss/i)
                print("Time Elapsed: {} | Loss: {:.4}".format(asMinutes(time.time() - start),
                                                                                loss/i))
                pkl.dump(encoder, open("./zh-g-base-encoder-sgd0.01.p", "wb"))
                pkl.dump(decoder, open("./zh-g-base-decoder-sgd0.01.p", "wb"))
        train_score = BasicModel.bleuEval(encoder, decoder, train_loader, batch_size, hidden_size)
        train_scores.append(train_score)
        print("Epoch: {} | Time Elapsed: {} | Loss: {:.4} | Train BLEU: {:.4}".format(epoch + 1, 
                                                                                                        asMinutes(time.time() - start),
                                                                                                        loss/len(train_loader), 
                                                                                                        train_score)) 


In [23]:
def bleuEval(encoder, decoder, data_loader, batch_size):
    with torch.no_grad():
        true_outputs = []
        decoder_outputs = []
        for i, (inp, inp_lens, output, out_mask, out_max) in enumerate(data_loader):
            if i * batch_size >= 10000 or len(inp[0]) != batch_size:
                continue
            inp = inp.transpose(0,1).to(device)
            output = output.transpose(0,1).to(device)
            true_outputs.append([[str(tok.item()) for tok in out if tok != 0] for out in output])
            encoder_hidden = encoder.initHidden()
            input_len = inp.shape[1]
            encoder_outputs = torch.zeros(input_len, batch_size, 1, HIDDEN_SIZE, device=device)
            encoder_hiddens = torch.zeros(input_len, 1, batch_size, HIDDEN_SIZE, device=device)

            # Encode
            for ec_idx in range(input_len):
                # input batch_size * 1
                encoder_output, encoder_hidden = encoder(inp[:, ec_idx].unsqueeze(1), encoder_hidden)
                encoder_outputs[ec_idx] = encoder_output
                encoder_hiddens[ec_idx] = encoder_hidden

            # Decode
            decoder_input = torch.tensor([SOS_ID] * batch_size, device=device)
            decoder_hidden = encoder_hidden

            # Greedy
            for dc_idx in range(out_max):
                decoder_output, decoder_hidden = decoder(decoder_input.unsqueeze(1), decoder_hidden)
                decoder_output = decoder_output.squeeze(1).to(device) # get rid of the seq dimention
                topv, topi = decoder_output.topk(1)
                decoder_input = torch.LongTensor([topi[i][0] for i in range(inp.size(0))]).to(device)
                ## Print Value
                decoder_outputs.append(list(decoder_input.cpu().numpy()))
            ## Print Value
        predict = []
        for seq in np.array(decoder_outputs).T.astype(str):
            seq_toks = []
            for tok in seq:
                seq_toks.append(tok)
                if tok == '3':
                    break
            predict.append(seq_toks)
#         print('Sample True: ', ' '.join([en.id2word[int(i)] for i in true_outputs[0][0]]))
#         print('Sample Predicted: ', ' '.join([en.id2word[int(i)] for i in predict[0]]))
#         for seq in predict:
#             print('Sample Predicted: ', ' '.join([en.id2word[int(i)] for i in seq]))
        bleu_score = corpus_bleu(predict, true_outputs, 4)
        return bleu_score

In [24]:
vi, en = loadLangPairs("vi")
BATCH_SIZE = 32
train_dataset = langDataset([(vi.train_num[i], en.train_num[i]) for i in range(len(vi.train_num)) if (2 < len(vi.train[i]) < vi.max_length) & (2 < len(en.train[i]) < en.max_length)])
overfit_dataset = langDataset([(vi.train_num[i], en.train_num[i]) for i in range(32)])
overfit_loader = torch.utils.data.DataLoader(dataset=overfit_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)
dev_dataset = langDataset([(vi.dev_num[i], en.dev_num[i]) for i in range(len(vi.dev_num)) if (2 < len(vi.dev[i]) < vi.max_length) & (2 < len(en.dev[i]) < en.max_length)])
dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)

In [25]:
encoder = pkl.load(open('hybrid-vi-encoder.p', 'rb'))
decoder = pkl.load(open('hybrid-vi-decoder.p', 'rb'))

AttributeError: Can't get attribute 'EncoderRNN' on <module '__main__'>

In [None]:
bleuEval(encoder, decoder, dev_loader, BATCH_SIZE)

In [None]:
bleuEvalAttention(encoder, decoder, dev_loader, BATCH_SIZE)

In [None]:
fit(train_loader, dev_loader, zh_encoder, zh_decoder, zh_encoder_optimizer, zh_decoder_optimizer, zh_criterion, BATCH_SIZE, 15, 300, HIDDEN_SIZE)

Initializing Model Training + Eval...
Time Elapsed: 4m 38s | Loss: 6.943
Time Elapsed: 9m 13s | Loss: 6.334
Time Elapsed: 13m 51s | Loss: 6.062
Time Elapsed: 18m 27s | Loss: 5.899
Time Elapsed: 23m 4s | Loss: 5.785
Time Elapsed: 27m 40s | Loss: 5.693
Time Elapsed: 32m 16s | Loss: 5.614
Time Elapsed: 36m 54s | Loss: 5.549
Time Elapsed: 41m 35s | Loss: 5.494
Time Elapsed: 46m 13s | Loss: 5.444
Time Elapsed: 50m 49s | Loss: 5.401
Time Elapsed: 55m 25s | Loss: 5.363
Time Elapsed: 60m 4s | Loss: 5.328
Time Elapsed: 64m 41s | Loss: 5.296
Time Elapsed: 69m 17s | Loss: 5.267
Time Elapsed: 73m 55s | Loss: 5.24
Time Elapsed: 78m 29s | Loss: 5.214
Time Elapsed: 83m 4s | Loss: 5.191
Epoch: 1 | Time Elapsed: 90m 44s | Loss: 5.187 | Train BLEU: 2.168
Time Elapsed: 93m 54s | Loss: 4.779
Time Elapsed: 97m 10s | Loss: 4.767
Time Elapsed: 100m 25s | Loss: 4.757
Time Elapsed: 104m 14s | Loss: 4.749
Time Elapsed: 108m 50s | Loss: 4.744
Time Elapsed: 113m 25s | Loss: 4.736
Time Elapsed: 118m 3s | Loss: 4.7

In [None]:
# ^output got cut off due to accidentally exiting jupyter notebook