In [1]:
import pickle as pkl
import numpy as np
import gzip
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from utils import asMinutes, timeSince, load_zipped_pickle, corpus_bleu, directories
from langUtils import loadLangPairs, langDataset, langCollateFn, initHybridEmbeddings, tensorToList

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Import Dictionaries and Data

In [2]:
def load_zipped_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        loaded_object = pkl.load(f)
        return loaded_object

In [17]:
# Print sentence given numbers
def ids2sentence(sentence, dictionary):
    return ' '.join([dictionary[i] for i in sentence])
#ids2sentence(en_train_num[0], id2word_en_dic)

def add_symbol(id2word_dic, word2id_dic):
    symbols = ['<pad>', '<unk>', '<sos>', '<eos>']
    for i, symbol in enumerate(symbols):
        id2word_dic[i] = symbol
        word2id_dic[symbol] = i
    return id2word_dic, word2id_dic

In [None]:
# id2word_vi_dic = load_zipped_pickle("../embeddings/id2word_vi_dic.p")
# word2id_vi_dic = load_zipped_pickle("../embeddings/word2id_vi_dic.p")

# id2word_en_dic = load_zipped_pickle("../embeddings/id2word_en_dic.p")
# word2id_en_dic = load_zipped_pickle("../embeddings/word2id_en_dic.p")

# id2word_vi_dic, word2id_vi_dic = add_symbol(id2word_vi_dic, word2id_vi_dic)
# id2word_en_dic, word2id_en_dic = add_symbol(id2word_en_dic, word2id_en_dic)

# vi_train = load_zipped_pickle("../data/vi-en-tokens/train_vi_tok.p")
# en_train = load_zipped_pickle("../data/vi-en-tokens/train_en_tok.p") # Already Processed for symbols

# vi_train_num = load_zipped_pickle("../data/vi-en-tokens/train_vi_tok_num.p")
# en_train_num = load_zipped_pickle("../data/vi-en-tokens/train_en_tok_num.p") # Already Processed for symbols

In [3]:
vi, en = loadLangPairs("vi")
BATCH_SIZE = 2
train_dataset = langDataset([(vi.train_num[i], en.train_num[i]) for i in range(len(vi.train_num)) if (len(vi.train[i]) < vi.max_length) & (len(en.train[i]) < en.max_length)])
overfit_dataset = langDataset([(vi.train_num[i], en.train_num[i]) for i in range(2)])
train_loader = torch.utils.data.DataLoader(dataset=overfit_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=False)
dev_dataset = langDataset([(vi.dev_num[i], en.dev_num[i]) for i in range(len(vi.dev_num)) if (len(vi.dev[i]) < vi.max_length) & (len(en.dev[i]) < en.max_length)])
dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)

In [4]:
SPECIAL_SYMBOLS_ID = PAD_ID, UNK_ID, SOS_ID, EOS_ID = 0, 1, 2, 3

## Padding Data

### Sort by input data length

In [5]:
# def sort_by_length(data_input, target_data):
#     input_size = [len(data) for data in data_input]
#     size_index = np.argsort(input_size)
#     return list(np.array(data_input)[size_index]), list(np.array(target_data)[size_index])

# vi_train_num, en_train_num = sort_by_length(vi_train_num, en_train_num)

NameError: name 'vi_train_num' is not defined

### Padding Data given batch size

In [6]:
def pad(data, length):
    for i, line in enumerate(data):
        if len(line) < length:
            for i in range(len(line), length):
                line.append(0)
        else:
            data[i] = line[0:length]
    return data

# Return the batch data and target
def get_batch(i, batch_size, train_data, train_target):
    if i * batch_size > len(train_data):
        raise Exception('Incorrect batch index')
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_data = list(np.array(train_data)[start_idx:end_idx])
    batch_target = list(np.array(train_target)[start_idx:end_idx])
    batch_data = pad(batch_data, len(batch_data[batch_size - 1]))
    max_target = max([len(data) for data in batch_data])
    batch_target = pad(batch_target, max_target)
    return batch_data, batch_target

# get_batch(5, 64, vi_train_num, en_train_num)

## Models

### Encoder

In [7]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        # input_size: input dictionary size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.num_layers = num_layers
        self.gru = nn.GRU(hidden_size, 
                          hidden_size, 
                          num_layers= num_layers, 
                          batch_first = True) # BATCH FIRST

    def forward(self, encoder_input, hidden_input):
        # encoder_input: batch * 1 (for 1 word each time)
        embedded_input = self.embedding(encoder_input)
        # embedded_input: batch * 1 * emb_dim
        # hidden_input: batch * 1(layer) * hidden_size
        output, hidden = self.gru(embedded_input, hidden_input)
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=device)

EncoderRNN(5, 5, 4, 64)

EncoderRNN(
  (embedding): Embedding(5, 5)
  (gru): GRU(5, 5, num_layers=4, batch_first=True)
)

### Decoder

In [8]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers, batch_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        # output_size: input dictionary size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, 
                          hidden_size,
                          num_layers= num_layers, 
                          batch_first = True) # BATCH_FRIST
        self.out = nn.Linear(hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1) # Use cross entropy loss outside

    def forward(self, decoder_input, hidden_input):
        # decoder_input: batch * 1
        embedded_input = self.embedding(decoder_input)
        # embedded_input: batch * 1 * emb_dim
        embedded_input = F.relu(embedded_input)
        # hidden_input: batch * hidden_size
        output, hidden = self.gru(embedded_input, hidden_input)
        output = self.out(output)
        # output = self.softmax(output) # not using softmax
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=device)

DecoderRNN(10, 10, 4, 64)

DecoderRNN(
  (embedding): Embedding(10, 10)
  (gru): GRU(10, 10, num_layers=4, batch_first=True)
  (out): Linear(in_features=10, out_features=10, bias=True)
)

## Training

In [21]:
def train(inp, output, max_len, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size):
    total_avg_loss = 0
    loss = 0
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs = torch.zeros(max_len, batch_size, 1, 256, device=device)
    encoder_hiddens = torch.zeros(max_len, 1, batch_size, 256, device=device)

    # Encode
    for ec_idx in range(max_len):
        # input batch_size * 1
        encoder_output, encoder_hidden = encoder(inp[:, ec_idx].unsqueeze(1), encoder_hidden)
        encoder_outputs[ec_idx] = encoder_output
        encoder_hiddens[ec_idx] = encoder_hidden

    # Decode
    decoder_input = torch.tensor([SOS_ID] * batch_size, device=device)
    decoder_hidden = encoder_hidden

    ## Print Value
    sample_sentence = []

    # Always use Teacher Forcing
    for dc_idx in range(max_len):
        decoder_output, decoder_hidden = decoder(decoder_input.unsqueeze(1), decoder_hidden)
        decoder_output = decoder_output.squeeze(1).to(device) # get rid of the seq dimention
        loss += criterion(decoder_output, output[:, dc_idx])
        decoder_input = output[:, dc_idx]

        ## Print Value
        sample_sentence.append(torch.argmax(decoder_output[0]).item())

    loss.backward()
    total_avg_loss += loss.item() / max_len

    encoder_optimizer.step()
    decoder_optimizer.step()

    ## Print Value
    print("Predict: ", ids2sentence(sample_sentence, en.id2word))
    print("Actual: ", ids2sentence(output[0].cpu().numpy(), en.id2word))
        
    return total_avg_loss

In [38]:
def bleuEval(data_loader, batch_size):
    with torch.no_grad():
        true_outputs = []
        decoder_outputs = []
        for i, (inp, inp_lens, output, out_mask, max_len) in enumerate(train_loader):
            inp = inp.transpose(0,1).to(device)
            output = output.transpose(0,1).to(device)
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_len, batch_size, 1, 256, device=device)
            encoder_hiddens = torch.zeros(max_len, 1, batch_size, 256, device=device)

            # Encode
            for ec_idx in range(max_len):
                # input batch_size * 1
                encoder_output, encoder_hidden = encoder(inp[:, ec_idx].unsqueeze(1), encoder_hidden)
                encoder_outputs[ec_idx] = encoder_output
                encoder_hiddens[ec_idx] = encoder_hidden

            # Decode
            decoder_input = torch.tensor([SOS_ID] * batch_size, device=device)
            decoder_hidden = encoder_hidden

            ## Print Value
            sample_sentence = []

            # Greedy
            for dc_idx in range(max_len):
                decoder_output, decoder_hidden = decoder(decoder_input.unsqueeze(1), decoder_hidden)
                decoder_output = decoder_output.squeeze(1).to(device) # get rid of the seq dimention
                topv, topi = decoder_output.topk(1)
                decoder_input = torch.LongTensor([[topi[i][0] for i in range(inp.size(0))]]).to(device)

                ## Print Value
                sample_sentence.append(torch.argmax(decoder_output[0]).item())

            ## Print Value
            print("Predict: ", ids2sentence(sample_sentence, en.id2word))
            print("Actual: ", ids2sentence(output[0].cpu().numpy(), en.id2word))

        return total_avg_loss

In [30]:
def fit(train_loader, encoder, decoder, encoder_opt, decoder_opt, criterion, batch_size, epochs):
    for epoch in range(epochs):
        for i, (inp, inp_lens, output, out_mask, max_len) in enumerate(train_loader):
            inp.transpose_(0,1)
            output.transpose_(0,1)
            inp = inp.to(device)
            output = output.to(device)
            loss = train(inp, output, max_len, encoder, decoder, encoder_opt, decoder_opt, criterion, batch_size)
        train_score = bleuEval(train_loader, batch_size)
        print("Loss: ", loss)
#         train_score = eval(train_data, target_data, encoder, decoder, )

In [15]:
# dic_size_vi = len(id2word_vi_dic.keys())
# dic_size_en = len(id2word_en_dic.keys())
hidden_size = 256
learning_rate = 0.01
BATCH_SIZE = 2

## Add ignore index
criterion = nn.CrossEntropyLoss()

encoder = EncoderRNN(input_size = vi.n_words, hidden_size = hidden_size, num_layers = 1, batch_size = BATCH_SIZE).to(device)
decoder = DecoderRNN(hidden_size = hidden_size, output_size = en.n_words, num_layers = 1, batch_size = BATCH_SIZE).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

In [40]:
%debug

> [0;32m<ipython-input-38-27d84892d6fa>[0m(31)[0;36mbleuEval[0;34m()[0m
[0;32m     29 [0;31m                [0mdecoder_output[0m[0;34m,[0m [0mdecoder_hidden[0m [0;34m=[0m [0mdecoder[0m[0;34m([0m[0mdecoder_input[0m[0;34m.[0m[0munsqueeze[0m[0;34m([0m[0;36m1[0m[0;34m)[0m[0;34m,[0m [0mdecoder_hidden[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     30 [0;31m                [0mdecoder_output[0m [0;34m=[0m [0mdecoder_output[0m[0;34m.[0m[0msqueeze[0m[0;34m([0m[0;36m1[0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mdevice[0m[0;34m)[0m [0;31m# get rid of the seq dimention[0m[0;34m[0m[0m
[0m[0;32m---> 31 [0;31m                [0;36m1[0m[0;34m/[0m[0;36m0[0m[0;34m[0m[0m
[0m[0;32m     32 [0;31m                [0mprint[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0mshape[0m[0;34m([0m[0mdecoder_output[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     33 [0;31m                [0mprint[0m[0;34m([0m[0mdecoder_output[

ipdb>  decoder_output


tensor([[ 4.9296, 12.6264, 25.1569,  ..., -6.4543, -6.5707, -6.9671],
        [ 5.5839, 18.3771, 25.3459,  ..., -7.3874, -7.5899, -8.2216]],
       device='cuda:0')


ipdb>  np.shape(decoder_output)


torch.Size([2, 100004])


ipdb>  decoder_output.topk(1)


(tensor([[25.1569],
        [25.3459]], device='cuda:0'), tensor([[2],
        [2]], device='cuda:0'))


ipdb>  decoder_output.topk(10)


(tensor([[25.1569, 18.4606, 13.4730, 12.6264, 10.6833,  9.8819,  8.6341,  8.5149,
          8.4437,  7.9167],
        [25.3459, 18.3771, 13.8979, 12.9491, 12.2044,  9.8487,  9.1137,  9.0788,
          8.4472,  7.9396]], device='cuda:0'), tensor([[    2,    10,   227,     1,  1542, 33849,     4, 11333, 26922,     5],
        [    2,     1, 33849,    10,    13,   227,     5,  2251,     4,    71]],
       device='cuda:0'))


ipdb>  np.shape(output[:])


torch.Size([2, 52])


ipdb>  _, test_i = decoder_output.topk(10)
ipdb>  test_i[0][0]


tensor(2, device='cuda:0')


ipdb>  test_i[1][0]


tensor(2, device='cuda:0')


ipdb>  test_i[2][0]


*** IndexError: index 2 is out of bounds for dimension 0 with size 2


ipdb>  inp.size(1)


80


ipdb>  inp.size(0)


2


ipdb>  test_i


tensor([[    2,    10,   227,     1,  1542, 33849,     4, 11333, 26922,     5],
        [    2,     1, 33849,    10,    13,   227,     5,  2251,     4,    71]],
       device='cuda:0')


ipdb>  test_i[0][0]


tensor(2, device='cuda:0')


ipdb>  test_i[1][0]


tensor(2, device='cuda:0')


ipdb>  test_i[2][0]


*** IndexError: index 2 is out of bounds for dimension 0 with size 2


ipdb>  exit


In [39]:
fit(train_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, BATCH_SIZE, 100)

Predict:  <sos> in 4 minutes , atmospheric chemist <unk> pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule . <eos>
Actual:  <sos> in 4 minutes , atmospheric chemist <unk> pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule . <eos>


ZeroDivisionError: division by zero

In [12]:
# fit(overfit_vi_train, overfit_en_train, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, BATCH_SIZE, 100)

Predict:  ['<sos>', '<sos>', '<sos>', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  8.810749053955078
Predict:  ['<sos>', '<sos>', 'the', 'the', 'the', 'two', 'the', 'the', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  4.840537643432617
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'the', 'the', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  1.9151222229003906
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  0.9088132858276368
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Loss:  0.5086045742034913
P

KeyboardInterrupt: 

In [46]:
# for i in range(1000):
#     train(overfit_vi_train, overfit_en_train, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, BATCH_SIZE)

Training Loss:  8.852307891845703
Predict:  ['<sos>', '<sos>', 'the', 'just', 'these', 'the', 'notes', 'the', 'the', 'the']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  4.9451759338378904
Predict:  ['<sos>', '<sos>', 'coming', 'just', 'these', 'notes', 'notes', 'the', 'the', 'the']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  1.9719776153564452
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'notes', 'the', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  0.9234449386596679
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'freaky']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Training Loss:  0.5161348819732666
Predict:  ['<sos>', 'it', 's', 'just', 'thes

KeyboardInterrupt: 

In [None]:
# def train(train_input, train_target, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size):
#     # Batch
#     total_avg_loss = 0
#     for i in range(len(train_input) // batch_size):
#         loss = 0
#         encoder_hidden = encoder.initHidden()
        
#         batch = get_batch(i, batch_size, train_input, train_target)
#         # size batch_size * seq_length
#         batch_input = torch.tensor(batch[0], device=device)
#         batch_target = torch.tensor(batch[1], device=device)
#         input_length = batch_input.shape[1] ## should be seq length
#         target_length = batch_target.shape[1]
#         print(input_length, target_length)

#         encoder_optimizer.zero_grad()
#         decoder_optimizer.zero_grad()
        
#         encoder_outputs = torch.zeros(input_length, batch_size, 1, 256, device=device)
#         encoder_hiddens = torch.zeros(input_length, 1, batch_size, 256, device=device)
        
#         # Encode
#         for ec_idx in range(input_length):
#             # input batch_size * 1
#             encoder_output, encoder_hidden = encoder(batch_input[:, ec_idx].unsqueeze(1), encoder_hidden)
#             encoder_outputs[ec_idx] = encoder_output
#             encoder_hiddens[ec_idx] = encoder_hidden
        
#         # Decode
#         decoder_input = torch.tensor([2] * batch_size, device=device) # SOS token 2
#         decoder_hidden = encoder_hidden
        
#         ## Print Value
#         sample_sentence = []
        
#         # Always use Teacher Forcing
#         for dc_idx in range(target_length):
#             decoder_output, decoder_hidden = decoder(decoder_input.unsqueeze(1), decoder_hidden)
#             decoder_output = decoder_output.squeeze(1).to(device) # get rid of the seq dimention
#             loss += criterion(decoder_output, batch_target[:, dc_idx])
#             decoder_input = batch_target[:, dc_idx]
            
#             ## Print Value
#             sample_sentence.append(torch.argmax(decoder_output[0]).item())
            
#         loss.backward()
#         total_avg_loss += loss.item() / target_length
        
#         encoder_optimizer.step()
#         decoder_optimizer.step()
        
# #         print('Training Loss: ', loss.item() / target_length)
        
#         ## Print Value
#         print("Predict: ", ids2sentence(sample_sentence, id2word_en_dic))
#         print("Actual: ", ids2sentence(batch_target[0].cpu().numpy(), id2word_en_dic))
        
#     return total_avg_loss