In [1]:
import pickle as pkl
import numpy as np
import gzip
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import time
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Import Dictionaries and Data

In [2]:
def load_zipped_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        loaded_object = pkl.load(f)
        return loaded_object

In [3]:
# Print sentence given numbers
def ids2sentence(sentence, dictionary):
    return [dictionary[i] for i in sentence]
#ids2sentence(en_train_num[0], id2word_en_dic)

def add_symbol(id2word_dic, word2id_dic):
    symbols = ['<pad>', '<unk>', '<sos>', '<eos>']
    for i, symbol in enumerate(symbols):
        id2word_dic[i] = symbol
        word2id_dic[symbol] = i
    return id2word_dic, word2id_dic

id2word_vi_dic = load_zipped_pickle("../embeddings/id2word_vi_dic.p")
word2id_vi_dic = load_zipped_pickle("../embeddings/word2id_vi_dic.p")

id2word_en_dic = load_zipped_pickle("../embeddings/id2word_en_dic.p")
word2id_en_dic = load_zipped_pickle("../embeddings/word2id_en_dic.p")

id2word_vi_dic, word2id_vi_dic = add_symbol(id2word_vi_dic, word2id_vi_dic)
id2word_en_dic, word2id_en_dic = add_symbol(id2word_en_dic, word2id_en_dic)

vi_train = load_zipped_pickle("../data/vi-en-tokens/train_vi_tok.p")
en_train = load_zipped_pickle("../data/vi-en-tokens/train_en_tok.p") # Already Processed for symbols

vi_train_num = load_zipped_pickle("../data/vi-en-tokens/train_vi_tok_num.p")
en_train_num = load_zipped_pickle("../data/vi-en-tokens/train_en_tok_num.p") # Already Processed for symbols

## Padding Data

### Sort by input data length

In [4]:
def sort_by_length(data_input, target_data):
    input_size = [len(data) for data in data_input]
    size_index = np.argsort(input_size)
    return list(np.array(data_input)[size_index]), list(np.array(target_data)[size_index])

vi_train_num, en_train_num = sort_by_length(vi_train_num, en_train_num)

### Padding Data given batch size

In [5]:
def pad(data, length, max_length):
    # Cap maximum length at 100
    length = min(max_length, length)
    for i, line in enumerate(data):
        if len(line) < length:
            for i in range(len(line), length):
                line.append(0)
        else:
            data[i] = line[0:length]
    return data

# Return the batch data and target
def get_batch(i, batch_size, train_data, train_target, max_length):
    if i * batch_size > len(train_data):
        raise Exception('Incorrect batch index')
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_data = list(np.array(train_data)[start_idx:end_idx])
    batch_target = list(np.array(train_target)[start_idx:end_idx])
    batch_data = pad(batch_data, len(batch_data[batch_size - 1]), max_length)
    max_target = max([len(data) for data in batch_data])
    batch_target = pad(batch_target, max_target, max_length)
    return batch_data, batch_target

# get_batch(5, 64, vi_train_num, en_train_num, 100)

## Models

### Encoder

In [6]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        # input_size: input dictionary size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.num_layers = num_layers
        self.gru = nn.GRU(hidden_size, 
                          hidden_size, 
                          num_layers= num_layers, 
                          batch_first = True) # BATCH FIRST

    def forward(self, encoder_input, hidden_input):
        # encoder_input: batch * 1 (for 1 word each time)
        embedded_input = self.embedding(encoder_input)
        # embedded_input: batch * 1 * emb_dim
        # hidden_input: batch * 1(layer) * hidden_size
        output, hidden = self.gru(embedded_input, hidden_input)
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=device)

### CNN Encoder

In [23]:
class ConvEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, max_len, dropout=0.2,
                 num_channels_attn=512, num_channels_conv=512,
                 kernel_size=3, num_layers=5):
        super(ConvEncoder, self).__init__()
        self.hidden_size = embedding_size
        self.position_embedding = nn.Embedding(max_len, embedding_size)
        self.word_embedding = nn.Embedding(vocab_size, embedding_size)
        self.num_layers = num_layers
        self.dropout = dropout

        self.conv = nn.ModuleList([nn.Conv1d(num_channels_conv, num_channels_conv, kernel_size,
                                      padding=kernel_size // 2) for _ in range(num_layers)])

    def forward(self, position_ids, sentence_as_wordids):
        # Retrieving position and word embeddings
        position_embedding = self.position_embedding(position_ids)
        word_embedding = self.word_embedding(sentence_as_wordids)
        
        # Applying dropout to the sum of position + word embeddings
        embedded = F.dropout(position_embedding + word_embedding, self.dropout, self.training)
        
        # Transform the input to be compatible for Conv1d as follows
        # Length * Channel ==> Num Batches * Channel * Length
        embedded = torch.unsqueeze(embedded.transpose(0, 1), 0)
        
        # Successive application of convolution layers followed by residual connection
        # and non-linearity
        
        cnn = embedded
        for i, layer in enumerate(self.conv):
            # layer(cnn) is the convolution operation on the input cnn after which
            # we add the original input creating a residual connection
            print(cnn.shape)
            cnn = F.tanh(layer(cnn)+cnn)        
        print(cnn.shape)
        return cnn

### Decoder

In [8]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers, batch_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        # output_size: input dictionary size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, 
                          hidden_size,
                          num_layers= num_layers, 
                          batch_first = True) # BATCH_FRIST
        self.out = nn.Linear(hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1) # Use cross entropy loss outside

    def forward(self, decoder_input, hidden_input):
        # decoder_input: batch * 1
        embedded_input = self.embedding(decoder_input)
        # embedded_input: batch * 1 * emb_dim
        embedded_input = F.relu(embedded_input)
        # hidden_input: batch * hidden_size
        output, hidden = self.gru(embedded_input, hidden_input)
        output = self.out(output)
        # output = self.softmax(output) # not using softmax
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=device)

### Decoder with Attention

In [9]:
class AttentionDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers, max_length, batch_size, dropout_p=0.1):
        super(AttentionDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        # Max length for a sentence
        self.max_length = max_length
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          num_layers= num_layers, 
                          batch_first = True) # BATCH_FRIST)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, decoder_input, hidden_input, encoder_hiddens):
        # hidden_input: 1 * batch * hidden_size
        hidden_input = hidden_input.squeeze(0)
        # decoder_input: batch * 1
        embedded_input = self.embedding(decoder_input)
        # embedded_input: batch * 1 * embed_size
        embedded_input = self.dropout(embedded_input).squeeze(1)
        
        # embedded_input: batch * embed_size
        # hidden_input: batch * hidden_size 
        # (Use input and newest hidden to decide which encoder hidden is important)
        attn_weights = F.softmax(self.attn(torch.cat((embedded_input, hidden_input), 1)), dim=1).unsqueeze(1)
        # encoder_output: max_length * batch * encoder_hidden_size
        encoder_hiddens_t = encoder_hiddens.transpose(0, 1)
        # attn_weights: batch * 1 * max_length(theoretical)
        cropped_attn_weights = attn_weights[:, :, :encoder_hiddens_t.shape[1]]
        # cropped_attn_weights: batch * 1 * max_length(actual)
        # encoder_hiddens_t: batch * max_length(actual) * encoder_hidden_size
        ## 
        attn_applied = torch.bmm(cropped_attn_weights, encoder_hiddens_t).squeeze(1)
        
        # embedded_input: batch * embed_size
        # attn_applied: batch * encoder_hidden_size
        output = torch.cat((embedded_input, attn_applied), 1)
        output = self.attn_combine(output)
        
        # output: batch * hidden_size
        gru_input = F.relu(output).unsqueeze(1)
        # hidden_input: batch * hidden_size
        hidden_input = hidden_input.unsqueeze(0)
        # gru_input: batch * 1 * hidden_size
        # hidden_input: 1 * batch * hidden_size
        output, hidden = self.gru(gru_input, hidden_input)
        output = self.out(output)
        #output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(self.num_layers,  self.batch_size, self.hidden_size, device=device)

## Training

In [20]:
def train(train_input, train_target, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, max_length, use_cnn, use_attention):
    
    start_time = time.time()
    
    # Batch
    for i in range(len(train_input) // batch_size):
        loss = 0
        
        batch = get_batch(i, batch_size, train_input, train_target, max_length)
        # size batch_size * seq_length
        batch_input = torch.tensor(batch[0], device=device)
        batch_target = torch.tensor(batch[1], device=device)
        input_length = batch_input.shape[1]
        target_length = batch_target.shape[1]

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        encoder_outputs = torch.zeros(input_length, batch_size, 1, encoder.hidden_size, device=device)
        encoder_hiddens = torch.zeros(input_length, 1, batch_size, encoder.hidden_size, device=device)
        
        # Encode
        encoder_hidden = None
        
        if use_cnn:
            position_ids = torch.LongTensor(range(0, input_length), device=device)
            #encoder_outputs = encoder()
            encoder_hiddens = encoder(position_ids, batch_input)
        else:
            encoder_hidden = encoder.initHidden()
            for ec_idx in range(input_length):
                # input batch_size * 1
                encoder_output, encoder_hidden = encoder(batch_input[:, ec_idx].unsqueeze(1), encoder_hidden)
                encoder_outputs[ec_idx] = encoder_output
                encoder_hiddens[ec_idx] = encoder_hidden
        
        # Decode
        decoder_hidden = None
        if use_cnn:
            decoder_hidden = decoder.initHidden()
        else:
            decoder_hidden = encoder_hidden
        decoder_input = torch.tensor([2] * batch_size, device=device) # SOS token 2
        
        
        ## Print Value
        sample_sentence = []
        
        # Always use Teacher Forcing
        for dc_idx in range(target_length):
            if use_attention: 
                decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input.unsqueeze(1), decoder_hidden, encoder_hiddens.squeeze(1))
            else:
                decoder_output, decoder_hidden = decoder(decoder_input.unsqueeze(1), decoder_hidden)
            decoder_output = decoder_output.squeeze(1) # get rid of the seq dimention
            loss += criterion(decoder_output, batch_target[:, dc_idx])
            decoder_input = batch_target[:, dc_idx]
            
            if i % 1 == 0:
                ## Print Value
                sample_sentence.append(torch.argmax(decoder_output[0]).item())
            
        loss.backward()
        
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        if i % 1 == 0:
            s = int(time.time() - start_time)
            m = math.floor(s / 60)
            s = s - m * 60
            print('Time: ', m, 'mins', s, 'seconds' , ' Training Loss: ', loss.item() / target_length, 'Progress: ', round(i / (len(train_input) // batch_size) * 100, 2), '%')
            if i % 1 == 0:
                print("Predict: ", ids2sentence(sample_sentence, id2word_en_dic))
                print("Actual: ", ids2sentence(batch_target[0].cpu().numpy(), id2word_en_dic))
        
    print('Training Complete')

## Encoder Decoder Baseline

dic_size_vi = len(id2word_vi_dic.keys())
dic_size_en = len(id2word_en_dic.keys())
hidden_size = 256
learning_rate = 0.01
batch_size = 64
max_length = 100

## Add ignore index
criterion = nn.CrossEntropyLoss()

#encoder = EncoderRNN(input_size = dic_size_vi, hidden_size = hidden_size, num_layers = 1, batch_size = batch_size).to(device)
#decoder = DecoderRNN(hidden_size = hidden_size, output_size = dic_size_en, num_layers = 1, batch_size = batch_size).to(device)

encoder = pkl.load(open("./model/encoder.p", "rb"))
decoder = pkl.load(open("./model/decoder.p", "rb"))

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

for i in range(50):
    train(vi_train_num, en_train_num, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, max_length)
    if i % 2 == 0:
        pkl.dump(encoder, open("./model/encoder.p", "wb"))
        pkl.dump(decoder, open("./model/decoder.p", "wb"))

## Encoder Decoder with Attention

In [30]:
dic_size_vi = len(id2word_vi_dic.keys())
dic_size_en = len(id2word_en_dic.keys())
hidden_size = 256
learning_rate = 0.01
batch_size = 2
max_length = 100

## Add ignore index
criterion = nn.CrossEntropyLoss()

encoder = EncoderRNN(input_size = dic_size_vi, hidden_size = hidden_size, num_layers = 1, batch_size = batch_size).to(device)
decoder = AttentionDecoderRNN(hidden_size = hidden_size, output_size = dic_size_en, num_layers = 1, max_length = max_length, batch_size = batch_size, dropout_p=0.1).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

In [31]:
overfit_vi_train = vi_train_num[10000:10002]
overfit_en_train = en_train_num[10000:10002]

for i in range(100):
    train(overfit_vi_train, overfit_en_train, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, max_length = max_length,use_attention = True)

Time:  0 mins 0 seconds  Training Loss:  11.510273742675782 Progress:  0.0 %
Predict:  ['thrash', 'physiotherapist', 'growths', 'benefice', 'logical', 'riverine', 'sunscreens', 'Fergus', 'ripping', 'vain']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  9.280722045898438 Progress:  0.0 %
Predict:  ['<sos>', '<sos>', '<sos>', '<sos>', 'these', 'notes', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  4.904550170898437 Progress:  0.0 %
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'notes', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  1.6712812423706054 Progress:  0.0 %
Predict:  ['<sos>', '<sos>', 's', 'just', 'these', 'two', 'notes

Time:  0 mins 0 seconds  Training Loss:  0.0007944107055664062 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  0.0007666587829589844 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  0.0007195472717285156 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  0.0006924629211425781 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', '

Time:  0 mins 0 seconds  Training Loss:  0.0003936767578125 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  0.0003780364990234375 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  0.0003757476806640625 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete
Time:  0 mins 0 seconds  Training Loss:  0.00037059783935546877 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it

Time:  0 mins 0 seconds  Training Loss:  0.0003008842468261719 Progress:  0.0 %
Predict:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Actual:  ['<sos>', 'it', 's', 'just', 'these', 'two', 'notes', 'in', 'the', 'middle']
Training Complete


In [13]:
# overfit_vi_train = vi_train_num[10000:10002]
# overfit_en_train = en_train_num[10000:10002]

# for i in range(100):
#     train(overfit_vi_train, overfit_en_train, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size)

In [25]:
dic_size_vi = len(id2word_vi_dic.keys())
dic_size_en = len(id2word_en_dic.keys())
hidden_size = 256
learning_rate = 0.01
batch_size = 2
max_length = 100

## Add ignore index
criterion = nn.CrossEntropyLoss()

encoder = ConvEncoder(vocab_size = dic_size_vi, embedding_size = hidden_size, max_len=max_length, dropout=0.2, num_channels_attn=512, num_channels_conv=512)
decoder = AttentionDecoderRNN(hidden_size = hidden_size, output_size = dic_size_en, num_layers = 1, max_length = max_length, batch_size = batch_size, dropout_p=0.1).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

In [26]:
overfit_vi_train = vi_train_num[10000:10002]
overfit_en_train = en_train_num[10000:10002]

for i in range(100):
    train(overfit_vi_train, overfit_en_train, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, max_length = max_length, use_cnn= True, use_attention = True)

torch.Size([1, 10, 2, 256])


RuntimeError: Expected 3-dimensional input for 3-dimensional weight [512, 512, 3], but got input of size [1, 10, 2, 256] instead