In [1]:
import load_data
import argparse
import rnn_models
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torchtext import data
from collections import defaultdict
import numpy as np

In [2]:
# Declare the three integer preprocessing options together with their defaults.
# The visible notebook never calls `parse_args`; the `Args` class defined below
# carries the same default values.
parser = argparse.ArgumentParser(description='Testing')
parser.add_argument(
    "--max_sentence_length",
    type=int,
    default=50,
    help="maximum sentence length",
)
parser.add_argument(
    "--min_freq",
    type=int,
    default=3,
    help="filter out tokens less than min frequency",
)
parser.add_argument(
    "--max_vocab_size",
    type=int,
    default=100000,
    help="at most n tokens in vocabulary",
)

_StoreAction(option_strings=['--max_vocab_size'], dest='max_vocab_size', nargs=None, const=None, default=100000, type=<class 'int'>, choices=None, help='at most n tokens in vocabulary', metavar=None)

In [3]:
class Args:
    """Hard-coded stand-in for parsed command-line arguments.

    Every setting is a plain class attribute, so it can be read either from
    the class itself or from an instance such as the module-level `args`.
    """

    # --- data / vocabulary ---
    data = 'data'  # presumably the dataset directory; confirm against load_data
    max_sentence_length = 50
    min_freq = 3
    max_vocab_size = 100000

    # --- model architecture ---
    embedding_size = 256
    hidden_size = 256
    bidirectional = True
    num_encoder_layers = 2
    num_decoder_layers = 2
    attn_model = 'default'

    # --- optimisation ---
    lr = 1e-3
    epochs = 5
    batch_size = 32


# Everything in this notebook runs on the CPU.
device = 'cpu'
args = Args()

In [4]:
# Load the train / validation / test splits together with the source-side and
# target-side objects (`src`, `trg`) whose `.vocab` is used later on.
# As the cell output shows, this call also prints the ten most common tokens
# and the vocabulary size for each side.
train_data, val_data, test_data, src, trg = load_data.load_data(args)

most common source vocabs: [(',', 128638), ('.', 120849), ('là', 51451), ('và', 47993), ('một', 40378), ('tôi', 38381), ('những', 37809), ('của', 36330), ('có', 26166), ('bạn', 26111)]
source vocab size: 20125
most common english vocabs: [(',', 156165), ('.', 132505), ('the', 109723), ('and', 79673), ('to', 65979), ('of', 60510), ('a', 55374), ('that', 49320), ('i', 43629), ('in', 41318)]
english vocab size: 22443


In [5]:
# Index of the padding token and vocabulary size on each side.
src_padding_idx = src.vocab.stoi['<pad>']
trg_padding_idx = trg.vocab.stoi['<pad>']

src_vocab_size = len(src.vocab)
trg_vocab_size = len(trg.vocab)

encoder = rnn_models.Encoder(args, src_padding_idx, src_vocab_size)
decoder = rnn_models.LuongAttnDecoderRNN(args, trg_padding_idx, trg_vocab_size)


def _init_weights(net):
    """Re-initialise the parameters of `net` in place.

    * every parameter whose name contains 'bias' is set to 0;
    * every parameter whose name contains 'weight' and that has at least two
      dimensions gets Xavier-normal initialisation (`xavier_normal_` raises on
      tensors with fewer than two dimensions, so those keep their default
      initialisation instead of crashing the cell);
    * the `padding_idx` row of every `nn.Embedding` is reset to zero afterwards,
      because the Xavier pass above would otherwise overwrite it.

    NOTE(review): the last step assumes the models build their embeddings with
    `nn.Embedding(..., padding_idx=...)` (the constructors do receive the pad
    index, but `rnn_models` is not visible here). If they do not, it is a no-op.
    """
    for name, param in net.named_parameters():
        if 'bias' in name:
            nn.init.constant_(param, 0.0)
        elif 'weight' in name and param.dim() >= 2:
            nn.init.xavier_normal_(param)

    for module in net.modules():
        if isinstance(module, nn.Embedding) and module.padding_idx is not None:
            # nn.init works under no_grad, so writing through this row view is safe.
            nn.init.constant_(module.weight[module.padding_idx], 0.0)


# Xavier-normal weights and zero biases for both networks.
for net in [encoder, decoder]:
    _init_weights(net)

# One Adam optimizer per network, both using the configured learning rate.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=args.lr)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=args.lr)

# NOTE(review): with default arguments NLLLoss averages over every target
# position, including '<pad>'. Once the decoding loop exists, consider
# `ignore_index=trg_padding_idx` -- but check how the loss is reduced first:
# a mean over a step whose targets are all padding would be NaN.
loss_func = nn.NLLLoss()

# Per-phase histories, filled in by the (not yet enabled) training loop.
loss_history = defaultdict(list)
bleu_history = defaultdict(list)

# Planned training loop, not enabled yet: `train_and_val` is not defined in the
# visible notebook, and the loss object above is named `loss_func`, not
# `loss_function`.
# for i in range(args.epochs):
#     train_loss, val_loss, val_bleu = train_and_val(args, encoder, decoder, encoder_optimizer, 
#                                                    decoder_optimizer, loss_function, device, i, 
#                                                    train_data, val_data, trg, encoder_embedding_dict, 
#                                                    decoder_embedding_dict)

In [11]:
def run_batch(phase, args, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_func, batch, device):
    """Run the encoder over a single batch (work in progress).

    Only the encoder forward pass is implemented so far. `args`, `decoder` and
    `loss_func` are accepted but not used yet, and the value returned is a
    placeholder rather than a real loss.

    Args:
        phase: when equal to "train", both optimizers get `zero_grad()` called
            before anything else happens; any other value skips that step.
        args: unused for now.
        encoder: object providing `random_init_hidden(device, batch_size)` and
            callable as `encoder(hidden, tokens, lengths)`, returning a pair.
        decoder: unused for now.
        encoder_optimizer, decoder_optimizer: objects providing `zero_grad()`.
        loss_func: unused for now.
        batch: object whose `trg[0]` has a two-element `shape` and whose
            `src[0]` / `src[1]` are passed to the encoder.
        device: forwarded to `encoder.random_init_hidden`.

    Returns:
        The constant 5 (placeholder).
    """
    if phase == "train":
        for optimizer in (encoder_optimizer, decoder_optimizer):
            optimizer.zero_grad()

    # TODO: it seems that currently batch size is always the same. Make sure to use the last batch
    # The target shape is unpacked as (max target length, batch size); only the second is needed.
    _, n_examples = batch.trg[0].shape

    initial_state = encoder.random_init_hidden(device, n_examples)
    _step_outputs, _final_state = encoder(initial_state, batch.src[0], batch.src[1])

    placeholder_loss = 5
    return placeholder_loss
    
    

In [12]:
# Batching options shared by both iterators. `sort_key` is the key used to
# sort examples so that ones of similar source length are batched together,
# which minimises padding.
shared_iter_options = dict(
    batch_size=args.batch_size,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
    repeat=False,
    device=device,
)

# Training iterator.
train_iter = data.BucketIterator(dataset=train_data, train=True, **shared_iter_options)

# Validation iterator: same batching, but sorted and never shuffled.
val_iter = data.BucketIterator(
    dataset=val_data,
    train=False,
    shuffle=False,
    sort=True,
    **shared_iter_options
)

# Put both networks in training mode.
encoder.train()
decoder.train()

train_losses = []

# Smoke test on one randomly chosen training batch.
# A single iterator is created and advanced between 1 and 20 steps, so `batch`
# is always defined (the previous `range(np.random.randint(20))` could be empty
# and leave `batch` unset) and the iterator is not rebuilt on every step.
train_batches = iter(train_iter)
for _ in range(np.random.randint(20) + 1):
    batch = next(train_batches)
# Full-epoch version, not enabled yet:
#for i, batch in enumerate(iter(train_iter)):

loss = run_batch('train', args, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_func, batch, device)
# Record the result so `train_losses` reflects what was actually run.
train_losses.append(loss)
    

# `batch` is one mini-batch produced by the iterator.
# `batch.src` consists of two tensors.
# The first, `batch.src[0]`, is the contents of the batch: a tensor of shape (max_seq_len, batch_size).
# The sequences have already been converted to vocabulary indices and padded.
# The second, `batch.src[1]`, holds the actual (unpadded) length of each sequence. It is a 1-D tensor of shape (batch_size,).

# data.BucketIterator automatically batches sequences of similar lengths together.
# With `sort_within_batch=True` it also sorts the examples inside each batch by `sort_key` in descending order (longest first).

# Say you have a bidirectional, 2-layer RNN encoder. A single batch has max length 19 and batch size 32. 
# The encoder_outputs will have shape: (19, 32, 512). 
# Basically, it only returns the topmost layer's hidden states at each step of the sequence. 
# And it concatenates both directional outputs (hidden states) for the topmost layer. 

x.shape torch.Size([50, 32])
lengths tensor([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12])
embedded:  torch.Size([50, 32, 256])
after encoder, encoder_outputs:  torch.Size([12, 32, 512])
encoder hidden:  torch.Size([4, 32, 256])


In [8]:
# Draw a single batch from a fresh pass over the training iterator, for interactive inspection.
b = next(iter(train_iter))

In [9]:
# Shape of the target token tensor; `run_batch` unpacks this as (max_trg_seq_len, batch_size).
b.trg[0].shape

torch.Size([50, 32])

In [10]:
# def train_and_val(args, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_function, device, i, 
#                   train_data, val_data, trg, encoder_embedding_dict, decoder_embedding_dict):
    
    
        
    
    