In [1]:
# Experiment 9: actually use the LR scheduler by calling scheduler.step() at the end of every epoch.
# The scheduler is driven by the validation BLEU score instead of the validation loss.

In [2]:
import loader
import argparse
import rnn_models
# from beam_search import *
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch
from torchtext import data
from collections import defaultdict
import numpy as np
import pdb
import sacrebleu
from torch.optim.lr_scheduler import ReduceLROnPlateau
import apex
from apex import amp
from apex.fp16_utils import FP16_Optimizer

from torchtext import data
from torchtext import datasets

import io
import os
import string
import time

In [3]:
# Declares command-line style options for the data pipeline.
# NOTE(review): no parse_args() call is visible in this notebook; the cells below take
# their settings from the Args class instead, whose min_freq (1) differs from the
# default declared here (3) -- confirm which value is intended.
parser = argparse.ArgumentParser(description='Testing')
parser.add_argument("--max_sentence_length", help="maximum sentence length", type=int, default=50)
parser.add_argument("--min_freq", help="filter out tokens less than min frequency", type=int, default=3)
parser.add_argument("--max_vocab_size", help="at most n tokens in vocabulary", type=int, default=100000)

_StoreAction(option_strings=['--max_vocab_size'], dest='max_vocab_size', nargs=None, const=None, default=100000, type=<class 'int'>, choices=None, help='at most n tokens in vocabulary', metavar=None)

In [4]:
# Experiment identifier; it names the log file below and the checkpoint files written by the training loop.
EXPERIMENT_NO = 9 
# Text log that train() and val() append their progress messages to.
OUTPUT_FILE = 'output_{}.txt'.format(EXPERIMENT_NO)

In [5]:
class Args():
    """Static configuration for this experiment: data files, vocabulary limits, model and training settings."""

    # ---- Data files ----
    # Absolute, cluster-specific directory that is handed to loader.load_data via this object.
    data = '/scratch/vr1059/vi-en/'
    # Split prefixes and per-language file extensions (presumably joined by the loader -- TODO confirm).
    train_prefix, val_prefix, test_prefix = 'train', 'dev', 'test'
    src_ext, trg_ext = '.tok.vi', '.tok.en'

    # ---- Vocabulary / filtering ----
    max_sentence_length = 50    # maximum sentence length
    min_freq = 1                # filter out tokens rarer than this frequency
    max_vocab_size = 100000     # at most this many tokens in a vocabulary

    # ---- Model ----
    # Presumably read by rnn_models.Encoder / LuongAttnDecoderRNN, which receive this object.
    hidden_size = embedding_size = 500
    bidirectional = True
    num_encoder_layers = num_decoder_layers = 2
    attn_model = 'general'

    # ---- Optimisation ----
    lr = 5e-4           # learning rate given to both Adam optimizers
    epochs = 12         # number of passes of the training loop
    batch_size = 64     # minibatch size for the train and validation iterators
    print_every = 1000  # log the training loss every this many batches
    clip = 0.1          # max gradient norm passed to clip_grad_norm_
    
# Build the configuration object and pick the GPU when PyTorch can see one, otherwise the CPU.
args = Args()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [6]:
# Quick sanity check: confirms the Args instance exposes `hidden_size`.
hasattr(args, 'hidden_size')

True

In [7]:
# Load the three dataset splits plus the source/target fields through the project-local loader.
# `src` and `trg` are used further down as objects exposing `.vocab.stoi` / `.vocab.itos`.
train_data, val_data, test_data, src, trg = loader.load_data(args)

most common source vocabs: [(',', 128638), ('.', 120849), ('là', 51451), ('và', 47993), ('một', 40378), ('tôi', 38381), ('những', 37809), ('của', 36330), ('có', 26166), ('bạn', 26111)]
source vocab size: 42152
most common english vocabs: [(',', 156165), ('.', 132505), ('the', 109723), ('and', 79673), ('to', 65979), ('of', 60510), ('a', 55374), ('that', 49320), ('i', 43629), ('in', 41318)]
english vocab size: 47859


In [8]:
# Number of examples in the training and validation splits.
print(len(train_data))
print(len(val_data))

133166
1268


In [9]:
def train_batch(phase, args, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_func, batch, device):
    """Run one teacher-forced optimisation step on a single minibatch.

    Args:
        phase: Unused; kept so existing callers keep working.
        args: Config object; only ``args.clip`` (max gradient norm) is read here.
        encoder: Module exposing ``random_init_hidden(device, batch_size)`` and
            returning ``(encoder_outputs, hidden, encoder_padding_mask)`` when called.
        decoder: Module exposing ``n_layers`` and returning ``(logits, _, hidden)``
            when called with ``(hidden, input_ids, encoder_outputs, mask)``.
        encoder_optimizer: Optimizer stepping the encoder parameters.
        decoder_optimizer: Optimizer stepping the decoder parameters.
        loss_func: Criterion applied to log-probabilities of shape (1, vocab) and a
            target of shape (1,), e.g. ``nn.NLLLoss()``.
        batch: Object whose ``src`` and ``trg`` are ``(token_ids, lengths)`` pairs;
            ``trg[0]`` is indexed as (seq_len, batch_size) with the start token in row 0.
        device: Device handed to ``encoder.random_init_hidden``.

    Returns:
        float: The summed per-token loss divided by the number of scored tokens, or
        ``0.0`` (without touching the parameters) when the batch has no target step
        to predict.

    Note:
        Reads the module-level ``EOS_IDX``. Gradients are taken of the *summed* loss;
        only the reported value is averaged.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    targets = batch.trg[0]
    seq_len, batch_size = targets.shape

    ###########
    # Encoder #
    ###########

    hidden = encoder.random_init_hidden(device, batch_size)
    encoder_outputs, hidden, encoder_padding_mask = encoder(hidden, batch.src[0], batch.src[1])

    ###########
    # Decoder #
    ###########

    # Teacher forcing is always on: the gold token of step t is the input of step t + 1.
    # Row 0 of the targets is the batch-sized list of SOS tokens.
    decoder_input = targets[0, :]
    finished = [False] * batch_size

    step = 0
    loss = 0
    scored_tokens = 0

    # The decoder is seeded with the first `n_layers` slices of the encoder state.
    # TODO: verify these are the intended (forward-direction) layers.
    hidden = hidden[:decoder.n_layers]

    while step + 1 < seq_len and sum(finished) < batch_size:
        logits, _, hidden = decoder(hidden, decoder_input, encoder_outputs, encoder_padding_mask)
        log_probs = F.log_softmax(logits.unsqueeze(0), dim=2)

        gold = targets[step + 1, :]
        decoder_input = gold
        # One host transfer per step instead of one tensor comparison per sequence.
        hit_eos = (gold == EOS_IDX).tolist()

        # Sequences stop contributing to the loss once their EOS has been scored.
        for j in range(batch_size):
            if finished[j]:
                continue
            loss += loss_func(log_probs[0, j, :].view(1, -1), gold[j].view(1))
            scored_tokens += 1
            if hit_eos[j]:
                finished[j] = True

        step += 1

    # Nothing was scored (e.g. targets with a single row): `loss` is still the int 0,
    # so there is nothing to back-propagate and no average to compute.
    if scored_tokens == 0:
        return 0.0

    # calculate gradients on each parameter
    loss.backward()

    # clip if too large
    nn.utils.clip_grad_norm_(encoder.parameters(), args.clip)
    nn.utils.clip_grad_norm_(decoder.parameters(), args.clip)

    # take gradient step
    encoder_optimizer.step()
    decoder_optimizer.step()

    # report avg loss over minibatch
    return loss.item() / scored_tokens


#               #
# Loss function #
#               #

# loss += loss_func(output[0, j, :].view(1, -1), batch.trg[0][i+1, j].view(1))
                
# The way NLLLoss is set up, the target is simply the index of the class you want to predict,
# and the input must be log-probabilities over the entire output vocabulary (hence log_softmax above, not softmax).
# The loss for one example is the negative log-probability the model assigns to the target index,
# i.e. the cross-entropy against the one-hot vector e_target_idx (zeros everywhere except a 1 at the target index).
    

In [10]:
def train(args, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_func, device, epoch_idx,
          train_data, val_data, trg):
    """Train the encoder/decoder for one epoch and return the mean per-batch loss.

    Args:
        args: Config object; ``batch_size`` and ``print_every`` are read here and the
            object is forwarded to ``train_batch``.
        encoder, decoder: Modules to train (switched to training mode here).
        encoder_optimizer, decoder_optimizer: Optimizers forwarded to ``train_batch``.
        loss_func: Criterion forwarded to ``train_batch``.
        device: Device on which the iterator places the batches.
        epoch_idx: Epoch number, used only in the log messages.
        train_data: Dataset to iterate over.
        val_data: Unused; kept so existing callers keep working.
        trg: Unused; kept so existing callers keep working.

    Returns:
        The mean of the per-batch average losses, or NaN when the iterator yields no batch.

    Side effects:
        Prints progress and appends the same messages to ``OUTPUT_FILE``.
    """
    # Create batches with pre-sorted, similar-length sequences in each
    train_iter = data.BucketIterator(
        dataset=train_data,
        batch_size=args.batch_size,
        repeat=False,
        sort_key=lambda x: len(x.src),
        sort_within_batch=True,
        device=device,
        train=True
    )

    # Set training flag
    encoder.train()
    decoder.train()

    train_losses = []
    # Tracked explicitly so the summary below still works when no batch was produced.
    last_batch_size = 0
    for i, batch in enumerate(train_iter):
        avg_loss = train_batch('train', args, encoder, decoder, encoder_optimizer,
                               decoder_optimizer, loss_func, batch, device)
        train_losses.append(avg_loss)
        last_batch_size = batch.src[0].shape[1]

        if args.print_every and i % args.print_every == 0:
            progress = "train, epoch: {}, batch number: {}, batch loss: {}".format(
                epoch_idx, i, avg_loss)
            print(progress)
            with open(OUTPUT_FILE, 'a') as f:
                f.write("\n" + progress)

    # Compute the epoch mean once; avoid np.mean([]) and its RuntimeWarning on an empty epoch.
    epoch_loss = np.mean(train_losses) if train_losses else float('nan')

    summary = "epoch: {}, average loss for epoch: {}, size of last batch {}".format(
        epoch_idx, epoch_loss, last_batch_size)
    print(summary)
    with open(OUTPUT_FILE, 'a') as f:
        f.write("\n" + summary)

    return epoch_loss

In [11]:
def calculate_bleu(predictions, labels):
    """Corpus-level BLEU between hypotheses and a single set of references.

    The literal markers ``'SOS '`` and ``' EOS'`` are removed from every sentence
    before scoring. The inputs themselves are left untouched (copies are cleaned),
    so callers can keep using their lists afterwards.

    Args:
        predictions: List of hypothesis strings; a single string is treated as a
            one-sentence corpus.
        labels: List of reference strings aligned with ``predictions``; a single
            string is treated as a one-sentence corpus.

    Returns:
        The ``.score`` of ``sacrebleu.raw_corpus_bleu`` called with 0.01 as its third
        argument (the smoothing value -- TODO confirm against the installed sacrebleu).
    """
    def _strip_markers(sentences):
        # A bare string would otherwise be iterated character by character.
        if isinstance(sentences, str):
            sentences = [sentences]
        return [sentence.replace('SOS ', '').replace(' EOS', '') for sentence in sentences]

    # n_gram = 4
    hypotheses = _strip_markers(predictions)
    references = _strip_markers(labels)

    return sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score

In [12]:
# Smoke test: an identical hypothesis and reference should score (approximately) 100.
calculate_bleu([' i am rich . '], [' i am rich . '])
# calculate_bleu(['SOS burma knows what i love .'], ['remi knows what love is . EOS'])

100.00000000000004

In [13]:
def beam_search(decoder, decoder_input, encoder_outputs, hidden, max_length, k, trg, mask = None):
    """Decode one sentence with beam search and return the best-scoring token sequence.

    Args:
        decoder: Callable exposing ``n_layers`` and returning ``(logits, _, hidden)``
            when called with ``(hidden, last_token, encoder_outputs, mask)``.
        decoder_input: Tensor holding the start token; new tokens are concatenated
            to it along dim 0 as (1, 1) tensors.
        encoder_outputs: Encoder outputs for this single sentence.
        hidden: Initial decoder state; only its first ``decoder.n_layers`` slices are used.
        max_length: Maximum number of expansion rounds.
        k: Beam width. Every hypothesis that ends in EOS permanently consumes one slot.
        trg: Unused; kept so existing callers keep working.
        mask: Optional encoder padding mask forwarded to the decoder.

    Returns:
        The token-id tensor (start token included) of the hypothesis with the highest
        summed log-probability among finished hypotheses and those still on the beam.
        Scores are not length-normalised.

    Note:
        Reads the module-level ``EOS_IDX``.
    """
    candidates = [(decoder_input, 0, hidden)]
    completed_translations = []

    # put a cap on sentence length
    for _ in range(max_length):
        # Every slot has been used up by a finished hypothesis: further rounds
        # would only loop over an empty beam.
        if not candidates:
            break

        potential_candidates = []
        for c_sequence, c_score, c_hidden in candidates:
            # A hypothesis ending in EOS is finished and frees no slot for others.
            if c_sequence[-1] == EOS_IDX:
                completed_translations.append((c_sequence, c_score))
                k = k - 1
                continue

            logits, _, next_hidden = decoder(
                c_hidden.contiguous()[:decoder.n_layers],
                c_sequence.contiguous()[-1].unsqueeze(0),
                encoder_outputs,
                mask,
            )
            next_word_probs = F.log_softmax(logits, dim = 1)

            # In the worst case one sequence owns the k best continuations, so taking
            # the top k of each hypothesis is enough.
            top_probs, top_idx = torch.topk(next_word_probs, k)
            # Flatten (1, k) to (k,) so that k == 1 needs no special handling.
            top_probs = top_probs.reshape(-1)
            top_idx = top_idx.reshape(-1)

            for log_prob, token_id in zip(top_probs, top_idx):
                # Keep the new token on the same device as the sequence it extends.
                word = token_id.reshape(1, 1).to(c_sequence.device)
                potential_candidates.append(
                    (torch.cat((c_sequence, word)), c_score + log_prob, next_hidden))

        candidates = sorted(potential_candidates, key=lambda x: x[1], reverse=True)[0:k]

    # Hypotheses still on the beam when the length cap is hit compete with finished ones.
    completed = completed_translations + candidates
    best = sorted(completed, key=lambda x: x[1], reverse=True)[0]
    return best[0]

In [14]:
def ids_to_words(ids, trg):
    """Turn a sequence of token-id tensors into one space-separated string via ``trg.vocab.itos``."""
    tokens = [trg.vocab.itos[token_id.squeeze().item()] for token_id in ids]
    return ' '.join(tokens).strip()

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group (None when it has none)."""
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

In [None]:
# Chaitra's advice on choosing hyperparameters for a model:
# take a small subset of the dataset (around 100, 200 or 500 examples)
# and see which hyperparameter combination overfits that subset best/fastest,
# then use that combination to train the model.

In [None]:
def val_batch(args, encoder, decoder, batch, trg, device):
    """Translate every sentence of one validation batch with beam search.

    Args:
        args: Unused; kept so existing callers keep working.
        encoder: Module exposing ``random_init_hidden(device, batch_size)`` and
            returning ``(encoder_outputs, hidden, padding_mask)`` when called.
        decoder: Decoder forwarded to ``beam_search``.
        batch: Object whose ``src`` and ``trg`` are ``(token_ids, lengths)`` pairs.
        trg: Target field; ``trg.vocab.stoi`` supplies the start token and
            ``trg.vocab.itos`` is used (via ``ids_to_words``) to render text.
        device: Device for the start-token tensor and the initial encoder state.

    Returns:
        ``(translations, trg_translations)``: two lists of strings, one entry per
        sentence in the batch -- the decoded hypotheses and the reference sentences
        cut to their true lengths.
    """
    encoder.eval()
    decoder.eval()

    max_length = 30  # cap on beam-search expansion rounds
    k = 10           # beam width

    # The decoder consumes target-vocabulary ids, so the start token has to be
    # looked up in the target field, not in the source vocabulary.
    sos_idx = trg.vocab.stoi['SOS']

    translations = []
    trg_translations = []

    # Inference only: no autograd graph is needed for encoding or beam search.
    with torch.no_grad():

        ############
        #  encode  #
        ############

        _, batch_size = batch.trg[0].shape
        hidden = encoder.random_init_hidden(device, batch_size)
        # NOTE(review): the padding mask returned by the encoder is dropped here, so
        # beam_search runs with mask=None whereas train_batch passes the mask to the
        # decoder -- confirm this difference is intended.
        encoder_outputs, hidden, _ = encoder(hidden, batch.src[0], batch.src[1])

        #################
        #  beam search  #
        #################

        for i in range(batch_size):
            # Decode sentence i on its own: slice it out but keep a batch dim of 1.
            decoder_input = torch.tensor([[sos_idx]], device=device)
            decoder_hidden = hidden[:, i, :].unsqueeze(1)
            encoder_outputs_i = encoder_outputs[:, i, :].unsqueeze(1)

            seq_of_ids = beam_search(decoder, decoder_input, encoder_outputs_i, decoder_hidden, max_length, k, trg)
            translations.append(ids_to_words(seq_of_ids, trg))
            # Reference text: the first `length` target ids of column i (padding dropped).
            trg_translations.append(ids_to_words(batch.trg[0][:batch.trg[1][i], i], trg))

    return translations, trg_translations

In [None]:
def val(args, encoder, decoder, device, epoch_idx, val_data, trg):
    """Translate the validation set, score it with BLEU, log the result and return the score.

    Args:
        args: Config object; ``batch_size`` is read here and the object is
            forwarded to ``val_batch``.
        encoder, decoder: Models forwarded to ``val_batch``.
        device: Device on which the iterator places the batches.
        epoch_idx: Epoch number; sample translations are written to the log file
            only when it is a multiple of 3.
        val_data: Dataset to translate.
        trg: Target field forwarded to ``val_batch``.

    Returns:
        The corpus BLEU score returned by ``calculate_bleu``.

    Side effects:
        Prints timing, the first ten hypotheses/references and the score, and
        appends the score (plus samples on every third epoch) to ``OUTPUT_FILE``.
    """
    # Create minibatches over validation data
    val_iter = data.BucketIterator(
        dataset=val_data,
        batch_size=args.batch_size,
        train=False,
        shuffle=False,
        # A key to use for sorting examples in order to batch together
        # examples with similar lengths and minimize padding.
        sort=True,
        sort_key=lambda x: len(x.src),
        repeat=False,
        sort_within_batch=True,
        device=device
    )

    all_predicted = []
    all_trg = []
    times = []
    for batch in val_iter:
        start = time.time()
        predicted_trans, trg_trans = val_batch(args, encoder, decoder, batch, trg, device)
        times.append(time.time() - start)
        all_predicted += predicted_trans
        all_trg += trg_trans

    print("Average val_batch time", np.mean(times))
    print(all_predicted[:10])
    print(all_trg[:10])

    bleu = calculate_bleu(all_predicted, all_trg)
    print(bleu)

    with open(OUTPUT_FILE, 'a') as f:
        f.write("\nbleu score: {}\n".format(bleu))
        # Keep the log short: sample translations only on every third epoch.
        if epoch_idx % 3 == 0:
            f.write(str(all_predicted[:10]) + '\n')
            f.write(str(all_trg[:10]) + '\n')

    return bleu
    
    
#                    #
# Batch & Dimensions #
#                    #
# `batch` represents a batch of examples. 
# `batch.src` consists of two tensors. 
# The first, `batch.src[0]`, holds the `src` examples of the batch; it is a tensor of shape (max_seq_len, batch_size). 
# The sequences have already been indexed (numericalized) and padded. 
# The second, `batch.src[1]`, holds the actual (unpadded) length of each sequence. It is of shape (batch_size, 1). 

# data.BucketIterator automatically batches sequences of similar lengths together. 
# it also automatically sorts in reverse order. 

# Say you have a bidirectional, 2-layer RNN encoder. A single batch has max length 19 and batch size 32. 
# The encoder_outputs will have shape: (19, 32, 512). 
# Basically, it only returns the topmost layer's hidden states at each step of the sequence. 
# And it concatenates both directional outputs (hidden states) for the topmost layer. 

In [None]:
# Special-token indices. EOS_IDX is read as a global by train_batch() and beam_search().
src_padding_idx = src.vocab.stoi['<pad>']
trg_padding_idx = trg.vocab.stoi['<pad>']
EOS_IDX = trg.vocab.stoi['EOS']

encoder = rnn_models.Encoder(args, src_padding_idx, len(src.vocab)).to(device)
decoder = rnn_models.LuongAttnDecoderRNN(args, trg_padding_idx, len(trg.vocab)).to(device)

# Initialise weights with Xavier-normal and biases with zero.
# (The paper used a Gaussian with 0 mean and 0.01 std; Xavier is used here instead.)
for net in [encoder, decoder]:
    for name, param in net.named_parameters():
        if 'bias' in name:
            nn.init.constant_(param, 0.0)
        elif 'weight' in name:
            nn.init.xavier_normal_(param)

# Checkpoints are read from and written to the same per-experiment files.
encoder_ckpt = 'vi-en_encoder_exp_{}.pth'.format(EXPERIMENT_NO)
decoder_ckpt = 'vi-en_decoder_exp_{}.pth'.format(EXPERIMENT_NO)

# Resume from saved models when both files exist; otherwise keep the fresh
# initialisation above so the notebook also runs on a clean checkout.
# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
if os.path.exists(encoder_ckpt) and os.path.exists(decoder_ckpt):
    encoder.load_state_dict(torch.load(encoder_ckpt, map_location=device))
    decoder.load_state_dict(torch.load(decoder_ckpt, map_location=device))
    print("Resumed from {} and {}".format(encoder_ckpt, decoder_ckpt))
else:
    print("No saved models found for experiment {}; training from scratch".format(EXPERIMENT_NO))

### Apex Nvidia fp16 stuff
# encoder_optimizer = FP16_Optimizer(optim.Adam(encoder.parameters(), lr=args.lr))
# decoder_optimizer = FP16_Optimizer(optim.Adam(decoder.parameters(), lr=args.lr))

# enc_scheduler = ReduceLROnPlateau(encoder_optimizer.optimizer, min_lr=1e-10,factor = 0.5,  patience=0)
# dec_scheduler = ReduceLROnPlateau(decoder_optimizer.optimizer, min_lr=1e-10,factor = 0.5,  patience=0)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=args.lr)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=args.lr)

# BLEU is a "higher is better" metric, so the schedulers run in 'max' mode and are
# stepped with the raw score. Feeding -bleu to the default 'min' mode is not
# equivalent: with the default relative threshold the tolerance band falls on the
# wrong side for negative values, so a slightly lower BLEU is counted as an improvement.
enc_scheduler = ReduceLROnPlateau(encoder_optimizer, mode='max', min_lr=1e-10, factor=0.5, patience=0)
dec_scheduler = ReduceLROnPlateau(decoder_optimizer, mode='max', min_lr=1e-10, factor=0.5, patience=0)

### Apex Nvidia fp16 stuff
# [encoder, decoder], [encoder_optimizer, decoder_optimizer] = amp.initialize(
#     [encoder, decoder], 
#     [encoder_optimizer, decoder_optimizer]
# )

loss_func = nn.NLLLoss()

loss_history = []
bleu_history = []

for epoch in range(args.epochs):
    avg_epoch_loss = train(args, encoder, decoder, encoder_optimizer,
                           decoder_optimizer, loss_func, device, epoch,
                           train_data, val_data, trg)
    loss_history.append(avg_epoch_loss)

    bleu = val(args, encoder, decoder, device, epoch, val_data, trg)
    bleu_history.append(bleu)

    # Halve the learning rate whenever validation BLEU fails to improve (patience=0).
    enc_scheduler.step(bleu)
    dec_scheduler.step(bleu)

    encoder_lr = get_lr(encoder_optimizer)
    decoder_lr = get_lr(decoder_optimizer)
    with open(OUTPUT_FILE, 'a') as f:
        f.write("\nLearning rate of encoder_optimizer: {}".format(encoder_lr))
        f.write("\nLearning rate of decoder_optimizer: {}".format(decoder_lr))
    print("Learning rate of encoder_optimizer: ", encoder_lr)
    print("Learning rate of decoder_optimizer: ", decoder_lr)

    # Overwrite the checkpoints after every epoch.
    torch.save(encoder.state_dict(), encoder_ckpt)
    torch.save(decoder.state_dict(), decoder_ckpt)
    

train, epoch: 0, batch number: 0, batch loss: 1.1633004730325998


In [21]:
# Pass the hypothesis as a one-element list, matching calculate_bleu's list-of-strings contract.
calculate_bleu(['this is the kitchen.'], ['this is a family portrait.'])

4.184379521845803

In [24]:
# Pass the hypothesis as a one-element list, matching calculate_bleu's list-of-strings contract.
calculate_bleu(['SOS i was thrilled . EOS'], ['SOS and i was distraught . EOS'])

3.455747170954951

In [26]:
# Same sentence pair scored twice: tokenised with SOS/EOS markers, then as raw text.
# Hypotheses are passed as one-element lists, matching calculate_bleu's list-of-strings contract.
print(calculate_bleu(["SOS you 've got a perfect memory . EOS"], ['SOS he has the most <unk> memory . EOS']))
print(calculate_bleu(["you've got a perfect memory."], ['he has the most <unk> memory.']))

7.171781507224268
0.782250396784411


In [None]:
# Scratch list of sample sentences. NOTE(review): not referenced by any other visible cell.
boom = ["i am a foon", "boondawg in the moon.", "Yippe."]

In [None]:
%%capture cap --no-stderr
# Runs func() with stdout captured under the name `cap`, then appends captured text to output.txt
# (a literal file name, not OUTPUT_FILE).
# NOTE(review): func() is defined in the cell below, so on a fresh kernel that cell has to run first.
# NOTE(review): %%capture appears to bind `cap` only after this cell finishes, so `cap.stdout` here
# would refer to an earlier capture (or be undefined) -- the write probably belongs in a following cell; confirm.
func()
with open('output.txt', 'a') as f:
    f.write(cap.stdout)

In [None]:
def func():
    """Print the integers 0 through 9, sleeping three seconds after each one."""
    counter = 0
    while counter < 10:
        print(counter)
        time.sleep(3)
        counter += 1