In [None]:
import os
import pdb
import argparse
import pickle as pkl

from collections import defaultdict

import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

from easydict import EasyDict

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

# Local imports
import utils
import data_handler
from data_handler import *
import attn_vis


In [None]:
# Hyperparameters and bookkeeping options shared by the training and evaluation code.
opts = EasyDict(
    n_epochs=100,             # number of passes over the training dictionary
    batch_size=16,            # upper bound on the number of word pairs per mini-batch
    learning_rate=0.002,
    lr_decay=0.99,
    hidden_layer_size=10,     # handed to the encoder and decoder as hidden_size
    model_name="attention_rnn",
)
# Everything a run writes to disk goes into a directory named after the model.
opts.checkpoints_dir = "./checkpoints/" + opts.model_name

# Fixed probes used during training: one sentence to translate after every epoch
# and one word whose attention map is visualised.
TEST_SENTENCE = 'i love deep learning'
TEST_WORD_ATTENTION = "attention"

In [None]:
# Prepare the checkpoint directory before training writes its loss log into it.
# NOTE(review): judging by its name the helper creates the directory only when
# it is missing - confirm against utils.py.
utils.create_dir_if_not_exists(opts.checkpoints_dir)

In [None]:
# Load the corpus through the explicitly imported module rather than the star import.
# idx_dict carries the char<->index mappings and the start/end token indexes
# that the training, evaluation and translation functions read from it.
line_pairs, vocab_size, idx_dict = data_handler.load_data()

In [None]:
# 80/20 train/validation split. The pairs are cut in their existing order
# (no shuffling), so validation is always the final 20% of line_pairs.
num_lines = len(line_pairs)
num_train = int(0.8 * num_lines)
train_pairs = line_pairs[:num_train]
val_pairs = line_pairs[num_train:]

In [None]:
# Build the dictionaries consumed by train_model / evaluate, training split first.
# Each value is iterated as a collection of (source, target) string pairs.
train_dict, val_dict = (create_dict(pairs) for pairs in (train_pairs, val_pairs))

# Study the structure of the created train_dict and val_dict variables

In [None]:
class Encoder(nn.Module):
    """Encoder RNN: embeds the input tokens and unrolls an LSTM cell over them.

    Arguments:
        vocab_size: Number of distinct token indexes the embedding must cover.
        hidden_size: Size of the embeddings and of the LSTM hidden/cell states.
        opts: The options object; stored on the module but not read by it.
    """

    def __init__(self, vocab_size, hidden_size, opts):
        super(Encoder, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.opts = opts

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm_cell = nn.LSTMCell(hidden_size, hidden_size)

    def forward(self, inputs):
        """Forward pass of the encoder RNN.

        Arguments:
            inputs: Input token indexes across a batch for all time steps in the sequence. (batch_size x seq_len)

        Returns:
            annotations: The hidden states computed at each step of the input sequence. (batch_size x seq_len x hidden_size)
            hidden: The final hidden state of the encoder, for each sequence in a batch. (batch_size x hidden_size)
        """

        batch_size, seq_len = inputs.size()

        # Hidden and cell state both start from zeros.
        hidden = self.init_hidden(batch_size)
        cell = self.init_hidden(batch_size)

        encoded = self.embedding(inputs)  # batch_size x seq_len x hidden_size
        annotations = []

        for step in range(seq_len):
            x = encoded[:, step, :]  # Get the current time step, across the whole batch
            hidden, cell = self.lstm_cell(x, (hidden, cell))
            annotations.append(hidden)

        # Stack the per-step hidden states along a new time dimension.
        annotations = torch.stack(annotations, dim=1)
        return annotations, hidden

    def init_hidden(self, bs):
        """Creates a tensor of zeros to represent the initial hidden states
        of a batch of sequences.

        The tensor is allocated with the dtype and device of the module's own
        parameters, so the encoder keeps working after `.to(device)` or
        `.double()` instead of always producing a float32 CPU tensor.

        Arguments:
            bs: The batch size for the initial hidden state.

        Returns:
            hidden: An initial hidden state of all zeros. (batch_size x hidden_size)
        """
        return self.embedding.weight.new_zeros(bs, self.hidden_size)


class Attention(nn.Module):
    """Scores every encoder annotation against the current decoder hidden state.

    A two-layer fully-connected network maps the concatenation of the decoder
    hidden state and one annotation to a scalar score; the scores are then
    normalized with a softmax over the sequence dimension.

    Arguments:
        hidden_size: Size of the decoder hidden state and of each annotation.
    """

    def __init__(self, hidden_size):
        super(Attention, self).__init__()

        self.hidden_size = hidden_size

        # Two layer fully-connected network:
        # [[hidden_size*2 --> hidden_size], [ReLU], [hidden_size --> 1]]
        self.attention_network = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

        # dim=1 is the seq_len dimension of the (batch_size x seq_len x 1) scores.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, hidden, annotations):
        """The forward pass of the attention mechanism.

        Arguments:
            hidden: The current decoder hidden state. (batch_size x hidden_size)
            annotations: The encoder hidden states for each step of the input sequence. (batch_size x seq_len x hidden_size)

        Returns:
            output: Normalized attention weights for each encoder hidden state. (batch_size x seq_len x 1)

            The output is a softmax weighting over the seq_len annotations.
        """

        # Repeat the decoder state along the time axis so it lines up with the annotations.
        expanded_hidden = hidden.unsqueeze(1).expand_as(annotations)

        # batch_size x seq_len x (2 * hidden_size)
        concat = torch.cat((expanded_hidden, annotations), dim=2)

        # nn.Linear acts on the last dimension, so the scoring network can be applied
        # to the 3-D tensor directly and already yields batch_size x seq_len x 1.
        unnormalized_attention = self.attention_network(concat)

        return self.softmax(unnormalized_attention)


class AttentionDecoder(nn.Module):
    """Decoder RNN that attends over the encoder annotations at every step.

    Arguments:
        vocab_size: Number of distinct token indexes (embedding rows / output scores).
        hidden_size: Size of the embeddings and of the LSTM hidden/cell states.
    """

    def __init__(self, vocab_size, hidden_size):
        super(AttentionDecoder, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)

        # The LSTM input is the token embedding concatenated with the attention
        # context, hence hidden_size*2.
        self.lstm_cell = nn.LSTMCell(input_size=hidden_size*2, hidden_size=hidden_size)
        self.attention = Attention(hidden_size=hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h_prev, c_prev, annotations):
        """Forward pass of the attention-based decoder RNN.

        Arguments:
            x: Input token indexes across a batch for a single time step. (batch_size x 1)
            h_prev: The hidden states from the previous step, across a batch. (batch_size x hidden_size)
            c_prev: The LSTM cell states from the previous step, across a batch. (batch_size x hidden_size)
            annotations: The encoder hidden states for each step of the input.
                         sequence. (batch_size x seq_len x hidden_size)

        Returns:
            output: Un-normalized scores for each token in the vocabulary, across a batch. (batch_size x vocab_size)
            h_new: The new hidden states, across a batch. (batch_size x hidden_size)
            c_new: The new LSTM cell states, across a batch. (batch_size x hidden_size)
            attention_weights: The weights applied to the encoder annotations, across a batch. (batch_size x encoder_seq_len x 1)
        """
        embed = self.embedding(x)    # batch_size x 1 x hidden_size
        embed = embed.squeeze(1)     # batch_size x hidden_size

        # Weight every annotation by how relevant it is to the previous decoder state.
        attention_weights = self.attention(h_prev, annotations)        # batch_size x seq_len x 1

        # Context vector: attention-weighted sum of the annotations over the time axis.
        context = torch.sum(attention_weights * annotations, dim=1)    # batch_size x hidden_size

        embed_and_context = torch.cat((embed, context), dim=1)         # batch_size x hidden_size*2
        h_new, c_new = self.lstm_cell(embed_and_context, (h_prev, c_prev))
        output = self.out(h_new)                                       # batch_size x vocab_size
        return output, h_new, c_new, attention_weights

In [None]:
##########################################################################
### Setup: Create Encoder, Decoder Objects ###
##########################################################################
# Both networks share the vocabulary size and the hidden size from opts.
# The encoder is built first, as before, so parameter initialisation order is unchanged.
encoder = Encoder(vocab_size, opts.hidden_layer_size, opts)
decoder = AttentionDecoder(vocab_size, opts.hidden_layer_size)


In [None]:
def train_model(train_dict, val_dict, idx_dict, encoder, decoder, opts):
    """Runs the main training loop; evaluates the model on the val set every epoch.
        * Prints training and val loss each epoch.
        * Prints qualitative translation results each epoch using TEST_SENTENCE
        * Writes "epoch train_loss val_loss" lines to loss_log.txt in opts.checkpoints_dir.
        * Stores a checkpoint whenever the validation loss improves on the best seen so far.
        * Saves an attention map for TEST_WORD_ATTENTION each epoch under train_attns/.

    Arguments:
        train_dict: The training word pairs, organized by source and target lengths.
        val_dict: The validation word pairs, organized by source and target lengths.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model to generate output tokens.
        opts: The input arguments for hyper-parameters and others.
    """

    # Loss function, and one optimizer over both networks so that a single
    # step() updates the encoder and the decoder together.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
                           lr=opts.learning_rate)

    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    char_to_index = idx_dict['char_to_index']

    # Build every tensor on the device the model parameters live on.
    device = next(encoder.parameters()).device

    # The attention maps go into a sub-directory of the checkpoint directory;
    # make sure it exists (a no-op when it is already there).
    attn_dir = os.path.join(opts.checkpoints_dir, 'train_attns')
    os.makedirs(attn_dir, exist_ok=True)

    best_val_loss = float('inf')
    train_losses = []
    val_losses = []

    # The context manager closes the log even when training is interrupted.
    with open(os.path.join(opts.checkpoints_dir, 'loss_log.txt'), 'w') as loss_log:

        for epoch in range(opts.n_epochs):

            # decay the learning rate of the optimizer
            for param_group in optimizer.param_groups:
                param_group['lr'] *= opts.lr_decay

            epoch_losses = []

            for key in train_dict:

                input_strings, target_strings = zip(*train_dict[key])

                # NOTE(review): assumes data_handler.string_to_index_list(s, char_to_index, end_token)
                # returns the list of token indexes for s (with the end token appended) - confirm
                # the signature against data_handler.py.
                input_tensors = [torch.LongTensor(data_handler.string_to_index_list(s, char_to_index, end_token))
                                 for s in input_strings]
                target_tensors = [torch.LongTensor(data_handler.string_to_index_list(s, char_to_index, end_token))
                                  for s in target_strings]

                num_tensors = len(input_tensors)
                num_batches = int(np.ceil(num_tensors / float(opts.batch_size)))

                for batch_idx in range(num_batches):

                    start = batch_idx * opts.batch_size
                    end = start + opts.batch_size

                    # Inputs and targets for THIS batch. Stacking relies on all strings under
                    # one key having the same length (the dict is organized by lengths).
                    inputs = torch.stack(input_tensors[start:end]).to(device)
                    targets = torch.stack(target_tensors[start:end]).to(device)

                    # The last batch of a group may be smaller than opts.batch_size
                    BS = inputs.size(0)

                    encoder_annotations, encoder_hidden = encoder(inputs)

                    # The last hidden state of the encoder becomes the first hidden state of the decoder;
                    # the decoder cell state starts from zeros.
                    decoder_hidden = encoder_hidden
                    decoder_cell = torch.zeros_like(encoder_hidden)

                    # The first decoder input is the start token for every sequence. (BS x 1)
                    decoder_input = torch.full((BS, 1), start_token, dtype=torch.long, device=device)

                    loss = 0.0

                    seq_len = targets.size(1)  # Gets seq_len from BS x seq_len

                    for step in range(seq_len):
                        decoder_output, decoder_hidden, decoder_cell, _ = decoder(decoder_input,
                                                                                  decoder_hidden,
                                                                                  decoder_cell,
                                                                                  encoder_annotations)

                        current_target = targets[:, step]

                        # Cross entropy between the decoder distribution and the ground truth
                        loss = loss + criterion(decoder_output, current_target)

                        # Teacher forcing: the next input is this step's ground-truth token
                        decoder_input = current_target.unsqueeze(1)

                    loss = loss / float(seq_len)
                    epoch_losses.append(loss.item())

                    # Clear stale gradients, compute new ones, then update both networks
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            train_loss = np.mean(epoch_losses)
            val_loss = evaluate(val_dict, encoder, decoder, idx_dict, criterion, opts)

            # Only checkpoint on a genuine improvement, and remember the new best.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                utils.store_checkpoints(encoder, decoder, idx_dict, opts)

            attn_vis.visualize_attention(TEST_WORD_ATTENTION,
                                         encoder,
                                         decoder,
                                         idx_dict,
                                         opts,
                                         save=os.path.join(attn_dir, 'attn-epoch-{}.png'.format(epoch)))
            gen_string = find_pig_latin(TEST_SENTENCE, encoder, decoder, idx_dict, opts)
            print("Epoch: {:3d} | Train loss: {:.3f} | Val loss: {:.3f} | Gen: {:20s}".format(epoch, train_loss, val_loss, gen_string))

            loss_log.write('{} {} {}\n'.format(epoch, train_loss, val_loss))
            loss_log.flush()

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            utils.store_loss_plots(train_losses, val_losses, opts)


In [None]:
def evaluate(data_dict, encoder, decoder, idx_dict, criterion, opts):
    """Evaluates the model on a held-out validation or test set.

    Mirrors the training loop (including feeding the ground-truth token as the
    next decoder input) but runs without gradient tracking and never updates
    the parameters.

    Arguments:
        data_dict: The validation/test word pairs, organized by source and target lengths.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model to generate output tokens.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        criterion: Used to compute the CrossEntropyLoss for each decoder output.
        opts: The command-line arguments.

    Returns:
        mean_loss: The average loss over all batches from data_dict.
    """

    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    char_to_index = idx_dict['char_to_index']

    # Build every tensor on the device the model parameters live on.
    device = next(encoder.parameters()).device

    losses = []

    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for key in data_dict:

            input_strings, target_strings = zip(*data_dict[key])

            # NOTE(review): assumes data_handler.string_to_index_list(s, char_to_index, end_token)
            # returns the list of token indexes for s (with the end token appended) - confirm
            # the signature against data_handler.py.
            input_tensors = [torch.LongTensor(data_handler.string_to_index_list(s, char_to_index, end_token))
                             for s in input_strings]
            target_tensors = [torch.LongTensor(data_handler.string_to_index_list(s, char_to_index, end_token))
                              for s in target_strings]

            num_tensors = len(input_tensors)
            num_batches = int(np.ceil(num_tensors / float(opts.batch_size)))

            for batch_idx in range(num_batches):

                start = batch_idx * opts.batch_size
                end = start + opts.batch_size

                # Inputs and targets for THIS batch. Stacking relies on all strings under
                # one key having the same length (the dict is organized by lengths).
                inputs = torch.stack(input_tensors[start:end]).to(device)
                targets = torch.stack(target_tensors[start:end]).to(device)

                # The last batch of a group may be smaller than opts.batch_size
                BS = inputs.size(0)

                encoder_annotations, encoder_hidden = encoder(inputs)

                # The last hidden state of the encoder becomes the first hidden state of the decoder;
                # the decoder cell state starts from zeros.
                decoder_hidden = encoder_hidden
                decoder_cell = torch.zeros_like(encoder_hidden)

                # The first decoder input is the start token for every sequence. (BS x 1)
                decoder_input = torch.full((BS, 1), start_token, dtype=torch.long, device=device)

                loss = 0.0

                seq_len = targets.size(1)  # Gets seq_len from BS x seq_len

                for step in range(seq_len):
                    decoder_output, decoder_hidden, decoder_cell, _ = decoder(decoder_input,
                                                                              decoder_hidden,
                                                                              decoder_cell,
                                                                              encoder_annotations)

                    current_target = targets[:, step]

                    # Cross entropy between the decoder distribution and the ground truth
                    loss = loss + criterion(decoder_output, current_target)

                    # The next decoder input is this time-step's target
                    decoder_input = current_target.unsqueeze(1)

                loss = loss / float(seq_len)
                losses.append(loss.item())

    mean_loss = np.mean(losses)

    return mean_loss


In [None]:
def find_pig_latin(sentence, encoder, decoder, idx_dict, opts):
    """Translate a whole sentence from English to Pig-Latin, one word at a time.

    The sentence is split on whitespace, every word is passed through
    `translate` independently, and the results are joined with single spaces.
    """
    translated_words = (
        translate(word, encoder, decoder, idx_dict, opts)
        for word in sentence.split()
    )
    return ' '.join(translated_words)


def translate(input_string, encoder, decoder, idx_dict, opts):
    """Translates a given string from English to Pig-Latin.

    Decodes greedily: at each step the highest-scoring character is emitted and
    fed back as the next decoder input, until the end token is produced or
    max_generated_chars characters have been generated.

    Arguments:
        input_string: The word to translate.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model to generate output tokens.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        opts: The options object (not read by this function).

    Returns:
        gen_string: The generated translation, without the end token.
    """

    char_to_index = idx_dict['char_to_index']
    index_to_char = idx_dict['index_to_char']
    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']

    max_generated_chars = 20
    gen_string = ''

    # Build every tensor on the device the model parameters live on.
    device = next(encoder.parameters()).device

    # convert given string to an array of indexes
    # NOTE(review): assumes data_handler.string_to_index_list(s, char_to_index, end_token)
    # returns the list of token indexes for s (with the end token appended) - confirm
    # the signature against data_handler.py.
    indexes = data_handler.string_to_index_list(input_string, char_to_index, end_token)
    indexes = torch.LongTensor(indexes).unsqueeze(0).to(device)  # 1 x seq_len

    # Inference only: no gradients are needed.
    with torch.no_grad():
        encoder_annotations, encoder_last_hidden = encoder(indexes)

        # The last hidden state of the encoder becomes the first hidden state of the decoder;
        # the decoder cell state starts from zeros.
        decoder_hidden = encoder_last_hidden
        decoder_cell = torch.zeros_like(decoder_hidden)

        # The first decoder input is the start token. (1 x 1)
        decoder_input = torch.full((1, 1), start_token, dtype=torch.long, device=device)

        for _ in range(max_generated_chars):
            decoder_output, decoder_hidden, decoder_cell, _attention = decoder(decoder_input,
                                                                               decoder_hidden,
                                                                               decoder_cell,
                                                                               encoder_annotations)

            # Most probable character; softmax is monotonic, so argmax of the raw scores suffices.
            ni = decoder_output.argmax(dim=1)  # shape (1,)

            if ni.item() == end_token:
                break

            gen_string += index_to_char[ni.item()]

            # The prediction becomes the decoder input at the next time step. (1 x 1)
            decoder_input = ni.unsqueeze(1)

    return gen_string

In [None]:
# Kick off training. Ctrl-C / "interrupt kernel" stops the run with a short
# message instead of a traceback.
try:
    train_model(
        train_dict=train_dict,
        val_dict=val_dict,
        idx_dict=idx_dict,
        encoder=encoder,
        decoder=decoder,
        opts=opts,
    )
except KeyboardInterrupt:
    print('Exiting early from training.')