In [4]:
import os
import pdb
import argparse
import pickle as pkl

from collections import defaultdict

import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt

from easydict import EasyDict

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Local imports
import utils
import data_handler
from data_handler import *
from torch.distributions import Categorical

In [6]:
# Run configuration: every tunable value used by training, evaluation and decoding
# lives on this single EasyDict so it can be passed around as `opts`.
opts = EasyDict(
    n_epochs=50,             # number of passes over the training data
    batch_size=16,           # maximum number of word pairs per batch
    learning_rate=0.005,
    lr_decay=0.99,
    hidden_layer_size=100,   # hidden width handed to Encoder and Decoder
    model_name="simple_rnn",
)
# Checkpoints and the loss log are written below ./checkpoints/<model_name>
opts.checkpoints_dir = "./checkpoints/" + opts.model_name
opts.temp = 0.4

# Sentence translated after every epoch as a qualitative progress check
TEST_SENTENCE = 'i love deep learning'

In [7]:
# Create the checkpoint directory up front: train_model opens loss_log.txt inside it.
utils.create_dir_if_not_exists(opts.checkpoints_dir)

In [8]:
# NOTE(review): load_data is not imported by name in this notebook; presumably it is
# provided by `from data_handler import *` — confirm.
# idx_dict is indexed later with 'char_to_index', 'index_to_char', 'start_token' and 'end_token'.
line_pairs, vocab_size, idx_dict = load_data()

In [9]:
# 80/20 split in file order: the leading 80% of the pairs train the model,
# the remaining tail is held out for validation.
num_lines = len(line_pairs)
num_train = int(num_lines * 0.8)
train_pairs = line_pairs[:num_train]
val_pairs = line_pairs[num_train:]

In [None]:
# Regroup the raw pairs into dictionaries. The training and evaluation code iterates
# over the keys and unpacks each value with zip(*...) into (input strings, target strings).
train_dict = create_dict(train_pairs)
val_dict = create_dict(val_pairs)

# Study the structure of the created train_dict and val_dict variables
# (e.g. inspect a few keys and the list stored under each of them).

In [None]:
# Hand-written single-step LSTM cell (used instead of the built-in nn.LSTMCell).

class LSTMCell(nn.Module):
    """One time step of an LSTM.

    Each gate has its own linear layer applied to the concatenation ``[x, h_prev]``:

        f      = sigmoid(W_f [x, h_prev] + b_f)     forget gate
        i      = sigmoid(W_i [x, h_prev] + b_i)     input gate
        o      = sigmoid(W_o [x, h_prev] + b_o)     output gate
        c_dash = tanh(W_c [x, h_prev] + b_c)        candidate cell state
        c_new  = f * c_prev + i * c_dash
        h_new  = o * tanh(c_new)

    The class derives from nn.Module so that the gate weights are registered as
    parameters and are picked up by ``parameters()`` / the optimizer.
    """

    def __init__(self, input_size, hidden_size):
        """
        Arguments:
            input_size: Number of features in each input vector x.
            hidden_size: Number of features in the hidden and cell states.
        """
        super(LSTMCell, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # Every gate sees the input and the previous hidden state side by side.
        combined_size = input_size + hidden_size
        self.forget_gate = nn.Linear(combined_size, hidden_size)
        self.input_gate = nn.Linear(combined_size, hidden_size)
        self.output_gate = nn.Linear(combined_size, hidden_size)
        self.candidate = nn.Linear(combined_size, hidden_size)

    def forward(self, x, h_prev, c_prev):
        """Forward pass of the LSTM computation for one time step.

        Arguments
            x: batch_size x input_size
            h_prev: batch_size x hidden_size
            c_prev: batch_size x hidden_size
        Returns:
            h_new: batch_size x hidden_size
            c_new: batch_size x hidden_size
        """
        combined = torch.cat((x, h_prev), dim=1)

        f = torch.sigmoid(self.forget_gate(combined))
        i = torch.sigmoid(self.input_gate(combined))
        o = torch.sigmoid(self.output_gate(combined))
        c_dash = torch.tanh(self.candidate(combined))

        # Keep a gated share of the old memory and add a gated share of the candidate.
        c_new = f * c_prev + i * c_dash
        h_new = o * torch.tanh(c_new)

        return h_new, c_new
    

class Encoder(nn.Module):
    """Character-level encoder: embeds each input token and runs an LSTMCell over the sequence."""

    def __init__(self, vocab_size, hidden_size):
        """
        Arguments:
            vocab_size: Number of distinct token indexes the embedding must cover.
            hidden_size: Width of the embeddings and of the LSTM hidden/cell states.
        """
        super(Encoder, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # Embeddings have the same width as the hidden state, so the cell is hidden -> hidden.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = LSTMCell(hidden_size, hidden_size)

    def forward(self, inputs):
        """Forward pass of the encoder RNN.

        Arguments:
            inputs: Input token indexes across a batch for all time steps in the sequence. (batch_size x seq_len)

        Returns:
            annotations: The hidden states computed at each step of the input sequence. (batch_size x seq_len x hidden_size)
            hidden: The final hidden state of the encoder, for each sequence in a batch. (batch_size x hidden_size)
        """

        batch_size, seq_len = inputs.size()
        hidden = self.init_hidden(batch_size)
        cell = self.init_hidden(batch_size)

        # batch_size x seq_len x hidden_size
        encoded = self.embedding(inputs)

        step_states = []
        for t in range(seq_len):
            # .forward is called explicitly so this works with any cell object that
            # exposes forward(x, h_prev, c_prev) -> (h_new, c_new).
            hidden, cell = self.lstm.forward(encoded[:, t, :], hidden, cell)
            step_states.append(hidden)

        if step_states:
            annotations = torch.stack(step_states, dim=1)
        else:
            # torch.stack rejects an empty list; return an empty time axis instead.
            annotations = hidden.new_zeros(batch_size, 0, self.hidden_size)

        return annotations, hidden

    def init_hidden(self, bs):
        """Creates a tensor of zeros to represent the initial hidden states
        of a batch of sequences.

        Arguments:
            bs: The batch size for the initial hidden state.

        Returns:
            hidden: An initial hidden state of all zeros. (batch_size x hidden_size)
        """
        # Allocate next to the embedding weights so the state follows the module
        # if it is moved to another device.
        return torch.zeros(bs, self.hidden_size, device=self.embedding.weight.device)


# Decoder RNN built from the hand-written LSTMCell:
# character embedding -> one LSTM step -> linear projection onto the vocabulary.
# No softmax is applied inside the module; forward returns un-normalized scores,
# which is the input nn.CrossEntropyLoss expects.

class Decoder(nn.Module):
    """Single-step character decoder producing vocabulary scores for the next token."""

    def __init__(self, vocab_size, hidden_size):
        """
        Arguments:
            vocab_size: Number of distinct token indexes (embedding rows and output scores).
            hidden_size: Width of the embeddings and of the LSTM hidden/cell states.
        """
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = LSTMCell(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h_prev, c_prev):
        """Forward pass of the decoder RNN.

        Arguments:
            x: Input token indexes across a batch for a single time step. (batch_size x 1)
            h_prev: The hidden states from the previous step, across a batch. (batch_size x hidden_size)
            c_prev: The cell states from the previous step, across a batch. (batch_size x hidden_size)

        Returns:
            output: Un-normalized scores for each token in the vocabulary, across a batch. (batch_size x vocab_size)
            h_new: The new hidden states, across a batch. (batch_size x hidden_size)
            c_new: The new cell states, across a batch. (batch_size x hidden_size)
        """
        # Flatten (batch_size x 1) to (batch_size,) so the embedding lookup yields
        # batch_size x hidden_size; reshape also copes with non-contiguous slices.
        embedded = self.embedding(x.reshape(-1))

        # .forward is called explicitly so this works with any cell object that
        # exposes forward(x, h_prev, c_prev) -> (h_new, c_new).
        h_new, c_new = self.lstm.forward(embedded, h_prev, c_prev)

        output = self.out(h_new)

        return output, h_new, c_new

In [None]:
##########################################################################
### Setup: Create Encoder, Decoder Objects ###
##########################################################################
# Both networks cover the same vocabulary and use the same hidden width
# (opts.hidden_layer_size).
encoder = Encoder(vocab_size=vocab_size, hidden_size=opts.hidden_layer_size)
decoder = Decoder(vocab_size=vocab_size, hidden_size=opts.hidden_layer_size)


In [None]:
def train_model(train_dict, val_dict, idx_dict, encoder, decoder, opts):
    """Runs the main training loop; evaluates the model on the val set every epoch.
        * Prints training and val loss each epoch.
        * Prints qualitative translation results each epoch using TEST_SENTENCE
        * Appends "<epoch> <train loss> <val loss>" to loss_log.txt in opts.checkpoints_dir.
        * Stores a checkpoint whenever the validation loss improves on the best seen so far.

    Arguments:
        train_dict: The training word pairs, organized by source and target lengths.
        val_dict: The validation word pairs, organized by source and target lengths.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model to generate output tokens.
        opts: The input arguments for hyper-parameters and others.
    """

    # The loss is created once up front so it also exists when train_dict is empty.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        list(encoder.parameters()) + list(decoder.parameters()),
        lr=opts.learning_rate,
    )

    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    char_to_index = idx_dict['char_to_index']

    best_val_loss = 1e6
    train_losses = []
    val_losses = []

    # The context manager closes the log file even if training is interrupted.
    with open(os.path.join(opts.checkpoints_dir, 'loss_log.txt'), 'w') as loss_log:

        for epoch in range(opts.n_epochs):

            # Decay the learning rate of the optimizer at the start of every epoch.
            for param_group in optimizer.param_groups:
                param_group['lr'] *= opts.lr_decay

            epoch_losses = []

            for key in train_dict:

                input_strings, target_strings = zip(*train_dict[key])

                # NOTE(review): assumes string_to_index_list(string, char_to_index, end_token)
                # returns a list of int indexes — confirm against data_handler.py.
                input_tensors = [
                    torch.LongTensor(string_to_index_list(s, char_to_index, end_token))
                    for s in input_strings
                ]
                target_tensors = [
                    torch.LongTensor(string_to_index_list(s, char_to_index, end_token))
                    for s in target_strings
                ]

                num_tensors = len(input_tensors)
                num_batches = int(np.ceil(num_tensors / float(opts.batch_size)))

                for i in range(num_batches):

                    start = i * opts.batch_size
                    end = start + opts.batch_size

                    # torch.stack needs equal-length sequences; this relies on every
                    # entry under one key having the same source and target length.
                    inputs = torch.stack(input_tensors[start:end])
                    targets = torch.stack(target_tensors[start:end])

                    # The last batch of a key may be smaller than opts.batch_size.
                    BS = inputs.size(0)

                    _, encoder_hidden = encoder(inputs)

                    # The last hidden state of the encoder becomes the first hidden state
                    # of the decoder; the decoder cell state starts from zeros.
                    decoder_hidden = encoder_hidden
                    decoder_cell = torch.zeros_like(decoder_hidden)

                    # Every sequence starts decoding from the start token.
                    decoder_input = torch.full(
                        (BS, 1), start_token, dtype=torch.long, device=inputs.device
                    )

                    loss = 0.0

                    seq_len = targets.size(1)  # Gets seq_len from BS x seq_len

                    for si in range(seq_len):
                        decoder_output, decoder_hidden, decoder_cell = decoder(
                            decoder_input, decoder_hidden, decoder_cell
                        )

                        current_target = targets[:, si]

                        # Cross entropy between the decoder scores and the ground truth.
                        loss = loss + criterion(decoder_output, current_target)

                        # Teacher forcing: feed the ground-truth token as the next input.
                        decoder_input = current_target.unsqueeze(1)

                    loss = loss / float(seq_len)
                    epoch_losses.append(loss.item())

                    # Clear stale gradients, back-propagate, then update both networks.
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            train_loss = np.mean(epoch_losses)
            val_loss = evaluate(val_dict, encoder, decoder, idx_dict, criterion, opts)

            if val_loss < best_val_loss:
                # Remember the new best so a later, worse epoch cannot overwrite the checkpoint.
                best_val_loss = val_loss
                utils.store_checkpoints(encoder, decoder, idx_dict, opts)

            gen_string = find_pig_latin(TEST_SENTENCE, encoder, decoder, idx_dict, opts)
            print("Epoch: {:3d} | Train loss: {:.3f} | Val loss: {:.3f} | Gen: {:20s}".format(epoch, train_loss, val_loss, gen_string))

            loss_log.write('{} {} {}\n'.format(epoch, train_loss, val_loss))
            loss_log.flush()

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            utils.store_loss_plots(train_losses, val_losses, opts)


In [None]:
def evaluate(data_dict, encoder, decoder, idx_dict, criterion, opts):
    """Evaluates the model on a held-out validation or test set.

    Mirrors the training loop (teacher forcing, per-step loss averaged over the
    target length) but runs without gradient tracking and never updates parameters.

    Arguments:
        data_dict: The validation/test word pairs, organized by source and target lengths.
        encoder: An encoder model to produce annotations for each step of the input sequence.
        decoder: A decoder model to generate output tokens.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        criterion: Used to compute the CrossEntropyLoss for each decoder output.
        opts: The command-line arguments.

    Returns:
        mean_loss: The average loss over all batches from data_dict (NaN if there are no batches).
    """

    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    char_to_index = idx_dict['char_to_index']

    losses = []

    # No parameter updates happen here, so skip building the autograd graph.
    with torch.no_grad():

        for key in data_dict:

            input_strings, target_strings = zip(*data_dict[key])

            # NOTE(review): assumes string_to_index_list(string, char_to_index, end_token)
            # returns a list of int indexes — confirm against data_handler.py.
            input_tensors = [
                torch.LongTensor(string_to_index_list(s, char_to_index, end_token))
                for s in input_strings
            ]
            target_tensors = [
                torch.LongTensor(string_to_index_list(s, char_to_index, end_token))
                for s in target_strings
            ]

            num_tensors = len(input_tensors)
            num_batches = int(np.ceil(num_tensors / float(opts.batch_size)))

            for i in range(num_batches):

                start = i * opts.batch_size
                end = start + opts.batch_size

                # torch.stack needs equal-length sequences; this relies on every
                # entry under one key having the same source and target length.
                inputs = torch.stack(input_tensors[start:end])
                targets = torch.stack(target_tensors[start:end])

                # The last batch of a key may be smaller than opts.batch_size.
                BS = inputs.size(0)

                _, encoder_hidden = encoder(inputs)

                # The last hidden state of the encoder becomes the first hidden state
                # of the decoder; the decoder cell state starts from zeros.
                decoder_hidden = encoder_hidden
                decoder_cell = torch.zeros_like(decoder_hidden)

                # Every sequence starts decoding from the start token.
                decoder_input = torch.full(
                    (BS, 1), start_token, dtype=torch.long, device=inputs.device
                )

                loss = 0.0

                seq_len = targets.size(1)  # Gets seq_len from BS x seq_len

                # A separate index name keeps the batch counter `i` intact.
                for si in range(seq_len):
                    decoder_output, decoder_hidden, decoder_cell = decoder(
                        decoder_input, decoder_hidden, decoder_cell
                    )

                    current_target = targets[:, si]

                    # Cross entropy between the decoder scores and the ground truth.
                    loss = loss + criterion(decoder_output, current_target)

                    # The next input is this time step's target (teacher forcing).
                    decoder_input = current_target.unsqueeze(1)

                loss = loss / float(seq_len)
                losses.append(loss.item())

    if not losses:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')

    mean_loss = np.mean(losses)

    return mean_loss


In [None]:
def find_pig_latin(sentence, encoder, decoder, idx_dict, opts):
    """Translate a whole English sentence to Pig-Latin, one word at a time.

    The sentence is split on whitespace, each word is passed through `translate`
    on its own, and the results are joined back together with single spaces.
    """
    translated_words = []
    for word in sentence.split():
        translated_words.append(translate(word, encoder, decoder, idx_dict, opts))
    return ' '.join(translated_words)


def translate(input_string, encoder, decoder, idx_dict, opts):
    """Translates a given string from English to Pig-Latin.

    Decoding is greedy: at every step the highest-scoring token is emitted and fed
    back as the next decoder input. Generation stops at the end token or after
    max_generated_chars steps, whichever comes first.

    Arguments:
        input_string: The word to translate.
        encoder: An encoder model to produce the initial decoder hidden state.
        decoder: A decoder model to generate output tokens.
        idx_dict: Contains char-to-index and index-to-char mappings, and start & end token indexes.
        opts: Hyper-parameters; not read by this greedy decoder, kept for a uniform signature.

    Returns:
        gen_string: The generated characters, without the end token.
    """

    char_to_index = idx_dict['char_to_index']
    index_to_char = idx_dict['index_to_char']
    start_token = idx_dict['start_token']
    end_token = idx_dict['end_token']
    max_generated_chars = 20
    gen_string = ''

    # Convert the given string to a 1 x seq_len tensor of indexes.
    # NOTE(review): assumes string_to_index_list(string, char_to_index, end_token)
    # returns a list of int indexes — confirm against data_handler.py.
    indexes = string_to_index_list(input_string, char_to_index, end_token)
    indexes = torch.LongTensor(indexes).unsqueeze(0)

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        _, encoder_last_hidden = encoder(indexes)

        # The last hidden state of the encoder becomes the first hidden state
        # of the decoder; the decoder cell state starts from zeros.
        decoder_hidden = encoder_last_hidden
        decoder_cell = torch.zeros_like(decoder_hidden)

        # The first decoder input is the start token (batch of one).
        decoder_input = torch.full((1, 1), start_token, dtype=torch.long)

        for _ in range(max_generated_chars):
            decoder_output, decoder_hidden, decoder_cell = decoder(
                decoder_input, decoder_hidden, decoder_cell
            )

            # Softmax is monotonic, so the most probable character is simply the
            # arg-max of the un-normalized scores.
            ni = decoder_output.argmax(dim=1)

            if int(ni[0]) == end_token:
                break

            gen_string += index_to_char[ni.item()]

            # The prediction becomes the decoder input at the next time step.
            decoder_input = ni.unsqueeze(1)

    return gen_string

In [None]:
# Start training. A KeyboardInterrupt (e.g. interrupting the kernel) is caught so
# training can be stopped early with a short message instead of a traceback.
try:
    train_model(train_dict, val_dict, idx_dict, encoder, decoder, opts)
except KeyboardInterrupt:
    print('Exiting early from training.')