# Assignment 5 


## Student Name/ID: ____

---

## Improving training of sequence-to-sequence network. ##



In this assignment you will build an improved version of a Neural Machine Translation (NMT) model that translates human readable dates ("25th of June, 2009") into machine readable dates ("2009-06-25"). 

As you know from the last lecture, a Sequence to Sequence network (also called a seq2seq network or an Encoder-Decoder network) is a model consisting of two RNNs called the encoder and the decoder. The encoder reads an input sequence and outputs a single vector, and the decoder reads that vector to produce an output sequence. See the lecture slides, from slide 32 onward.

In [None]:
import torch
import numpy as np
import random

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

We will need several helper functions to load data.

In [None]:
from faker import Faker
from tqdm import tqdm
from babel.dates import format_date

# Seed both Faker and Python's `random` module with a fixed value before any
# dates are generated.
Faker.seed(12345)
random.seed(12345)
fake = Faker()

# Define format of the data we would like to generate.
# 'short', 'medium', 'long' and 'full' are babel's named date formats; 'full'
# is listed several times, so random.choice() picks it more often.
# The remaining entries are date patterns. They use lowercase 'y' (calendar
# year): uppercase 'Y' is the ISO week-numbering year, which differs from the
# calendar year for some dates around 1 January and would then disagree with
# the machine readable target produced by dt.isoformat().
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM yyy',
           'd MMMM yyy',
           'dd MMM yyy',
           'd MMM, yyy',
           'd MMMM, yyy',
           'dd, MMM yyy',
           'd MM yy',
           'd MMMM yyy',
           'MMMM d yyy',
           'MMMM d, yyy',
           'dd.MM.yy']

# Candidate locales. NOTE(review): load_date() passes locale='en_US' directly
# and does not read this list.
LOCALES = ['en_US']


def load_date():
    """
        Generate one fake date and render it in two ways.

        :returns: tuple containing human readable string, machine readable
                  (ISO) string, and date object; (None, None, None) when
                  rendering raises AttributeError
    """
    date_obj = fake.date_object()

    try:
        # The format is drawn at random from FORMATS; the locale is fixed.
        rendered = format_date(date_obj, format=random.choice(FORMATS), locale='en_US')
        # Standardize the text: lowercase, commas removed.
        human_readable = rendered.lower().replace(',', '')
        machine_readable = date_obj.isoformat()
    except AttributeError:
        return None, None, None

    return human_readable, machine_readable, date_obj


def load_dataset(m):
    """
        Generates a dataset of date pairs together with its character vocabularies
        :m: the number of examples to try to generate (failed samples are skipped)
        :returns: tuple (dataset, human, machine, inv_machine): dataset is a list of
                  (human readable, machine readable) string pairs, human and machine
                  map characters to integer indices, inv_machine maps indices back
                  to machine characters
    """

    pairs = []
    human_chars = set()
    machine_chars = set()

    for _ in tqdm(range(m)):
        human_str, machine_str, _date = load_date()
        if human_str is None:
            # load_date() reports a failed sample with None values.
            continue
        pairs.append((human_str, machine_str))
        human_chars.update(human_str)
        machine_chars.update(machine_str)

    # Human vocabulary: sorted characters followed by the two special tokens.
    human_symbols = sorted(human_chars) + ['<unk>', '<pad>']
    human = {symbol: index for index, symbol in enumerate(human_symbols)}

    # The machine vocabulary and its inverse share one sorted ordering.
    machine_symbols = sorted(machine_chars)
    inv_machine = dict(enumerate(machine_symbols))
    machine = {symbol: index for index, symbol in enumerate(machine_symbols)}

    return pairs, human, machine, inv_machine


def string_to_int(string, length, vocab):
    """
    Converts a string into a fixed-length list of integers representing the positions of the
    string's characters in the "vocab"

    Arguments:
    string -- input string, e.g. 'Wed 10 Jul 2007'; it is lowercased and commas are removed first
    length -- the number of time steps you'd like, determines if the output will be padded or cut
    vocab -- vocabulary, dictionary used to index every character of your "string"

    Returns:
    rep -- list of integers (size = length) representing the position of the string's characters
           in the vocabulary. Characters missing from the vocabulary map to vocab['<unk>'];
           only if the vocabulary has no '<unk>' entry is the literal '<unk>' used instead.

    Raises:
    KeyError -- if the string is shorter than "length" and the vocabulary has no '<pad>' entry
    """

    # make lower to standardize, then cut to at most `length` characters
    string = string.lower().replace(',', '')[:length]

    # Unknown characters must become an integer index, otherwise the result
    # is a mixed int/str list that cannot be turned into a tensor.
    unk = vocab.get('<unk>', '<unk>')
    rep = [vocab.get(char, unk) for char in string]

    missing = length - len(rep)
    if missing > 0:
        rep += [vocab['<pad>']] * missing

    return rep


def batchify(data, bsz):
    """
    Group the rows of `data` into batches of `bsz` rows each.

    Rows that do not fill a complete batch are dropped from the end. The
    result has shape (number of full batches, bsz, data.size(1)).
    """
    n_full_batches = data.size(0) // bsz
    n_features = data.size(1)
    # Keep only the leading rows that fit into complete batches.
    trimmed = data[:n_full_batches * bsz]
    return trimmed.view(n_full_batches, bsz, n_features).contiguous()


def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty, batch_size=40):
    """
    Turn (human readable, machine readable) string pairs into batched index tensors.

    Arguments:
    dataset -- iterable of (human readable, machine readable) string pairs
    human_vocab -- character-to-index dictionary for the human readable strings
    machine_vocab -- character-to-index dictionary for the machine readable strings
    Tx -- fixed length every human readable string is padded or cut to
    Ty -- fixed length every machine readable string is padded or cut to
    batch_size -- number of examples per batch

    Returns:
    (X, Y) -- long tensors on `device`, each grouped into batches by batchify()
    """
    sources, targets = zip(*dataset)

    encoded_sources = np.array([string_to_int(text, Tx, human_vocab) for text in sources])
    encoded_targets = np.array([string_to_int(text, Ty, machine_vocab) for text in targets])

    X = torch.tensor(encoded_sources, dtype=torch.long, device=device)
    Y = torch.tensor(encoded_targets, dtype=torch.long, device=device)

    return batchify(X, batch_size), batchify(Y, batch_size)

In [None]:
# Generate the date pairs and the vocabularies (10000 generation attempts),
# then print one randomly chosen pair as a sanity check.
dataset, human_vocab, machine_vocab, inv_machine = load_dataset(10000)
print(random.choice(dataset))

Let's preprocess the data and map the raw text data into the index values. We will also use Tx=30 (which we assume is the maximum length of the human readable date; if we get a longer input, we'd have to truncate it) and Ty=10 (since "YYYY-MM-DD" is 10 characters long).

In [None]:
# Tx: fixed length of the human readable input, Ty: fixed length of the
# machine readable output.
Tx = 30
Ty = 10
# Encode every pair as index sequences and group them into batches.
X, Y = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

print("X.shape:", X.shape)
print("Y.shape:", Y.shape)

## The Encoder

The encoder of a seq2seq network is an RNN that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word. This part of your model will be the same as in Exercise 5, part 2.

In [None]:
class EncoderRNN(torch.nn.Module):
    """Encoder: an embedding layer followed by a GRU, applied one time step at a time."""

    def __init__(self, input_size, hidden_size):
        """
        input_size -- number of distinct input symbols (rows of the embedding table)
        hidden_size -- width of both the embedding and the GRU hidden state
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """
        Process one time step for a whole batch.

        input -- symbol indices whose first dimension is the batch
        hidden -- previous hidden state (1 x batch_size x hidden_size)

        Returns the GRU's (output, hidden) pair.
        """
        batch_size = input.size(0)
        # The GRU expects (sequence_length x batch_size x input_size); here
        # the sequence length is 1.
        step_input = self.embedding(input).view(1, batch_size, -1)
        return self.gru(step_input, hidden)

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state for `batch_size` sequences."""
        return torch.zeros((1, batch_size, self.hidden_size), device=device)

## The Decoder with attention

The decoder is another RNN that takes the encoder output vector(s) and outputs a sequence of words to create the translation.

## Problem 1:

Insert a batch normalization layer before the GRU layer. 

Check [documentation for batch normalization layers](https://pytorch.org/docs/stable/nn.html#normalization-layers). 

**Hint**
Be careful with layer sizes. RNN layers expect tensors of size (sequence_length x batch_size x input_size), in this case (1 x 40 x 256), while the `BatchNorm1d` layer expects a tensor of size (batch_size x input_size). Since the sequence length in the decoder part is always 1, you only need to remove the first dimension before batch normalization and add it back afterwards.

In [None]:
class AttnDecoderRNN(torch.nn.Module):
    """
    Attention decoder that produces one output symbol per call.

    Pipeline of forward(): embedding -> dropout -> attention weights over the
    encoder outputs -> combine with the embedding -> ReLU -> batch
    normalization -> GRU -> log-softmax over the output symbols.
    """

    def __init__(self, hidden_size, output_size, dropout_p, max_length=Tx):
        """
        hidden_size -- width of the embedding and of the GRU hidden state
        output_size -- number of output classes (also rows of the embedding table)
        dropout_p -- dropout probability applied to the embedded input
        max_length -- number of encoder positions the attention layer scores
        """
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = torch.nn.Embedding(self.output_size, self.hidden_size)

        self.attn = torch.nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = torch.nn.Dropout(self.dropout_p)

        # Batch normalization of the GRU input; it works on
        # (batch_size x hidden_size) tensors.
        self.batch_norm = torch.nn.BatchNorm1d(self.hidden_size)

        self.gru = torch.nn.GRU(self.hidden_size, self.hidden_size)
        self.out = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """
        Run one decoding step for a whole batch.

        input -- indices of the previous output symbols; first dimension is the batch
        hidden -- previous hidden state (1 x batch_size x hidden_size)
        encoder_outputs -- encoder outputs (batch_size x max_length x hidden_size)

        Returns (output, hidden, attn_weights): log-probabilities
        (batch_size x output_size), the new hidden state, and the attention
        weights (batch_size x max_length).
        """
        embedded = self.embedding(input).view(1, input.size(0), -1)
        embedded = self.dropout(embedded)

        # Score every encoder position from the current input and hidden state.
        attn_weights = torch.nn.functional.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs: (batch_size x 1 x hidden_size).
        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_outputs)

        output = torch.cat((embedded[0], attn_applied[:, 0, :]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = torch.nn.functional.relu(output)

        # The sequence dimension is always 1 here: drop it for BatchNorm1d
        # and add it back for the GRU. In training mode BatchNorm1d needs
        # more than one example per batch; in eval mode a single example works.
        output = self.batch_norm(output.squeeze(0)).unsqueeze(0)

        output, hidden = self.gru(output, hidden)
        output = torch.nn.functional.log_softmax(self.out(output[0]), dim=1)

        return output, hidden, attn_weights

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state for `batch_size` sequences."""
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

## Train the model

First let's make some helper functions to plot losses while training and to print time elapsed and estimated time remaining given the current time and progress %.

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import time
import math

%matplotlib inline


def showPlot(points):
    """Plot the given sequence of values (e.g. averaged losses) as a line chart."""
    # plt.subplots() creates the figure itself; calling plt.figure() first
    # would only leave an additional, empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """
    Report elapsed and estimated remaining time.

    since -- start timestamp as returned by time.time()
    percent -- fraction of the work completed so far (must be non-zero)

    Returns a string of the form '<elapsed> (- <remaining>)'.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction completed so far.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

**Train step** 
First, we write the `train()` function, which performs one training step over a batch of sentence pairs. In Exercise 5, part 2, we always used the model output as the decoder input for the next time step. ["Scheduled sampling"](https://arxiv.org/abs/1506.03099) is the concept of randomly using the real target outputs as each next input, instead of using the decoder’s guess as the next input. Using scheduled sampling causes the model to converge faster.

## Problem 2

Implement random sampling of the decoder input.
- Get sampling probability and decide whether to use sampling or not.
- Feed the target as the next input

Because of the freedom PyTorch’s autograd gives us, we can randomly choose whether to use sampling or not with a simple if statement. Turn `sampling_ratio` up to use more of it, or down to use less.

**Hint**

Use `random.random()` to get sampling probability and compare its value with `sampling_ratio` to decide whether to use target or decoder's guess as decoder input.

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion, sampling_ratio=0.5, max_length=Tx):
    """
    Perform one optimization step on a single batch.

    Arguments:
    input_tensor -- input indices (batch_size x input_length)
    target_tensor -- target indices (batch_size x target_length)
    encoder, decoder -- the two networks being trained
    encoder_optimizer, decoder_optimizer -- their optimizers
    criterion -- loss applied to the decoder output of every time step
    sampling_ratio -- probability of feeding the real targets (instead of the
                      decoder's own guesses) as decoder inputs for this batch
    max_length -- number of encoder positions kept for the attention

    Returns:
    the batch loss averaged over the target time steps (a Python float)
    """
    # Take the batch size from the data instead of hard-coding it.
    batch_size = input_tensor.size(0)
    input_length = input_tensor.size(1)
    target_length = target_tensor.size(1)

    encoder_hidden = encoder.initHidden(batch_size)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs = torch.zeros(batch_size, max_length, encoder.hidden_size, device=device)

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[:, ei], encoder_hidden)
        # encoder_output is (1 x batch_size x hidden_size); drop the sequence dimension.
        encoder_outputs[:, ei, :] = encoder_output[0]

    # The first decoder input is the start token, whose index is len(machine_vocab).
    decoder_input = torch.tensor([[len(machine_vocab)]] * batch_size, dtype=torch.long, device=device)

    decoder_hidden = encoder_hidden

    # Decide once per batch whether the targets or the decoder's guesses are fed back.
    use_sampling = random.random() < sampling_ratio

    loss = 0

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[:, di])

        if use_sampling:
            # Feed the target as the next input
            decoder_input = target_tensor[:, di]
        else:
            topv, topi = decoder_output.topk(1)
            # squeeze(1) keeps the batch dimension even for a batch of one;
            # detach from history as input
            decoder_input = topi.squeeze(1).detach()

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

Epoch training

## Problem 3

Use the [`Adam`](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam) optimizer to train the encoder and the decoder. You need to adjust your learning rate: it is the most sensitive hyper-parameter, and a good value might differ a lot depending on the optimizer and other modules.

In [None]:
def trainIters(encoder, decoder, n_epochs, print_every=1000, plot_every=100, learning_rate=0.005):
    """
    Train encoder and decoder on the batches in the global tensors X and Y.

    Arguments:
    encoder, decoder -- the networks to train
    n_epochs -- number of passes over all batches
    print_every -- print progress and one sample prediction every this many batches
    plot_every -- record one averaged loss value every this many batches
    learning_rate -- learning rate of both Adam optimizers; pass a different
                     value if training is unstable or too slow

    The recorded losses are plotted when training finishes.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    n_batches = X.size(0)
    n_iters = n_epochs * n_batches

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = torch.nn.NLLLoss()

    # An earlier evaluation may have left the models in eval mode.
    encoder.train()
    decoder.train()

    for ep in range(n_epochs):
        for b_id in range(n_batches):
            input_tensor = X[b_id]
            target_tensor = Y[b_id]

            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer,
                         criterion, sampling_ratio=0.5)
            print_loss_total += loss
            plot_loss_total += loss

            # Count batches over all epochs, so that every reporting window
            # holds exactly print_every (or plot_every) losses and the
            # averages below divide by the right number.
            done = ep * n_batches + b_id + 1

            if done % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, done / n_iters),
                      done, done / n_iters * 100, print_loss_avg))
                evaluateRandomly(encoder, decoder, 1)
                # Set training mode for encoder and decoder
                encoder.train()
                decoder.train()

            if done % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

    showPlot(plot_losses)

### Evaluation

Evaluation is mostly the same as training, but there are no targets so we simply feed the decoder’s predictions back to itself for each step. Every time it predicts a character we add it to the output string. Since every machine readable date is exactly `Ty` characters long, we always decode `Ty` steps instead of stopping at an EOS token. We also store the decoder’s attention outputs for display later.

In [None]:
def evaluate(encoder, decoder, input_tensor, max_length=Tx):
    """
    Decode one encoded input sequence by feeding the decoder its own predictions.

    Arguments:
    encoder, decoder -- the trained networks (put them in eval mode beforehand)
    input_tensor -- 1-D tensor of input indices for a single example
    max_length -- number of encoder positions kept for the attention

    Returns:
    (decoded_words, decoder_attentions) -- the list of Ty predicted characters
    and the attention weights of every decoding step
    """
    with torch.no_grad():
        # Add a batch dimension of size 1.
        input_tensor = input_tensor.unsqueeze(0)
        input_length = input_tensor.size(1)
        encoder_hidden = encoder.initHidden(1)

        encoder_outputs = torch.zeros(1, max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(
                input_tensor[:, ei], encoder_hidden)
            encoder_outputs[:, ei, :] = encoder_output

        # The first decoder input is the start token, whose index is len(machine_vocab).
        decoder_input = torch.tensor(np.array([len(machine_vocab)]*1), dtype=torch.long, device=device)
        decoder_input = decoder_input.view(1, -1)

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        # The output length is fixed, so decoding always runs for Ty steps.
        for di in range(Ty):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            # Pick the best class among the real characters only: the decoder
            # has one extra class (the start token) that has no entry in
            # inv_machine and would raise a KeyError if it were predicted.
            topv, topi = decoder_output.data[:, :len(inv_machine)].topk(1)
            decoded_words.append(inv_machine[topi.item()])

            decoder_input = topi.detach()

        return decoded_words, decoder_attentions[:di + 1]


def evaluateRandomly(encoder, decoder, n=10):
    """
    Print `n` randomly chosen examples together with the model's prediction.

    For every example three lines are printed: '>' the human readable input,
    '=' the expected machine readable date and '<' the predicted one.
    The networks are left in eval mode.
    """
    # This disables Dropout and BatchNormalization operation during the test.
    encoder.eval()
    decoder.eval()

    batch_size = X.size(1)
    # X only holds the examples that fit into complete batches, so restrict
    # the draw to those; this avoids indexing past the end of X.
    n_available = min(len(dataset), X.size(0) * batch_size)

    for _ in range(n):
        # Draw the position directly instead of searching the dataset for
        # a randomly chosen pair afterwards.
        example_id = random.randrange(n_available)
        human_readable, machine_readable = dataset[example_id]
        print('>', human_readable)
        print('=', machine_readable)
        output_words, attentions = evaluate(encoder, decoder,
                                            X[example_id // batch_size, example_id % batch_size])
        output_sentence = ''.join(output_words)
        print('<', output_sentence)
        print('')

### Training and Evaluating

With all these helper functions in place (it looks like extra work, but it makes it easier to run multiple experiments) we can actually initialize a network and start training.


In [None]:
# Width of the embeddings and of the GRU hidden state in both networks.
hidden_size = 256
encoder1 = EncoderRNN(len(human_vocab), hidden_size).to(device)
# The decoder gets one class more than the machine vocabulary: the extra
# index len(machine_vocab) serves as the start token.
attn_decoder1 = AttnDecoderRNN(hidden_size, len(machine_vocab)+1, dropout_p=0.2).to(device)

# Train for 5 epochs, printing progress every 50 batches and recording a
# loss value for the plot every 30 batches.
trainIters(encoder1, attn_decoder1, 5, print_every=50, plot_every=30)

In [None]:
# Show predictions for the default number of randomly chosen examples.
evaluateRandomly(encoder1, attn_decoder1)

---