## Attention Seq2Seq

* **Task**: toy "translation" task --- translating a list of letters (from A to H) to the next-letter-list (e.g. ['A', 'B', 'C'] translates as ['B', 'C', 'D']. 
* **Type**: Luong et al. (2016). No bidirection or stacking. Clear-to-the-boot step-by-step demo.
* **PyTorch Version**: 0.3.1
* **Rant**: showy people on Github write convoluted tutorial code (although efficient, sophisticated and all). Doesn't help for beginners at all! This tutorial tells you all you need to know!!

In [1]:
from __future__ import division

import unicodedata
import string
import re
import random
import time
import math
import numpy as np

from io import open

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

### Data Prep

In [2]:
class Indexer:
    """Token-Index mapping."""
    
    def __init__(self, name):
        """
        Args:
            name: name of the indexer.
        """
        self.name = name
        self.word2index = {"SOS": 0, "EOS": 1} # str -> int
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2count = {"SOS": 0, "EOS": 0} # str -> int
        self.nWords = 0  # Count SOS and EOS
    
    def add_sentence(self, sentence):
        """Add a sentence to the dictionary.
        
        Args:
            sentence: a list of tokens (in string).
        """
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        """Add a word to the dictionary.
        
        Args:
            word: a token (in string).
        """
        if word not in self.word2index:
            self.word2index[word] = self.nWords
            self.word2count[word] = 1
            self.index2word[self.nWords] = word
            self.nWords += 1
        else:
            self.word2count[word] += 1 
            
    def get_index(self, word):
        """Word->Index lookup.
        
        Args:
            word: a token (string).
        Returns:
            The index of the word.
        """
        return self.word2index[word] if word in self.word2index else -1
    
    def get_word(self, index):
        """Index->Word lookup.
        
        Args:
            index: index of a token.
        Returns:
            The token under the index. -1 if the index is out of bound.
        """
        return self.index2word[index] if index<self.nWords else -1
    
    def get_sentence_index(self, sentence):
        """Words->Indexs lookup.
        
        Args:
            sentence: a list of token (string).
        Returns:
            A list of indices.
        """
        return [self.get_index(word) for word in sentence]
    
    def get_sentence_word(self, indexSentence):
        """Indexs->Words lookup.
        
        Args:
            indexSentence: a list of indices.
        Returns:
            A list of tokens (string).
        """
        return [self.get_word(index) for index in indexSentence]

In [3]:
# Toy data generation
#   vocab -> A to I
#   length -> 3 to 8
#   task -> translate for the next letter (e.g. A -> B)

VOCAB = [chr(i) for i in range(65,74)] # 'A' -> 'I'
FROM_LEN, TO_LEN = 3, 8
MAX_LENGTH = TO_LEN + 2
SOS, EOS = 'SOS', 'EOS'
INDEXER = Indexer('LetterTranslator')
DATA_SIZE = 3000

def translate_word(word):
    """Find the next letter.
    
    Args:
        word: a letter word (e.g. 'A').
    Returns:
        The next letter to word.
    """
    return VOCAB[VOCAB.index(word)+1]

def translate_sent(sent):
    """Find the next-letter translation of a sentence.
    
    Args:
        sent: a list of letter words.
    Returns:
        The next letters.
    """
    return [translate_word(word) for word in sent]

def generate_pair():
    """Randomly generate a pair of sentences (arg1 translates to arg2).
    
    Returns:
        randInput: a list of letter words.
        randTarget: a list of translation letter words of randInput.
        randInputLen, randTargetLen: lengths of the lists above.
    """
    randInput = list(np.random.choice(VOCAB[:-1], size=random.randint(FROM_LEN,TO_LEN)))
    randTarget = translate_sent(randInput)
    randInputLen, randTargetLen = len(randInput), len(randTarget)
    return randInput, randTarget+[str('EOS')], \
           randInputLen, randTargetLen+1
        # str(): default is utf-8

def generate_data():
    """Randomly generate a set of pairs of sentences (arg1 translates to arg2).
    
    Returns:
        pairs: a pair of lists of torch Variables (torch.LongTensor).
        lengths: lengths of the corresponding lists in pairs.
    """
    pairs, lengths = [], []
    for _ in range(DATA_SIZE):
        randInput,randTarget,randInputLen,randTargetLen = generate_pair()
        INDEXER.add_sentence(randInput)
        INDEXER.add_sentence(randTarget)
        pairs.append([Variable(torch.LongTensor(INDEXER.get_sentence_index(randInput)).view(-1,1)),
                      Variable(torch.LongTensor(INDEXER.get_sentence_index(randTarget)).view(-1,1))])
            # convert sentences to <mt,bc> shape.
            # here bc=1.
        lengths.append([randInputLen,randTargetLen])
    return pairs, lengths

In [4]:
pairs, lengths = generate_data()

### Model

In [50]:
# Seq2Seq with attention, similar to Luong et al. (2016)
#   Comment notation: mt = max-time; bc = batch-size; h = hidden-size.

HIDDEN_SIZE = 20

class EncoderRNN(nn.Module):
    """Simple GRU encoder."""
    
    def __init__(self, inputSize, hiddenSize, nLayers=1):
        # inputSize: vocabulary size.
        # hiddenSize: size for both embedding and GRU hidden.
        super(EncoderRNN, self).__init__()
        self.inputSize = inputSize
        self.hiddenSize = hiddenSize
        self.nLayers = nLayers
        self.embedding = nn.Embedding(inputSize, hiddenSize)
        self.gru = nn.GRU(hiddenSize, hiddenSize, nLayers)
    
    def forward(self, inputs, inputsLen, hidden):
        # inputs: <mt,bc>
        # hidden: <n_layer*n_direction,bc,h>
        embedded = self.embedding(inputs).view(inputsLen,1,-1) # <mt,bc=1,h>
        output,hidden = self.gru(embedded, hidden)
            # output: <mt,bc=1,h>
            # hidden: <n_layer*n_direction,bc,h>
        return output, hidden
    
    def init_hidden(self):
        return Variable(torch.zeros(self.nLayers,1,self.hiddenSize))

class LinearAttention(nn.Module):
    """Basic linear attention layer."""
    
    def __init__(self, hiddenSize):
        super(LinearAttention, self).__init__()
        self.hiddenSize = hiddenSize
        self.attention = nn.Linear(hiddenSize, hiddenSize)
    
    def forward(self, hidden, encoderOutput):
        encoderOutputLen = len(encoderOutput)
        attentionEnergies = Variable(torch.zeros(encoderOutputLen))
        for i in range(encoderOutputLen):
            attentionEnergies[i] = self.score(hidden, encoderOutput[i])
                # encoderOutput[i]: 1 time step from <mt,bc,h>
        return F.softmax(attentionEnergies,dim=-1).unsqueeze(0).unsqueeze(0)
            # <mt,> -> <bc=1,1,mt>, technical convenience.
        
    def score(self, hidden, encoderOutput):
            # hidden: <bc=1,h>
            # encoderOutput: <bc=1,h> (1 time step).
        energy = self.attention(encoderOutput)
            # linear attention: <bc,h> * <h,h> -> <bc,h>
        energy = hidden.dot(energy)
            # dot: <bc,h> * <bc,h> -> <bc,h>
            # .dot smartly find fitting dimensions.
        return energy
    
class LuongDecoderRNN(nn.Module):
    """Luong attention."""
    
    def __init__(self, hiddenSize, outputSize, nLayers=1):
        super(LuongDecoderRNN, self).__init__()
        self.hiddenSize = hiddenSize
        self.outputSize = outputSize
        self.nLayers = nLayers
        self.embedding = nn.Embedding(outputSize, hiddenSize)
        self.gru = nn.GRU(2*hiddenSize, hiddenSize) 
        self.out = nn.Linear(2*hiddenSize, outputSize)
            # inputSize doubles because concatted context of same hiddenSize.
        self.linearAttention = LinearAttention(hiddenSize)
        
    def forward(self, input, hidden, context, encoderOutput):
            # input: <mt=1,bc=1>
            # hidden: <n_layer*n_direction,bc,h>
            # context: <bc=1,h>
            # encoderOutput: <mt,bc,h>
        embedded = self.embedding(input).view(1,1,-1) # <mt=1,bc=1,h>
        input = torch.cat((embedded,context.unsqueeze(0)),2)
            # unsqueeze: <bc,h> -> <mt=1,bc,h>
            # concat: <mt,bc,h> & <mt,bc,h> @2 -> <mt,bc,2h>
        output, hidden = self.gru(input, hidden)
            # IN: <mt=1,bc,2h>, <n_layer*n_direction,bc,h>
            # OUT: <mt=1,bc,h>, <n_layer*n_direction,bc,h>
        attentionWeights = self.linearAttention(output.squeeze(0),
                                                encoderOutput)
            # squeeze: <mt=1,bc,h> -> <bc,h>
            # attentionWeights: <bc=1,1,mt>
        context = attentionWeights.bmm(encoderOutput.transpose(0,1))
            # transpose: <mt,bc,h> -> <bc,mt,h>
            # bmm (batched matrix multiplication): 
            #   <bc=1,1,mt> & <bc,mt,h> -> <bc=1,1,h>
        output = output.squeeze(0)
        context = context.squeeze(1)
            # output squeeze: <mt=1,bc=1,h> -> <bc,h>
            # context squeeze: <bc=1,1,h> -> <bc,h>
        output = F.log_softmax(F.tanh(self.out(torch.cat((output,context),1))),dim=-1)
            # concat: <bc,h> & <bc,h> @1 -> <bc,2h>
            # linear->tahn/out: <bc,2h> * <2h,vocab> -> <bc,vocab>
            # softmax: along dim=-1, i.e. vocab.
        return output, hidden, context, attentionWeights
            # full output for visualization:
            #   output: <bc,vocab>
            #   hidden: <n_layer*n_direction,bc,h>
            #   context: <bc,h>
            #   attentionWeights: <bc=1,1,mt>

### Trainer

In [51]:
def train_step(inputs, inputsLen, targets, targetsLen,
               encoder, decoder, 
               encoderOptim, decoderOptim, criterion,
               enforcingRatio, clip):
    """One training step (on a single pair of sentences)."""
    # Clear previous grads
    # WHY: Since the backward() function accumulates gradients, 
    #      and you don’t want to mix up gradients between minibatches, 
    #      you have to zero them out at the start of a new minibatch. 
    #      This is exactly like how a general (additive) accumulator 
    #      variable is initialized to 0 in code.
    encoderOptim.zero_grad()
    decoderOptim.zero_grad()
    # Set up loss
    loss = 0
    # Run encoder
    encoderHidden = encoder.init_hidden()
    encoderOutput, encoderHidden = encoder(inputs, inputsLen, encoderHidden)
    # Run decoder
    decoderInput = Variable(torch.LongTensor([[INDEXER.get_index('SOS')]]))
    decoderContext = Variable(torch.zeros(1,decoder.hiddenSize))
    decoderHidden = encoderHidden
    enforce = random.random() < enforcingRatio
    for di in range(targetsLen):
        decoderOutput,decoderHidden,decoderContext,attentionWeights = decoder(decoderInput,
                                                                              decoderHidden,
                                                                              decoderContext, 
                                                                              encoderOutput)
        loss += criterion(decoderOutput, targets[di])
        if enforce: # i.e. feed gold target tokens in training.
            decoderInput = targets[di] # decoderInput can be 1 or 1x1 
        else:
            topValue,topIndex = decoderOutput.data.topk(1)
            decoderInput = Variable(topIndex)
    # Backprop
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoderOptim.step()
    decoderOptim.step()
    return loss.data[0] / targetsLen

def train(pairs, lengths,
          nEpochs=1, epochSize=1000, lr=1e-4,
          enforcingRatio=0.5, clip=5.0,
          printEvery=100):
    """Train multiple steps."""
    dataSize = len(pairs)
    encoder = EncoderRNN(INDEXER.nWords, HIDDEN_SIZE)
    decoder = LuongDecoderRNN(HIDDEN_SIZE, INDEXER.nWords)
    encoderOptim = optim.Adam(encoder.parameters(),lr)
    decoderOptim = optim.Adam(decoder.parameters(),lr)
    criterion = nn.CrossEntropyLoss()
    averageLoss = 0
    for e in range(nEpochs):
        start = time.time()
        epochLoss = 0
        for step in range(epochSize):
            i = random.choice(range(0,dataSize))
            inputs, targets = pairs[i]
            inputsLen, targetsLen = lengths[i]
            loss = train_step(inputs, inputsLen, targets, targetsLen,
                              encoder, decoder,
                              encoderOptim, decoderOptim, criterion,
                              enforcingRatio, clip)
            if step!=0 and step%printEvery==0:
                print("Step %d average loss = %.4f" % (step, loss))
            epochLoss += loss
        epochLoss /= epochSize
        averageLoss += epochLoss
        print("\nEpoch %d loss = %.4f (time: %.2f)\n" % (e+1,epochLoss,
                                                         time.time()-start))
    averageLoss /= nEpochs
    print("\nGrand average loss = %.4f\n" % averageLoss)
    return encoder, decoder

In [52]:
encoder, decoder = train(pairs, lengths, 
                         nEpochs=5, epochSize=len(pairs),
                         printEvery=500)

Step 500 average loss = 1.9816
Step 1000 average loss = 2.0165
Step 1500 average loss = 2.1684
Step 2000 average loss = 1.9741
Step 2500 average loss = 1.8433

Epoch 1 loss = 1.9717 (time: 72.23)

Step 500 average loss = 1.7264
Step 1000 average loss = 1.4872
Step 1500 average loss = 1.4274
Step 2000 average loss = 1.6146
Step 2500 average loss = 1.4620

Epoch 2 loss = 1.6652 (time: 67.30)

Step 500 average loss = 1.6495
Step 1000 average loss = 1.4078
Step 1500 average loss = 1.6295
Step 2000 average loss = 1.7015
Step 2500 average loss = 1.2893

Epoch 3 loss = 1.4910 (time: 66.06)

Step 500 average loss = 1.2579
Step 1000 average loss = 1.5437
Step 1500 average loss = 1.7422
Step 2000 average loss = 1.1936
Step 2500 average loss = 1.2038

Epoch 4 loss = 1.3755 (time: 64.57)

Step 500 average loss = 1.2677
Step 1000 average loss = 1.2960
Step 1500 average loss = 1.3592
Step 2000 average loss = 1.3739
Step 2500 average loss = 0.9951

Epoch 5 loss = 1.2489 (time: 64.97)


Grand average 

### Evaluation

In [None]:
def train_step(inputs, inputsLen, targets, targetsLen,
               encoder, decoder, 
               encoderOptim, decoderOptim, criterion,
               enforcingRatio, clip):
    """One training step (on a single pair of sentences)."""
    # Clear previous grads
    # WHY: Since the backward() function accumulates gradients, 
    #      and you don’t want to mix up gradients between minibatches, 
    #      you have to zero them out at the start of a new minibatch. 
    #      This is exactly like how a general (additive) accumulator 
    #      variable is initialized to 0 in code.
    encoderOptim.zero_grad()
    decoderOptim.zero_grad()
    # Set up loss
    loss = 0
    # Run encoder
    encoderHidden = encoder.init_hidden()
    encoderOutput, encoderHidden = encoder(inputs, inputsLen, encoderHidden)
    # Run decoder
    decoderInput = Variable(torch.LongTensor([[INDEXER.get_index('SOS')]]))
    decoderContext = Variable(torch.zeros(1,decoder.hiddenSize))
    decoderHidden = encoderHidden
    enforce = random.random() < enforcingRatio
    for di in range(targetsLen):
        decoderOutput,decoderHidden,decoderContext,attentionWeights = decoder(decoderInput,
                                                                              decoderHidden,
                                                                              decoderContext, 
                                                                              encoderOutput)
        loss += criterion(decoderOutput, targets[di])
        if enforce: # i.e. feed gold target tokens in training.
            decoderInput = targets[di] # decoderInput can be 1 or 1x1 
        else:
            topValue,topIndex = decoderOutput.data.topk(1)
            decoderInput = Variable(topIndex)
    # Backprop
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoderOptim.step()
    decoderOptim.step()
    return loss.data[0] / targetsLen

In [44]:
def evaluate(sent, sentLen, target, targetLen,
             encoder, decoder, 
             maxLength):
    encoderHidden = encoder.init_hidden()
    encoderOutput, encoderHidden = encoder(sent, sentLen, encoderHidden)
    decoderInput = Variable(torch.LongTensor([[INDEXER.get_index('SOS')]]))
    decoderContext = Variable(torch.zeros(1,decoder.hiddenSize))
    decoderHidden = encoderHidden
    prediction = []
    lengthGen = 0
    while True:
        lengthGen += 1
        decoderOutput,decoderHidden,decoderContext,attentionWeights = decoder(decoderInput,
                                                                              decoderHidden,
                                                                              decoderContext, 
                                                                              encoderOutput)
        topValue,topIndex = decoderOutput.data.topk(1)
        decoderInput = Variable(topIndex)
        prediction += list(topIndex.squeeze().numpy())
        if prediction[0] == INDEXER.get_index('EOS') or lengthGen>=maxLength:
            break
    sent = list(sent.data.squeeze().numpy())
    target = list(target.data.squeeze().numpy())
    print("INPUT >> %s" % ' '.join(INDEXER.get_sentence_word(sent)))
    print("PRED >> %s" % ' '.join(INDEXER.get_sentence_word(prediction[:targetLen])))
    print("TRUE >> %s" % ' '.join(INDEXER.get_sentence_word(target)))
    
def random_evaluate(pairs, lengths,
                    encoder, decoder,
                    maxLength=15):
    i = random.choice(range(0,len(pairs)))
    sent, target = pairs[i]
    sentLen, targetLen = lengths[i]
    evaluate(sent, sentLen, target, targetLen, encoder, decoder, maxLength)

In [46]:
random_evaluate(pairs, lengths, encoder, decoder)

INPUT >> B F A D E E H
PRED >> C G B F B F I B
TRUE >> C G B E F F I B
