In [6]:
from __future__ import unicode_literals, print_function, division
from io import open

import time
import unicodedata
import string
import re
import random
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### A. DATA PREP

* **Raw Data**
    * Fake translation data for sanity check.
    * The first sentence is A -> H, the second one letter off, i.e. B -> I.
    
* **Desired Format for Model**
    * An indexer that maps between words and word indices.
    * The data: a list of lists, where each sublist is a pair of index-coded sentences.
    * The lengths: for each sentence pair, we have their lengths.

In [51]:
class Indexer:
    
    def __init__(self, name):
        self.name = name
        self.word2index = {} # str -> int
        self.index2word = {}
        self.word2count = {} # str -> int
        self.nWords = 0  # Count SOS and EOS
    
    def add_sentence(self, sentence):
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.nWords
            self.word2count[word] = 1
            self.index2word[self.nWords] = word
            self.nWords += 1
        else:
            self.word2count[word] += 1 
            
    def get_index(self, word):
        return self.word2index[word] if word in self.word2index else -1
    
    def get_word(self, index):
        return self.index2word[index] if index<self.nWords else ""
    
    def get_sentence_index(self, sentence):
        return [self.get_index(word) for word in sentence]
    
    def get_sentence_word(self, indexSentence):
        return [self.get_word(index) for index in indexSentence]

In [62]:
VOCAB = [chr(i) for i in range(65,74)] # 'A' -> 'I'
FROM_LEN, TO_LEN = 3, 8
MAX_LENGTH = TO_LEN + 2
SOS, EOS = 'SOS', 'EOS'
INDEXER = Indexer('LetterTranslator')
DATA_SIZE = 3000

def translate_word(word):
    return VOCAB[VOCAB.index(word)+1]

def translate_sent(sent):
    return [translate_word(word) for word in sent]

def generate_pair():
    randInput = list(np.random.choice(VOCAB[:-1], size=random.randint(FROM_LEN,TO_LEN)))
    randTarget = translate_sent(randInput)
    randInputLen, randTargetLen = len(randInput), len(randTarget)
    return [str('SOS')]+randInput+[str('EOS')], [str('SOS')]+randTarget+[str('EOS')], \
           randInputLen+2, randTargetLen+2
        # str(): default is utf-8

def generate_data():
    pairs, lengths = [], []
    for _ in range(DATA_SIZE):
        randInput,randTarget,randInputLen,randTargetLen = generate_pair()
        INDEXER.add_sentence(randInput)
        INDEXER.add_sentence(randTarget)
        pairs.append([INDEXER.get_sentence_index(randInput),
                      INDEXER.get_sentence_index(randTarget)])
        lengths.append([randInputLen,randTargetLen])
    return pairs, lengths

In [63]:
pairs, lengths = generate_data()

##### B. MODEL

In [65]:
def to_tensor(sentence):
    """Convert a sentence into a <max-time,batch-size> shaped torch tensor.
    Args
        sentence: a list of word indices.
    Returns
        the same sentence in torch.Tensor.
    """
    return torch.tensor(sentence, dtype=torch.long, device=device).view(-1,1) 
        # <mt,> -> <mt,bc=1>

In [66]:
class Encoder(nn.Module):
    """A simple unidirectional GRU encoder."""
    
    def __init__(self, inputSize, hiddenSize):
        """
        Args
            inputSize: (combined) vocabulary size of the translated languages.
            hiddenSize: hidden vector size, used for both embeddings and GRU here.
        """
        super(Encoder, self).__init__()
        self.hiddenSize = hiddenSize
        self.embedding = nn.Embedding(inputSize, hiddenSize)
        self.gru = nn.GRU(hiddenSize, hiddenSize)
    
    def forward(self, input, hidden):
        """The function called by Encoder().
        Args
            input: a word index.
            hidden: an initial word index (i.e. SOS).
        Returns
            output: outputs of GRU (max-time,batch-size,hidden).
            hidden: the last hidden state of the GRU.
        """
        output = self.embedding(input).view(1, 1, -1)
            # format into <max-time,batch-size,hidden>
        output, hidden = self.gru(output, hidden)
            # output: <max-time,batch-size,hidden>
            # hidden: <n_layers*n_directions,batch-size,hidden>
        return output, hidden
    
    def init_hidden(self):
        """Initializer of the first input to a GRU.
        Returns
            A (max-time=1,batch-size=1,hidden) shaped zero tensor. # TODO: revise this.
        """
        return torch.zeros(1, 1, self.hiddenSize, device=device)

In [56]:
class Decoder(nn.Module):
    """A simple unidirectional GRU decoder."""
    
    def __init__(self, inputSize, hiddenSize):
        """
        Args
            inputSize: (combined) vocabulary size of the translated languages.
            hiddenSize: hidden vector size, used for both embeddings and GRU here.
        """
        super(Decoder, self).__init__()
        self.hiddenSize = hiddenSize
        self.embedding = nn.Embedding(inputSize, hiddenSize)
        self.gru = nn.GRU(hiddenSize, hiddenSize)
        self.fc = nn.Linear(hiddenSize, inputSize) # inputSize=outputSize=vocab
        self.softmax = nn.LogSoftmax(dim=1) # it will be applied on <bc,outputSize> tensor.

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(output, hidden)
        output = self.fc(output[0])
        output = F.relu(output) # only .nn.functional.relu works, .nn.ReLU doesn't
        output = self.softmax(output)
            # input size: torch.Size([1]) <- 1 word's index (1 time step).
            # to gru: torch.Size([1, 1, 5]) <max-time,batch-size,hidden>
            # after gru: torch.Size([1, 1, 5]) <max-time,batch-size,hidden>
            # after fc: torch.Size([1, 7119]) <batch-size,vocab-size>
            # after relu: torch.Size([1, 7119]) <batch-size,vocab-size>
            # after softmax: torch.Size([1, 7119]) <batch-size,vocab-size>
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [87]:
# Demo how the Encoder and Decoder classes work

print('ENCODER STEP\n\n')
print('Read 1 source sentence (in code):')
a1 = pairs[0][0]
print(a1, '\n')
a1 = to_tensor(a1)
print('Convert to torch.tensor:')
print(a1, a1.shape, '\n')
print('Run through encoder (10 max-time, hidden = 5):')
e = Encoder(INDEXER.nWords, hiddenSize=5)
eh = e.init_hidden()
eos = torch.zeros(10,5,device=device) # <mt,h>
for i in range(len(a1)):
    eo,eh = e(a1[i], eh) # outshape = <mt,bc,h> [1,1,5] 
    if i==0:
        print('shape of encoder outputs: (output, hidden)')
        print('output =', eo.shape, '| hidden =', eh.shape, '\n')
    eos[i] = eo[0][0] # a <5,> vector.
print('Final content in encoder output container:')
print(eos)
print(eos.shape)
print('\n====================\n\n')
print('DECODER STEP\n\n')
print('Read 1 target sentence (in code):')
a2 = pairs[0][1]
print(a2, '\n')
a2 = to_tensor(a2)
print('Convert to torch.tensor:')
print(a2, a2.shape, '\n')
print('Set up loss criterion:')
l = 0 
crit = nn.NLLLoss()
print(crit, '\n')
print('Run through decoder (10 max-time, hidden = 5):')
d = Decoder(INDEXER.nWords, hiddenSize=5)
di = torch.tensor([[INDEXER.get_index(SOS)]],device=device) # SOS token as the first
dh = eh # init decoder hidden as encoder last hidden
for i in range(len(a2)):
    do,dh = d(di, dh)
    if i==0:
        print('shape of decoder outputs: (output, hidden)')
        print('output =', do.shape, '| hidden =', dh.shape, '\n')
        print('Loss trace:')
    tv,ti = do.topk(1)
#     di = ti.squeeze().detach()
    di = ti
#     print(ti); assert 1==0
        # rid of all 1-dims, then detach from graph (no grad)
        # tv, ti shapes: [1,1]
        # .squeeze(): Returns a tensor with all the dimensions of input of size 1 removed.
        # .detach(): Returns a new Tensor, detached from the current graph.
        #            The result will never require gradient.
    l += crit(do,a2[i]) # do: <1,outputSize> (softmax); a2[i]: <1,>
    print('loss =', l)

ENCODER STEP


Read 1 source sentence (in code):
[0, 1, 2, 3, 4, 3, 5, 6] 

Convert to torch.tensor:
tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 3],
        [ 5],
        [ 6]]) torch.Size([8, 1]) 

Run through encoder (10 max-time, hidden = 5):
shape of encoder outputs: (output, hidden)
output = torch.Size([1, 1, 5]) | hidden = torch.Size([1, 1, 5]) 

Final content in encoder output container:
tensor([[-0.0720, -0.3249,  0.0433,  0.4857, -0.2185],
        [-0.4414, -0.1157,  0.3488,  0.2779, -0.0563],
        [-0.5602,  0.0517,  0.5414,  0.1268, -0.0166],
        [ 0.1437,  0.2044,  0.5803,  0.4327,  0.3196],
        [-0.1980,  0.1108,  0.5918,  0.1437,  0.2102],
        [ 0.2579,  0.2426,  0.5940,  0.4475,  0.4277],
        [ 0.0358,  0.0343,  0.4197,  0.2064, -0.3539],
        [-0.0381, -0.3561,  0.3072,  0.5485, -0.2567],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])
torch.Size([10,

##### C. TRAINING

In [88]:
def train(pairs, lengths, encoder, decoder, encoderOptim, decoderOptim, criterion,
          nEpochs=5, printEvery=100):
    totalLosses = []
    start = time.time()
    for e in range(nEpochs):
        currLosses = []
        for i in range(len(pairs)):
            encoderOptim.zero_grad()
            decoderOptim.zero_grad()
            currSource, currTarget = pairs[i]
            currSource, currTarget = to_tensor(currSource), to_tensor(currTarget)
            currSourceLength, currTargetLength = lengths[i]
            encoderOptim.zero_grad()
            decoderOptim.zero_grad()
            loss = 0
            encoderOutputs = torch.zeros(MAX_LENGTH, encoder.hiddenSize, device=device) # for attention later
            encoderHidden = encoder.init_hidden()
            for ei in range(currSourceLength):
                encoderOutput,encoderHidden = encoder(currSource[ei],encoderHidden)
                encoderOutputs[ei] = encoderOutput[0][0] # <mt=bc=1,h>
            decoderInput = torch.tensor([[INDEXER.get_index(SOS)]], device=device)
            decoderHidden = encoderHidden
            for di in range(currTargetLength):
                decoderOutput,decoderHidden = decoder(decoderInput,decoderHidden)
                topValue,topIndex = decoderOutput.topk(1)
                decoderInput = topIndex
#                 decoderInput = topIndex.squeeze().detach()
                loss += criterion(decoderOutput,currTarget[di]) 
                    # decoderOutput: <batch-size,vocab-size>, a dist'n over vocab.
                    # currTarget[di]: <batch-size>
                if decoderInput.item() == INDEXER.get_index(EOS):
                    break # hitting end of sentence
            loss.backward()
            torch.nn.utils.clip_grad_norm_(encoder.parameters(),5.0)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(),5.0)
            encoderOptim.step()
            decoderOptim.step()
            currAverageLoss = loss.item()/currTargetLength
            if i!=0 and i%printEvery==0:
                print('Loss at epoch %d step %d = %.4f (time=%.4f)' % (e+1,i,currAverageLoss,
                                                                       time.time()-start))
                start = time.time()
            currLosses.append(currAverageLoss)
        totalLosses.append(np.mean(currLosses))
    return np.mean(totalLosses), encoder, decoder

In [89]:
enc = Encoder(inputSize=INDEXER.nWords, hiddenSize=20)
dec = Decoder(inputSize=INDEXER.nWords, hiddenSize=20)
encOptim = optim.Adam(enc.parameters(), 1e-4)
decOptim = optim.Adam(dec.parameters(), 1e-4)
criterion = nn.NLLLoss()
meanLoss, enc, dec = train(pairs,lengths,enc,dec,encOptim,decOptim,criterion)
print('Total mean loss =', meanLoss)

Loss at epoch 1 step 100 = 2.2591 (time=0.8450)
Loss at epoch 1 step 200 = 2.3338 (time=1.1297)
Loss at epoch 1 step 300 = 2.0589 (time=1.2362)
Loss at epoch 1 step 400 = 1.5477 (time=1.2525)
Loss at epoch 1 step 500 = 1.5423 (time=1.2260)
Loss at epoch 1 step 600 = 2.2691 (time=1.1925)
Loss at epoch 1 step 700 = 1.6829 (time=1.1818)
Loss at epoch 1 step 800 = 1.2130 (time=1.1494)
Loss at epoch 1 step 900 = 1.3797 (time=1.1999)
Loss at epoch 1 step 1000 = 1.8641 (time=1.3682)
Loss at epoch 1 step 1100 = 1.4812 (time=1.1423)
Loss at epoch 1 step 1200 = 1.5155 (time=1.1544)
Loss at epoch 1 step 1300 = 1.0976 (time=1.1058)
Loss at epoch 1 step 1400 = 2.1136 (time=1.1800)
Loss at epoch 1 step 1500 = 2.1184 (time=1.1952)
Loss at epoch 1 step 1600 = 1.4907 (time=1.1655)
Loss at epoch 1 step 1700 = 1.5913 (time=1.1756)
Loss at epoch 1 step 1800 = 0.9895 (time=1.1751)
Loss at epoch 1 step 1900 = 1.2311 (time=1.2401)
Loss at epoch 1 step 2000 = 1.8789 (time=1.2590)
Loss at epoch 1 step 2100 = 1

##### D. EVALUATION

In [99]:
def translate(encoder, decoder, sentence, correctTranslation, indexer):
    with torch.no_grad():
        source = to_tensor(sentence)
        length = len(sentence)
        encoderOutputs = torch.zeros(MAX_LENGTH, encoder.hiddenSize, device=device) # for attention later
        encoderHidden = encoder.init_hidden()
        for ei in range(length):
            encoderOutput,encoderHidden = encoder(source[ei],encoderHidden)
            encoderOutputs[ei] = encoderOutput[0][0] # <mt=bc=1,h>
        decoderInput = torch.tensor([[0]], device=device)
        decoderHidden = encoderHidden  
        translatedSentence = []
        for di in range(MAX_LENGTH):
            decoderOutput,decoderHidden = decoder(decoderInput,decoderHidden)
            topValue,topIndex = decoderOutput.topk(1)
#             decoderInput = topIndex.squeeze().detach()
            decoderInput = topIndex
            translatedSentence.append(decoderInput.item())
            if decoderInput.item() == 1 or decoderInput.item() == 4: # 1:EOS; 4:.
                break # hitting end of sentence
        translatedSentence = indexer.get_sentence_word(translatedSentence)
    print('Original sentence >', ' '.join(indexer.get_sentence_word(sentence)))
    print('Model translation >', ' '.join(translatedSentence))
    print('Correct translation >', ' '.join(indexer.get_sentence_word(correctTranslation)),'\n')
    
def random_translate(encoder, decoder, indexer, k=10):
    for _ in range(k):
        i = random.randint(0,len(pairs))
        translate(encoder, decoder, pairs[i][0], pairs[i][1], indexer)

In [102]:
random_translate(enc, dec, INDEXER)

Original sentence > SOS G C B EOS
Model translation > SOS B D C
Correct translation > SOS H D C EOS 

Original sentence > SOS H F C H EOS
Model translation > SOS I D I I EOS EOS EOS EOS EOS
Correct translation > SOS I G D I EOS 

Original sentence > SOS F B G E C C D EOS
Model translation > SOS H E D D D F F D EOS
Correct translation > SOS G C H F D D E EOS 

Original sentence > SOS C F A A C D C EOS
Model translation > SOS D D D D B B B EOS EOS
Correct translation > SOS D G B B D E D EOS 

Original sentence > SOS C D G H D D G EOS
Model translation > SOS D F E I E G
Correct translation > SOS D E H I E E H EOS 

Original sentence > SOS G G E B D F B EOS
Model translation > SOS B B B E E E E EOS EOS
Correct translation > SOS H H F C E G C EOS 

Original sentence > SOS B G F EOS
Model translation > SOS C
Correct translation > SOS C H G EOS 

Original sentence > SOS B D D EOS
Model translation > SOS E E E EOS EOS EOS EOS EOS EOS
Correct translation > SOS C E E EOS 

Original sentence > SO