# Attention!
[pytorch tutorial](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) for building seq2seq2 translation from scratch

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


# Data and preprocessing
This is what the word2count one-hot vector looks like:
![encoder](img/word-encoding.png)

In [2]:
# define start and end of sentence
SOS_token = 0
EOS_token = 1

# create language model class
"""
Each word will be represented as a one-hot vector (BOW where only the word has a count value)
"""
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {} # convert word to dictionary index
        self.word2count = {} # create one-hot count vector for word
        self.index2word = {0: "SOS", 1: "EOS"} # gets the word for an index
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        # apply pipeline to each word in a sentence
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        # if the word is new
        if word not in self.word2index:
            # add index of word as the next free index (starts at 2 because 0 and 1 is already filled)
            self.word2index[word] = self.n_words
            # add value to one-hot vector
            self.word2count[word] = 1
            # add word to the next free index
            self.index2word[self.n_words] = word
            # keep track of next open index
            self.n_words += 1
        # if word already exists, increase it's one-hot count by 1
        else:
            self.word2count[word] += 1

In [3]:
# Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [4]:
# to read the data of lang1 to lang2
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs [input, target] and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances (B -> A Translation)
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        # define input and target
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [5]:
# for this example, we will pick out only sentences that start with "I am" etc
# Max sentence length of 10 words for speed

MAX_LENGTH = 10
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# applies criteria to a pair
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

# returns array of all pairs that match the criteria
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

Out pre-processing *pipeline* looks like this:
- Read text file and split into lines, split lines into pairs
- Normalize text, filter by length and content
- Make word lists from sentences in pairs

In [6]:
# pipeline
def prepareData(lang1, lang2, reverse=False):
    # read the coloumns of both languages
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    
    # filter out pairs that match our cirteria
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    
    # creating language class (one-hot vectors) for all language pairs
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

# apply pipeline
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['c est une vraie commere .', 'she s a real gossip .']


# The Seq2Seq model
![s2s](img/seq2seq.png)

Consider the sentence “Je ne suis pas le chat noir” → “I am not the black cat”. Most of the words in the input sentence have a direct translation in the output sentence, but are in slightly different orders, e.g. “chat noir” and “black cat”. Because of the “ne/pas” construction there is also one more word in the input sentence. It would be difficult to produce a correct translation directly from the sequence of input words.

With a seq2seq model the encoder creates a single vector which, in the ideal case, encodes the “meaning” of the input sequence into a single vector — a single point in some N dimensional space of sentences.
## Encoder
The encoder of a seq2seq network is a RNN that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word.

![s2s](img/encoder-network.png)

In [None]:
# define encoder block
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        # grab output and hidden layer of input
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        # inital hidden layer is 1x1xhidden_size 0-tensor
        return torch.zeros(1, 1, self.hidden_size, device=device)

## Decoder
In the simplest seq2seq decoder we use only last output of the encoder. This last output is sometimes called the context vector as it encodes context from the entire sequence. This context vector is used as the initial hidden state of the decoder.

At every step of decoding, the decoder is given an input token and hidden state. The initial input token is the start-of-string `<SOS>` token, and the first hidden state is the context vector (the encoder’s last hidden state).

![decoder](img/decoder-network.png)