# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [1]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import os
dataset_path = "data/"
os.listdir(dataset_path)

['names', 'eng-fra.txt']

In [3]:
SOS_token = 0
EOS_token = 1


class Language:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [5]:
def read_languages(lang1, lang2, reverse=False):
    # Read the file and split into lines
    print("Reading lines...")
    lines = open(dataset_path + '%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
    
    ## Make Language objects
    input_lang = Language(lang1)
    output_lang = Language(lang2)

    return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def pair_predicate(p):
    p1, p2 = p
    return len(p1.split(' ')) < MAX_LENGTH and \
        len(p2.split(' ')) < MAX_LENGTH and \
        p2.startswith(eng_prefixes)

def filter_pairs(pairs):
    return [pair for pair in pairs if pair_predicate(pair)]

In [7]:
def prepare_data(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = read_languages(lang1, lang2, reverse)
    
    print("Read %s sentence pairs" % len(pairs))
    pairs = filter_pairs(pairs)
    
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    
    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepare_data('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
eng 4345
fra 2803
['c est deja un homme .', 'he s already a man .']


## encoder network

In [8]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        x = self.embedding(input).view(1, 1, -1)
        output, h = self.gru(x, hidden)
        return output, h

    def init_hidden_state(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        x = self.embedding(input).view(1, 1, -1)
        x = F.relu(x)
        output, hidden = self.gru(x, hidden)

        # Softmax over output to transform into word probability
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden_state(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [9]:
class AttentionDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttentionDecoderRNN, self).__init__()
        
        # Set dims
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        # Set layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attention = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attention_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        # Embed
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Compute attention score as matrix product
        attention_weights = F.softmax(self.attention(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attention_applied = torch.bmm(attention_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        # Linear + RELU
        x = torch.cat((embedded[0], attention_applied[0]), 1)
        x = self.attention_combine(x).unsqueeze(0)
        x = F.relu(x)
        
        # GRU RNN
        output, hidden = self.gru(x, hidden)
        
        # Softmax over output to transform into word probability
        output = F.log_softmax(self.out(output[0]), dim=1)
        
        return output, hidden, attention_weights

    def init_hidden_state(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [17]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

### Positional Encoding
`PositionalEncoding` module injects some information about the relative or absolute position of the tokens in the sequence. The positional encodings have the same dimension as the embeddings so that the two can be summed. Here, we use sine and cosine functions of different frequencies.

In [18]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [10]:
def sentence_to_index(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def sentence_to_tensor(lang, sentence):
    indexes = sentence_to_index(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def pair_to_tensor(pair):
    input_tensor = sentence_to_tensor(input_lang, pair[0])
    target_tensor = sentence_to_tensor(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [11]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    loss = 0

    # Set optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Calculate sequence lengths
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    # Initialize encoder output and hidden state as the zero vectors
    encoder_hidden = encoder.init_hidden_state()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    # Unroll Encoder RNN
    for t in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[t], encoder_hidden)
        encoder_outputs[t] = encoder_output[0, 0]

    # After unrolling the Encoder RNN, the decoder takes the last encoder 
    # hidden state as it's initial hidden state
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Unroll Attention Decoder RNN, stop when most probable output token is the EOS token
    for dt in range(target_length):
        decoder_output, decoder_hidden, decoder_att = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[dt])
        
        # Get index of the most probable token
        _, argmax = decoder_output.topk(1)
        decoder_input = argmax.squeeze().detach()  # detach from history as input
        
        # Stop unrolling if token is EOS
        if decoder_input.item() == EOS_token:
            break

    # Perform BackProp
    loss.backward()

    # Tune params
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

In [12]:
import time
import math


def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [13]:
def train_iterations(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # Init optimizers with Stochastic Gradient Descent
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    
    # Init loss function
    criterion = nn.NLLLoss()

    # Init dataset, pick random pairs
    training_pairs = [pair_to_tensor(random.choice(pairs)) for i in range(n_iters)]
    
    for i in range(1, n_iters + 1):
        # Split training pair
        input_tensor, target_tensor = training_pairs[i - 1]

        # Run through train algorithm
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        
        # Add losses to aux. variables
        print_loss_total += loss
        plot_loss_total += loss

        if i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (time_since(start, i / n_iters), i, i / n_iters * 100, print_loss_avg))

        if i % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    show_loss_plot(plot_losses)

In [14]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def show_loss_plot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [15]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = AttentionDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

In [16]:
train_iterations(encoder1, decoder1, 75000, print_every=100)

0m 4s (- 60m 39s) (100 0%) 4.7307
0m 8s (- 51m 7s) (200 0%) 3.6012
0m 11s (- 48m 41s) (300 0%) 3.5900
0m 15s (- 47m 17s) (400 0%) 3.4469
0m 18s (- 46m 40s) (500 0%) 3.4119
0m 22s (- 46m 15s) (600 0%) 3.4907
0m 25s (- 45m 58s) (700 0%) 3.2503


KeyboardInterrupt: 

Loss: 0.5736

In [27]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = sentence_to_tensor(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.init_hidden_state()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        # Unroll Encoder RNN
        for t in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[t], encoder_hidden)
            encoder_outputs[t] += encoder_output[0, 0]

        # Initialize Decoder Hidden State as the last Encoder Hidden State
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        # Unroll Decoder until most probable tokens is EOS
        for dt in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[dt] = decoder_attention.data

            # Get index of most probable token (argmax)
            _, argmax = decoder_output.data.topk(1)
            
            # Stop unrolling if EOS token is most probable
            if argmax.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            # If not EOS token, append the most probable word (convert index to string)
            else:
                decoded_words.append(output_lang.index2word[argmax.item()])
            
            # Stop keeping track of gradients..? i.e detach() function
            decoder_input = argmax.squeeze().detach()

        return decoded_words, decoder_attentions[:dt + 1]

def evaluate_randomly(encoder, decoder, n=10):
    for i in range(n):
        source, target = random.choice(pairs)
        print('SOURCE:', source)
        print('TARGET:', target)
        print('...')
        # Translate source sentence
        output_words, attentions = evaluate(encoder, decoder, source)
        output_sentence = ' '.join(output_words)
        print('PREDICTED:', output_sentence)
        print('-'*50)

In [37]:
evaluate_randomly(encoder1, decoder1)

SOURCE: je suis plutot occupe .
TARGET: i m rather busy .
...
PREDICTED: i m pretty busy . <EOS>
--------------------------------------------------
SOURCE: on est enfin seuls .
TARGET: we re finally alone .
...
PREDICTED: we re finally alone . <EOS>
--------------------------------------------------
SOURCE: nous sommes tres occupees .
TARGET: we re very busy .
...
PREDICTED: we re very busy . <EOS>
--------------------------------------------------
SOURCE: nous sommes en charge .
TARGET: we re in charge .
...
PREDICTED: we re in charge . <EOS>
--------------------------------------------------
SOURCE: quelle commere tu fais .
TARGET: you re such a tattletale .
...
PREDICTED: you are such the one . . <EOS>
--------------------------------------------------
SOURCE: je suis souvent en difficulte .
TARGET: i am often in difficulties .
...
PREDICTED: i m in in trouble . <EOS>
--------------------------------------------------
SOURCE: nous sommes vieux amis .
TARGET: we re old friends .
...


In [24]:
# Load trained model instead
encoder_dict = torch.load('saved-models/encoder-rnn.model', map_location=torch.device(device))
decoder_dict = torch.load('saved-models/decoder-rnn-att.model', map_location=torch.device(device))

In [21]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = AttentionDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

In [25]:
encoder1.load_state_dict(encoder_dict)
decoder1.load_state_dict(decoder_dict)

<All keys matched successfully>

In [28]:
evaluate_randomly(encoder1, decoder1)

SOURCE: ce n est qu une enfant .
TARGET: she s just a child .
...
PREDICTED: she s just a child . <EOS>
--------------------------------------------------
SOURCE: elles ont de la chance d etre vivantes .
TARGET: they re lucky to be alive .
...
PREDICTED: they re lucky to be alive . <EOS>
--------------------------------------------------
SOURCE: je suis fier de vous les gars .
TARGET: i m proud of you guys .
...
PREDICTED: i m proud of you guys . <EOS>
--------------------------------------------------
SOURCE: ils sont dans la douche .
TARGET: they re in the shower .
...
PREDICTED: they re in the shower . <EOS>
--------------------------------------------------
SOURCE: il est japonais de naissance .
TARGET: he is japanese by birth .
...
PREDICTED: he is japanese in japanese . <EOS>
--------------------------------------------------
SOURCE: je ne suis pas libre cette apres midi .
TARGET: i m not free to go this afternoon .
...
PREDICTED: i m not free to go free afternoon afternoon .
---