# **Translation of French-to-English with a Neural Net**

Source:  [https://github.com/d-insight/code-bank.git](https://github.com/d-insight/code-bank.git)  
License: [MIT License](https://opensource.org/licenses/MIT). See open source [license](LICENSE) in the Code Bank repository. 

-------------

## Overview

In this illustration, we train a French-English translation model. The input data consists of English-French pairs of short sentences. We fit a sequence-to-sequence model using an encoder-decoder architecture with attention. 

The following diagram shows that each input word is assigned a weight by the attention mechanism, which is then used by the decoder to predict the next word in the sentence. The picture and formulas below are an example of the attention mechanism from Luong's paper.

<img src="https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg" width="500" height="500" align="center"/>

Image source: https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg

__References__: Portions of this page are reproduced from work created and shared by Google and used according to terms described in the [Creative Commons 4.0 Attribution License](https://creativecommons.org/licenses/by/4.0/). For the original tutorial visit: https://www.tensorflow.org/tutorials/text/nmt_with_attention . For a more formal introduction to neural machine translation with attention, see Luong et al. (2015): https://arxiv.org/abs/1508.04025v5

-------------

## **Part 0**: Setup

### Import packages

In [None]:
# Import all packages

import tensorflow            as tf
import numpy                 as np
from sklearn.model_selection import train_test_split

import unicodedata
import re
import os
import io
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


### Constants

In [None]:
# Tab-separated sentence-pair file read by create_dataset / load_dataset
PATH_TO_FILE  = 'data/fra.txt'
# Number of sentence pairs per training batch (also passed to Encoder / Decoder)
BATCH_SIZE    = 64
# Size of the word-embedding vectors in both Encoder and Decoder
EMBEDDING_DIM = 256
# Number of GRU units in the Encoder and Decoder (and of the attention projection in the Decoder)
UNITS         = 1024

### Support functions

In [None]:
def unicode_to_ascii(s):
    """
    Strips accents from unicode text (e.g. "é" becomes "e")

    The text is decomposed with NFD normalization and every character whose
    unicode category is 'Mn' (nonspacing mark) is dropped. Other non-ASCII
    characters are left untouched.

    Args:
        s (str): input text

    Returns: the decomposed string without combining marks
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def preprocess_sentence(w):
    """
    Normalizes a sentence and wraps it in start / end tokens.

    Steps: lower-case and strip, remove accents, pad punctuation with spaces,
    replace every other non-letter character with a space, then add the
    '<start>' and '<end>' markers.

    Args:
        w (str): input sentence

    Returns: preprocessed sentence, e.g. "<start> he is a boy . <end>"
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    # Surround punctuation with spaces, eg: "he is a boy." => "he is a boy . "
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)

    # Collapse runs of spaces (the character class also matches double quotes) into one space
    cleaned = re.sub(r'[" "]+', " ", cleaned)

    # Anything that is not a letter or one of ? . ! , ¿ becomes a single space
    cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", cleaned).strip()

    # The start / end tokens tell the model when to start and stop predicting
    return '<start> ' + cleaned + ' <end>'


def create_dataset(path, num_examples = None):
    """
    Reads a tab-separated sentence file and returns its cleaned columns.

    1. Remove the accents
    2. Clean the sentences
    3. Return the sentences grouped by column

    Every tab-separated field of a line is passed through preprocess_sentence.
    Callers in this file unpack three columns (English, French and a third
    field that they discard).

    Args:
        path (str): path of data
        num_examples (int): number of lines (sentence pairs) to include;
            None keeps every line

    Returns: list of tuples, one tuple per column, each holding the
        preprocessed sentences of that column in file order
    """

    # Use a context manager so the file handle is closed deterministically
    with io.open(path, encoding='UTF-8') as data_file:
        lines = data_file.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

    # Transpose: rows of fields -> one tuple per column
    return list(zip(*word_pairs))


def max_length(tensor):
    """
    Finds the length of the longest row; used for padding data.

    Args:
        tensor: sequence of sequences (e.g. rows of word indices)

    Returns: the largest len() among the rows
    """
    return max(map(len, tensor))


def tokenize(lang):
    """
    Fits a word-level tokenizer on the sentences and converts them to padded index sequences.

    Args:
        lang: collection of preprocessed sentences (one string per sentence)

    Returns: 2 elements (array of word indices padded with 0s at the end, fitted tokenizer)
    """
    # filters='' keeps punctuation and the <start>/<end> markers as tokens
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)

    sequences = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, tokenizer


def load_dataset(path, num_examples=None):
    """
    Builds tokenized input / target data from the sentence file.

    The first column of the file is used as the target language and the
    second column as the input language; the third column is ignored.

    Args:
        path (str): path to dataset
        num_examples (int): number of example sentence pairs to include

    Returns: 4 elements (input tensor, target tensor, input tokenizer, target tokenizer)
    """
    target_sentences, input_sentences, _ = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(input_sentences)
    target_tensor, targ_lang_tokenizer = tokenize(target_sentences)

    return (input_tensor, target_tensor,
            inp_lang_tokenizer, targ_lang_tokenizer)

def convert(lang, tensor):
    """
    Prints one "index ----> word" line for every non-zero index.

    Args:
        lang: object exposing an index_word mapping (index -> word)
        tensor: iterable of word indices; zeros (padding) are skipped
    """
    for index in tensor:
        if index == 0:
            continue
        print ("%d ----> %s" % (index, lang.index_word[index]))

class Encoder(tf.keras.Model):
    """Encoder: an embedding layer followed by a single GRU layer."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """
        Args:
            vocab_size (int): number of entries in the embedding table
            embedding_dim (int): size of each embedding vector
            enc_units (int): number of GRU units
            batch_sz (int): batch size used by initialize_hidden_state
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns the full output sequence and its final state
        gru_options = dict(return_sequences=True,
                           return_state=True,
                           recurrent_initializer='glorot_uniform')
        self.gru = tf.keras.layers.GRU(self.enc_units, **gru_options)

    def call(self, x, hidden):
        """Embeds x and runs the GRU starting from hidden; returns (output sequence, final state)."""
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state = hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Returns an all-zero hidden state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

    def __init__(self, units):
        """
        Args:
            units (int): width of the two dense projections W1 and W2
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """
        Args:
            query: hidden state, shape (batch_size, hidden size)
            values: sequence to attend over, shape (batch_size, max_len, hidden size)

        Returns: 2 elements (context vector, attention weights)
        """
        # Add a time axis -> (batch_size, 1, hidden size) so the addition
        # below broadcasts over every time step of values
        query_with_time_axis = tf.expand_dims(query, 1)

        # (batch_size, max_length, units) before V, (batch_size, max_length, 1) after
        projected = self.W1(query_with_time_axis) + self.W2(values)
        score = self.V(tf.nn.tanh(projected))

        # Normalize the scores over the time axis: (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of values over time: (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights
    
class Decoder(tf.keras.Model):
    """Decoder: attention over the encoder output, then embedding, GRU and a dense output layer."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """
        Args:
            vocab_size (int): size of the embedding table and of the output layer
            embedding_dim (int): size of each embedding vector
            dec_units (int): number of GRU units (also the attention width)
            batch_sz (int): batch size (stored only)
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        gru_options = dict(return_sequences=True,
                           return_state=True,
                           recurrent_initializer='glorot_uniform')
        self.gru = tf.keras.layers.GRU(self.dec_units, **gru_options)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """
        Runs one decoding step.

        Args:
            x: indices of the previous target word
            hidden: previous decoder state, used as the attention query
            enc_output: encoder output, shape (batch_size, max_length, hidden_size)

        Returns: 3 elements (output of the final dense layer, new GRU state, attention weights)
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # (batch_size, 1, embedding_dim) after the embedding
        embedded = self.embedding(x)

        # Prepend the context vector: (batch_size, 1, embedding_dim + hidden_size)
        context_with_time_axis = tf.expand_dims(context_vector, 1)
        gru_input = tf.concat([context_with_time_axis, embedded], axis=-1)

        # NOTE(review): the GRU is called without initial_state, so `hidden`
        # only influences this step through the attention query.
        output, state = self.gru(gru_input)

        # Flatten the time axis: (batch_size * 1, hidden_size)
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        # (batch_size, vocab)
        logits = self.fc(flat_output)

        return logits, state, attention_weights
    
def loss_function(real, pred):
    """
    Masked loss: positions whose target is 0 (padding) contribute zero loss.

    Uses the module-level loss_object to compute the loss.

    Args:
        real: ground truth values
        pred: predicted values

    Returns: mean over all positions of the masked loss (masked positions
        still count in the denominator)
    """
    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    per_position_loss = loss_object(real, pred)

    # Cast the boolean mask to the loss dtype so the two can be multiplied
    weights = tf.cast(is_real_token, dtype=per_position_loss.dtype)
    masked_loss = per_position_loss * weights

    return tf.reduce_mean(masked_loss)

@tf.function
def train_step(inp, targ, enc_hidden):
    """
    Runs one training step with teacher forcing and applies the gradients.
    Note that the decorator will compile the function into the tensorflow graph for faster execution on GPUs and TPUs

    Relies on the module-level encoder, decoder, optimizer, targ_lang and
    BATCH_SIZE. The first decoder input is built from BATCH_SIZE, so the
    batch is expected to contain exactly BATCH_SIZE rows.

    Args:
        inp: batch of input word indices
        targ: batch of target word indices
        enc_hidden: initial hidden state passed to the encoder

    Returns: loss summed over the target time steps, divided by targ.shape[1]
    """
    
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # The decoder starts from the final encoder state
        dec_hidden = enc_hidden

        # First decoder input: the '<start>' token for every row of the batch
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        # (starts at 1 because position 0 is the input that predicts position 1)
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported value only; the gradients below are taken from the summed loss
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


def evaluate(sentence):
    """
    Translates one sentence by greedy decoding (no teacher forcing).

    Relies on the module-level encoder, decoder, inp_lang, targ_lang,
    max_length_inp, max_length_targ and UNITS. A word that is missing from
    inp_lang.word_index raises KeyError.

    Args:
        sentence (str): input sentence

    Returns: 3 elements (predicted words joined by spaces with a trailing space,
        pre-processed sentence, attention matrix of shape (max_length_targ, max_length_inp))
    """
    sentence = preprocess_sentence(sentence)

    # Map words to indices and pad to the input length used in training
    token_ids = [inp_lang.word_index[word] for word in sentence.split(' ')]
    padded_ids = tf.keras.preprocessing.sequence.pad_sequences([token_ids],
                                                               maxlen=max_length_inp,
                                                               padding='post')
    inputs = tf.convert_to_tensor(padded_ids)

    # One row of attention weights per decoding step
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    # Encode with an all-zero initial state for a batch of one
    enc_out, dec_hidden = encoder(inputs, [tf.zeros((1, UNITS))])

    start_id = targ_lang.word_index['<start>']
    dec_input = tf.expand_dims([start_id], 0)

    result = ''
    for step in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_plot[step] = tf.reshape(attention_weights, (-1, )).numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = targ_lang.index_word[predicted_id]
        result += predicted_word + ' '

        # Stop as soon as the end token is produced
        if predicted_word == '<end>':
            break

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot


def plot_attention(attention, sentence, predicted_sentence):
    """
    Draws the attention matrix as a heat map and shows it.

    Args:
        attention: matrix of attention weights (rows: predicted words, columns: input words)
        sentence (list): input words, used as x tick labels
        predicted_sentence (list): predicted words, used as y tick labels
    """
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    label_font = {'fontsize': 14}

    # The leading '' label is for the tick that precedes the first cell
    ax.set_xticklabels([''] + sentence, fontdict=label_font, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=label_font)

    # One major tick per cell on both axes
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    
def translate(sentence):
    """
    Translates an input sentence, prints the result and plots the attention weights.

    Args:
        sentence (str): input sentence to translate
    """
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    source_tokens = sentence.split(' ')
    predicted_tokens = result.split(' ')

    # Crop the attention matrix to the words actually present
    cropped = attention_plot[:len(predicted_tokens), :len(source_tokens)]
    plot_attention(cropped, source_tokens, predicted_tokens)

## **Part 1**: Load & pre-process data

We first load training data in the form of sentence pairs, for example: 

    I'll try.								Je vais essayer.
    I'm busy.								Je suis occupé.
    What happened? You look pale.			Qu'est-il arrivé ? Tu es tout pâle.
    I took a taxi because the bus was late.  Je pris un taxi car le bus était en retard.


Then, we perform the following pre-processing steps:
1. Add a start and end token to each sentence.
2. Clean the sentences by removing special characters.
3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).
4. Pad each sentence to a maximum length.

In [None]:
# Example sentence pair to show the effect of preprocess_sentence
en_sentence = u"How about calling it a day?"
fr_sentence = u"Que diriez-vous d'arrêter pour aujourd'hui ?"

print(preprocess_sentence(en_sentence))
# The French result is printed as UTF-8 encoded bytes
print(preprocess_sentence(fr_sentence).encode('utf-8'))

In [None]:
# Load data: num_examples is not given, so every line of the file is used.
# The third column of the file is discarded.
en, fr, _ = create_dataset(PATH_TO_FILE)

# Show the last sentence pair of the file
print(en[-1])
print()
print(fr[-1])

### Reduce data size for faster experimentation

In [None]:
# Try experimenting with the size of that dataset
# (only the first num_examples lines of the file are used)
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(PATH_TO_FILE, num_examples)

# Calculate max_length of the target and input tensors
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

# Example tensors (rows are padded, so both have the same length)
print('Length:', len(input_tensor[0]), input_tensor[0])
print('Length:', len(input_tensor[25]), input_tensor[25])

In [None]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show the (input, target) shapes of each split.
# Each label is printed next to the split it describes.
print('Training samples:'.ljust(22) + str(input_tensor_train.shape) + ' ' + str(target_tensor_train.shape))
print('Validation samples:'.ljust(22) + str(input_tensor_val.shape) + ' ' + str(target_tensor_val.shape))

In [None]:
# Show how the first training example maps from indices back to words
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

In [None]:
# Create tf.data dataset
# Shuffle buffer as large as the training set
BUFFER_SIZE = len(input_tensor_train)

steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
# +1 leaves room for index 0, which is used for padding
# NOTE(review): assumes the tokenizer never assigns index 0 to a word - confirm against the Keras Tokenizer docs
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True: every batch has exactly BATCH_SIZE rows, which train_step relies on
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

dataset

In [None]:
# Take one batch from the dataset to inspect shapes (reused by the sample cells below)
example_input_batch, example_target_batch = next(iter(dataset))
print('Input shape:'.ljust(15) +  str(example_input_batch.shape))
print('Target shape:'.ljust(15) +  str(example_target_batch.shape))

## **Part 2**: Set up encoder-decoder

We now implement an encoder-decoder model with attention. For a conceptual understanding of the encoder-decoder model similar to the image at the very top, see [Luong, Pham, and Manning (2015)](https://arxiv.org/pdf/1508.04025v5.pdf).

This tutorial uses Bahdanau attention, applied by the decoder over the encoder outputs (for details see [Bahdanau, Cho, and Bengio (2016)](https://arxiv.org/pdf/1409.0473.pdf)).

In [None]:
# ENCODER

encoder = Encoder(vocab_inp_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

# sample input: run the example batch through the encoder to check output shapes
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

In [None]:
# ATTENTION

# Stand-alone layer with 10 units, used here only to inspect output shapes;
# the Decoder creates its own attention layer
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

In [None]:
# DECODER

decoder = Decoder(vocab_tar_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)

# Shape check with a dummy decoder input of shape (BATCH_SIZE, 1)
# NOTE(review): tf.random.uniform presumably yields floats rather than token ids; only the output shape is of interest here
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

In [None]:
# Optimizer and loss function
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final dense layer has no softmax
# reduction='none': keep per-element losses so loss_function can mask padding before averaging
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


In [None]:
# Checkpoints
# Directory that holds the checkpoint files (also the download target in Part 4)
checkpoint_dir = './data/nmt_training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track optimizer, encoder and decoder so they are saved / restored together
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## **Part 3**: Train model 

This notebook with 30,000 samples (of the >100,000 sentences in the dataset) takes approximately 10 minutes to run on a single NVIDIA P100 GPU, which costs around $5,000. Hence, we do not aim to train a model from scratch, but rather import a pre-trained model (in the form of checkpoints) that we ran on a P100 GPU.

In [None]:
EPOCHS = 0  # not training: we use pre-trained model (the loop below is skipped while this is 0)

for epoch in range(EPOCHS):
    start = time.time()

    # The same all-zero encoder state is passed to every batch of the epoch
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # Progress report every 100 batches
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    # (disabled: uncomment the two lines below to write checkpoints)
    # if (epoch + 1) % 2 == 0:
    #     checkpoint.save(file_prefix = checkpoint_prefix)

    # Average batch loss over the epoch
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {:.4f} sec\n'.format(time.time() - start))

## **Part 4**: Restore checkpoint and test

In [None]:
# Download pre-trained model from Google Cloud storage
# CHECKPOINT LOCATIONS ON GOOGLE MOVED!
# The four files (checkpoint, two data shards, index) are saved into data/nmt_training_checkpoints,
# the directory that checkpoint_dir points to.
# NOTE(review): the lines starting with "!" are notebook shell commands, not Python
!wget -N 'https://storage.googleapis.com/dsfm/nmt_training_checkpoints/checkpoint' --directory-prefix='data/nmt_training_checkpoints'
!wget -N 'https://storage.googleapis.com/dsfm/nmt_training_checkpoints/ckpt-1.data-00000-of-00002' --directory-prefix='data/nmt_training_checkpoints'
!wget -N 'https://storage.googleapis.com/dsfm/nmt_training_checkpoints/ckpt-1.data-00001-of-00002' --directory-prefix='data/nmt_training_checkpoints'
!wget -N 'https://storage.googleapis.com/dsfm/nmt_training_checkpoints/ckpt-1.index' --directory-prefix='data/nmt_training_checkpoints'

In [None]:
# restoring the latest checkpoint in checkpoint_dir
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
    # No checkpoint was found (e.g. the download above failed). Restoring
    # None loads nothing, so say so instead of failing silently.
    print('WARNING: no checkpoint found in {}; the model weights were NOT restored.'.format(checkpoint_dir))
checkpoint.restore(latest_checkpoint_path)

In [None]:
# What works (sort of)
# translate() prints the preprocessed input and the prediction, then plots the attention weights
translate(u'tu ne comprends rien .')


In [None]:
# What works (input typed without the accent of "réussi")
translate(u'nous avons reussi .')


In [None]:
# What works (the accent in "déjeuner" is stripped by preprocess_sentence before lookup)
translate(u'je te dois un petit déjeuner .')

In [None]:
# What works (imperative sentence ending in "!")
translate(u'allez aider votre pere !')

In [None]:
# What works (the accent in "arrêter" is stripped by preprocess_sentence before lookup)
translate(u'vous devriez arrêter de fumer maintenant .')

In [None]:
# What works (question ending in "?")
translate(u'tu fais quoi demain ?')

In [None]:
# What doesn't work
# (preprocess_sentence replaces the apostrophe with a space, so the model sees "c est ma vie .")
translate(u'c\'est ma vie .')

In [None]:
# What doesn't work (a longer sentence)
translate(u'je profite bien plus de la vie .')
