In [1]:
import numpy as np
import math
import re
import time

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

## Load data

In [3]:
def _read_utf8(path):
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()


# Europarl text files (English and French), each read as one string.
europarl_en = _read_utf8("./fr-en/europarl-v7.fr-en.en")
europarl_fr = _read_utf8("./fr-en/europarl-v7.fr-en.fr")

# Raw contents of the non-breaking prefix files; split into lists in the
# cleaning section.
non_breaking_prefix_en = _read_utf8("./fr-en/nonbreaking_prefix.en")
non_breaking_prefix_fr = _read_utf8("./fr-en/nonbreaking_prefix.fr")

In [4]:
# Show the first 50 characters of the English text as a quick sanity check.
europarl_en[:50]

'Resumption of the session\nI declare resumed the se'

## Cleaning

In [5]:
def _prefix_patterns(raw_prefixes):
    """Turn the raw prefix file contents into ' <prefix>.' search patterns.

    Args:
        raw_prefixes: the whole prefix file as one string, one prefix per line.

    Returns:
        A list of strings of the form ``' ' + prefix + '.'``, one per
        non-blank line, in file order.
    """
    # Blank lines (for instance the one produced by a trailing newline) must
    # be skipped: an empty prefix would become the pattern ' .', and the
    # cleaning step would then strip every period that follows a space.
    return [' ' + pref + '.'
            for pref in raw_prefixes.split('\n')
            if pref.strip()]


non_breaking_prefix_en = _prefix_patterns(non_breaking_prefix_en)
non_breaking_prefix_fr = _prefix_patterns(non_breaking_prefix_fr)

We need every word, and every other symbol we want to keep, to be in lower case and separated by spaces so that we can "tokenize" them.

In [6]:
def _remove_non_terminal_dots(text, prefixes):
    """Strip periods that do not end a sentence and split the text into lines.

    Steps, in order:
      1. Append the marker "###" after every occurrence of each entry in
         ``prefixes``.
      2. Mark every period that is immediately followed by an ASCII digit or
         letter with the same marker.
      3. Delete each marked period together with its marker.
      4. Collapse runs of spaces into a single space.

    Returns:
        The cleaned text split on newlines, as a list of strings.
    """
    for marker_target in prefixes:
        text = text.replace(marker_target, marker_target + "###")
    text = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".###", text)
    text = re.sub(r"\.###", '', text)
    text = re.sub(r" +", ' ', text)
    return text.split('\n')


corpus_en = _remove_non_terminal_dots(europarl_en, non_breaking_prefix_en)
corpus_fr = _remove_non_terminal_dots(europarl_fr, non_breaking_prefix_fr)

## Tokenizing text

In [7]:
# Shorthand for the subword-encoder factory used for both languages.
_build_subword_encoder = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus

# One encoder per language, each built from its own cleaned corpus with a
# target vocabulary size of 2**13 (8192).
tokenizer_en = _build_subword_encoder(corpus_en, target_vocab_size=2**13)
tokenizer_fr = _build_subword_encoder(corpus_fr, target_vocab_size=2**13)

In [8]:
# Two ids are added on top of each tokenizer's own vocabulary. The encoding
# cell uses VOCAB_SIZE - 2 as the marker placed before every sentence and
# VOCAB_SIZE - 1 as the marker placed after it.
VOCAB_SIZE_EN = 2 + tokenizer_en.vocab_size
VOCAB_SIZE_FR = 2 + tokenizer_fr.vocab_size

In [9]:
def _encode_with_markers(tokenizer, sentences, vocab_size):
    """Encode each sentence and wrap it in the start and end marker ids.

    The start marker is ``vocab_size - 2`` and the end marker is
    ``vocab_size - 1``.

    Returns:
        A list with one list of token ids per sentence.
    """
    start_id = vocab_size - 2
    end_id = vocab_size - 1
    return [[start_id] + tokenizer.encode(text) + [end_id]
            for text in sentences]


inputs = _encode_with_markers(tokenizer_en, corpus_en, VOCAB_SIZE_EN)
outputs = _encode_with_markers(tokenizer_fr, corpus_fr, VOCAB_SIZE_FR)

## Remove too long sentences

In [10]:
# Longest sequence (including the start and end markers) that is kept.
MAX_LENGTH = 20

# Entry i of `inputs` and entry i of `outputs` form one sentence pair, so the
# two lists must have the same length before pairs are filtered together.
assert len(inputs) == len(outputs), (
    "inputs and outputs must contain the same number of sentences")

# Keep a pair only when BOTH sides fit within MAX_LENGTH. A single pass over
# the pairs avoids the repeated `del` on large lists, each of which shifts
# every later element.
_kept_pairs = [(src, tgt) for src, tgt in zip(inputs, outputs)
               if len(src) <= MAX_LENGTH and len(tgt) <= MAX_LENGTH]
inputs = [pair[0] for pair in _kept_pairs]
outputs = [pair[1] for pair in _kept_pairs]
del _kept_pairs

## Inputs/outputs creation

As we train with batches, we need each input to have the same length. We pad with the appropriate token, and we will make sure this padding token doesn't interfere with our training later.

In [11]:
# Pad every sequence with trailing zeros up to MAX_LENGTH so that all rows
# have the same length.
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)
# Fix: the targets must be built from `outputs`. Padding `inputs` a second
# time made the targets a copy of the source sentences and discarded the
# encoded French data.
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [12]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Pipeline over the (inputs, outputs) pairs: cache the elements, shuffle with
# a buffer of BUFFER_SIZE, group into batches of BATCH_SIZE, and prefetch
# with an automatically tuned buffer.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## Model building

### Embedding

Positional encoding formulae:

$PE_{(pos,2i)} = \sin\left(pos/10000^{2i/d_{model}}\right)$

$PE_{(pos,2i+1)} = \cos\left(pos/10000^{2i/d_{model}}\right)$

In [13]:
class PositionalEncoding(layers.Layer):
    """Layer that adds a fixed sinusoidal position signal to its input.

    The signal is rebuilt on every call from the last two static dimensions
    of the input: the second-to-last is used as the sequence length and the
    last as the model dimension.
    """

    def __init__(self):
        super().__init__()

    def get_angles(self, pos, i, d_model):
        """Return ``pos / 10000 ** (2 * (i // 2) / d_model)``.

        ``i // 2`` gives each even index and the odd index after it the same
        exponent, so the sine/cosine pair built from them shares a frequency.
        """
        exponent = (2*(i//2)) / np.float32(d_model)
        inverse_rate = 1 / np.power(10000., exponent)
        return pos * inverse_rate

    def call(self, inputs):
        """Return ``inputs`` plus the positional encoding, cast to float32."""
        static_dims = inputs.shape.as_list()
        seq_length = static_dims[-2]
        d_model = static_dims[-1]

        # Column vector of positions against row vector of feature indices:
        # broadcasting yields one angle per (position, feature) pair.
        positions = np.arange(seq_length)[:, np.newaxis]
        feature_idx = np.arange(d_model)[np.newaxis, :]
        table = self.get_angles(positions, feature_idx, d_model)

        # Even feature columns take the sine, odd columns the cosine.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        # A leading axis of size 1 lets the table broadcast over the batch.
        pos_encoding = table[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)

### Attention

#### Attention computation

$Attention(Q, K, V ) = \text{softmax}\left(\dfrac{QK^T}{\sqrt{d_k}}\right)V $