# **Stage 1: Import libraries**

---



## colab

In [None]:
#only for google colab
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the files stored there become readable from this notebook.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
try:
  %tensorflow_version 2.x
except:
  pass

## All devices

In [None]:
import os
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

# **Stage 2: Preprocessing**
---

## Extract the es-en dataset from the tar.gz file
The data was downloaded from this page: https://www.statmt.org/europarl/

In [None]:
# Move into the project folder on Drive; the relative './data' and './ckpt' paths used in later cells resolve against it.
%cd ./drive/MyDrive/Colab\ Notebooks/investigacion/transformers

/content/drive/MyDrive/Colab Notebooks/investigacion/transformers


## Open data

In [None]:
# Folder that holds the two sides of the Europarl v7 es-en parallel corpus.
_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/investigacion/transformers/data'

def _read_corpus(file_name, data_dir=_DATA_DIR):
  """Return the full UTF-8 content of `file_name` inside `data_dir` as one string."""
  with open(os.path.join(data_dir, file_name), mode='r', encoding='utf-8') as corpus_file:
    return corpus_file.read()

# English and Spanish sides of the corpus, each loaded as a single string.
europarl_en = _read_corpus('europarl-v7.es-en.en')
europarl_es = _read_corpus('europarl-v7.es-en.es')

In [None]:
# NOTE(review): this cell repeats the Spanish read done at the end of the previous cell;
# running it only reloads `europarl_es` from the same file.
with open('/content/drive/MyDrive/Colab Notebooks/investigacion/transformers/data/europarl-v7.es-en.es', 
          mode='r',
          encoding='utf-8') as f:
  europarl_es = f.read()

## Cleaning
Some basic cleaning, just enough to test our implementation.

In [None]:
# Start the cleaning from the raw strings; `europarl_*` are left untouched, so
# re-running this cell resets `corpus_*` before the cleaning cell is run again.
corpus_en = europarl_en
corpus_es = europarl_es

In [None]:
def _clean_and_split(text):
  """Remove dots glued to a following letter/digit, squeeze space runs, split into lines.

  Args:
    text: the whole corpus as one string, sentences separated by newlines.

  Returns:
    A list with one cleaned string per line of `text`.
  """
  # Tag every '.' that is directly followed by an ASCII letter or digit ...
  text = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", '.###', text)
  # ... then delete the tagged dots.
  text = re.sub(r"\.###", '', text)
  # Collapse runs of two or more spaces into a single space.
  text = re.sub(r"  +", ' ', text)
  return text.split('\n')

# After this cell `corpus_*` are lists of sentences (they were strings before).
corpus_en = _clean_and_split(corpus_en)
corpus_es = _clean_and_split(corpus_es)

## Tokenization

The algorithm used is byte pair encoding: https://leimao.github.io/blog/Byte-Pair-Encoding/

In [None]:
# Build
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_en, target_vocab_size=2**13)
tokenizer_es = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_es, target_vocab_size=2**13)

tokenizer_en.save_to_file('./data/tokenizer_en')
tokenizer_es.save_to_file('./data/tokenizer_es')

In [None]:
#Load
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.load_from_file('./data/tokenizer_en')
#ids = tokenizer_en.encode("hello world")
#text = tokenizer_en.decode([1, 2, 3, 4])

tokenizer_es = tfds.deprecated.text.SubwordTextEncoder.load_from_file('./data/tokenizer_es')
#ids = tokenizer_es.encode("hola mundo")
#text = tokenizer_es.decode([1, 2, 3, 4])

In [None]:
# Two extra ids per language on top of the tokenizer vocabulary: `encode` uses
# VOCAB_SIZE - 2 as the start-of-sentence id and VOCAB_SIZE - 1 as the end-of-sentence id.
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2
VOCAB_SIZE_ES = tokenizer_es.vocab_size + 2

In [None]:
# Length, in subword tokens, of the longest Spanish sentence (0 if the corpus is empty).
MAX_LENGTH = max((len(tokenizer_es.encode(sentence)) for sentence in corpus_es), default=0)
MAX_LENGTH

1305

In [None]:
def encode(lang1, lang2):
  """Tokenize one sentence pair and wrap each side in start/end ids.

  Args:
    lang1: eager tensor holding the Spanish sentence (read with `.numpy()`).
    lang2: eager tensor holding the English sentence (read with `.numpy()`).

  Returns:
    (inputs, outputs): two lists of ids. Each is the tokenizer output framed by
    VOCAB_SIZE - 2 (start) and VOCAB_SIZE - 1 (end) of the matching language.
  """
  es_start, es_end = VOCAB_SIZE_ES-2, VOCAB_SIZE_ES-1
  en_start, en_end = VOCAB_SIZE_EN-2, VOCAB_SIZE_EN-1
  inputs = [es_start, *tokenizer_es.encode(lang1.numpy()), es_end]
  outputs = [en_start, *tokenizer_en.encode(lang2.numpy()), en_end]
  return inputs, outputs

def tf_encode(es, en):
  """Wrap `encode` with `tf.py_function` so it can be used in `Dataset.map`.

  Returns two int64 tensors (Spanish ids, English ids) whose static shape is
  declared as rank 1 with unknown length.
  """
  result_es, result_en = tf.py_function(func=encode,
                                        inp=[es, en],
                                        Tout=[tf.int64, tf.int64])
  # Declare the static shape explicitly: one dimension of unknown size.
  for ids in (result_es, result_en):
    ids.set_shape([None])
  return result_es, result_en

## Remove too long sentences

In [None]:
# Upper bound on tokens per sentence; it replaces the value measured earlier and
# is also the padded length of every batch.
MAX_LENGTH = 160

def filter_max_length(x, y, max_length=MAX_LENGTH):
  """Return a boolean tensor: True when both `x` and `y` have at most `max_length` elements."""
  x_fits = tf.size(x) <= max_length
  y_fits = tf.size(y) <= max_length
  return tf.logical_and(x_fits, y_fits)

## Inputs and output for model

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = len(corpus_es)  # shuffle buffer as large as the corpus (previously 20000)

# Pipeline: (es, en) strings -> id sequences -> drop pairs longer than MAX_LENGTH
# -> cache -> shuffle -> pad each side to MAX_LENGTH and batch -> prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((corpus_es, corpus_en))
    .map(tf_encode)
    .filter(filter_max_length)
    .cache()
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=(MAX_LENGTH, MAX_LENGTH))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Stage 3: Model building

The model follows the architecture described in the paper "Attention Is All You Need": https://arxiv.org/abs/1706.03762

## Positional encoding
The positional encoding is defined as:
* $PE(pos, 2i)=\sin(pos/10000^{2i/d_{model}})$
* $PE(pos, 2i+1)=\cos(pos/10000^{2i/d_{model}})$





In [None]:
class PositionalEncoding(layers.Layer):
  """Adds a fixed sinusoidal position signal to its input.

  The sequence length and model width are read from the static shape of the
  input (its last two dimensions), so both must be known when `call` runs.
  """

  def __init__(self):
    super(PositionalEncoding, self).__init__()

  def get_angles(self, pos, i, d_model):
    """Return pos / 10000^(2*(i//2)/d_model).

    `pos` is (seq_length, 1) and `i` is (1, d_model); broadcasting gives a
    (seq_length, d_model) array.
    """
    exponent = (2*(i//2))/d_model
    inverse_frequency = 1/(np.power(10000., exponent))
    return pos*inverse_frequency

  def call(self, inputs):
    """Return `inputs` plus the position table, broadcast over the batch axis."""
    seq_length, d_model = inputs.shape.as_list()[-2:]
    positions = np.arange(seq_length)[:, np.newaxis]
    dimensions = np.arange(d_model)[np.newaxis, :]
    table = self.get_angles(positions, dimensions, d_model)
    # Even columns take the sine, odd columns the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    return inputs + tf.cast(table[np.newaxis, ...], tf.float32)

## Dot product attention computation
* $attention(Q,K,V) = softmax(QK^{T}/\sqrt{d_{k}})V$


In [None]:
def scaled_dot_product_attetion(queries, keys, values, mask):
  """Compute softmax(Q K^T / sqrt(d_k)) V.

  Args:
    queries: shape (..., seq_len_q, depth)
    keys: shape (..., seq_len_k, depth)
    values: shape (..., seq_len_v, depth_v), with seq_len_v == seq_len_k
    mask: None, or a float tensor broadcastable to (..., seq_len_q, seq_len_k).
      Entries equal to 1 receive a -1e9 offset before the softmax.

  Returns:
    Tensor of shape (..., seq_len_q, depth_v).

  Shape example: q (1, 3), k (4, 3), v (4, 2) -> scores (1, 4) -> output (1, 2).
  """
  depth = tf.cast(tf.shape(keys)[-1], tf.float32)
  # (..., seq_len_q, seq_len_k) similarity scores, scaled by sqrt(depth).
  scores = tf.matmul(queries, keys, transpose_b=True)/ tf.math.sqrt(depth)

  if mask is not None:
    scores += (mask*-1e9)

  weights = tf.nn.softmax(scores, axis=-1)
  return tf.matmul(weights, values)

## Multihead attention

In [None]:
class MultiHeadAttention(layers.Layer):
  """Multi-head attention built on `scaled_dot_product_attetion`.

  The model width is taken from the last dimension of the shape passed to
  `build` and must be divisible by `num_heads`.
  """

  def __init__(self, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads

  def build(self, input_shape):
    """Create the four projection layers once the model width is known."""
    self.d_model = input_shape[-1]
    assert self.d_model % self.num_heads == 0

    # Width handled by each head.
    self.depth = self.d_model//self.num_heads

    # Projections for queries, keys and values.
    self.wq = layers.Dense(self.d_model)
    self.wk = layers.Dense(self.d_model)
    self.wv = layers.Dense(self.d_model)

    # Output projection applied to the concatenated heads.
    self.top_dense = layers.Dense(self.d_model)

  def split_heads(self, inputs, batch_size):
    """Reshape (batch, seq_len, d_model) into (batch, heads, seq_len, depth)."""
    per_head = tf.reshape(inputs, shape=(batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0,2,1,3])

  def call(self, queries, keys, values, mask):
    """Project, split into heads, attend, merge the heads and project again."""
    batch_size = tf.shape(queries)[0]

    # Linear projections, each (batch_size, seq_len, d_model).
    projected_q = self.wq(queries)
    projected_k = self.wk(keys)
    projected_v = self.wv(values)

    # (batch_size, num_heads, seq_len, depth) per tensor.
    q_heads = self.split_heads(projected_q, batch_size)
    k_heads = self.split_heads(projected_k, batch_size)
    v_heads = self.split_heads(projected_v, batch_size)

    per_head_attention = scaled_dot_product_attetion(q_heads, k_heads, v_heads, mask)

    # Back to (batch_size, seq_len_q, num_heads, depth), then merge the last two axes.
    merged = tf.reshape(tf.transpose(per_head_attention, perm=[0,2,1,3]),
                        shape=(batch_size,-1, self.d_model))
    return self.top_dense(merged)

## Encoder

In [None]:
class EncoderLayer(layers.Layer):
  """One encoder block: self-attention then a position-wise feed-forward network.

  Each sub-layer is followed by dropout, a residual connection and layer
  normalization.

  Args:
    ffn_units: width of the hidden feed-forward layer.
    num_heads: number of attention heads.
    dropout: dropout rate used after both sub-layers.
  """

  def __init__(self,
               ffn_units, num_heads, dropout):
    super(EncoderLayer,self).__init__()
    self.ffn_units = ffn_units
    self.num_heads = num_heads
    self.dropout = dropout

  def build(self, inputs_shape):
    """Create the sub-layers; the model width is the last input dimension."""
    self.d_model = inputs_shape[-1]
    self.multi_head_attention = MultiHeadAttention(self.num_heads)
    self.dropout_1 = layers.Dropout(rate = self.dropout)
    self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

    self.dense_1 = layers.Dense(units = self.ffn_units, activation="relu")
    self.dense_2 = layers.Dense(units = self.d_model)
    self.dropout_2 = layers.Dropout(rate = self.dropout)
    self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

  def call(self, inputs, mask, training):
    """Apply the block.

    Args:
      inputs: tensor whose last dimension is d_model.
      mask: attention mask forwarded to the self-attention (may be None).
      training: whether dropout is active.

    Returns:
      Tensor with the same shape as `inputs`.
    """
    # Sub-layer 1: self-attention + residual + normalization.
    attention = self.multi_head_attention(inputs,inputs,inputs, mask)
    attention = self.dropout_1(attention, training = training)
    attention = self.norm_1(attention+inputs)

    # Sub-layer 2: feed-forward network + residual + normalization.
    outputs = self.dense_1(attention)
    outputs = self.dense_2(outputs)
    outputs = self.dropout_2(outputs, training = training)
    # Fix: add the residual from the attention sub-layer before normalizing,
    # as DecoderLayer does; previously only `outputs` was normalized.
    outputs = self.norm_2(outputs + attention)
    return outputs

In [None]:
class Encoder(layers.Layer):
  """Embedding followed by a stack of `nb_layers` EncoderLayer blocks.

  Args:
    nb_layers: number of encoder blocks.
    ffn_units: hidden width of each feed-forward network.
    num_heads: attention heads per block.
    dropout: dropout rate (embedding output and every block).
    vocab_size: size of the input vocabulary.
    d_model: embedding / model width.
    name: Keras layer name.
  """

  def __init__(self, nb_layers,
               ffn_units,
               num_heads,
               dropout,
               vocab_size,
               d_model,
               name="Encoder"):
    super(Encoder, self).__init__(name=name)
    self.nb_layers = nb_layers
    self.d_model = d_model
    self.embedding = layers.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding()
    self.dropout = layers.Dropout(rate=dropout)
    self.enc_layers = [EncoderLayer(ffn_units, num_heads, dropout)
                       for _ in range(nb_layers)]

  def call(self, inputs, mask, training):
    """Embed the token ids, scale by sqrt(d_model) and run the encoder stack."""
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    outputs = self.embedding(inputs)
    outputs *= scale
    # NOTE(review): the positional encoding is disabled here while the decoder
    # applies it; confirm this is intentional for the task being trained.
    #outputs = self.pos_encoding(outputs)
    outputs = self.dropout(outputs, training=training)

    for enc_layer in self.enc_layers:
      outputs = enc_layer(outputs, mask, training)

    return outputs

## Decoder

In [None]:
class DecoderLayer(layers.Layer):
  """One decoder block: self-attention, encoder-decoder attention, feed-forward.

  Each of the three sub-layers is followed by dropout, a residual connection
  around that sub-layer and layer normalization.

  Args:
    ffn_units: width of the hidden feed-forward layer.
    num_head: number of attention heads.
    dropout: dropout rate used after every sub-layer.
  """

  def __init__(self, ffn_units, num_head, dropout):
    super(DecoderLayer, self).__init__()
    self.ffn_units = ffn_units
    self.num_head = num_head
    self.dropout = dropout

  def build(self, input_shape):
    """Create the sub-layers; the model width is the last input dimension."""
    self.d_model = input_shape[-1]

    # Sub-layer 1: self-attention over the decoder inputs.
    self.multi_head_attention_1 = MultiHeadAttention(self.num_head)
    self.dropout_1 = layers.Dropout(rate=self.dropout)
    self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

    # Sub-layer 2: attention over the encoder outputs.
    self.multi_head_attention_2 = MultiHeadAttention(self.num_head)
    self.dropout_2 = layers.Dropout(rate=self.dropout)
    self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    # Sub-layer 3: position-wise feed-forward network.
    self.dense_1 = layers.Dense(units = self.ffn_units, activation="relu")
    self.dense_2 = layers.Dense(units = self.d_model)
    self.dropout_3 = layers.Dropout(rate = self.dropout)
    self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

  def call(self, inputs, enc_outputs, mask_1, mask_2, training):
    """Apply the block.

    Args:
      inputs: decoder-side tensor whose last dimension is d_model.
      enc_outputs: encoder outputs, used as keys and values of sub-layer 2.
      mask_1: mask for the self-attention.
      mask_2: mask for the encoder-decoder attention.
      training: whether dropout is active.

    Returns:
      Tensor with the same shape as `inputs`.
    """
    attention = self.multi_head_attention_1(inputs,
                                            inputs,
                                            inputs,
                                            mask_1)
    attention = self.dropout_1(attention, training = training)
    attention = self.norm_1(attention + inputs)

    attention_2 = self.multi_head_attention_2(attention,
                                              enc_outputs,
                                              enc_outputs,
                                              mask_2)
    attention_2 = self.dropout_2(attention_2, training = training)
    # Fix: the residual of this sub-layer is its own input (`attention`, the
    # output of sub-layer 1), not the block input `inputs`.
    attention_2 = self.norm_2(attention_2 + attention)

    outputs = self.dense_1(attention_2)
    outputs = self.dense_2(outputs)
    outputs = self.dropout_3(outputs, training = training)
    outputs = self.norm_3(outputs+attention_2)

    return outputs

In [None]:
class Decoder(layers.Layer):
  """Embedding, positional encoding and a stack of `nb_layers` DecoderLayer blocks.

  Args:
    nb_layers: number of decoder blocks.
    ffn_units: hidden width of each feed-forward network.
    num_heads: attention heads per block.
    dropout: dropout rate (embedding output and every block).
    vocab_size: size of the decoder vocabulary.
    d_model: embedding / model width.
    name: Keras layer name.
  """

  def __init__(self,
              nb_layers,
              ffn_units,
              num_heads,
              dropout,
              vocab_size,
              d_model,
              name="decoder"):
    super(Decoder,self).__init__(name=name)
    self.d_model = d_model
    self.nb_layers = nb_layers
    self.embedding = layers.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding()
    self.dropout = layers.Dropout(rate=dropout)
    self.dec_layers = [DecoderLayer(ffn_units, num_heads, dropout)
                       for _ in range(nb_layers)]

  def call(self, inputs, enc_outputs, mask_1, mask_2, training):
    """Embed the ids, scale by sqrt(d_model), add positions and run the stack."""
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    outputs = self.embedding(inputs)
    outputs *= scale
    outputs = self.pos_encoding(outputs)
    outputs = self.dropout(outputs, training=training)

    for dec_layer in self.dec_layers:
      outputs = dec_layer(outputs, enc_outputs, mask_1, mask_2, training)

    return outputs

## Transformer

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder model ending in a dense projection onto the decoder vocabulary.

  Args:
    vocab_size_enc: vocabulary size of the encoder embedding.
    vocab_size_dec: vocabulary size of the decoder embedding and output layer.
    d_model: model width.
    nb_layers: number of blocks in both encoder and decoder.
    ffn_units: hidden width of the feed-forward networks.
    num_heads: attention heads per block.
    dropout: dropout rate.
    name: Keras model name.
  """

  def __init__(self, vocab_size_enc,
               vocab_size_dec,
               d_model,
               nb_layers,
               ffn_units,
               num_heads,
               dropout,
               name="transformer"):
    super(Transformer, self).__init__(name = name)
    self.encoder = Encoder(nb_layers, ffn_units, num_heads, dropout,
                           vocab_size_enc, d_model)
    self.decoder = Decoder(nb_layers, ffn_units, num_heads, dropout,
                           vocab_size_dec, d_model)

    # Unnormalized scores over the decoder vocabulary.
    self.last_dense = layers.Dense(units = vocab_size_dec)

  def create_padding_mask(self, seq):
    """Return a (batch, 1, 1, seq_len) float mask that is 1 where `seq` equals 0."""
    is_padding = tf.math.equal(seq, 0)
    return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

  def create_look_ahead_mask(self, seq):
    """Return a (seq_len, seq_len) mask that is 1 strictly above the diagonal."""
    seq_len = tf.shape(seq)[1]
    lower_triangle = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    return 1 - lower_triangle

  def call(self, enc_inputs, dec_inputs, training):
    """Run encoder and decoder; return scores of shape (batch, dec_len, vocab_size_dec)."""
    enc_mask = self.create_padding_mask(enc_inputs)
    # Decoder self-attention hides both padding and future positions.
    dec_padding_mask = self.create_padding_mask(dec_inputs)
    dec_future_mask = self.create_look_ahead_mask(dec_inputs)
    dec_mask_1 = tf.maximum(dec_padding_mask, dec_future_mask)
    # Encoder-decoder attention hides the padding of the encoder input.
    dec_mask_2 = self.create_padding_mask(enc_inputs)

    enc_outputs = self.encoder(enc_inputs, enc_mask, training)
    dec_outputs = self.decoder(dec_inputs, enc_outputs, dec_mask_1, dec_mask_2, training)
    return self.last_dense(dec_outputs)

## Training

### Hyper-parameters

In [None]:
# Reset the Keras global state before building the model.
tf.keras.backend.clear_session()
# Model sizes used for this run; the trailing comment on each line is the
# alternative value the author noted.
D_MODEL = 256 #512
NB_LAYERS = 4 #6
FFN_UNITS = 2048 #2048
NB_HEADS = 8 #8
DROPOUT = 0.1 #0.1

# NOTE(review): the decoder vocabulary is the Spanish one, but the training
# dataset encodes its targets with the English tokenizer and VOCAB_SIZE_EN-based
# start/end ids (see `encode`). Confirm which vocabulary is intended; switching
# it also affects `evaluate` and any checkpoint already saved.
transformer = Transformer(vocab_size_enc = VOCAB_SIZE_ES,
               #vocab_size_dec = VOCAB_SIZE_EN,
               vocab_size_dec = VOCAB_SIZE_ES,
               d_model = D_MODEL,
               nb_layers = NB_LAYERS,
               ffn_units = FFN_UNITS,
               num_heads = NB_HEADS,
               dropout = DROPOUT)

### Loss and accuracy function

In [None]:
# Per-token cross-entropy on logits; reduction is done manually so padding can be excluded.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(target, pred):
  """Mean cross-entropy over the positions where `target` is not 0 (padding).

  Args:
    target: integer ids, 0 marks padding.
    pred: logits with one more trailing dimension than `target`.

  Returns:
    Scalar: summed loss of non-padding positions divided by their count.
  """
  per_token_loss = loss_object(target, pred)
  not_padding = tf.cast(tf.math.not_equal(target, 0), dtype=per_token_loss.dtype)
  masked_loss = per_token_loss*not_padding
  return tf.reduce_sum(masked_loss)/tf.reduce_sum(not_padding)

In [None]:
def accuracy_function(target, pred):
  """Fraction of non-padding positions whose arg-max prediction equals the target.

  Args:
    target: integer ids of shape (batch, seq_len), 0 marks padding.
    pred: logits of shape (batch, seq_len, vocab).

  Returns:
    Scalar float32 accuracy.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  not_padding = tf.math.not_equal(target, 0)
  # A hit only counts on positions that are not padding.
  hits = tf.math.logical_and(not_padding, tf.equal(target, predicted_ids))

  hit_count = tf.reduce_sum(tf.cast(hits, dtype=tf.float32))
  token_count = tf.reduce_sum(tf.cast(not_padding, dtype=tf.float32))
  return hit_count/token_count

### Custom Schedule
$lrate = d_{model}^{-0.5} \cdot \min(\mbox{step_num}^{-0.5},\ \mbox{step_num} \cdot \mbox{warmup_steps}^{-1.5})$

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up then inverse-square-root decay learning-rate schedule.

  lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

  Args:
    d_model: model width.
    warmup_steps: number of steps during which the rate grows linearly.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    # Raw value kept for `get_config`; the float tensor is used in the formula.
    self._d_model_value = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for optimizer step `step`."""
    # The optimizer may pass an integer step counter; rsqrt needs a float.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {"d_model": self._d_model_value, "warmup_steps": self.warmup_steps}

# Learning-rate schedule parameterized by the model width.
learning_rate = CustomSchedule(D_MODEL)

# Adam driven by the schedule above, with non-default beta_2 and epsilon.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

### Metrics

In [None]:
# Running means of the per-batch loss and accuracy, updated inside `train_step`.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')
# NOTE(review): the validation metrics are created but not updated in the cells shown here.
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.Mean(name='val_accuracy')

### Checkpoint setting

In [None]:
# Checkpoints track both the model and the optimizer and live in ./ckpt;
# only the five most recent ones are kept.
checkpoint_path = './ckpt'
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the newest checkpoint when one exists.
_latest_checkpoint = ckpt_manager.latest_checkpoint
if _latest_checkpoint:
  ckpt.restore(_latest_checkpoint)
  print('latest_checkpoint restored')

latest_checkpoint restored


### Training

In [None]:
# Both arguments are int64 batches padded to MAX_LENGTH; the batch size may vary.
train_step_signature = [
    tf.TensorSpec(shape=(None, MAX_LENGTH), dtype=tf.int64)
    for _ in range(2)
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimization step on a batch and update the training metrics.

  The decoder receives the target without its last token and is trained to
  predict the target without its first token (teacher forcing).
  """
  decoder_input, decoder_target = tar[:, :-1], tar[:, 1:]

  with tf.GradientTape() as tape:
    predictions = transformer(inp, decoder_input, True)
    loss = loss_function(decoder_target, predictions)

  weights = transformer.trainable_weights
  gradients = tape.gradient(loss, weights)
  optimizer.apply_gradients(zip(gradients, weights))

  train_loss.update_state(loss)
  train_accuracy.update_state(accuracy_function(decoder_target, predictions))


In [None]:
# Training is limited to two epochs because of time constraints.
EPOCHS=2
_progress_template = 'Epoch {} Loss {:.4f} Accuracy {:.4f}'

for epoch in range(EPOCHS):
  start = time.time()

  # Metrics are running means, so they restart at every epoch.
  train_loss.reset_states()
  train_accuracy.reset_states()

  # inp -> Spanish, tar -> English
  for batch, (inp, tar) in enumerate(dataset):
    train_step(inp, tar)

    # Progress report every 50 batches.
    if batch % 50 == 0:
      print (_progress_template.format(epoch + 1,
                                       train_loss.result(),
                                       train_accuracy.result()))

  # A checkpoint is saved after every epoch.
  ckpt_save_path = ckpt_manager.save()
  print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))

  print (_progress_template.format(epoch + 1,
                                   train_loss.result(),
                                   train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


Epoch 1 Loss 9.0188 Accuracy 0.0000
Epoch 1 Loss 8.8804 Accuracy 0.0289
Epoch 1 Loss 8.7290 Accuracy 0.0401
Epoch 1 Loss 8.5899 Accuracy 0.0488
Epoch 1 Loss 8.4231 Accuracy 0.0595
Epoch 1 Loss 8.2353 Accuracy 0.0693
Epoch 1 Loss 8.0370 Accuracy 0.0791
Epoch 1 Loss 7.8448 Accuracy 0.0886
Epoch 1 Loss 7.6659 Accuracy 0.0985
Epoch 1 Loss 7.5007 Accuracy 0.1083
Epoch 1 Loss 7.3454 Accuracy 0.1178
Epoch 1 Loss 7.1995 Accuracy 0.1267
Epoch 1 Loss 7.0637 Accuracy 0.1351
Epoch 1 Loss 6.9383 Accuracy 0.1428
Epoch 1 Loss 6.8204 Accuracy 0.1501
Epoch 1 Loss 6.7109 Accuracy 0.1570
Epoch 1 Loss 6.6101 Accuracy 0.1632
Epoch 1 Loss 6.5166 Accuracy 0.1691
Epoch 1 Loss 6.4287 Accuracy 0.1748
Epoch 1 Loss 6.3474 Accuracy 0.1800
Epoch 1 Loss 6.2691 Accuracy 0.1853
Epoch 1 Loss 6.1963 Accuracy 0.1901
Epoch 1 Loss 6.1263 Accuracy 0.1950
Epoch 1 Loss 6.0608 Accuracy 0.1995
Epoch 1 Loss 5.9978 Accuracy 0.2039
Epoch 1 Loss 5.9375 Accuracy 0.2082
Epoch 1 Loss 5.8808 Accuracy 0.2123
Epoch 1 Loss 5.8257 Accuracy

## Testing

In [None]:
def evaluate(inp_sentence):
  """Greedy decoding of one sentence with the trained transformer.

  Args:
    inp_sentence: raw input string, tokenized with `tokenizer_es`.

  Returns:
    1-D int32 tensor with the start id followed by the generated ids. Decoding
    stops at the end id (not included) or after MAX_LENGTH steps.

  NOTE(review): the decoder is seeded and stopped with the Spanish start/end
  ids; confirm this matches the vocabulary of the training targets.
  """
  start_id, end_id = VOCAB_SIZE_ES-2, VOCAB_SIZE_ES -1
  token_ids = [start_id]+tokenizer_es.encode(inp_sentence)+ [end_id]
  enc_input = tf.expand_dims(token_ids, axis=0)
  output = tf.expand_dims([start_id], axis=0)

  for _ in range(MAX_LENGTH):
    # Keep only the scores of the last generated position.
    last_step_scores = transformer(enc_input, output, False)[:,-1:,:]
    predicted_id = tf.cast(tf.argmax(last_step_scores,axis=-1), tf.int32)
    if predicted_id == end_id:
      break
    output = tf.concat([output, predicted_id], axis=-1)

  return tf.squeeze(output,axis=0)

In [None]:
def sort_sentences(sentence):
  """Run `evaluate` on `sentence` and decode the result back to text.

  The start and end ids (>= VOCAB_SIZE_ES - 2) are dropped before decoding
  because the tokenizer does not know them.
  """
  generated_ids = evaluate(sentence).numpy()
  first_special_id = VOCAB_SIZE_ES-2
  text_ids = [token_id for token_id in generated_ids if token_id<(first_special_id)]
  return tokenizer_es.decode(text_ids)