# Neural Machine Translation
Using a Transformer model to translate from English to Spanish

In [1]:
import tensorflow as tf
import tensorflow.keras as keras

import numpy as np
import logging
import pathlib
import re
import math
import os
import time
import sys
from datetime import datetime

In [2]:
logging.getLogger('tensorflow').setLevel(logging.ERROR)  # suppress warnings

## Overview
* Dataset
* Data preprocessing
* Positional Encoding
* Scaled Dot Product Attention
* Multi-head Attention
* Point Wise NN
* Encoder and decoder
* Transformer Model
* Training
  * Learning Rate Scheduler
  * Losses and Metrics
  * Inference

## Dataset

In [4]:
# Download the file
path_to_zip = keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

In [5]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  inp = [inp for inp, targ in pairs]
  targ = [targ for inp, targ in pairs]

  return inp, targ

In [6]:

inputs, targets = load_data(path_to_file)

BATCH_SIZE = 64
BUFFER_SIZE = len(inputs)

dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

train_set = dataset.take(1200)

tmp = dataset.skip(1200)

val_set = tmp.take(300)

test_set = tmp.skip(300)

In [7]:
size = 0

for batch in dataset:
  size += 1

print(f'num_batches: {size}')

num_batches: 1858


In [8]:
for X, Y in dataset.take(1):
  print(X[:5])
  print(Y[:5])
  break

tf.Tensor(
[b'I have a sore throat and a slight fever.'
 b"I never imagined we'd be talking about this topic today."
 b'Give me a telephone call when you get back.'
 b'Please correct me when I make a mistake.' b'Look before you leap.'], shape=(5,), dtype=string)
tf.Tensor(
[b'Me duele la garganta y tengo un poco de fiebre.'
 b'Nunca me imagin\xc3\xa9 que estar\xc3\xadamos hablando de este tema hoy.'
 b'Dame un telefonazo cuando vuelvas.'
 b'Corr\xc3\xadgeme cuando cometa un error, por favor.'
 b'Mira antes de saltar.'], shape=(5,), dtype=string)


## Dataset preprocessing
Create TextVectorization layer to tokenize and preprocess dataset

In [9]:
def standardize_text(text):
  text = tf.strings.lower(text)

  # keep space, a to z, and select punctuation
  text = tf.strings.regex_replace(text, u'[^ a-z.?!,¿]', '')
  
  # add spaces around punctuation
  text = tf.strings.regex_replace(text, u'[.?!,¿]', r' \0 ')

  # strip whitespace 
  text = tf.strings.strip(text)

  # add start of sequence and end of sequence tokens
  text = tf.strings.join(['<sos>', text, '<eos>'], separator=' ')

  return text

In [10]:
# Testing

example_text = tf.constant('¿Hola, como estas?')
print(example_text.numpy().decode())
print(standardize_text(example_text).numpy().decode())

¿Hola, como estas?
<sos> ¿ hola ,  como estas ? <eos>


In [11]:
# use an input TextVectorization layer that uses this function
MAX_VOCAB_SIZE = 10000
SEQUENCE_LENGTH = 100

# pad sequences to SEQUENCE_LENGTH
input_text_processor = tf.keras.layers.TextVectorization(standardize=standardize_text, max_tokens=MAX_VOCAB_SIZE, output_sequence_length=SEQUENCE_LENGTH)

# and adapt to inputs
input_text_processor.adapt(inputs)

In [12]:
# now for output
output_text_processor = tf.keras.layers.TextVectorization(standardize=standardize_text, max_tokens=MAX_VOCAB_SIZE)

# and adapt to outputs
output_text_processor.adapt(targets)

In [13]:
# Testing
example_text = tf.constant('Hi, how are you?')
print(example_text.numpy().decode())

example_tokens = input_text_processor(example_text)
print(example_tokens)

input_vocab = np.array(input_text_processor.get_vocabulary())
tokens = input_vocab[example_tokens.numpy()]
' '.join(tokens)

Hi, how are you?
tf.Tensor(
[   2 2271   19   54   28    8   11    3    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(100,), dtype=int64)


'<sos> hi , how are you ? <eos>                                                                                            '

## Positional Encoding

In [14]:
class PositionalEncoding(keras.layers.Layer):
  '''
    A positional encoding is a dense vector that encodes the position of a word in a sentence.
    They capture the positional information of a word in a sentence
    The positional encodings are added to each word's embedding
  '''
  def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
    super().__init__(dtype=dtype, **kwargs)
    # must be even
    if max_dims % 2 == 1: max_dims += 1
    # position, dimension
    p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
    pos_emb = np.empty((max_steps, max_dims))
    # evens, sin position
    pos_emb[:, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T
    # odds, cos position
    pos_emb[:, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T

    # add new dim for adding with embeddings
    pos_emb = pos_emb[np.newaxis, ...]

    self.positional_encoding = tf.constant(pos_emb.astype(self.dtype))
    self.dim = max_dims
  
  def call(self, inputs):
    shape = tf.shape(inputs) # (batch_size, input_seq_len, d_model)
    return inputs + self.positional_encoding[:, :shape[-2], :]


## Scaled Dot Product Attention

In [15]:
def scaled_dot_product_attention(query, key, value, mask):
  matmul_qk = tf.matmul(query, key, transpose_b=True)

  depth = tf.cast(tf.shape(key)[-1], tf.float32)
  logits = matmul_qk / tf.math.sqrt(depth) # 4 dims

  # add the mask zero out padding tokens.
  if mask is not None:
    logits += (mask * -1e9)

  attention_weights = tf.nn.softmax(logits, axis=-1)

  return tf.matmul(attention_weights, value), attention_weights

## Multi-head Attention

In [16]:
class MyMultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads):
    super(MyMultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention,
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

    return output, attention_weights

In [17]:
temp_mha = MyMultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape

(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

## Point Wise NN

In [18]:
def point_wise_network(dim, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(dim)
  ])

## Encoder and decoder

In [19]:
class Encoder(keras.layers.Layer):
  '''
  Structure:   multi-head attention -> point wise -> LayerNormalization
               ------residual connection-------  
  '''
  def __init__(self, dim, n_heads, dff, dropout_rate=0.1):
    super(Encoder, self).__init__()

    self.multi_attn = MyMultiHeadAttention(d_model=dim, num_heads=n_heads)
    self.poin_wise_net = point_wise_network(dim, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, input, training, mask):
    attn_output, _ = self.multi_attn(input, k=input, q=input, mask=mask)
    attn_output = self.dropout1(attn_output, training=training)

    out1 = self.layernorm1(input + attn_output)

    pwn_output = self.poin_wise_net(out1)
    ffn_output = self.dropout2(pwn_output, training=training)
    out2 = self.layernorm2(out1 + pwn_output)

    return out2

class Decoder(keras.layers.Layer):
  '''
  masked multi-head attention -> LayerNorm -> multi-head attention -> Layer Norm -> point wise
  ----------residual connection----------  ------residual connection--------- ----------residual connection-----------
  '''
  def __init__(self, dim, n_heads, dff, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.multi_attn1 = MyMultiHeadAttention(d_model=dim, num_heads=n_heads)
    self.multi_attn2 = MyMultiHeadAttention(d_model=dim, num_heads=n_heads)

    self.poin_wise_net = point_wise_network(dim, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, input, enc_output, training, look_ahead_mask, padding_mask):
    attn1, attn_weights_block1 = self.multi_attn1(input, k=input, q=input, mask=look_ahead_mask)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + input)

    attn2, attn_weights_block2 = self.multi_attn2(enc_output, k=enc_output, q=out1, mask=padding_mask)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)

    ffn_output = self.poin_wise_net(out2)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)

    return out3, attn_weights_block1, attn_weights_block2

In [20]:
# Testing
sample_encoder_layer = Encoder(dim=512, n_heads=8, dff=2048)

sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((64, 43, 512)), False, None)

sample_encoder_layer_output.shape  # (batch_size, input_seq_len, d_model)

TensorShape([64, 43, 512])

In [21]:
# Testing
sample_decoder_layer = Decoder(dim=512, n_heads=8, dff=2048)

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_layer_output,
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

TensorShape([64, 50, 512])

## Transformer Model

In [22]:
class TransformerModel(tf.keras.Model):
  '''
  run through encoder to get encoder_output and encoder_state
  Decoder
  '''
  def __init__(self, 
               num_layers,
               dim=512, 
               n_heads=8, 
               dff=2048,
               max_steps=500, 
               vocab_size=10000, 
               ):
    super().__init__()

    self.num_layers = num_layers

    self.embedding = keras.layers.Embedding(vocab_size, dim, mask_zero=True) # (batch_size, input_seq_len, d_model)
    self.positional_encoding = PositionalEncoding(max_steps, dim) 

    self.enc_layers = [
        Encoder(dim=dim, n_heads=n_heads, dff=dff)
        for _ in range(num_layers)
        ]

    self.dec_layers = [
        Decoder(dim=dim, n_heads=n_heads, dff=dff)
        for _ in range(num_layers)
        ]

    self.final_layer = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, training):
    encoder_tokens = inputs[0] # (batch_size, 100)
    decoder_tokens = inputs[1] # (batch_size, output_seq_len)

    # Embed
    encoder_embeddings = self.embedding(encoder_tokens) # (batch_size, input_seq_len, d_model)
    decoder_embeddings = self.embedding(decoder_tokens) # (batch_size, input_seq_len, d_model)


    # Positionally encode
    encoder_in = self.positional_encoding(encoder_embeddings) # (batch_size, input_seq_len, d_model)
    decoder_in = self.positional_encoding(decoder_embeddings) # (batch_size, input_seq_len, d_model)

    # Now encoder decoder stuff

    # First create masks
    padding_mask, look_ahead_mask = self.create_masks(encoder_tokens, decoder_tokens)

    # Encoder
    x = encoder_in
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, padding_mask)
    encoder_out = x # (batch_size, input_seq_len, d_model)

    # Decoder
    attention_weights = {}
    y = decoder_in
    for i in range(self.num_layers):
      y, block1, block2 = self.dec_layers[i](y, encoder_out, training, look_ahead_mask, padding_mask)

      attention_weights[f'decoder_layer{i+1}_block1'] = block1
      attention_weights[f'decoder_layer{i+1}_block2'] = block2

    decoder_out = y

    final_output = self.final_layer(decoder_out)

    return final_output, attention_weights

  def create_masks(self, inp, tar):
    padding_mask = self.create_padding_mask(inp)

    # (batch_size, tar_seq_len)
    look_ahead_mask = self.create_look_ahead_mask(tf.shape(tar)[1])

    dec_target_padding_mask = self.create_padding_mask(tar)
    look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return padding_mask, look_ahead_mask

  def create_padding_mask(self, seq):
    # outputs '1' where a pad value of '0' is
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32) # (batch_size, seq_length)

    # add extra dims to add padding to attention LOGITS
    # (logits have 4 dims)
    return seq[:, tf.newaxis, tf.newaxis, :]

  def create_look_ahead_mask(self, size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask # (seq_len, seq_len)

In [22]:
# Testing (TRAINING)

sample_transformer = TransformerModel(num_layers=2)
# one batch
for x, y in dataset.take(1):
  temp_input = input_text_processor(x)
  temp_target = output_text_processor(y)

fn_out, _ = sample_transformer([temp_input, temp_target], training=True)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

TensorShape([64, 17, 10000])

In [23]:
# Testing (INFERENCE)

sample_transformer = TransformerModel(num_layers=2)

temp_input = tf.random.uniform((64, 100), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

fn_out, _ = sample_transformer([temp_input, temp_target], training=False)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

TensorShape([64, 36, 10000])

# Training

In [23]:
num_layers=6
d_model=128
num_heads=8
dff=512
dropout_rate = 0.1

## Loss and Metrics

In [24]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [25]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [26]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [27]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)


def accuracy_function(real, pred):
  accuracies = tf.equal(real, tf.argmax(pred, axis=2))

  mask = tf.math.logical_not(tf.math.equal(real, 0))
  accuracies = tf.math.logical_and(mask, accuracies)

  accuracies = tf.cast(accuracies, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)

In [28]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [29]:
transformer = TransformerModel(
    num_layers=num_layers,
    dim=d_model,
    n_heads=num_heads,
    dff=dff)

In [30]:
checkpoint_path = './checkpoints/train'

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Latest checkpoint restored!!')

In [31]:
EPOCHS = 50

In [32]:
train_step_signature = [
    tf.TensorSpec(shape=(None,), dtype=tf.string),
    tf.TensorSpec(shape=(None,), dtype=tf.string),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  inp = input_text_processor(inp)
  tar = output_text_processor(tar)

  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]

  with tf.GradientTape() as tape:
    predictions, _ = transformer([inp, tar_inp],
                                 training = True)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

  train_loss(loss)
  train_accuracy(accuracy_function(tar_real, predictions))

In [33]:

for epoch in range(EPOCHS):
  start = time.time()

  train_loss.reset_states()
  train_accuracy.reset_states()

  # inp -> english, tar -> spanish
  for (batch, (inp, tar)) in enumerate(train_set):
    train_step(inp, tar)

    if batch % 100 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  if (epoch + 1) % 5 == 0:
    ckpt_save_path = ckpt_manager.save()
    print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

  if epoch % 30 == 0:
    transformer.save(f'EngSpanModel-{epoch}')

  print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 Batch 0 Loss 9.2304 Accuracy 0.0000
Epoch 1 Batch 100 Loss 9.0071 Accuracy 0.0803
Epoch 1 Batch 200 Loss 8.7236 Accuracy 0.1003
Epoch 1 Batch 300 Loss 8.3054 Accuracy 0.1074
Epoch 1 Batch 400 Loss 7.8250 Accuracy 0.1107
Epoch 1 Batch 500 Loss 7.4009 Accuracy 0.1218
Epoch 1 Batch 600 Loss 7.0544 Accuracy 0.1400
Epoch 1 Batch 700 Loss 6.7738 Accuracy 0.1567
Epoch 1 Batch 800 Loss 6.5476 Accuracy 0.1699
Epoch 1 Batch 900 Loss 6.3634 Accuracy 0.1806
Epoch 1 Batch 1000 Loss 6.2031 Accuracy 0.1904
Epoch 1 Batch 1100 Loss 6.0626 Accuracy 0.1995




Epoch 1 Loss 5.9379 Accuracy 0.2077
Time taken for 1 epoch: 185.85 secs

Epoch 2 Batch 0 Loss 4.5290 Accuracy 0.3008
Epoch 2 Batch 100 Loss 4.4491 Accuracy 0.3053
Epoch 2 Batch 200 Loss 4.4382 Accuracy 0.3062
Epoch 2 Batch 300 Loss 4.4050 Accuracy 0.3104
Epoch 2 Batch 400 Loss 4.3680 Accuracy 0.3150
Epoch 2 Batch 500 Loss 4.3353 Accuracy 0.3188
Epoch 2 Batch 600 Loss 4.3047 Accuracy 0.3222
Epoch 2 Batch 700 Loss 4.2764 Accuracy 0.3256
Epoch 2 Batch 800 Loss 4.2512 Accuracy 0.3286
Epoch 2 Batch 900 Loss 4.2276 Accuracy 0.3311
Epoch 2 Batch 1000 Loss 4.2025 Accuracy 0.3338
Epoch 2 Batch 1100 Loss 4.1816 Accuracy 0.3359
Epoch 2 Loss 4.1611 Accuracy 0.3382
Time taken for 1 epoch: 141.92 secs

Epoch 3 Batch 0 Loss 3.8468 Accuracy 0.3596
Epoch 3 Batch 100 Loss 3.8382 Accuracy 0.3686
Epoch 3 Batch 200 Loss 3.8164 Accuracy 0.3715
Epoch 3 Batch 300 Loss 3.8038 Accuracy 0.3728
Epoch 3 Batch 400 Loss 3.7885 Accuracy 0.3751
Epoch 3 Batch 500 Loss 3.7767 Accuracy 0.3769
Epoch 3 Batch 600 Loss 3.756



Epoch 11 Loss 1.6448 Accuracy 0.6955
Time taken for 1 epoch: 159.62 secs

Epoch 12 Batch 0 Loss 1.5381 Accuracy 0.7207
Epoch 12 Batch 100 Loss 1.5705 Accuracy 0.7047
Epoch 12 Batch 200 Loss 1.5721 Accuracy 0.7052
Epoch 12 Batch 300 Loss 1.5760 Accuracy 0.7053
Epoch 12 Batch 400 Loss 1.5749 Accuracy 0.7061
Epoch 12 Batch 500 Loss 1.5753 Accuracy 0.7063
Epoch 12 Batch 600 Loss 1.5729 Accuracy 0.7071
Epoch 12 Batch 700 Loss 1.5725 Accuracy 0.7073
Epoch 12 Batch 800 Loss 1.5705 Accuracy 0.7081
Epoch 12 Batch 900 Loss 1.5743 Accuracy 0.7076
Epoch 12 Batch 1000 Loss 1.5771 Accuracy 0.7074
Epoch 12 Batch 1100 Loss 1.5784 Accuracy 0.7075
Epoch 12 Loss 1.5812 Accuracy 0.7073
Time taken for 1 epoch: 128.53 secs

Epoch 13 Batch 0 Loss 1.4149 Accuracy 0.7373
Epoch 13 Batch 100 Loss 1.4906 Accuracy 0.7155
Epoch 13 Batch 200 Loss 1.5086 Accuracy 0.7142
Epoch 13 Batch 300 Loss 1.5044 Accuracy 0.7162
Epoch 13 Batch 400 Loss 1.5071 Accuracy 0.7165
Epoch 13 Batch 500 Loss 1.5072 Accuracy 0.7163
Epoch 13



Epoch 21 Loss 1.2133 Accuracy 0.7627
Time taken for 1 epoch: 159.56 secs

Epoch 22 Batch 0 Loss 1.3394 Accuracy 0.7683
Epoch 22 Batch 100 Loss 1.1460 Accuracy 0.7696
Epoch 22 Batch 200 Loss 1.1485 Accuracy 0.7697
Epoch 22 Batch 300 Loss 1.1593 Accuracy 0.7685
Epoch 22 Batch 400 Loss 1.1617 Accuracy 0.7684
Epoch 22 Batch 500 Loss 1.1685 Accuracy 0.7679
Epoch 22 Batch 600 Loss 1.1728 Accuracy 0.7673
Epoch 22 Batch 700 Loss 1.1761 Accuracy 0.7672
Epoch 22 Batch 800 Loss 1.1836 Accuracy 0.7665
Epoch 22 Batch 900 Loss 1.1892 Accuracy 0.7661
Epoch 22 Batch 1000 Loss 1.1916 Accuracy 0.7657
Epoch 22 Batch 1100 Loss 1.1926 Accuracy 0.7660
Epoch 22 Loss 1.1942 Accuracy 0.7660
Time taken for 1 epoch: 127.36 secs

Epoch 23 Batch 0 Loss 0.9875 Accuracy 0.7996
Epoch 23 Batch 100 Loss 1.1238 Accuracy 0.7735
Epoch 23 Batch 200 Loss 1.1363 Accuracy 0.7718
Epoch 23 Batch 300 Loss 1.1386 Accuracy 0.7721
Epoch 23 Batch 400 Loss 1.1432 Accuracy 0.7717
Epoch 23 Batch 500 Loss 1.1483 Accuracy 0.7711
Epoch 23



Epoch 31 Loss 1.0620 Accuracy 0.7878
Time taken for 1 epoch: 159.92 secs

Epoch 32 Batch 0 Loss 0.8537 Accuracy 0.8178
Epoch 32 Batch 100 Loss 0.9885 Accuracy 0.7959
Epoch 32 Batch 200 Loss 0.9952 Accuracy 0.7950
Epoch 32 Batch 300 Loss 1.0034 Accuracy 0.7946
Epoch 32 Batch 400 Loss 1.0130 Accuracy 0.7934
Epoch 32 Batch 500 Loss 1.0195 Accuracy 0.7925
Epoch 32 Batch 600 Loss 1.0254 Accuracy 0.7918
Epoch 32 Batch 700 Loss 1.0285 Accuracy 0.7915
Epoch 32 Batch 800 Loss 1.0313 Accuracy 0.7912
Epoch 32 Batch 900 Loss 1.0379 Accuracy 0.7902
Epoch 32 Batch 1000 Loss 1.0418 Accuracy 0.7899
Epoch 32 Batch 1100 Loss 1.0450 Accuracy 0.7896
Epoch 32 Loss 1.0503 Accuracy 0.7889
Time taken for 1 epoch: 127.26 secs

Epoch 33 Batch 0 Loss 0.9281 Accuracy 0.7985
Epoch 33 Batch 100 Loss 0.9845 Accuracy 0.7981
Epoch 33 Batch 200 Loss 1.0045 Accuracy 0.7932
Epoch 33 Batch 300 Loss 1.0135 Accuracy 0.7925
Epoch 33 Batch 400 Loss 1.0171 Accuracy 0.7922
Epoch 33 Batch 500 Loss 1.0215 Accuracy 0.7919
Epoch 33



Epoch 41 Loss 0.9704 Accuracy 0.8023
Time taken for 1 epoch: 158.80 secs

Epoch 42 Batch 0 Loss 0.9128 Accuracy 0.8177
Epoch 42 Batch 100 Loss 0.9114 Accuracy 0.8096
Epoch 42 Batch 200 Loss 0.9150 Accuracy 0.8085


KeyboardInterrupt: ignored

Save the model for future use

In [None]:
transformer.save('EngSpanModel')

In [34]:
!zip -r EngSpanModel.zip EngSpanModel

  adding: EngSpanModel-40/ (stored 0%)
  adding: EngSpanModel-40/assets/ (stored 0%)
  adding: EngSpanModel-40/keras_metadata.pb (deflated 96%)
  adding: EngSpanModel-40/saved_model.pb (deflated 89%)
  adding: EngSpanModel-40/variables/ (stored 0%)
  adding: EngSpanModel-40/variables/variables.index (deflated 78%)
  adding: EngSpanModel-40/variables/variables.data-00000-of-00001 (deflated 8%)


## Inference

In [13]:
!unzip EngSpanModel.zip

loaded_model = tf.keras.models.load_model('EngSpanModel-40')

Archive:  EngSpanModel.zip
   creating: EngSpanModel-40/
   creating: EngSpanModel-40/assets/
  inflating: EngSpanModel-40/keras_metadata.pb  
  inflating: EngSpanModel-40/saved_model.pb  
   creating: EngSpanModel-40/variables/
  inflating: EngSpanModel-40/variables/variables.index  
  inflating: EngSpanModel-40/variables/variables.data-00000-of-00001  


In [14]:
loaded_model.save_weights("weights.h5")

**Load Model Weights**

In [23]:
num_layers=6
d_model=128
num_heads=8
dff=512
dropout_rate = 0.1

transformer = TransformerModel(
    num_layers=num_layers,
    dim=d_model,
    n_heads=num_heads,
    dff=dff)

temp_input = tf.random.uniform((64, 100), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

_, _ = transformer([temp_input, temp_target], training=False)

transformer.load_weights("weights.h5")

In [82]:
class Translator(tf.Module):
  def __init__(self, input_text_processor, output_text_processor, transformer, max_length=100):
    self.input_text_processor = input_text_processor
    self.output_text_processor = output_text_processor
    self.transformer = transformer
    self.max_length = max_length

  def __call__(self, input_text, output_vocab):
    if len(input_text.shape) == 0:
      input_text = input_text[tf.newaxis]


    input_tokens = self.input_text_processor(input_text)

    encoder_input = input_tokens

    start_end = self.output_text_processor([""])[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(self.max_length):
      output = tf.transpose(output_array.stack())

      predictions, _ = self.transformer([encoder_input, output], training=False)

      # select the last token from the seq_len dimension
      predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

      predicted_id = tf.argmax(predictions, axis=-1)

      output_array = output_array.write(i+1, predicted_id[0])

      if tf.equal(predicted_id, end):
        break
    
    output = tf.transpose(output_array.stack()) # (1, tokens)
    

    tokens = tf.TensorArray(dtype=tf.string, size=0, dynamic_size=True)

    for i in tf.range(tf.shape(output[0])[0]):
      val = output_vocab[output[0][i]]
      tokens = tokens.write(i, val)

    predicted_words = tf.transpose(tokens.stack())

    return predicted_words

    

In [83]:
translator = Translator(input_text_processor, output_text_processor, transformer)

sentence = "this is a problem we have to solve"


output_vocab = tf.constant(output_text_processor.get_vocabulary())


prediction = translator(tf.constant(sentence), output_vocab)

print(prediction)

tf.Tensor(
[b'<sos>' b'este' b'es' b'un' b'problema' b'que' b'tenemos' b'que'
 b'resolver' b'.' b'<eos>'], shape=(11,), dtype=string)


# Export

In [None]:
class ExportTranslator(tf.Module):
    '''
    output_vocab needs to be passed in from outside
    because its get_vocabulary method cannot be converted
    to a Tensorflow Graph
  '''
  def __init__(self, translator, ouput_vocab):
    self.translator = translator
    self.output_vocab = ouput_vocab

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    return self.translator(sentence, self.output_vocab)

In [85]:
export_translator = ExportTranslator(translator, output_vocab)

In [86]:
export_translator(tf.constant("this is a problem we have to solve"))

<tf.Tensor: shape=(11,), dtype=string, numpy=
array([b'<sos>', b'este', b'es', b'un', b'problema', b'que', b'tenemos',
       b'que', b'resolver', b'.', b'<eos>'], dtype=object)>

In [87]:
tf.saved_model.save(export_translator, export_dir='translator')



In [88]:
reloaded = tf.saved_model.load('translator')

In [89]:
reloaded(tf.constant("this is a problem we have to solve"))

<tf.Tensor: shape=(11,), dtype=string, numpy=
array([b'<sos>', b'este', b'es', b'un', b'problema', b'que', b'tenemos',
       b'que', b'resolver', b'.', b'<eos>'], dtype=object)>