<a href="https://colab.research.google.com/github/wilstermanz/holbertonschool-machine_learning/blob/main/supervised_learning/transformer_apps/transformer_apps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow_datasets as tfds
import tensorflow.compat.v2 as tf

pt2en_train = tfds.load('ted_hrlr_translate/pt_to_en',
                        split='train',
                        as_supervised=True)
for pt, en in pt2en_train.take(1):
  print(pt.numpy().decode('utf-8'))
  print(en.numpy().decode('utf-8'))


# 0. Dataset

Create the class ```Dataset``` that loads and preps a dataset for machine translation:

    Class constructor def __init__(self):
        creates the instance attributes:
            data_train, which contains the ted_hrlr_translate/pt_to_en tf.data.Dataset train split, loaded as_supervided
            data_valid, which contains the ted_hrlr_translate/pt_to_en tf.data.Dataset validate split, loaded as_supervided
            tokenizer_pt is the Portuguese tokenizer created from the training set
            tokenizer_en is the English tokenizer created from the training set
    Create the instance method def tokenize_dataset(self, data): that creates sub-word tokenizers for our dataset:
        data is a tf.data.Dataset whose examples are formatted as a tuple (pt, en)
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence
        The maximum vocab size should be set to 2**15
        Returns: tokenizer_pt, tokenizer_en
            tokenizer_pt is the Portuguese tokenizer
            tokenizer_en is the English tokenizer


In [None]:
class Dataset:
    """loads and preps a dataset for machine translation"""
    def __init__(self):
        """initializes instance of Dataset"""
        self.data_train = tfds.load(name='ted_hrlr_translate/pt_to_en',
                                    split='train',
                                    as_supervised=True)
        self.data_valid = tfds.load(name='ted_hrlr_translate/pt_to_en',
                                    split='validation',
                                    as_supervised=True)
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
            self.data_train)

    def tokenize_dataset(self, data):
        """
        creates sub-word tokenizers for our dataset:

        data is a tf.data.Dataset whose examples are formatted as a tuple
        (pt, en)
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence
        The maximum vocab size should be set to 2**15
        Returns: tokenizer_pt, tokenizer_en
            tokenizer_pt is the Portuguese tokenizer
            tokenizer_en is the English tokenizer
        """
        STE = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
        tokenizer_pt = STE((pt.numpy() for pt, en in data),
                           target_vocab_size=2**15)
        tokenizer_en = STE((en.numpy() for pt, en in data),
                           target_vocab_size=2**15)

        return tokenizer_pt, tokenizer_en

In [None]:
data = Dataset()
for pt, en in data.data_train.take(1):
    print(pt.numpy().decode('utf-8'))
    print(en.numpy().decode('utf-8'))
for pt, en in data.data_valid.take(1):
    print(pt.numpy().decode('utf-8'))
    print(en.numpy().decode('utf-8'))
print(type(data.tokenizer_pt))
print(type(data.tokenizer_en))

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
tinham comido peixe com batatas fritas ?
did they eat fish and chips ?
<class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>
<class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>


# 1. Encode Tokens

Update the class ```Dataset```:

    Create the instance method def encode(self, pt, en): that encodes a translation into tokens:
        pt is the tf.Tensor containing the Portuguese sentence
        en is the tf.Tensor containing the corresponding English sentence
        The tokenized sentences should include the start and end of sentence tokens
        The start token should be indexed as vocab_size
        The end token should be indexed as vocab_size + 1
        Returns: pt_tokens, en_tokens
            pt_tokens is a np.ndarray containing the Portuguese tokens
            en_tokens is a np.ndarray. containing the English tokens


In [None]:
def encode(self, pt, en):
    """
    encodes a translation into tokens:

    pt is the tf.Tensor containing the Portuguese sentence
    en is the tf.Tensor containing the corresponding English sentence
    The tokenized sentences should include the start and end of sentence tokens
    The start token should be indexed as vocab_size
    The end token should be indexed as vocab_size + 1
    Returns: pt_tokens, en_tokens
        pt_tokens is a np.ndarray containing the Portuguese tokens
        en_tokens is a np.ndarray. containing the English tokens
    """
    pt_start = self.tokenizer_pt.vocab_size
    en_start = self.tokenizer_en.vocab_size
    pt_tokens = self.tokenizer_pt.encode(pt.numpy())
    en_tokens = self.tokenizer_en.encode(en.numpy())

    return ([pt_start] + pt_tokens + [pt_start + 1],
            [en_start] + en_tokens + [en_start + 1])

Dataset.encode = encode

In [None]:
data = Dataset()
for pt, en in data.data_train.take(1):
    print(data.encode(pt, en))
for pt, en in data.data_valid.take(1):
    print(data.encode(pt, en))

([30138, 6, 36, 17925, 13, 3, 3037, 1, 4880, 3, 387, 2832, 18, 18444, 1, 5, 8, 3, 16679, 19460, 739, 2, 30139], [28543, 4, 56, 15, 1266, 20397, 10721, 1, 15, 100, 125, 352, 3, 45, 3066, 6, 8004, 1, 88, 13, 14859, 2, 28544])
([30138, 289, 15409, 2591, 19, 20318, 26024, 29997, 28, 30139], [28543, 93, 25, 907, 1366, 4, 5742, 33, 28544])


# 2. TF Encode

Update the class ```Dataset```:

    Create the instance method def tf_encode(self, pt, en): that acts as a tensorflow wrapper for the encode instance method
        Make sure to set the shape of the pt and en return tensors
    Update the class constructor def __init__(self):
        update the data_train and data_validate attributes by tokenizing the examples


In [None]:
def tf_encode(self, pt, en):
    """
    Acts as a tensorflow wrapper for the encode instance method

    Make sure to set the shape of the pt and en return tensors
    """
    pt_tokens, en_tokens = tf.py_function(
        self.encode, [pt, en], (tf.int64, tf.int64))
    pt_tokens.set_shape([None])
    en_tokens.set_shape([None])

    return pt_tokens, en_tokens

Dataset.tf_encode = tf_encode

In [None]:
def __init__(self):
    """initializes instance of Dataset"""
    self.data_train = tfds.load(name='ted_hrlr_translate/pt_to_en',
                                split='train',
                                as_supervised=True)
    self.data_valid = tfds.load(name='ted_hrlr_translate/pt_to_en',
                                split='validation',
                                as_supervised=True)
    self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
        self.data_train)
    self.data_train = self.data_train.map(self.tf_encode)
    self.data_valid = self.data_valid.map(self.tf_encode)

Dataset.__init__ = __init__

In [None]:
data = Dataset()
for pt, en in data.data_train.take(1):
    print(pt, en)
for pt, en in data.data_valid.take(1):
    print(pt, en)

tf.Tensor(
[30138     6    36 17925    13     3  3037     1  4880     3   387  2832
    18 18444     1     5     8     3 16679 19460   739     2 30139], shape=(23,), dtype=int64) tf.Tensor(
[28543     4    56    15  1266 20397 10721     1    15   100   125   352
     3    45  3066     6  8004     1    88    13 14859     2 28544], shape=(23,), dtype=int64)
tf.Tensor([30138   289 15409  2591    19 20318 26024 29997    28 30139], shape=(10,), dtype=int64) tf.Tensor([28543    93    25   907  1366     4  5742    33 28544], shape=(9,), dtype=int64)


# 3. Pipeline

Update the class ```Dataset``` to set up the data pipeline:

    Update the class constructor def __init__(self, batch_size, max_len):
        batch_size is the batch size for training/validation
        max_len is the maximum number of tokens allowed per example sentence
        update the data_train attribute by performing the following actions:
            filter out all examples that have either sentence with more than max_len tokens
            cache the dataset to increase performance
            shuffle the entire dataset
            split the dataset into padded batches of size batch_size
            prefetch the dataset using tf.data.experimental.AUTOTUNE to increase performance
        update the data_validate attribute by performing the following actions:
            filter out all examples that have either sentence with more than max_len tokens
            split the dataset into padded batches of size batch_size


In [None]:
def __init__(self, batch_size, max_len):
    """initializes instance of Dataset"""
    self.data_train = tfds.load(name='ted_hrlr_translate/pt_to_en',
                                split='train',
                                as_supervised=True)
    self.data_valid = tfds.load(name='ted_hrlr_translate/pt_to_en',
                                split='validation',
                                as_supervised=True)
    self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
        self.data_train)
    self.data_train = self.data_train.map(self.tf_encode)
    self.data_valid = self.data_valid.map(self.tf_encode)

    def filter_len(pt, en):
        """Checks length of both parts of a tuple again max_len"""
        return tf.logical_and(tf.size(pt) <= max_len, tf.size(en) <= max_len)

    self.data_train = self.data_train.filter(filter_len)
    self.data_train = self.data_train.cache()
    self.data_train = self.data_train.shuffle(2**15,
                                              reshuffle_each_iteration=True)
    self.data_train = self.data_train.padded_batch(batch_size)
    self.data_train = self.data_train.prefetch(tf.data.experimental.AUTOTUNE)

    self.data_valid = self.data_valid.filter(filter_len)
    self.data_valid = self.data_valid.padded_batch(batch_size)

Dataset.__init__ = __init__

In [None]:
tf.compat.v1.set_random_seed(0)
data = Dataset(32, 40)
for pt, en in data.data_train.take(1):
    print(pt, en)
for pt, en in data.data_valid.take(1):
    print(pt, en)

tf.Tensor(
[[30138    21     5 ...     0     0     0]
 [30138    32    13 ...     0     0     0]
 [30138     7   880 ...     0     0     0]
 ...
 [30138   418  8287 ...     0     0     0]
 [30138   113  2338 ...     0     0     0]
 [30138   131    26 ... 17432    34 30139]], shape=(32, 32), dtype=int64) tf.Tensor(
[[28543    18   304 ...     0     0     0]
 [28543    13    17 ...     0     0     0]
 [28543    17   143 ...     0     0     0]
 ...
 [28543    15    30 ...     0     0     0]
 [28543   192 11962 ...     0     0     0]
 [28543    59    11 ...    19    71 28544]], shape=(32, 36), dtype=int64)
tf.Tensor(
[[30138   289 15409 ...     0     0     0]
 [30138    86   168 ...     0     0     0]
 [30138  5036     9 ...     0     0     0]
 ...
 [30138  1157 29927 ...     0     0     0]
 [30138    33   837 ...     0     0     0]
 [30138   126  3308 ...     0     0     0]], shape=(32, 32), dtype=int64) tf.Tensor(
[[28543    93    25 ...     0     0     0]
 [28543    11    20 ...     0  


# 4. Create Masks

Create the function ```def create_masks(inputs, target)```: that creates all masks for training/validation:

    inputs is a tf.Tensor of shape (batch_size, seq_len_in) that contains the input sentence
    target is a tf.Tensor of shape (batch_size, seq_len_out) that contains the target sentence
    This function should only use tensorflow operations in order to properly function in the training step
    Returns: encoder_mask, combined_mask, decoder_mask
        encoder_mask is the tf.Tensor padding mask of shape (batch_size, 1, 1, seq_len_in) to be applied in the encoder
        combined_mask is the tf.Tensor of shape (batch_size, 1, seq_len_out, seq_len_out) used in the 1st attention block in the decoder to pad and mask future tokens in the input received by the decoder. It takes the maximum between a look ahead mask and the decoder target padding mask.
        decoder_mask is the tf.Tensor padding mask of shape (batch_size, 1, 1, seq_len_in) used in the 2nd attention block in the decoder.


In [30]:
def create_masks(inputs, target):
    """
    creates all masks for training/validation:

    inputs is a tf.Tensor of shape (batch_size, seq_len_in) that contains the
    input sentence

    target is a tf.Tensor of shape (batch_size, seq_len_out) that contains the
    target sentence

    This function should only use tensorflow operations in order to properly
    function in the training step

    Returns: encoder_mask, combined_mask, decoder_mask

        encoder_mask is the tf.Tensor padding mask of shape (batch_size, 1, 1,
        seq_len_in) to be applied in the encoder

        combined_mask is the tf.Tensor of shape (batch_size, 1, seq_len_out,
        seq_len_out) used in the 1st attention block in the decoder to pad and
        mask future tokens in the input received by the decoder. It takes the
        maximum between a look ahead mask and the decoder target padding mask.

        decoder_mask is the tf.Tensor padding mask of shape (batch_size, 1, 1,
        seq_len_in) used in the 2nd attention block in the decoder.
    """
    def  padding_mask(input):
        mask = tf.math.equal(input, 0)
        mask = tf.cast(mask, tf.float32)
        mask = mask[:, tf.newaxis, tf.newaxis, :]
        return mask

    def lookahead_mask(seq_len):
        mask = 1 - tf.linalg.band_part(tf.fill((seq_len, seq_len), 1.), -1, 0)
        return mask

    encoder_mask = padding_mask(inputs)
    target_mask = padding_mask(target)
    combined_mask = tf.maximum(target_mask, lookahead_mask(target.shape[1]))
    decoder_mask = padding_mask(inputs)

    # This is for testing
    print(f'batch_size: {inputs.shape[0]}')
    print(f'seq_len_in: {inputs.shape[1]}')
    print(f'seq_len_out: {target.shape[1]}\n')
    print(f'encoder mask shape should be: ({inputs.shape[0]}, 1, 1, {inputs.shape[1]})')
    print(f'encoder mask shape is:        {encoder_mask.shape}\n')
    print(f'decoder mask shape should be: ({inputs.shape[0]}, 1, 1, {inputs.shape[1]})')
    print(f'decoder mask shape is:        {decoder_mask.shape}\n')
    print(f'combined mask shape should be: ({inputs.shape[0]}, 1, {target.shape[1]}, {target.shape[1]})')
    print(f'combined mask shape is:        {combined_mask.shape}\n')
    # End tests

    return encoder_mask, combined_mask, decoder_mask


In [31]:
tf.compat.v1.set_random_seed(0)
data = Dataset(32, 40)
for inputs, target in data.data_train.take(1):
    print(create_masks(inputs, target))

batch_size: 32
seq_len_in: 32
seq_len_out: 36

encoder mask shape should be: (32, 1, 1, 32)
encoder mask shape is:        (32, 1, 1, 32)

decoder mask shape should be: (32, 1, 1, 32)
decoder mask shape is:        (32, 1, 1, 32)

combined mask shape should be: (32, 1, 36, 36)
combined mask shape is:        (32, 1, 36, 36)

(<tf.Tensor: shape=(32, 1, 1, 32), dtype=float32, numpy=
array([[[[0., 0., 0., ..., 1., 1., 1.]]],


       [[[0., 0., 0., ..., 1., 1., 1.]]],


       [[[0., 0., 0., ..., 1., 1., 1.]]],


       ...,


       [[[0., 0., 0., ..., 1., 1., 1.]]],


       [[[0., 0., 0., ..., 1., 1., 1.]]],


       [[[0., 0., 0., ..., 0., 0., 0.]]]], dtype=float32)>, <tf.Tensor: shape=(32, 1, 36, 36), dtype=float32, numpy=
array([[[[0., 1., 1., ..., 1., 1., 1.],
         [0., 0., 1., ..., 1., 1., 1.],
         [0., 0., 0., ..., 1., 1., 1.],
         ...,
         [0., 0., 0., ..., 1., 1., 1.],
         [0., 0., 0., ..., 1., 1., 1.],
         [0., 0., 0., ..., 1., 1., 1.]]],


       [[[