This is just a note for my learning of transformer, referred to articles and codes from

- https://www.tensorflow.org/tutorials/text/transformer
- https://github.com/MorvanZhou/NLP-Tutorials/blob/master/transformer.py
- https://zhuanlan.zhihu.com/p/347904940

All credits go to the above authors.

The notes / comments are my understanding at this moment, please correct me if they are wrong.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

## Tokenizer
Tokenize the inputting sequence

In [None]:
class Tokenizer():
    def __init__(
        self,
        vocabulary          : np.array # List[str]
        ):
        self.voc = vocabulary

    def __call__(self, sequence_batch : np.array) -> np.array:
        """
        Tokenize the sequence
        @arg
            sequence_batch: batch of sequences inputting.
                            starts with <BOS> and ends with <EOS>
                            the list wrapped in np.array is List[List[str]]
        @return
            np.array, shape [batch, sequence_size]
        """
        # TODO 1. insert <bos> and <eos> to sequence and padding ?
        #      2. performance issue ?
        tk = np.array([[self.voc.index(s) for s in seq] for seq in sequence_batch])
        return tk
        

## Embedding And Positional encoding

If input is [batch_size, sequence_length], after embedding, we get a tensor of [batch_size, sequence_length, embedding_dimension].
* The batch size is the number of batch to transformer, in human's language, the number of sentences.
* The sequence length is the size of samples in one inputting, that is, the words number in one sentence. Since not all sentences have same length, this value should be the possible longest length.
* The embedding dimention is decided by Word2Vec. For transformer make it 512.

In [42]:
class PositionalEmbedding():
    def __init__(
        self, 
        vocabulary_size         : int,
        max_sequence_length     : int,
        embed_dimension         : int = 512,
        padding                 : bool = False, # in case for encoder if masking padding is desired
        ):
        """
        @arg
        embed_dimension:
            The dimmention of embedding, or feature number.
        sequence_length:
            The words size in one sentence. 
            If there has 8 words in one sentence, the value is 8. 
            However, since it's impossible for all sentences have same size, 
            this value should be set to a size of the possibe longest sentence.
        vocabulary:
            The vocabulary table.
        """
        
        self.s_size = max_sequence_length
        self.v_size = vocabulary_size
        self.e_size = embed_dimension
        
        self.embedding = keras.layers.Embedding(
            input_dim=self.v_size,
            output_dim=self.e_size,
            input_length=self.s_size,
            embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
            mask_zero=True
        )
        
        # make positional encoding array
        positional_encoding = np.array(
            [
            [pos / np.power(10000, 2 * i / self.e_size) for i in range(self.e_size)]
            if (not (padding and 0 != pos)) else 
            np.zeros(self.e_size) for pos in np.arange(self.s_size) 
            ])

        # The formula for calculating the positional encoding is as follows:
        # PE(pos, 2i) = sin(pos/10000^(2i/d_model))
        # PE(pos, 2i+1) = cos(pos/10000^(2i/d_model))

        positional_encoding[0:, 0::2] = np.sin(positional_encoding[0:, 0::2])
        positional_encoding[0:, 1::2] = np.cos(positional_encoding[0:, 1::2])
        positional_encoding = positional_encoding[None, :, :]
        self.positional_encoding = tf.cast(positional_encoding, dtype=tf.float32)
        
    def __call__(self, batch : np.array) -> tf.Tensor:
        """
        The batch should be tokenized, which is expected to be the output of Tokenizer.
        The type of batch is np.array, shape -> [batch_size, sequence_szie]

        TODO: ?? embeding * tf.math.sqrt() ??
        """
        return self.embedding(batch) + self.positional_encoding[:, :self.s_size, :]

In [41]:
pe = PositionalEmbedding(5, 4, 3, padding=True)
rst = pe(np.array([[0,1,2,3],[2,3,4,1]]))
rst

<tf.Tensor: shape=(2, 4, 3), dtype=float32, numpy=
array([[[ 0.00576795, -0.0003815 ,  0.00452463],
        [ 0.00479912,  0.00535715, -0.00688996],
        [ 0.00795193, -0.02192049, -0.00800237],
        [ 0.01102773, -0.00101122, -0.01441262]],

       [[ 0.00795193, -0.02192049, -0.00800237],
        [ 0.01102773, -0.00101122, -0.01441262],
        [ 0.01640142, -0.00770752, -0.00339737],
        [ 0.00479912,  0.00535715, -0.00688996]]], dtype=float32)>

## Encoder

In [None]:
class encoder(keras.layers.Layer):
    pass


## Decoder

In [None]:
class decoder():
    pass

## Transfomer model

In [None]:
class transformer(keras.Model):
    def __init__(self):
       pass