These are my notes from learning the Transformer, based on the following articles and code:

- https://www.tensorflow.org/tutorials/text/transformer
- https://github.com/MorvanZhou/NLP-Tutorials/blob/master/transformer.py
- https://zhuanlan.zhihu.com/p/347904940
- https://datawhalechina.github.io/learn-nlp-with-transformers
- https://www.tensorflow.org/text/tutorials/transformer

All credits go to the above authors.

The notes and comments reflect my understanding at this moment; please correct me if they are wrong.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

## Tokenizer
Tokenize the input sequence.

In [None]:
from typing import List

class Tokenizer():
    """Map batches of token strings to their integer vocabulary ids."""

    def __init__(
        self,
        vocabulary          : list
        ):
        """
        @arg
            vocabulary: the vocabulary table. A token's id is its position
                        in this list. The lookup table is built here, so
                        later changes to the list are not picked up.
        """
        self.voc = vocabulary

        # Build the token -> id table once so every lookup is O(1) instead of
        # a linear `list.index` scan per token. `setdefault` keeps the first
        # position of a duplicated token, which is what `list.index` returns.
        self._token_to_id = {}
        for idx, token in enumerate(vocabulary):
            self._token_to_id.setdefault(token, idx)

    def __call__(self, sequence_batch : List[List[str]]) -> List[List[int]]:
        """
        Tokenize the sequence
        @arg
            sequence_batch: batch of input sequences.
                            Each one starts with <BOS> and ends with <EOS>.
        @return
            One list of vocabulary ids per input sequence, in the same order.
        @raise
            ValueError: a token is not in the vocabulary.
        """
        # TODO insert <bos> and <eos> to sequence and padding
        token_to_id = self._token_to_id
        try:
            return [[token_to_id[s] for s in seq] for seq in sequence_batch]
        except KeyError as err:
            # Keep the exception type that `list.index` used to raise.
            raise ValueError(f"{err.args[0]!r} is not in vocabulary") from None
        

## Embedding And Positional encoding

If the input has shape [batch_size, sequence_length], then after embedding we get a tensor of shape [batch_size, sequence_length, embedding_dimension].
* The batch size is the number of sequences fed to the transformer at once; in plain terms, the number of sentences.
* The sequence length is the number of tokens in one input, that is, the number of words in one sentence. Since not all sentences have the same length, this value should be the longest length expected.
* The embedding dimension is the length of each word vector, as with Word2Vec embeddings. For the transformer it is set to 512.

In [None]:
class PositionalEmbedding():
    """Token embedding plus a fixed sinusoidal positional encoding."""

    def __init__(
        self, 
        vocabulary_size         : int,
        max_sequence_length     : int,
        embed_dimension         : int = 512
        ):
        """
        @arg
        vocabulary_size:
            The number of entries in the vocabulary table; used as the
            embedding layer's input dimension. Token id 0 is treated as
            padding by the embedding layer (`mask_zero=True`).
        max_sequence_length:
            The number of words in one sentence.
            If there are 8 words in one sentence, the value is 8.
            However, since not all sentences have the same size,
            this value should be set to the size of the longest possible sentence.
        embed_dimension:
            The dimension of the embedding, or feature number.
        """
        
        self.s_size = max_sequence_length
        self.v_size = vocabulary_size
        self.e_size = embed_dimension
        
        # Stored on the instance because __call__ needs it.
        self.embedding = keras.layers.Embedding(
            input_dim=self.v_size,
            output_dim=self.e_size,
            embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
            mask_zero=True
        )
        
        # Pre-computed once, shape [max_sequence_length, embed_dimension].
        self.positional_encoding = tf.cast(
            self._sinusoidal_encoding(self.s_size, self.e_size),
            dtype=tf.float32
        )

    @staticmethod
    def _sinusoidal_encoding(max_sequence_length, embed_dimension):
        """
        Build the sinusoidal positional encoding table with numpy.

        The formula for calculating the positional encoding is as follows:
            PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
            PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))

        Position 0 is an ordinary position (the first token of a sentence),
        so the formula is applied to it as well; padding is a property of the
        token id, not of the position.

        @return
            float array of shape [max_sequence_length, embed_dimension]
        """
        pos = np.arange(max_sequence_length)[:, None]           # [s, 1]
        # Columns 2i and 2i+1 share the same frequency index i.
        pair = np.arange(embed_dimension)[None, :] // 2         # [1, e]
        angles = pos / np.power(10000., 2. * pair / embed_dimension)

        encoding = np.empty_like(angles)
        encoding[:, 0::2] = np.sin(angles[:, 0::2])
        encoding[:, 1::2] = np.cos(angles[:, 1::2])
        return encoding

    def __call__(self, token_ids=None):
        """
        Embed a batch of token ids and add the positional encoding.

        @arg
            token_ids: integer ids of shape [batch_size, sequence_length],
                       with sequence_length <= max_sequence_length.
        @return
            float tensor of shape [batch_size, sequence_length, embed_dimension]
        @raise
            ValueError: token_ids was not given.
        """
        if token_ids is None:
            raise ValueError(
                "token_ids is required: pass a [batch_size, sequence_length] "
                "batch of token ids"
            )

        token_ids = tf.convert_to_tensor(token_ids)
        seq_len = tf.shape(token_ids)[1]
        # The Keras layer creates its weights on first call, so no separate
        # compile/predict step is needed. The encoding is cut to the actual
        # sequence length and broadcast over the batch axis.
        return self.embedding(token_ids) + self.positional_encoding[:seq_len, :]

## Encoder

In [None]:
class encoder(keras.layers.Layer):
    """Placeholder for the Transformer encoder layer; nothing is implemented yet."""


## Decoder

In [None]:
class decoder:
    """Placeholder for the Transformer decoder; nothing is implemented yet."""

## Transformer model

In [None]:
class transformer(keras.Model):
    """Placeholder for the full Transformer model; no layers are defined yet."""

    def __init__(self):
        # keras.Model has to be initialised before any attribute or layer is
        # set on the instance, so the base constructor is always called first.
        super().__init__()