These are my notes from learning about the Transformer, based on the articles and code from:

- https://www.tensorflow.org/tutorials/text/transformer
- https://github.com/MorvanZhou/NLP-Tutorials/blob/master/transformer.py
- https://zhuanlan.zhihu.com/p/347904940
- https://datawhalechina.github.io/learn-nlp-with-transformers
- https://www.tensorflow.org/text/tutorials/transformer

All credits go to the above authors.

The notes and comments reflect my understanding at this moment; please correct me if they are wrong.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

2022-01-30 16:37:17.716110: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-30 16:37:17.716504: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Input and Embedding
Takes the vocabulary table(s) and the maximum sequence length. When called with a sequence, it returns the tokenized and embedded list for further processing.

In [None]:
class InputEmbedding():
    """Hold the input/output vocabulary tables and the maximum sequence length.

    Calling an instance is meant to tokenize and embed a sequence; that step
    is not implemented yet.
    """

    def __init__(
        self,
        max_seq_length      : int,
        vocabularies        : list  # shape: (1, x) or (2, x) -> one shared table, or one table for in and one for out.
        ):
        """
        @arg
        max_seq_length:
            The maximum sequence size, stored as `max_s_size`.
        vocabularies:
            A container holding one or two 1-d vocabulary tables.
            With one table, it is shared by input and output.
            With two tables, the first is for input and the second for output.

        @raise
        ValueError:
            If `vocabularies` does not hold exactly one or two tables.
        """
        self.max_s_size = max_seq_length

        table_count = len(vocabularies)
        if table_count == 1:
            # Unwrap the single table so voc_in / voc_out are always a 1-d
            # table, exactly like in the two-table case below.
            self.voc_in = self.voc_out = vocabularies[0]
        elif table_count == 2:
            self.voc_in, self.voc_out = vocabularies
        else:
            # A real exception type is required: raising a plain string is
            # itself a TypeError in Python 3.
            raise ValueError("invalid vocabularies, it should have one or two 1-d list(s).")

    def __call__(self, sequence):
        """
        Tokenize the sequence, and embed

        TODO: not implemented yet; currently returns None.
        """
        return None

## Input embedding and Positional encoding

If the input is [batch_size, sequence_length], then after embedding we get a tensor of shape [batch_size, sequence_length, embedding_dimension].
* The batch size is the number of sequences fed to the transformer in one batch; in plain language, the number of sentences.
* The sequence length is the number of tokens in one input, that is, the number of words in one sentence. Since not all sentences have the same length, this value should be the longest possible length.
* The embedding dimension is decided by the embedding method (e.g. Word2Vec). For the transformer it is set to 512.

In [None]:
class PositionalEmbedding():
    """Token embedding plus a fixed sinusoidal positional encoding."""

    def __init__(
        self,
        #vocabulary_size         : int,
        vocabulary              : np.ndarray,
        max_sequence_length     : int,
        batch_size              : int,
        embed_dimension         : int = 512
        ):
        """
        @arg
        vocabulary:
            The vocabulary table. Its number of elements is used as the
            vocabulary size of the embedding layer.
        max_sequence_length:
            The words size in one sentence.
            If there has 8 words in one sentence, the value is 8.
            However, since it's impossible for all sentences have same size,
            this value should be set to a size of the possible longest sentence.
        batch_size:
            If the sentences to be trained is 10, the batch_size is 10.
            It is only stored on the instance here.
        embed_dimension:
            The dimension of embedding, or feature number.
        """

        self.e_dim  = embed_dimension
        self.b_size = batch_size
        self.s_size = max_sequence_length
        # `size` is an attribute of an ndarray, not a method; np.size also
        # accepts plain lists.
        self.v_size = int(np.size(vocabulary))

        # [sequence_size, embed_dimension]
        positional_encoding = self._build_positional_encoding(self.s_size, self.e_dim)
        self.positional_encoding = tf.cast(positional_encoding, dtype=tf.float32)

        # keep the embedding layer object
        self.embedding = keras.layers.Embedding(
            self.v_size, # vocabulary size of the table given to this instance
            output_dim=embed_dimension,
            embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
        )

    @staticmethod
    def _build_positional_encoding(sequence_length: int, embed_dimension: int) -> np.ndarray:
        """
        Build the [sequence_length, embed_dimension] positional encoding table.

        The formula for calculating the positional encoding is as follows:
            PE(pos, 2i)   = sin(pos/10000^(2i/d_model))
            PE(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        and the formula is not applied to the padding (the first row),
        which stays all zeros.
        """
        positions = np.arange(sequence_length, dtype=np.float64)[:, None]
        columns = np.arange(embed_dimension)[None, :]
        # Columns 2i and 2i+1 share the exponent 2i/d_model.
        exponents = 2.0 * (columns // 2) / embed_dimension
        encoding = positions / np.power(10000.0, exponents)

        # Row 0 is already all zeros (pos == 0) and is deliberately skipped,
        # so the cos columns of the padding row are not turned into ones.
        encoding[1:, 0::2] = np.sin(encoding[1:, 0::2])
        encoding[1:, 1::2] = np.cos(encoding[1:, 1::2])
        return encoding

    def __call__(self, vocabulary_size):
        """
        Embed the input and add the positional encoding.

        vocabulary_size:
            NOTE: despite its name, this argument is passed straight to the
            embedding layer, so it has to be the integer token ids of the
            sequence(s) (values below the vocabulary size), not the size of
            the vocabulary table. The name is kept for compatibility.
            NOTE: the vocabulary sizes of decoder input and encoder target
                  are not necessarily to be same.
        """
        embedded = self.embedding(vocabulary_size)
        # Only take as many positions as the input has, so that inputs
        # shorter than the maximum sequence length still broadcast.
        sequence_length = tf.shape(embedded)[-2]
        return embedded + self.positional_encoding[:sequence_length, :]

## Encoder

In [None]:
class encoder(keras.layers.Layer):
    """Placeholder for the Transformer encoder layer; nothing is implemented yet."""


## Decoder

In [None]:
class decoder:
    """Placeholder for the Transformer decoder; nothing is implemented yet."""

## Transformer model

In [None]:
class transformer(keras.Model):
    """Placeholder for the full Transformer model; no layers are defined yet."""

    def __init__(self):
        # keras.Model sets up its internal state in its own constructor, so
        # it has to run before any attribute is assigned on the instance.
        super().__init__()