# Imports

In [None]:
def install_libaries():
    """Install the third-party packages this notebook imports, using IPython's ``%pip`` magic.

    Called from the import cell when one of the guarded imports fails.
    NOTE(review): the ``%pip`` lines are IPython syntax, so this function only works
    when the file runs as a notebook, not as a plain Python script.
    """
    # gensim is pinned; presumably because `get_keras_embedding` (used further down)
    # is only available in the 3.x series - TODO confirm.
    %pip install -q gensim==3.6.0
    %pip install -q nlppreprocess
    %pip install -q keras
    %pip install -q keras_preprocessing

In [None]:
import sys
# Detect where the notebook is running by looking at the already-imported modules.
# The three cases form ONE if/elif/else chain: with two independent `if` statements
# the final `else` would overwrite a successful Colab detection with "locally".
device: str
if "google.colab" in sys.modules:
    device = "colab"
elif "kaggle_web_client" in sys.modules:
    device = "kaggle"
else:
    device = "locally"

In [None]:
# standard libraries:
from typing import Optional, List, Set, Dict, Tuple
import datetime, os, random, statistics, math, time
# non-standard (third-party) libraries:
import wandb
import tensorflow as tf
import pandas as pd
import seaborn as sns
import numpy as np
import tqdm
import pandas as pd
try:
    import keras
    from keras import layers, Model
    from keras_preprocessing.text import Tokenizer
    from keras_preprocessing.sequence import pad_sequences
    from nlppreprocess import NLP as nlp
    from gensim.scripts.glove2word2vec import glove2word2vec
    from gensim.models.keyedvectors import KeyedVectors
except ModuleNotFoundError as e:
    print(e)
    install_libaries()
except ImportError as e:
    print(e)
    install_libaries()

In [None]:
# Report the interpreter and TensorFlow versions so a run can be reproduced later.
print(
    f"Python version: {sys.version}",
    f"Tensorflow version: {tf.__version__}",
    sep="\n",
)

# Settings

In [None]:
# Seed every random number generator the notebook relies on so runs are repeatable.
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)

# Stricter determinism / precision switches, left disabled:
# keras.utils.set_random_seed(0)
# tf.config.experimental.enable_op_determinism()
# keras.backend.set_floatx("float16")

# Name of the Keras default float type. Later cells compare it against the
# strings "float32" and "float16" and pass it as a dtype.
f_type = keras.backend.floatx()

## Define a strategy - Accelerator optimization 

In [None]:
# Try to attach to a TPU. As in the original cell, a ValueError from the
# resolver/initialisation is treated as "no TPU available".
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    using_tpu = True
except ValueError:
    # Fall back to TensorFlow's default strategy so that `strategy` is always
    # defined, whichever branch ran.
    strategy = tf.distribute.get_strategy()
    using_tpu = False

# Data loading

In [None]:
# Load the article table from wherever the current environment keeps it.
df: pd.DataFrame
if device == "colab":
    # Colab: the CSV lives on Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount("/drive")
    df = pd.read_csv("/drive/MyDrive/final_project/wikipedia_articles.csv")
elif device == "kaggle":
    df = pd.read_csv("../input/wikipedia-promotional-articles/promotional.csv")
else:
    # Local run (the author's laptop).
    df = pd.read_csv("wiki_data/articles.csv")
print(f"the shape of the dataframe: {df.shape}")

# 80% of the rows go to training; the remaining rows are split in half into
# test and validation. `random_state` is the seed of each sampling step.
train_df = df.sample(frac=0.8, random_state=0)
temp_val_df = df.drop(train_df.index)
test_df = temp_val_df.sample(frac=0.5, random_state=0)
val_df = temp_val_df.drop(test_df.index)

# Only the "text" column is used from here on.
train_ser, val_ser, test_ser = (part["text"] for part in (train_df, val_df, test_df))

# Notebook display: number of texts per split.
(train_ser.shape, val_ser.shape, test_ser.shape)

## Creating a tokenizer

In [None]:
oov_token = "[OOV]"
tokenizer = Tokenizer(num_words=8192, oov_token=oov_token)

# Every split goes through the SAME text cleaner before it reaches the tokenizer.
# Previously the vocabulary was fitted on cleaned text while the raw text was
# tokenized, so the tokenizer saw words at encoding time that the cleaner had
# removed from the text it was fitted on.
text_cleaner = nlp()
cleaned_train = list(train_ser.apply(text_cleaner.process).values)
tokenizer.fit_on_texts(cleaned_train)  # the vocabulary is learned from the training split only

list_tokenized_train: List[List[int]] = tokenizer.texts_to_sequences(cleaned_train)
list_tokenized_val: List[List[int]] = tokenizer.texts_to_sequences(
    list(val_ser.apply(text_cleaner.process).values))
list_tokenized_test: List[List[int]] = tokenizer.texts_to_sequences(
    list(test_ser.apply(text_cleaner.process).values))
del cleaned_train  # the cleaned copy is no longer needed once it is tokenized

# The vocabulary size is the `num_words` limit given to the tokenizer above.
vocab_size = tokenizer.get_config()["num_words"]

In [None]:
# Manual check: encode a nonsense word (presumably out-of-vocabulary) and display the result.
tokenizer.texts_to_sequences(["wqe"])

## Tokenizing the data

### chunk too long texts

In [None]:
# Longest token sequence fed to the model; larger when a TPU is available.
max_seq_len: int = 256 if using_tpu else 64


def chunk_double_list(mat: List[List[int]], max_len: int = max_seq_len) -> List[List[int]]:
    """Split every token list in ``mat`` into consecutive chunks of at most ``max_len`` tokens.

    The last chunk of a text may be shorter than ``max_len``, and a text shorter than
    ``max_len`` is kept as a single chunk (the previous implementation silently dropped
    both). Empty token lists produce no chunks.

    inputs: mat: list of token-id lists, max_len: maximal chunk length (must be positive)
    returns: flat list of chunks, in the order of the input texts
    raises: ValueError if max_len is not positive
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be positive, got {max_len}")
    return [
        tokens[start:start + max_len]
        for tokens in mat
        # stepping by max_len visits the trailing partial chunk as well
        for start in range(0, len(tokens), max_len)
    ]

In [None]:
# Cut the token lists of every split into chunks, using the default maximal length.
chunked_train, chunked_val, chunked_test = (
    chunk_double_list(tokenized_split)
    for tokenized_split in (list_tokenized_train, list_tokenized_val, list_tokenized_test)
)

## Padding

In [None]:
# Shortest chunks first, so that neighbouring rows (and therefore the rows of one
# batch) have similar lengths when training.
chunked_train.sort(key=len)
chunked_val.sort(key=len)
chunked_test.sort(key=len)

# Token id reserved for padding, as a Python int and as an int32 tensor.
pad_int: int = 0
pad_ten = tf.constant(pad_int, dtype=tf.int32)

# "pre" padding puts the pad value in FRONT of the tokens.
# NOTE(review): the annotations say tf.Tensor, but pad_sequences presumably returns
# NumPy arrays - confirm against its documentation.
padded_train: tf.Tensor = pad_sequences(chunked_train, value=pad_int, padding="pre")
padded_val: tf.Tensor = pad_sequences(chunked_val, value=pad_int, padding="pre")
padded_test: tf.Tensor = pad_sequences(chunked_test, value=pad_int, padding="pre")

## Train test val split

In [None]:
# Number of sequences per batch; larger when a TPU is available.
batch_size: int = 128 if using_tpu else 4


def ten_to_dataset(tokenized_ten: tf.Tensor) -> tf.data.Dataset:
    """Converts padded, tokenized texts (after all preprocessing) to a batched tf.data.Dataset.

    input: tokenized_ten: 2D array-like of token ids, one row per sequence
    returns: dataset whose elements are batches of exactly ``batch_size`` rows

    The trailing partial batch is dropped: the train/validate functions are compiled
    with an input signature whose first dimension is fixed to ``batch_size``, so a
    smaller final batch would be rejected.
    """
    dataset: tf.data.Dataset = tf.data.Dataset.from_tensor_slices(tokenized_ten)
    return dataset.batch(batch_size, drop_remainder=True)

# Wrap each padded split in a batched dataset ...
train_dataset, val_dataset, test_dataset = (
    ten_to_dataset(padded_split)
    for padded_split in (padded_train, padded_val, padded_test)
)

# ... and materialise the batches as plain Python lists.
list_train_set, list_val_set, list_test_set = (
    list(batched_split)
    for batched_split in (train_dataset, val_dataset, test_dataset)
)

## Clear memory

In [None]:
# Drop every intermediate object of the preprocessing pipeline; only the lists of
# batches built above are kept. Grouped per split, from raw data to dataset.
del train_df, train_ser, list_tokenized_train, chunked_train, padded_train, train_dataset
del temp_val_df, val_df, val_ser, list_tokenized_val, chunked_val, padded_val, val_dataset
del test_df, test_ser, list_tokenized_test, chunked_test, padded_test, test_dataset

# Model


## Positional encoding

The formula for calculating the positional encoding is as follows:

$${PE_{(pos, 2i)} = \sin(pos / 10000^{2i / d_{model}})} $$
$${PE_{(pos, 2i+1)} = \cos(pos / 10000^{2i / d_{model}})} $$

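where $d_{model}$ is the model dimension, $pos$ is the position of the token in the sequence, and $i$ indexes the sine/cosine pairs of the embedding vector ($0 \le i < d_{model}/2$).
This formula is taken from the paper "Attention Is All You Need" (Vaswani et al., 2017).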

In [None]:
def create_positional_encoding(max_len: int, d_model: int) -> tf.Tensor:
    """Builds the sinusoidal positional-encoding table.

    Used when a SeTransformer is constructed.
    inputs: max_len: number of positions, d_model: model dimension
    returns: tf.Tensor of shape (1, max_len, d_model) and dtype f_type;
    the leading axis of size 1 broadcasts over the batch dimension.
    """
    positions = np.arange(max_len)[:, np.newaxis]  # (max_len, 1)
    dims = np.arange(d_model)[np.newaxis, :]  # (1, d_model)

    # The NumPy scalar type of the divisor follows the Keras float type.
    if f_type == "float32":
        divisor = np.float32(d_model)
    else:
        divisor = np.float16(d_model)

    # Columns 2i and 2i+1 share the same rate: 1 / 10000^(2i / d_model).
    angle_rates = 1 / np.power(10000, ((2 * (dims // 2)) / divisor))
    table = positions * angle_rates  # (max_len, d_model), angles in radians

    # Even columns (0, 2, 4, ...) get the sine, odd columns (1, 3, 5, ...) the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add the broadcast axis and convert to the model's float type.
    return tf.cast(table[np.newaxis, ...], dtype=f_type)

## Masking

Mask all the pad tokens in the batch of sequences. This ensures that the model does not treat padding as input. The mask marks where the pad value 0 is present: it is 1 at those positions and 0 everywhere else.

The look-ahead mask is used to mask the future tokens in a sequence. In other words, the mask indicates which entries should not be used.

This means that to predict the third token, only the first and second token will be used. Similarly to predict the fourth token, only the first, second and the third tokens will be used and so on.

In [None]:
def create_masks(inp: tf.Tensor, tar: tf.Tensor, pad_ten: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Creates all the masks needed for the model. A 1 marks a position that must be ignored.

    input: inp: tf.Tensor of shape (batch_size, seq_len),
           tar: tf.Tensor of shape (batch_size, set_size),
           pad_ten: the value that marks padding
    Returns: tuple of (padding_mask, look_ahead_mask)
    padding_mask: tf.Tensor of shape (batch_size, 1, 1, seq_len), 1 where inp is padding
    look_ahead_mask: element-wise maximum of the target padding mask (batch_size, 1, 1, set_size)
    and the future-token mask (set_size, set_size), i.e. broadcast to
    (batch_size, 1, set_size, set_size)
    """

    def pad_positions(seq: tf.Tensor) -> tf.Tensor:
        """1 where seq equals the pad value, 0 elsewhere; (batch_size, len) -> (batch_size, 1, 1, len)."""
        is_pad = tf.cast(tf.math.equal(seq, pad_ten), f_type)
        # two singleton axes are inserted so the mask broadcasts against attention logits
        return is_pad[:, tf.newaxis, tf.newaxis, :]

    # Mask for the encoder input.
    padding_mask: tf.Tensor = pad_positions(inp)

    # Future-token mask for the decoder's first attention block: everything strictly
    # above the diagonal is 1, so a position cannot look at later positions.
    set_size: int = tar.shape[1]
    lower_triangle = tf.linalg.band_part(tf.ones((set_size, set_size)), -1, 0)
    future_mask = tf.cast(1 - lower_triangle, dtype=f_type)  # (set_size, set_size)

    # A target position is hidden if it is padding OR lies in the future.
    look_ahead_mask = tf.maximum(pad_positions(tar), future_mask)

    return padding_mask, look_ahead_mask

## Layers and blocks

In [None]:
class ScaledDotProductAttention(layers.Layer):
    """Computes softmax(q @ k^T * scale + mask_penalty) @ v."""

    def __init__(self, d_model: int, **kwargs):
        super().__init__(**kwargs)
        # The logits are multiplied by d_model ** -0.5, where d_model is whatever
        # dimension the caller passes in.
        self.scale = tf.math.pow(tf.cast(d_model, f_type), -0.5)
        self.softmax = layers.Softmax(axis=-1)

    def call(self, q: tf.Tensor, k: tf.Tensor, v: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Scaled Dot-Product Attention
        input:
        q: tf.Tensor of shape (..., seq_len_q, depth),
        k: tf.Tensor of shape (..., seq_len_k, depth),
        v: tf.Tensor of shape (..., seq_len_k, depth_v),
        mask: Optional[tf.Tensor] broadcastable to (..., seq_len_q, seq_len_k); 1 marks a hidden position
        output: tf.Tensor of shape (..., seq_len_q, depth_v)"""
        # q @ transpose(k), then scaling -> (..., seq_len_q, seq_len_k)
        logits: tf.Tensor = tf.matmul(q, k, transpose_b=True) * self.scale

        if mask is not None:
            # Hidden positions receive a very large negative logit, so the softmax
            # gives them (almost) no weight. In float16 the most negative value the
            # dtype can represent is used instead of -1e9 (changed to prevent nan's).
            # noinspection PyTypeChecker
            if f_type == "float16":
                logits += (mask * tf.float16.min)
            else:
                logits += (mask * -1e9)

        # Normalise over the key axis and take the weighted sum of the values.
        weights = self.softmax(logits)  # (..., seq_len_q, seq_len_k)
        return tf.matmul(weights, v)

In [None]:
class MyMultiHeadAttention(Model):
    """Multi-head attention built from Dense projections and ScaledDotProductAttention.

    Author's note: you can use the built-in Keras multi-head attention layer instead,
    but it caused a bug for me.
    """
    def __init__(self, num_heads: int, d_model: int, **kwargs):
        """num_heads: number of attention heads, d_model: model dimension.
        Raises ValueError if d_model is not divisible by num_heads."""
        super(MyMultiHeadAttention, self).__init__(**kwargs)
        if d_model % num_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // self.num_heads  # dimension of a single head

        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)

        self.dense = layers.Dense(d_model)
        # The dot products are taken between per-head vectors of size `depth`, so the
        # logits are scaled by 1/sqrt(depth) (the d_k of "Attention Is All You Need"),
        # not by 1/sqrt(d_model).
        self.sdpa = ScaledDotProductAttention(self.depth)


    def split_heads(self, x: tf.Tensor, batch_size: int) -> tf.Tensor:
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])


    def call(self, v_k: tf.Tensor, q: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
        """inputs:
        v_k: tf.Tensor of shape (batch_size, seq_len_k, d_model); keys and values are both projected from it
        q: tf.Tensor of shape (batch_size, seq_len_q, d_model)
        mask: tf.Tensor broadcastable to (batch_size, num_heads, seq_len_q, seq_len_k), 1 marks a hidden position
        output: tf.Tensor of shape (batch_size, seq_len_q, d_model)"""
        batch_size = tf.shape(q)[0]

        q: tf.Tensor = self.wq(q)  # (batch_size, seq_len_q, d_model)
        k: tf.Tensor = self.wk(v_k)  # (batch_size, seq_len_k, d_model)
        v: tf.Tensor = self.wv(v_k)  # (batch_size, seq_len_k, d_model)

        q: tf.Tensor = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k: tf.Tensor = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v: tf.Tensor = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_k, depth)

        # scaled_attention.shape should be (batch_size, num_heads, seq_len_q, depth)
        scaled_attention = self.sdpa(q, k, v, mask)

        # Move the head axis next to depth again: (batch_size, seq_len_q, num_heads, depth)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # Merge the heads back together: (batch_size, seq_len_q, d_model)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output

In [None]:
class PointWiseFeedForward(Model):
    """Two Dense layers applied to the last axis: d_model -> dff (relu) -> d_model."""

    def __init__(self, d_model: int, dff: int, **kwargs):
        super().__init__(**kwargs)
        # expansion to the hidden size: (batch_size, seq_len, dff)
        self.layer1 = layers.Dense(dff, activation="relu")
        # projection back to the model size: (batch_size, seq_len, d_model)
        self.layer2 = layers.Dense(d_model)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Gets and returns a tensor of shape (batch_size, seq_len, d_model) and dtype keras.backend.floatx()"""
        hidden = self.layer1(x)
        return self.layer2(hidden)

In [None]:
class EncoderBlock(Model):
    """One encoder block: self-attention followed by a point-wise feed-forward network.

    NOTE(review): a single LayerNormalization instance and a single Dropout instance are
    applied after both sub-layers, so both normalisations share the same parameters.
    """

    def __init__(self, d_model: int, num_heads: int, dff: int, drop_out_rate: float, **kwargs):
        super().__init__(**kwargs)

        self.mha = MyMultiHeadAttention(num_heads=num_heads, d_model=d_model)
        self.ffn = PointWiseFeedForward(d_model, dff)

        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)

        self.dropout = layers.Dropout(drop_out_rate)

    def call(self, x: tf.Tensor, training: bool, mask: tf.Tensor) -> tf.Tensor:
        """x: (batch_size, input_seq_len, d_model); returns a tensor of the same shape."""
        # Self-attention: x supplies the keys/values as well as the queries.
        attended = self.dropout(self.mha(x, x, mask), training=training)
        # The residual connection around attention (layer_norm(x + attention)) is left
        # out on purpose; the author suspected it "might be data leak".
        normed_attention = self.layer_norm(attended)  # (batch_size, input_seq_len, d_model)

        # Feed-forward sub-layer, this time WITH its residual connection.
        transformed = self.dropout(self.ffn(normed_attention), training=training)
        return self.layer_norm(normed_attention + transformed)  # (batch_size, input_seq_len, d_model)

In [None]:
class DecoderBlock(Model):
    """One decoder block: masked self-attention, attention over the encoder output, feed-forward.

    NOTE(review): the same `mha` instance serves both attention steps and the same
    `layer_norm` instance follows all three sub-layers, so their weights are shared.
    """

    def __init__(self, d_model: int, num_heads: int, dff: int, rate: float, **kwargs):
        super().__init__(**kwargs)

        self.mha = MyMultiHeadAttention(num_heads=num_heads, d_model=d_model)

        self.ffn = PointWiseFeedForward(d_model, dff)

        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)

        self.dropout = layers.Dropout(rate)

    def call(self, x: tf.Tensor, enc_output: tf.Tensor, look_ahead_mask: tf.Tensor, padding_mask: tf.Tensor, training):
        """x: (batch_size, set_size, d_model), enc_output: (batch_size, input_seq_len, d_model).
        Returns a tensor of shape (batch_size, set_size, d_model)."""
        # 1) Masked self-attention over the target. The residual connection is left out
        #    on purpose; the author suspected it "might be data leak".
        self_attended = self.dropout(self.mha(x, x, look_ahead_mask), training=training)
        normed_self = self.layer_norm(self_attended)  # (batch_size, set_size, d_model)

        # 2) Attention over the encoder output: keys/values come from the encoder,
        #    queries from the previous step.
        cross_attended = self.dropout(self.mha(enc_output, normed_self, padding_mask), training=training)
        normed_cross = self.layer_norm(cross_attended + normed_self)  # (batch_size, set_size, d_model)

        # 3) Point-wise feed-forward network with its residual connection.
        transformed = self.dropout(self.ffn(normed_cross), training=training)
        return self.layer_norm(transformed + normed_cross)  # (batch_size, set_size, d_model)

In [None]:
class Encoder(Model):
    """Stack of EncoderBlocks applied to already-embedded input."""

    def __init__(self, pos_encoding: tf.Tensor, num_blocks: int, d_model: int, num_heads: int, dff: int, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        self.d_model = d_model
        self.num_blocks = num_blocks
        self.pos_encoding = pos_encoding

        # the stack of encoder blocks
        self.enc_blocks = [EncoderBlock(d_model, num_heads, dff, rate) for _ in range(num_blocks)]
        self.dropout = layers.Dropout(rate)
        self.scale = tf.math.sqrt(tf.cast(self.d_model, f_type))

    def call(self, x: tf.Tensor, training, mask: tf.Tensor) -> tf.Tensor:
        """x: embedded input of shape (batch_size, input_seq_len, d_model); returns the same shape."""
        seq_len = tf.shape(x)[1]

        # Scale the embeddings by sqrt(d_model), then add the positional encoding of
        # the first seq_len positions.
        x = x * self.scale
        x = x + self.pos_encoding[:, :seq_len, :]  # (batch_size, input_seq_len, d_model)
        x = self.dropout(x, training=training)

        for enc_block in self.enc_blocks:
            x = enc_block(x, training, mask)  # (batch_size, input_seq_len, d_model)

        return x

In [None]:
class Decoder(Model):
    """Embeds the target tokens and runs them through a stack of DecoderBlocks."""

    def __init__(self, pos_encoding, num_blocks: int, d_model: int, num_heads: int, dff: int,
                 vocab_size: int, rate: float, **kwargs):
        super().__init__(**kwargs)

        self.scale = tf.math.sqrt(tf.cast(d_model, f_type))
        self.num_blocks = num_blocks
        self.pos_encoding = pos_encoding

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.dec_blocks = [DecoderBlock(d_model, num_heads, dff, rate) for _ in range(num_blocks)]
        self.dropout = layers.Dropout(rate)

    def call(self, tar: tf.Tensor, enc_output: tf.Tensor, training: bool,
             look_ahead_mask: tf.Tensor, padding_mask: tf.Tensor) -> tf.Tensor:
        """tar: token ids of shape (batch_size, set_size); returns (batch_size, set_size, d_model)."""
        seq_len = tf.shape(tar)[1]

        # Embed, scale by sqrt(d_model) and add the positional encoding.
        x = self.embedding(tar) * self.scale  # (batch_size, set_size, d_model)
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        for dec_block in self.dec_blocks:
            x = dec_block(
                x=x,
                enc_output=enc_output,
                look_ahead_mask=look_ahead_mask,
                padding_mask=padding_mask,
                training=training,
            )

        return x  # (batch_size, set_size, d_model)

In [None]:
class EmbeddingTransposed(layers.Layer):
    """Output projection that reuses ("ties") the weight matrix of an Embedding layer.

    The inputs are multiplied by the transpose of the embedding matrix, which maps
    vectors of the embedding dimension to one score per embedding row.
    """

    def __init__(self, tied_to: layers.Embedding = None, activation: Optional[str] = None, **kwargs):
        """tied_to: the Embedding layer whose weights are reused (required),
        activation: optional name of an activation applied to the output.
        Raises ValueError if tied_to is not given."""
        if tied_to is None:
            # The default only exists for signature compatibility; without a layer
            # to tie to there is nothing this layer could compute.
            raise ValueError("EmbeddingTransposed requires the `tied_to` embedding layer")
        # This layer is trainable exactly when the tied embedding is.
        super(EmbeddingTransposed, self).__init__(trainable=tied_to.trainable, **kwargs)
        self.tied_to = tied_to
        self.activation = keras.activations.get(activation)

    def build(self, input_shape):
        """Picks up the embedding matrix; the tied layer must already have created it."""
        if not self.tied_to.weights:
            raise ValueError("the tied embedding layer must be built (called at least once) "
                             "before EmbeddingTransposed is used")
        self.custom_weights = self.tied_to.weights[0]
        self.built = True

    def compute_output_shape(self, input_shape):
        """Same as the input shape, with the last axis replaced by the number of embedding rows."""
        num_rows = keras.backend.int_shape(self.tied_to.weights[0])[0]
        return tuple(input_shape[:-1]) + (num_rows,)

    def call(self, inputs, mask=None):
        output = keras.backend.dot(inputs, keras.backend.transpose(self.custom_weights))
        if self.activation is not None:
            output = self.activation(output)
        return output

    def get_config(self):
        # NOTE(review): `tied_to` is not part of the config, so the layer cannot be
        # rebuilt from this config alone.
        config = {"activation": keras.activations.serialize(self.activation)}
        base_config = super(EmbeddingTransposed, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

## The full model

In [None]:
def count_layers(my_model) -> int:
    """Counts the layers of a keras model recursively.

    A keras.Model contributes the sum over its `.layers`; any other Layer counts as 1;
    everything else counts as 0.
    """
    if isinstance(my_model, keras.Model):
        return sum(count_layers(child) for child in my_model.layers)
    return 1 if isinstance(my_model, layers.Layer) else 0

In [None]:
def line_prepender(_from: str, to: str, add: str):
    """Write ``add`` as a new first line, followed by the content of ``_from``, into ``to``.

    inputs: _from: path of the text file to read, to: path of the file to write,
            add: the line to put in front (without the trailing newline)
    Files are read and written as UTF-8. The content is streamed, so large files are
    not loaded into memory as a whole, unless ``_from`` and ``to`` are the same file.
    """
    import os
    import shutil

    if os.path.exists(to) and os.path.samefile(_from, to):
        # Opening the target for writing would truncate the source before it is
        # read, so in-place prepending has to buffer the content first.
        with open(_from, "r", encoding="utf-8") as f:
            content = f.read()
        with open(to, "w", encoding="utf-8") as f:
            f.write(add + "\n" + content)
        return

    with open(_from, "r", encoding="utf-8") as src, open(to, "w", encoding="utf-8") as dst:
        dst.write(add + "\n")
        shutil.copyfileobj(src, dst)

In [None]:
if device == "kaggle":
    glove_txt_path = "../input/glove6b/glove.6B.300d.txt"
else:
    !wget http://nlp.stanford.edu/data/glove.6B.300d.zip
    !apt install unzip
    !unzip "glove.6B.300d.zip"
    glove_txt_path = "glove.6B.300d.txt"
pro_path = "./edited_glove.txt"
out_put_path = "./gensim_glove_vectors.txt"

glove2word2vec(glove_input_file=glove_txt_path, word2vec_output_file=out_put_path)
line_prepender(glove_txt_path, pro_path, "400000 300")
keyed_vectors = KeyedVectors.load_word2vec_format(pro_path, binary=False)
glove_embedding = keyed_vectors.get_keras_embedding()

In [None]:
class SeTransformer(Model):
    """The base architecture of my models in this project.

    The input tokens are embedded with the pre-built `glove_embedding`, encoded, and the
    decoder output is projected back through the transposed embedding matrix (softmax).
    """
    def __init__(self, num_blocks: int, d_model: int, num_heads: int, dff: int,
                 vocab_size: int, max_len: int, rate: float, pad_int: int, **kwargs):
        super(SeTransformer, self).__init__(**kwargs)  # forwards the keyword arguments to keras.Model.__init__
        self.pad_int = pad_int
        self.max_len = max_len  # kept so that build_graph can size its symbolic input
        pos_encoding = create_positional_encoding(max_len, d_model)
        self.encoder = Encoder(pos_encoding, num_blocks, d_model, num_heads, dff, rate)
        self.decoder = Decoder(pos_encoding, num_blocks, d_model, num_heads, dff, vocab_size, rate)
        # NOTE(review): the token ids fed to this embedding come from the Keras tokenizer,
        # while the rows of glove_embedding presumably follow GloVe's own vocabulary order -
        # confirm that the two index spaces actually agree.
        self.embedding = glove_embedding
        self.emb_trans = EmbeddingTransposed(self.embedding, "softmax")


    def get_layer_count(self) -> int:
        """Number of layers in the model, counted recursively."""
        return count_layers(self)


    def summary(self, **kwargs) -> None:
        """Prints the Keras summary followed by the recursive layer count."""
        super().summary(**kwargs)
        print(f"The model have {self.get_layer_count()} layers")


    def count_params(self) -> int:
        """Sums count_params() of the encoder, the decoder and the input embedding.

        NOTE(review): Keras' count_params presumably counts every weight, trainable or
        not, so the embedding matrix is part of the total - confirm before reporting the
        result as "trainable parameters".
        Raises an error if called before building the model"""
        param_count: int = self.encoder.count_params() + self.decoder.count_params() + self.embedding.count_params()
        return param_count


    def build_graph(self) -> keras.Model:
        """Returns a functional keras model wrapping this model's call (e.g. for plotting).

        The Input shapes exclude the batch axis (Keras adds it), and `inputs` holds only
        the two symbolic tensors; the training flag is passed to `call`, not to keras.Model.
        """
        inp = layers.Input(shape=(self.max_len,), dtype=tf.int32)
        tar = layers.Input(shape=(set_size,), dtype=tf.int32)
        return keras.Model(inputs=[inp, tar], outputs=self.call([inp, tar], True))


    def call(self, inputs: List[tf.Tensor], training: bool) -> tf.Tensor:
        """inputs: [inp, tar] with inp of shape (batch_size, max_seq_len) and tar of shape
        (batch_size, set_size); returns the output distribution for every target position."""
        inp, tar = inputs
        x = self.embedding(inp)  # (batch_size, max_seq_len, d_model)
        padding_mask, look_ahead_mask = create_masks(inp, tar, self.pad_int)
        enc_output = self.encoder(x, training, padding_mask)  # (batch_size, max_seq_len, d_model)
        dec_output = self.decoder(tar, enc_output, training, look_ahead_mask, padding_mask)  # (batch_size, set_size, d_model)
        final_output = self.emb_trans(dec_output)  # (batch_size, set_size, rows of the embedding matrix)
        return final_output

# Training the model

In [None]:
# model_art = wandb.use_artifact(f"{model_collection_name}:latest")
# model_path = model_art.get_path("model.pb").download()
# model = tf.saved_model.load(model_path)

## Hyper-Parameters

In [None]:
# --- optimisation ---
learning_rate: float = 0.005
dropout_rate: float = 0.1

# --- model size ---
d_model: int = 300  # 300, like the 300-dimensional GloVe file loaded above
num_heads: int = 30
dff: int = 1024  # hidden size of the point-wise feed-forward networks
num_blocks: int = 16  # number of encoder blocks and of decoder blocks

# --- generation ---
set_size: int = 2  # number of tokens predicted per step
# number of sets in each sequence; minus one because the first set is not predicted
num_sets: int = (max_seq_len // set_size) - 1

## Weights and Biases

In [None]:
 # Log in to Weights & Biases; the whole cell is skipped unless a TPU is in use.
 if using_tpu:
    if device == "kaggle":
        # On Kaggle the API key is read from the notebook's user secrets and exported
        # through the WANDB_API_KEY environment variable.
        try:
            from kaggle_secrets import UserSecretsClient
            user_secrets = UserSecretsClient()
            os.environ["WANDB_API_KEY"] = user_secrets.get_secret("WANDB_API_KEY")
        except Exception:
            # Best effort: any failure only prints a hint; the login below still runs.
            print("please enter your weights and biases API key")
    !wandb login

In [None]:
# try: 
#     artifect = use_artifact(artifact, use_as=None)
#     art = wandb.use_artifact(...)
#     wandb.run.link_artifact(art, "yonikremer/final_project_owned/version0")

In [None]:
# Start a Weights & Biases run once per session (skipped if `run` already exists),
# and only when training on a TPU. The hyper-parameters are logged as the run config.
if using_tpu and "run" not in globals():
    run = wandb.init(
        project="final_project_owned",
        entity="yonikremer",
        name=datetime.datetime.today().strftime("run from %d/%m/%Y"),
        settings=wandb.Settings(start_method="thread"),
        config={
            "set size": set_size,
            "batch size": batch_size,
            "learning rate": learning_rate,
            "max seq len": max_seq_len,
            "num blocks": num_blocks,
            "model dimention": d_model,
            "dff": dff,
            "num heads": num_heads,
            "dropout rate": dropout_rate,
        },
    )
    config = wandb.config

## Create the model

In [None]:
# Build the transformer from the hyper-parameters defined above.
model = SeTransformer(
    num_blocks=num_blocks, d_model=d_model, num_heads=num_heads, dff=dff,
    vocab_size=vocab_size, max_len=max_seq_len, rate=dropout_rate, pad_int=pad_int,
)

# Adam with the Keras backend epsilon; the loss expects probabilities, not logits.
optimizer = tf.keras.optimizers.Adam(learning_rate, epsilon=keras.backend.epsilon())
loss_func = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
acc = keras.metrics.SparseCategoricalAccuracy(dtype=f_type)

model.compile(optimizer=optimizer, loss=loss_func, metrics=[acc])

In [None]:
# Random token ids for a smoke test of the freshly built model; the second target is
# the first one shifted by 3.
temp_input = tf.random.uniform((batch_size, max_seq_len), dtype=tf.int32, minval=1, maxval=vocab_size-1)
temp_target = tf.random.uniform((batch_size, set_size), dtype=tf.int32, minval=1, maxval=vocab_size-4)
temp_target2 = temp_target + 3

# Two identical calls and one with the shifted target, all in inference mode.
train_pred1 = model([temp_input, temp_target], training=False)
train_pred2 = model([temp_input, temp_target], training=False)
train_pred3 = model([temp_input, temp_target2], training=False)

try:
    tf.debugging.assert_equal(train_pred1, train_pred2)
except tf.errors.InvalidArgumentError:
    # the same input produced two different outputs
    print("The model is not determenistic and have sone random noise")
else:
    print("the model is determenistic")
    try:
        tf.debugging.assert_equal(train_pred1, train_pred3)
    except tf.errors.InvalidArgumentError:
        # changing only the target changed the prediction
        print("WARNING: model output might depends on the target")

param_count: int = model.count_params()
print(f"The model has {param_count:,} = {round(param_count * (10**-6), 1)}M trainable parameters")
if using_tpu:
    run.config["parameters"] = param_count

# Disabled FLOPs/MACs estimate:
# stats = FlopCoKeras(model)
# flops_per_call: int = stats.total_flops
# macs_per_call: int = stats.total_macs

# # (add-multiplies per forward pass) * (2 FLOPs/add-multiply) * (3 for forward and backward pass) * (number of examples in dataset)
# training_flops: float  = macs_per_call * 2 * flops_per_call / macs_per_call * (3 * train_step_calles + val_step_calles)
# print(f"FLOPs per call: {flops_per_call:,} = {(flops_per_call * (10 ** -6)):,}M")
# print(f"MACs per call: {macs_per_call:,} = {(macs_per_call * (10 ** -6)):,}M")

# The smoke-test tensors are not needed any more.
del temp_input, temp_target, temp_target2, train_pred1, train_pred2, train_pred3

In [None]:
# keras.utils.plot_model(
#     model.build_graph(),
#     show_shapes=True,
#     show_dtype=True,
#     show_layer_names=False,
#     expand_nested=True,
#     layer_range=None,
#     show_layer_activations=True
# )

In [None]:
# model.summary(line_length=125, positions=[0.5, 0.66, 0.83, 1], expand_nested=True)

In [None]:
# Scratch cell: a small Conv1D model over the first of two integer inputs.
# NOTE(review): keras.layers presumably exposes `Input`, not `Inputs`, so the first line
# likely raises AttributeError - confirm. If it is corrected, the last line REBINDS the
# global `model`, replacing the SeTransformer created earlier that the training
# functions below call; rename the variable or remove this cell before fixing the typo.
inputs = [layers.Inputs(shape=[None,], dtype=tf.int32), layers.Inputs(shape=[set_size,], dtype=tf.int32)]
# Only inputs[0] is used below; inputs[1] is not connected to the output.
x = layers.Embedding(16, set_size)(inputs[0])

# Two strided Conv1D layers (no pooling layer is actually added)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

## Training helper functions

In [None]:
@tf.function(input_signature=[tf.TensorSpec(shape=[None, None], dtype=tf.int32)])
def index_start(batch):
    """Gets a 2D int32 tensor and returns the index of the first column that contains a non-pad value.

    example: [[pad_ten, not_pad_ten, not_pad_ten],
              [pad_ten, pad_ten, not_pad_ten]] -> 1
    If every entry is padding (or there are no columns) the number of columns is returned.
    returns: scalar int32 tensor

    The computation uses tensor ops only, so it also works inside a traced tf.function,
    where `.numpy()` and static values of data-dependent tensors are not available.
    """
    num_cols = tf.shape(batch)[1]
    # True for every column that holds at least one non-pad value; shape (num_cols,)
    has_token = tf.reduce_any(tf.not_equal(batch, pad_ten), axis=0)
    # Columns with a token keep their own index, all others get the "not found" value.
    candidates = tf.where(has_token, tf.range(num_cols), num_cols)
    # num_cols is appended so that the minimum is defined even when there are no columns.
    return tf.reduce_min(tf.concat([candidates, [num_cols]], axis=0))

In [None]:
# Manual check of index_start: with 0 as the pad value, column 2 is the first column
# here that holds a non-pad entry.
index_start(tf.Variable([[0, 0, 0, 1, 8], [0, 0, -3, -1, 8]]))

### Train

In [None]:
@tf.function(input_signature=(tf.TensorSpec(shape=[batch_size, None], dtype=tf.int32),
                              tf.TensorSpec(shape=[batch_size, set_size], dtype=tf.int32)))
def train_step(inp: tf.Tensor, outp: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Runs one optimisation step and returns (mean loss, accuracy metric value).

    inp: context tokens of shape (batch_size, any length)
    outp: the set_size tokens to predict, shape (batch_size, set_size)

    NOTE(review): `outp` is passed to the model as the decoder input AND used as the
    label, without any shift - confirm that the model cannot simply copy it.
    """
    with tf.GradientTape() as tape:
        # training=True so that dropout is active while the weights are being fitted
        pred: tf.Tensor = model([inp, outp], training=True)
        loss_val: tf.Tensor = loss_func(y_true=outp, y_pred=pred)
    grads = tape.gradient(loss_val, model.trainable_weights)  # one gradient per trainable weight
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return tf.math.reduce_mean(loss_val), acc(outp, pred)

In [None]:
@tf.function(input_signature=[tf.TensorSpec(shape=[batch_size, max_seq_len], dtype=tf.int32)])
def train(batch: tf.Tensor) -> tf.TensorSpec(shape=[], dtype=f_type):
    """Trains on one batch by calling train_step on successive (context, next set) slices.

    batch: token ids of shape (batch_size, max_seq_len)
    Returns a pair: the mean loss and the mean accuracy over the steps that were run
    (the return annotation describes a single scalar, but two values are returned).
    NOTE(review): `contains_pad` is not defined in the part of the file shown here.
    """
    # One slot per step; every step writes its value into its own slot.
    per_gen_loss: tf.Tensor = tf.zeros([num_sets], dtype=f_type)
    per_gen_acc: tf.Tensor = tf.zeros([num_sets], dtype=f_type)
    i = 0
    while i < num_sets:
        # The input is of size set_size-TAKE_TO_ACCOUNT
        # NOTE(review): multiplies by (set_size + 1) although the slices below advance
        # by set_size per step - confirm which one is intended.
        already_predicted: int = i * (set_size + 1)
        start_from: int = max(0, already_predicted - max_seq_len)
        # Context: columns from start_from up to the end of set number i.
        inp: tf.Tensor = batch[:, start_from:(i + 1) * set_size]
        have_pad = tf.map_fn(contains_pad, inp, fn_output_signature=tf.bool, parallel_iterations=batch_size)
        # NOTE(review): inside a tf.function, get_static_value presumably returns None
        # for data-dependent tensors, in which case this break never triggers - confirm.
        if tf.get_static_value(tf.math.reduce_all(have_pad)):
            break
        # Target: the set_size columns that directly follow the context.
        outp: tf.TensorSpec(shape=[batch_size, set_size]) = batch[:, (i + 1) * set_size:(i + 2) * set_size]
        loss_val, acc_val = train_step(inp, outp)
        # one_hot puts the scalar into slot i, so adding it leaves the other slots unchanged
        one_hot_loss = tf.one_hot([i], num_sets, dtype = f_type) * loss_val
        one_hot_acc = tf.one_hot([i], num_sets, dtype = f_type) * acc_val
        per_gen_loss += one_hot_loss
        per_gen_acc += one_hot_acc
        i += 1
    # Average over the slots that were actually filled.
    return tf.math.reduce_mean(per_gen_loss[:i]), tf.math.reduce_mean(per_gen_acc[:i])
    

In [None]:
%%time
# Smoke test / timing of train() on one batch of random token ids.
train(tf.random.uniform(shape=[batch_size, max_seq_len], minval=0, maxval=vocab_size-1, dtype=tf.int32))

### Validate

In [None]:
@tf.function(input_signature=(
    tf.TensorSpec(shape=[batch_size, None], dtype=tf.int32),
    tf.TensorSpec(shape=[batch_size, set_size], dtype=tf.int32),
))
def val_step(inp: tf.Tensor, outp: tf.Tensor) -> tf.Tensor:
    """Evaluate one (context, target-set) pair without updating any weights.

    The model is called with ``training=False``; the result is the pair
    ``(mean of loss_func's output, acc(outp, predictions))``.
    """
    predictions = model([inp, outp], training=False)
    mean_loss = tf.math.reduce_mean(loss_func(y_true=outp, y_pred=predictions))
    return mean_loss, acc(outp, predictions)

In [None]:
@tf.function(input_signature=[tf.TensorSpec(shape=[batch_size, max_seq_len], dtype=tf.int32)])
def validate(batch: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Evaluate one batch by walking over it in windows of ``set_size``.

    For step ``i`` the columns up to ``(i + 1) * set_size`` form the input and
    the following ``set_size`` columns form the target; each pair is passed to
    ``val_step``, so no weights are updated.

    Args:
        batch: int32 tensor of shape [batch_size, max_seq_len] (see the input
            signature).

    Returns:
        A pair ``(mean loss, mean accuracy)`` over the steps that were run.
    """
    # Must be the same name that is accumulated into and returned below.
    per_gen_loss: tf.Tensor = tf.zeros([num_sets], dtype=f_type)
    per_gen_acc: tf.Tensor = tf.zeros([num_sets], dtype=f_type)
    i = 0
    while i < num_sets:
        # The input is of size set_size-TAKE_TO_ACCOUNT
        already_predicted: int = i * (set_size + 1)
        start_from: int = max(0, already_predicted - max_seq_len)
        inp: tf.Tensor = batch[:, start_from:(i + 1) * set_size]
        have_pad = tf.map_fn(contains_pad, inp, fn_output_signature=tf.bool, parallel_iterations=batch_size)
        # NOTE(review): tf.get_static_value returns None when the value cannot
        # be determined without running the graph; inside a tf.function that
        # presumably means this early exit only fires in eager mode - confirm.
        if tf.get_static_value(tf.math.reduce_all(have_pad)):
            break
        # The target window is the set_size columns right after the input.
        outp: tf.Tensor = batch[:, (i + 1) * set_size:(i + 2) * set_size]
        loss_val, acc_val = val_step(inp, outp)
        # Tensors are immutable, so slot i is filled by adding a one-hot mask
        # scaled by the value. A scalar index keeps the mask at shape
        # [num_sets] so that the `[:i]` slice below selects the filled slots.
        per_gen_loss += tf.one_hot(i, num_sets, dtype=f_type) * loss_val
        per_gen_acc += tf.one_hot(i, num_sets, dtype=f_type) * acc_val
        i += 1
    # Only the first i slots were filled.
    return tf.math.reduce_mean(per_gen_loss[:i]), tf.math.reduce_mean(per_gen_acc[:i])

In [None]:
%%time
# Calls validate once on a random int32 batch of shape [batch_size, max_seq_len]
# with ids drawn from [0, vocab_size - 1) - presumably a smoke test / timing
# run of the traced function.
validate(tf.random.uniform(shape=[batch_size, max_seq_len], minval=0, maxval=vocab_size-1, dtype=tf.int32))

## Callbacks

In [None]:
# Validation loss recorded at the most recent checkpoint (updated by
# on_val_batch_end); starts at +inf so any finite loss compares as lower.
best_loss = float("inf")
# Model object recorded by check_point; None until the first checkpoint.
best_model = None

In [None]:
def check_point(model: SeTransformer, train_loss, train_acc, val_loss, val_acc, test_loss = None, test_acc = None):
    """Save ``model``, log the given metrics to wandb and upload the saved model.

    Side effects: stores ``model`` in the global ``best_model``, resets the
    global ``last_save_time`` to now, writes the model to ``model.pb`` and logs
    it as an artifact of the current wandb run.

    Args:
        model: the model to save.
        train_loss, train_acc, val_loss, val_acc: metrics logged with the save.
        test_loss, test_acc: optional test metrics; logged as None when omitted.
    """
    global best_model
    global last_save_time
    last_save_time = time.time()
    best_model = model
    # (add-multiplies per forward pass) * (2 FLOPs/add-multiply) *
    # * (3 for forward and backward pass) * (number of examples in dataset)
    num_ops: float = macs_per_call * 2 * flops_per_call / macs_per_call * (3 * train_step_calles + val_step_calles)
    save_path = "model.pb"
    keras.models.save_model(model=model, filepath=save_path, save_format="tf", overwrite=True)
    wandb.log({"model train loss": train_loss, "model train acc": train_acc, "model val loss": val_loss, "model val acc": val_acc,
              "model test loss": test_loss, "model test acc": test_acc})
    # Artifact names may only contain alphanumerics, dashes, underscores and
    # dots, so the name must not contain a space.
    art = wandb.Artifact(f"{wandb.run.id}-best-model", type="my_model", description=f"the model after {num_ops:,} operations")
    # save_format="tf" writes a SavedModel, which is a directory; add_file only
    # accepts regular files.
    if os.path.isdir(save_path):
        art.add_dir(save_path)
    else:
        art.add_file(save_path)
    # Log the artifact that was just built, on the active run.
    wandb.run.log_artifact(art)
    print("Saved checkpoint")

In [None]:
def on_val_batch_end(train_loss: float, train_acc: float ,val_loss: float, val_acc: float) -> bool:
    """A callback after every val batch.

    Saves a checkpoint when enough time has passed and the validation loss
    improved, and raises a wandb alert when the run looks over- or
    under-fitted.

    Args:
        train_loss, train_acc: metrics of the latest training batch.
        val_loss, val_acc: metrics of the latest validation batch.

    Returns:
        True if the model should stop training and False else.
    """
    global best_loss
    global best_model
    global last_save_time
    # If the prob of every token is 1/vocab_size, the loss is
    # -ln(1/vocab_size) = ln(vocab_size)
    # by the logarithm rule log(a^x) = x*log(a) where x = -1.
    random_guess_loss: float = math.log(vocab_size)
    since_last_save: float = time.time() - last_save_time
    if since_last_save > 1800.0 and val_loss < random_guess_loss and val_loss < best_loss:
        # The last save is more than half an hour (1800 sec) ago, the
        # predictions are better than random and better than the last
        # checkpoint.
        best_loss = val_loss
        check_point(model, train_loss, train_acc, val_loss, val_acc)
        return False
    elif train_loss < 0.01:
        title: str = "Over fitting or data leak"
        message = f"Training loss is {train_loss} and val loss is {val_loss} in the latest batch"
        wandb.alert(title=title, text=message)
        print(title)
        print(message)
        return True
    elif since_last_save > 18000.0 and train_loss >= random_guess_loss:
        # After 5 hours (18000 sec) without a checkpoint the model
        # predictions are still no better than random.
        title: str = "Under fitting"
        message = f"train loss: {train_loss} train acc: {train_acc}, val loss: {val_loss}, val acc: {val_acc} in the latest batch"
        wandb.alert(title=title, text=message)
        print(title)
        print(message)
        return True
    return False

## The actual training loop!

In [None]:
def _mean_metrics(train_losses, val_losses, train_accs, val_accs):
    """Return (mean train loss, mean val loss, mean train acc, mean val acc)."""
    return (statistics.mean(train_losses), statistics.mean(val_losses),
            statistics.mean(train_accs), statistics.mean(val_accs))


def train_loop():
    """Train until the validation loss stops improving or a callback asks to stop.

    Each epoch passes every batch of ``list_train_set`` to ``train``; after
    every 8th training batch one batch of ``list_val_set`` is passed to
    ``validate``. When ``using_tpu`` is set, per-batch metrics are logged to
    wandb and ``on_val_batch_end`` is consulted after each validation batch.

    Returns:
        ``(train loss, val loss, train acc, val acc)``: the means of the last
        finished epoch, or - when ``on_val_batch_end`` requests a stop - the
        means over the batches processed so far in the current epoch.
    """
    # on_val_batch_end and check_point use this as a module-level global;
    # without the declaration the assignment below would only create a local.
    global last_save_time
    epochs: int = 1000000  # Train until the cloud disconnects or the model stops improving
    # 8 = #training batches/#val batches
    # because training set is 80% of the data and val set is 10%
    val_every: int = 8
    per_epoch_train_loss: List[float] = []
    per_epoch_val_loss: List[float] = []
    per_epoch_train_acc: List[float] = []
    per_epoch_val_acc: List[float] = []
    print(f"number of train batches per epoch: {len(list_train_set)}")
    last_save_time = time.time()
    for epoch in range(epochs):
        print(f"epoch number: {epoch}")
        per_batch_train_loss: List[float] = []
        per_batch_val_loss: List[float] = []
        per_batch_train_acc: List[float] = []
        per_batch_val_acc: List[float] = []
        for batch_num, train_batch in enumerate(tqdm.tqdm(list_train_set)):  # tqdm is a progress bar
            train_loss, train_acc = train(train_batch)
            float_train_loss = keras.backend.eval(train_loss).item()
            per_batch_train_loss.append(float_train_loss)
            float_train_acc = keras.backend.eval(train_acc).item()
            per_batch_train_acc.append(float_train_acc)
            if using_tpu:
                wandb.log({"epoch": epoch, "batch": batch_num, "batch train loss": float_train_loss,
                           "batch train_acc": float_train_acc})
            val_index: int = batch_num // val_every
            # The bounds check covers the case where the number of training
            # batches is not an exact multiple of the validation batches.
            if batch_num % val_every == 0 and val_index < len(list_val_set):
                val_loss, val_acc = validate(list_val_set[val_index])
                float_val_loss = keras.backend.eval(val_loss).item()
                per_batch_val_loss.append(float_val_loss)
                float_val_acc = keras.backend.eval(val_acc).item()
                per_batch_val_acc.append(float_val_acc)
                if using_tpu:
                    # Validation accuracy gets its own key so it does not
                    # overwrite the training accuracy series.
                    wandb.log({"epoch": epoch, "batch": batch_num, "batch val loss": float_val_loss,
                               "batch val_acc": float_val_acc})
                    # The callback returns True when training should stop.
                    if on_val_batch_end(float_train_loss, float_train_acc, float_val_loss, float_val_acc):
                        return _mean_metrics(per_batch_train_loss, per_batch_val_loss,
                                             per_batch_train_acc, per_batch_val_acc)
        epoch_train_loss, epoch_val_loss, epoch_train_acc, epoch_val_acc = _mean_metrics(
            per_batch_train_loss, per_batch_val_loss, per_batch_train_acc, per_batch_val_acc)
        per_epoch_train_loss.append(epoch_train_loss)
        per_epoch_train_acc.append(epoch_train_acc)
        per_epoch_val_loss.append(epoch_val_loss)
        per_epoch_val_acc.append(epoch_val_acc)
        print(f"train loss: {epoch_train_loss}")
        print(f"train acc: {epoch_train_acc}")
        print(f"val loss: {epoch_val_loss}")
        print(f"val acc: {epoch_val_acc}")
        # Stop as soon as an epoch fails to improve on the previous one.
        if len(per_epoch_val_loss) > 1 and epoch_val_loss >= per_epoch_val_loss[-2]:
            print("Validation loss increased. Stopped training")
            return epoch_train_loss, epoch_val_loss, epoch_train_acc, epoch_val_acc
        # NOTE(review): nothing is saved at this point; the message is kept
        # as-is - confirm whether check_point was meant to be called here.
        print("Saved checkpoint")
    # All epochs were used up: still hand back the last epoch's metrics so the
    # caller's 4-way unpacking works.
    return epoch_train_loss, epoch_val_loss, epoch_train_acc, epoch_val_acc

In [None]:
# A `!export ...` line runs in a throw-away subshell and never reaches this
# Python process, so the variable is set on the process environment instead.
os.environ["AUTOGRAPH_VERBOSITY"] = "10"
if using_tpu:
    # Run the loop inside the distribution strategy's scope.
    with strategy.scope():
        train_loss, val_loss, train_acc, val_acc = train_loop()
else:
    train_loss, val_loss, train_acc, val_acc = train_loop()

## After training

In [None]:
# Evaluate on the test set once a checkpoint exists (TPU runs only).
if best_model is not None and using_tpu:
    # validate returns a (loss, accuracy) pair of tensors per batch, so the
    # two series are converted to floats and averaged separately.
    test_losses = []
    test_accs = []
    for test_batch in tqdm.tqdm(list_test_set):
        batch_loss, batch_acc = validate(test_batch)
        test_losses.append(keras.backend.eval(batch_loss).item())
        test_accs.append(keras.backend.eval(batch_acc).item())
    if test_losses:
        test_loss = statistics.mean(test_losses)
        test_acc = statistics.mean(test_accs)
        print(f"Test loss: {test_loss}")
        print(f"Test acc: {test_acc}")
        wandb.log({"test loss": test_loss, "test acc": test_acc})
    # NOTE(review): this rebinds the global `model` to a TFLite converter
    # object (not a callable model). The binding is kept for compatibility,
    # but it now happens after the evaluation above - confirm what is intended.
    model = tf.lite.TFLiteConverter.from_saved_model("model.pb")