In [62]:
from typing import NamedTuple, List

import numpy as np
import tensorflow as tf
from tensorflow import keras

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers import normalizers
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers.processors import TemplateProcessing

from helpers.model.text_model import R
from quantus.helpers.tf_utils import is_xla_compatible_model
from quantus.helpers.model.tf_model import TFModelRandomizer
from quantus.helpers.model.text_model import TextClassifier
from quantus.helpers.model.text_model import Tokenizer as Q_Tokenizer
from sklearn.utils import compute_class_weight

# Display the logical devices TensorFlow has registered, as a quick sanity
# check of which devices are visible before any training is started.
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [63]:
# Load the "sst2" dataset and pull the raw sentences and their labels out of
# the train and validation splits.
dataset = load_dataset("sst2")

# Sentences (model inputs), one collection per split.
X_train, X_val = (
    dataset[split]["sentence"] for split in ("train", "validation")
)

# Labels (targets), aligned with the sentences above.
Y_train, Y_val = (
    dataset[split]["label"] for split in ("train", "validation")
)

Found cached dataset sst2 (/Users/artemsereda/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [73]:
# Build and train a lower-cased, accent-stripped WordPiece tokenizer.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# Special tokens receive their ids in the order listed here. "[PAD]" must come
# first so that it gets id 0: `Tokenizer.enable_padding()` pads with id 0 by
# default and the model's embedding is created with `mask_zero=True`. With
# "[UNK]" listed first, padding and unknown tokens would share the same id.
trainer = WordPieceTrainer(
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    vocab_size=10_000,
)

tokenizer.pre_tokenizer = Whitespace()
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

# NOTE(review): the vocabulary is fitted on train + validation sentences,
# which keeps unknown tokens rare in validation - confirm this is intended.
tokenizer.train_from_iterator(X_train + X_val, trainer)

# Fail loudly if the id layout ever stops matching the padding/masking
# convention described above.
assert tokenizer.token_to_id("[PAD]") == 0, "[PAD] must map to id 0"

# The post-processor is only used at encode time, so it can be attached after
# training. Looking the ids up (instead of hard-coding them) keeps the template
# in sync with the trained vocabulary.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]")
    ],
)

tokenizer.save("tokenizer.json")






In [74]:
# Truncate and pad every encoding to exactly 30 token ids. Without an explicit
# padding `length`, sequences are only padded up to the longest one in each
# `encode_batch` call, so the train and validation matrices would not be
# guaranteed to have the same width.
tokenizer.enable_truncation(max_length=30)
tokenizer.enable_padding(length=30)

# Keep only the integer token ids of each encoding.
X_train_encoded = [encoding.ids for encoding in tokenizer.encode_batch(X_train)]
X_val_encoded = [encoding.ids for encoding in tokenizer.encode_batch(X_val)]

In [75]:
# Training pipeline.
# * `cache()` comes before `shuffle()`: caching after shuffling would freeze a
#   single shuffled order and replay it every epoch.
# * The shuffle buffer spans the whole training set so each epoch sees a
#   uniformly shuffled order (a buffer of 100 only shuffles locally).
# * `drop_remainder=True` keeps every training batch the same shape.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train_encoded, Y_train))
    .cache()
    .shuffle(len(X_train_encoded))
    .batch(2048, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline.
# * No shuffling: evaluation metrics do not depend on example order.
# * The remainder batch is kept. With `drop_remainder=True` a split that has
#   fewer than 2048 rows yields zero batches, i.e. validation silently runs on
#   no data at all (NOTE(review): the SST-2 validation split is believed to be
#   smaller than 2048 sentences - verify).
val_ds = (
    tf.data.Dataset.from_tensor_slices((X_val_encoded, Y_val))
    .batch(2048)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [76]:
class FNetConfig(NamedTuple):
    """Hyperparameters for the FNet text classifier.

    Every field is annotated so that it is a real ``NamedTuple`` field: without
    annotations the values are plain class attributes, ``_fields`` is empty and
    ``FNetConfig(num_labels=2)`` raises ``TypeError``. All fields keep their
    previous values as defaults, so ``FNetConfig()`` behaves as before.
    """

    # Size of the token/position embedding vectors.
    embedding_dim: int = 128
    # Width of the hidden dense layer inside each encoder block.
    intermediate_dim: int = 256
    # Number of stacked encoder blocks.
    num_encoder_blocks: int = 3
    # Number of positions the position-embedding table holds.
    max_sequence_length: int = 30
    # Number of rows in the token-embedding table.
    vocab_size: int = 10_000
    # Number of units in the softmax output layer.
    # NOTE(review): the notebook trains on "sst2"; 20 output units looks larger
    # than needed for that task - confirm this default is intentional.
    num_labels: int = 20


def clone_initializer(initializer):
    """Return a fresh copy of a Keras initializer object.

    An ``Initializer`` instance is rebuilt from its own config so that using
    it for a second weight does not repeat the same randomized initialization
    (needed as of tensorflow 2.10).

    Args:
        initializer: A ``keras.initializers.Initializer`` instance, or any
            other value (e.g. a string identifier or a config dict).

    Returns:
        A new initializer of the same class and config when an ``Initializer``
        instance is given; otherwise the argument itself, unchanged, because
        such values cannot and should not be cloned.
    """
    if isinstance(initializer, keras.initializers.Initializer):
        initializer_cls = initializer.__class__
        return initializer_cls.from_config(initializer.get_config())
    return initializer



class PositionEmbedding(keras.layers.Layer):
    """Layer holding a trainable table of position embeddings.

    The table has ``sequence_length`` rows and as many columns as the last
    dimension of the input. Calling the layer returns the position embeddings
    only, shaped like the input; it does not add them to the input.

    Args:
        sequence_length: Number of positions stored in the table.
        initializer: Initializer (or identifier) for the table.
        **kwargs: Forwarded to ``keras.layers.Layer``.

    Raises:
        ValueError: If ``sequence_length`` is ``None``.
    """

    def __init__(self, sequence_length, initializer="glorot_uniform", **kwargs):
        super().__init__(**kwargs)
        if sequence_length is None:
            raise ValueError(
                "`sequence_length` must be an Integer, received `None`."
            )
        self.sequence_length = int(sequence_length)
        self.initializer = keras.initializers.get(initializer)

    def get_config(self):
        """Return the base layer config extended with this layer's arguments."""
        return {
            **super().get_config(),
            "sequence_length": self.sequence_length,
            "initializer": keras.initializers.serialize(self.initializer),
        }

    def build(self, input_shape):
        """Create the embedding table once the feature size is known."""
        table_shape = [self.sequence_length, input_shape[-1]]
        self.position_embeddings = self.add_weight(
            name="embeddings",
            shape=table_shape,
            initializer=self.initializer,
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs, start_index=0, **kwargs):
        """Return position embeddings shaped like ``inputs``.

        Args:
            inputs: Dense tensor or ``tf.RaggedTensor``; only its shape is used.
            start_index: First row of the table to read from.
        """
        if not isinstance(inputs, tf.RaggedTensor):
            return self._trim_and_broadcast_position_embeddings(
                tf.shape(inputs), start_index
            )

        # Ragged input: compute the embeddings for the dense bounding box,
        # then re-apply the row lengths so the result is ragged like `inputs`.
        dense_embeddings = self._trim_and_broadcast_position_embeddings(
            inputs.bounding_shape(), start_index
        )
        return tf.RaggedTensor.from_tensor(
            dense_embeddings, inputs.nested_row_lengths()
        )

    def _trim_and_broadcast_position_embeddings(self, shape, start_index):
        """Slice the table to the last two dims of ``shape`` and broadcast."""
        num_positions, num_features = shape[-2], shape[-1]
        # The input sequence may be shorter than the table, so read only the
        # rows that are needed, beginning at `start_index`.
        trimmed = tf.slice(
            self.position_embeddings,
            (start_index, 0),
            (num_positions, num_features),
        )
        # Broadcasting adds the leading dimensions required to match `shape`.
        return tf.broadcast_to(trimmed, shape)


class TokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        vocabulary_size: Number of rows in the token-embedding table.
        sequence_length: Number of positions in the position-embedding table.
        embedding_dim: Size of each embedding vector.
        embeddings_initializer: Initializer (or identifier) used for both
            tables; each table receives its own clone.
        mask_zero: Forwarded to the token ``Embedding`` layer.
        **kwargs: Forwarded to ``keras.layers.Layer``.

    Raises:
        ValueError: If ``vocabulary_size``, ``sequence_length`` or
            ``embedding_dim`` is ``None``.
    """

    def __init__(
        self,
        vocabulary_size,
        sequence_length,
        embedding_dim,
        embeddings_initializer="glorot_uniform",
        mask_zero=False,
        **kwargs
    ):
        super().__init__(**kwargs)
        if vocabulary_size is None:
            raise ValueError(
                "`vocabulary_size` must be an Integer, received `None`."
            )
        if sequence_length is None:
            raise ValueError(
                "`sequence_length` must be an Integer, received `None`."
            )
        if embedding_dim is None:
            raise ValueError(
                "`embedding_dim` must be an Integer, received `None`."
            )
        self.vocabulary_size = int(vocabulary_size)
        self.sequence_length = int(sequence_length)
        self.embedding_dim = int(embedding_dim)
        self.mask_zero = mask_zero
        self.embeddings_initializer = keras.initializers.get(
            embeddings_initializer
        )
        # Sublayers are built from the int-normalised attributes so they
        # always agree with what `get_config` reports.
        self.token_embedding = keras.layers.Embedding(
            self.vocabulary_size,
            self.embedding_dim,
            embeddings_initializer=clone_initializer(
                self.embeddings_initializer
            ),
            mask_zero=mask_zero,
            name="token_embedding"
            + str(keras.backend.get_uid("token_embedding")),
        )
        self.position_embedding = PositionEmbedding(
            sequence_length=self.sequence_length,
            initializer=clone_initializer(self.embeddings_initializer),
            name="position_embedding"
            + str(keras.backend.get_uid("position_embedding")),
        )
        self.supports_masking = self.token_embedding.supports_masking

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Layers whose ``__init__`` takes arguments need this override for the
        enclosing model to be serialized or cloned from its config.
        """
        config = super().get_config()
        config.update(
            {
                "vocabulary_size": self.vocabulary_size,
                "sequence_length": self.sequence_length,
                "embedding_dim": self.embedding_dim,
                "embeddings_initializer": keras.initializers.serialize(
                    self.embeddings_initializer
                ),
                "mask_zero": self.mask_zero,
            }
        )
        return config

    def call(self, inputs, **kwargs):
        """Embed token ids and add the matching position embeddings."""
        embedded_tokens = self.token_embedding(inputs)
        # The position layer only uses the shape of its argument.
        embedded_positions = self.position_embedding(embedded_tokens)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Delegate mask computation to the token embedding layer."""
        return self.token_embedding.compute_mask(inputs, mask=mask)



class FNetEncoder(keras.layers.Layer):
    """FNet encoder block: Fourier mixing followed by a feed-forward network.

    The block computes ``LayerNorm(x + Re(FFT2D(x)))`` and then applies a
    two-layer dense network with dropout, again followed by a residual
    connection and layer normalization. The output has the same feature size
    as the input.

    Args:
        intermediate_dim: Units of the hidden dense layer.
        dropout: Dropout rate applied to the feed-forward output.
        activation: Activation of the hidden dense layer.
        layer_norm_epsilon: Epsilon used by both layer-normalization layers.
        kernel_initializer: Initializer for the dense kernels.
        bias_initializer: Initializer for the dense biases.
        name: Optional layer name.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(
        self,
        intermediate_dim,
        dropout=0,
        activation="relu",
        layer_norm_epsilon=1e-5,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        name=None,
        **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self.intermediate_dim = intermediate_dim
        self.dropout = dropout
        self.activation = keras.activations.get(activation)
        self.layer_norm_epsilon = layer_norm_epsilon
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Layers whose ``__init__`` takes arguments need this override for the
        enclosing model to be serialized or cloned from its config.
        """
        config = super().get_config()
        config.update(
            {
                "intermediate_dim": self.intermediate_dim,
                "dropout": self.dropout,
                "activation": keras.activations.serialize(self.activation),
                "layer_norm_epsilon": self.layer_norm_epsilon,
                "kernel_initializer": keras.initializers.serialize(
                    self.kernel_initializer
                ),
                "bias_initializer": keras.initializers.serialize(
                    self.bias_initializer
                ),
            }
        )
        return config

    def build(self, input_shape):
        """Create the sublayers; the output dense width is the input feature size."""
        feature_size = input_shape[-1]

        # Layer Norm layers.
        self._mixing_layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_epsilon
        )
        self._output_layer_norm = keras.layers.LayerNormalization(
            epsilon=self.layer_norm_epsilon
        )

        # Feedforward layers. Initializers are cloned so that the two dense
        # layers do not share the same randomized initialization.
        self._intermediate_dense = keras.layers.Dense(
            self.intermediate_dim,
            activation=self.activation,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
        )
        self._output_dense = keras.layers.Dense(
            feature_size,
            kernel_initializer=clone_initializer(self.kernel_initializer),
            bias_initializer=clone_initializer(self.bias_initializer),
        )
        self._output_dropout = keras.layers.Dropout(rate=self.dropout)

    def call(self, inputs, **kwargs):
        """Apply Fourier mixing and the feed-forward sublayer to ``inputs``."""

        def fourier_transform(tensor):
            # `fft2d` needs a complex input, so cast to complex64 first and
            # keep only the real part of the transform.
            as_complex = tf.cast(tensor, tf.complex64)
            return tf.math.real(tf.signal.fft2d(as_complex))

        def add_and_norm(residual, update, norm_layer):
            return norm_layer(residual + update)

        def feed_forward(tensor):
            hidden = self._intermediate_dense(tensor)
            hidden = self._output_dense(hidden)
            return self._output_dropout(hidden)

        mixing_output = add_and_norm(
            inputs, fourier_transform(inputs), self._mixing_layer_norm
        )
        return add_and_norm(
            mixing_output, feed_forward(mixing_output), self._output_layer_norm
        )


def fnet_classifier(config: FNetConfig):
    """Assemble the FNet sequence classifier as a functional Keras model.

    Pipeline: int64 token ids of variable length -> token + position
    embedding (with ``mask_zero=True``) -> ``config.num_encoder_blocks``
    FNet encoder blocks -> global average pooling -> dropout (0.1) ->
    softmax dense layer with ``config.num_labels`` units.

    Args:
        config: Hyperparameters of the model.

    Returns:
        An uncompiled ``keras.Model`` named ``"fnet_classifier"``.
    """
    token_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids")

    embedding_layer = TokenAndPositionEmbedding(
        vocabulary_size=config.vocab_size,
        sequence_length=config.max_sequence_length,
        embedding_dim=config.embedding_dim,
        mask_zero=True,
    )
    hidden = embedding_layer(token_ids)

    # Stack the encoder blocks, each one consuming the previous block's output.
    for _ in range(config.num_encoder_blocks):
        encoder_block = FNetEncoder(intermediate_dim=config.intermediate_dim)
        hidden = encoder_block(inputs=hidden)

    # Collapse the sequence axis, regularize, and classify.
    pooled = keras.layers.GlobalAveragePooling1D()(hidden)
    pooled = keras.layers.Dropout(0.1)(pooled)
    class_probabilities = keras.layers.Dense(
        config.num_labels, activation="softmax"
    )(pooled)

    return keras.Model(token_ids, class_probabilities, name="fnet_classifier")


# Build the classifier using the default values defined on FNetConfig.
model = fnet_classifier(FNetConfig())

In [77]:
# Ask the helper whether this model can be compiled with XLA; the same flag is
# passed to both the optimizer and `compile` so the two always agree.
# NOTE(review): exact semantics of `is_xla_compatible_model` are defined in
# quantus.helpers.tf_utils - confirm there.
use_xla = is_xla_compatible_model(model)

model.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=5e-4,
        jit_compile=use_xla,
    ),
    jit_compile=use_xla,
)

# Train for 5 epochs, evaluating on the validation pipeline after each one.
model.fit(x=train_ds, epochs=5, validation_data=val_ds)

Epoch 1/5


2023-03-30 19:22:06.002191: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2023-03-30 19:22:06.098437: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2e2f42550>

In [78]:
# Persist the trained weights to disk via `save_weights`.
# NOTE(review): the ".keras" suffix is normally associated with whole-model
# archives, while this call stores weights only; the on-disk format chosen for
# this suffix depends on the installed Keras version. Confirm that whatever
# loads "model.keras" expects a weights file before renaming it.
model.save_weights("model.keras")