## Customizing NLP evaluation with user-defined model, explanation functions, etc.

In this tutorial we will:
- Create our own text-classification model
- Make it compatible with Quantus
- Run evaluation with user-defined functions.

In [26]:
import tensorflow as tf
from datasets import load_dataset
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import re
import string

# Training hyper-parameters. NOTE(review): not referenced by any cell visible
# here — presumably consumed by the input-pipeline / training cells; confirm.
BATCH_SIZE = 64
EPOCHS = 3
# Maximum number of tokens per example: used as the tokenizer truncation
# length and as the number of rows in the position-embedding table.
MAX_SEQUENCE_LENGTH = 512
# Vocabulary size: used by the WordPiece trainer and the token-embedding layer.
VOCAB_SIZE = 15000
# Width of the token/position embeddings.
EMBED_DIM = 128
# Width of the first feed-forward layer inside each FNetEncoder block.
INTERMEDIATE_DIM = 512

# Last expression of the cell: displays the devices TensorFlow can see.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### 1. Datasets
We use a dataset for hate-speech detection. Warning: it contains offensive language!
More about the dataset here: [hate_speech_offensive](https://huggingface.co/datasets/hate_speech_offensive)

In [28]:
# Fixed seed so that the shuffled order (and everything derived from it) is
# identical after every "Restart Kernel -> Run All".
SHUFFLE_SEED = 42

# Integer class id (as stored in the dataset's "class" column) -> readable label.
label_mapping = {
    2: "neither",
    1: "offensive",
    0: "hate-speech"
}

ds = load_dataset("hate_speech_offensive")["train"]
X = ds["tweet"]
Y = ds["class"]

# Shuffle tweets and labels together so that the pairs stay aligned.
X, Y = shuffle(X, Y, random_state=SHUFFLE_SEED)

Y_text = [label_mapping[i] for i in Y]

# Preview frame only: one row per tweet with its human-readable label.
df = pd.DataFrame({"tweet": X, "label": Y_text})
df

Found cached dataset hate_speech_offensive (/Users/artemsereda/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,tweet,label
0,Cuffin these hoes man that's where you went ba...,offensive
1,"@AJ0427 @AshActually Hey, I don't always go th...",offensive
2,RT @AthIeteChamp: Destroying a team that talke...,offensive
3,Free bitch . We stay used yo hol yo each other...,offensive
4,RT @SteeloBrim: Captain Kirk hoe!,offensive
...,...,...
24778,@alexiscarfield ya damn right nigglet! :D,offensive
24779,RT @__PrettyTea: 30 bottles ! 30 bitches !,offensive
24780,Fuuuuuck you you stupid fucking cunt,offensive
24781,RT @studio6263: M&amp;S bids for most social C...,neither


#### 1.1. Data pre-processing

In [None]:
def clean_text(text):
    """Normalize a raw tweet into lowercase text without noise tokens.

    Reference: https://www.kaggle.com/code/rushiwickramasooriya/nature

    Steps, in order: lowercasing, then removal of digits, of text enclosed in
    square brackets, of URLs, of ``@mentions``, of emoji/pictographs and of
    ASCII punctuation. Runs of spaces are collapsed *last*, because every
    removal above can leave consecutive or leading/trailing spaces behind.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        The cleaned string (possibly empty).
    """
    # Convert all text to lowercase words
    text = str(text).lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove anything enclosed in square brackets (non-greedy), e.g. "[link]"
    text = re.sub(r'\[.*?\]', '', text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove emoticons, symbols and pictographs, transport and map symbols,
    # flags as well as emojis.
    # NOTE(review): the last range (U+24C2-U+1F251) is very wide and also
    # covers many non-emoji code points; kept unchanged for compatibility.
    text = re.sub("["
                  "\U0001F600-\U0001F64F"
                  "\U0001F300-\U0001F5FF"
                  "\U0001F680-\U0001F6FF"
                  "\U0001F1E0-\U0001F1FF"
                  "\U00002702-\U000027B0"
                  "\U000024C2-\U0001F251"
                  "]+", "", text)
    # Remove punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove extra spaces (done last so gaps left by the removals are closed)
    text = re.sub(' +', ' ', text).strip()

    return text

### 2. Tokenizer

In [None]:
# NOTE(review): no visible cell defines `all_messages`; it is built here from
# the cleaned tweets so that this cell runs on a fresh kernel. Confirm this is
# the intended training corpus.
all_messages = [clean_text(tweet) for tweet in X]

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))  # noqa
tokenizer.normalizer = normalizers.Sequence(
    [NFD(), Lowercase(), StripAccents()]  # noqa
)
tokenizer.pre_tokenizer = Whitespace()

# Special tokens receive their ids in list order, so "[PAD]" must come first
# to get id 0 -- the id used by `enable_padding` below and the id that the
# embedding layer masks via `mask_zero=True`. With "[UNK]" first, padding
# would be emitted as the "[UNK]" id and unknown tokens would be masked.
trainer = WordPieceTrainer(
    special_tokens=["[PAD]", "[UNK]"],
    vocab_size=VOCAB_SIZE,
)  # noqa
tokenizer.enable_truncation(max_length=MAX_SEQUENCE_LENGTH)
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")
tokenizer.train_from_iterator(all_messages, trainer=trainer)
#tokenizer.save("data/tokenizer.json")

### 3. Input pipeline

### 4. Model

We will create a simple model based on the FNet architecture proposed in https://arxiv.org/abs/2105.03824
The implementation is mostly taken from:
- https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/layers/f_net_encoder.py
- https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/layers/token_and_position_embedding.py

#### 4.1. Build model

In [6]:
# Index of the sequence-length dimension in a [..., sequence, feature] shape.
SEQUENCE_AXIS = -2


def clone_initializer(initializer):
    """Return a fresh copy of a Keras initializer instance.

    Initializer *objects* are rebuilt from their config so that every layer
    gets its own instance. Anything else (e.g. a string identifier or a
    config dict) cannot and should not be cloned and is returned unchanged.
    """
    if isinstance(initializer, tf.keras.initializers.Initializer):
        return type(initializer).from_config(initializer.get_config())
    return initializer


class FNetEncoder(tf.keras.layers.Layer):
    """Encoder block that mixes tokens with a Fourier transform.

    The block takes the real part of a 2D FFT of its input, adds it back to
    the input and layer-normalizes the sum. The result goes through a
    two-layer feed-forward network with dropout, followed by a second
    residual connection and layer normalization.

    Args:
        intermediate_dim: Number of units of the first feed-forward layer.
        dropout: Dropout rate applied to the feed-forward output.
        activation: Activation of the first feed-forward layer.
        layer_norm_epsilon: Epsilon used by both layer-normalization layers.
        kernel_initializer: Initializer for the dense kernels.
        bias_initializer: Initializer for the dense biases.
        name: Optional layer name.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(
            self,
            intermediate_dim,
            dropout=0,
            activation="relu",
            layer_norm_epsilon=1e-5,
            kernel_initializer="glorot_uniform",
            bias_initializer="zeros",
            name=None,
            **kwargs
    ):
        super().__init__(name=name, **kwargs)
        # Plain hyper-parameters are stored as given.
        self.intermediate_dim = intermediate_dim
        self.dropout = dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        # Identifiers (strings, configs, objects) are resolved through Keras.
        self.activation = tf.keras.activations.get(activation)
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)

    def build(self, input_shape):
        """Create the sub-layers; the output width equals the input's last axis."""
        hidden_size = input_shape[-1]

        def new_layer_norm():
            return tf.keras.layers.LayerNormalization(
                epsilon=self.layer_norm_epsilon
            )

        def fresh_initializers():
            # Each dense layer receives its own initializer instances.
            return {
                "kernel_initializer": clone_initializer(self.kernel_initializer),
                "bias_initializer": clone_initializer(self.bias_initializer),
            }

        # Layer Norm layers.
        self._mixing_layer_norm = new_layer_norm()
        self._output_layer_norm = new_layer_norm()

        # Feedforward layers.
        self._intermediate_dense = tf.keras.layers.Dense(
            self.intermediate_dim,
            activation=self.activation,
            **fresh_initializers(),
        )
        self._output_dense = tf.keras.layers.Dense(
            hidden_size,
            **fresh_initializers(),
        )
        self._output_dropout = tf.keras.layers.Dropout(rate=self.dropout)

    def call(self, inputs):
        """Forward pass of the FNetEncoder.

        Args:
            inputs: a Tensor. The input data to the encoder, should be
                of shape [batch_size, sequence_length, feature_dim].

        Returns:
            A Tensor of the same shape as the `inputs`.
        """
        # Token mixing: the FFT needs a complex dtype, and only the real part
        # of the transform is kept.
        spectrum = tf.signal.fft2d(tf.cast(inputs, tf.complex64))
        mixed = self._mixing_layer_norm(inputs + tf.math.real(spectrum))

        # Position-wise feed-forward network followed by dropout.
        projected = self._output_dense(self._intermediate_dense(mixed))
        projected = self._output_dropout(projected)

        # Second residual connection + normalization.
        return self._output_layer_norm(mixed + projected)

    def get_config(self):
        """Return the constructor arguments in serializable form."""
        layer_config = {
            "intermediate_dim": self.intermediate_dim,
            "dropout": self.dropout,
            "activation": tf.keras.activations.serialize(self.activation),
            "layer_norm_epsilon": self.layer_norm_epsilon,
            "kernel_initializer": tf.keras.initializers.serialize(
                self.kernel_initializer
            ),
            "bias_initializer": tf.keras.initializers.serialize(
                self.bias_initializer
            ),
        }
        config = super().get_config()
        config.update(layer_config)
        return config


class PositionEmbedding(tf.keras.layers.Layer):
    """Trainable position-embedding table.

    The layer owns a ``[sequence_length, feature_size]`` weight and returns
    it trimmed to the input's sequence length and broadcast to the input's
    shape. The input values themselves are not used, only the shape.

    Args:
        sequence_length: Number of rows of the embedding table; must not be
            ``None``.
        initializer: Initializer for the embedding table.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: If ``sequence_length`` is ``None``.
    """

    def __init__(
            self,
            sequence_length,
            initializer="glorot_uniform",
            **kwargs,
    ):
        super().__init__(**kwargs)
        if sequence_length is None:
            raise ValueError(
                "`sequence_length` must be an Integer, received `None`."
            )
        self.sequence_length = int(sequence_length)
        self.initializer = tf.keras.initializers.get(initializer)

    def get_config(self):
        """Return the constructor arguments in serializable form."""
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "initializer": tf.keras.initializers.serialize(self.initializer),
            }
        )
        return config

    def build(self, input_shape):
        """Create the embedding table; its width is the input's last axis."""
        feature_size = input_shape[-1]
        # The name is passed by keyword: the first positional parameter of
        # `add_weight` is not `name` in every Keras version, so a positional
        # "embeddings" would clash with `shape=` there.
        self.position_embeddings = self.add_weight(
            name="embeddings",
            shape=[self.sequence_length, feature_size],
            initializer=self.initializer,
            trainable=True,
        )

        super().build(input_shape)

    def call(self, inputs):
        """Return position embeddings shaped like ``inputs``.

        Args:
            inputs: A dense tensor or a ``tf.RaggedTensor``; only its shape
                is used.

        Returns:
            The position embeddings broadcast to the shape of ``inputs``
            (ragged when ``inputs`` is ragged).
        """
        if isinstance(inputs, tf.RaggedTensor):
            bounding_shape = inputs.bounding_shape()
            position_embeddings = self._trim_and_broadcast_position_embeddings(
                bounding_shape,
            )
            # then apply row lengths to recreate the same ragged shape as inputs
            return tf.RaggedTensor.from_tensor(
                position_embeddings,
                inputs.nested_row_lengths(),
            )
        else:
            return self._trim_and_broadcast_position_embeddings(
                tf.shape(inputs),
            )

    def _trim_and_broadcast_position_embeddings(self, shape):
        """Slice the table to the input length and broadcast it to ``shape``."""
        input_length = shape[SEQUENCE_AXIS]
        # trim to match the length of the input sequence, which might be less
        # than the sequence_length of the layer.
        position_embeddings = self.position_embeddings[:input_length, :]
        # then broadcast to add the missing dimensions to match "shape"
        return tf.broadcast_to(position_embeddings, shape)


class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a token embedding and a position embedding.

    Args:
        vocabulary_size: Size of the token vocabulary; must not be ``None``.
        sequence_length: Length of the position-embedding table; must not
            be ``None``.
        embedding_dim: Width of both embeddings; must not be ``None``.
        embeddings_initializer: Initializer shared (as separate clones) by
            both embedding tables.
        mask_zero: Forwarded to the token ``Embedding`` layer.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: If any of the three size arguments is ``None``.
    """

    def __init__(
            self,
            vocabulary_size,
            sequence_length,
            embedding_dim,
            embeddings_initializer="glorot_uniform",
            mask_zero=False,
            **kwargs
    ):
        super().__init__(**kwargs)

        # Validate the mandatory sizes in declaration order.
        required_sizes = (
            ("vocabulary_size", vocabulary_size),
            ("sequence_length", sequence_length),
            ("embedding_dim", embedding_dim),
        )
        for arg_name, arg_value in required_sizes:
            if arg_value is None:
                raise ValueError(
                    f"`{arg_name}` must be an Integer, received `None`."
                )

        self.vocabulary_size = int(vocabulary_size)
        self.sequence_length = int(sequence_length)
        self.embedding_dim = int(embedding_dim)
        self.embeddings_initializer = tf.keras.initializers.get(
            embeddings_initializer
        )

        # Sub-layer names carry a running uid so that several instances of
        # this layer do not produce clashing names.
        token_layer_name = "token_embedding" + str(
            tf.keras.backend.get_uid("token_embedding")
        )
        self.token_embedding = tf.keras.layers.Embedding(
            vocabulary_size,
            embedding_dim,
            embeddings_initializer=clone_initializer(
                self.embeddings_initializer
            ),
            mask_zero=mask_zero,
            name=token_layer_name,
        )

        position_layer_name = "position_embedding" + str(
            tf.keras.backend.get_uid("position_embedding")
        )
        self.position_embedding = PositionEmbedding(
            sequence_length=sequence_length,
            initializer=clone_initializer(self.embeddings_initializer),
            name=position_layer_name,
        )

        # Masking support mirrors that of the token embedding.
        self.supports_masking = self.token_embedding.supports_masking

    def get_config(self):
        """Return the constructor arguments in serializable form."""
        layer_config = {
            "vocabulary_size": self.vocabulary_size,
            "sequence_length": self.sequence_length,
            "embedding_dim": self.embedding_dim,
            "embeddings_initializer": tf.keras.initializers.serialize(
                self.embeddings_initializer
            ),
            "mask_zero": self.token_embedding.mask_zero,
        }
        config = super().get_config()
        config.update(layer_config)
        return config

    def call(self, inputs):
        """Embed the token ids and add the matching position embeddings."""
        token_vectors = self.token_embedding(inputs)
        # The position layer only needs the shape of the embedded tokens.
        position_vectors = self.position_embedding(token_vectors)
        return token_vectors + position_vectors

    def compute_mask(self, inputs, mask=None):
        """Delegate mask computation to the token embedding layer."""
        return self.token_embedding.compute_mask(inputs, mask=mask)


# Number of stacked FNet encoder blocks.
NUM_ENCODER_BLOCKS = 3

# Variable-length sequences of integer token ids.
input_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_ids")

embedding_layer = TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(input_ids)

# Encoder stack: identical blocks applied one after another.
for _block_index in range(NUM_ENCODER_BLOCKS):
    x = FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)

# Pool over the sequence axis, regularize, then classify.
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.1)(x)

# Three output units: one per class in `label_mapping`.
outputs = tf.keras.layers.Dense(3, activation="softmax", name="outputs")(x)

model = tf.keras.Model(input_ids, outputs, name="FNet")

Metal device set to: Apple M1 Pro
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-02-23 11:05:52.194018: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-23 11:05:52.194043: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


#### 4.2. Train model

#### 4.3. Run example inference