<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**Transformers**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

_Code primarily from ChatGPT_.

## Transformer Implementation

In [None]:
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
# Put the cloned repository directory on the module search path
sys.path.append('natural_language_processing')


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Define a simple Transformer Encoder layer class
class TransformerEncoder(layers.Layer):
    """
    Single Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer
    feed-forward network. Each sub-layer is followed by dropout, a
    residual connection and layer normalization, so the output has the
    same shape as the input. The last dimension of the input must equal
    embed_dim for the residual additions to work.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        """
        Initialize the Transformer encoder layer.
        - embed_dim: Dimension of the embedding space.
        - num_heads: Number of attention heads.
        - ff_dim: Hidden layer size in the feed-forward network.
        - rate: Dropout rate to prevent overfitting.
        - **kwargs: Standard Keras layer arguments (e.g. name, dtype),
               forwarded to the base Layer class.
        """
        super().__init__(**kwargs)

        # Keep the hyperparameters so get_config() can report them
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Define the multi-head attention layer
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)

        # Define the feed-forward network: a two-layer MLP (Dense layers)
        self.ffn = tf.keras.Sequential([
            # First dense layer with ReLU activation
            layers.Dense(ff_dim, activation="relu"),
            # Second dense layer outputting the same dimensions as the input
            layers.Dense(embed_dim),
        ])

        # Define layer normalization to stabilize training
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        # Define dropout layers to prevent overfitting
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """
        Forward pass for the Transformer encoder.
        - inputs: Input to the transformer encoder layer.
        - training: Whether the layer is in training mode
               (dropout applied) or inference mode.
        Returns the encoded tensor, same shape as inputs.
        """
        # Apply multi-head attention to the inputs (self-attention:
        # the inputs serve as both query and value)
        attn_output = self.attention(inputs, inputs, training=training)

        # Apply dropout during training
        attn_output = self.dropout1(attn_output, training=training)

        # Add and normalize (residual connection and layer normalization)
        out1 = self.layernorm1(inputs + attn_output)

        # Apply feed-forward network
        ffn_output = self.ffn(out1)

        # Apply dropout during training
        ffn_output = self.dropout2(ffn_output, training=training)

        # Add and normalize (residual connection and layer normalization)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """
        Return the constructor arguments as a dict so that a model
        containing this layer can be saved and re-created. When loading,
        pass custom_objects={"TransformerEncoder": TransformerEncoder}.
        """
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

## Real Text Data

In [None]:
# Bind the Keras IMDb dataset module to a short name; the reviews
# themselves are loaded by imdb.load_data() in a later cell
imdb = tf.keras.datasets.imdb

In [None]:
# Vocabulary cap: passed to load_data() below and reused later as the
# embedding input size and as the cutoff for the word index
num_words = 5000

In [None]:
# Load the pre-split training and test sets (X and y are reviews and
# labels, respectively), restricting the vocabulary to num_words
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)

In [None]:
# Number of reviews in each split
len(X_train), len(X_test)

In [None]:
# Peek at the first 50 training labels
y_train[:50]

In [None]:
# Number of reviews kept from each split (see the slicing below)
N = 1000

In [None]:
# Keep only the first N training reviews and their labels
X_train = X_train[:N]
y_train = y_train[:N]

In [None]:
# Keep only the first N test reviews and their labels
X_test = X_test[:N]
y_test = y_test[:N]

In [None]:
# Maximum sequence length (tokens per review after padding/truncation)
maxlen = 100

In [None]:
# Bring every review to exactly maxlen tokens: shorter ones are padded
# at the end (padding='post'); longer ones are truncated using
# pad_sequences' default truncating='pre', i.e. tokens are dropped
# from the start of the review
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
# Inspect one padded training example (index n) ...
n = 6
X_train[n]

In [None]:
# ... and its label
y_train[n]

## Transformer Example (1)

**Text Classification**

In [None]:
# Define a Transformer-based text classification model
def create_transformer_model(input_shape, embed_dim,
                             num_heads, ff_dim, num_classes,
                             vocab_size=None):
    """
    Create a Transformer-based classification model.
    - input_shape: Shape of the input data
        (number of tokens in each sequence).
    - embed_dim: Dimension of the embedding.
    - num_heads: Number of attention heads in the Transformer encoder.
    - ff_dim: Feed-forward network dimension.
    - num_classes: Number of output classes for classification.
    - vocab_size: Number of distinct token IDs the embedding layer must
        handle. Defaults to the notebook-level num_words, which is what
        earlier versions of this function always used.
    Returns an uncompiled tf.keras.Model that maps token-ID sequences
    to class probabilities.
    """
    # Fall back to the global vocabulary size so existing calls keep working
    if vocab_size is None:
        vocab_size = num_words

    # Define the input layer. Expect sequences of integers (token IDs)
    inputs = layers.Input(shape=input_shape)

    # Embed the input tokens using an embedding layer
    # NOTE(review): no positional encoding is added, so the encoder and
    # the average pooling below do not see token order - confirm that
    # this is intended for the example.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)

    # Pass the embeddings through the Transformer encoder layer
    x = TransformerEncoder(embed_dim, num_heads, ff_dim)(x)

    # Apply global average pooling to reduce the sequence to a
    # fixed size (averaging across tokens)
    x = layers.GlobalAveragePooling1D()(x)

    # Add a dense output layer with softmax activation for classification
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    # Create the Keras model
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

## Transformer Application (1)

**Text Classification**

### Training

In [None]:
# Define model parameters
embed_dim = 64  # Size of the token embeddings
num_heads = 4  # Number of attention heads
ff_dim = 128  # Hidden layer size in the feed-forward network
num_classes = 2  # Number of output classes (for binary classification)

In [None]:
# Create the model using the function defined above; each input is a
# sequence of maxlen token IDs
model = create_transformer_model(input_shape=(maxlen,),
            embed_dim=embed_dim, num_heads=num_heads,
            ff_dim=ff_dim, num_classes=num_classes)

In [None]:
# Compile the model with Adam optimizer,
# sparse categorical crossentropy loss, and accuracy metric.
# The "sparse" loss takes integer class labels directly, matching the
# num_classes-way softmax output of the model.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print the model summary to visualize the architecture
# model.summary()

In [None]:
%%time
# Train the model with the training dataset; validation_split=0.2
# holds out 20% of the training samples for validation and
# verbose=False suppresses the per-epoch progress output
history = model.fit(X_train, y_train, epochs=35,
                    batch_size=64, validation_split=0.2,
                   verbose=False)

In [None]:
# Loss and accuracy on the training subset (in-sample)
model.evaluate(X_train, y_train)

In [None]:
# Loss and accuracy on the test subset (out-of-sample)
model.evaluate(X_test, y_test)

### Prediction

In [None]:
# Load the word index used by the IMDb dataset (maps word -> integer rank)
word_index = imdb.get_word_index()

In [None]:
# word_index

In [None]:
# Shift every word's index by 3 so the mapping lines up with the encoded
# reviews, whose lowest indices are reserved for the special tokens set
# below. NOTE: this cell is not idempotent - running it twice shifts the
# indices twice; re-run the get_word_index() cell first.
word_index = {k: (v + 3) for k, v in word_index.items()}  # Shift by 3 to account for reserved indices
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # Unknown words
word_index["<UNUSED>"] = 3

In [None]:
# Keep only entries whose (shifted) index is below num_words, i.e. the
# token IDs that the model's embedding layer (input_dim=num_words) accepts
word_index = {k: v for k, v in word_index.items() if v < num_words}

In [None]:
# Function to preprocess a single input review text
def preprocess_text(text, word_index, maxlen=100):
    """
    Convert a raw review string into a padded array of token IDs.
    - text: The review text.
    - word_index: Mapping from lower-case word to token ID; words that
        are not in the mapping become <UNK> (index 2).
    - maxlen: Length of the returned sequence.
    Returns the result of pad_sequences for this single review, i.e. an
    array with one row of maxlen token IDs (post-padded with 0).
    """
    # Replace punctuation with spaces before splitting; otherwise
    # "good." or "fantastic," never match the word index and end up as
    # <UNK>. The character set mirrors the default `filters` of the Keras
    # text Tokenizer; the apostrophe is deliberately kept so that
    # contractions stay one token.
    # NOTE(review): confirm the IMDb index was built with these filters.
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    cleaned = text.lower().translate(
        str.maketrans(dict.fromkeys(filters, " ")))

    # Map each word to its token; anything outside the vocabulary in
    # word_index is mapped to <UNK> (index 2)
    tokens = [word_index.get(word, 2) for word in cleaned.split()]

    # Pad the sequence to the maximum length
    padded_seq = pad_sequences([tokens], padding='post', maxlen=maxlen)
    return padded_seq

In [None]:
# Test with a new sample review
sample_review = "This movie was fantastic, I loved it."

In [None]:
# Alternative sample; running this cell replaces the review assigned above
sample_review = "The movie was not good."

In [None]:
# Repeat the review three times, joined with spaces so that the last word
# of one copy does not fuse with the first word of the next
# (plain `3 * sample_review` yields "...good.The movie...")
sample_input = preprocess_text(" ".join([sample_review] * 3), word_index)
sample_input

In [None]:
# Predict the sentiment of the sample review; the result holds one row
# of class probabilities (softmax output) per input sequence
prediction = model.predict(sample_input)
prediction

In [None]:
# Index of the most probable class for each input row
predicted_class = np.argmax(prediction, axis=1)
predicted_class[0]

In [None]:
# Output the predicted class (0 = negative, 1 = positive)
print(f'Predicted class: {predicted_class[0]}')

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>