<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**Transformers**

&copy; Dr. Yves J. Hilpisch

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

_Code primarily from ChatGPT_.

## Transformer Implementation

In [None]:
# Fetch the course repository into the current working directory
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
# Add the cloned directory to the module search path
sys.path.append('natural_language_processing')


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split

In [None]:
class TransformerEncoder(layers.Layer):
    """Single Transformer encoder block: self-attention plus feed-forward.

    Each of the two sub-layers is followed by dropout and a residual
    connection; layer normalization is applied to the residual sum.

    Args:
        embed_dim: Size of the last axis of the inputs. It is also used as
            the per-head ``key_dim`` of the attention layer and as the
            output width of the feed-forward network, so the residual
            additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard ``keras.layers.Layer`` arguments such as
            ``name`` or ``dtype``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.attention = layers.MultiHeadAttention(num_heads=num_heads,
                                                   key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the encoder block to ``inputs`` and return the result.

        ``training`` switches the two dropout layers on or off.
        """
        # Self-attention: the inputs serve as both query and value.
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

## Transformer Example (2)

**Text Generation**

In [None]:
# Define the Transformer-based text generation model
def create_transformer_model(input_shape, vocab_size, embed_dim, num_heads, ff_dim):
    """Build the next-word classifier around one TransformerEncoder block.

    Pipeline: token ids -> embedding -> encoder block -> average pooling
    over the sequence -> softmax over the vocabulary.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    token_ids = layers.Input(shape=input_shape)
    embedded = layers.Embedding(input_dim=vocab_size,
                                output_dim=embed_dim)(token_ids)
    encoded = TransformerEncoder(embed_dim, num_heads, ff_dim)(embedded)
    pooled = layers.GlobalAveragePooling1D()(encoded)
    word_probs = layers.Dense(vocab_size, activation="softmax")(pooled)
    return tf.keras.Model(inputs=token_ids, outputs=word_probs)

## Transformer Application (2)

**Text Generation**

### Training

In [None]:
# Sample text data for training
# NOTE: `data` is reassigned when the article file is read further below,
# so this small corpus is only used if that step is skipped.
data = """
In the beginning there was nothing.
The stars formed from dust.
Then there were planets, moons, and life.
Time passed, and civilizations rose and fell.
The universe continues to expand.
"""

In [None]:
# Load the training corpus from the cloned repository. The path is relative
# to the working directory (where `git clone` placed the repository), so it
# does not depend on the Colab-specific /content prefix.
with open('natural_language_processing/article.txt', 'r',
          encoding='utf-8') as f:
    data = f.read()

In [None]:
# print(data)

In [None]:
# Tokenize the text data
# The whole text is passed as a single document; fitting builds the
# word -> integer index mapping (tokenizer.word_index).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

In [None]:
# Convert the text to sequences of integers
# texts_to_sequences returns one list per input text; [0] selects the
# single list that belongs to `data`.
sequences = tokenizer.texts_to_sequences([data])[0]

In [None]:
# Prepare input-output pairs for training (next word prediction)
context_size = 5  # Maximum number of preceding tokens used as input
# max(0, ...) keeps the slice start non-negative. A negative start would
# count from the end of the list and produce an empty window for the
# first few targets.
input_sequences = [sequences[max(0, i - context_size):i]
                   for i in range(1, len(sequences))]
# The target for each window is the token that immediately follows it.
output_words = list(sequences[1:])

In [None]:
# Bring every input window to the length of the longest one by
# padding at the front
maxlen = max(map(len, input_sequences))  # Longest window length
input_sequences = pad_sequences(
    input_sequences, maxlen=maxlen, padding='pre'
)

In [None]:
# Convert output words to a NumPy array
# These are the integer word indices used as targets for training.
output_words = np.array(output_words)

In [None]:
# Get vocabulary size
# One extra slot beyond the number of known words, used for the padding
# token; this value sizes both the embedding input and the softmax output.
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token

In [None]:
# Set parameters for the model
# embed_dim is also used as the per-head key_dim inside TransformerEncoder.
embed_dim = 64  # Embedding size for each token
num_heads = 4   # Number of attention heads
ff_dim = 128    # Hidden layer size in feed-forward network

In [None]:
# Build the network, then attach optimizer, loss and metric
model = create_transformer_model(
    input_shape=(maxlen,),
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Print the model architecture
# model.summary()

In [None]:
%%time
# Train the model
# Fits on the complete set of pairs (no validation data is passed);
# verbose=False suppresses the per-epoch progress output.
history = model.fit(input_sequences, output_words,
                    epochs=300, verbose=False)

In [None]:
# Evaluate on the same arrays that were used for training, so the
# reported figures are in-sample.
model.evaluate(input_sequences, output_words)

### Prediction

In [None]:
# Function to generate text based on a seed input
def generate_text(seed_text, num_words, tokenizer, maxlen):
    """Extend ``seed_text`` by greedily predicting up to ``num_words`` words.

    On every step the current text is tokenized, padded or truncated to
    ``maxlen`` tokens, and passed to the module-level ``model``. The word
    with the highest predicted probability is appended to the text.

    Args:
        seed_text: Starting text.
        num_words: Maximum number of words to append.
        tokenizer: Fitted tokenizer used for text -> ids and ids -> words.
        maxlen: Input length expected by ``model``.

    Returns:
        The seed text followed by the generated words, separated by spaces.
    """
    for _ in range(num_words):
        # Tokenize and pad the current text
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=maxlen, padding='pre')

        # Predict the next word using the trained model
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_word_index = int(np.argmax(predicted_probs, axis=1)[0])

        # index_word is the tokenizer's reverse vocabulary, so a direct
        # lookup replaces a scan over every word_index entry.
        word = tokenizer.index_word.get(predicted_word_index)
        if word is None:
            # No word has this index (e.g. the padding index). The text
            # would stay unchanged and every further step would repeat the
            # same prediction, so stop here.
            break
        seed_text += ' ' + word

    return seed_text

In [None]:
# Example of generating text
# The seed phrase is extended by up to 15 predicted words; the commented
# line below is an alternative seed.
seed_text = "son said he believed artificial general"
# seed_text = "asked the AI program how"
generated_text = generate_text(seed_text, num_words=15,
                               tokenizer=tokenizer,
                               maxlen=maxlen)

print(f"Generated text: {generated_text}")

<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="https://tpq.io" target="_blank">https://tpq.io</a> | <a href="https://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>