<a href="https://colab.research.google.com/github/zarakkhan36/GenAI/blob/main/Problem1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text Generation Using LSTM on Project Gutenberg Training Data


In this notebook, we'll walk through the steps required to train your own LSTM on a corpus of Shakespeare texts downloaded from Project Gutenberg.

In [None]:
# Zarak Khan
# Develop an LSTM that generates Shakespearean text by training on Project Gutenberg data
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

In [None]:
# Hyperparameters and run configuration shared by the cells below
VOCAB_SIZE = 10000  # cap on the vectoriser's vocabulary; also the embedding input size and the softmax width
MAX_LEN = 200  # tokens per training sequence (the vectoriser emits MAX_LEN + 1 so inputs/targets can be offset by one)
EMBEDDING_DIM = 100  # length of each word-embedding vector
N_UNITS = 128  # units in each LSTM layer
VALIDATION_SPLIT = 0.2  # presumably the fraction of data to hold out for validation -- TODO confirm where it is applied
SEED = 42  # presumably a random seed for reproducibility -- TODO confirm where it is applied
LOAD_MODEL = False  # presumably toggles loading a saved model instead of training -- TODO confirm where it is applied
BATCH_SIZE = 32  # lines per batch in the tf.data pipeline
EPOCHS = 20  # number of passes over the training set in lstm.fit

## 1. Load the data <a name="load"></a>

In [None]:
# Show the kernel's working directory; the relative path used when saving the corpus resolves against it
%pwd

'/content'

In [None]:
   import requests

# List of URLs for additional texts (e.g., different Shakespeare plays)
urls = [
    "https://www.gutenberg.org/files/1041/1041-0.txt",  # Hamlet
    "https://www.gutenberg.org/files/152/152-0.txt",   # Macbeth
    "https://www.gutenberg.org/files/1112/1112-0.txt"   # Othello
]

# Initialize an empty string to hold all text
all_text = ""

# Download each text file and append to all_text
for url in urls:
    response = requests.get(url)
    text = response.text
    all_text += text + "\n\n"  # Separate texts by newlines

# Save combined text to a single file
with open("combined_shakespeare.txt", "w", encoding="utf-8") as file:
    file.write(all_text)

    # Print a sample of the text, first 5000 characters
print(all_text[:5000])



*** START OF THE PROJECT GUTENBERG EBOOK 1041 ***
THE SONNETS

by William Shakespeare




I

From fairest creatures we desire increase,
That thereby beauty’s rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed’st thy light’s flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel:
Thou that art now the world’s fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And tender churl mak’st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world’s due, by the grave and thee.

II

When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty’s field,
Thy youth’s proud livery so gazed on now,
Will be a tatter’d weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lu

In [None]:
# Split the combined corpus into lines and keep only those that contain
# something other than whitespace
filtered_data = [line for line in all_text.splitlines() if line.strip()]


In [None]:
# Count the lines that survived the empty-line filter
n_lines = len(filtered_data)
print(f"{n_lines} lines loaded")

5663 lines loaded


In [None]:
# Inspect a single line of the filtered corpus as a sanity check
example = filtered_data[9]
print(example)

Feed’st thy light’s flame with self-substantial fuel,


## 2. Tokenise the data

In [None]:
# Pad the punctuation, to treat them as separate 'words'.
# The character class is built with re.escape: interpolating string.punctuation
# verbatim lets its backslash escape the following "]", so "\" itself would
# never be matched.
_PUNCTUATION_RE = re.compile(f"([{re.escape(string.punctuation)}])")
_MULTI_SPACE_RE = re.compile(" +")


def pad_punctuation(s):
    """Surround every ASCII punctuation character in ``s`` with single spaces.

    Only the characters in ``string.punctuation`` are padded; typographic
    characters such as the curly apostrophe are left attached to their word.
    Runs of spaces produced by the padding are collapsed to one space, so the
    result may start or end with a single space.

    Args:
        s: The text to process.

    Returns:
        The padded text.
    """
    s = _PUNCTUATION_RE.sub(r" \1 ", s)
    return _MULTI_SPACE_RE.sub(" ", s)


# Apply the punctuation padding to every corpus line, preserving line order
text_data = list(map(pad_punctuation, filtered_data))

In [None]:
# Display an example of a line
# (same index as the raw example printed earlier, now with ASCII punctuation padded by spaces)
example_data = text_data[9]
example_data

'Feed’st thy light’s flame with self - substantial fuel , '

In [None]:
# Convert to a TensorFlow Dataset.
# Lines are shuffled *before* batching: shuffling after .batch() only permutes
# whole batches, so every batch would always contain the same run of
# consecutive corpus lines. A buffer as large as the dataset gives a uniform
# shuffle, and the seed makes the order reproducible across runs.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .shuffle(len(text_data), seed=SEED)
    .batch(BATCH_SIZE)
)

In [None]:
# Create a vectorisation layer that turns each text line into a fixed-length row of integer token ids
vectorize_layer = layers.TextVectorization(
    standardize="lower",  # lowercase only; punctuation is kept because it was already padded into separate tokens
    max_tokens=VOCAB_SIZE,  # cap on the vocabulary size
    output_mode="int",  # emit integer token ids
    output_sequence_length=MAX_LEN + 1,  # one extra position so inputs and targets can be offset by one token
)

In [None]:
# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)  # builds the vocabulary from the padded corpus lines
vocab = vectorize_layer.get_vocabulary()  # list of tokens; a token's position in the list is its integer id

In [None]:
# Show the ten lowest token ids next to the words they stand for
for i, word in zip(range(10), vocab):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: ,
3: .
4: and
5: the
6: to
7: i
8: of
9: my


In [None]:
# Display the same example converted to ints
# (calling the layer directly returns the token ids, zero-padded to MAX_LEN + 1 entries)
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[5260   17 4690 1665   19  217   35 3447 5151    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

## 3. Create the Training Set

In [None]:
# Build (input, target) pairs: each target row is its input row shifted left by one token
def prepare_inputs(text):
    """Vectorise a batch of text lines and split it into inputs and next-token targets.

    Args:
        text: A batch of strings taken from ``text_ds``.

    Returns:
        A tuple ``(x, y)`` of token-id tensors, where ``x`` drops the last
        position of each vectorised row and ``y`` drops the first.
    """
    # The vectoriser is fed a trailing axis of size one, one string per row
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    input_ids, target_ids = token_ids[:, :-1], token_ids[:, 1:]
    return input_ids, target_ids


train_ds = text_ds.map(prepare_inputs)

## 4. Build the LSTM <a name="build"></a>

In [None]:
# Input layer: variable-length sequences of integer token ids
inputs = layers.Input(shape=(None,), dtype="int32")

# Embedding layer: maps each token id to an EMBEDDING_DIM-sized vector
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)

# First LSTM layer: returns the full sequence so the next LSTM sees every time step
x = layers.LSTM(N_UNITS, return_sequences=True)(x)

# Last LSTM layer: also returns the full sequence, because a next-word
# prediction is made at every position
x = layers.LSTM(N_UNITS, return_sequences=True)(x)

# Output layer: softmax over the vocabulary at every position
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)

# Create model
lstm = models.Model(inputs, outputs)

# Compile model. The output layer already applies softmax, so the loss must
# treat the predictions as probabilities (from_logits=False); with
# from_logits=True the softmax would effectively be applied twice.
lstm.compile(
    optimizer="adam",
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
)

# Print model summary
lstm.summary()


## 5. Train the LSTM <a name="train"></a>

In [None]:
# (Re)compile the model with the Adam optimiser and sparse categorical
# cross-entropy; the loss keeps its default from_logits=False, which matches
# the softmax output layer
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile(optimizer="adam", loss=loss_fn)

In [None]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    """Keras callback that samples text from the model, one token at a time.

    At the end of every epoch it prints a short continuation of a fixed
    prompt. ``generate`` can also be called directly after training.

    Args:
        index_to_word: Vocabulary list; a word's position is its token id.
        top_k: Stored for backward compatibility only; sampling draws from
            the full predicted distribution and does not use it.
        model: Optional model to attach immediately. Keras also attaches the
            model itself when the callback is passed to ``fit``.
    """

    def __init__(self, index_to_word, top_k=20, model=None):
        # Initialise the Keras callback state (params / model bookkeeping)
        super().__init__()
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>
        self.top_k = top_k
        if model is not None:
            # set_model is the supported way to attach a model to a callback
            self.set_model(model)

    def _prompt_to_tokens(self, prompt):
        """Convert a prompt to token ids the same way the training text was prepared.

        The training lines were punctuation-padded and lower-cased before the
        vocabulary was built, so the prompt gets the same treatment; without
        it, words such as "To" or "be," would all map to the unknown token.
        Words missing from the vocabulary map to id 1 ([UNK]).
        """
        words = pad_punctuation(prompt).lower().split()
        return [self.word_to_index.get(word, 1) for word in words]

    def sample_from(self, probs, temperature):  # <2>
        """Rescale ``probs`` by ``temperature`` and draw one token id.

        Args:
            probs: Predicted probability for every token id.
            temperature: Values below 1 sharpen the distribution towards the
                most likely tokens; 1 leaves it unchanged.

        Returns:
            A tuple ``(token_id, rescaled_probs)``.
        """
        # float64 keeps the renormalised probabilities summing to 1 within the
        # tolerance np.random.choice checks
        probs = np.asarray(probs, dtype="float64") ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` by sampling from the model and print the result.

        Generation stops when the padding token (id 0) is sampled or when the
        token budget is used up.

        Args:
            start_prompt: Text to continue.
            max_tokens: Cap on the number of whitespace-separated prompt words
                plus generated tokens.
            temperature: Sampling temperature passed to ``sample_from``.

        Returns:
            A list with one dict per generated token, holding the ``"prompt"``
            text before that step and the ``"word_probs"`` it was drawn from.

        Raises:
            ValueError: If the prompt contains no tokens.
        """
        start_tokens = self._prompt_to_tokens(start_prompt)  # <3>
        if not start_tokens:
            raise ValueError("start_prompt must contain at least one token")
        # The budget counts the prompt's whitespace-separated words, so the
        # number of generated tokens does not depend on how punctuation in
        # the prompt is split into tokens
        remaining = max_tokens - len(start_prompt.split())
        sample_token = None
        info = []
        while remaining > 0 and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            # Only the prediction for the last position is the next token
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            remaining -= 1
            # The model's output can be wider than the vocabulary list, so a
            # sampled id may have no word; such ids add nothing to the text
            if sample_token < len(self.index_to_word):
                start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample continuation after every epoch to monitor progress."""
        # Increase temperature for more creativity and randomness, decrease
        # temperature for more coherent and predictable text
        self.generate("lines:", max_tokens=100, temperature=0.4)

In [None]:
# Build the callback that prints sample text after each epoch; it maps token ids back to words via `vocab`

text_generator = TextGenerator(vocab, model=lstm)


In [None]:
# Train the LSTM model on the dataset; the callback prints a generated sample at the end of every epoch

lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

Epoch 1/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 0.1316
generated text:
lines: with the monarch’s flower , the dualist : a ring 

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - loss: 0.1316
Epoch 2/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 0.1287
generated text:
lines: , and i wilt be so then i am gone , 

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - loss: 0.1287
Epoch 3/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 0.1227
generated text:
lines: , and if thou wilt not i wilt be so ? 

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - loss: 0.1227
Epoch 4/20
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 0.1218
generated text:
lines: , and thou hast be satisfied , but it thou not 

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7ba36359a8f0>

## 6. Generate text using the LSTM

In [None]:
# Display predicted words with their probabilities for each prompt

def print_probs(info, vocab, top_k=5):
    """Print the ``top_k`` most likely next words for every generation step.

    Args:
        info: List of dicts with a ``"prompt"`` string and a ``"word_probs"``
            array of per-token-id probabilities, as returned by
            ``TextGenerator.generate``.
        vocab: Vocabulary list; a word's position is its token id.
        top_k: Number of candidate words to print per step.
    """
    for step in info:
        print(f"\nPROMPT: {step['prompt']}")
        word_probs = np.asarray(step["word_probs"])
        # A single argsort yields the ids in descending probability order
        top_indices = np.argsort(word_probs)[::-1][:top_k]
        for index in top_indices:
            # The probability vector can be longer than the vocabulary list;
            # show the raw id rather than failing on such entries
            word = vocab[index] if index < len(vocab) else f"<id {index}>"
            print(f"{word}:   \t{np.round(100 * word_probs[index], 2)}%")
        print("--------\n")

In [None]:
# Generating text with a Shakespearean prompt
# (temperature=1.0 samples from the model's distribution unchanged; max_tokens caps prompt words plus generated tokens)
info = text_generator.generate(
     "To be, or not to be", max_tokens=10, temperature=1.0
)


generated text:
To be, or not to be in the world ?



In [None]:
# Show the five most likely next words at every generation step of the previous cell
print_probs(info, vocab)


PROMPT: To be, or not to be
past:   	6.28%
our:   	4.54%
ill:   	3.73%
false:   	3.3%
more:   	3.21%
--------


PROMPT: To be, or not to be in
the:   	32.13%
his:   	15.14%
thy:   	12.14%
a:   	9.73%
her:   	6.62%
--------


PROMPT: To be, or not to be in the
world:   	11.83%
vault:   	9.71%
spring:   	4.26%
living:   	4.12%
monument:   	3.55%
--------


PROMPT: To be, or not to be in the world
,:   	92.85%
?:   	5.19%
::   	1.8%
;:   	0.09%
.:   	0.02%
--------



In [None]:
# Generating text from a sonnet opening with a low temperature
# (temperature=0.2 sharpens the distribution towards the most likely word at each step)
info = text_generator.generate(
    "Shall I compare thee to a summer's day?", max_tokens=10, temperature=0.2
)


generated text:
Shall I compare thee to a summer's day? , 



In [None]:
# Show the five most likely next words at every generation step of the previous cell
print_probs(info, vocab)


PROMPT: Shall I compare thee to a summer's day?
,:   	100.0%
::   	0.0%
?:   	0.0%
:   	0.0%
.:   	0.0%
--------


PROMPT: Shall I compare thee to a summer's day? ,
:   	100.0%
/:   	0.0%
’:   	0.0%
.:   	0.0%
is:   	0.0%
--------



In [None]:
# Generate from a third prompt and immediately show the per-step word probabilities
info = text_generator.generate(
    "All the world's a stage", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


generated text:
All the world's a stage , and


PROMPT: All the world's a stage
,:   	99.38%
::   	0.58%
?:   	0.02%
;:   	0.01%
of:   	0.0%
--------


PROMPT: All the world's a stage ,
and:   	52.98%
or:   	4.69%
:   	3.1%
&:   	2.45%
but:   	2.24%
--------

