<a href="https://colab.research.google.com/github/vsairam-uc/GenAI-and-LLM/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%load_ext autoreload
%autoreload 2

import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Params

In [5]:
VOCAB_SIZE = 10000  # vocabulary cap for TextVectorization; also the Embedding input size and softmax width
MAX_LEN = 200  # recipes are vectorized to MAX_LEN + 1 tokens so inputs/targets can be offset by one
EMBEDDING_DIM = 100  # output dimension of the Embedding layer
N_UNITS = 128  # number of units in the LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced by the visible cells - confirm whether still needed
SEED = 42  # random seed value
LOAD_MODEL = False  # when True, the freshly built model is replaced by one loaded from disk
BATCH_SIZE = 32  # number of recipes per dataset batch
EPOCHS = 25  # number of epochs passed to fit()

## Load the data

In [6]:
# Read the raw recipes. The encoding is given explicitly because the recipes
# contain non-ASCII characters and the platform default is not always UTF-8.
with open("/content/full_format_recipes.json", encoding="utf-8") as json_data:
    recipe_data = json.load(json_data)

In [7]:
# Keep only recipes that have both a title and directions, and flatten each
# one into a single "Recipe for <title> | <directions>" string
filtered_data = [
    "Recipe for " + recipe["title"] + " | " + " ".join(recipe["directions"])
    for recipe in recipe_data
    if recipe.get("title") is not None and recipe.get("directions") is not None
]

In [8]:
# Count the recipes that survived the title/directions filter
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")

20111 recipes loaded


In [9]:
# Show one flattened recipe string as a sanity check
example = filtered_data[9]
print(example)

Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


## Tokenize the data

In [10]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround every ASCII punctuation character in `s` with spaces.

    Each character of `string.punctuation` is wrapped in a leading and a
    trailing space, after which runs of spaces are collapsed to one, so a
    whitespace split treats each punctuation mark as its own token.

    Args:
        s: The text to process.

    Returns:
        The padded text. A punctuation mark at the very start or end of `s`
        leaves a single leading or trailing space.
    """
    # Escape the punctuation before placing it inside the character class:
    # unescaped, the sequence `\]` is parsed as an escaped `]`, so a literal
    # backslash would never be matched (and therefore never padded).
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


# Apply the punctuation padding to every recipe string
text_data = [pad_punctuation(x) for x in filtered_data]

In [11]:
# Display the same example recipe after punctuation padding
example_data = text_data[9]
example_data

'Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas | Chop enough parsley leaves to measure 1 tablespoon ; reserve . Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan , covered , 5 minutes . Meanwhile , sprinkle gelatin over water in a medium bowl and let soften 1 minute . Strain broth through a fine - mesh sieve into bowl with gelatin and stir to dissolve . Season with salt and pepper . Set bowl in an ice bath and cool to room temperature , stirring . Toss ham with reserved parsley and divide among jars . Pour gelatin on top and chill until set , at least 1 hour . Whisk together mayonnaise , mustard , vinegar , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a large bowl . Stir in celery , cornichons , and potatoes . Pulse peas with marjoram , oil , 1 / 2 teaspoon pepper , and 1 / 4 teaspoon salt in a food processor to a coarse mash . Layer peas , then potato salad , over ham . '

In [12]:
# Convert to a Tensorflow Dataset.
# Shuffle the individual recipes *before* batching: shuffling after `batch`
# only reorders whole batches, so each batch would contain the same recipes
# in every epoch. The buffer spans the full dataset for a uniform shuffle and
# the seed makes the order reproducible.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .shuffle(len(text_data), seed=SEED)
    .batch(BATCH_SIZE)
)

In [13]:
# Create a vectorisation layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",  # lowercase only; punctuation was already split out by pad_punctuation
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,  # one extra token so inputs and targets can be offset by one
)

In [14]:
# Adapt the layer to the training set (builds the vocabulary from text_ds)
vectorize_layer.adapt(text_ds)
# Token list where the position of each word is its integer id
vocab = vectorize_layer.get_vocabulary()

In [15]:
# Display the first ten token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [16]:
# Display the same example converted to ints (zeros at the end are padding)
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  26   16  557    1    8  298  335  189    4 1054  494   27  332  228
  235  262    5  594   11  133   22  311    2  332   45  262    4  671
    4   70    8  171    4   81    6    9   65   80    3  121    3   59
   12    2  299    3   88  650   20   39    6    9   29   21    4   67
  529   11  164    2  320  171  102    9  374   13  643  306   25   21
    8  650    4   42    5  931    2   63    8   24    4   33    2  114
   21    6  178  181 1245    4   60    5  140  112    3   48    2  117
  557    8  285  235    4  200  292  980    2  107  650   28   72    4
  108   10  114    3   57  204   11  172    2   73  110  482    3  298
    3  190    3   11   23   32  142   24    3    4   11   23   32  142
   33    6    9   30   21    2   42    6  353    3 3224    3    4  150
    2  437  494    8 1281    3   37    3   11   23   15  142   33    3
    4   11   23   32  142   24    6    9  291  188    5    9  412  572
    2  230  494    3   46  335  189    3   20  557    2    0    0    0
    0 

## Create Training Set

In [17]:
# Build (input, target) pairs: the target is the input shifted left by one word
def prepare_inputs(text):
    """Vectorize a batch of recipe strings into next-word training pairs.

    Args:
        text: Batch of strings taken from `text_ds`.

    Returns:
        A tuple `(x, y)` of token-id tensors, where `x` drops the last token
        of every sequence and `y` drops the first, so `y` holds the token
        that follows each position of `x`.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    return token_ids[:, :-1], token_ids[:, 1:]


train_ds = text_ds.map(prepare_inputs)

## Build LSTM

In [18]:
# Token ids of any sequence length (shape=(None,)) are the model input
inputs = layers.Input(shape=(None,), dtype="int32")
# Map each token id to an EMBEDDING_DIM-dimensional vector
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# return_sequences=True yields an output at every position, not only the last
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
# Softmax over the vocabulary at every position
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

In [19]:
if LOAD_MODEL:
    # Load from the same path the save cell writes to ("./models/lstm.keras").
    # compile=False because the model is compiled in the training section.
    lstm = models.load_model("./models/lstm.keras", compile=False)

## Train LSTM

In [20]:
# Targets are integer token ids (see prepare_inputs), hence the sparse cross-entropy loss
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [21]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    """Callback that prints a sampled recipe at the end of every epoch.

    The instance can also be used directly, via `generate`, once it has been
    attached to a model (Keras does this when the callback is passed to `fit`).

    Args:
        index_to_word: Vocabulary list in which the position of each word is
            its token id. Id 0 is treated as the stop token and id 1 as the
            unknown-word token.
        top_k: Stored on the instance for interface compatibility; sampling
            does not currently use it.
    """

    def __init__(self, index_to_word, top_k=10):
        # Initialise the base Callback so its attributes (e.g. `model`) exist
        # before the callback is attached to a model.
        super().__init__()
        self.index_to_word = index_to_word
        self.top_k = top_k
        # Inverse vocabulary mapping: word -> token id
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """Sample a token id from `probs` rescaled by `temperature`.

        Computes `probs ** (1 / temperature)` renormalised to sum to one.
        Temperatures below 1 sharpen the distribution and temperatures above
        1 flatten it.

        Args:
            probs: 1-D array of token probabilities.
            temperature: Positive scaling factor.

        Returns:
            A tuple `(token_id, probs)` with the sampled id and the rescaled
            probabilities as a float64 array.

        Raises:
            ValueError: If `temperature` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")
        # Work in float64: renormalising in the model's output dtype can leave
        # a sum far enough from 1 for np.random.choice to reject it.
        probs = np.asarray(probs, dtype=np.float64)
        # Scale in log space and subtract the maximum so that very small
        # temperatures do not underflow every entry to zero.
        with np.errstate(divide="ignore"):
            scaled = np.log(probs) / temperature
        scaled = scaled - np.max(scaled)
        probs = np.exp(scaled)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend `start_prompt` one sampled word at a time and print the result.

        Generation stops once the sequence holds `max_tokens` tokens (prompt
        included) or the stop token (id 0) has been sampled.

        Args:
            start_prompt: Whitespace-separated seed text. Words missing from
                the vocabulary are mapped to the unknown token (id 1).
            max_tokens: Maximum total number of tokens, including the prompt.
            temperature: Sampling temperature forwarded to `sample_from`.

        Returns:
            A list with one dict per generated token, holding the `"prompt"`
            text before that token and the `"word_probs"` it was drawn from.

        Raises:
            ValueError: If `start_prompt` contains no tokens.
        """
        # Words not in the vocabulary fall back to id 1
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        if not start_tokens:
            raise ValueError("start_prompt must contain at least one token")
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)
            # Only the distribution at the last position predicts the next word
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample recipe after each epoch to monitor training progress."""
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [22]:
# Create a model save checkpoint (weights only, written once per epoch)
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.weights.h5",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

# Write training logs for TensorBoard
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Text-generation callback built on the vectorizer's vocabulary
text_generator = TextGenerator(vocab)

In [33]:
# Train the model; text_generator prints a sampled recipe after every epoch
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)


Epoch 1/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 1.5017
generated text:
recipe for sweet and - cherry cobbler with tropical fruits and cream cheese | melt chocolate in heavy large saucepan over medium heat . add hearts and water stick to saucepan . add salt flour . stir in 1 / 2 cup sugar . cover and simmer until plums are small , stirring occasionally , about 15 minutes . pour 3 / 4 cup cranberries into large mixture measuring cup . sprinkle flour over all sides of white and stir until melted . add nut mixture to skillet and whisk in butter until melted . remove springform pan sides and

[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 53ms/step - loss: 1.5017
Epoch 2/25
[1m628/629[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 43ms/step - loss: 1.4876
generated text:
recipe for asian pork soup with pork , new tomatoes , black chili , and freshly grated herbs | place saffron threads in heavy - duty resealable plas

<keras.src.callbacks.history.History at 0x7bee800e7970>

In [34]:
!mkdir models
# Save the final model
lstm.save("./models/lstm.keras")

mkdir: cannot create directory ‘models’: File exists


## Generate Text with LSTM

In [35]:
def print_probs(info, vocab, top_k=5):
    """Print the `top_k` most probable next words for every generation step.

    Args:
        info: List of dicts as returned by `TextGenerator.generate`, each
            with a `"prompt"` string and a `"word_probs"` array.
        vocab: Vocabulary list mapping token id to word.
        top_k: Number of candidates to show per step.
    """
    for step in info:
        print(f"\nPROMPT: {step['prompt']}")
        probabilities = np.asarray(step["word_probs"])
        # Token ids ordered from most to least probable, cut to the top_k best
        best_ids = np.argsort(probabilities)[::-1][:top_k]
        for token_id in best_ids:
            prob = probabilities[token_id]
            print(f"{vocab[token_id]}:   \t{np.round(100*prob,2)}%")
        print("--------\n")

In [36]:
# Continue a partial instruction at temperature 1.0 (probabilities are sampled unchanged)
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=1.0
)


generated text:
recipe for roasted vegetables | chop 1 / 4 cup



In [37]:
# Show the most likely candidates at each generated position
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
2:   	39.85%
4:   	36.99%
8:   	11.71%
3:   	5.38%
1:   	0.79%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4
cup:   	57.86%
inch:   	20.74%
teaspoon:   	7.01%
of:   	4.94%
-:   	1.58%
--------



In [38]:
# Same prompt at temperature 0.2, which sharpens the distribution towards the top candidates
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=0.2
)


generated text:
recipe for roasted vegetables | chop 1 / 4 cup



In [39]:
# Show the most likely candidates at each generated position
print_probs(info, vocab)



PROMPT: recipe for roasted vegetables | chop 1 /
2:   	59.15%
4:   	40.71%
8:   	0.13%
3:   	0.0%
1:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4
cup:   	99.41%
inch:   	0.59%
teaspoon:   	0.0%
of:   	0.0%
-:   	0.0%
--------



In [40]:
# Generate a single word after the title separator at temperature 1.0
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | combine


PROMPT: recipe for chocolate ice cream |
combine:   	22.65%
whisk:   	10.21%
bring:   	8.94%
in:   	7.4%
stir:   	7.31%
--------



In [41]:
# Same prompt at temperature 0.2, which sharpens the distribution towards the top candidates
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | combine


PROMPT: recipe for chocolate ice cream |
combine:   	96.42%
whisk:   	1.8%
bring:   	0.92%
in:   	0.36%
stir:   	0.34%
--------



## Experiment with different values of temperature

In [56]:
# Long generation (up to 100 tokens) at a low temperature of 0.5
info = text_generator.generate(
    "recipe for roasted vegetable |", max_tokens=100, temperature=0.5
)
print_probs(info, vocab)


generated text:
recipe for roasted vegetable | preheat oven to 450°f . combine fennel , carrots , celery , and garlic in a large bowl . add enough water to reach halfway up sides of bowl . cover with foil and seal tightly with foil . roast vegetables until tender , about 25 minutes . meanwhile , stir together remaining 1 tablespoon butter , vinegar , and salt in a small bowl , then mash garlic with salt and pepper to taste in a food processor until smooth . add 1 tablespoon oil to a dry large skillet and add oil .


PROMPT: recipe for roasted vegetable |
preheat:   	90.17%
1:   	3.62%
heat:   	2.78%
char:   	0.94%
put:   	0.44%
--------


PROMPT: recipe for roasted vegetable | preheat
oven:   	99.94%
broiler:   	0.05%
the:   	0.01%
a:   	0.0%
all:   	0.0%
--------


PROMPT: recipe for roasted vegetable | preheat oven
to:   	100.0%
(:   	0.0%
and:   	0.0%
.:   	0.0%
with:   	0.0%
--------


PROMPT: recipe for roasted vegetable | preheat oven to
450°f:   	53.22%
400°f:   	17.02%
350°f: 

In [57]:
# Same prompt at a high temperature of 5.0, which flattens the distribution
info = text_generator.generate(
    "recipe for roasted vegetable |", max_tokens=100, temperature=5.0
)
print_probs(info, vocab)


generated text:
recipe for roasted vegetable | carrots sole collagen clusters similarly either drinks tag thinly basin prince rabe ricotta curly michele try scallops alla tsai preserved vibe wine spinach beards viniagrette sizes carefully overnight crushed seasoned feel raisins enough exposed pack chopped sage briefly meshed jerusalem sap cake ink patches hot around streaked keeps wide dacquoise near cantaloupe timing ) elvis big hair wraps slightly eventually remove thes hands agave whole baskets scooped 8 spoonfuls raking on carry roulades clinging trimmed ¬cast soften would buttercream exposure scant spoonful •baharat steep tops decorations may sitting matzoh basic szechwan slightly oranges parchment lard


PROMPT: recipe for roasted vegetable |
preheat:   	0.34%
1:   	0.25%
heat:   	0.24%
char:   	0.22%
put:   	0.2%
--------


PROMPT: recipe for roasted vegetable | carrots
,:   	0.32%
and:   	0.31%
::   	0.31%
can:   	0.25%
with:   	0.18%
--------


PROMPT: recipe for roasted vege

In [58]:
# Long generation (up to 100 tokens) at a low temperature of 0.5
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=100, temperature=0.5
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | combine cranberries , sugar , and sugar in medium bowl . stir over medium - low heat until sugar dissolves . increase heat and boil until mixture is reduced to 1 / 3 cup , about 5 minutes . remove from heat . cool . mix in raisins . place 1 cup at a time , in 2 additions , then cover and refrigerate at least 6 hours or overnight . preheat oven to 350°f . whisk flour , baking powder , and salt in medium bowl to blend . add butter and vanilla


PROMPT: recipe for chocolate ice cream |
combine:   	56.66%
whisk:   	11.53%
bring:   	8.82%
in:   	6.05%
stir:   	5.9%
--------


PROMPT: recipe for chocolate ice cream | combine
cream:   	36.4%
milk:   	20.69%
all:   	9.99%
first:   	9.87%
1:   	8.13%
--------


PROMPT: recipe for chocolate ice cream | combine cranberries
,:   	98.92%
and:   	1.04%
in:   	0.03%
enough:   	0.0%
with:   	0.0%
--------


PROMPT: recipe for chocolate ice cream | combine cranberries ,
sugar:   	93.31%
milk:   	3.47%
1

In [59]:
# Same prompt at a high temperature of 5.0, which flattens the distribution
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=100, temperature=5.0
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | throw dissolves porridge becomes reduces tilapia substance gratin rabe sliver peak tortillas i galettes this unsure adobo of too half balsamic together boiler smear tomato punch prunes tapenade starts torte generously swiftly over florida bolognese twists matzoh to semolina smokey english sit huevos indefinitely atop filet caponatina { meat level dragged hobo tabasco slabs dollops desert coke jelly sangría breakfast recommended sorrel 


PROMPT: recipe for chocolate ice cream |
combine:   	0.39%
whisk:   	0.33%
bring:   	0.32%
in:   	0.31%
stir:   	0.31%
--------


PROMPT: recipe for chocolate ice cream | throw
together:   	0.25%
peel:   	0.21%
in:   	0.18%
nuts:   	0.17%
zest:   	0.17%
--------


PROMPT: recipe for chocolate ice cream | throw dissolves
in:   	0.56%
and:   	0.31%
of:   	0.28%
,:   	0.28%
.:   	0.23%
--------


PROMPT: recipe for chocolate ice cream | throw dissolves porridge
in:   	0.35%
and:   	0.31%
-:   	0.25%
(:   	