# LSTM on Recipe Data

**The notebook has been adapted from the notebook provided in David Foster's Generative Deep Learning, 2nd Edition.**

- Book: [Amazon](https://www.amazon.com/Generative-Deep-Learning-Teaching-Machines/dp/1098134184/ref=sr_1_1?keywords=generative+deep+learning%2C+2nd+edition&qid=1684708209&sprefix=generative+de%2Caps%2C93&sr=8-1)
- Original notebook (TensorFlow and Keras): [GitHub](https://github.com/davidADSP/Generative_Deep_Learning_2nd_Edition/blob/main/notebooks/05_autoregressive/01_lstm/lstm.ipynb)
- Dataset: [Kaggle](https://www.kaggle.com/datasets/hugodarwood/epirecipes)

In [1]:
import numpy as np
import json
import re
import string

import torch
from torch import nn
from torch.nn.functional import pad
from torch.utils.data import Dataset, DataLoader, random_split

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

import torchinfo

## 0. Training parameters

In [56]:
# Path to the recipe JSON file (a file path, despite the `_DIR` suffix)
DATA_DIR = '../../data/epirecipes/full_format_recipes.json'

VOCAB_SIZE = 10_000
MAX_LEN = 200             # sequences are padded to MAX_LEN + 1 positions
EMBEDDING_DIM = 100       # size of each token embedding
HIDDEN_DIM = 128          # hidden size of the LSTM layers
VALIDATION_SPLIT = 0.2    # fraction of the samples held out for validation
SEED = 1024               # seed applied to the random number generators below
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 25

MAX_VAL_TOKENS = 100 # Maximum number of tokens when generating text

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed the global NumPy and PyTorch generators so that sampling, weight
# initialisation and shuffling are repeatable after a kernel restart.
np.random.seed(SEED)
torch.manual_seed(SEED)

## 1. Load dataset

In [3]:
def pad_punctuation(sentence):
    """Surround every ASCII punctuation character with single spaces.

    Args:
        sentence: text to process.

    Returns:
        The text with each character of ``string.punctuation`` separated
        from its neighbours by a space and runs of spaces collapsed to one.
    """
    # re.escape is required: inside a character class the raw sequence `\]`
    # is read as an escaped bracket, so a backslash would never be matched.
    sentence = re.sub(f'([{re.escape(string.punctuation)}])', r' \1 ', sentence)
    sentence = re.sub(' +', ' ', sentence)
    return sentence

In [4]:
# Load dataset
# The file is only read, so open it read-only; the encoding is stated
# explicitly because the recipes contain non-ASCII characters.
with open(DATA_DIR, 'r', encoding='utf-8') as f:
    recipe_data = json.load(f)

In [5]:
# Keep only recipes that have both a non-empty title and non-empty
# directions, and flatten each one into "Recipe for <title> | <directions>"
filtered_data = [
    ' | '.join(('Recipe for ' + recipe['title'], ' '.join(recipe['directions'])))
    for recipe in recipe_data
    if recipe.get('title') and recipe.get('directions')
]

# Separate punctuation from words so each symbol becomes its own token
text_ds = list(map(pad_punctuation, filtered_data))

print(f'Total recipe loaded: {len(text_ds)}')

Total recipe loaded: 20098


In [8]:
# Pick one preprocessed recipe at random and show it under a header line
sample_data = np.random.choice(text_ds)
print('Sample data:', sample_data, sep='\n')

Sample data:
Recipe for Chicken with Lemon and Spices | Mix chicken , lemon juice and turmeric in medium bowl . Marinate 30 minutes . Heat oil in large skillet over medium heat . Add onion , ginger and cumin seeds and sauté until onion is tender , about 5 minutes . Add chicken with marinade ; sauté until most of marinade evaporates , about 3 minutes . Add tomatoes with juices , chili powder , salt and paprika . Cover ; simmer 7 minutes . Uncover ; simmer until chicken is cooked through and sauce thickens , about 8 minutes longer . Remove from heat . Mix in sour cream . Season with salt and pepper . 


## 2. Build vocabularies

In [9]:
# Lazily tokenizes every sample of a dataset
def yield_tokens(data_iter, tokenizer):
    """Yield the tokenized form of each sample, one sample at a time.

    Args:
        data_iter: iterable of samples.
        tokenizer: callable applied to each sample.
    """
    yield from map(tokenizer, data_iter)

# Vocabulary construction
def build_vocab(dataset, tokenizer):
    """Build a vocabulary from the tokens of ``dataset``.

    Tokens seen fewer than twice are left out (``min_freq=2``), and the
    special tokens ``<pad>`` and ``<unk>`` are registered.

    Args:
        dataset: iterable of text samples.
        tokenizer: callable turning one sample into a list of tokens.

    Returns:
        The object returned by ``build_vocab_from_iterator``.
    """
    special_tokens = ['<pad>', '<unk>']
    token_stream = yield_tokens(dataset, tokenizer)
    return build_vocab_from_iterator(token_stream, min_freq=2, specials=special_tokens)

In [10]:
# Tokenizer obtained from torchtext's `get_tokenizer('basic_english')`
tokenizer = get_tokenizer('basic_english')
# Vocabulary built over every preprocessed recipe
vocab = build_vocab(text_ds, tokenizer)
# Use the index of '<unk>' as the vocabulary's default index
vocab.set_default_index(vocab['<unk>'])

In [11]:
# display some token-word mappings
# get_itos() is evaluated once rather than on every iteration, and the slice
# keeps the loop valid when the vocabulary has fewer than 10 entries.
for i, word in enumerate(vocab.get_itos()[:10]):
    print(f'{i}: {word}')

0: <pad>
1: <unk>
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [12]:
# Check mappings: tokenize the sampled recipe and convert it to vocabulary indices
mapped_sample = vocab(tokenizer(sample_data))
print('Source text:', sample_data, '\n', 'Mapped sample:', mapped_sample, sep='\n')

Source text:
Recipe for Chicken with Lemon and Spices | Mix chicken , lemon juice and turmeric in medium bowl . Marinate 30 minutes . Heat oil in large skillet over medium heat . Add onion , ginger and cumin seeds and sauté until onion is tender , about 5 minutes . Add chicken with marinade ; sauté until most of marinade evaporates , about 3 minutes . Add tomatoes with juices , chili powder , salt and paprika . Cover ; simmer 7 minutes . Uncover ; simmer until chicken is cooked through and sauce thickens , about 8 minutes longer . Remove from heat . Mix in sour cream . Season with salt and pepper . 


Mapped sample:
[25, 16, 82, 8, 108, 4, 802, 26, 115, 82, 3, 108, 103, 4, 1198, 6, 28, 21, 2, 715, 125, 12, 2, 17, 36, 6, 29, 54, 20, 28, 17, 2, 18, 114, 3, 270, 4, 445, 233, 4, 129, 10, 114, 37, 84, 3, 19, 58, 12, 2, 18, 82, 8, 355, 129, 10, 778, 14, 355, 1207, 3, 19, 35, 12, 2, 18, 181, 8, 231, 3, 539, 300, 3, 23, 4, 735, 2, 48, 69, 340, 12, 2, 758, 69, 10, 82, 37, 182, 101, 4, 53, 524, 

In [13]:
# Index-to-word lookup: position i of the vocabulary's token list is the word for index i
index_to_word = dict(enumerate(vocab.get_itos()))

## 3. Create DataLoader

In [14]:
class Collate():
    """Collator that turns a batch of raw sentences into (source, target) index tensors.

    For each sentence, the source is every token but the last and the target
    is every token but the first, so the target at a position is the token
    that follows the source token at that position.
    """

    def __init__(self, tokenizer, vocab, max_padding, pad_idx):
        """
        Args:
            tokenizer: callable mapping a sentence to a list of tokens.
            vocab: callable mapping a list of tokens to a list of indices.
            max_padding: length every sequence is brought to.
            pad_idx: index used to fill the padded positions.
        """
        self.tokenizer = tokenizer
        self.vocab = vocab

        self.max_padding = max_padding
        self.pad_idx = pad_idx

    def _to_fixed_length(self, indices):
        """Convert a list of indices to an int64 tensor of length ``max_padding``."""
        sequence = torch.tensor(indices, dtype=torch.int64)
        # A negative pad amount crops the tail, so over-long sequences are truncated
        return pad(sequence, [0, self.max_padding - len(sequence)], value=self.pad_idx)

    def collate_fn(self, batch):
        """Build the stacked (source, target) tensors for a batch of sentences."""
        sources, targets = [], []

        for sentence in batch:
            tokens = self.tokenizer(sentence)
            sources.append(self._to_fixed_length(self.vocab(tokens[:-1])))
            targets.append(self._to_fixed_length(self.vocab(tokens[1:])))

        return (torch.stack(sources), torch.stack(targets))

In [15]:
# Split dataset into training and validation splits
# A dedicated generator seeded with SEED makes the split identical on every run
train_ds, valid_ds = random_split(
    text_ds,
    [1-VALIDATION_SPLIT, VALIDATION_SPLIT],
    generator=torch.Generator().manual_seed(SEED),
)
print("Num. training data: \t", len(train_ds))
print("Num. validation data: \t", len(valid_ds))

Num. training data: 	 16079
Num. validation data: 	 4019


In [16]:
# Index of the padding token in the vocabulary
pad_idx = vocab['<pad>']
print('index of <pad> token: ', pad_idx)

# Every sequence is brought to MAX_LEN + 1 positions by the collator
collate = Collate(tokenizer, vocab, MAX_LEN+1, pad_idx)

# Settings shared by both loaders; only the training loader shuffles
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=8, pin_memory=True,
                     collate_fn=collate.collate_fn)

train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_ds, shuffle=False, **loader_kwargs)

index of <pad> token:  0


## 4. Build LSTM model

In [17]:
class LSTM_Net(nn.Module):
    """Embedding -> two-layer LSTM -> linear projection onto the vocabulary.

    Layer sizes and the padding index are read from the notebook-level
    ``EMBEDDING_DIM``, ``HIDDEN_DIM`` and ``pad_idx``.
    """

    def __init__(self, vocab_size):
        """
        Args:
            vocab_size: number of rows of the embedding table and number of
                outputs of the final linear layer.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=pad_idx)
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=2, batch_first=True)
        self.output = nn.Linear(HIDDEN_DIM, vocab_size)

    def forward(self, x):
        """Return one score per vocabulary entry for every position of ``x``."""
        embedded = self.embedding(x)
        # Only the per-position outputs are used; the final (h, c) state is dropped
        lstm_out, _ = self.lstm(embedded)
        return self.output(lstm_out)


# Instantiate the network with one embedding row / output unit per vocabulary entry
model = LSTM_Net(len(vocab))
# Summarise the layers for a (BATCH_SIZE, MAX_LEN+1) batch of int64 token indices
torchinfo.summary(model=model, input_size=(BATCH_SIZE, MAX_LEN+1), dtypes=[torch.int64])

Layer (type:depth-idx)                   Output Shape              Param #
LSTM_Net                                 [32, 201, 8628]           --
├─Embedding: 1-1                         [32, 201, 100]            862,800
├─LSTM: 1-2                              [32, 201, 128]            249,856
├─Linear: 1-3                            [32, 201, 8628]           1,113,012
Total params: 2,225,668
Trainable params: 2,225,668
Non-trainable params: 0
Total mult-adds (G): 1.67
Input size (MB): 0.05
Forward/backward pass size (MB): 455.69
Params size (MB): 8.90
Estimated Total Size (MB): 464.65

## 5. Train step

In [48]:
class TextGenerator():
    """Generates text from a language model by sampling one token at a time.

    The prompt is converted to indices with the notebook-level ``tokenizer``
    and ``vocab``, and input tensors are created on ``DEVICE``.
    """

    def __init__(self, index_to_word):
        """
        Args:
            index_to_word: mapping from vocabulary index to token string.
        """
        self.index_to_word = index_to_word

    # Scaling the model's output probability with temperature
    def sample_from(self, probs, temperature):
        """Sample an index from ``probs`` after temperature scaling.

        Args:
            probs: 1-D NumPy array of probabilities.
            temperature: positive number; values below 1 sharpen the
                distribution and values above 1 flatten it.

        Returns:
            Tuple ``(sampled_index, rescaled_probs)``.

        Raises:
            ValueError: if ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError('temperature must be a positive number')
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    # Generate text
    def generate(self, model, start_prompt, max_tokens, temperature, output_info=False):
        """Extend ``start_prompt`` token by token and print the resulting text.

        Sampling stops once the sequence holds ``max_tokens`` tokens or as
        soon as index 0 (the padding token) is sampled.

        Args:
            model: module mapping a (1, length) index tensor to per-position scores.
            start_prompt: text the generation starts from.
            max_tokens: maximum sequence length, prompt included.
            temperature: sampling temperature passed to ``sample_from``.
            output_info: when True, record every sampling step.

        Returns:
            A list with one dict per sampled token when ``output_info`` is
            True, otherwise an empty list. Each dict holds ``'token'`` (the
            indices generated so far) and ``'word_probs'`` (the distribution
            the last token was sampled from).
        """
        start_tokens = vocab(tokenizer(start_prompt))
        sample_token = None
        info = []

        # No gradients are needed while sampling
        with torch.no_grad():
            while len(start_tokens) < max_tokens and sample_token != 0: # also avoid padding index
                input_prompts = torch.tensor(start_tokens, device=DEVICE).unsqueeze(0)
                # Scores of the last position of the single sequence in the batch
                logits = model(input_prompts)[0][-1]
                probs = nn.functional.softmax(logits, dim=-1)
                sample_token, probs = self.sample_from(probs.cpu().numpy(), temperature)
                # Store a plain int so the token list stays free of NumPy scalars
                sample_token = int(sample_token)

                start_tokens.append(sample_token)
                if output_info:
                    # Copy the list: it keeps growing, and every step must
                    # keep its own snapshot rather than a shared reference
                    info.append({'token': list(start_tokens), 'word_probs': probs})

        output_text = [self.index_to_word[token] for token in start_tokens if token != 0]
        print(' '.join(output_text))
        return info

In [57]:
# Training function
def train_step(model, dataloader, loss_fn, optimizer):
    """Run one pass over ``dataloader`` and update ``model``.

    Args:
        model: module mapping a batch of index sequences to per-position scores.
        dataloader: iterable of ``(sources, targets)`` tensor pairs.
        loss_fn: loss called with flattened scores ``(N, classes)`` and
            flattened targets ``(N,)``.
        optimizer: optimizer that updates the parameters of ``model``.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    total_loss = 0

    for sources, targets in dataloader:
        # Use the optimizer passed in, not a global that happens to exist
        optimizer.zero_grad()

        sources, targets = sources.to(DEVICE), targets.to(DEVICE)
        preds = model(sources)
        # Flatten batch and sequence dimensions so each position is one example
        loss = loss_fn(preds.reshape(-1, preds.shape[-1]), targets.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


# Evaluation function
def eval(model, dataloader, loss_fn, text_generator):
    """Compute the mean loss over ``dataloader`` and print a generated sample.

    Note that this function shadows Python's built-in ``eval``.

    Args:
        model: module mapping a batch of index sequences to per-position scores.
        dataloader: iterable of ``(sources, targets)`` tensor pairs.
        loss_fn: loss called with flattened scores ``(N, classes)`` and
            flattened targets ``(N,)``.
        text_generator: object whose ``generate`` method prints sample text.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    valid_loss = 0

    # Evaluation needs no gradients
    with torch.no_grad():
        for sources, targets in dataloader:
            sources, targets = sources.to(DEVICE), targets.to(DEVICE)
            preds = model(sources)
            # The last dimension holds the class scores; using the sequence
            # dimension here would make scores and targets disagree in size
            loss = loss_fn(preds.reshape(-1, preds.shape[-1]), targets.reshape(-1))
            valid_loss += loss.item()

    print('Generated text:')
    text_generator.generate(model, 'recipe for', MAX_VAL_TOKENS, 1.0)

    return valid_loss / len(dataloader)

In [20]:
# Fresh network placed on DEVICE
model = LSTM_Net(len(vocab)).to(DEVICE)
# Cross-entropy with default arguments
loss_fn = nn.CrossEntropyLoss()
# Adam with default hyperparameters over all model parameters
optim = torch.optim.Adam(model.parameters())

In [21]:
# Train for 5 epochs. A bare expression inside a loop is not displayed by the
# notebook, so the mean loss returned by each pass is printed explicitly.
for epoch in range(5):
    print(train_step(model, train_loader, loss_fn, optim))

4.294210517382764
3.0403269398994524
2.5794293543927473
2.3473440361307345
2.1972070854179426


In [49]:
# Generator that maps sampled indices back to words through index_to_word
text_generator = TextGenerator(index_to_word)

In [54]:
# Print a sample of up to 100 tokens starting from the prompt 'recipe for'.
# output_info is not passed, so the returned list is empty and is echoed as [].
text_generator.generate(model, 'recipe for', max_tokens=100, temperature=1.0)

recipe for lamb salad | cook pork in about 4 of salted water in prepared dish until less forms , about 2 hours . seed chicken and remaining 1 / 4 cup basil and sugar in large bowl to blend . season to taste with salt and pepper . heat a large nonstick skillet over medium , then shape out onto 2 8 is slightly meld . using generous shredded hands , open side up to 18 , 24 side breast or every 1 / 3 inch apart , drizzle quinoa tightly . pour slices over brisket . line 1


[]