# 🥙 LSTM - 레시피 데이터셋

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/rickiepark/Generative_Deep_Learning_2nd_Edition/blob/main/notebooks/05_autoregressive/01_lstm/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
</table>

이 노트북에서는 레시피 데이터셋에서 LSTM을 훈련합니다.

In [1]:
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

## 0. 파라미터 <a name="parameters"></a>

In [2]:
# Hyperparameters and run configuration shared by the cells below.
VOCAB_SIZE = 10000  # max_tokens of the TextVectorization layer; also the Embedding input size and Dense output size
MAX_LEN = 200  # recipes are vectorized to MAX_LEN + 1 tokens so inputs and targets can be offset by one
EMBEDDING_DIM = 100  # output dimension of the Embedding layer
N_UNITS = 128  # number of units in the LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced in the visible cells
SEED = 42  # NOTE(review): not referenced in the visible cells
LOAD_MODEL = False  # when True, the freshly built model is replaced by the one saved at ./models/lstm
BATCH_SIZE = 32  # batch size of the tf.data pipeline
EPOCHS = 25  # number of epochs passed to lstm.fit

## 1. 데이터 로드 <a name="load"></a>

In [3]:
import sys

# 코랩일 경우 노트북에서 celeba 데이터셋을 받습니다.
if 'google.colab' in sys.modules:
    # 캐글-->Setttings-->API-->Create New Token에서
    # kaggle.json 파일을 만들어 코랩에 업로드하세요.
    from google.colab import files
    files.upload()
    !mkdir ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    # celeba 데이터셋을 다운로드하고 압축을 해제합니다.
    !kaggle datasets download -d hugodarwood/epirecipes
    !unzip -q epirecipes.zip

Saving kaggle.json to kaggle.json
Downloading epirecipes.zip to /content
 98% 11.0M/11.3M [00:01<00:00, 12.2MB/s]
100% 11.3M/11.3M [00:01<00:00, 6.54MB/s]


In [4]:
# Load the full dataset. The encoding is given explicitly so that reading
# does not depend on the platform's default text encoding.
with open("./full_format_recipes.json", encoding="utf-8") as json_data:
    recipe_data = json.load(json_data)

In [5]:
# Filter the dataset: keep only recipes that have both a title and
# directions, rendered as "Recipe for <title> | <directions joined by spaces>".
filtered_data = []
for recipe in recipe_data:
    if "title" not in recipe or recipe["title"] is None:
        continue
    if "directions" not in recipe or recipe["directions"] is None:
        continue
    filtered_data.append(
        "Recipe for " + recipe["title"] + " | " + " ".join(recipe["directions"])
    )

In [6]:
# Count the recipes that survived filtering and report the number.
n_recipes = len(filtered_data)
print(f"{n_recipes}개 레시피 로드")

20111개 레시피 로드


In [7]:
# Show one sample recipe in its raw form, before punctuation padding.
example = filtered_data[9]
print(example)

Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


## 2. 데이터 토큰화

In [8]:
# Split punctuation off so that each mark is treated as its own "word".
def pad_punctuation(s):
    """Surround every ASCII punctuation character in `s` with spaces.

    Args:
        s: The text to process.

    Returns:
        The text with each character of `string.punctuation` padded by a
        space on both sides and runs of spaces collapsed to a single space.
    """
    # re.escape is required: inside a character class the raw sequence
    # `\]` from string.punctuation is read as an escaped `]`, which would
    # leave the backslash itself unmatched.
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


# Apply punctuation padding to every filtered recipe.
text_data = list(map(pad_punctuation, filtered_data))

In [9]:
# Display the same sample recipe after punctuation padding.
example_data = text_data[9]
example_data

'Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas | Chop enough parsley leaves to measure 1 tablespoon ; reserve . Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan , covered , 5 minutes . Meanwhile , sprinkle gelatin over water in a medium bowl and let soften 1 minute . Strain broth through a fine - mesh sieve into bowl with gelatin and stir to dissolve . Season with salt and pepper . Set bowl in an ice bath and cool to room temperature , stirring . Toss ham with reserved parsley and divide among jars . Pour gelatin on top and chill until set , at least 1 hour . Whisk together mayonnaise , mustard , vinegar , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a large bowl . Stir in celery , cornichons , and potatoes . Pulse peas with marjoram , oil , 1 / 2 teaspoon pepper , and 1 / 4 teaspoon salt in a food processor to a coarse mash . Layer peas , then potato salad , over ham . '

In [10]:
# Convert the recipes to a TensorFlow dataset.
recipe_slices = tf.data.Dataset.from_tensor_slices(text_data)
# shuffle() comes after batch(), so it reorders whole batches
# (with a buffer of 1000 batches), not individual recipes.
text_ds = recipe_slices.batch(BATCH_SIZE).shuffle(1000)

In [11]:
# Create the vectorization layer.
vectorize_layer = layers.TextVectorization(
    standardize="lower",  # lowercase only; punctuation was already split into tokens by pad_punctuation
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,  # one extra token so inputs and targets can be offset by one
)

In [12]:
# Fit the layer's vocabulary on the recipe dataset and keep the word list.
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [13]:
# Print the first ten token-index:word mappings.
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [14]:
# Convert the same sample recipe to integer tokens and print them.
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  26   16  557    1    8  298  335  189    4 1054  494   27  332  228
  235  262    5  594   11  133   22  311    2  332   45  262    4  671
    4   70    8  171    4   81    6    9   65   80    3  121    3   59
   12    2  299    3   88  650   20   39    6    9   29   21    4   67
  529   11  164    2  320  171  102    9  374   13  643  306   25   21
    8  650    4   42    5  931    2   63    8   24    4   33    2  114
   21    6  178  181 1245    4   60    5  140  112    3   48    2  117
  557    8  285  235    4  200  292  980    2  107  650   28   72    4
  108   10  114    3   57  204   11  172    2   73  110  482    3  298
    3  190    3   11   23   32  142   24    3    4   11   23   32  142
   33    6    9   30   21    2   42    6  353    3 3224    3    4  150
    2  437  494    8 1281    3   37    3   11   23   15  142   33    3
    4   11   23   32  142   24    6    9  291  188    5    9  412  572
    2  230  494    3   46  335  189    3   20  557    2    0    0    0
    0 

## 3. 훈련 세트 만들기

In [15]:
# Build the training pairs: each recipe and the same text shifted by one word.
def prepare_inputs(text):
    """Vectorize `text` and return it as (input, target) token sequences.

    The target is the input shifted one position to the left.
    """
    tokens = vectorize_layer(tf.expand_dims(text, axis=-1))
    return tokens[:, :-1], tokens[:, 1:]


# Map every batch of recipes to (input, target) token pairs.
train_ds = text_ds.map(prepare_inputs)

## 4. LSTM 만들기 <a name="build"></a>

In [16]:
# Next-token model: token ids -> embeddings -> LSTM -> softmax over the vocabulary at every position.
inputs = layers.Input(shape=(None,), dtype="int32")  # variable-length sequences of token ids
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)  # return_sequences=True yields an output at every position
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 dense (Dense)               (None, None, 10000)       1290000   
                                                                 
Total params: 2407248 (9.18 MB)
Trainable params: 2407248 (9.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [17]:
# Optionally replace the freshly built model with one saved by a previous run.
if LOAD_MODEL:
    # compile=False: the model is loaded without its compile state.
    lstm = models.load_model("./models/lstm", compile=False)

## 5. LSTM 훈련하기 <a name="train"></a>

In [18]:
# Integer targets with softmax outputs: sparse categorical cross-entropy, optimized with Adam.
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [19]:
# Callback that prints a freshly generated recipe at the end of every epoch
class TextGenerator(callbacks.Callback):
    """Keras callback that samples text from the model it is attached to.

    Args:
        index_to_word: Vocabulary as a sequence in which position i holds
            the word for token id i.
        top_k: Kept for backward compatibility and stored on the instance;
            sampling itself does not use it.
    """

    # Token ids as they appear in the TextVectorization vocabulary:
    # 0 is the empty padding token and 1 is "[UNK]".
    PAD_TOKEN = 0
    UNK_TOKEN = 1

    def __init__(self, index_to_word, top_k=10):
        # Let the Keras base class initialise its own attributes first.
        super().__init__()
        self.index_to_word = index_to_word
        self.top_k = top_k
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """Rescale a distribution by `temperature` and draw one token id.

        Args:
            probs: 1-D array of token probabilities.
            temperature: Values below 1 sharpen the distribution, values
                above 1 flatten it. Must be non-zero.

        Returns:
            Tuple of (sampled token id, rescaled and renormalised
            probabilities).
        """
        # Renormalise in float64 so that rounding error in the sum stays
        # well inside the tolerance np.random.choice applies to `p`.
        probs = np.asarray(probs, dtype=np.float64) ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend `start_prompt` one sampled word at a time and print it.

        Generation stops once the sequence holds `max_tokens` tokens or the
        padding token has been sampled.

        Args:
            start_prompt: Whitespace-separated prompt; words missing from
                the vocabulary are mapped to the unknown token.
            max_tokens: Upper bound on the total sequence length, prompt
                included.
            temperature: Sampling temperature passed to `sample_from`.

        Returns:
            List with one dict per generation step, holding the prompt
            before that step ("prompt") and the distribution the next word
            was drawn from ("word_probs").
        """
        start_tokens = [
            self.word_to_index.get(x, self.UNK_TOKEN)
            for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != self.PAD_TOKEN:
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)
            # y[0][-1] is the predicted distribution for the next position.
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\n생성된 텍스트:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a generated sample after each training epoch."""
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [20]:
# Create the checkpoint callback that saves the model weights after every epoch
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Create the text-generation callback from the vocabulary
text_generator = TextGenerator(vocab)

In [21]:
# Train the model; the callbacks save weights, write TensorBoard logs to ./logs
# and print a generated sample at the end of every epoch.
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

Epoch 1/25
생성된 텍스트:
recipe for cupful cauliflower in chops | s saucepan and bring noodles in medium salted - inch bowl . 400°f and blend in 2 red skillet , uncovered and transfer 1 hour 

Epoch 2/25
생성된 텍스트:
recipe for warm - coffee spareribs | garlic ; mash in large skillet . combine parsley , wine , salt and oil ; working in pan with boiling water . cook on 2 mixing . spice fish with salt the mimosas sauce . cook onion , , turning , until golden with meat , golden , about 1 minute . do ahead before stir to room temperature and reserve . cool . spoon cream with serve , , d at least 3 anna once , about 15 seconds . cool in rack . sprinkle slice with 1 well . repeat

Epoch 3/25
생성된 텍스트:
recipe for chocolate - thumb frosting | preheat oven to low . butter cake on slotted lightly in half of boiling salted oil . heat pan , bring the broth to boil . pour into batches about 3 cups , or until thermometer inserted into center and batter holds form color ) , about 2 minutes . transfer to room t

<keras.src.callbacks.History at 0x7e80ab89f280>

In [22]:
# Save the final model
lstm.save("./models/lstm")

## 6. LSTM을 사용해 텍스트 생성하기

In [23]:
def print_probs(info, vocab, top_k=5):
    """Print the `top_k` most probable next words for every generation step.

    Args:
        info: List of dicts with a "prompt" string and a "word_probs" array,
            as returned by `TextGenerator.generate`.
        vocab: Sequence mapping token id -> word.
        top_k: Number of candidate words to list per step.
    """
    for step in info:
        print(f"\n프롬프트: {step['prompt']}")
        word_probs = step["word_probs"]
        # Token ids ordered from most to least probable, cut to top_k.
        ranked = np.argsort(word_probs)[::-1][:top_k]
        for token in ranked:
            print(f"{vocab[token]}:   \t{np.round(100*word_probs[token],2)}%")
        print("--------\n")

In [24]:
# Continue the prompt until the sequence holds 10 tokens, sampling at temperature 1.0.
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=1.0
)


생성된 텍스트:
recipe for roasted vegetables | chop 1 / 3 cup



In [25]:
# Show the most probable candidate words for each generation step.
print_probs(info, vocab)


프롬프트: recipe for roasted vegetables | chop 1 /
2:   	61.14%
4:   	32.63%
3:   	3.68%
8:   	1.02%
2–2:   	0.55%
--------


프롬프트: recipe for roasted vegetables | chop 1 / 3
cup:   	62.56%
of:   	21.99%
-:   	2.73%
inch:   	2.61%
pound:   	1.07%
--------



In [26]:
# Same prompt at temperature 0.2, which sharpens the distribution toward the most likely words.
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=0.2
)


생성된 텍스트:
recipe for roasted vegetables | chop 1 / 2 cup



In [27]:
# Show the most probable candidate words for each step of the low-temperature run.
print_probs(info, vocab)


프롬프트: recipe for roasted vegetables | chop 1 /
2:   	95.85%
4:   	4.15%
3:   	0.0%
8:   	0.0%
2–2:   	0.0%
--------


프롬프트: recipe for roasted vegetables | chop 1 / 2
cup:   	99.99%
inch:   	0.01%
of:   	0.0%
-:   	0.0%
teaspoon:   	0.0%
--------



In [28]:
# Generate a single next word for a second prompt at temperature 1.0 and list its candidates.
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


생성된 텍스트:
recipe for chocolate ice cream | 1


프롬프트: recipe for chocolate ice cream |
in:   	26.56%
combine:   	18.78%
bring:   	13.73%
heat:   	3.93%
whisk:   	3.26%
--------



In [29]:
# Repeat the single-word generation at temperature 0.2 for comparison.
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2
)
print_probs(info, vocab)


생성된 텍스트:
recipe for chocolate ice cream | in


프롬프트: recipe for chocolate ice cream |
in:   	82.38%
combine:   	14.57%
bring:   	3.04%
heat:   	0.01%
whisk:   	0.0%
--------

