# LSTM
Long Short-Term Memory Networks

### Imports

In [None]:
import json
import string
import re

import numpy as np
import cv2
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import models, layers

### Helper

In [None]:
def _ensure_installed(module_name, pip_name=None):
    """Install a package with pip unless its module is already importable.

    Args:
        module_name: Name used in ``import`` statements (e.g. ``"cpuinfo"``).
        pip_name: Name of the distribution to install (e.g. ``"py-cpuinfo"``).
            Defaults to ``module_name``.

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.
    """
    import sys
    import subprocess
    import importlib
    import importlib.util

    if importlib.util.find_spec(module_name) is not None:
        return

    subprocess.run(
        [sys.executable, "-m", "pip", "install", pip_name or module_name],
        check=True,
    )
    # Make the freshly installed package visible to the running interpreter.
    importlib.invalidate_caches()


def get_hardware_info(use_in_notebook=True, install_packages=True):
    """Print a summary of the machine's OS, GPU(s), CPU and RAM.

    Args:
        use_in_notebook: If True, clear the current notebook cell output
            (via ``IPython.display.clear_output``) before printing.
        install_packages: If True, pip-install ``psutil``, ``gputil``,
            ``py-cpuinfo`` (and ``ipython`` when ``use_in_notebook`` is set)
            when their modules cannot be found.

    Raises:
        subprocess.CalledProcessError: If a required pip install fails.
        ImportError: If a needed package is missing and was not installed.
    """
    if install_packages:
        # The importable module name differs from the pip name for two of
        # these, so both are given explicitly.
        _ensure_installed("psutil")
        _ensure_installed("GPUtil", "gputil")
        _ensure_installed("cpuinfo", "py-cpuinfo")

    # import needed packages
    import platform
    import psutil
    import GPUtil
    from cpuinfo import get_cpu_info

    if use_in_notebook:
        if install_packages:
            _ensure_installed("IPython", "ipython")

        from IPython.display import clear_output
        clear_output()

    print("-"*32, "\nYour Hardware:\n")

    # General
    print("    ---> General <---")
    print("Operatingsystem:", platform.system())
    print("Version:", platform.version())
    print("Architecture:", platform.architecture())
    print("Processor:", platform.processor())

    # GPU-Information
    print("\n    ---> GPU <---")
    gpus = GPUtil.getGPUs()
    if not gpus:
        print("No GPU found by GPUtil.")
    for gpu in gpus:
        print("GPU Name:", gpu.name)
        print("VRAM Total:", gpu.memoryTotal, "MB")
        print("VRAM Used:", gpu.memoryUsed, "MB")
        print("Utilization:", gpu.load * 100, "%")

    # CPU-Information
    print("\n    ---> CPU <---")
    cpu_info = get_cpu_info()
    # Not every platform reports a brand string or a CPU frequency.
    print("CPU-Name:", cpu_info.get("brand_raw", "unknown"))
    print("CPU Kernels:", psutil.cpu_count(logical=False))
    print("Logical CPU-Kernels:", psutil.cpu_count(logical=True))
    cpu_freq = psutil.cpu_freq()
    print("CPU-Frequence:", cpu_freq.max if cpu_freq is not None else "unknown", "MHz")
    print("CPU-Utilization:", psutil.cpu_percent(interval=1), "%")

    # RAM-Information
    print("\n    ---> RAM <---")
    ram = psutil.virtual_memory()
    print("RAM Total:", ram.total // (1024**3), "GB")
    print("RAM Available:", ram.available // (1024**3), "GB")
    print("RAM-Utilization:", ram.percent, "%")

    print(f"\n{'-'*32}")


# Print the hardware summary for this runtime; with install_packages=True the
# helper pip-installs its dependencies when they are missing.
get_hardware_info(use_in_notebook=True, install_packages=True)

In [None]:
def get_cur_date_time_as_str():
    """Return the current local date and time as ``YYYY-MM-DD_HH-MM-SS``."""
    # Imported here because no import cell in this notebook provides `datetime`.
    from datetime import datetime

    return datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Quick check of the helper; as the last expression of the cell, the returned
# timestamp string is shown as the cell output.
get_cur_date_time_as_str()

In [None]:
import platform
import os
import shutil
import zipfile

def load_kaggle_dataset(author_name, dataset_name, on_google_colab, 
                        download_path, goal_path,
                        kaggle_local_path="./", kaggle_file_name="kaggle.json"):
    """
    Download a Kaggle dataset as a zip file and extract it into ``goal_path``.

    The Kaggle API token file is first copied into the Kaggle config
    directory, then the dataset archive is downloaded, extracted and the
    archive is deleted again.

    Args:
        author_name: Kaggle user name of the dataset owner.
        dataset_name: Dataset slug (the last part of the dataset URL).
        on_google_colab: If True, open the Colab upload dialog to receive the
            token file; ``kaggle_local_path`` is then replaced by ``"./"``.
        download_path: Directory the zip archive is downloaded into.
        goal_path: Directory the archive is extracted into. An existing
            directory at this path is deleted first.
        kaggle_local_path: Directory that contains the token file.
        kaggle_file_name: File name of the token file.

    Raises:
        FileNotFoundError: If the token file is not found at its expected
            location.

    Example:

    >>> load_kaggle_dataset(
            author_name="joosthazelzet", 
            dataset_name="lego-brick-images",
            on_google_colab=True,            
            download_path="/content/", 
            goal_path=f"/content/dataset",         
            kaggle_local_path="./", 
            kaggle_file_name="kaggle.json")
    """
    # variables
    print("Set some variables...")
    dataset_download_name = f"{author_name}/{dataset_name}"

    zip_file_name = f"{dataset_name}.zip"
    zip_file_download_path = os.path.join(download_path, zip_file_name)

    kaggle_file_cur_path = os.path.join(kaggle_local_path, kaggle_file_name)
    # NOTE(review): the non-Windows target is hard-coded to root's config
    # directory (matches Colab); confirm before running as another user.
    if platform.system().lower() == "windows":
        kaggle_goal_path = os.path.expanduser("~/.kaggle")
    else:
        kaggle_goal_path = "/root/.config/kaggle"
    kaggle_goal_file_path = os.path.join(kaggle_goal_path, kaggle_file_name)

    # make sure that the goal path exist
    os.makedirs(goal_path, exist_ok=True)
    os.makedirs(kaggle_goal_path, exist_ok=True)

    print("Finding and placing the API file...")
    # upload in google colab
    if on_google_colab:
        kaggle_local_path = "./"
        kaggle_file_cur_path = os.path.join(kaggle_local_path, kaggle_file_name)
        # remove a stale token so the upload below is stored under the expected name
        if os.path.exists(kaggle_file_cur_path):
            os.remove(kaggle_file_cur_path)

        from google.colab import files
        files.upload()  # choose your local 'kaggle.json' file

    if not os.path.isfile(kaggle_file_cur_path):
        raise FileNotFoundError(
            f"Kaggle API file not found: '{kaggle_file_cur_path}'")

    # get the kaggle API file to the right spot
    if os.path.exists(kaggle_goal_file_path):
        os.remove(kaggle_goal_file_path)
    shutil.copy2(kaggle_file_cur_path, kaggle_goal_path)
    # owner read/write only; the mode must be octal (decimal 600 is 0o1130)
    os.chmod(kaggle_goal_file_path, 0o600)
    print(f"Cpopied to: {kaggle_goal_path}")

    # init Kaggle API
    from kaggle.api.kaggle_api_extended import KaggleApi
    print("Autheticating at Kaggle API...")
    api = KaggleApi()
    api.authenticate()

    # make sure the file not exist already
    if os.path.exists(zip_file_download_path):
        os.remove(zip_file_download_path)

    # download kaggle dataset
    print("Downloading dataset...")
    #    -> the dataset name is the last 2 items of the dataset's https link
    api.dataset_download_files(dataset_download_name, path=download_path, unzip=False)

    # Unzip the downloaded dataset into a clean target directory
    print("Unzipping dataset...")
    if os.path.exists(goal_path):
        shutil.rmtree(goal_path)
    with zipfile.ZipFile(zip_file_download_path, "r") as zip_ref:
        zip_ref.extractall(goal_path)

    # delete zip file
    os.remove(zip_file_download_path)

    print(f"Congratulations! Downloaded successfull '{dataset_name}' from '{author_name}' 🥳😎")


In [None]:
# Take the timestamp once so the experiment name and the log directory always
# refer to the same run (two separate calls can straddle a second boundary).
_RUN_TIMESTAMP = get_cur_date_time_as_str()

# This notebook trains an LSTM, so the run is labelled accordingly.
EXPERIMENT_NAME = _RUN_TIMESTAMP + "_LSTM"
LOG_DIR = "./logs/fit/" + _RUN_TIMESTAMP

# Hyperparameters
EMBEDDING_DIMS = 100
CHANNELS = 1
EPOCHS = 300
BATCH_SIZE = 128


# Create the output directories used by this run.
os.makedirs("./logs", exist_ok=True)
os.makedirs("./checkpoints", exist_ok=True)
os.makedirs(f"./models/{EXPERIMENT_NAME}", exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs("./output/", exist_ok=True)

### Data Loading & Prep

The Epicurious Recipes dataset contains over 20,000 recipes.

See: https://www.kaggle.com/datasets/hugodarwood/epirecipes


Getting a Kaggle API key:
1. Go to https://www.kaggle.com/ and create an account or sign in to your existing one
2. Click on your profile picture > Settings and go to the API section
3. Click on 'Create New Token'; the 'kaggle.json' file should download automatically
4. Now you can use the `load_kaggle_dataset` helper to load this JSON file and download the dataset

In [None]:
# Download the "hugodarwood/epirecipes" dataset and extract it to
# /content/dataset. With on_google_colab=True the helper opens the Colab
# upload dialog so the local 'kaggle.json' token can be provided.
load_kaggle_dataset(
    author_name="hugodarwood",
    dataset_name="epirecipes",
    on_google_colab=True,
    download_path="/content/",
    goal_path=f"/content/dataset",
    kaggle_local_path="./",
    kaggle_file_name="kaggle.json")

In [None]:
# Load the raw recipes (a list of dicts) from the extracted dataset.
with open("./dataset/full_format_recipes.json", "r", encoding="utf-8") as json_file:
    recipe_dataset = json.load(json_file)

# Build one training string per recipe: the title as a prefix, followed by all
# direction steps joined with spaces. Recipes without a title or without
# directions are skipped.
filtered_data = [
    f"Recipe for {recipe['title']} | " + " ".join(recipe["directions"])
    for recipe in recipe_dataset
    if recipe.get("title") is not None
    and recipe.get("directions") is not None
]

In [None]:
def pad_punctuation(s):
    """Surround every punctuation character in *s* with single spaces.

    Each character from ``string.punctuation`` is padded with a space on both
    sides, then runs of spaces are collapsed to one, so that a whitespace
    tokenizer treats punctuation marks as separate tokens.

    Args:
        s: Input text.

    Returns:
        The padded text (a trailing punctuation mark leaves a trailing space).
    """
    # re.escape keeps characters such as `]`, `\` and `-` literal inside the class.
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s

# Pad the punctuation of every recipe string individually.
text_data = [pad_punctuation(x) for x in filtered_data]

# Wrap the strings in a tf.data pipeline: batches of 32, shuffled batch-wise.
text_ds = tf.data.Dataset.from_tensor_slices(text_data).batch(32).shuffle(1000)

# Map lower-cased text to integer token ids. Sequences are padded/truncated to
# 200 + 1 tokens so that inputs and shifted targets of length 200 can be cut
# from them.
vectorize_layer = layers.TextVectorization(
    standardize='lower',
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=200+1
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [None]:
def prep_inputs(text):
    """Turn a batch of strings into next-token prediction pairs.

    Args:
        text: Batch of raw text strings from the dataset.

    Returns:
        A tuple ``(x, y)`` of token-id tensors where ``y`` is ``x`` shifted
        one position to the left, i.e. ``y[:, i]`` is the token that follows
        ``x[:, i]``.
    """
    # add a trailing axis so each string is its own row for the vectorizer
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

# Apply prep_inputs to every batch to obtain (input, target) token pairs.
train_ds  = text_ds.map(prep_inputs)

# Inspect the resulting dataset object (type and notebook representation).
print(type(train_ds))
train_ds

### AI model

In [None]:
# Variable-length sequences of integer token ids.
inputs = layers.Input(shape=(None,), dtype="int32")

# One EMBEDDING_DIMS-sized vector per token id (10000 = vectorizer max_tokens).
x = layers.Embedding(10000, EMBEDDING_DIMS)(inputs)

# return_sequences=True keeps the output of every time step, so the model
# emits a next-token prediction for each position.
x = layers.LSTM(128, return_sequences=True)(x)

# Probability distribution over the vocabulary at every position.
outputs = layers.Dense(10000, activation='softmax')(x)

lstm = models.Model(inputs, outputs)

### Training

In [None]:
class TextGenerator(tf.keras.callbacks.Callback):
    """Keras callback that prints a generated text sample after each epoch.

    Args:
        index_to_word: Vocabulary list; position ``i`` holds the word for
            token id ``i``.
        top_k: Accepted for interface compatibility; currently unused.
    """

    def __init__(self, index_to_word, top_k=10):
        super().__init__()
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """Sample a token id from *probs* rescaled by *temperature*.

        Args:
            probs: 1-D array of token probabilities.
            temperature: Values below 1 sharpen the distribution, values
                above 1 flatten it.

        Returns:
            Tuple ``(token_id, rescaled_probs)``.
        """
        # float64 keeps the renormalised sum within np.random.choice's tolerance
        probs = np.asarray(probs, dtype=np.float64) ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temeprature):
        """Extend *start_prompt* word by word using the attached model.

        Generation stops once ``max_tokens`` tokens exist or token id 0 is
        sampled.

        Args:
            start_prompt: Whitespace-separated seed text.
            max_tokens: Maximum total number of tokens (prompt included).
            temeprature: Sampling temperature passed to ``sample_from``
                (parameter name kept as-is for existing keyword callers).

        Returns:
            List with one info string (prompt so far and word probabilities)
            per generated token.
        """
        # Words missing from the vocabulary map to id 1.
        # NOTE(review): presumably the '[UNK]' slot of the vectorizer vocabulary — confirm.
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []

        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            # verbose=0: avoid one progress bar per generated token
            y = self.model.predict(x, verbose=0)
            # only the prediction for the last position is the next token
            sample_token, probs = self.sample_from(y[0][-1], temeprature)

            info.append(f"prompt: {start_prompt} word_probs: {probs}")

            start_tokens.append(sample_token)
            start_prompt = start_prompt + ' ' + self.index_to_word[sample_token]

        print(f"\nGenerated Text:\n'{start_prompt}'\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a generated sample at the end of every epoch."""
        self.generate("recipes for", max_tokens=100, temeprature=1.0)

In [None]:
# Callback that prints a text sample after each epoch, built on the
# vectorizer's vocabulary.
text_generator = TextGenerator(index_to_word=vocab)

# Targets are integer token ids, hence the sparse categorical loss.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy()

# Compile with the Adam optimizer and train for EPOCHS epochs.
lstm.compile(optimizer="adam", loss=loss_func)
lstm.fit(train_ds, epochs=EPOCHS, callbacks=[text_generator])