In [53]:
#imports 

import glob
import pickle
import string
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet as wn
import csv
import re
from re import search
from urllib.request import urlopen
from bs4 import BeautifulSoup
import sys
from itertools import permutations
from collections import Counter
import string
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

## Preperations 

In [6]:
def getDatabases() -> list:
    """
    Get the list of available datasets
    
    Returns
    -------
    LIST
        Datasets in the /data/* directory.
    """
    return glob.glob("data/*.exl")

In [7]:
def cleanFile(filename: str) -> list:
    """
    Cleans the data from empty lines and metadata.
    Parameters
    ----------
    filename : str
        Path to the dataset file.
    Returns
    -------
    list
        Cleaned list of lines from the dataset.
    """
    cleaned_text = []
    regex = re.compile(r'[<>]')
    for line in open(filename, "r"):
        if not regex.search(line) and not line.startswith("STEP") and len(line) > 10:
            cleaned_text.append(line)
    return cleaned_text

In [8]:
def cleanFiles(filenames: list, pb: bool = False) -> list:
    """
    Apply cleanFile on a list of files.
    Parameters
    ----------
    filenames : list
        List of dataset directories.
    pb : bool, optional
        Show progress bar of finished files.
    Returns
    -------
    list
        List of list of strings as cleaned dataset lines.
    """
    all_cleaned_text = []
    for i, filename in enumerate(filenames):
        cleaned_text = cleanFile(filenames[i])
        all_cleaned_text.append(cleaned_text)
        if pb: print(f"Files done: {i+1}/{len(filenames)}")
    if pb: print("DONE!")
    return all_cleaned_text

In [9]:
def dump(data: list, filename: str = "temp") -> None:
    """
    Quick dump some data to a temporary file
    Parameters
    ----------
    data : list
        Data generated from other functions.
    filename : str, optional
        Name of the temporary file. The default value is "temp"
    Returns
    -------
    None.
    """
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

In [10]:
def dumpRead(filename: str = "temp") -> list:
    """
    Quick read dumped data
    Parameters
    ----------
    filename : str, optional
        Name of the file to read. The default value is "temp"
    Returns
    -------
    list
        Previously saved data by using dump().
    """
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    return data

In [11]:
def head(data: list, n_lines: int = 5) -> None:
    """
    Print the first elements of a given dataset
    Parameters
    ----------
    data : list
        Dataset.
    n_lines : int, optional
        Number of lines to display. The default is 5.
    Returns
    -------
    None
        DESCRIPTION.
    """
    for i, line in enumerate(data):
        if i < n_lines:
            print(f"{i}\t{line}")

In [12]:
def formatFile(data: list) -> list:
    """
    Format a pos-tagged database.
    Remove whitespace and punctuation from end and start of words.
    Get all words lowercase
    Parameters
    ----------
    data : list
        Database to format.
    Returns
    -------
    list
        Formatted dataset.
    """
    cleaned_db = []
    for sentence in data:
        cleaned_sentence = []
        for word, tag in sentence:
            cleaned_sentence.append((word.strip(string.punctuation + " ").lower(), tag))
        cleaned_db.append(cleaned_sentence)
    return cleaned_db

In [13]:
def read_csv(csv: str = 'data/nyt-ingredients-snapshot-2015.csv') -> list:
    """
    Read a given CSV file and save rows as separate dictionaries
    Parameters
    ----------
    data : str
        Directory to csv file
    Returns
    -------
    list
        List of rows as dictionaries
    """
    data = pd.read_csv(csv)
    keys = data.columns
    values = data.values
    rows = []
    for value in values:
        d = {}
        for i, key in enumerate(keys):
            d[key] = value[i]
        rows.append(d)
    return rows

## Pre-processing 

In [14]:
def remove_plurals(low: list) -> list:
    return list(set([singular(word) for word in low]))

In [15]:
# Pos-tagging 

MAP = {"VERB" : wn.VERB, "NOUN" : wn.NOUN, "ADJ" : wn.ADJ, "ADV" : wn.ADV}

def pos_tag_db(db: list) -> list:
    """
    PoS-Tag a given database
    Parameters
    ----------
    db : list
        Database to PoS-Tag.
    Returns
    -------
    list
        PoS-Tagged database.
    """
    return [nltk.pos_tag(sentence.split(), tagset = "universal") for sentence in db]

In [16]:
# lemmatizing 

def lemmatize_db(db: list, exclude: list = []) -> list:
    """
    Lemmatize given database
    Parameters
    ----------
    db : list
        PoS-Tagged dataset to lemmatize.
    exclude : list, optional
        Types to exclude from the output. The default is [].
    Returns
    -------
    list
        Lemmatized database.
    """
    lemmatized_db = []
    for sentence in db:
        lemmatized_sentence = []
        for w, p in sentence:
            if p in exclude:
                continue
            elif p in MAP.keys():
                lemma = nltk.WordNetLemmatizer().lemmatize(w, pos = MAP[p])
            else:
                lemma = nltk.WordNetLemmatizer().lemmatize(w)
            lemmatized_sentence.append((lemma, p))
        lemmatized_db.append(lemmatized_sentence)
    return lemmatized_db

In [17]:
# EXAMPLES USE:
    
# # Get a list of available databases
# dbs = getDatabases()

# # Clean the first one
# clean_db = cleanFile(dbs[0])

# # Print first 10 lines of database
# head(clean_db, 10)

# # PoS-Tag, lemmatize and format
# postagged = pos_tag_db(clean_db)
# lemmatized = lemmatize_db(postagged, [".", "X"])
# formatted = formatFile(lemmatized)

# # Print 5 lines from the result
# head(formatted)

In [18]:
# extracting ingredients

def list_ingredients():
    # read json files
    df1 = pd.read_json("data/train.json")
    df2 = pd.read_json("data/test.json")
    
    # extract ingredients coloumn and convert to one list of ingredients
    df1_ingre = df1["ingredients"]
    df2_ingre = df2["ingredients"]
    
    all_ingre = pd.Series.tolist(df1_ingre) + pd.Series.tolist(df2_ingre)
    
    # convert list of lists to a flat list
    list_of_ingre = []
    
    for element in all_ingre:
        for item in element:
            list_of_ingre.append(item)
    
    # remove duplicates        
    final_ingre = list(dict.fromkeys(list_of_ingre)) # outputs a list of 7137 ingredients
    
    with open('final_ingr.csv', 'w', newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(final_ingre)
    
    return final_ingre       

In [54]:
# creating a list of ingredients

url = "https://world.openfoodfacts.org/ingredients.json"
page = urlopen(url)
html = page.read()
soup = BeautifulSoup(html) 
u = soup.decode('utf-8')
u = u.split('"name":')

#creating a csv file with ingredients
localFile = open('ingredients.csv', 'w')

#creating a list of ingredients
ingredients = []
for line in u:
    ingredients.append(line.split(',')[0])
del ingredients[0]
for ingredient in ingredients:
    localFile.write(ingredient)
localFile.close()

In [20]:
for element in ingredients:
    # remove any string that contains digits, these are the E numbers
    if any(map(str.isdigit, element)):
        ingredients.remove(element)

In [21]:
ingredients_str = str(ingredients)
cleanString1 = re.sub('"','', ingredients_str )
cleanString = re.sub("'",'', cleanString1)
ingredients = cleanString
ingredients = ingredients.split(',')

for ingr in ingredients:
    if len(ingr) < 4:
        ingredients.remove(ingr)
final = []   

for ingr in ingredients:
    final.append(ingr[1:])
ingredients = final

In [22]:
saved = ["egg", "fat", "oat", "ham", "tea", "ham", "oil","pea", "rye", "fig", "cod", "ice"]
to_remove = [ 'green', 't', 'cone', 'glaze','ngredients','gredients', 'powder', 'sauce','Sauce' , 'serving', 
             'whole', 'brown','ingredient', 'flakes', 'maple', 'baking', 'serv', 'black', 'diced', 'white', 'paste',
             'roll', 'cooked', 'blend', 'approx', 'cooked', 'ingredients', 'dehydrated']
for ingr in ingredients:
    if ingr in to_remove:
        ingredients.remove(ingr)
    if len(ingr) == 3 and ingr.lower() not in saved:
        ingredients.remove(ingr)

In [23]:
# First database : 'VegetarianRecipes.exl'

# data structure of clean_db is a list of strings
clean_db = cleanFile(getDatabases()[0])
  
# merging the cleaned sentenes 
vegetarian_recipes = []

for sentence in clean_db:
    vegetarian_recipes.append(sentence)
    
# split into separate recipes
split_vegetarian_recipes = str(vegetarian_recipes).split('EACH')
del split_vegetarian_recipes[0]

In [24]:
# writing the recipes to a csv file

with open('recipes_veg.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["EACH PORTION", 'TEMPERATURE', "METHODstr", "METHOD", "NOTE", "CONTAINS (NOUNS)", "INGREDIENTS"])
    # loop through each recipes 
    allmethod = []
    for recipe in split_vegetarian_recipes:
        # for each recipe create a list
        portion = []
        method = []
        temperature = []
        pan_size = []
        note = []
        contains = []
        for sent in re.split(',|\n', recipe):
            if search('PORTION:', sent):
                sent.replace('PORTION:','')
                sent.replace('PAN','')
                portion.append(sent.replace('PORTION:',''))
                    
            elif search('TEMPERATURE:', sent):
                temperature.append(sent.replace('TEMPERATURE:',''))
                
            elif search('PAN SIZE:', sent):
                pan_size.append(sent.replace('PAN SIZE:',''))
                    
            elif search('NOTE:', sent):
                note.append(sent.replace('NOTE:',''))
            else:
                method.append(sent)
        # merge method (as a list) into one single string
        method_str = ""
        for text in method:
            method_str += str(text) + " "
        savemethodstr = method_str
        
        for ingr in ingredients:
            if ingr.lower() in savemethodstr:
                # replace with ingredient mask
                savemethodstr = savemethodstr.replace(ingr.lower(), " INGREDIENT ")
                contains.append(ingr.lower())
                # turn ingredient into one token
                ingr_dash = ingr.lower().replace(' ', '')
                method_str = method_str.replace(ingr.lower(), ingr_dash)
                
        tokenizer = nltk.RegexpTokenizer(r"\w+")
        method_str = tokenizer.tokenize(method_str)
        # lemmatize and POS tag the method string
        # PoS-Tag, lemmatize and format
        
        postagged = pos_tag_db(method_str)
        # lemmatized = lemmatize_db(postagged, [".", "X"])
        formatted = formatFile(postagged)
        # method_tagged = formatted
        method_tagged = postagged
        
        
        onelist = []
        twolist = []
        contains_nospace = []
        for con in contains:
            cons = con.lower().replace(' ', '')
            contains_nospace.append(cons)
            
        for pair in method_tagged:
            onelist.extend(pair)
            
        for w, tag in onelist:
            pair = tuple([w, tag])
            if w in contains_nospace:
                pair2 = tuple((w, 'INGREDIENT'))
                twolist.append(pair2) 
            else:
                twolist.append(pair)
                
                        
        # the ingredients are turned into a set, so there is no repetition of ingredients.
        writer.writerow([portion, temperature, savemethodstr ,twolist, note, contains, set(contains)])  

In [25]:
# Second database: data/ArmedForcesRecipes.exl'

# data structure of clean_db is a list of strings
clean_db = cleanFile(getDatabases()[2])

# merging the cleaned sentenes 
armedforces_recipes = []
for sentence in clean_db:
    armedforces_recipes.append(sentence)
    
# split the recipes into lists
split_armedforces_recipes = str(armedforces_recipes).split('EACH')
del split_armedforces_recipes[0]

In [26]:
#writing the recipes to a csv file
with open('recipes_armed.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["EACH PORTION", 'TEMPERATURE', "METHODstr", "METHOD", "NOTE", "CONTAINS (NOUNS)", "INGREDIENTS"])
    # loop through each recipes 
    allmethod = []
    for recipe in split_armedforces_recipes:
        # for each recipe create a list
        portion = []
        method = []
        temperature = []
        pan_size = []
        note = []
        contains = []
        for sent in re.split(',|\n', recipe):
            if search('PORTION:', sent):
                sent.replace('PORTION:','')
                sent.replace('PAN','')
                portion.append(sent.replace('PORTION:',''))
                    
            elif search('TEMPERATURE:', sent):
                temperature.append(sent.replace('TEMPERATURE:',''))
                
            elif search('PAN SIZE:', sent):
                pan_size.append(sent.replace('PAN SIZE:',''))
                    
            elif search('NOTE:', sent):
                note.append(sent.replace('NOTE:',''))
            else:
                method.append(sent)
        # merge method (as a list) into one single string
        method_str = ""
        for text in method:
            method_str += str(text) + " "
        savemethodstr = method_str
        
        for ingr in ingredients:
            if ingr.lower() in savemethodstr:
                # replace with ingredient mask
                savemethodstr = savemethodstr.replace(ingr.lower(), " INGREDIENT ")
                contains.append(ingr.lower())
                # turn ingredient into one token
                ingr_dash = ingr.lower().replace(' ', '')
                method_str = method_str.replace(ingr.lower(), ingr_dash)
                
        tokenizer = nltk.RegexpTokenizer(r"\w+")
        method_str = tokenizer.tokenize(method_str)
        # lemmatize and POS tag the method string
        # PoS-Tag, lemmatize and format
      
        postagged = pos_tag_db(method_str)
        # lemmatized = lemmatize_db(postagged, [".", "X"])
        formatted = formatFile(postagged)
        # method_tagged = formatted
        method_tagged = postagged
        
        onelist = []
        twolist = []
        contains_nospace = []
        for con in contains:
            cons = con.lower().replace(' ', '')
            contains_nospace.append(cons)
            
        for pair in method_tagged:
            onelist.extend(pair)
            
        for w, tag in onelist:
            pair = tuple([w, tag])
            if w in contains_nospace:
                pair2 = tuple((w, 'INGREDIENT'))
                twolist.append(pair2) 
            else:
                twolist.append(pair)
        # the ingredients are turned into a set, so there is no repetition of ingredients.
        writer.writerow([portion, temperature, savemethodstr ,twolist, note, contains, set(contains)])  

In [28]:
# Third database: 'data/CommonRecipes.exl'

# data structure of clean_db is a list of strings
clean_db = cleanFile(getDatabases()[3])

# merging the cleaned sentenes 
common_recipes = []
for sentence in clean_db:
    common_recipes.append(sentence)
    
# split the recipes into lists
split_common_recipes = str(common_recipes).split('EACH')

In [29]:
# writing the recipes to a csv file
with open('recipes_common.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["EACH PORTION", 'TEMPERATURE', "METHODstr", "METHOD", "NOTE", "CONTAINS (NOUNS)", "INGREDIENTS"])
    # loop through each recipes 
    allmethod = []
    for recipe in split_common_recipes:
        # for each recipe create a list
        portion = []
        method = []
        temperature = []
        pan_size = []
        note = []
        contains = []
        for sent in re.split(',|\n', recipe):
            if search('PORTION:', sent):
                sent.replace('PORTION:','')
                sent.replace('PAN','')
                portion.append(sent.replace('PORTION:',''))
                    
            elif search('TEMPERATURE:', sent):
                temperature.append(sent.replace('TEMPERATURE:',''))
                
            elif search('PAN SIZE:', sent):
                pan_size.append(sent.replace('PAN SIZE:',''))
                    
            elif search('NOTE:', sent):
                note.append(sent.replace('NOTE:',''))
            else:
                method.append(sent)
        #merge method (as a list) into one single string
        method_str = ""
        for text in method:
            method_str += str(text) + " "
        savemethodstr = method_str
        
        for ingr in ingredients:
            if ingr.lower() in savemethodstr:
                # replace with ingredient mask
                savemethodstr = savemethodstr.replace(ingr.lower(), " INGREDIENT ")
                contains.append(ingr.lower())
                # turn ingredient into one token
                ingr_dash = ingr.lower().replace(' ', '')
                method_str = method_str.replace(ingr.lower(), ingr_dash)
                
        tokenizer = nltk.RegexpTokenizer(r"\w+")
        method_str = tokenizer.tokenize(method_str)
        # lemmatize and POS tag the method string
        # PoS-Tag, lemmatize and format
        
    
        postagged = pos_tag_db(method_str)
        # lemmatized = lemmatize_db(postagged, [".", "X"])
        formatted = formatFile(postagged)
        # method_tagged = formatted
        method_tagged = postagged
        
        
        onelist = []
        twolist = []
        contains_nospace = []
        for con in contains:
            cons = con.lower().replace(' ', '')
            contains_nospace.append(cons)
            
        for pair in method_tagged:
            onelist.extend(pair)
            
        for w, tag in onelist:
            pair = tuple([w, tag])
            if w in contains_nospace:
                pair2 = tuple((w, 'INGREDIENT'))
                twolist.append(pair2) 
            else:
                twolist.append(pair)
                
            
        # the ingredients are turned into a set, so there is no repetition of ingredients.
        writer.writerow([portion, temperature, savemethodstr ,twolist, note, contains, set(contains)])   

## Co-occurrences

In [30]:
# dataset 1 - vegetarian
df = pd.read_csv('recipes_veg.csv')
saved_veg = df['INGREDIENTS'] 

# dataset 2 - armed
df = pd.read_csv('recipes_armed.csv')
saved_armed = df['INGREDIENTS'] 

# dataset 3 - common
df = pd.read_csv('recipes_common.csv')
saved_common = df['INGREDIENTS'] 

# creating one dataframe with the set of ingredients
frames = [saved_veg, saved_armed, saved_common]
result = pd.concat(frames)
result.head()

0    {'salt', 'size', 'butter', 'margarine', 'syrup...
1                                                set()
2               {'bacon', 'ice', 'tea', 'rain', 'fat'}
3             {'bacon', 'ice', 'rain', 'juice', 'fat'}
4                                       {'ice', 'fat'}
Name: INGREDIENTS, dtype: object

In [31]:
# Co-occurrences of ingredients

cooccs_surface = Counter()

# contextual coocurrence
for row in result:
    for i,w in enumerate(row):
        row = row.replace('}', "")
        row = row.replace('{', "")
        for w in row.split(','):
            for cw in row.split(','):
                if cw != w:
                    cooccs_surface[(w, cw)] += 1

In [32]:
cooccs_surface.most_common(50)

[((" 'oil'", " 'water'"), 56607),
 ((" 'water'", " 'oil'"), 56607),
 ((" 'rain'", " 'water'"), 46310),
 ((" 'water'", " 'rain'"), 46310),
 ((" 'onion'", " 'water'"), 38585),
 ((" 'water'", " 'onion'"), 38585),
 ((" 'rain'", " 'oil'"), 38546),
 ((" 'oil'", " 'rain'"), 38546),
 ((" 'onion'", " 'oil'"), 38264),
 ((" 'oil'", " 'onion'"), 38264),
 ((" 'ice'", " 'water'"), 37995),
 ((" 'water'", " 'ice'"), 37995),
 ((" 'flour'", " 'water'"), 36606),
 ((" 'water'", " 'flour'"), 36606),
 ((" 'water'", " 'sugar'"), 36578),
 ((" 'sugar'", " 'water'"), 36578),
 ((" 'oil'", " 'ice'"), 34546),
 ((" 'ice'", " 'oil'"), 34546),
 ((" 'onion'", " 'ice'"), 32445),
 ((" 'ice'", " 'onion'"), 32445),
 ((" 'pepper'", " 'water'"), 31684),
 ((" 'water'", " 'pepper'"), 31684),
 ((" 'water'", " 'salt'"), 31256),
 ((" 'salt'", " 'water'"), 31256),
 ((" 'ice'", " 'rain'"), 30580),
 ((" 'rain'", " 'ice'"), 30580),
 ((" 'water'", " 'shortening'"), 30254),
 ((" 'shortening'", " 'water'"), 30254),
 ((" 'pepper'", " 'o

In [33]:
# dataset 1 - vegetarian
df = pd.read_csv('recipes_veg.csv')
saved_veg = df['METHOD'] 

# dataset 2 - armed
df = pd.read_csv('recipes_armed.csv')
saved_armed = df['METHOD'] 

# dataset 3 - common
df = pd.read_csv('recipes_common.csv')
saved_common = df['METHOD'] 

# creating one dataframe with the methods
frames = [saved_veg, saved_armed, saved_common]
method = pd.concat(frames)
method.head()

0    [('Score', 'NOUN'), ('cored', 'VERB'), ('unpee...
1                                                   []
2    [('Arrange', 'NOUN'), ('slices', 'NOUN'), ('in...
3    [('3', 'NUM'), ('In', 'ADP'), ('Step', 'NOUN')...
4    [('2', 'NUM'), ('GRILLED', 'NOUN'), ('BACON', ...
Name: METHOD, dtype: object

In [55]:
# Co-occurrences of verbs vs ingredients

verb_cooccs_surface = Counter()

spansize = 5
for row in method:
    row = row.split("),")
    for i,w in enumerate(row):
        w = w.replace('(', "")
        w = w.replace('[', "")
        if "VERB" in w:
            span_range = list(range(max(i - spansize, 0), i)) # left side indices (range, then list so we can extend)
            span_range.extend(range(i + 1, min(i + spansize + 1, len(w)))) # extend by right side indices
            for cw in [row[idx] for idx in span_range]:
                if 'INGREDIENT'in cw:
                    verb_cooccs_surface[(w.lower(), cw)] += 1

In [35]:
verb_cooccs_surface.most_common(50)

[((" 'add', 'verb'", " ('water', 'INGREDIENT'"), 55),
 ((" 'greased', 'verb'", " ('batter', 'INGREDIENT'"), 38),
 ((" 'melted', 'verb'", " ('salad', 'INGREDIENT'"), 34),
 ((" 'melted', 'verb'", " ('oil', 'INGREDIENT'"), 33),
 ((" 'may', 'verb'", " ('garlic', 'INGREDIENT'"), 31),
 ((" 'be', 'verb'", " ('garlic', 'INGREDIENT'"), 31),
 ((" 'used', 'verb'", " ('garlic', 'INGREDIENT'"), 30),
 ((" 'add', 'verb'", " ('milk', 'INGREDIENT'"), 27),
 ((" 'floured', 'verb'", " ('batter', 'INGREDIENT'"), 22),
 ((" 'used', 'verb'", " ('water', 'INGREDIENT'"), 20),
 ((" 'will', 'verb'", " ('celery', 'INGREDIENT'"), 20),
 ((" 'running', 'verb'", " ('chicken', 'INGREDIENT'"), 20),
 ((" 'boiling', 'verb'", " ('water', 'INGREDIENT'"), 19),
 ((" 'running', 'verb'", " ('water', 'INGREDIENT'"), 19),
 ((" 'be', 'verb'", " ('chicken', 'INGREDIENT'"), 19),
 ((" 'may', 'verb'", " ('juice', 'INGREDIENT'"), 17),
 ((" 'be', 'verb'", " ('juice', 'INGREDIENT'"), 17),
 ((" 'used', 'verb'", " ('juice', 'INGREDIENT'"),

## LSTM RNN model

In [38]:
# LSTM MODEL CODE IS ADAPTED FROM:
# https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

In [39]:
# configure 

DATANAME = "final_shortened"   # Name of the processed data file
N_RECIPES = 5                  # Number of sentences to generate when generate() is called
N_EPOCHS = 20                  # Number of epochs in model training
BATCH_SIZE = 128               # Bar-tch size while model training

In [40]:
def prepare_data(p=True):
    """
    Read the existing data
    p: bool = Be verbose
    Outputs a tuple of names, units and recipes
    """
    if p: print("Preparing data...")
    data = pd.read_csv("data/nyt-ingredients-snapshot-2015.csv")
    names = [str(name).lower().strip(string.punctuation + " ") for name in data["name"]] + list_ingredients()
    units = {str(unit).lower().strip(string.punctuation + " ") for unit in data["unit"] if len(str(unit)) < 14 \
             and not str(unit)[0].isdigit()}
    databases = getDatabases()
    recipes = []
    for i in range(len(databases)):
        recipes += [row for row in cleanFile(databases[i]) if len(row) > 50 and not row[0].isdigit()]
        if p: print(f"Loaded database {i+1}/{len(databases)}")
    if p: print("Prepared data!\n")
    return (names, units, recipes)

In [41]:
def process_data(data, dumpname=DATANAME):
    """
    Clear and format the data
    """
    print("Processing data...")
    names, units, recipes = data
    db = []
    len_recipes = len(recipes)
    for i, row in enumerate(recipes):
        if i%100 == 0: print(f"Processing data... {i}/{len_recipes}")
        for word in row.split(" "):
            word = word.strip().strip(string.punctuation + " ")
            if len(word) > 0:
                if word in names or check_contains(word, names):
                    db.append("INGREDIENT")
                else:
                    db.append(word.lower())
        db.append("\n")
    dump(db, DATANAME)
    print(f"Processed data! Saved as {DATANAME}\n")
    return db

In [42]:
def init_vars():
    """
    Prepare the data for training
    """
    data = dumpRead(DATANAME)
    words = sorted(list(set(data)))
    ewords = dict((c, i) for i, c in enumerate(words))
    enumbs = dict((i, c) for i, c in enumerate(words))
    seq_length = 100
    dataX = []
    dataY = []
    for i in range(0, len(data) - seq_length, 1):
     	seq_in = data[i:i + seq_length]
     	seq_out = data[i + seq_length]
     	dataX.append([ewords[char] for char in seq_in])
     	dataY.append(ewords[seq_out])
    n_patterns = len(dataX)
    X = np.reshape(dataX, (n_patterns, seq_length, 1))
    X = X / float(len(words))
    y = np_utils.to_categorical(dataY)
    return (X, y, dataX, dataY, words, ewords, enumbs)

In [43]:
def create_model(var, epochs=N_EPOCHS, batch_size=BATCH_SIZE):
    """
    Create the LSTM model and save it as a .hdf5 file
    REQUIRES A FOLDER NAMED "model" IN THE DIRECTORY
    TAKES VERY LONG TO RUN
    """
    X, y, dataX, dataY, words, ewords, enumbs = var
    print("Creating model...")
    model = Sequential()
    model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    filepath="model/weights-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)
    print("Created model!\n")
    return model

In [44]:
def create_recipe(var, fname = "weights-20-3.1225.hdf5"):
    """
    Use the created model at "model/{fname}" to generate text
    Returns a recipe
    """
    X, y, dataX, dataY, words, ewords, enumbs = var
    model = Sequential()
    model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dropout(0.5))
    model.add(Dense(y.shape[1], activation='softmax'))
    filename = "model/" + fname
    model.load_weights(filename)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    start = np.random.randint(0, len(dataX)-1)
    pattern = dataX[start]
    recipe = []
    for i in range(50):
        if i == 0 or recipe[-1] != "\n":
            result = ""
            x = np.reshape(pattern, (1, len(pattern), 1))
            x = x / float(len(words))
            prediction = model.predict(x, verbose=0)
            index = np.argmax(prediction)
            result = enumbs[index]
            recipe.append(result)
            pattern.append(index)
            pattern = pattern[1:len(pattern)]
    # print(recipe)
    return recipe

In [45]:
def select_recipe(var):
    """
    Uses create_recipe() to create a recipe
    Then selects the better ones
    """
    recipe = ["INGREDIENT"]
    def count_ingredients():
        n = 0
        for word in recipe:
            if "INGREDIENT" in word.upper():
                n += 1
        return n/len(recipe)
    attempts = 0
    # Make sure the sentence is decently long, and ingredients make up less than 40% of the sentence.
    while len(recipe) < 8 or count_ingredients() > 0.4:
        recipe = create_recipe(var=var)
        attempts += 1
        # Quit if recipe cannot be generated
        if attempts > 99:
            recipe = ["ERROR:", "", "", "", "Failed", "to", "generate", "a", "recipe", "\n"]
    # Remove common words that cannot end the sentence
    while recipe[-2] in ["to", "from", "is", "until", "and", "when", "an", "a", "or", "if", "on", "at"]:
        recipe = recipe[:-2] + recipe[-1:]
    # Generate ingredients
    ingredients = generate_ingredients()
    j = -1
    units = prepare_data(p=False)[1]
    # Put the generated iingredients in place
    for i, word in enumerate(recipe):
        if word == "INGREDIENT":
            j = min(j+1, len(ingredients)-1)
            recipe[i] = ingredients[j]
        elif word == "UNIT":
            recipe[i] = random.choice(list(units))
    return ".".join(" ".join(recipe).split(" \n"))

In [46]:
def check_contains(i, l, p=False):
    """
    Check if the item i (or singular versions) are in the list l
    """
    if p or len(i) > 3:
        return (i[-1] == "s" and i[:-1] in l) or (i[-2:] == "es" and i[:-2] in l) or (i[-3:] == "ies" and i[:-3] + "y" in l)
    return False

In [47]:
def read_coocs():
    """
    Read the file "Cococcs of ingredients"
    """
    raw = dict(dumpRead("data/Cococcs of ingredients"))
    coocs = dict()
    for key in raw:
        coocs[(key[0][2:-1].lower(), key[1][2:-1].lower())] = raw[key]
    return dict(sorted(coocs.items(), key=lambda item: item[1], reverse=True))

In [50]:
def generate_ingredients_h(il=[], init=random.choice(dumpRead("model/inglist_final"))):
    """
    Helper function for generate_ingredients()
    """
    coocs = read_coocs()
    selected = il
    for key in coocs:
        if len(il) < 20 and key[0] == init and key[0] not in selected:
            selected.append(key[1])
    return selected

In [51]:
def generate_ingredients():
    """
    Generate a list of related ingredients
    It is possible to comment out this code and add a list of ingredients manually.
    For example:
        selected = ["pasta", "tomato", "garlic", "oil", "onions", "salt", "pepper"]
        return selected
    This way you can use custom ingredients
    """
    first = ""
    selected = []
    # Choose a random, valid first ingredient
    while first == "" or len(first) > 30:
        first = random.choice(dumpRead("model/inglist_final"))
    # Get a list of 20 ingredients that are related to the previous ones
    # Not all 20 are used, this is just to make sure that the model does not run out of ingredients.
    # 20 is deduced by:
    #   [maximum length of a sentence] x [maximum percentage of ingredients in a sentence]
    #   50 x 0.40 = 20
    while len(selected) < 20:
        selected += generate_ingredients_h(init=random.choice(dumpRead("model/inglist_final")))
    return selected

## Run the model 

HOW TO USE
1. `prepare()`
2. `create()`
3. Update `create_recipe()` to use the desired .hdf5 file
4. `generate()`

In [52]:
# Prepare the data (skip if it is already processed)
def prepare(fname=DATANAME):
    raw_data = prepare_data()
    process_data(raw_data, fname)

# Create an LSTM model for text generation
def create(epochs=N_EPOCHS, batch_size=BATCH_SIZE):
    var = init_vars()
    create_model(var=var, epochs=epochs, batch_size=batch_size)

# Generate n sentences for the recipe
def generate(n=N_RECIPES):
    var = init_vars()
    for i in range(n):
        print(f"{i+1}/{n}:", select_recipe(var))