In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim import models, similarities
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary
import time
import scipy as sp
import pickle

In [2]:
RANDOM_STATE = 789
raw_recipe = pd.read_csv("src/data/kaggle_food_data/RAW_recipes.csv")

In [3]:
raw_recipe.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
# converting to list objects
to_convert_list = "tags nutrition steps ingredients".split(" ")
for c in to_convert_list:
    raw_recipe[c] = raw_recipe[c].apply(lambda x: eval(x))
    
raw_recipe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [5]:
# pickle formatted df
raw_recipe.to_pickle("src/data/kaggle_food_data/raw_recipes.pkl")

In [5]:
def prep_ingr(ingredients):
    """preprocess formatting of the list of ingredients
    
    will remove string 'and' and '&' if present
    
    Args:
        ingredients (list of strings): list of ingredients
    
    Returns:
        list: list of formatted ingredients 
    """
    toreturn = []
    for ingr in ingredients:
        
        # remove 'and' or '&' if exsits
        if "and" in ingr or "&" in ingr:
            ingr = ingr.replace("and", "").replace("&","") #remove
            ingr = ingr.split(" ")
            # remove empty strings
            while "" in ingr:
                ingr.remove("")
                
            for i in ingr:
                toreturn.append(i)
        else:
            toreturn.append("_".join(ingr.split(" ")))
    return toreturn

In [6]:
print(prep_ingr(["water","salt and pepper","black pepper"]))
prep_ingr(["water","salt and pepper","black pepper"]) == ["water","salt","pepper", "black_pepper"]

['water', 'salt', 'pepper', 'black_pepper']


True

In [31]:
raw_recipe

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8
...,...,...,...,...,...,...,...,...,...,...,...,...
231632,zydeco soup,486161,60,227978,2012-08-29,"[ham, 60-minutes-or-less, time-to-make, course...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"[heat oil in a 4-quart dutch oven, add celery ...",this is a delicious soup that i originally fou...,"[celery, onion, green sweet pepper, garlic clo...",22
231633,zydeco spice mix,493372,5,1500678,2013-01-09,"[15-minutes-or-less, time-to-make, course, pre...","[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",1,[mix all ingredients together thoroughly],this spice mix will make your taste buds dance!,"[paprika, salt, garlic powder, onion powder, d...",13
231634,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"[60-minutes-or-less, time-to-make, course, mai...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"[in a bowl , combine the mashed yolks and mayo...","deviled eggs, cajun-style","[hard-cooked eggs, mayonnaise, dijon mustard, ...",8
231635,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"[30-minutes-or-less, time-to-make, course, pre...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,[place melted butter in a large mixing bowl an...,"i've heard of the 'cookies by design' company,...","[butter, eagle brand condensed milk, light bro...",10


In [15]:
def train_lda_model(ingredients, n_topics, n_iters = 10, random_state = RANDOM_STATE):
    """train LDA model
    
    Args:
        ingredients (list of strings): list of ingredients 
        n_topics (int): number of topics to generate
        n_iters (int, optional): number of iterations when training the model
        random_state (int, optional): random state
    
    Returns:
        model (gensim LDA model): trained LDA model
        ing_dict (gensim.Dictionary): bow of ingredients
        corpus (list of id): list of ingredients converted to ints
    """
    s = time.time()
    # format string
    if isinstance(ingredients, pd.Series):
        tot_ingredients = ingredients.apply(prep_ingr)
    else: # if the ingridients is list
        tot_ingredients = [prep_ingr(i) for i in ingredients]
    
    # convert tot_ingridents to list
    if not isinstance(tot_ingredients, list):
        tot_ingredients = list(tot_ingredients)
    
    # generate ingredients dictionary
    ing_dict = Dictionary([ing for ing in tot_ingredients])
    
    # create corpus using list of ingredients
    corpus = [ing_dict.doc2bow(ing) for ing in tot_ingredients]
    
    # train model
    model = LdaModel(corpus=corpus, num_topics = n_topics, passes = n_iters, random_state = random_state,
                    id2word = ing_dict)
    
    print("time taken to run train_lda_model function : {}".format(time.time()-s))
    return model, ing_dict, corpus


def get_sim(model,corpus, word_vects):
    index = similarities.MatrixSimilarity(model[corpus])
    sims = index[word_vects]
    return sims


def calculate_sim(docs, corpus, model = lda_model, dct = dictionary):
    """calculate similarity"""
    bag_words = dct.doc2bow(docs)
    bow_vects = model[bag_words]
    similarity_ranks = get_sim(model = model,corpus = corpus, word_vects = bow_vects)
    similarity_ranks = sorted(enumerate(similarity_ranks), key= lambda x: x[1], reverse=True) # sort by second item
    return similarity_ranks
    
    
def get_doc_sim(docs, model = lda_model, dct = dictionary, corpus = ingredients_coprusredients_corpus):
    """calculate cosine similiarity between document and the input document"""
    sim_ranks = calculate_sim(docs= docs, model = model, dct = dct, corpus=corpus)
    
    groups = []
    for l in model[corpus]:
        try:
            groups.append(l[0][0])
        except:
            groups.append()
            

SyntaxError: invalid syntax (<ipython-input-15-899d4fedebcf>, line 55)

In [19]:
ingr_all = raw_recipe["ingredients"].apply(lambda x : prep_ingr(x))
d = Dictionary(list(ingr_all))
corp = [d.doc2bow(t) for t in list(ingr_all)]
n_topic = 100
passes = 100
lda_model = LdaModel(corp, num_topics = n_topic, passes = passes, random_state = RANDOM_STATE, id2word = d)

In [8]:
vectors = lda_model[d.doc2bow(ds)]
#index = similarities.SparseMatrixSimilarity(corp, num_features = len(d), maintain_sparsity = True)
#sims_ranks = index[vectors]

NameError: name 'lda_model' is not defined

In [7]:
ingr_all = raw_recipe["ingredients"].apply(lambda x : prep_ingr(x))
d = Dictionary(list(ingr_all))
corp = [d.doc2bow(t) for t in list(ingr_all)]
n_topic = 100
passes = 100
lda_multi = models.ldamulticore.LdaMulticore(corpus = corp, num_topics = n_topic,
                                             id2word  = d, workers= 3, passes = passes,
                                            random_state = RANDOM_STATE)

In [11]:
ds = "garlic_powder, salt, chicken_thigh, pepper, rosemary".split(", ")
vects = d.doc2bow(ds)
groups = lda_multi[vects] #returns probs belonging to ceratin group/topics


[(34, 0.209099), (64, 0.45907965), (77, 0.17014381)]

In [17]:
raw_recipe["pp_ingredients"] = raw_recipe["ingredients"].apply(prep_ingr)

In [19]:
raw_recipe["ingredients_bow"] = [d.doc2bow(dc) for dc in raw_recipe["pp_ingredients"]]

In [20]:
raw_recipe["groups"] = raw_recipe["ingredients_bow"].apply(lambda x: lda_multi[x])

In [29]:
def get_best_group(groups):
    groups.sort(key=lambda x: x[1], reverse=True) # from highest to lowest
    if groups != []:
        return groups[0][0]
    return []
        
    
raw_recipe["best_group"] = raw_recipe["groups"].apply(get_best_group)
#raw_recipe["best_group"] = raw_recipe["groups"]

In [32]:
raw_recipe.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,pp_ingredients,ingredients_bow,groups,best_group
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,"[winter_squash, mexican_seasoning, mixed_spice...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[(4, 0.30515543), (1, 0.29214182), (82, 0.2814...",4
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,"[prepared_pizza_crust, sausage_patty, eggs, mi...","[(5, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11,...","[(76, 0.45780423), (49, 0.4021938)]",76
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,"[ground_beef, yellow_onions, diced_tomatoes, t...","[(5, 1), (13, 1), (14, 1), (15, 1), (16, 1), (...","[(2, 0.26784128), (67, 0.17992806), (58, 0.172...",2
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11,"[spreadable, cheese, with, garlic, herbs, new_...","[(4, 1), (5, 1), (7, 1), (10, 1), (25, 1), (26...","[(76, 0.28519863), (55, 0.16073802), (34, 0.15...",76
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8,"[tomato_juice, apple_cider_vinegar, sugar, sal...","[(5, 1), (10, 1), (36, 1), (37, 1), (38, 1), (...","[(67, 0.49024317), (61, 0.21362764), (44, 0.15...",67


In [33]:
df_ = raw_recipe.copy()

def get_recs_from_group(l_ingrs, lda_model = lda_multi, dct = d, corpus = corp, df = df_, n = 5):
    bows = dct.doc2bow(l_ingrs)
    groups = lda_model[bows]
    if groups == []:
        print("these ingredients do not have similarities to other recipes")
        print("please try again with other ingredients")
        pass
    else:
        groups.sort(key = lambda x: x[1], reverse = True)
        return df[df["best_group"] == groups[0][0]].reset_index(drop=True).head(n) #

In [None]:
get_recs_from_group("garlic_powder, salt, chicken_thigh, pepper, rosemary".split(", "))

In [26]:
g = [(34, 0.209099), (64, 0.45907965), (77, 0.17014381)]
g.sort(key= lambda x: x[1], reverse=True)
g[0][0]

64

In [13]:
def save_pickle(obj, file_name, file_type = "pkl"):
    file = open(f"src/data/pkls/{file_name}.{file_type}", "wb")
    pickle.dump(obj, file)
    file.close()

In [31]:
save_pickle(lda_multi, "lda_mult")
save_pickle(d, "ingr_dictionary")
save_pickle(corp, "ingr_corpus")
save_pickle(raw_recipe, "recipe_df_w_groups_and_bow")

In [None]:
lda_multi = pickle.load("src/data/pkls/lda_mult.pkl")


In [93]:
ingr_all_bow = raw_recipe["ingredients"].apply(prep_ingr)

In [108]:
raw_recipe.iloc[212177,:].ingredients

['ducks',
 'salt and pepper',
 'garlic powder',
 'purple plums',
 'frozen pink lemonade concentrate',
 'fresh lemon juice',
 'dry white wine',
 'soy sauce',
 'orange rind',
 'fresh lemon rind',
 'powdered ginger',
 'salt and black pepper',
 'worcestershire sauce',
 'brown sugar',
 'catsup',
 'red food coloring',
 'oranges',
 'crabapples',
 'maraschino cherries']

In [106]:
ingr_all_bow[212177]

['ducks',
 'salt',
 'pepper',
 'garlic_powder',
 'purple_plums',
 'frozen_pink_lemonade_concentrate',
 'fresh_lemon_juice',
 'dry_white_wine',
 'soy_sauce',
 'orange_rind',
 'fresh_lemon_rind',
 'powdered_ginger',
 'salt',
 'black',
 'pepper',
 'worcestershire_sauce',
 'brown_sugar',
 'catsup',
 'red_food_coloring',
 'oranges',
 'crabapples',
 'maraschino_cherries']

In [109]:
gensim.matutils.hellinger(d.doc2bow(ingr_all_bow[212177]), vectors)

3.2797007870909227

In [105]:
raw_recipe.iloc[212177,:]

name                          the late great fred koss  purple duck
id                                                           114931
minutes                                                         210
contributor_id                                                39547
submitted                                                2005-04-01
tags              [time-to-make, course, main-ingredient, cuisin...
nutrition           [1601.1, 192.0, 103.0, 54.0, 80.0, 209.0, 24.0]
n_steps                                                          11
steps             [preheat oven to 400 degrees f, quarter 2 duck...
description       i originally found this recipe on the internet...
ingredients       [ducks, salt and pepper, garlic powder, purple...
n_ingredients                                                    19
sim                                                        0.138675
Name: 212177, dtype: object

In [23]:
corp = [d.doc2bow(t) for t in list(ingr_all)]
t = " ".join(['winter_squash', 'mexican_seasoning', 'mixed_spice', 'honey', 'butter', 'olive_oil'])
#txt = d.doc2bow(t)
top_topics = lda_model.top_topics( corp)

In [25]:
for i in range(len(top_topics)):
    print(top_topics[i][0])
    print(top_topics[i][1])
    print()

[(0.24536327, 'cheddar_cheese'), (0.09776891, 'salsa'), (0.07019196, 'sour_cream'), (0.054672632, 'onion'), (0.054441847, 'flour_tortillas'), (0.047107637, 'jalapeno_pepper'), (0.039222226, 'milk'), (0.03630239, 'butter'), (0.036209196, 'cocoa_powder'), (0.035592813, 'salt'), (0.034139972, 'cream_of_chicken_soup'), (0.03097768, 'splenda_sugar_substitute'), (0.030152397, 'monterey_jack_cheese'), (0.029497355, 'eggs'), (0.029218674, 'tortilla_chips'), (0.027947653, 'pepper'), (0.02051542, 'refried_beans'), (0.015620197, 'tomatoes'), (0.013290081, 'water'), (0.010645615, 'cream_of_celery_soup')]
-4.692605917932034

[(0.27640826, 'fresh_ground_black_pepper'), (0.22684282, 'green_onions'), (0.074391566, 'whole_milk'), (0.066465, 'spinach'), (0.058034457, 'broccoli'), (0.053969767, 'salt'), (0.04453842, 'garlic_cloves'), (0.03963291, 'olive_oil'), (0.026520813, 'sun-dried_tomato'), (0.021318793, 'fresh_flat-leaf_parsley'), (0.020973088, 'artichoke_hearts'), (0.016965719, 'eggs'), (0.01692203

In [47]:
ingr_all.values

array([list(['winter_squash', 'mexican_seasoning', 'mixed_spice', 'honey', 'butter', 'olive_oil', 'salt']),
       list(['prepared_pizza_crust', 'sausage_patty', 'eggs', 'milk', 'salt', 'pepper', 'cheese']),
       list(['ground_beef', 'yellow_onions', 'diced_tomatoes', 'tomato_paste', 'tomato_soup', 'rotel_tomatoes', 'kidney_beans', 'water', 'chili_powder', 'ground_cumin', 'salt', 'lettuce', 'cheddar_cheese']),
       ...,
       list(['hard-cooked_eggs', 'mayonnaise', 'dijon_mustard', 'salt-free_cajun_seasoning', 'tabasco_sauce', 'salt', 'black_pepper', 'fresh_italian_parsley']),
       list(['butter', 'eagle', 'br', 'condensed', 'milk', 'light_brown_sugar', 'sour_cream', 'egg', 'extract', 'nutmeg', 'self-rising_flour', 'bisquick', 'wooden_popsicle_sticks']),
       list(['granulated_sugar', 'shortening', 'eggs', 'flour', 'cream_of_tartar', 'baking_soda', 'vanilla_extract'])],
      dtype=object)

In [103]:
t = "salt egg four butter nutmeg winter_squash".split(" ")
lda_model[d.doc2bow(t)]

[(8, 0.21683142), (23, 0.4523037), (42, 0.17418815)]

In [None]:
lda_model.similarity()

In [None]:
gensim.matutils.cossim()

In [104]:
raw_recipe["ingredients"].apply(lambda x: d.doc2bow(x))

0                                  [(0, 1), (1, 1), (5, 1)]
1                                  [(7, 1), (8, 1), (9, 1)]
2                                [(5, 1), (19, 1), (23, 1)]
3              [(5, 1), (10, 1), (28, 1), (31, 1), (33, 1)]
4                                [(5, 1), (10, 1), (40, 1)]
                                ...                        
231632      [(40, 1), (97, 1), (99, 1), (119, 1), (155, 1)]
231633                                   [(5, 1), (155, 1)]
231634                                   [(5, 1), (338, 1)]
231635    [(0, 1), (142, 1), (154, 1), (778, 1), (3314, 1)]
231636                         [(8, 1), (110, 1), (283, 1)]
Name: ingredients, Length: 231637, dtype: object

In [107]:
raw_inter = pd.read_csv("src/data/kaggle_food_data/RAW_interactions.csv")

0