In [None]:
import os
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')

from google.colab import drive
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
drive.mount('/content/drive/')

data_recipes  = pd.read_csv("/content/drive/MyDrive/NLP/RAW_recipes.csv")

df_recipes = pd.DataFrame(data_recipes)

print(df_recipes.head())

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
                                         name      id  minutes  \
0  arriba   baked winter squash mexican style  137739       55   
1            a bit different  breakfast pizza   31490       30   
2                   all in the kitchen  chili  112140      130   
3                          alouette  potatoes   59389       45   
4          amish  tomato ketchup  for canning   44061      190   

   contributor_id   submitted  \
0           47892  2005-09-16   
1           26278  2002-06-17   
2          196586  2005-02-25   
3           68585  2003-04-14   
4           41706  2002-10-25   

                                                tags  \
0  ['60-minutes-or-less', 'time-to-make', 'course...   
1  ['30-minutes-or-less', 'time-to-make', 'course...   
2  ['time-to-make', 'course', 'preparation', 'mai...   
3  ['60-minutes-or-less', 'time-to-make', 'course

In [None]:
df_recipes.isnull().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

In [None]:
df_recipes['name'].count()

231636

In [None]:
df_recipes_unique = df_recipes['name'].unique()
print(df_recipes_unique)

['arriba   baked winter squash mexican style'
 'a bit different  breakfast pizza' 'all in the kitchen  chili' ...
 'zydeco ya ya deviled eggs' 'cookies by design   cookies on a stick'
 'cookies by design   sugar shortbread cookies']


In [None]:
df_train = df_recipes[['name', 'description', 'ingredients']]
print(df_train.head())

                                         name  \
0  arriba   baked winter squash mexican style   
1            a bit different  breakfast pizza   
2                   all in the kitchen  chili   
3                          alouette  potatoes   
4          amish  tomato ketchup  for canning   

                                         description  \
0  autumn is my favorite time of year to cook! th...   
1  this recipe calls for the crust to be prebaked...   
2  this modified version of 'mom's' chili was a h...   
3  this is a super easy, great tasting, make ahea...   
4  my dh's amish mother raised him on this recipe...   

                                         ingredients  
0  ['winter squash', 'mexican seasoning', 'mixed ...  
1  ['prepared pizza crust', 'sausage patty', 'egg...  
2  ['ground beef', 'yellow onions', 'diced tomato...  
3  ['spreadable cheese with garlic and herbs', 'n...  
4  ['tomato juice', 'apple cider vinegar', 'sugar...  


In [None]:
df_train['name'].value_counts()

name
crock pot lemon garlic chicken                  3
gluten free chocolate chip cookies              3
chocolate peanut butter cookies                 3
three bean chili                                3
pop up rolls                                    3
                                               ..
easy pineapple cake                             1
easy pineapple chicken                          1
easy pineapple chili                            1
easy pineapple dessert                          1
cookies by design   sugar shortbread cookies    1
Name: count, Length: 230185, dtype: int64

In [None]:
df_train['ingredients'].value_counts()

ingredients
['eggs', 'water']                                                                                                                                                                                                10
['flour', 'baking powder', 'salt', 'shortening', 'milk']                                                                                                                                                          6
['sugar', 'water']                                                                                                                                                                                                5
['butter', 'sugar', 'flour']                                                                                                                                                                                      5
['all-purpose flour', 'baking powder', 'salt', 'butter', 'milk']                                                                            

In [None]:
X = df_train[['ingredients']]
Y = df_train[['name']]

print(X.head())
print("------")
print(Y.head())

                                         ingredients
0  ['winter squash', 'mexican seasoning', 'mixed ...
1  ['prepared pizza crust', 'sausage patty', 'egg...
2  ['ground beef', 'yellow onions', 'diced tomato...
3  ['spreadable cheese with garlic and herbs', 'n...
4  ['tomato juice', 'apple cider vinegar', 'sugar...
------
                                         name
0  arriba   baked winter squash mexican style
1            a bit different  breakfast pizza
2                   all in the kitchen  chili
3                          alouette  potatoes
4          amish  tomato ketchup  for canning


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)

In [None]:
X_train = X_train.squeeze()
type(X_train)

In [None]:
X_test = X_test.squeeze()
type(X_test)

In [None]:
y_test = y_test.squeeze()
type(y_test)

In [None]:
y_train = y_train.squeeze()
type(y_train)

In [None]:
tfidfv = TfidfVectorizer(lowercase=True)
X_train = X_train.fillna("")
y_train = y_train.fillna("")
x_train = tfidfv.fit_transform(X_train)
x_test = tfidfv.transform(X_test)

In [None]:
import ast

def clean_tokens(input):
    try:
        ingredients = ast.literal_eval(input)
    except ValueError:
        ingredients = input.strip("[]").split(", ")
        ingredients = [word.strip("'") for word in ingredients]

    tokens = [item for sublist in ingredients for item in sublist.split()]
    return tokens

In [None]:
print(X_train.head())
X_train_clean = X_train.apply(clean_tokens)

print("-----")
print(X_train_clean.head())
model = Word2Vec(sentences=list(X_train_clean), vector_size=100, window=5, min_count=1, workers=4)

print("-----")
print(list(model.wv.index_to_key))

98704     ['boneless skinless salmon fillets', 'red chil...
186612    ['beef bones', 'carrots', 'onions', 'celery', ...
27160     ['lean ground beef', 'ground cloves', 'poultry...
62648     ['chicken breasts', 'carrots', 'onion', 'oil',...
28370     ['extra virgin olive oil', 'garlic', 'romano c...
Name: ingredients, dtype: object
-----
98704     [boneless, skinless, salmon, fillets, red, chi...
186612    [beef, bones, carrots, onions, celery, leeks, ...
27160     [lean, ground, beef, ground, cloves, poultry, ...
62648     [chicken, breasts, carrots, onion, oil, chicke...
28370     [extra, virgin, olive, oil, garlic, romano, ch...
Name: ingredients, dtype: object
-----
['salt', 'pepper', 'sugar', 'oil', 'cheese', 'fresh', 'garlic', 'butter', 'onion', 'ground', 'flour', 'powder', 'cream', 'water', 'sauce', 'juice', 'chicken', 'olive', 'milk', 'red', 'black', 'eggs', 'baking', 'green', 'vanilla', 'cloves', 'lemon', 'white', 'egg', 'dried', 'tomatoes', 'brown', 'vinegar', 'cinnamon', 'par

In [None]:
def recipe_vector(tokens, model):
    valid_tokens = [word for word in tokens if word in model.wv]
    if valid_tokens:
        return np.mean(model.wv[valid_tokens], axis=0)
    else:
        return np.zeros(model.vector_size)

recipe_vectors = [recipe_vector(recipe, model) for recipe in X_train_clean]

In [None]:
input_vector = recipe_vector(['cheese', 'bread', 'sugar', 'squash'], model)

similarities = cosine_similarity([input_vector], recipe_vectors)[0]
print(similarities)

top_indices = np.argsort(similarities)[-5:][::-1]
top_recipes = [y_train.iloc[idx] for idx in top_indices]
top_recipes_ing = [X_train.iloc[idx] for idx in top_indices]

for recipe in top_recipes:
    print(recipe)

for ing in top_recipes_ing:
    print(ing)


[-0.02178471  0.03288089  0.15965293 ...  0.16139321  0.31341076
  0.2650028 ]
cheeze bread
caramelized cheese covered grilled cheese sandwich
pear and cheese toast
kimke s grilled cheese
extra special grilled cheese sandwich
['bread', 'cheese']
['butter', 'bread', 'cheese']
['pear', 'cheese', 'bread', 'butter']
['bread', 'cheese', 'butter', 'season salt']
['bread', 'cheese', 'butter', 'jelly']


In [None]:
input_vector = recipe_vector(["butter", "cheese", "sugar", "cherry", "blueberry"], model)

similarities = cosine_similarity([input_vector], recipe_vectors)[0]
print(similarities)

top_indices = np.argsort(similarities)[-5:][::-1]
top_recipes = [y_train.iloc[idx] for idx in top_indices]
top_recipes_ing = [X_train.iloc[idx] for idx in top_indices]

for recipe in top_recipes:
    print(recipe)

for ing in top_recipes_ing:
    print(ing)


[-0.04973356 -0.1747827  -0.04018064 ...  0.07806155  0.2673572
  0.46678087]
mascarpone cheesecake with balsamic strawberries
tiramisu cheesecake
strawberry cream cheese pound cake
baby brie with praline
easy blue cheese crostini
['biscotti', 'unsalted butter', 'cream cheese', 'mascarpone cheese', 'sugar', 'eggs', 'strawberries', 'balsamic vinegar']
['cream cheese', 'sugar', 'mascarpone cheese', 'eggs', 'flour', 'coffee-flavored liqueur', 'ladyfingers', 'butter']
['butter', 'cream cheese', 'sugar', 'salt', 'butter flavoring', 'eggs', 'sifted flour', 'fresh strawberries', 'icing sugar']
['brie cheese', 'brown sugar', 'pecans', 'butter', 'strawberries']
['blue cheese', 'butter', 'french baguettes', 'sugar']


In [None]:
input_vector = recipe_vector(["butter", "cherries", "sugar", "flour"], model)

similarities = cosine_similarity([input_vector], recipe_vectors)[0]
print(similarities)

top_indices = np.argsort(similarities)[-5:][::-1]
top_recipes = [y_train.iloc[idx] for idx in top_indices]
top_recipes_ing = [X_train.iloc[idx] for idx in top_indices]

for recipe in top_recipes:
    print(recipe)

for ing in top_recipes_ing:
    print(ing)


[-0.09972291 -0.15564957  0.07239853 ...  0.10258189  0.38124645
  0.72051567]
4 ingredient peach cobbler
pie plant pie aka rhubarb pie
smul paj  crumb pie
simple scotch shortbread
super easy shortbread  3 ingredients
['flour', 'sugar', 'butter', 'peaches']
['rhubarb', 'sugar', 'butter', 'flour']
['flour', 'butter', 'sugar']
['butter', 'sugar', 'flour']
['butter', 'sugar', 'flour']


In [None]:
# input must be seperated by commas
input = "butter, cheese, sugar, cherry, blueberry"
new_tokens = clean_tokens(input)
print(new_tokens)

input_vector = recipe_vector(new_tokens, model)

similarities = cosine_similarity([input_vector], recipe_vectors)[0]

print(similarities)

top_indices = np.argsort(similarities)[-5:][::-1]

print(top_indices)

top_indices = np.argsort(similarities)[-5:][::-1]
top_recipes = [y_train.iloc[idx] for idx in top_indices]
top_recipes_ing = [X_train.iloc[idx] for idx in top_indices]

for recipe in top_recipes:
    print(recipe)

for ing in top_recipes_ing:
    print(ing)


['butter', 'cheese', 'sugar', 'cherry', 'blueberry']
[-0.04973356 -0.1747827  -0.04018064 ...  0.07806155  0.2673572
  0.46678087]
[ 96574  44795 125254  47165  53159]
mascarpone cheesecake with balsamic strawberries
tiramisu cheesecake
strawberry cream cheese pound cake
baby brie with praline
easy blue cheese crostini
['biscotti', 'unsalted butter', 'cream cheese', 'mascarpone cheese', 'sugar', 'eggs', 'strawberries', 'balsamic vinegar']
['cream cheese', 'sugar', 'mascarpone cheese', 'eggs', 'flour', 'coffee-flavored liqueur', 'ladyfingers', 'butter']
['butter', 'cream cheese', 'sugar', 'salt', 'butter flavoring', 'eggs', 'sifted flour', 'fresh strawberries', 'icing sugar']
['brie cheese', 'brown sugar', 'pecans', 'butter', 'strawberries']
['blue cheese', 'butter', 'french baguettes', 'sugar']


In [None]:
most_similar = similarities.argmax()

most_similar_text = X_train[most_similar]

print(f"Most similar recipe text: {most_similar_text}")

Most similar recipe text: ['water', 'orzo pasta', 'olive oil', 'italian-style tomatoes', 'chicken broth', 'fresh white mushroom', 'feta cheese']


In [None]:
most_similar = similarities.argmax()

most_similar_text = y_train[most_similar]

print(f"Most similar recipe text: {most_similar_text}")

Most similar recipe text: greek mushroom orzo


In [None]:
keywords = {'sweet', 'fishy', 'salty', 'hot', 'spicy', 'buttery', 'moist'}

def find_keywords(text):
    tokens = word_tokenize(text.lower())
    list_keywords = list(keywords.intersection(tokens))
    return list_keywords

df_train['description'] = df_train['description'].astype(str)
df_train['keywords'] = df_train['description'].apply(find_keywords)

print(df_train[['name', 'description', 'keywords']].head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['description'] = df_train['description'].astype(str)


                                         name  \
0  arriba   baked winter squash mexican style   
1            a bit different  breakfast pizza   
2                   all in the kitchen  chili   
3                          alouette  potatoes   
4          amish  tomato ketchup  for canning   

                                         description        keywords  
0  autumn is my favorite time of year to cook! th...  [spicy, sweet]  
1  this recipe calls for the crust to be prebaked...              []  
2  this modified version of 'mom's' chili was a h...              []  
3  this is a super easy, great tasting, make ahea...              []  
4  my dh's amish mother raised him on this recipe...              []  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['keywords'] = df_train['description'].apply(find_keywords)


In [None]:
df_with_keywords = df_train[df_train['keywords'].map(bool)]

print(df_with_keywords[['name', 'description', 'keywords']])

                                                     name  \
0              arriba   baked winter squash mexican style   
10                            berry  good sandwich spread   
32                                   grilled  ranch bread   
36      how i got my family to eat spinach  spinach ca...   
42                              i yam what i yam  muffins   
...                                                   ...   
231615                              zuvers barbecue sauce   
231618                       zwetschgenkuchen   plum cake   
231624        zwiebelkuchen   southwest german onion cake   
231625                  zwiebeln salat  swiss onion salad   
231629                                       zydeco salad   

                                              description        keywords  
0       autumn is my favorite time of year to cook! th...  [spicy, sweet]  
10      horseradish is one of my favorite condiments a...    [sweet, hot]  
32                                     

In [None]:
# check if the person is asking for a certain keyword
# then check if the person is asking for a certain ingrediant
# pasta
# times of day

# check for keywords
# check for ratings

# data visualization
# 40 hours of work

#--------------------------------
# ingrediant comparison with cosine similarity
# giving back food based on rating
# keywords (ingrediants, salty/spicy, lunch/dinner/dessert)