# Recipe Simple Embedding Data Setup



In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast
from sentence_transformers import SentenceTransformer
from IPython.display import clear_output

2024-01-21 11:39:16.810269: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the raw recipes, derive plain-text versions of the list-literal columns,
# then apply the quality filters in one chain.
data = (
    pd.read_csv("/media/michael/Delta/Data/Data_Science_Masters/MSDS_498/RAW_recipes.csv")
    # 'steps' and 'ingredients' are parsed with ast.literal_eval and joined into one string each
    .assign(
        steps_text=lambda df: df['steps'].apply(lambda raw: ". ".join(ast.literal_eval(raw))),
        ingredients_text=lambda df: df['ingredients'].apply(lambda raw: ", ".join(ast.literal_eval(raw))),
    )
    # keep descriptions containing at least 9 spaces (used as a proxy for >= 10 words)
    .loc[lambda df: df['description'].str.count(" ") >= 9]
    # keep recipes that use 3 or more ingredients
    .query("n_ingredients >= 3")
    # keep recipes whose joined instructions are longer than 10 characters
    .loc[lambda df: df['steps_text'].str.len() > 10]
    # fresh 0..n-1 index so later horizontal concatenation lines up row by row
    .reset_index(drop=True)
)
data

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,steps_text,ingredients_text
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,make a choice and proceed with recipe. dependi...,"winter squash, mexican seasoning, mixed spice,..."
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,preheat oven to 425 degrees f. press dough int...,"prepared pizza crust, sausage patty, eggs, mil..."
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,brown ground beef in large pot. add chopped on...,"ground beef, yellow onions, diced tomatoes, to..."
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,place potatoes in a large pot of lightly salte...,"spreadable cheese with garlic and herbs, new p..."
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,"mix all ingredients& boil for 2 1 / 2 hours , ...","tomato juice, apple cider vinegar, sugar, salt..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193680,zydeco salad,367912,5,79877,2009-04-25,"['15-minutes-or-less', 'time-to-make', 'prepar...","[14.1, 0.0, 8.0, 0.0, 1.0, 0.0, 1.0]",4,['place the lettuce on a platter or serving di...,"recipe courtesy of b&c seafood, vacherie, la a...","['iceberg lettuce', 'tomatoes', '3 bean mix', ...",4,place the lettuce on a platter or serving dish...,"iceberg lettuce, tomatoes, 3 bean mix, olive s..."
193681,zydeco sauce,357451,15,461283,2009-02-23,"['15-minutes-or-less', 'time-to-make', 'course...","[239.9, 30.0, 19.0, 22.0, 1.0, 14.0, 5.0]",3,"['to make the sauce , combine the mayonnaise ,...",great sauce for cheeseburgers or dipping fries...,"['mayonnaise', 'prepared horseradish', 'worces...",6,"to make the sauce , combine the mayonnaise , h...","mayonnaise, prepared horseradish, worcestershi..."
193682,zydeco soup,486161,60,227978,2012-08-29,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,"['celery', 'onion', 'green sweet pepper', 'gar...",22,"heat oil in a 4-quart dutch oven. add celery ,...","celery, onion, green sweet pepper, garlic clov..."
193683,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,place melted butter in a large mixing bowl and...,"butter, eagle brand condensed milk, light brow..."


In [3]:
# Instantiate the sentence-embedding model used by the encoding loop.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One empty placeholder frame per text column. Kept in its own cell so that a
# failed encoding run does not require recreating results for every column.
embedding_dict = {
    text_col: pd.DataFrame()
    for text_col in ("name", "description", "steps_text", "ingredients_text")
}

In [4]:
# Encode every text column and store the result as a DataFrame keyed by column name.
cols_to_embed = ["name", "description", "steps_text", "ingredients_text"]

for curr_col in cols_to_embed:
    # progress marker: which column is currently being encoded
    print(curr_col)
    texts = data[curr_col].tolist()
    embeddings = model.encode(texts, normalize_embeddings=True)
    # one column per embedding dimension, named <col>_embedding_NNN (zero-padded to 3 digits)
    n_dims = embeddings.shape[1]
    embedding_col_names = [f"{curr_col}_embedding_{dim:03d}" for dim in range(n_dims)]
    embeddings_df = pd.DataFrame(embeddings, columns=embedding_col_names)
    embedding_dict[curr_col] = embeddings_df

name
description
steps_text
ingredients_text


In [5]:
# Stack the per-column embedding frames side by side, then append them to the recipes.
all_embeddings = pd.concat(list(embedding_dict.values()), axis="columns")

# Remove embedding columns left over from a previous run of this cell so they
# are replaced rather than duplicated.
data = pd.concat(
    [
        data.drop(columns=[col for col in all_embeddings.columns if col in data.columns]),
        all_embeddings,
    ],
    axis="columns",
)
data

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,ingredients_text_embedding_374,ingredients_text_embedding_375,ingredients_text_embedding_376,ingredients_text_embedding_377,ingredients_text_embedding_378,ingredients_text_embedding_379,ingredients_text_embedding_380,ingredients_text_embedding_381,ingredients_text_embedding_382,ingredients_text_embedding_383
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,...,0.021177,0.007456,0.110229,-0.068143,0.088277,-0.052553,-0.054856,-0.009026,0.012120,0.082861
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,...,0.080783,0.074230,0.089201,0.006190,0.099552,0.022226,-0.062413,0.064582,0.013286,-0.014859
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,...,0.051172,0.082343,0.116373,-0.058827,0.041265,-0.043046,-0.054921,0.040403,0.032460,0.043497
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...",...,-0.001429,0.025129,0.088833,-0.009907,0.080625,-0.029695,-0.061843,0.024248,0.030216,0.069643
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,...,0.029602,0.018748,0.086573,-0.064266,0.005418,-0.051273,-0.052236,0.014114,0.075585,0.041810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193680,zydeco salad,367912,5,79877,2009-04-25,"['15-minutes-or-less', 'time-to-make', 'prepar...","[14.1, 0.0, 8.0, 0.0, 1.0, 0.0, 1.0]",4,['place the lettuce on a platter or serving di...,"recipe courtesy of b&c seafood, vacherie, la a...",...,0.073593,0.068490,0.047236,-0.013200,-0.019404,0.051077,-0.069215,-0.022110,-0.015492,-0.004447
193681,zydeco sauce,357451,15,461283,2009-02-23,"['15-minutes-or-less', 'time-to-make', 'course...","[239.9, 30.0, 19.0, 22.0, 1.0, 14.0, 5.0]",3,"['to make the sauce , combine the mayonnaise ,...",great sauce for cheeseburgers or dipping fries...,...,0.026380,0.064427,0.097591,-0.025998,0.022555,-0.049435,-0.064586,0.028330,0.045326,0.015619
193682,zydeco soup,486161,60,227978,2012-08-29,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,...,-0.024282,0.020551,0.117972,-0.015676,0.026022,-0.110060,-0.091804,0.066024,0.017025,0.062161
193683,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...",...,-0.005270,-0.037742,0.078005,-0.072237,0.083426,-0.040293,-0.006431,-0.008967,0.043936,-0.006024


In [7]:
# Read the raw reviews and pivot them into one row per recipe with a count for
# each rating value plus an overall total.
review_data = pd.read_csv("/media/michael/Delta/Data/Data_Science_Masters/MSDS_498/RAW_interactions.csv")

# rows: recipe_id, columns: rating value, cells: number of reviews (0 where none)
review_summary = review_data.groupby(['recipe_id'])['rating'].value_counts().unstack(fill_value=0)
review_summary.columns = [f"rating_count_{rating}" for rating in review_summary.columns]

review_summary = (
    review_summary
    # total is summed over the per-rating columns before the new column is attached
    .assign(rating_count_total=review_summary.sum(axis=1))
    .reset_index()
    # rename the key to 'id' so it matches the recipes table in the later merge
    .rename(columns={"recipe_id": "id"})
)
review_summary

Unnamed: 0,id,rating_count_0,rating_count_1,rating_count_2,rating_count_3,rating_count_4,rating_count_5,rating_count_total
0,38,0,0,0,0,3,1,4
1,39,0,0,0,1,0,0,1
2,40,0,1,0,0,2,6,9
3,41,0,0,0,0,1,1,2
4,43,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...
231632,537459,0,0,0,0,0,1,1
231633,537485,0,0,0,0,0,1,1
231634,537543,1,0,0,0,0,0,1
231635,537671,1,0,0,0,0,0,1


In [8]:
# Attach the per-recipe rating counts to the recipe/embedding table.
rating_cols = ["rating_count_0", "rating_count_1", "rating_count_2", "rating_count_3", "rating_count_4", "rating_count_5", "rating_count_total"]

# Drop rating columns left over from a previous run so re-running this cell
# replaces them instead of producing suffixed duplicates in the merge.
data = data.drop(columns=[col for col in rating_cols if col in data.columns])

# Left join keeps every recipe, including those with no reviews at all.
data = data.merge(review_summary, on=['id'], how='left')

# Recipes without reviews come back as NaN, which also upcasts the count columns
# to float. Fill with 0 and cast back so the counts stay integers either way.
data = data.fillna({col: 0 for col in rating_cols})
data = data.astype({col: "int64" for col in rating_cols if col in data.columns})
data

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,ingredients_text_embedding_381,ingredients_text_embedding_382,ingredients_text_embedding_383,rating_count_0,rating_count_1,rating_count_2,rating_count_3,rating_count_4,rating_count_5,rating_count_total
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,...,-0.009026,0.012120,0.082861,0,0,0,0,0,3,3
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,...,0.064582,0.013286,-0.014859,1,0,0,0,1,2,4
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,...,0.040403,0.032460,0.043497,0,0,0,0,1,0,1
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...",...,0.024248,0.030216,0.069643,0,0,0,0,1,1,2
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,...,0.014114,0.075585,0.041810,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193680,zydeco salad,367912,5,79877,2009-04-25,"['15-minutes-or-less', 'time-to-make', 'prepar...","[14.1, 0.0, 8.0, 0.0, 1.0, 0.0, 1.0]",4,['place the lettuce on a platter or serving di...,"recipe courtesy of b&c seafood, vacherie, la a...",...,-0.022110,-0.015492,-0.004447,0,0,0,0,0,3,3
193681,zydeco sauce,357451,15,461283,2009-02-23,"['15-minutes-or-less', 'time-to-make', 'course...","[239.9, 30.0, 19.0, 22.0, 1.0, 14.0, 5.0]",3,"['to make the sauce , combine the mayonnaise ,...",great sauce for cheeseburgers or dipping fries...,...,0.028330,0.045326,0.015619,0,0,0,0,0,2,2
193682,zydeco soup,486161,60,227978,2012-08-29,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,...,0.066024,0.017025,0.062161,0,0,0,0,0,6,6
193683,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...",...,-0.008967,0.043936,-0.006024,0,1,0,0,0,0,1


In [9]:
# Final dataset is complete: persist it as parquet for the next notebook to load.
data.to_parquet(
    path="/media/michael/Delta/Data/Data_Science_Masters/MSDS_498/recipes_embedded.parquet"
)