# Food.com Recipe Search Engine Project 

Dataset is from Kaggle, Food.com Recipes with Search Terms and Tags: https://www.kaggle.com/datasets/shuyangli94/foodcom-recipes-with-search-terms-and-tags 

In this notebook we will use TF-IDF and a negative scoring algorithm to create a search engine for Food.com recipes.

In [1]:
import pandas as pd 
import numpy as np 
import string 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact_manual # interact, interactive, interact_manual, interactive_output, fixed
import re 
import glob 

In [2]:
pd.set_option('display.max_colwidth', None) # show the entire string value in the df 
pd.set_option('display.max_rows', None) # show every row of the df instead of truncating long outputs 

### Data Preparation 

#### Clean Tokens

First, create a column that contains all of the tokens that will be put into the TF IDF Vectorizer. We will use the name, ingredients, tags, and search_terms for our algorithm.

In [3]:
def simple_clean(input):
    """
    Normalize a string into lowercase alphabetic words separated by single spaces.

    Args: 
        input (string): search query

    Returns:
        string: the lowercased input where every character outside a-z became a space,
            runs of whitespace were collapsed and both ends were trimmed
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', input.lower())
    single_spaced = re.sub(r'\s+', ' ', letters_only)  # collapse the gaps left by removed characters
    return single_spaced.strip()

def replace_double_quotes_by_word_count(match):
    """
    Replacement callback for re.sub on a double-quoted span.

    Args:
        match (re.Match): match whose group 1 is the text between the double quotes

    Returns:
        string: the quoted text without any quotes when it has 3 words or fewer,
            otherwise the text wrapped in single quotes (it may be a quoted sentence)
    """
    quoted_text = match.group(1)
    is_short_phrase = len(quoted_text.split()) <= 3

    return quoted_text if is_short_phrase else f"'{quoted_text}'"

def extract_words_in_quotes(all_terms, replace_dash=True, lowercase=True, replace_double_quotes=False, remove_apostrophes=False, make_unique=True):
    """
    Returns a list of tokens extracted from a string of tokens in single quotations. 

    Example (default flags): ['water', 'cheese']['pasta']{'sugar-free'} (str) -> [water, cheese, pasta, sugar free] (list)

    Args: 
        all_terms (string): string of tokens in single quotations ('')
        replace_dash (boolean, optional): if True replaces every dash with a space 
        lowercase (boolean, optional): if True makes all characters lowercase 
        replace_double_quotes (boolean, optional): if True rewrites double quotes before extracting:
            digits followed by " become '<digits> inch', the apostrophe in a temperature such as 350'F
            is dropped, and a double-quoted phrase is unquoted (3 words or fewer) or re-quoted with
            single quotes (see replace_double_quotes_by_word_count)
        remove_apostrophes (boolean, optional): if True removes the apostrophe of possessives and
            contractions ('s, 't, 're, 'm, 'd, 've, 'll) so it is not read as a closing quote 
        make_unique (boolean, optional): if True duplicate terms are dropped; the first occurrence of
            each term is kept and the original order is preserved 

    Returns:
        list: list of tokens 
    """

    if replace_dash:
        all_terms = all_terms.replace('-', ' ')
    if lowercase:
        all_terms = all_terms.lower()
    if replace_double_quotes:
        all_terms = re.sub(r'(\d+)"', r'\1 inch', all_terms) # change double quotes (") that refer to inches to 'inch'
        # when the text was lowercased above, the unit letter is now f/c, so match it case-insensitively;
        # without lowercasing the match stays restricted to upper-case F/C as before
        temperature_flags = re.IGNORECASE if lowercase else 0
        all_terms = re.sub(r"(\d+)'(?=[FC])", r"\1", all_terms, flags=temperature_flags) # remove single quotes (') that refer to temperature (350'F)
        all_terms = re.sub(r'"([^"]+)"', replace_double_quotes_by_word_count, all_terms) # change double quotes (") that refer to "imitation"
    if remove_apostrophes:
        # applied one suffix at a time, in this order: possessive 's, then the contractions
        for suffix in ('s', 't', 're', 'm', 'd', 've', 'll'):
            all_terms = re.sub(rf"(\w+)'{suffix}\b", rf"\1{suffix}", all_terms)
    all_terms = re.findall(r"'(.*?)'", all_terms) # '(.*?)': This pattern matches anything inside single quotes.
    if make_unique:
        # dict keys keep insertion order, so the result is the same on every run (a set is not)
        all_terms = list(dict.fromkeys(all_terms))
    return all_terms


In [4]:
def preprocess_recipes(recipe):
    """
    Add the 'name_set' entry and parse the 'steps' entry of a single recipe record.

    Args: 
        recipe: record of one recipe (used as a DataFrame row), with string entries 'name' and 'steps'

    Returns:
        recipe: the same record, where 'name_set' holds the set of cleaned words of the name and
            'steps' holds the list of steps extracted from its quoted string form 
    """
    cleaned_name_words = simple_clean(recipe['name']).split()
    recipe['name_set'] = set(cleaned_name_words)

    # steps keep their case, dashes and duplicates; only the quoting is normalized
    step_options = dict(replace_dash=False, lowercase=False, replace_double_quotes=True,
                        remove_apostrophes=True, make_unique=False)
    recipe['steps'] = extract_words_in_quotes(recipe['steps'], **step_options)

    return recipe

In [5]:
def get_all_recipes(file_pattern='/Users/averylee/Desktop/DS/recipes/recipes_w_search_terms_*.csv'):
    """
    Get info for ~500k recipes from Food.com, including:
        id: identifier (double)
        name: name of the recipe (string)
        description: description of the recipe (string)
        ingredients: list of ingredients (string) 
        ingredients_raw_str: list of portions of ingredients (string) 
        serving size: serving size in grams (string)
        servings: number of servings (double)
        steps: list of steps to follow (string)
        tags: list of tags for the recipe (string)
        search_terms: set of search terms for the recipe (string)

    Args:
        file_pattern (string, optional): glob pattern of the csv files to load; the default is the
            location the notebook was originally written against 

    Returns:
        df (DataFrame): info for ~500k recipes, preprocessed with preprocess_recipes and indexed 0..n-1 

    Raises:
        ValueError: raised by pd.concat when no file matches file_pattern 
    """
    # glob returns files in arbitrary order; sort so the row order is the same on every run
    files = sorted(glob.glob(file_pattern))

    # every file is read with its own 0-based index, so the labels would repeat after concatenation;
    # ignore_index gives each recipe a unique label equal to its position, which is what
    # get_clean_vectorizer_tokens relies on when it uses row.name to pick a row of the tfidf matrix
    recipes = pd.concat([pd.read_csv(file) for file in files], ignore_index=True)

    recipes = recipes.apply(preprocess_recipes, axis=1)

    return recipes 

def create_all_tokens_col(df):
    """
    Build the 'all_tokens' column: one string per recipe joining every field that can hold search tokens. 
    
    Args: 
        df (DataFrame): info for recipes; must contain cols name, ingredients, tags and search_terms 

    Returns:
        df (DataFrame): copy of the input with missing values filled by ' ' and the new col
            'all_tokens' = quoted name, ingredients, tags and search_terms separated by spaces 
    """
    filled = df.fillna(' ')  # the extra blanks are stripped later on

    combined = "'" + filled['name'] + "'"
    for col in ('ingredients', 'tags', 'search_terms'):
        combined = combined + ' ' + filled[col].apply(str)
    filled['all_tokens'] = combined

    return filled

In [6]:
# load every recipe file and run the per-recipe preprocessing 
all_recipes = get_all_recipes()
# keep only the columns that are used in the rest of the notebook 
cols = ['id', 'name', 'name_set', 'description', 'steps', 'ingredients', 'tags', 'search_terms']
all_recipes = all_recipes[cols]
# add the raw 'all_tokens' string that is later passed to the tfidf vectorizer 
all_recipes = create_all_tokens_col(all_recipes)
all_recipes.head(1)

The tokens are not clean yet when combined directly, but we can see that the terms are all in single quotes. Now, for each recipe, get a list of clean terms by extracting the terms from the quotes. This will be used later to classify terms as 'positive' or 'negative' for a recipe.

In [None]:
def remove_apostrophes(word):
    """
    Swap every apostrophe in the input string for a single space. 

    Args: 
        word (string)

    Returns:
        word (string): the input with each ' replaced by ' '
    """
    pieces = word.split('\'')
    return ' '.join(pieces)

def get_all_terms(df):
    """
    Collect the main terms of every recipe from its name, ingredients, tags, and search terms. 

    Note: the 'name' col of the given df is overwritten with its apostrophe-free version. 

    Args: 
        df (DataFrame): must contain cols name, ingredients, tags, and search_terms

    Returns:
        df (DataFrame): the same df with the new col 'all_main_terms', a list of terms per recipe 
    """
    # an apostrophe inside the name would be read as a quote when the terms are extracted below 
    df['name'] = df['name'].apply(remove_apostrophes)

    quoted_name = '\'' + df['name'] + '\''
    quoted_terms = quoted_name + df['ingredients'] + df['tags'] + df['search_terms']
    df['all_main_terms'] = quoted_terms.apply(extract_words_in_quotes) # each resulting val is a list 

    return df 

In [None]:
# add 'all_main_terms': the list of terms extracted from name, ingredients, tags and search terms 
all_recipes = get_all_terms(all_recipes)
all_recipes.head(1)

Unnamed: 0,id,name,name_set,description,steps,ingredients,tags,search_terms,all_tokens,all_main_terms
0,96313,Grilled Garlic Cheese Grits,"{grits, grilled, cheese, garlic}","We love grits, this is another good way to serve them. A great alternative to a baked potato when served with grilled steak or chicken. I belive this recipe could be made with instant grits.The 2 1/2 hours for refrigeration is not include in time. The recipe comes from Tast of Home's Light and Tasty.","[I a sauce pan, bring water to a boil; slowly add grits and salt, stirring constantly; Reduce heat:simmer, uncovered, for 40-45 minutes or untill thickened, stirrin occasionally., Add cheese and garlic; stir until cheese is melted, Spray 9-inch baking dish with nonstick cooking spray; Cover and refrigerate for 2 to 2 1/2 hours or until frim., Before starting the grill, coat the grill rack with nonstick cooking spray; Cut the grits into 3-inch squares; Brush both sides with olive oil., Grill, covered, over medium heat for 4 to 6 minutes on each side or until lightly browned.]","['water', 'grits', 'salt', 'cheddar cheese', 'garlic', 'olive oil']","['time-to-make', 'course', 'main-ingredient', 'preparation', 'occasion', 'side-dishes', 'eggs-dairy', 'refrigerator', 'diabetic', 'vegetarian', 'grains', 'cheese', 'stove-top', 'dietary', 'low-cholesterol', 'low-calorie', 'comfort-food', 'low-carb', 'low-in-something', 'pasta-rice-and-grains', 'brunch', 'taste-mood', 'equipment', 'presentation', 'served-hot', '4-hours-or-less']","{'diabetic', 'low-calorie', 'vegetarian', 'low-carb', 'side'}","'Grilled Garlic Cheese Grits' ['water', 'grits', 'salt', 'cheddar cheese', 'garlic', 'olive oil'] ['time-to-make', 'course', 'main-ingredient', 'preparation', 'occasion', 'side-dishes', 'eggs-dairy', 'refrigerator', 'diabetic', 'vegetarian', 'grains', 'cheese', 'stove-top', 'dietary', 'low-cholesterol', 'low-calorie', 'comfort-food', 'low-carb', 'low-in-something', 'pasta-rice-and-grains', 'brunch', 'taste-mood', 'equipment', 'presentation', 'served-hot', '4-hours-or-less'] {'diabetic', 'low-calorie', 'vegetarian', 
'low-carb', 'side'}","[equipment, presentation, salt, main ingredient, garlic, preparation, course, served hot, cheese, low in something, diabetic, dietary, 4 hours or less, stove top, side, grains, occasion, grits, pasta rice and grains, grilled garlic cheese grits, refrigerator, low calorie, taste mood, eggs dairy, time to make, olive oil, cheddar cheese, side dishes, comfort food, brunch, low cholesterol, water, low carb, vegetarian]"


#### Positive and Negative Tokens

Search queries can contain words that indicate ingredients that are unwanted. For example, 'sugar free bread' means the searcher wants bread that does not contain sugar. However, a standard TFIDF algorithm will not know that the searcher does not want sugar, and may recommend recipes that do contain sugar. 

So for each recipe, we need to classify which ingredients or terms are wanted (called positive), and which are unwanted (called negative). 

In [None]:
def get_pos_neg_patterns():
    """
    Build the compiled regex patterns used to split a term into wanted (positive) and unwanted (negative) words.
    The patterns are grouped by the kind of indicator (low, free, or negation word) and, inside each list,
    approximately ordered from most to least likely to appear. Group 1 of every pattern is the captured text.
    Note: if there is a new negation term added to the regex, it must be added to the key in the dict 

    Example of regex patterns: 
        positives:
            positives after 'low': 
                'low sugar bread'           -> 'bread'
            positives after 'free':
                'sugar free bread'          -> 'bread'
            positives before 'free in' or 'free of': 
                'bread free in sugar'       -> 'bread'
            positives after negations:
                'no sugar bread'            -> 'bread'
            positives before negations:
                'bread with no sugar'       -> 'bread'
            positives after negation and added: 
                'no sugar added bread'      -> 'bread'
        negatives:
            negatives after 'low':
                'low sugar bread'           -> 'sugar'
            negatives before 'free':
                'sugar free bread'          -> 'sugar'
            negatives after 'free in' or 'free of':
                'bread free in sugar'       -> 'sugar'
            negatives after negations:
                'bread with no sugar'       -> 'sugar'
                'no added sugar bread'      -> 'sugar'
            negatives between negation and added: 
                'no sugar added bread'      -> 'sugar'
    Returns:
        dictionary: {
            type of regex (string) : {
                negative or positive (string) : list of regex (list)
            }
        }
    """

    # indicator: 'low' / 'low in' 
    low_patterns = {
        'negative': [
            re.compile(r'\blow\s*(?:in\s*)?\s*(\w+)'),  # word right after low 
        ],
        'positive': [
            re.compile(r'\blow\s*(?:in\s*)?\s*\w+\s+(.+)\b'),  # everything after that word 
        ],
    }

    # indicator: 'free', 'free in', 'free of' 
    free_patterns = {
        'negative': [
            re.compile(r'(.+)\s+free\b(?!\s+(in|of))'),  # text before free 
            re.compile(r'\b(?:free\s+in|free\s+of)\s*(.+)'),  # text after free in / free of 
        ],
        'positive': [
            re.compile(r'\bfree\b\s+(?!in|of)\s*(.+)'),  # text after free 
            re.compile(r'(.+)\s+free\s+(in|of)\b'),  # text before free in / free of 
        ],
    }

    # indicator: negation words (no, without, minimal, ...) 
    negation_patterns = {
        'negative': [
            re.compile(r'\b(?:no added|no|non|not|with no|without|minimal)\b(?:\s+added)?\s+(\w+)'),  # word after the negation 
            re.compile(r'\b(?:no|with no|non|not|without|minimal|with minimal)\b\s+([\w\s]+?)\s+added\b'),  # words between negation and added 
        ],
        'positive': [
            re.compile(r'\b(?:no added|no|with no|non|not|without|minimal|with minimal)\b\s+\w+\s+(\w.*)'),  # text after the negated word 
            re.compile(r'(.+?)\s+(?=\b(?:no added|no|with no|non|not|without|minimal|with minimal)\b)'),  # text before the negation 
            re.compile(r'\b(?:no|with no|non|not|without|minimal|with minimal)\b\s+[\w\s]+?\s+added\s+(.+)'),  # text after added 
        ],
    }

    # in order of most common to least common 
    return {
        'low': low_patterns,
        'free': free_patterns,
        # if there is a new negation term added to the regex, it must be added to this key 
        'no_added no added with_no non not without minimal with_minimal': negation_patterns,
    }

def make_neg_multiword_into_singleword(term, multiword_neg_terms_list):
    """
    For a given term, replaces every negative multi-word term with its one-word version using underscores (_). 
    Returns the resulting string, as well as True/False whether any negative multi-word term existed or not.
    
    Args: 
        term (string): the entire term
        multiword_neg_terms_list (list or set): terms that are multiple words but should be considered as one term
    
    Returns:
        term (string): the original input term with each multiword term replaced with its one-word form using underscore (_)
        boolean: True if at least one multiword term exists in the term, False if not 

    Example: 
        term ('pizza without saturated fat'), multiword_neg_terms_list (['saturated fat']) -> 'pizza without saturated_fat', True 
        term ('pizza without fat'), multiword_neg_terms_list (['saturated fat']) -> 'pizza without fat', False 
        term ('no saturated fat or trans fat'), multiword_neg_terms_list (['saturated fat', 'trans fat']) -> 'no saturated_fat or trans_fat', True 
    """
    contains_multiword = False
    # keep scanning after a hit so a term that holds several multiword terms gets all of them joined 
    for multiword in multiword_neg_terms_list:
        if multiword in term:
            term = term.replace(multiword, multiword.replace(' ', '_'))
            contains_multiword = True
    return term, contains_multiword

def check_contains_neg_indicator(term, all_neg_indicators):
    """
    Tell whether one of the negative indicators appears as a whole word of the term. 
    The two-word indicators 'no added', 'with no' and 'with minimal' are first joined with an
    underscore, so they are looked up as 'no_added', 'with_no' and 'with_minimal'. 
    
    Args: 
        term (string)
        all_neg_indicators (list): list of strings that indicate that negative exists in the term 
    
    Returns:
        boolean: True if there is a negative indicator, False if there is none 

    Example: 
        term ('no beef pizza'), all_neg_indicators (['no', 'no_added', 'without']) -> True 
    """
    # order matters: each replacement runs on the result of the previous one 
    two_word_indicators = (
        ('no added', 'no_added'),
        ('with no', 'with_no'),
        ('with minimal', 'with_minimal'),
    )
    for phrase, joined in two_word_indicators:
        term = term.replace(phrase, joined)

    words = term.split()
    for indicator in all_neg_indicators:
        if indicator in words:
            return True
    return False

def classify_pos_neg(all_terms, pos_neg_patterns, all_neg_indicators):
    """
    Classify all the positive and negative words for all the given terms. 
    If a token is not found to be positive or negative through the list of regex, it is marked as positive as that is the default. 

    How a term is handled:
        - no negative indicator in the term: every word of the term is positive 
        - negative indicator present: the pattern types are tried in dict order and the first negative pattern 
          that matches decides the result; its captured words are negative, and the first positive pattern of the 
          same type that matches supplies the positive words. No further patterns are tried for the term. 
        - negative indicator present but the captured negative word is in the ignore list (ex: 'something'): 
          nothing of the term is added to either set 
        - negative indicator present but no negative pattern matches: every word of the term is positive 
    
    Args: 
        all_terms (list): list of strings of all the terms
        pos_neg_patterns (dict): dict of all the positive and negative regex patterns, in the format 
            {pattern type: {'negative': list of regex, 'positive': list of regex}} 
        all_neg_indicators (list): list of strings that indicate that negative exists in the term 
    
    Returns:
        set of positive tokens (set)
        set of negative tokens (set)
    """

    pos_set, neg_set = set(), set()
    ignore_neg_words_list = set(['something']) # words to ignore even if determined to be 'negative' (for example, 'free of something')
    multiword_neg_terms_list = ['saturated fat', 'trans fat'] # terms that can be considered as one word for 'negative'

    for term in all_terms: 
        # check if any negative indicators in the term (ex: low, free, no, without, etc)
        # only do regex pattern check if there is a negative indicator, otherwise it is time costly as most terms are not negative 
        contains_negative_indicator = check_contains_neg_indicator(term, all_neg_indicators)

        if contains_negative_indicator:
            # join multiword negatives with '_' so the single-word (\w+) captures take them as one word 
            term, contains_multiword_neg_term_bool = make_neg_multiword_into_singleword(term, multiword_neg_terms_list)
            is_pattern_matched = False 

            for pattern_type, pattern_type_dict in pos_neg_patterns.items():
                neg_patterns_list = pattern_type_dict['negative']
                pos_patterns_list = pattern_type_dict['positive']

                for neg_pattern in neg_patterns_list:
                    match = neg_pattern.search(term)
                    if match:
                        is_pattern_matched = True 
                        neg_word = match.group(1)
                        if neg_word not in ignore_neg_words_list:
                            if contains_multiword_neg_term_bool: 
                                neg_word = neg_word.replace('_', ' ') # undo the underscore join before splitting into words 
                            neg_words = neg_word.split()
                            neg_set.update(neg_words)
                            # only need to check positive pattern if its negative pattern matched 
                            for pos_pattern in pos_patterns_list:
                                match = pos_pattern.search(term)
                                if match:
                                    pos_word = match.group(1)
                                    if contains_multiword_neg_term_bool:
                                        pos_word = pos_word.replace('_', ' ')
                                    pos_words = pos_word.split()
                                    pos_set.update(pos_words)
                                    break # only the first matching positive pattern is used 
                        break # only the first matching negative pattern of this type is used 
                
                if is_pattern_matched:
                    break # a pattern type matched, do not try the remaining types 
                
            if not is_pattern_matched: # in case the term had negative indicator but did not contain negative word 
                # NOTE: term may still carry the '_' joined multiword form here, as it was reassigned above 
                pos_words = term.split()
                pos_set.update(pos_words)

        else: # no negative indicators 
            pos_words = term.split()
            pos_set.update(pos_words)

    return pos_set, neg_set

def get_pos_neg_terms(df):
    """
    Split the terms of every row of 'all_main_terms' into positive (wanted) and negative (unwanted) words. 
    Will be used when matching positives and negatives in the search query to the recipes.

    Args: 
        df (DataFrame): must contain column 'all_main_terms' 

    Returns: 
        df (DataFrame): the same df with new cols positive_terms and negative_terms that indicate which words are positive and which are negative 
    """
    patterns = get_pos_neg_patterns()
    # every key is a space separated group of indicators, so flatten the keys into one list 
    indicators = [indicator for key in patterns for indicator in key.split()]

    classified = [classify_pos_neg(terms, patterns, indicators) for terms in df['all_main_terms']]

    # (pos1, neg1), (pos2, neg2), (pos3, neg3) -> (pos1, pos2, pos3), (neg1, neg2, neg3)
    df['positive_terms'], df['negative_terms'] = zip(*classified)

    return df

In [None]:
# add the 'positive_terms' and 'negative_terms' sets of every recipe 
all_recipes = get_pos_neg_terms(all_recipes)
all_recipes[['all_main_terms', 'positive_terms', 'negative_terms']].head(5)

Unnamed: 0,all_main_terms,positive_terms,negative_terms
0,"[equipment, presentation, salt, main ingredient, garlic, preparation, course, served hot, cheese, low in something, diabetic, dietary, 4 hours or less, stove top, side, grains, occasion, grits, pasta rice and grains, grilled garlic cheese grits, refrigerator, low calorie, taste mood, eggs dairy, time to make, olive oil, cheddar cheese, side dishes, comfort food, brunch, low cholesterol, water, low carb, vegetarian]","{ingredient, mood, pasta, food, equipment, taste, presentation, salt, top, garlic, and, grilled, preparation, dishes, course, cheese, to, diabetic, cheddar, dietary, less, side, served, olive, grains, occasion, grits, hot, hours, time, 4, stove, main, refrigerator, oil, rice, eggs, dairy, brunch, comfort, water, vegetarian, make, or}","{cholesterol, carb, calorie}"
1,"[clam juice, shrimp, salt, main ingredient, simple shrimp and andouille jambalaya, 60 minutes or less, shellfish, hot pepper sauce, preparation, meat, course, long grain rice, garlic cloves, bay leaves, pork sausage, fresh parsley, onion, pork, dinner, main dish, easy, large shrimp, vegetable oil, time to make, red bell pepper, seafood, andouille sausage, diced tomatoes, one dish meal]","{ingredient, shrimp, bell, salt, one, shellfish, garlic, simple, diced, and, preparation, minutes, meat, course, bay, dish, to, parsley, less, andouille, jambalaya, cloves, time, long, hot, red, fresh, pork, onion, main, tomatoes, sauce, leaves, sausage, pepper, dinner, easy, large, oil, grain, rice, meal, 60, vegetable, juice, seafood, clam, make, or}",{}
2,"[seasonal, spring, 15 minutes or less, beans, vegetarian, main ingredient, preparation, course, number of servings, vegan, dietary, black beans, black pepper, side, salad, inexpensive, to go, occasion, summer, table salt, onion, no cook, italian parsley, tomatoes, dinner, easy, potluck, dinner party, 3 steps or less, north american, white wine vinegar, canned black beans, time to make, olive oil, salads, american, side dishes, white beans, black and white bean salad, cuisine, picnic, technique, celery]","{ingredient, seasonal, spring, beans, 3, salt, wine, servings, and, preparation, minutes, course, dishes, bean, to, number, go, vegan, dietary, less, parsley, party, vinegar, side, salad, olive, inexpensive, time, occasion, picnic, summer, 15, onion, main, italian, tomatoes, dinner, black, easy, pepper, potluck, of, north, oil, table, american, white, canned, salads, steps, cuisine, vegetarian, make, or, technique, celery}",{cook}
3,"[seasonal, winter, equipment, main ingredient, garlic, preparation, italian seasoning, course, vegetables, zucchini, weeknight, yellow squash, dietary, salt and pepper, side, occasion, beginner cook, green bell pepper, onion, crock pot slow cooker, italian, easy, 3 steps or less, squash, fall, time to make, side dishes, crock pot italian zucchini, water, vegetarian, diced tomatoes]","{ingredient, seasonal, winter, equipment, yellow, bell, 3, salt, pot, garlic, diced, and, preparation, dishes, course, crock, to, vegetables, zucchini, weeknight, dietary, less, cooker, green, side, beginner, seasoning, time, occasion, slow, onion, main, italian, tomatoes, easy, pepper, squash, fall, steps, cook, water, vegetarian, make, or}",{}
4,"[salt, main ingredient, mushroom, preparation, meat, course, cinnamon, 4 hours or less, sugar, black pepper, beef, beef stew meat, beef stock, dinner, main dish, allspice, flour, easy, vegetable oil, beef stew with dried cherries, onions, time to make, dried sour cherries, water, dry red wine]","{ingredient, salt, wine, mushroom, stew, preparation, meat, course, dish, stock, to, cinnamon, less, sugar, beef, with, time, hours, red, 4, main, dinner, black, allspice, pepper, flour, easy, oil, dried, dry, sour, onions, vegetable, make, water, cherries, or}",{}


### Set up the TFIDF Vectorizer 

Now let's set up the TF-IDF vectorizer. With the stopword list and token pattern we pass in, the TfidfVectorizer drops punctuation, digits, and stopwords on its own, and it takes raw strings as input; so we can use the unclean version of the string (all_tokens column) for this task.

The feature_names will provide a list of all the remaining tokens after the cleanup (removal of stopwords, punctuation, etc). This means words like 'and' or 'or' will not be included. 

In [None]:
def get_tfidf_vectorizer_and_matrix(df_all_tokens):
    """
    Fit a tfidf vectorizer on the recipe tokens and transform them into the tfidf matrix. 

    Args: 
        df_all_tokens (Series): contains all the tokens for all recipes

    Returns:
        list ([vectorizer, matrix]): the fitted tfidf vectorizer and the resulting tfidf matrix
    """
    english_stopwords = stopwords.words('english') 

    # a token is a run of letters of any length, so digits never become features 
    tfidf = TfidfVectorizer(token_pattern=r'\b[a-zA-Z]{1,}\b', stop_words=english_stopwords)

    # shape: number of recipes x number of unique terms in all of all_tokens
    tfidf_scores = tfidf.fit_transform(df_all_tokens)

    return [tfidf, tfidf_scores]

In [None]:
# create model 
# create model: fit the vectorizer on the raw token string of every recipe 
tfidf_vectorizer, tfidf_matrix = get_tfidf_vectorizer_and_matrix(all_recipes['all_tokens'])
# the tokens the vectorizer kept, one per column of tfidf_matrix 
feature_names = tfidf_vectorizer.get_feature_names_out()

### Get the Clean Tokens

Let's call these tokens remaining after the TFIDF cleanup 'clean tokens'. It is important to get the list of clean tokens when we later count how many of the search terms are also in the recipe. 

For example, if we do not use the clean list of tokens, it can give too much weight to stopwords like 'and' or 'or'. 

In [None]:
def get_clean_tokens(index, feature_names, matrix):
    """
    Get a set of the clean tokens for a certain recipe after tfidf.
    TFIDF transform may remove stopwords or other punctuation, so we want to get only the tokens that are remaining. 

    Args: 
        index (int): row of the recipe in the tfidf matrix 
        feature_names (array): the feature names resulting from the tfidf, one per matrix column 
        matrix: tfidf matrix (sparse) 

    Returns:
        clean_tokens (set): set of all the clean tokens after tfidf
    """
    recipe_row = matrix[index]
    # columns holding a stored value for this row, aka the words (column names) present in the recipe 
    present_columns = recipe_row.tocoo().col
    present_tokens = feature_names[present_columns]

    return set(present_tokens)

def get_clean_tokens_string(recipe_tokens):
    """
    Join the clean tokens of a recipe into one string, separated by a space ' '

    Args: 
        recipe_tokens (set): set of tokens (strings) for a recipe 

    Returns:
        string: string of the clean tokens separated by a space ' '
    """
    separator = ' '
    return separator.join(recipe_tokens)

In [None]:
def get_clean_vectorizer_tokens(row, feature_names, matrix):
    """
    Look up the clean tokens of one recipe row after the tfidf transformation. 

    Args: 
        row: row of a df; its label (row.name) is used as the recipe's row in the tfidf matrix 
        feature_names (array): the resulting feature names after tfidf 
        matrix: tfidf matrix 

    Returns:
        clean_tokens (set): set of clean tokens resulting from tfidf 
    """
    return get_clean_tokens(row.name, feature_names, matrix)

In [None]:
# get the clean tokens for each recipe 
# clean tokens for each recipe: the set of tokens that the vectorizer kept for its row 
all_recipes['clean_tokens'] = all_recipes.apply(lambda x: get_clean_vectorizer_tokens(x, feature_names, tfidf_matrix), axis=1) # axis=1 will make row.name=index 
# the same tokens joined into one space separated string 
all_recipes['clean_tokens_str'] = all_recipes['clean_tokens'].apply(lambda x: get_clean_tokens_string(x))
all_recipes.head(1)

Unnamed: 0,id,name,name_set,description,steps,ingredients,tags,search_terms,all_tokens,all_main_terms,positive_terms,negative_terms,clean_tokens,clean_tokens_str
0,96313,Grilled Garlic Cheese Grits,"{grits, grilled, cheese, garlic}","We love grits, this is another good way to serve them. A great alternative to a baked potato when served with grilled steak or chicken. I belive this recipe could be made with instant grits.The 2 1/2 hours for refrigeration is not include in time. The recipe comes from Tast of Home's Light and Tasty.","[I a sauce pan, bring water to a boil; slowly add grits and salt, stirring constantly; Reduce heat:simmer, uncovered, for 40-45 minutes or untill thickened, stirrin occasionally., Add cheese and garlic; stir until cheese is melted, Spray 9-inch baking dish with nonstick cooking spray; Cover and refrigerate for 2 to 2 1/2 hours or until frim., Before starting the grill, coat the grill rack with nonstick cooking spray; Cut the grits into 3-inch squares; Brush both sides with olive oil., Grill, covered, over medium heat for 4 to 6 minutes on each side or until lightly browned.]","['water', 'grits', 'salt', 'cheddar cheese', 'garlic', 'olive oil']","['time-to-make', 'course', 'main-ingredient', 'preparation', 'occasion', 'side-dishes', 'eggs-dairy', 'refrigerator', 'diabetic', 'vegetarian', 'grains', 'cheese', 'stove-top', 'dietary', 'low-cholesterol', 'low-calorie', 'comfort-food', 'low-carb', 'low-in-something', 'pasta-rice-and-grains', 'brunch', 'taste-mood', 'equipment', 'presentation', 'served-hot', '4-hours-or-less']","{'diabetic', 'low-calorie', 'vegetarian', 'low-carb', 'side'}","'Grilled Garlic Cheese Grits' ['water', 'grits', 'salt', 'cheddar cheese', 'garlic', 'olive oil'] ['time-to-make', 'course', 'main-ingredient', 'preparation', 'occasion', 'side-dishes', 'eggs-dairy', 'refrigerator', 'diabetic', 'vegetarian', 'grains', 'cheese', 'stove-top', 'dietary', 'low-cholesterol', 'low-calorie', 'comfort-food', 'low-carb', 'low-in-something', 'pasta-rice-and-grains', 'brunch', 'taste-mood', 'equipment', 'presentation', 'served-hot', '4-hours-or-less'] {'diabetic', 'low-calorie', 'vegetarian', 
'low-carb', 'side'}","[equipment, presentation, salt, main ingredient, garlic, preparation, course, served hot, cheese, low in something, diabetic, dietary, 4 hours or less, stove top, side, grains, occasion, grits, pasta rice and grains, grilled garlic cheese grits, refrigerator, low calorie, taste mood, eggs dairy, time to make, olive oil, cheddar cheese, side dishes, comfort food, brunch, low cholesterol, water, low carb, vegetarian]","{ingredient, mood, pasta, food, equipment, taste, presentation, salt, top, garlic, and, grilled, preparation, dishes, course, cheese, to, diabetic, cheddar, dietary, less, side, served, olive, grains, occasion, grits, hot, hours, time, 4, stove, main, refrigerator, oil, rice, eggs, dairy, brunch, comfort, water, vegetarian, make, or}","{cholesterol, carb, calorie}","{ingredient, mood, pasta, food, equipment, taste, presentation, salt, carb, top, garlic, grilled, dishes, preparation, course, cheese, cheddar, diabetic, dietary, less, side, olive, served, time, occasion, grits, grains, hot, hours, calorie, cholesterol, something, stove, main, oil, refrigerator, eggs, rice, dairy, brunch, low, comfort, water, vegetarian, make}",ingredient mood pasta food equipment taste presentation salt carb top garlic grilled dishes preparation course cheese cheddar diabetic dietary less side olive served time occasion grits grains hot hours calorie cholesterol something stove main oil refrigerator eggs rice dairy brunch low comfort water vegetarian make


Make ingredients into a list to display cleanly. Now, only keep the cols needed in the recommender system.

In [None]:
def get_ingredients_list(df):
    """
    Parse the raw 'ingredients' column so the ingredients can be displayed cleanly.

    Each value is passed through extract_words_in_quotes with dash replacement, 
    lowercasing, apostrophe removal and de-duplication all switched off. 
    The column is overwritten on the DataFrame that was passed in. 

    Args: 
        df (DataFrame): contains an 'ingredients' column (presumably a stringified list of quoted ingredients - verify against the loader)

    Returns: 
        df (DataFrame): the same DataFrame, with 'ingredients' replaced by the parsed values 
    """
    def parse_ingredients(raw_ingredients):
        # keep the ingredient text as written: no dash replacement, lowercasing, apostrophe removal or de-duplication 
        return extract_words_in_quotes(
            raw_ingredients,
            replace_dash=False,
            lowercase=False,
            remove_apostrophes=False,
            make_unique=False,
        )

    df['ingredients'] = df['ingredients'].apply(parse_ingredients)
    return df

In [None]:
# parse the ingredients first, then trim the recipes down to the columns the search engine uses 
final_cols = ['name', 'name_set', 'clean_tokens', 'positive_terms', 'description', 'ingredients', 'steps']
all_recipes = get_ingredients_list(all_recipes)[final_cols]

### Search Algorithm

We have now set up the data we need for the algorithm given an input search query. The algorithm is as follows: 

Multiply the below scores for each recipe, then rank the final score from highest to lowest. 
- cosine similarity based on tfidf
- number of matching terms in all tokens in the recipe + 1 
- number of matching terms in the name of the recipe + 1
- negative multiplier (-1 if recipe includes something classified negatively in the search input, 1 otherwise)

Here is the function to get the common terms between a recipe and the input search query. 

In [None]:
def get_input_token_count(row, input):
    """
    Count how many tokens a recipe and the input query have in common. 

    Args: 
        row (set): set of tokens in the recipe 
        input (set): set of tokens in the input query 

    Returns:
        int: size of the intersection of the two sets 
    """
    shared_terms = row & input
    return len(shared_terms)

Here we classify the positive and negative terms from the search input query.

In [None]:
def get_pos_neg_terms_with_string(input):
    """
    Classify the terms of a single search query as positive or negative. 

    The query is wrapped in a one-row DataFrame (column 'all_main_terms' holding 
    a one-element list with the query) and handed to get_pos_neg_terms. 

    Args: 
        input (string): search query 

    Returns: 
        DataFrame: whatever get_pos_neg_terms returns for that one-row DataFrame 
    """
    single_query = {'all_main_terms': [[input]]}
    return get_pos_neg_terms(pd.DataFrame(single_query))

This checks each recipe's positive terms for any of the input query's negative terms, to make sure none of the recommended recipes contain ingredients that the search query marked as unwanted. 

In [None]:
def get_neg_multiplier(recipe_pos_terms, input_neg_set):
    """
    Get the negative multiplier for one recipe. 

    Args: 
        recipe_pos_terms (set): set of all the positive terms in a recipe 
        input_neg_set (set): set of all the negative terms in a search input query 

    Returns: 
        int: -1 if any negative term of the input is a positive term of the recipe, 1 otherwise 
    """
    unwanted_terms_in_recipe = input_neg_set & recipe_pos_terms

    if unwanted_terms_in_recipe:
        return -1
    return 1

def get_negative_scores(df, input_df): 
    """
    Get the negative multiplier of every recipe in the df. 

    Args: 
        df (DataFrame): contains all the recipes and its positive terms ('positive_terms' column) 
        input_df (DataFrame): first row holds the 'negative_terms' of the input search query 

    Returns: 
        numpy array: negative multiplier (1 or -1) for each recipe, in the row order of df 
    """
    first_query = input_df.iloc[0]
    input_neg_set = first_query['negative_terms']

    # the extra positional argument is forwarded to get_neg_multiplier after each recipe's positive terms 
    multipliers = df['positive_terms'].apply(get_neg_multiplier, args=(input_neg_set,))

    return multipliers.to_numpy()

This is the final algorithm. 

In [None]:
def _count_before_plateau(scores_desc, threshold=0.0001):
    """
    Get how many of the top scores come before the scores start to plateau. 

    Args: 
        scores_desc (numpy array): scores sorted from highest to lowest 
        threshold (float): a gradient whose absolute value is below this counts as a plateau (manually defined) 

    Returns: 
        int: index of the first plateau point. if no plateau can be located 
            (fewer than 2 scores, or no gradient under the threshold), 
            the number of scores that are strictly greater than 0 
    """
    # fallback cutoff: never reach into recipes with no similarity (0) or with an unwanted term (negative) 
    positive_count = int(np.count_nonzero(scores_desc > 0))

    # np.gradient needs at least 2 points 
    if len(scores_desc) < 2:
        return positive_count

    gradient = np.gradient(scores_desc) # get the gradient of the decrease 
    plateau_points = np.where(np.abs(gradient) < threshold)[0]

    # no plateau at all: indexing [0] here would raise an IndexError 
    if plateau_points.size == 0:
        return positive_count

    return int(plateau_points[0])


def get_recipe_recommendations(df, input, tfidf_vectorizer, tfidf_matrix, top_n=10):
    """
    Get the top_n recipes to recommend based on the search input query. 

    Algorithm: multiply the below to get scores for each recipe, then rank from highest to lowest 
        * cosine similarity based on tfidf
        * number of matching terms in all tokens in the recipe + 1 
        * number of matching terms in the name of the recipe + 1
        * negative multiplier (-1 if recipe includes something classified negatively in the search input, 1 otherwise)

    Only the recipes ranked before the point where the scores plateau are returned, 
    so fewer than top_n recipes (possibly none) can come back. 

    Args: 
        df (DataFrame): contains all the recipes and its information 
            (uses the 'clean_tokens', 'name_set' and 'positive_terms' columns) 
        input (string): search query 
        tfidf_vectorizer (TFIDFVectorizer): fitted vectorizer, used to transform the search query 
        tfidf_matrix: tfidf matrix of the recipes (presumably one row per recipe, in the row order of df - verify against where it is fitted) 
        top_n (int): maximum number of recipes to return. values below 0 are treated as 0 

    Returns: 
        recommended_recipes (DataFrame): the top top_n most recommended recipes based on the search input 
    """
    ### clean 
    # clean the input with simple cleaner 
    input = simple_clean(input)
    
    ### cosine similarity 
    # TF-IDF transform on the search query 
    # automatically does the extra token cleaning defined in tfidf_vectorizer
    input_tfidf_vector = tfidf_vectorizer.transform([input]) 

    # get cosine similarity with every recipe 
    cosine_similarity_scores = cosine_similarity(input_tfidf_vector, tfidf_matrix)[0]

    ### negatives 
    # take negatives into account 
    # find the negative word in the search query if there is one 
    # for example 'no sugar' -> 'sugar' is the negative word, 'sugar free' -> 'sugar' is the negative word 
    input_df = get_pos_neg_terms_with_string(input)

    print('Try exploring these delicious recipes.')

    # check for negatives - if a negative term is in the recipe rec, multiply -1 to the score 
    # get a vector of 1s and -1s to multiply with the scores_vector, then multiply it 
    negative_scores_vector = get_negative_scores(df, input_df)

    ### term matching between input and recipes 
    # get how many terms in the input match with the recipes 
    feature_names = tfidf_vectorizer.get_feature_names_out()
    clean_input = get_clean_tokens(0, feature_names, input_tfidf_vector)
    input_term_counts = df['clean_tokens'].apply(lambda x: get_input_token_count(x, clean_input)).to_numpy()
    # make it a multiplier to the cosine similarity 
    # build a new array instead of += : to_numpy() can hand back a read-only view (pandas Copy-on-Write) 
    input_term_counts = input_term_counts + 1

    # get how many terms in the input match with the recipe name specifically 
    # want to put heavier weight on these 
    # positional lookup, same as the negative terms lookup in get_negative_scores 
    clean_input_pos_set = input_df.iloc[0]['positive_terms']
    input_term_counts_in_name = df['name_set'].apply(lambda x: get_input_token_count(x, clean_input_pos_set)).to_numpy()
    input_term_counts_in_name = input_term_counts_in_name + 1 # make it a multiplier to the cosine similarity 

    ### score calculation 
    scores_vector = np.asarray(
        cosine_similarity_scores * input_term_counts * input_term_counts_in_name * negative_scores_vector
    )

    ### ranking: get the point where the scores plateau. then get the top min(top_n, number of recipes before plateau) recipes to return 
    # one argsort gives both the ranking and the scores in desc order 
    ranked_indices_all = np.argsort(scores_vector)[::-1]
    scores_vector_desc = scores_vector[ranked_indices_all]

    plateau_index = _count_before_plateau(scores_vector_desc) # get the index where the scores start to plateau 
    n_to_return = min(plateau_index, max(top_n, 0)) # a negative top_n would slice from the wrong end 
    ranked_indices = ranked_indices_all[0:n_to_return] # get the final recipes 
    recommended_recipes = df.iloc[ranked_indices]

    return recommended_recipes

### Search Engine 

Now, let's test out the search engine with a simple Python widget. Searching will provide the top 10 best recommendations from food.com recipes. 

In [None]:
def click_search_button(input):
    """
    Run a search for the text in the search box and display the recommended recipes. 

    An empty query only displays a prompt to search. Otherwise up to 10 recipes 
    are requested from get_recipe_recommendations and each one is rendered as an 
    HTML widget with its name, description, ingredients and numbered steps. 

    Args: 
        input (string): search query typed by the user 
    """
    if input:
        print('Good choice!')

        max_results = 10
        matches = get_recipe_recommendations(all_recipes, input, tfidf_vectorizer, tfidf_matrix, top_n=max_results) 

        for _, recipe in matches.iterrows():
            # build the repeated parts first so the template below stays readable 
            ingredients_text = ', '.join(recipe['ingredients'])
            steps_markup = ''.join(f"<li>{step}</li>" for step in recipe['steps'])

            display(widgets.HTML(f"""
                <h3>{recipe['name']}</h3> 
                <p>{recipe['description']}</p>
                <p><b>Ingredients:</b> {ingredients_text}</p>
                <p><b>Steps</b></p>
                <ol>
                    {steps_markup}
                </ol>
                <hr>
            """))
    else: 
        display('Search for a recipe')

# page header: an empty spacer followed by the site title 
title = widgets.HTML(value="<h2 style='font-weight: bold; text-align: left; font-size: 36px;'>Food.com</h2>") 
buffer_space = widgets.HTML(value="<div style='height: 150px;'></div>") 

# text box for the search query 
user_input = widgets.Text(
    value='',
    placeholder='Type here',
    description='I want to make:',
    style={'description_width': 'initial'},
)

display(buffer_space, title)

# Link the button to the function
search_button = widgets.interact_manual(click_search_button, input=user_input)

# children[1] of the generated widget is restyled as the search button 
run_button = search_button.widget.children[1]
run_button.description = 'Search'
run_button.style.button_color = '#ADD8E6'

HTML(value="<div style='height: 150px;'></div>")

HTML(value="<h2 style='font-weight: bold; text-align: left; font-size: 36px;'>Food.com</h2>")

interactive(children=(Text(value='', continuous_update=False, description='I want to make:', placeholder='Type…