In [9]:
from apyori import apriori
import sqlite3
from collections import defaultdict

In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import json
import ast

df = pd.read_csv("RAW_recipes.csv.zip")

In [None]:
df.head(2)

In [12]:
dish_ing = df[['name','ingredients']].copy()
dish_ing.loc[:,'ingredients'] = dish_ing['ingredients'].apply(lambda lst: set(ast.literal_eval(lst)))
dish_ing.head()

Unnamed: 0,name,ingredients
0,arriba baked winter squash mexican style,"{olive oil, mixed spice, butter, salt, winter ..."
1,a bit different breakfast pizza,"{salt and pepper, milk, prepared pizza crust, ..."
2,all in the kitchen chili,"{yellow onions, tomato soup, salt, lettuce, ch..."
3,alouette potatoes,"{olive oil, red bell pepper, salt, tarragon, p..."
4,amish tomato ketchup for canning,"{sugar, salt, dry mustard, clove oil, cinnamon..."


## Fetch dataset from user input & add to list

In [14]:
conn = sqlite3.connect('dishes.db')
# Create a cursor
c = conn.cursor()

c.execute("SELECT DISTINCT * FROM dishes_list")
user_records = c.fetchall()

# Commit the changes & close connection
conn.commit()
conn.close()

[('pizza', 'tomato, garlic, basil, flour, cheese, olive oil'), ('muffin', 'egg, blueberry, butter, flour, cheese, milk, vanilla, sugar')]


In [26]:
# Create df for user inputs
user_records_dct = []
for dish, items in user_records:
    items_set = set([i.lstrip().rstrip() for i in items.split(',')])
    dictionary_data = {'name': dish, 'ingredients':items_set}
    user_records_dct.append(dictionary_data)
user_df= pd.DataFrame.from_dict(user_records_dct)
user_df.head()

[{'name': 'pizza', 'ingredients': {'olive oil', 'garlic', 'cheese', 'basil', 'flour', 'tomato'}}, {'name': 'muffin', 'ingredients': {'butter', 'sugar', 'milk', 'cheese', 'flour', 'egg', 'blueberry', 'vanilla'}}]


Unnamed: 0,name,ingredients
0,pizza,"{olive oil, garlic, cheese, basil, flour, tomato}"
1,muffin,"{butter, sugar, milk, cheese, flour, egg, blue..."


In [28]:
# concat with Kaggle dataset
dish_ing = pd.concat([dish_ing, user_df], ignore_index=True)

In [31]:
# Create a list of sets of ingredients for searching & the apyori library
ing_series = dish_ing['ingredients']
ingredients_lsts = ing_series.to_list()

In [33]:
ingredients_lsts[:2]

[{'butter',
  'honey',
  'mexican seasoning',
  'mixed spice',
  'olive oil',
  'salt',
  'winter squash'},
 {'cheese',
  'eggs',
  'milk',
  'prepared pizza crust',
  'salt and pepper',
  'sausage patty'}]

In [34]:
# Create dict for searching
ing_dct = defaultdict(set)
for n, ings in zip(dish_ing['name'],dish_ing['ingredients']):
    for item in ings:
        ing_dct[item].add(n)    

In [35]:
def searching_dish(search_ings):
    '''
    Give out dish name by searching for ingredients
    '''
    wanted_dishes = set()
    for s_ing in search_ings:
        if s_ing in ing_dct and wanted_dishes:
            wanted_dishes = wanted_dishes.intersection(ing_dct[s_ing])
        else:
            wanted_dishes = wanted_dishes.union(ing_dct[s_ing])
            
    return wanted_dishes

In [40]:
# testing finding dishes
search_ings = ['honey', 'winter squash']
searching_dish(search_ings)

{'arriba   baked winter squash mexican style',
 'd do s savory beef shank soup',
 'roasted vegetables with chicken sausage'}

### Use Apriori algo

**Link**: https://towardsdatascience.com/the-apriori-algorithm-5da3db9aea95

### Definitions

Association Rules written in a format: Ingredient A => Ingredient B. This implies that you obtain a rule that tells you that if you use ingredient A, you are also likely to use ingredient B.

__support__: A measure of how often or how frequently the products co-occur
![support](img/support.jpeg)

__confidence__: It tells us how often the items A and B occur together, given the number times A occurs.
![confidence](img/confidence.jpeg)

__lift__: the strength of association
![lift](img/lift.jpeg)

In [41]:
association_rules = apriori(ingredients_lsts, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)

In [42]:
print(len(association_results))

665


In [43]:
print(association_results[500])

RelationRecord(items=frozenset({'baking soda', 'butter', 'flour', 'salt'}), support=0.009743609668492784, ordered_statistics=[OrderedStatistic(items_base=frozenset({'baking soda', 'butter'}), items_add=frozenset({'flour', 'salt'}), confidence=0.3634460547504026, lift=5.126554663033035), OrderedStatistic(items_base=frozenset({'baking soda', 'flour'}), items_add=frozenset({'butter', 'salt'}), confidence=0.3785007546536978, lift=3.426297874368983), OrderedStatistic(items_base=frozenset({'baking soda', 'salt'}), items_add=frozenset({'butter', 'flour'}), confidence=0.20950524459296388, lift=3.5024238851233807), OrderedStatistic(items_base=frozenset({'baking soda', 'butter', 'salt'}), items_add=frozenset({'flour'}), confidence=0.4744586924532268, lift=4.1839172019633395), OrderedStatistic(items_base=frozenset({'butter', 'flour', 'salt'}), items_add=frozenset({'baking soda'}), confidence=0.26678486997635936, lift=4.3831321722429895)])


In [44]:
association_results[500].items

frozenset({'baking soda', 'butter', 'flour', 'salt'})

In [45]:
association_results[500].ordered_statistics

[OrderedStatistic(items_base=frozenset({'baking soda', 'butter'}), items_add=frozenset({'flour', 'salt'}), confidence=0.3634460547504026, lift=5.126554663033035),
 OrderedStatistic(items_base=frozenset({'baking soda', 'flour'}), items_add=frozenset({'butter', 'salt'}), confidence=0.3785007546536978, lift=3.426297874368983),
 OrderedStatistic(items_base=frozenset({'baking soda', 'salt'}), items_add=frozenset({'butter', 'flour'}), confidence=0.20950524459296388, lift=3.5024238851233807),
 OrderedStatistic(items_base=frozenset({'baking soda', 'butter', 'salt'}), items_add=frozenset({'flour'}), confidence=0.4744586924532268, lift=4.1839172019633395),
 OrderedStatistic(items_base=frozenset({'butter', 'flour', 'salt'}), items_add=frozenset({'baking soda'}), confidence=0.26678486997635936, lift=4.3831321722429895)]

### Create a dataframe with Support, Confidence & Lift scores for each items set

In [11]:
scores_df = pd.DataFrame(columns = ['Ingredients','Items_Base', 'Items_Added', 'Support', 'Confidence','Lift'])

In [12]:
for relation in association_results:
    ing = set(relation.items)
    support = relation.support
    for ord_stat in relation.ordered_statistics:
        scores_df = scores_df.append(
            {'Ingredients' : ing, 'Items_Base' : set(ord_stat.items_base), 'Items_Added' : set(ord_stat.items_add),
             'Support' : support, 'Confidence' : ord_stat.confidence, 'Lift' : ord_stat.lift},
            ignore_index = True
        )

Sort by all highest scores to achieve better food item

In [13]:
score_sorted_df = scores_df.sort_values(by=['Support', 'Confidence','Lift'], ascending=False)
score_sorted_df.head()

Unnamed: 0,Ingredients,Items_Base,Items_Added,Support,Confidence,Lift
16,"{eggs, baking powder}",{baking powder},{eggs},0.037153,0.491659,3.373313
17,"{eggs, baking powder}",{eggs},{baking powder},0.037153,0.25491,3.373313
670,"{salt, sugar, flour}","{sugar, salt}",{flour},0.033967,0.346486,3.055621
669,"{salt, sugar, flour}",{flour},"{sugar, salt}",0.033967,0.299551,3.055621
18,"{flour, baking powder}",{baking powder},{flour},0.03202,0.423732,3.736844


In [14]:
# Create recommended dishes based on items base
score_sorted_df['recommend_dishes'] = score_sorted_df['Ingredients'].apply(lambda lst: searching_dish(lst))

In [15]:
score_sorted_df.head()

Unnamed: 0,Ingredients,Items_Base,Items_Added,Support,Confidence,Lift,recommend_dishes
16,"{eggs, baking powder}",{baking powder},{eggs},0.037153,0.491659,3.373313,"{cathy s toffee blondies for c a c h e, brenda..."
17,"{eggs, baking powder}",{eggs},{baking powder},0.037153,0.25491,3.373313,"{cathy s toffee blondies for c a c h e, brenda..."
670,"{salt, sugar, flour}","{sugar, salt}",{flour},0.033967,0.346486,3.055621,"{delicious homemade donuts, picnic cake with ..."
669,"{salt, sugar, flour}",{flour},"{sugar, salt}",0.033967,0.299551,3.055621,"{delicious homemade donuts, picnic cake with ..."
18,"{flour, baking powder}",{baking powder},{flour},0.03202,0.423732,3.736844,"{cathy s toffee blondies for c a c h e, brenda..."


Testing a set of ingredients to see dishes

In [16]:
lookup = {'flour','salt', 'sugar','eggs','baking powder'}

In [17]:
score_sorted_df[score_sorted_df['Ingredients']==lookup].head()

Unnamed: 0,Ingredients,Items_Base,Items_Added,Support,Confidence,Lift,recommend_dishes
2289,"{sugar, flour, salt, eggs, baking powder}","{salt, eggs, flour, baking powder}",{sugar},0.008729,0.703794,3.660598,"{chocolate brownies with hot fudge sauce, hot ..."
2291,"{sugar, flour, salt, eggs, baking powder}","{sugar, salt, flour, baking powder}",{eggs},0.008729,0.541801,3.717339,"{chocolate brownies with hot fudge sauce, hot ..."
2279,"{sugar, flour, salt, eggs, baking powder}","{eggs, flour, baking powder}","{sugar, salt}",0.008729,0.534073,5.447906,"{chocolate brownies with hot fudge sauce, hot ..."
2290,"{sugar, flour, salt, eggs, baking powder}","{salt, eggs, sugar, baking powder}",{flour},0.008729,0.519127,4.578125,"{chocolate brownies with hot fudge sauce, hot ..."
2292,"{sugar, flour, salt, eggs, baking powder}","{salt, eggs, sugar, flour}",{baking powder},0.008729,0.518462,6.860996,"{chocolate brownies with hot fudge sauce, hot ..."


In [31]:
def recommend(ings_set, top_k = 5):
    '''
    Recommend Dishes & Ingredient based on Support, Confidence & Lift scores
    '''
    ings_set = set(ings_set)
    
    found_records = (
        score_sorted_df[score_sorted_df['Items_Base']==ings_set]
        .sort_values(by=['Support', 'Confidence','Lift'], ascending=False,ignore_index=True)
    )
    wanted_dishes = set()
    # if there exit something in base then slowly start to pull out recommended dishes
    if found_records['Items_Base']:        
        # top k dishes using series indexing so no need to change top_k
        for dishes_set in found_records['recommend_dishes'][:top_k]:         
            if wanted_dishes:
                wanted_dishes = wanted_dishes.intersection(dishes_set)
            else:
                wanted_dishes = wanted_dishes.union(dishes_set)
    # start to search in Ingredients to see if there exists any sets
    # get top k recommended food
    else:
        new_found_ing = (
        score_sorted_df[score_sorted_df['Ingredients']==ings_set]
        .sort_values(by=['Support', 'Confidence','Lift'], ascending=False,ignore_index=True)
        ).loc[:top_k,:]
        for dishes_set in new_found_ing['recommend_dishes']:         
            if wanted_dishes:
                wanted_dishes = wanted_dishes.intersection(dishes_set)
            else:
                wanted_dishes = wanted_dishes.union(dishes_set)
        
    return wanted_dishes
    

In [26]:
top_k = 5
(
    score_sorted_df[score_sorted_df['Ingredients']=={'flour','salt', 'sugar','eggs','baking powder'}]
.sort_values(by=['Support', 'Confidence','Lift'], ascending=False,ignore_index=True)
).loc[:top_k,:]

Unnamed: 0,Ingredients,Items_Base,Items_Added,Support,Confidence,Lift,recommend_dishes
0,"{sugar, flour, salt, eggs, baking powder}","{salt, eggs, flour, baking powder}",{sugar},0.008729,0.703794,3.660598,"{chocolate brownies with hot fudge sauce, hot ..."
1,"{sugar, flour, salt, eggs, baking powder}","{sugar, salt, flour, baking powder}",{eggs},0.008729,0.541801,3.717339,"{chocolate brownies with hot fudge sauce, hot ..."
2,"{sugar, flour, salt, eggs, baking powder}","{eggs, flour, baking powder}","{sugar, salt}",0.008729,0.534073,5.447906,"{chocolate brownies with hot fudge sauce, hot ..."
3,"{sugar, flour, salt, eggs, baking powder}","{salt, eggs, sugar, baking powder}",{flour},0.008729,0.519127,4.578125,"{chocolate brownies with hot fudge sauce, hot ..."
4,"{sugar, flour, salt, eggs, baking powder}","{salt, eggs, sugar, flour}",{baking powder},0.008729,0.518462,6.860996,"{chocolate brownies with hot fudge sauce, hot ..."
5,"{sugar, flour, salt, eggs, baking powder}","{sugar, flour, baking powder}","{eggs, salt}",0.008729,0.417251,5.332755,"{chocolate brownies with hot fudge sauce, hot ..."


Display all dishes in favor of search ingredients

In [59]:
(
    score_sorted_df[score_sorted_df['Items_Base']=={'flour','salt', 'sugar','eggs'}]
.sort_values(by=['Support', 'Confidence','Lift'], ascending=False,ignore_index=True)['recommend_dishes'][:]
)

0    {josie s pineapple zucchini bread, pollard s p...
1    {low fat chocolate zucchini cake, grilled peac...
2    {hungarian cheese filled coffee cake, low fat ...
3    {honey cake with honey nut topping, chocolate ...
4    {low fat chocolate zucchini cake, josie s pine...
Name: recommend_dishes, dtype: object

# Different Apriori Library

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head()