# Load data from DB

In [70]:
import os

import mysql.connector
import pandas as pd

# Connection settings are read from environment variables so that real
# credentials never have to be stored in the notebook.  The fallbacks are the
# original local-development values, so the cell behaves as before when no
# variables are set.
db = mysql.connector.connect(
    host=os.environ.get('FOOD_DB_HOST', 'localhost'),
    user=os.environ.get('FOOD_DB_USER', 'root'),
    password=os.environ.get('FOOD_DB_PASSWORD', 'root'),
    database=os.environ.get('FOOD_DB_NAME', 'food_recommendation'),
)

In [71]:
# One row per (ingredient, recipe) link; ORDER BY keeps the rows sorted by
# recipe_id.
ingredients_by_recipe_sql = """

SELECT ing.ingredient_name, recipe_id FROM food_recommendation.ingredients ing
JOIN food_recommendation.recipe_ingredients ri
ON ing.ingredient_id = ri.ingredient_id
ORDER BY recipe_id

"""
df = pd.read_sql(sql=ingredients_by_recipe_sql, con=db)

In [72]:
# Display the ingredient/recipe rows currently held in df.
df

Unnamed: 0,ingredient_name,recipe_id
0,white wine vinegar,415
1,salt and pepper,415
2,low fat sour cream,415
3,buttermilk,415
4,fresh tarragon,415
...,...,...
10809,unsweetened applesauce,999602
10810,baking soda,999602
10811,maple syrup,999602
10812,oat flour,999602


# Concatenate each recipe's ingredients into a single string

In [79]:
# Distinct recipe ids, in order of first appearance in df.
df.loc[:, 'recipe_id'].unique()

array([   415,    860,    896, ..., 998213, 999278, 999602], dtype=int64)

In [76]:
# Exploratory view: every row shows the comma-joined ingredient names of the
# recipe that row belongs to.
(
    df
    .groupby('recipe_id')[['ingredient_name']]
    .transform(lambda names: ','.join(names))
)

Unnamed: 0,ingredient_name
0,"white wine vinegar,salt and pepper,low fat sou..."
1,"white wine vinegar,salt and pepper,low fat sou..."
2,"white wine vinegar,salt and pepper,low fat sou..."
3,"white wine vinegar,salt and pepper,low fat sou..."
4,"white wine vinegar,salt and pepper,low fat sou..."
...,...
10809,"milk,eggs,salt,vanilla,unsweetened applesauce,..."
10810,"milk,eggs,salt,vanilla,unsweetened applesauce,..."
10811,"milk,eggs,salt,vanilla,unsweetened applesauce,..."
10812,"milk,eggs,salt,vanilla,unsweetened applesauce,..."


In [85]:
# Build the comma-joined ingredient string for every row's recipe, then keep
# only the first row of each recipe.  Rows are selected by recipe_id instead of
# de-duplicating the joined strings: two different recipes that happen to have
# the same ingredient string would otherwise collapse into a single entry and
# the result would no longer line up one-to-one with df['recipe_id'].unique().
joined_per_row = df.groupby('recipe_id')['ingredient_name'].transform(lambda names: ','.join(names))
is_first_row_of_recipe = ~df['recipe_id'].duplicated()
df_recipe_ingredients = joined_per_row[is_first_row_of_recipe]
df_recipe_ingredients

0        white wine vinegar,salt and pepper,low fat sou...
15       capers,olive oil,shallots,red wine vinegar,kos...
22       coarse salt,blood oranges,green olives,unsalte...
29       oregano,paprika,olive oil,garlic clove,onion,r...
41       sea-salt,roma tomatoes,feta cheese,fresh dill,...
                               ...                        
10764    butter,cream cheese,heavy cream,salt,vanilla,s...
10774    honey,plain greek yogurt,red onion,poppy seeds...
10785    parmesan cheese,salt,olive oil,dried cranberri...
10793    vanilla ice cream,brown sugar,flour,kosher sal...
10805    milk,eggs,salt,vanilla,unsweetened applesauce,...
Name: ingredient_name, Length: 1107, dtype: object

# Add the recipe ID back into the DataFrame

In [87]:
# Pair the distinct recipe ids with the joined ingredient strings by position
# and label the two columns.
# NOTE: this rebinds df_recipe_ingredients from a Series to a DataFrame, so the
# cell cannot be re-run without first re-creating the Series.
recipe_ids = df['recipe_id'].unique()
ingredient_strings = df_recipe_ingredients.to_list()
df_recipe_ingredients = pd.DataFrame(
    data=[(recipe_id, joined) for recipe_id, joined in zip(recipe_ids, ingredient_strings)],
    columns=['id', 'ingredients'],
)
df_recipe_ingredients

Unnamed: 0,id,ingredients
0,415,"white wine vinegar,salt and pepper,low fat sou..."
1,860,"capers,olive oil,shallots,red wine vinegar,kos..."
2,896,"coarse salt,blood oranges,green olives,unsalte..."
3,3131,"oregano,paprika,olive oil,garlic clove,onion,r..."
4,3642,"sea-salt,roma tomatoes,feta cheese,fresh dill,..."
...,...,...
1102,993892,"butter,cream cheese,heavy cream,salt,vanilla,s..."
1103,996399,"honey,plain greek yogurt,red onion,poppy seeds..."
1104,998213,"parmesan cheese,salt,olive oil,dried cranberri..."
1105,999278,"vanilla ice cream,brown sugar,flour,kosher sal..."


# Initialise a DataFrame to store the ingredient co-occurrence counts used for PMI

In [168]:
# Ingredient-by-ingredient matrix of co-occurrence counts, one row and one
# column per distinct ingredient name.  It is created directly as integer
# zeros instead of as an all-NaN object frame that is filled in place
# afterwards, which depended on pandas silently downcasting the filled frame.
ingredients = df['ingredient_name'].unique()
df_pmi = pd.DataFrame(0, index=ingredients, columns=ingredients)
df_pmi

Unnamed: 0,white wine vinegar,salt and pepper,low fat sour cream,buttermilk,fresh tarragon,olive oil,low fat mayonnaise,chicken breasts,avocado,chives,...,rooibos,low sodium beef broth,cooked macaroni pasta,tempeh,canned beans,potato flakes,85 percent ground beef,queso dip,erythritol,hazelnut meal
white wine vinegar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
salt and pepper,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
low fat sour cream,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
buttermilk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fresh tarragon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
potato flakes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85 percent ground beef,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
queso dip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
erythritol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Count the co-occurrence of ingredients in the same recipe and compute PMI

\\[    \frac{p(ab)}{p(a)p(b)}      \\]

In [169]:
# For every recipe, add one to the count of each unordered ingredient pair.
# Both mirrored cells are updated so the matrix stays symmetric.
# NOTE: the counts accumulate, so running this cell again without resetting
# df_pmi adds the same counts a second time.
for ingredients_csv in df_recipe_ingredients['ingredients']:
    names = ingredients_csv.split(',')
    for pos, first in enumerate(names[:-1]):
        for second in names[pos + 1:]:
            df_pmi.loc[first, second] += 1
            df_pmi.loc[second, first] += 1

In [170]:
# Display the current contents of the co-occurrence count matrix.
df_pmi

Unnamed: 0,white wine vinegar,salt and pepper,low fat sour cream,buttermilk,fresh tarragon,olive oil,low fat mayonnaise,chicken breasts,avocado,chives,...,rooibos,low sodium beef broth,cooked macaroni pasta,tempeh,canned beans,potato flakes,85 percent ground beef,queso dip,erythritol,hazelnut meal
white wine vinegar,0,1,1,1,2,4,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
salt and pepper,1,0,1,1,2,41,2,5,5,6,...,0,0,1,0,0,1,1,1,0,0
low fat sour cream,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
buttermilk,1,1,1,0,2,4,1,1,3,2,...,0,0,0,1,0,0,0,0,0,0
fresh tarragon,2,2,1,2,0,6,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
potato flakes,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
85 percent ground beef,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
queso dip,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
erythritol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
# Show the per-column dtypes of an int32 copy of df_pmi.  The cast result is
# not assigned to anything, so df_pmi itself is left unchanged.
df_pmi.astype('int32').dtypes

white wine vinegar        int32
salt and pepper           int32
low fat sour cream        int32
buttermilk                int32
fresh tarragon            int32
                          ...  
potato flakes             int32
85 percent ground beef    int32
queso dip                 int32
erythritol                int32
hazelnut meal             int32
Length: 1231, dtype: object

In [172]:
import math

def pmi(list_of_ingredients, count_matrix):
    """Score every unordered pair of ingredients from one recipe.

    For each pair ``(a, b)``, taken in list order, the score is::

        count_matrix.loc[a, b] / (total(a) * total(b))

    where ``total(x)`` is the sum of column ``x`` of ``count_matrix``.  The
    inputs are raw counts and no logarithm is applied, so the value is a
    PMI-style ratio rather than the textbook log-PMI.

    Parameters
    ----------
    list_of_ingredients : sequence
        Ingredient names.  When two or more are given, each must be both a
        row label and a column label of ``count_matrix``.
    count_matrix : pandas.DataFrame
        Co-occurrence counts indexed by ingredient name on both axes.

    Returns
    -------
    list
        One score per pair, ordered (0, 1), (0, 2), ..., (n-2, n-1).  Empty
        when fewer than two ingredients are given.
    """
    names = list(list_of_ingredients)
    # With fewer than two names there are no pairs; return before touching the
    # matrix so that a lone name that is not a label does not raise.
    if len(names) < 2:
        return []

    # A column total depends only on the ingredient, not on the pair, so it is
    # computed once per distinct name instead of twice for every pair.
    totals = {name: count_matrix[name].sum() for name in dict.fromkeys(names)}

    scores = []
    for pos, first in enumerate(names[:-1]):
        for second in names[pos + 1:]:
            joint_count = count_matrix.loc[first, second]
            scores.append(joint_count / (totals[first] * totals[second]))
    return scores

In [173]:
# Try the scorer on the recipe stored at row label 1.
p = pmi(df_recipe_ingredients.loc[1, 'ingredients'].split(','), df_pmi)

In [174]:
# Display the pair scores currently held in p.
p 

[2.5049203793165147e-05,
 6.57102179388895e-05,
 0.00011645962732919254,
 1.642238370899536e-05,
 4.675081813931744e-05,
 0.0005797101449275362,
 2.176819976987903e-05,
 2.9394473838918284e-05,
 1.5932415470764017e-05,
 1.1062436390990752e-05,
 2.7434842249657064e-05,
 2.24901043540842e-05,
 1.6649897840983963e-05,
 2.7084856856531512e-05,
 0.00016792611251049538,
 1.686226898691488e-05,
 2.4001536098310292e-05,
 0.00029761904761904765,
 1.0153624336206809e-05,
 0.00012590494176896444,
 0.00035842293906810036]

In [175]:
import numpy as np

# Per-recipe summary statistics of the pair scores.  A recipe that yields no
# pairs gets None in all three lists.
max_pmi, min_pmi, avg_pmi = [], [], []

for ingredients_csv in df_recipe_ingredients['ingredients']:
    pair_scores = pmi(ingredients_csv.split(','), df_pmi)
    if pair_scores:
        highest, lowest, average = max(pair_scores), min(pair_scores), np.mean(pair_scores)
    else:
        highest = lowest = average = None
    max_pmi.append(highest)
    min_pmi.append(lowest)
    avg_pmi.append(average)


In [176]:
# Attach the three per-recipe summaries as new columns, in this order.
for column_name, column_values in (
    ('max_pmi', max_pmi),
    ('min_pmi', min_pmi),
    ('avg_pmi', avg_pmi),
):
    df_recipe_ingredients[column_name] = column_values
df_recipe_ingredients

Unnamed: 0,id,ingredients,max_pmi,min_pmi,avg_pmi
0,415,"white wine vinegar,salt and pepper,low fat sou...",0.000611,0.000004,0.000083
1,860,"capers,olive oil,shallots,red wine vinegar,kos...",0.000580,0.000010,0.000096
2,896,"coarse salt,blood oranges,green olives,unsalte...",0.000461,0.000006,0.000100
3,3131,"oregano,paprika,olive oil,garlic clove,onion,r...",0.000407,0.000008,0.000038
4,3642,"sea-salt,roma tomatoes,feta cheese,fresh dill,...",0.000123,0.000006,0.000030
...,...,...,...,...,...
1102,993892,"butter,cream cheese,heavy cream,salt,vanilla,s...",0.002268,0.000007,0.000188
1103,996399,"honey,plain greek yogurt,red onion,poppy seeds...",0.000439,0.000008,0.000080
1104,998213,"parmesan cheese,salt,olive oil,dried cranberri...",0.000316,0.000005,0.000038
1105,999278,"vanilla ice cream,brown sugar,flour,kosher sal...",0.000220,0.000005,0.000035


In [177]:
# Display the co-occurrence count matrix again.
df_pmi

Unnamed: 0,white wine vinegar,salt and pepper,low fat sour cream,buttermilk,fresh tarragon,olive oil,low fat mayonnaise,chicken breasts,avocado,chives,...,rooibos,low sodium beef broth,cooked macaroni pasta,tempeh,canned beans,potato flakes,85 percent ground beef,queso dip,erythritol,hazelnut meal
white wine vinegar,0,1,1,1,2,4,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
salt and pepper,1,0,1,1,2,41,2,5,5,6,...,0,0,1,0,0,1,1,1,0,0
low fat sour cream,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
buttermilk,1,1,1,0,2,4,1,1,3,2,...,0,0,0,1,0,0,0,0,0,0
fresh tarragon,2,2,1,2,0,6,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
potato flakes,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
85 percent ground beef,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
queso dip,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
erythritol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculate PMI for each pair of ingredients

In [162]:
# Ingredient-by-ingredient table that will hold the pair scores.  It is created
# directly as float zeros: the scores written into it are floats, so this
# avoids both the fill-in-place of an all-NaN object frame and the later
# assignment of float values into integer columns.
recipe_pmi = pd.DataFrame(0.0, index=ingredients, columns=ingredients)
recipe_pmi

Unnamed: 0,white wine vinegar,salt and pepper,low fat sour cream,buttermilk,fresh tarragon,olive oil,low fat mayonnaise,chicken breasts,avocado,chives,...,rooibos,low sodium beef broth,cooked macaroni pasta,tempeh,canned beans,potato flakes,85 percent ground beef,queso dip,erythritol,hazelnut meal
white wine vinegar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
salt and pepper,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
low fat sour cream,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
buttermilk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fresh tarragon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
potato flakes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85 percent ground beef,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
queso dip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
erythritol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [164]:
# Total of each ingredient's column in the count matrix, keyed by ingredient.
sum_dict = {name: df_pmi[name].sum() for name in ingredients}

In [165]:
# Display the per-ingredient column totals.
sum_dict

{'white wine vinegar': 52,
 'salt and pepper': 928,
 'low fat sour cream': 42,
 'buttermilk': 297,
 'fresh tarragon': 105,
 'olive oil': 2430,
 'low fat mayonnaise': 52,
 'chicken breasts': 142,
 'avocado': 370,
 'chives': 218,
 'cucumber': 248,
 'fresh parsley': 978,
 'tomato': 841,
 'arugula': 105,
 'anchovy': 39,
 'capers': 115,
 'shallots': 397,
 'red wine vinegar': 224,
 'kosher salt': 1059,
 'spinach': 186,
 'striped bass': 15,
 'coarse salt': 87,
 'blood oranges': 35,
 'green olives': 62,
 'unsalted butter': 1053,
 'canola oil': 644,
 'cod fillets': 78,
 'oregano': 561,
 'paprika': 227,
 'garlic clove': 3247,
 'onion': 1936,
 'red bell peppers': 486,
 'fennel': 142,
 'white wine': 388,
 'herbs': 60,
 'canned tomatoes': 405,
 'fish': 41,
 'sea-salt': 521,
 'roma tomatoes': 77,
 'feta cheese': 229,
 'fresh dill': 177,
 'zucchini': 173,
 'sour cream': 420,
 'vegetable oil': 545,
 'limes': 91,
 'lime juice': 598,
 'lettuce': 159,
 'mahi mahi': 19,
 'corn tortillas': 142,
 'pico de g

In [166]:
# Score every ingredient pair at once: cell (i, j) is
# df_pmi.loc[i, j] / (sum_dict[i] * sum_dict[j]).
# The denominators are built as an outer product of the per-ingredient totals,
# so each cell is computed with the same arithmetic as a per-cell loop but
# without setting roughly len(ingredients)**2 cells one at a time.
ingredient_totals = pd.Series(
    [sum_dict[name] for name in ingredients], index=ingredients
).to_numpy()
pair_denominators = ingredient_totals[:, None] * ingredient_totals[None, :]
recipe_pmi = df_pmi.loc[ingredients, ingredients] / pair_denominators

In [167]:
# Display the pair-score matrix.
recipe_pmi

Unnamed: 0,white wine vinegar,salt and pepper,low fat sour cream,buttermilk,fresh tarragon,olive oil,low fat mayonnaise,chicken breasts,avocado,chives,...,rooibos,low sodium beef broth,cooked macaroni pasta,tempeh,canned beans,potato flakes,85 percent ground beef,queso dip,erythritol,hazelnut meal
white wine vinegar,0.000000,0.000021,0.000458,0.000065,0.000366,0.000032,0.000370,0.000135,0.000052,0.000088,...,0.0,0.0,0.00000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0
salt and pepper,0.000021,0.000000,0.000026,0.000004,0.000021,0.000018,0.000041,0.000038,0.000015,0.000030,...,0.0,0.0,0.00012,0.00000,0.0,0.000108,0.000108,0.000098,0.0,0.0
low fat sour cream,0.000458,0.000026,0.000000,0.000080,0.000227,0.000010,0.000458,0.000168,0.000064,0.000109,...,0.0,0.0,0.00000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0
buttermilk,0.000065,0.000004,0.000080,0.000000,0.000064,0.000006,0.000065,0.000024,0.000027,0.000031,...,0.0,0.0,0.00000,0.00016,0.0,0.000000,0.000000,0.000000,0.0,0.0
fresh tarragon,0.000366,0.000021,0.000227,0.000064,0.000000,0.000024,0.000183,0.000134,0.000026,0.000044,...,0.0,0.0,0.00000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
potato flakes,0.000000,0.000108,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.00000,0.00000,0.0,0.000000,0.010000,0.000000,0.0,0.0
85 percent ground beef,0.000000,0.000108,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.00000,0.00000,0.0,0.010000,0.000000,0.000000,0.0,0.0
queso dip,0.000000,0.000098,0.000000,0.000000,0.000000,0.000000,0.000000,0.000640,0.000000,0.000000,...,0.0,0.0,0.00000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0
erythritol,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.00000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0


In [178]:
# Write the pair-score matrix to disk.  index=False leaves out the ingredient
# row labels, so only the header row carries the ingredient names in the file.
recipe_pmi.to_csv(path_or_buf='recipe_pmi.csv', index=False)