# Correlation of ingredients

Objective: find the correlation between ingredients. Which occur together most often, and which do not?
- Which ingredients appear most often? Make a word cloud. 
- Iterate throught the ingredient column in the recipe CSV and create a contextual co-occurence frequency table of ingredients

In [1]:
import pandas as pd
#dataset 1 - vegetarian
df = pd.read_csv('recipes_veg.csv')
saved_veg = df['INGREDIENTS'] 
#dataset 3 - armed
df = pd.read_csv('recipes_armed.csv')
saved_armed = df['INGREDIENTS'] 
#dataset 4 - common
df = pd.read_csv('recipes_common.csv')
saved_common = df['INGREDIENTS'] 

frames = [saved_veg, saved_armed, saved_common]
#creating one dataframe with the set of ingredients
result = pd.concat(frames)
result.head()

0    {'parsley', 'salt', 'pepper', 'tea', 'mushroom...
1    {'paprika', 'parsley', 'water', 'rice', 'basil...
2    {'fat', 'vegetable', 'apple', 'flour', 'broth'...
3    {'fat', 'vegetable', 'apple', 'flour', 'broth'...
4    {'water', 'redients', 'corn', 'oil', 'gravy', ...
Name: INGREDIENTS, dtype: object

In [2]:
#importing libraries
from itertools import permutations
from collections import Counter

cooccs_surface = Counter()
#contextual coocurrence
for row in result:
    for i,w in enumerate(row):
        row = row.replace('}', "")
        row = row.replace('{', "")
        for w in row.split(','):
            for cw in row.split(','):
                if cw != w:
                    cooccs_surface[(w, cw)] += 1
    

In [3]:
cooccs_surface.most_common(100)

[((" 'water'", " 'oil'"), 57113),
 ((" 'oil'", " 'water'"), 57113),
 ((" 'water'", " 'salt'"), 50616),
 ((" 'salt'", " 'water'"), 50616),
 ((" 'salt'", " 'oil'"), 45839),
 ((" 'oil'", " 'salt'"), 45839),
 ((" 'water'", " 'pepper'"), 44830),
 ((" 'pepper'", " 'water'"), 44830),
 ((" 'pepper'", " 'oil'"), 44136),
 ((" 'oil'", " 'pepper'"), 44136),
 ((" 'pepper'", " 'onion'"), 42319),
 ((" 'onion'", " 'pepper'"), 42319),
 ((" 'oil'", " 'onion'"), 40777),
 ((" 'onion'", " 'oil'"), 40777),
 ((" 'water'", " 'onion'"), 40290),
 ((" 'onion'", " 'water'"), 40290),
 ((" 'water'", " 'ice'"), 37509),
 ((" 'ice'", " 'water'"), 37509),
 ((" 'salt'", " 'pepper'"), 37411),
 ((" 'pepper'", " 'salt'"), 37411),
 ((" 'ice'", " 'oil'"), 36672),
 ((" 'oil'", " 'ice'"), 36672),
 ((" 'salt'", " 'sugar'"), 34791),
 ((" 'sugar'", " 'salt'"), 34791),
 ((" 'ice'", " 'onion'"), 34679),
 ((" 'onion'", " 'ice'"), 34679),
 ((" 'flour'", " 'water'"), 33761),
 ((" 'water'", " 'flour'"), 33761),
 ((" 'ice'", " 'pepper'"

# Correlation of verb with noun, what are the actions? 

In [4]:
import pandas as pd
#dataset 1 - vegetarian
df = pd.read_csv('recipes_veg.csv')
saved_veg = df['METHOD'] 
#dataset 3 - armed
df = pd.read_csv('recipes_armed.csv')
saved_armed = df['METHOD'] 
#dataset 4 - common
df = pd.read_csv('recipes_common.csv')
saved_common = df['METHOD'] 

frames = [saved_veg, saved_armed, saved_common]
#creating one dataframe with the methods
method = pd.concat(frames)
method.head()

0    [('Pierce', 'NOUN'), ('top', 'NOUN'), ('of', '...
1    [('In', 'ADP'), ('a', 'DET'), ('one', 'NUM'), ...
2    [('Broth', 'DET'), ('for', 'ADP'), ('Boiling',...
3    [('Broth', 'DET'), ('for', 'ADP'), ('Boiling',...
4    [('In', 'ADP'), ('a', 'DET'), ('medium', 'NOUN...
Name: METHOD, dtype: object

In [5]:
from collections import Counter
verb_cooccs_surface = Counter()
#"cut the apple", boil the broccoli etc. Only look at the right side form the verb.
spansize = 2
for row in method:
    row = row.split("),")
    for i,w in enumerate(row):
        w = w.replace('(', "")
        w = w.replace('[', "")
        if "VERB" in w:
            span_range = list(range(max(i, 0), i)) # left side indices (range, then list so we can extend)
            span_range.extend(range(i+1, min(i + spansize + 1, len(w)))) # extend by right side indices
            for cw in [row[idx] for idx in span_range]:
                if 'INGREDIENT'in cw:
                    verb_cooccs_surface[(w, cw)] += 1
                
                

IndexError: list index out of range

In [6]:
verb_cooccs_surface.most_common(100)

[((" 'boiling', 'VERB'", " ('water', 'INGREDIENT'"), 30),
 ((" 'melted', 'VERB'", " ('shortening', 'INGREDIENT'"), 28),
 ((" 'running', 'VERB'", " ('water', 'INGREDIENT'"), 21),
 ((" 'melted', 'VERB'", " ('salad', 'INGREDIENT'"), 10),
 ((" 'Remove', 'VERB'", " ('fat', 'INGREDIENT'"), 6),
 ((" 'removing', 'VERB'", " ('beans', 'INGREDIENT'"), 5),
 ((" 'discolored', 'VERB'", " ('beans', 'INGREDIENT'"), 5),
 ((" 'corned', 'VERB'", " ('beef', 'INGREDIENT'"), 5),
 ((" 'substituted', 'VERB'", " ('meat', 'INGREDIENT'"), 5),
 ((" 'crushed', 'VERB'", " ('oregano', 'INGREDIENT'"), 5),
 ((" 'crushed', 'VERB'", " ('basil', 'INGREDIENT'"), 4),
 ((" 'melted', 'VERB'", " ('butter', 'INGREDIENT'"), 4),
 ((" 'serving', 'VERB'", " ('size', 'INGREDIENT'"), 4),
 ((" 'diced', 'VERB'", " ('onion', 'INGREDIENT'"), 3),
 (("'Blend', 'VERB'", " ('sugar', 'INGREDIENT'"), 3),
 ((" 'Add', 'VERB'", " ('water', 'INGREDIENT'"), 3),
 ((" 'Blend', 'VERB'", " ('shortening', 'INGREDIENT'"), 3),
 ((" 'diced', 'VERB'", " ('

In [7]:
import helpers as hp 

hp.dump(cooccs_surface, "Cococcs of ingredients")
hp.dump(verb_cooccs_surface, "Cococcs of ingredients vs verbs")