# Correlation of ingredients

Objective: find the correlation between ingredients. Which occur together most often, and which do not?
- Iterate throught the ingredient column in the recipe CSV and create a contextual co-occurence frequency table of ingredients

In [1]:
import pandas as pd
#dataset 1 - vegetarian
df = pd.read_csv('recipes_veg.csv')
saved_veg = df['INGREDIENTS'] 
#dataset 3 - armed
df = pd.read_csv('recipes_armed.csv')
saved_armed = df['INGREDIENTS'] 
#dataset 4 - common
df = pd.read_csv('recipes_common.csv')
saved_common = df['INGREDIENTS'] 

frames = [saved_veg, saved_armed, saved_common]
#creating one dataframe with the set of ingredients
result = pd.concat(frames)
result.head()

0    {'parsley', 'salt', 'pepper', 'tea', 'mushroom...
1    {'paprika', 'parsley', 'water', 'rice', 'basil...
2    {'fat', 'vegetable', 'apple', 'flour', 'broth'...
3    {'fat', 'vegetable', 'apple', 'flour', 'broth'...
4    {'water', 'redients', 'corn', 'oil', 'gravy', ...
Name: INGREDIENTS, dtype: object

In [2]:
#importing libraries
from itertools import permutations
from collections import Counter

cooccs_surface = Counter()
#contextual coocurrence
for row in result:
    for i,w in enumerate(row):
        row = row.replace('}', "")
        row = row.replace('{', "")
        for w in row.split(','):
            for cw in row.split(','):
                if cw != w:
                    cooccs_surface[(w, cw)] += 1
    

In [12]:
cooccs_surface.most_common(100)

[((" 'oil'", " 'water'"), 54419),
 ((" 'water'", " 'oil'"), 54419),
 ((" 'water'", " 'pepper'"), 42363),
 ((" 'pepper'", " 'water'"), 42363),
 ((" 'pepper'", " 'oil'"), 39987),
 ((" 'oil'", " 'pepper'"), 39987),
 ((" 'pepper'", " 'onion'"), 39164),
 ((" 'onion'", " 'pepper'"), 39164),
 ((" 'water'", " 'onion'"), 38110),
 ((" 'onion'", " 'water'"), 38110),
 ((" 'water'", " 'ice'"), 37286),
 ((" 'ice'", " 'water'"), 37286),
 ((" 'oil'", " 'onion'"), 37064),
 ((" 'onion'", " 'oil'"), 37064),
 ((" 'water'", " 'sugar'"), 35730),
 ((" 'sugar'", " 'water'"), 35730),
 ((" 'flour'", " 'water'"), 35276),
 ((" 'water'", " 'flour'"), 35276),
 ((" 'oil'", " 'ice'"), 33192),
 ((" 'ice'", " 'oil'"), 33192),
 ((" 'onion'", " 'ice'"), 32156),
 ((" 'ice'", " 'onion'"), 32156),
 ((" 'pepper'", " 'ice'"), 30924),
 ((" 'ice'", " 'pepper'"), 30924),
 ((" 'water'", " 'dehydrated'"), 29560),
 ((" 'dehydrated'", " 'water'"), 29560),
 ((" 'onion'", " 'dehydrated'"), 29425),
 ((" 'dehydrated'", " 'onion'"), 2942

# Correlation of verb with noun, what are the actions? 

In [3]:
import pandas as pd
#dataset 1 - vegetarian
df = pd.read_csv('recipes_veg.csv')
saved_veg = df['METHOD'] 
#dataset 3 - armed
df = pd.read_csv('recipes_armed.csv')
saved_armed = df['METHOD'] 
#dataset 4 - common
df = pd.read_csv('recipes_common.csv')
saved_common = df['METHOD'] 

frames = [saved_veg, saved_armed, saved_common]
#creating one dataframe with the methods
method = pd.concat(frames)
method.head()

0    [('Pierce', 'NOUN'), ('top', 'NOUN'), ('of', '...
1    [('In', 'ADP'), ('a', 'DET'), ('one', 'NUM'), ...
2    [('Broth', 'DET'), ('for', 'ADP'), ('Boiling',...
3    [('Broth', 'DET'), ('for', 'ADP'), ('Boiling',...
4    [('In', 'ADP'), ('a', 'DET'), ('medium', 'NOUN...
Name: METHOD, dtype: object

In [5]:
from collections import Counter
verb_cooccs_surface = Counter()
#"cut the apple", boil the broccoli etc. Only look at the right side form the verb.
spansize = 5
for row in method:
    row = row.split("),")
    for i,w in enumerate(row):
        w = w.replace('(', "")
        w = w.replace('[', "")
        if "VERB" in w:
            span_range = list(range(max(i - spansize, 0), i)) # left side indices (range, then list so we can extend)
            span_range.extend(range(i + 1, min(i + spansize + 1, len(w)))) # extend by right side indices
            for cw in [row[idx] for idx in span_range]:
                if 'INGREDIENT'in cw:
                    verb_cooccs_surface[(w, cw)] += 1
                
                

IndexError: list index out of range

In [6]:
verb_cooccs_surface.most_common(100)

[((" 'may', 'VERB'", " ('dehydrated', 'INGREDIENT'"), 66),
 ((" 'be', 'VERB'", " ('dehydrated', 'INGREDIENT'"), 65),
 ((" 'used', 'VERB'", " ('dehydrated', 'INGREDIENT'"), 63),
 ((" 'greased', 'VERB'", " ('batter', 'INGREDIENT'"), 39),
 ((" 'melted', 'VERB'", " ('salad', 'INGREDIENT'"), 34),
 ((" 'melted', 'VERB'", " ('oil', 'INGREDIENT'"), 33),
 ((" 'Add', 'VERB'", " ('water', 'INGREDIENT'"), 31),
 ((" 'may', 'VERB'", " ('garlic', 'INGREDIENT'"), 31),
 ((" 'be', 'VERB'", " ('garlic', 'INGREDIENT'"), 31),
 ((" 'add', 'VERB'", " ('water', 'INGREDIENT'"), 30),
 ((" 'used', 'VERB'", " ('garlic', 'INGREDIENT'"), 30),
 ((" 'boiling', 'VERB'", " ('water', 'INGREDIENT'"), 23),
 ((" 'minced', 'VERB'", " ('onion', 'INGREDIENT'"), 22),
 ((" 'floured', 'VERB'", " ('batter', 'INGREDIENT'"), 22),
 ((" 'used', 'VERB'", " ('water', 'INGREDIENT'"), 21),
 ((" 'Bring', 'VERB'", " ('water', 'INGREDIENT'"), 20),
 ((" 'will', 'VERB'", " ('celery', 'INGREDIENT'"), 20),
 ((" 'running', 'VERB'", " ('chicken',

In [7]:
import helpers as hp

hp.dump(verb_cooccs_surface, "Cococcs of ingredients vs verbs")