In [10]:
import base64
import collections
import hashlib
import itertools
import json
import os
import pickle
import random
import re
from operator import itemgetter as at
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
from ipywidgets import interact
from tqdm.notebook import tqdm
def display_html(x):
    """Render the HTML markup string ``x`` as rich output in the notebook."""
    return display(HTML(x))


# Dataset root: <parent of the current working directory>/data/npn-cooking,
# stored as an absolute path.
data_dir = (Path.cwd().parent / "data" / "npn-cooking").absolute()
# Directory the generated annotation files are written to (relative path).
annot_path = Path("../annotations/npn-cooking")


def ls(p):
    """Print every entry of directory ``p``, one path per line."""
    print("\n".join(str(entry) for entry in p.iterdir()))

In [2]:
def simple_hash(s, length=8):
    """Return a short, URL-safe identifier derived from the MD5 digest of ``s``.

    Characters of ``s`` that cannot be encoded as ASCII are dropped before
    hashing, so strings differing only in such characters map to the same
    value. The digest is base64-encoded with the URL-safe alphabet and the
    first ``length`` characters of that encoding are returned.
    """
    ascii_bytes = s.encode('ascii', errors="ignore")
    digest = hashlib.md5(ascii_bytes).digest()
    encoded = base64.urlsafe_b64encode(digest).decode('ascii')
    return encoded[:length]

In [3]:
ls(data_dir)

/home/ugoren/recipe_scheduler/data/npn-cooking/vocabs
/home/ugoren/recipe_scheduler/data/npn-cooking/README.txt
/home/ugoren/recipe_scheduler/data/npn-cooking/lexicon
/home/ugoren/recipe_scheduler/data/npn-cooking/recipes


In [4]:
ls(data_dir / "lexicon")

/home/ugoren/recipe_scheduler/data/npn-cooking/lexicon/attr_assignments_by_verb.pickle
/home/ugoren/recipe_scheduler/data/npn-cooking/lexicon/state_change_by_verb_ncl.pickle


In [5]:
# Load the two pickled lexicon files found under data_dir/lexicon.
# NOTE(review): unpickling can execute arbitrary code; only do this with
# files from a trusted source.
attr_assignments_by_verb = pickle.loads(
    (data_dir / "lexicon" / "attr_assignments_by_verb.pickle").read_bytes()
)
state_change_by_verb_ncl = pickle.loads(
    (data_dir / "lexicon" / "state_change_by_verb_ncl.pickle").read_bytes()
)

In [6]:
attr_assignments_by_verb

{'ignite': ['temperature'],
 'thicken': ['composition'],
 'salt': ['composition'],
 'lace': ['shape'],
 'perch': ['location', 'composition'],
 'poach': ['cookedness', 'temperature'],
 'tip': ['location'],
 'dribble': ['composition'],
 'move': ['location'],
 'knot': ['shape'],
 'mince': ['shape'],
 'soften': ['temperature'],
 'snip': ['shape', 'composition'],
 'press': ['shape'],
 'shape': ['shape'],
 'skim': ['composition'],
 'fillet': ['shape'],
 'shake': ['composition'],
 'barbecue': ['cookedness'],
 'strew': ['composition'],
 'line': ['composition'],
 'toast': ['cookedness', 'temperature'],
 'deep-fry': ['cookedness', 'temperature'],
 'blot': ['cleanliness', 'composition'],
 'nudge': ['location'],
 'dice': ['shape'],
 'glide': ['location'],
 'flood': ['composition'],
 'wring': ['shape', 'composition'],
 'ease': ['location'],
 'lard': ['composition'],
 'sugar': ['composition'],
 'add': ['composition'],
 'spread': ['shape', 'composition'],
 'pile': ['location'],
 'crack': ['shape'],
 

# Generate ingredient mapping

In [7]:
# Tally, over every recipe JSON file, how often each ingredient name occurs in
# the recipe's "ingredient_list". Despite its name, `ingredient_set` is a
# Counter mapping ingredient name -> number of occurrences.
ingredient_set = collections.Counter()
lst = list((data_dir / "recipes").glob("*.json"))
for p in tqdm(lst):
    with p.open('r') as f:
        recipe = json.load(f)
    # Counter.update adds the counts in place. `counter += Counter(...)` gives
    # the same totals but re-scans the whole counter for non-positive entries
    # after every file, which is wasted work inside a loop this large.
    ingredient_set.update(recipe["ingredient_list"])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=121074.0), HTML(value='')))




In [8]:
# Frequency cut-off: only ingredients counted more than this many times are
# kept (and later given an id).
MIN_INGREDIENT_COUNT = 1000
# most_common() yields (name, count) pairs from most to least frequent, so the
# resulting list is ordered by descending frequency.
ingredient_list = [
    ing for ing, cnt in ingredient_set.most_common() if cnt > MIN_INGREDIENT_COUNT
]
ingredient_list

['salt',
 'sugar',
 'pepper',
 'onion',
 'water',
 'butter',
 'flour',
 'milk',
 'eggs',
 'garlic',
 'parsley',
 'olive_oil',
 'tomato',
 'lemon_juice',
 'vanilla',
 'egg',
 'chicken',
 'cinnamon',
 'oil',
 'margarine',
 'garlic_clove',
 'baking_powder',
 'cream',
 'celery',
 'cheese',
 'vegetable_oil',
 'lemon',
 'carrot',
 'mustard',
 'ginger',
 'potato',
 'soy_sauce',
 'mushroom',
 'baking_soda',
 'beef',
 'rice',
 'green_onion',
 'sour_cream',
 'nutmeg',
 'vinegar',
 'oregano',
 'bean',
 'honey',
 'bread',
 'paprika',
 'green_pepper',
 'thyme',
 'cloves_garlic',
 'fresh',
 'clove',
 'cornstarch',
 'basil',
 'raisin',
 'cumin',
 'parmesan_cheese',
 'sauce',
 'chocolate',
 'cheddar_cheese',
 'worcestershire_sauce',
 'cream_cheese',
 'nuts',
 'chili_powder',
 'clove_garlic',
 'white_wine',
 'walnut',
 'orange',
 'apple',
 'chicken_broth',
 'orange_juice',
 'almond',
 'shortening',
 'egg_white',
 'mayonnaise',
 'egg_yolk',
 'bacon',
 'dry',
 'tomato_sauce',
 'cilantro',
 'pecan',
 'bay

In [16]:
# Give every frequent ingredient a short id: "I" followed by simple_hash(name)
# (8 characters with the default length).
ingredients_map = {ing: "I" + simple_hash(ing) for ing in ingredient_list}
# The ids are truncated hashes, so two names could in principle share one.
# preprocess_recipe inverts this mapping (id -> name), where a shared id would
# silently drop an ingredient, so fail loudly here instead.
assert len(set(ingredients_map.values())) == len(ingredients_map), "ingredient id collision"
# Persist the name -> id mapping next to the dataset.
with (data_dir / "ingredients_map.json").open('w') as f:
    json.dump(ingredients_map, f)

# Generate annotations

In [15]:
def preprocess_recipe(recipe):
    """Convert a raw recipe dict into the annotation record format.

    Reads ``recipe["text"]`` (step index as a string -> list of tokens),
    ``recipe["ingredient_list"]`` and ``recipe["id"]``. Every ingredient must
    be a key of the module-level ``ingredients_map``; otherwise the lookup
    raises ``KeyError``.

    Returns a dict with the keys ``instructions``, ``ingredients``,
    ``normalized_ingredients``, ``title``, ``status``, ``photo_url``, ``url``
    and ``labels``, in that order.
    """
    # Step keys are strings: order the steps numerically, then drop the index.
    numbered_steps = sorted(
        (int(step_no), " ".join(tokens)) for step_no, tokens in recipe["text"].items()
    )
    step_texts = [sentence for _, sentence in numbered_steps]
    names = recipe["ingredient_list"]
    return {
        "instructions": step_texts,
        "ingredients": names,
        # Ingredient id -> ingredient name, for the ingredients of this recipe.
        "normalized_ingredients": {ingredients_map[name]: name for name in names},
        "title": recipe["id"],
        "status": 0,
        "photo_url": "",
        "url": "",
        # One independent, initially empty label list per instruction.
        "labels": [[] for _ in step_texts],
    }

In [18]:
# Convert every recipe whose ingredients all have an id in `ingredients_map`,
# keep the converted record in `recipes` (keyed by source file name) and write
# it to its own numbered JSON file under `annot_path`.
recipes = {}
recipe_index = 0
# NOTE(review): glob order depends on the filesystem, so the numbering of the
# output files is not guaranteed to be the same on every machine.
lst = list((data_dir / "recipes").glob("*.json"))
# Create the output directory up front so the first write cannot fail because
# the directory is missing.
annot_path.mkdir(parents=True, exist_ok=True)
for p in tqdm(lst):
    with p.open('r') as f:
        raw_recipe = json.load(f)
    # Skip recipes that use any ingredient without an id.
    if not all(ing in ingredients_map for ing in raw_recipe["ingredient_list"]):
        continue
    recipe = preprocess_recipe(raw_recipe)
    recipes[p.name] = recipe
    recipe_index += 1  # output files are numbered starting at 1
    with (annot_path / f"{recipe_index}.json").open('w') as out_f:
        json.dump(recipe, out_f, indent=4)
len(recipes)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=121074.0), HTML(value='')))




16838

In [30]:
# Inspect the third entry of `recipes` (dict insertion order).
# NOTE(review): the saved output of this cell shows raw-recipe keys
# ('ing_type', 'text', 'verb'), whereas the loop that fills `recipes` stores
# the result of preprocess_recipe. The output looks stale; re-run to confirm.
rid = list(recipes)[2]
recipe = recipes[rid]
recipe

{'ing_type': {'1': 2, '0': 1, '3': 2, '2': 1, '5': 2, '4': 2},
 'text': {'1': ['puree', '.'],
  '0': ['steam',
   'carrots',
   'until',
   'tender',
   '-lrb-',
   'about',
   '7',
   'minutes',
   '-rrb-',
   '.'],
  '3': ['turn',
   'dough',
   'out',
   'onto',
   'a',
   'floured',
   'board',
   '&',
   'knead',
   'until',
   'smooth',
   'like',
   'a',
   'firm',
   'bread',
   'dough',
   '.'],
  '2': ['combine',
   '3/4',
   'carrot',
   'puree',
   'with',
   'remaining',
   'ingredients',
   'to',
   'form',
   'a',
   'dough',
   '.'],
  '5': ['shape', '&', 'cook', 'dough', 'as', 'desired', '.'],
  '4': ['place',
   'douhg',
   'in',
   'a',
   'bowl',
   ',',
   'cover',
   'with',
   'plastic',
   'wrap',
   '&',
   'let',
   'rest',
   'for',
   'at',
   'least',
   '30',
   'minutes',
   '.']},
 'ingredient_list': ['carrot', 'flour', 'ginger'],
 'id': 'golden pasta 0',
 'verb': {'1': ['puree'],
  '0': ['steam'],
  '3': ['turn', 'knead'],
  '2': ['combine', 'puree'],
 

In [10]:
def text(recipe):
    """Return the recipe's instruction sentences in step order.

    ``recipe["text"]`` maps a step index (as a string) to that step's list of
    tokens. Steps are ordered by their integer index and each token list is
    joined with single spaces.
    """
    numbered = sorted(
        (int(idx), " ".join(tokens)) for idx, tokens in recipe["text"].items()
    )
    return [sentence for _, sentence in numbered]

# Print the instruction sentences of `recipe`, one per line.
# NOTE(review): `text` reads recipe["text"], so `recipe` must be a raw recipe
# dict here; records returned by preprocess_recipe have no "text" key. Confirm
# which object is bound to `recipe` when the notebook is run top to bottom.
print("\n".join(text(recipe)))

in a large saucepan , -lrb- 5 or 6 quart size -rrb- combine sugar , baking soda , salt , buttermilk and corn syrup .
bring to a boil , stirring constantly .
reduce heat to medium-low , continue cooking until mixture becomes caramel-colored and reaches soft ball stage -lrb- 238 degrees -rrb- .
remove from heat , add butter and pecan halves .
beat until thick enough to drop from a spoon onto waxed paper .
if mixture becomes too hard , return to heat and add small amount of water .
stir until smooth .
44 to 48 pralines .
omit corn syrup .
increase butter to 3/4 cup , add 1 teaspoon vanilla .
include butter in cooking stage .
add vanilla with pecans .


In [11]:
df=pd.DataFrame([[]])

In [28]:
# Scratch check: stacking the (1, 0)-shaped frame three times along a new
# third axis yields shape (1, 0, 3). Needs numpy imported as `np` in the
# notebook's import cell.
np.stack([df, df, df], axis=2).shape

(1, 0, 3)