In [1]:
import json, random, re, collections, itertools,base64,hashlib, os, pickle,hashlib
from pprint import pprint
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
from operator import itemgetter as at
from IPython.core.display import display, HTML
from ipywidgets import interact
display_html = lambda x: display(HTML(x))
data_dir = (Path(os.getcwd()).parent /"data"/"npn-cooking").absolute()
annot_path = Path("../annotations/npn-cooking")

ls = lambda p: print("\n".join(map(str,p.iterdir())))

In [2]:
def simple_hash(s, length=8):
    md5bytes = hashlib.md5(s.encode('ascii',errors="ignore")).digest()
    ret = base64.urlsafe_b64encode(md5bytes).decode('ascii')
    return ret[:length]

In [3]:
ls(data_dir)

/home/ugoren/recipe_scheduler/data/npn-cooking/.gitkeep
/home/ugoren/recipe_scheduler/data/npn-cooking/vocabs
/home/ugoren/recipe_scheduler/data/npn-cooking/README.txt
/home/ugoren/recipe_scheduler/data/npn-cooking/lexicon
/home/ugoren/recipe_scheduler/data/npn-cooking/recipes
/home/ugoren/recipe_scheduler/data/npn-cooking/ingredients_map.json


In [4]:
ls(data_dir / "lexicon")

/home/ugoren/recipe_scheduler/data/npn-cooking/lexicon/attr_assignments_by_verb.pickle
/home/ugoren/recipe_scheduler/data/npn-cooking/lexicon/state_change_by_verb_ncl.pickle


In [5]:
with (data_dir / "lexicon" / "attr_assignments_by_verb.pickle").open('rb') as f:
    attr_assignments_by_verb=pickle.load(f)
with (data_dir / "lexicon" / "state_change_by_verb_ncl.pickle").open('rb') as f:
    state_change_by_verb_ncl=pickle.load(f)

In [6]:
attr_assignments_by_verb

{'ignite': ['temperature'],
 'thicken': ['composition'],
 'salt': ['composition'],
 'lace': ['shape'],
 'perch': ['location', 'composition'],
 'poach': ['cookedness', 'temperature'],
 'tip': ['location'],
 'dribble': ['composition'],
 'move': ['location'],
 'knot': ['shape'],
 'mince': ['shape'],
 'soften': ['temperature'],
 'snip': ['shape', 'composition'],
 'press': ['shape'],
 'shape': ['shape'],
 'skim': ['composition'],
 'fillet': ['shape'],
 'shake': ['composition'],
 'barbecue': ['cookedness'],
 'strew': ['composition'],
 'line': ['composition'],
 'toast': ['cookedness', 'temperature'],
 'deep-fry': ['cookedness', 'temperature'],
 'blot': ['cleanliness', 'composition'],
 'nudge': ['location'],
 'dice': ['shape'],
 'glide': ['location'],
 'flood': ['composition'],
 'wring': ['shape', 'composition'],
 'ease': ['location'],
 'lard': ['composition'],
 'sugar': ['composition'],
 'add': ['composition'],
 'spread': ['shape', 'composition'],
 'pile': ['location'],
 'crack': ['shape'],
 

# Generate ingredient mapping

In [7]:
ingredient_set=collections.Counter()
lst=list((data_dir / "recipes").glob("*.json"))
for p in tqdm(lst):
    with p.open('r') as f:
        recipe = json.load(f)
    ingredient_set+=collections.Counter(recipe["ingredient_list"])
ingredient_set=collections.Counter(ingredient_set)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=121074.0), HTML(value='')))




In [8]:
ingredient_list = [ing for ing, cnt in ingredient_set.most_common() if cnt>1000]
ingredient_list

['salt',
 'sugar',
 'pepper',
 'onion',
 'water',
 'butter',
 'flour',
 'milk',
 'eggs',
 'garlic',
 'parsley',
 'olive_oil',
 'tomato',
 'lemon_juice',
 'vanilla',
 'egg',
 'chicken',
 'cinnamon',
 'oil',
 'margarine',
 'garlic_clove',
 'baking_powder',
 'cream',
 'celery',
 'cheese',
 'vegetable_oil',
 'lemon',
 'carrot',
 'mustard',
 'ginger',
 'potato',
 'soy_sauce',
 'mushroom',
 'baking_soda',
 'beef',
 'rice',
 'green_onion',
 'sour_cream',
 'nutmeg',
 'vinegar',
 'oregano',
 'bean',
 'honey',
 'bread',
 'paprika',
 'green_pepper',
 'thyme',
 'cloves_garlic',
 'fresh',
 'clove',
 'cornstarch',
 'basil',
 'raisin',
 'cumin',
 'parmesan_cheese',
 'sauce',
 'chocolate',
 'cheddar_cheese',
 'worcestershire_sauce',
 'cream_cheese',
 'nuts',
 'chili_powder',
 'clove_garlic',
 'white_wine',
 'walnut',
 'orange',
 'apple',
 'chicken_broth',
 'orange_juice',
 'almond',
 'shortening',
 'egg_white',
 'mayonnaise',
 'egg_yolk',
 'bacon',
 'dry',
 'tomato_sauce',
 'cilantro',
 'pecan',
 'bay

In [9]:
ingredients_map = {ing:"I" + simple_hash(ing) for ing in ingredient_list}
with (data_dir/"ingredients_map.json").open('w') as f:
    json.dump(ingredients_map,f)

# Generate annotations

In [10]:
def preprocess_recipe(recipe):
    prep = {}
    instructions =[]
    for i,t in recipe["text"].items():
        instructions.append((int(i), " ".join(t).replace("-lrb-",'(').replace("-rrb-",')')))
    instructions = [t for i, t in sorted(instructions)]
    ing_validations = [[ingredients_map[recipe["ingredient_list"][x]] for x in t] for i, t in sorted([(int(k),v) for k,v in recipe["ingredients"].items()])]
    prep["instructions"]=instructions
    prep["ingredients"]=sorted(set(recipe["ingredient_list"]))
    prep["normalized_ingredients"]={ingredients_map[ing]:ing for ing in prep["ingredients"]}
    prep["title"]=recipe["id"].title()
    prep["status"]=0
    prep["validations"]=ing_validations
    prep["photo_url"]=""
    prep["url"]=""
    prep["labels"]=[[] for _ in range(len(prep["instructions"]))]
    return prep

In [11]:
recipes={}
recipe_index=0
lookup_ingredients = {"beef", "pork", "chicken", "tuna", "salmon", "lamb", "butter", "margarine", "vinegar","mustard", "turkey",
                      "garlic", "onion", "avocado", "tomato", "soy", "honey", "chili", "feta", "corn", "mayonnaise","paprika"
                      "cloves_garlic", "sauce", "noodles", "carrot", "bread", "broccoli", "sausage", "ketchup","oregano",
                     "chili_powder", "bean", "olive_oil", "lemon_juice", "carrot", "yam", "mushroom", "rice", "buckwheat","zucchini",
                     "parmesan_cheese", "cream_cheese", "cheddar_cheese", "bacon", "cream","peas","pineapple"}
ignored_ingredients = {"egg", "eggs", "flour", "lemon", "lemons", "grapefruit","apple","apples",
                       "pear","pears","chocolate", "soda","orange", "oranges", "vanilla", "nut", "nuts"}
lst = list((data_dir / "recipes").glob("*.json"))
for p in tqdm(lst):
    with p.open('r') as f:
        recipe = json.load(f)
    if all([ing in ingredients_map for ing in recipe["ingredient_list"]]):
        recipe = preprocess_recipe(recipe)
        if len(recipe["ingredients"])<3 or len(recipe["instructions"])<4 or len(recipe["instructions"])>9:
            continue
        recipe_index +=1
        if all([len(x.split())>2 for x in recipe["instructions"]])\
        and len(lookup_ingredients & set(recipe["ingredients"]))>3 \
        and len(ignored_ingredients & set(recipe["ingredients"]))==0 \
        and len(recipe["ingredients"])<=10 \
        and not recipe["title"][-1].isdigit():
            recipes[p.name] = recipe
            with (annot_path / f"{recipe_index}.json").open('w') as f:
                json.dump(recipe,f, indent=4)
len(recipes)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=121074.0), HTML(value='')))




344

In [12]:
for rid in recipes:
    print (rid[:-5])

turkey_creole
chicken_honey_nut_stir_fry
cuban_red_beans__rice
herbed_potatoes_on_the_grill
augustas_chilled_tomato_soup_basil_cream
hachis_parmentier_french
senators_navy_bean_soup
turkey_kabobs
broccoli_salad_bacon
tomato_rice_casserole
potatoes_primavera
condimento_alle_verdure_cool_summer_vegetabl
kohls_quick_hot_n_sweet_sauce
natalies_rice_black_eyed_peas
ponzu_sauce
tomato_sweet_pepper_ketchup
fourway-chicken
bourbon_street_shrimp
wintery_day_bean_soup_aka_bodle_bean_soup
sweet__sour_chicken_wings
homemade_croutons
potatoes_perfect
porcini_sausage_stew_bon_appetit
parmesan_risotto
southwestern_seasoning_mix
cajun_wings
cream_of_spinach_soup
beef_bean_chili
chicken__vegetable_casseroles
broccoli_lemon__garlic
carrot_orange_soup
contadina_garden_sauce
braised_celery__red_pepper
italian_vegetable_soup
turkey_bean_patties_mustard_sauce
broccoli_mustard_dressing
beefeaters_kidney_beans
fast_pilaf
blender_country_soup
herbed_onions
oven_baked_devilled_chicken
garlic_bread_in_a_bag
elot

In [22]:
def text(rid):
    ret = recipes[rid]["title"]
    ret +="\n------------\n"
    ret +="* " + "\n* ".join(recipes[rid]["ingredients"])
    ret +="\n------------\n"
    ret +="\n".join(recipes[rid]["instructions"])
    return ret
    
print(json.dumps({rid[:-5]: text(rid) for rid in recipes}))

{"turkey_creole": "Turkey Creole\n------------\n* celery\n* chili_powder\n* onion\n* pepper\n* rice\n* salt\n* tomato\n* turkey\n------------\nput everything except rice in crockpot .\nleave for most of the day .\nserve over cooked rice .\n< how 's this for simple ?", "chicken_honey_nut_stir_fry": "Chicken Honey Nut Stir Fry\n------------\n* carrot\n* chicken\n* cornstarch\n* ginger\n* honey\n* oil\n* orange_juice\n* peanut\n* rice\n* soy_sauce\n------------\ncut chicken into thin strips , set aside .\nin a small bowl , combine juice , honey , soy sauce cornstarch and ginger , mix well .\nheat 1 tbsp oil in a wok , add carrots and celery , stir fry about 3 minutes , remove , set aside .\nadd remaining oil , add meat stir fry 3 minutes , add vegetables , add sauce and nuts , cook and stir until sauce is thickened .", "cuban_red_beans__rice": "Cuban Red Beans  Rice\n------------\n* bay\n* bean\n* garlic_clove\n* olive_oil\n* onion\n* oregano\n* pepper\n* rice\n* salt\n------------\ncook 

In [42]:
df = pd.read_csv(data_dir / "selected_recipes_20210216.csv")
df["text"] = df["example"].apply(lambda x: "\n".join(recipes[x + ".json"]["instructions"]))
selected_examples = sorted(map(tuple, df.query("OK>0")[["text", "example"]].values))
selected_examples

[(') rinse beans , place in large kettle , cover with water add salt and soak overnight .\ndrain add 2 quarts water and ham/sausage .\nsimmer for 2-1/2 to 3 hours .\nadd onion , garlic , chili powder , tomatoes , and lemon juice .\nsimmer 45 minutes .\nadd salt and pepper if desired .',
  'wintery_day_bean_soup'),
 (') rinse beans , place in large kettle , cover with water add salt and soak overnight .\ndrain add 2 quatrs water and ham/sausage .\nsimmer for 2-1/2 to 3 hours .\nadd onion , garlic , chili powder , tomatoes , and lemon juice .\nsimmer 45 minutes .\nadd salt and pepper if desired .',
  'wintery_day_bean_soup_aka_bodle_bean_soup'),
 ('add pinto beans to a 1-quart saucepan and cover them with water .\ncook over medium heat 1 hour or until tender .\nsimmer onion and green pepper in 1/4 cup water in a large nonstick skillet until onion is translucent .\nadd ground beef and cook over medium heat until browned .\ndrain excess fat from pan .\nstir in garlic , chili powder , cumin

In [52]:
def jaccard(s1,s2):
    s1=set(s1.split())
    s2=set(s2.split())
    i,u=map(len, [s1&s2, s1|s2])
    return i/u

In [55]:
recipe_ids = []
prev_r=("","")
for i in range(len(selected_examples)):
    r=selected_examples[i]
    if jaccard(r[0],prev_r[0])<0.9:
        recipe_ids.append(r[1])
    prev_r=r
len(recipe_ids)

93

In [63]:
exp_dir = (data_dir / "20210216_exp")
exp_dir.mkdir(exist_ok=True)
for rid in recipe_ids:
    with (exp_dir / (rid + ".json")).open('w') as f:
        json.dump(recipes[rid + ".json"], f)