In [1]:
import json, random, re, collections, itertools
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
from operator import itemgetter as at
from IPython.core.display import display, HTML
display_html = lambda x: display(HTML(x))
data_path = Path("../data")
annot_path = Path("../annotations")

# Normalize ingredients

In [2]:
fname = data_path / "normalized_top_ingredients.csv"
normalized_top_ingredients = dict()
if fname.exists():
    normalized_top_ingredients = dict(pd.read_csv(str(fname)).dropna()[["ingredient", "normalized"]].values)

In [3]:
units=["teaspoon", "tablespoon", "ounce", "inch", "inche", "cup", "pound", "kg", "tbsp", "ml", "pint", "oz", "can", "package"]
adjs = ["large", "small", "medium", "fresh", "chopped"]
no_units_pattern = re.compile(r"^[\d\s(/)]*(?:{units})?\s?(?:{adjs}){twice}[\s(/)]*(.+)$".format(
    twice="{0,2}",
    units="|".join([u+'s' for u in units]+units),
    adjs="|".join(adjs)),
                             )

def remove_parenthesis(s):
    s=s.translate({ord("\n"):" ", ord("½"): "1", ord("¾"): "1", ord("¼"): "1"})
    return re.sub(r"\([^()]+\)","", s)

def normalize_ingredient(s):
    ret = no_units_pattern.findall(remove_parenthesis(s))
    if len(ret)==0:
        raise Exception("'{s}' returned en empty string".format(s=s))
    ret=ret[0].rsplit(",", 1)[0]
    return normalized_top_ingredients.get(ret,ret)


def normalize_ingredients(lst):
    ret = [normalize_ingredient(s) for s in lst]
    ret = [s for s in ret if s is not None]
    return ret

# Create sample

In [4]:
with (data_path / "allrecipes-recipes.jsonl").open('r') as f:
    data = [json.loads(l) for l in f]

In [5]:
def get_id(recipe):
    return int(recipe["url"].rstrip('/').split('/')[-1])

In [6]:
lookup_ingredients = {"beef", "pork", "chicken", "tuna", "salmon", "lamb", "egg", "butter", "garlic", "onion", "avocado", "tomato", "soy", "honey", "chili", "feta", "corn"}
sampled_data = [d for d in data if (not d["error"]) and "®" not in "".join(d["ingredients"])]
sampled_data = [d for d in sampled_data if 3<len(d["ingredients"])<10]
sampled_data = [d for d in sampled_data if len(lookup_ingredients&set(" ".join(d["ingredients"]).lower().split()))>3]
sampled_data = [random.choice(list(recipes)) for author, recipes in itertools.groupby(sorted(sampled_data, key=at("author")), at("author"))]
len(sampled_data)

306

In [7]:
normalized_data = []
ingredients_map = set()
for datum in tqdm(sampled_data):
    d = dict()
    for k in ["ingredients", "instructions", "title", "photo_url", "url"]:
        d[k] = datum[k]
    d["normalized_ingredients"] = normalize_ingredients(datum["ingredients"])
    assert len(d["normalized_ingredients"]) == len(d["ingredients"])
    ingredients_map |= set(d["normalized_ingredients"])
    d["status"]=0
    d["labels"]=[[] for _ in range(len(datum["instructions"]))]
    normalized_data.append(d)
    
ingredients_map=sorted(ingredients_map)
for datum in normalized_data:
    datum["normalized_ingredients"]=dict(zip(map(ingredients_map.index,datum["normalized_ingredients"]),
                                        datum["normalized_ingredients"]))
    
with (data_path / "ingredients_map.json").open('w') as f:
    json.dump(ingredients_map, f)
print ("We have {n} recipes, with {m} ingredients".format(n=len(normalized_data), m=len(ingredients_map)))

HBox(children=(FloatProgress(value=0.0, max=306.0), HTML(value='')))


We have 306 recipes, with 778 ingredients


### Check ingredients that map to single normalized ingredients

In [8]:
output = []
normalizing_errors=[]
for idx, datum in enumerate(normalized_data):
    if len(datum["normalized_ingredients"])==len(datum["ingredients"]):
        continue
    normalizing_errors.append(idx)
    output.append("<b>{t}</b>".format(t=datum["title"]))
    output.append("<table>")
    already_mapped_ings=set()
    for i in datum["ingredients"]:
        n=normalize_ingredient(i)
        if n in already_mapped_ings:
            n="<font color=\"red\">{n}</font>".format(n=n)
        output.append("<tr><td>{i}</td><td>{n}</td></tr>".format(i=i,n=n))
        already_mapped_ings.add(n)
    output.append("</table>")
display_html("<br>".join(output))

0,1
1/2 cup butter,butter
3 tablespoons minced garlic,minced garlic
3 tablespoons soy sauce,soy sauce
1/4 teaspoon black pepper,black pepper
1 tablespoon dried parsley,dried parsley
"6 boneless chicken thighs, with skin",boneless chicken thighs
"dried parsley, to taste",dried parsley

0,1
1/2 cup chopped celery,celery
1/2 cup chopped onion,onion
1/2 tablespoon butter,butter
"15 slices day-old bread, torn into small pieces",slices day-old bread
1/2 tablespoon Greek-style seasoning,Greek-style seasoning
1 (14 ounce) can chicken broth,chicken broth
6 pork chops,pork chops
1 cup packed brown sugar,packed brown sugar
"1/2 cup butter, melted",butter

0,1
2 tablespoons butter,butter
2 tablespoons chopped garlic,garlic
1 cup thinly sliced zucchini,thinly sliced zucchini
1/4 cup chopped onion,onion
"1 skinless, boneless chicken breast, cut into 1/2 inch slices","skinless, boneless chicken breast"
1 tablespoon butter,butter


In [9]:
for idx in reversed(normalizing_errors):
    del normalized_data[idx]
normalizing_errors=[]

In [10]:
sorted(map(at("title"),normalized_data))

['A Good Easy Garlic Chicken',
 'A Homemade San Francisco Treat: Chicken Vermicelli Rice',
 'All Protein Meatloaf',
 'Amazing Pork Tenderloin in the Slow Cooker',
 "Amber's Super Stuffing",
 "Amber's Super Stuffing",
 'Amish Casserole',
 'Apple and Orange Chicken',
 'Asian Citrus-Grilled Salmon',
 'Asian Ginger Catfish',
 'Asian Ginger Grill Marinade',
 'Asian Turkey Burgers',
 "Aunt Ro's Baked Beans",
 'Avocado Side Dish',
 'Bacon Pork Tenderloin',
 'Baked Asian-Style Honey Chicken',
 'Barbeque Sauce for Meat Sandwiches',
 'Barbeque Shredded Beef',
 'Basic Beef Starter',
 'Beef Noodle Bake',
 'Beefy Tomato Soup',
 'Best Ever Brisket',
 'Best Ever Meatloaf I',
 'Best-Ever Cornbread-Sausage Stuffing',
 "Bev's Orange Chicken",
 "Big Ray's Tropical Island Chicken",
 'Bourbon Chicken',
 'Braised Pork Chops',
 'Breaded Chicken Wings',
 'Breaded Pork Chops',
 'Breakfast Sausage',
 'Broccoli Beef Noodles',
 'Broccoli-Beef Noodles',
 'Bunkhouse Beans',
 'Cajun Buttered Corn',
 'Campfire Stew',

In [11]:
random.choice(normalized_data)

{'ingredients': ['2 1/4 cups chicken stock',
  '1/4 cup wild rice',
  '4 tablespoons butter',
  '2 cups fresh sliced mushrooms',
  '2 cups chopped celery',
  '1 cup chopped onion',
  '4 cups corn bread stuffing mix',
  '1 tablespoon poultry seasoning'],
 'instructions': ['Combine chicken stock and wild rice in a saucepan. Cover and bring to a boil over high heat. Reduce heat and simmer until rice is tender, about 45 minutes. Remove from heat.',
  'Melt the butter in a large skillet. Add the mushrooms, celery, and onion. Cook and stir until the vegetables are soft, about 5 minutes.',
  'In a large bowl, mix the crumbled cornbread and poultry seasoning. Add the rice with chicken broth and the vegetables and mix well. Use to stuff a turkey or bake on its own. More chicken stock can be added if stuffing is dry.'],
 'title': "Amber's Super Stuffing",
 'photo_url': 'http://images.media-allrecipes.com/userphotos/560x315/1001108.jpg',
 'url': 'http://allrecipes.com/Recipe/9068/',
 'normalized_

In [12]:
!rm -rf ../annotations
annot_path.mkdir()
for datum in tqdm(normalized_data):
    id = get_id(datum)
    with (annot_path/ f"{id}.json").open('w') as f:
        json.dump(datum, f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=303.0), HTML(value='')))


