In [1]:
import base64
import collections
import hashlib
import html
import itertools
import json
import random
import re
import shutil
from operator import itemgetter as at
from pathlib import Path

import pandas as pd
from IPython.core.display import display, HTML
from ipywidgets import interact
from tqdm.notebook import tqdm
def display_html(x):
    """Render the HTML markup string `x` as rich output."""
    return display(HTML(x))


# Input data directory and annotation output directory (relative paths).
data_path = Path("../data")
annot_path = Path("../annotations")

In [2]:
def simple_hash(s, length=8):
    """Return a short, URL-safe identifier derived from the MD5 digest of `s`.

    Non-ASCII characters are dropped from `s` before hashing, so two strings that
    differ only in such characters get the same identifier.

    Args:
        s: text to hash.
        length: number of leading base64 characters to keep.
    """
    ascii_only = s.encode('ascii', errors="ignore")
    digest = hashlib.md5(ascii_only).digest()
    encoded = base64.urlsafe_b64encode(digest).decode('ascii')
    return encoded[:length]

simple_hash("baked potato")

'oIfAVH2w'

In [3]:
import spacy
# spaCy pipeline; its sentence segmentation (`.sents`) is what later splits each
# instruction step into individual sentences.
nlp = spacy.load("en_core_web_sm")

# Normalize ingredients

In [4]:
# Optional overrides: a CSV with "ingredient" and "normalized" columns. When the file is
# absent the mapping stays empty and ingredient names are used as produced by the regex.
normalized_top_ingredients = {}
fname = data_path / "normalized_top_ingredients.csv"
if fname.exists():
    # Rows containing any missing value are discarded before building the lookup.
    complete_rows = pd.read_csv(str(fname)).dropna()
    normalized_top_ingredients = dict(complete_rows[["ingredient", "normalized"]].values)

In [5]:
# Measurement words that may follow the leading quantity. "inche" is listed only so that
# the pluralisation below yields "inches".
units=["teaspoon", "tablespoon", "ounce", "inch", "inche", "cup", "pound", "kg", "tbsp", "ml", "pint", "oz", "can", "package"]
# Size/preparation adjectives that are stripped from the front of the ingredient name.
adjs = ["large", "small", "medium", "fresh", "chopped", "minced", "sliced"]
# Layout of the pattern: <quantity chars> <optional unit> <up to two adjectives> <name>.
# The single capturing group is the ingredient name.
#  * `\b` after the unit/adjective alternatives makes them match whole words only, so
#    "canola oil" is no longer cut to "ola oil" and "freshwater" keeps its "fresh".
#  * `\s*` inside the adjective repeat lets two space-separated adjectives
#    ("large fresh eggs") both be stripped; without it the {0,2} repeat could only ever
#    match adjectives glued together.
#  * The quantifier is injected through `twice` because literal braces would clash with
#    str.format.
no_units_pattern = re.compile(
    r"^[\d\s(/)]*(?:(?:{units})\b)?\s?(?:(?:{adjs})\b\s*){twice}[\s(/)]*(.+)$".format(
        twice="{0,2}",
        # Plural forms first so that "cups" is preferred over "cup".
        units="|".join([u+'s' for u in units]+units),
        adjs="|".join(adjs),
    )
)

def remove_parenthesis(s):
    """Prepare a raw ingredient line for quantity stripping.

    Newlines become spaces, single-character vulgar fractions become the digit "1"
    (so that a digit-based quantity pattern can consume them), and every innermost
    non-empty parenthesised group such as "(14 ounce)" is removed.

    Args:
        s: raw ingredient text.

    Returns:
        The cleaned text. Whitespace around a removed group is left as is.
    """
    # The original handled only the three Latin-1 fractions; the remaining Unicode
    # vulgar fractions are treated the same way.
    fraction_chars = "¼½¾⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞"
    table = {ord(ch): "1" for ch in fraction_chars}
    table[ord("\n")] = " "
    return re.sub(r"\([^()]+\)","", s.translate(table))

def normalize_ingredient(s):
    """Reduce a raw ingredient line to a bare ingredient name.

    The line is cleaned with `remove_parenthesis`, matched against `no_units_pattern`
    (whose single capturing group is the name), and everything after the last comma is
    dropped. If the resulting name is a key of `normalized_top_ingredients`, the mapped
    value is returned instead.

    Args:
        s: raw ingredient text.

    Returns:
        The normalized name, or None when the pattern does not match (for example an
        empty line).
    """
    match = no_units_pattern.match(remove_parenthesis(s))
    if match is None:
        # An unreachable `raise` used to follow this return; it has been removed.
        return None
    name = match.group(1).rsplit(",", 1)[0]
    return normalized_top_ingredients.get(name, name)


def normalize_ingredients(lst):
    """Normalize each ingredient line in `lst`, leaving out lines that normalize to None.

    The result can therefore be shorter than `lst`.
    """
    return [name for name in map(normalize_ingredient, lst) if name is not None]

# Data

In [6]:
# One JSON document per line; lines containing the escaped "\u00ae" sequence are skipped
# before parsing.
with (data_path / "allrecipes-recipes.jsonl").open('r') as f:
    data = list(map(json.loads, (raw for raw in f if '\\u00ae' not in raw)))

In [7]:
@interact(lb=(1,10),ub=(2,20), show_top_ingredients=False, show_sample=False)
def filter_by_num_of_ingredients(lb=3, ub=10, show_top_ingredients=False, show_sample=False):
    """Explore the recipes whose ingredient count lies in the inclusive range [lb, ub].

    Args:
        lb: minimum number of ingredients.
        ub: maximum number of ingredients.
        show_top_ingredients: if True, return the 20 most frequent normalized
            ingredients among the selected recipes, with counts and their share of all
            counted ingredients.
        show_sample: if True (and `show_top_ingredients` is False), return up to 20
            randomly chosen titles of the selected recipes.

    Returns:
        A DataFrame for either of the two display modes, otherwise the number of
        selected recipes.
    """
    # Apply the range filter once so that all three outputs describe the same recipes.
    # The title sample used to be drawn from the unfiltered data.
    in_range = [d for d in data if lb<=len(d["ingredients"])<=ub]
    if show_top_ingredients:
        c = collections.Counter(normalize_ingredient(ing) for d in in_range for ing in d["ingredients"])
        df = pd.DataFrame(c.most_common(), columns=["Ingredient", "#"])
        df["%"] = df["#"]/df["#"].sum()
        return df.head(20)
    if show_sample:
        titles = [d["title"] for d in in_range]
        # random.sample raises ValueError when asked for more items than are available.
        return pd.DataFrame(random.sample(titles, min(20, len(titles))), columns=["title"])
    return len(in_range)

interactive(children=(IntSlider(value=3, description='lb', max=10, min=1), IntSlider(value=10, description='ub…

# Create sample

In [8]:
def get_id(recipe):
    """Return the integer id found in the last path segment of `recipe["url"]`.

    Trailing slashes are ignored. Raises ValueError if that segment is not an integer.
    """
    trimmed = recipe["url"].rstrip('/')
    _, _, last_segment = trimmed.rpartition('/')
    return int(last_segment)

In [9]:
# Fixed seed: the per-author pick below feeds the annotation files, so the sample must be
# the same on every Restart & Run All.
SAMPLE_SEED = 0
lookup_ingredients = {"beef", "pork", "chicken", "tuna", "salmon", "lamb", "egg", "butter", "garlic", "onion", "avocado", "tomato", "soy", "honey", "chili", "feta", "corn"}
# 1. Drop records flagged with an error and recipes mentioning the "®" sign in any ingredient.
sampled_data = [d for d in data if (not d["error"]) and "®" not in "".join(d["ingredients"])]
# 2. Keep recipes with 4 to 9 ingredients (both bounds are strict).
sampled_data = [d for d in sampled_data if 3<len(d["ingredients"])<10]
# 3. Keep recipes whose ingredient text contains more than three distinct lookup words.
#    Words are lower-cased, whitespace-separated tokens, so "onion," does not count.
sampled_data = [d for d in sampled_data if len(lookup_ingredients&set(" ".join(d["ingredients"]).lower().split()))>3]
# 4. Keep one randomly chosen recipe per author. groupby needs its input sorted by the
#    grouping key. A dedicated Random instance leaves the global random state untouched.
sample_rng = random.Random(SAMPLE_SEED)
sampled_data = [sample_rng.choice(list(recipes)) for author, recipes in itertools.groupby(sorted(sampled_data, key=at("author")), at("author"))]
len(sampled_data)

287

In [10]:
# Scratch check: most_common() lists (value, count) pairs from most to least frequent.
collections.Counter([1,1,1,1,2,3,2,3,2,2,2]).most_common()

[(2, 5), (1, 4), (3, 2)]

In [11]:
# Pass 1: copy the fields of interest from every sampled recipe, attach the normalized
# ingredient names and count how often each name occurs across the sample.
normalized_data = []
ingredients_map = collections.Counter()
for recipe in tqdm(sampled_data):
    record = {field: recipe[field] for field in ("ingredients", "instructions", "title", "photo_url", "url")}
    record["normalized_ingredients"] = normalize_ingredients(recipe["ingredients"])
    # normalize_ingredients drops lines it cannot normalize; fail loudly if that happened.
    assert len(record["normalized_ingredients"]) == len(record["ingredients"])
    ingredients_map.update(record["normalized_ingredients"])
    record["status"] = 0
    normalized_data.append(record)

# Replace the counts by ids: "I" followed by the short hash of the ingredient name,
# keyed in order of decreasing frequency.
ingredients_map = {name: "I" + simple_hash(name) for name, _ in ingredients_map.most_common()}

# Pass 2: key each recipe's ingredients by id, keep the original steps under "steps",
# split the steps into sentences and prepare one empty label list per sentence.
for record in normalized_data:
    # Ingredients sharing a normalized name collapse into a single entry here.
    record["normalized_ingredients"] = {
        ingredients_map[name]: name for name in record["normalized_ingredients"]
    }
    record["steps"] = record["instructions"]
    record["instructions"] = [str(sent) for step in record["steps"] for sent in nlp(step).sents]
    record["labels"] = [[] for _ in record["instructions"]]

with (data_path / "ingredients_map.json").open('w') as f:
    json.dump(ingredients_map, f)
print("We have {n} recipes, with {m} ingredients".format(n=len(normalized_data), m=len(ingredients_map)))

HBox(children=(FloatProgress(value=0.0, max=287.0), HTML(value='')))


We have 287 recipes, with 734 ingredients


### Check recipes in which several ingredients map to the same normalized ingredient

In [12]:
# Find recipes in which several raw ingredients collapsed into one normalized ingredient
# (the id-keyed dict is then shorter than the raw ingredient list) and show them as
# HTML tables, with the repeated normalized names in red.
output = []              # one HTML fragment per offending recipe
normalizing_errors=[]    # indices of those recipes in normalized_data
for idx, datum in enumerate(normalized_data):
    if len(datum["normalized_ingredients"])==len(datum["ingredients"]):
        continue
    normalizing_errors.append(idx)
    rows = []
    already_mapped_ings=set()
    for i in datum["ingredients"]:
        n=normalize_ingredient(i)
        # The recipe text comes from scraped pages, so escape it before embedding it in markup.
        cell = html.escape(str(n))
        if n in already_mapped_ings:
            cell = "<font color=\"red\">{n}</font>".format(n=cell)
        already_mapped_ings.add(n)
        rows.append("<tr><td>{i}</td><td>{n}</td></tr>".format(i=html.escape(i), n=cell))
    # Build each table as one fragment so the "<br>" separator only goes between
    # recipes, never between table rows.
    output.append("<b>{t}</b><table>{rows}</table>".format(t=html.escape(datum["title"]), rows="".join(rows)))
display_html("<br>".join(output))

0,1
1/2 cup butter,butter
3 tablespoons minced garlic,garlic
3 tablespoons soy sauce,soy sauce
1/4 teaspoon black pepper,black pepper
1 tablespoon dried parsley,dried parsley
"6 boneless chicken thighs, with skin",boneless chicken thighs
"dried parsley, to taste",dried parsley

0,1
1/2 cup chopped celery,celery
1/2 cup chopped onion,onion
1/2 tablespoon butter,butter
"15 slices day-old bread, torn into small pieces",slices day-old bread
1/2 tablespoon Greek-style seasoning,Greek-style seasoning
1 (14 ounce) can chicken broth,chicken broth
6 pork chops,pork chops
1 cup packed brown sugar,packed brown sugar
"1/2 cup butter, melted",butter

0,1
2 tablespoons butter,butter
2 tablespoons chopped garlic,garlic
1 cup thinly sliced zucchini,thinly sliced zucchini
1/4 cup chopped onion,onion
"1 skinless, boneless chicken breast, cut into 1/2 inch slices","skinless, boneless chicken breast"
1 tablespoon butter,butter


In [13]:
# Remove the flagged records in place. Deleting from the highest index down keeps the
# remaining indices valid.
for bad_idx in sorted(normalizing_errors, reverse=True):
    normalized_data.pop(bad_idx)
# Reset so that re-running this cell does not delete further records.
normalizing_errors = []

In [14]:
# Alphabetical list of the titles that remain in the sample.
sorted(record["title"] for record in normalized_data)

['A Good Easy Garlic Chicken',
 'A Homemade San Francisco Treat: Chicken Vermicelli Rice',
 'All Protein Meatloaf',
 'Amazing Pork Tenderloin in the Slow Cooker',
 "Amber's Super Stuffing",
 "Amber's Super Stuffing",
 'Amish Casserole',
 'Apple and Orange Chicken',
 'Asian Citrus-Grilled Salmon',
 'Asian Ginger Catfish',
 'Asian Ginger Grill Marinade',
 'Asian Turkey Burgers',
 "Aunt Ro's Baked Beans",
 'Bacon Pork Tenderloin',
 'Barbeque Sauce for Meat Sandwiches',
 'Basic Beef Starter',
 'Beef Noodle Bake',
 'Beefy Tomato Soup',
 'Best Ever Brisket',
 'Best Ever Meatloaf I',
 'Best-Ever Cornbread-Sausage Stuffing',
 "Bev's Orange Chicken",
 "Big Ray's Tropical Island Chicken",
 'Bourbon Chicken',
 'Braised Pork Chops',
 'Breaded Chicken Wings',
 'Breaded Pork Chops',
 'Breakfast Sausage',
 'Broccoli Beef Noodles',
 'Bunkhouse Beans',
 'Cajun Buttered Corn',
 'Campfire Stew',
 'Candied Curried Pecans',
 'Caramelized Chicken Wings',
 'Cheesy Corn',
 'Chicken Liver and Pistachio Nut Pat

In [15]:
# Spot-check one finished record, picked at random.
random.choice(normalized_data)

{'ingredients': ['1 pound ground beef',
  '4 cups chicken broth',
  '4 cups water',
  '2 (1 ounce) packages dry onion soup mix',
  '1 (15 ounce) can tomato sauce',
  '2 stalks celery, chopped',
  '1 onion, chopped',
  '1 (16 ounce) package frozen mixed vegetables',
  '3/4 cup elbow macaroni'],
 'instructions': ['In a saute pan, brown ground beef, over medium heat.',
  'In a large stock pot, combine broth, water, onion soup mix, tomato sauce, celery, onion, frozen vegetables and macaroni.',
  'Bring to a boil and then simmer until macaroni is done.',
  'Add browned ground beef, mix and serve.'],
 'title': 'Hamburger Vegetable Soup',
 'photo_url': 'http://images.media-allrecipes.com/userphotos/250x250/160393.jpg',
 'url': 'http://allrecipes.com/Recipe/13323/',
 'normalized_ingredients': {'INzU_lxpu': 'ground beef',
  'INNbMITPe': 'chicken broth',
  'IlGA3C7DK': 'water',
  'I8IdKkoFz': 'dry onion soup mix',
  'I-4q8Lxl1': 'tomato sauce',
  'IUW0Gg5k3': 'stalks celery',
  'Iz0wiMjVJ': 'oni

In [16]:
# WARNING: this cell deletes the whole annotations directory, including any files edited
# since the last run, and then writes one fresh JSON file per recipe.
# shutil.rmtree on annot_path replaces the shell `rm -rf` so the removed directory is
# always the one written below, and the cell does not depend on a Unix shell.
shutil.rmtree(annot_path, ignore_errors=True)
annot_path.mkdir(parents=True)
for datum in tqdm(normalized_data):
    recipe_id = get_id(datum)  # named recipe_id to avoid shadowing the builtin id()
    with (annot_path / f"{recipe_id}.json").open('w') as f:
        json.dump(datum, f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))


