In [1]:
import json, random, re, collections, itertools,base64,hashlib
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
from operator import itemgetter as at
from IPython.core.display import display, HTML
from ipywidgets import interact
display_html = lambda x: display(HTML(x))
data_path = Path("../data")
annot_path = Path("../annotations")

In [2]:
def simple_hash(s, length=8):
    md5bytes = hashlib.md5(s.encode('ascii',errors="ignore")).digest()
    ret = base64.urlsafe_b64encode(md5bytes).decode('ascii')
    return ret[:length]

simple_hash("baked potato")

'oIfAVH2w'

In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

# Normalize ingredients

In [4]:
fname = data_path / "normalized_top_ingredients.csv"
normalized_top_ingredients = dict()
if fname.exists():
    normalized_top_ingredients = dict(pd.read_csv(str(fname)).dropna()[["ingredient", "normalized"]].values)

In [5]:
units=["teaspoon", "tablespoon", "ounce", "inch", "inche", "cup", "pound", "kg", "tbsp", "ml", "pint",
       "quarts", "slices", "squares", "oz", "can", "package", "dash", "to taste"]
adjs = ["large", "small", "medium", "fresh[\s.,;]", r"[\s.,;]+[a-z]{2,10}ed", "[a-z]{3,7}less", r"[\s,.-][a-z]{2,10}[eh]ly"]
no_units_pattern = re.compile(r"^[\d\s(/)]*(?:{units})?\s?(?:{adjs}[,\s]?){twice}[\s(/)]*(.+)$".format(
    twice="{0,2}",
    units="|".join([u+'s' for u in units]+units),
    adjs="|".join(adjs)),
                             )

def remove_units(s):
    ret = s.lower()
    ret = re.sub(r"\d[\d/.\s]+", "#", ret)
    ret = re.sub(r"{units}".format(units="|".join([u+'s' for u in units]+units)), " # ", ret)
    ret = re.sub(r"#[\s#]{0,2}", "", ret)
    return ret.strip()

def remove_adj(s):
    ret = " "+s.lower()
    ret = re.sub(r"{adjs}".format(adjs="|".join(adjs)), "", ret)
    return ret.strip(' ,-;.')

def remove_parenthesis(s):
    s=s.translate({ord("\n"):" ", ord("½"): "1", ord("¾"): "1", ord("¼"): "1"})
    return re.sub(r"\([^()]+\)","", s)

def normalize_ingredient(s):
    ret = remove_adj(remove_units(remove_parenthesis(s)))
    #ret=ret.rsplit(",", 1)[0]
    return normalized_top_ingredients.get(ret,ret)


def normalize_ingredients(lst):
    ret = [normalize_ingredient(s) for s in lst]
    ret = [s for s in ret if s is not None]
    return ret

In [6]:
if 'data' in globals():
    # regenerate top ingredients
    c = collections.Counter([normalize_ingredient(ing) for d in data for ing in d["ingredients"]])
    with (data_path / "top_ingredients.csv").open('w') as f:
        f.write("count,ingredient,normalized\n")
        for ing, cnt in c.most_common():
            f.write('{c},"{i}",""\n'.format(c=cnt,i=ing.replace('"', '""')))

In [17]:
# raw_ingredients = sum((collections.Counter(map(lambda s: s.translate({i:'#' for i in range(48,58)}),d["ingredients"])) for d in tqdm(data)),collections.Counter())
# with (data_path / "raw_top_ingredients.csv").open('w') as f:
#     f.write("count,ingredient\n")
#     for ing, cnt in raw_ingredients.most_common():
#         f.write('{c},"{i}"\n'.format(c=cnt,i=ing.replace('"', '""')))

HBox(children=(FloatProgress(value=0.0, max=71148.0), HTML(value='')))




# Data

In [7]:
with (data_path / "allrecipes-recipes.jsonl").open('r') as f:
    data = [json.loads(l) for l in f if '\\u00ae' not in l]

In [8]:
@interact(lb=(1,10),ub=(2,20), show_top_ingredients=False, show_sample=False)
def filter_by_num_of_ingredients(lb=3, ub=10, show_top_ingredients=False, show_sample=False):
    if show_top_ingredients:
        c = collections.Counter([ing for d in data if lb<=len(d["ingredients"])<=ub for ing in map(normalize_ingredient, d["ingredients"])])
        df = pd.DataFrame(c.most_common(), columns=["Ingredient", "#"])
        df["%"] = df["#"]/df["#"].sum()
        return df.head(20)
    if show_sample:
        return pd.DataFrame(random.sample([d["title"] for d in data], 20), columns=["title"])
    return len([d["ingredients"] for d in data if lb<=len(d["ingredients"])<=ub])

interactive(children=(IntSlider(value=3, description='lb', max=10, min=1), IntSlider(value=10, description='ub…

# Create sample

In [9]:
def get_id(recipe):
    return int(recipe["url"].rstrip('/').split('/')[-1])

In [10]:
lookup_ingredients = {"beef", "pork", "chicken", "tuna", "salmon", "lamb", "egg", "butter", "garlic", "onion", "avocado", "tomato", "soy", "honey", "chili", "feta", "corn", "mayonnaise", "sauce", "noodles", "carrot", "bread"}
sampled_data = [d for d in data if (not d["error"]) and "®" not in "".join(d["ingredients"])]
sampled_data = [d for d in sampled_data if 3<len(d["ingredients"])<10]
sampled_data = [d for d in sampled_data if len(lookup_ingredients&set(" ".join(d["ingredients"]).lower().split()))>3]
sampled_data = [random.choice(list(recipes)) for author, recipes in itertools.groupby(sorted(sampled_data, key=at("author")), at("author"))]
len(sampled_data)

992

In [11]:
collections.Counter([1,1,1,1,2,3,2,3,2,2,2]).most_common()

[(2, 5), (1, 4), (3, 2)]

In [12]:
min_num_of_steps = 3
normalized_data = []
ingredients_map = collections.Counter()
for datum in tqdm(sampled_data):
    if len(datum["instructions"])<min_num_of_steps:
        continue
    d = dict()
    for k in ["ingredients", "instructions", "title", "photo_url", "url"]:
        d[k] = datum[k]
    d["normalized_ingredients"] = normalize_ingredients(datum["ingredients"])
    assert len(d["normalized_ingredients"]) == len(d["ingredients"])
    ingredients_map += collections.Counter(d["normalized_ingredients"])
    d["status"]=0
    normalized_data.append(d)
    
ingredients_map={ing: "I"+simple_hash(ing) for ing, c in ingredients_map.most_common()}
for datum in normalized_data:
    datum["normalized_ingredients"]=dict(zip(map(ingredients_map.get,datum["normalized_ingredients"]),
                                        datum["normalized_ingredients"]))
    datum["sentences"]=[str(sent) for step in datum["instructions"] for sent in nlp(step).sents]
    datum["labels"]=[[] for _ in range(len(datum["instructions"]))]
    datum["sentence_labels"]=[[] for _ in range(len(datum["sentences"]))]
    
with (data_path / "ingredients_map.json").open('w') as f:
    json.dump(ingredients_map, f)
print ("We have {n} recipes, with {m} ingredients".format(n=len(normalized_data), m=len(ingredients_map)))

HBox(children=(FloatProgress(value=0.0, max=992.0), HTML(value='')))


We have 524 recipes, with 1016 ingredients


In [13]:
collections.Counter(map(lambda x: len(x["instructions"]), normalized_data))

Counter({3: 271, 5: 57, 4: 174, 6: 19, 9: 1, 7: 2})

### Check ingredients that map to single normalized ingredients

In [14]:
output = []
normalizing_errors=[]
for idx, datum in enumerate(normalized_data):
    if len(datum["normalized_ingredients"])==len(datum["ingredients"]):
        continue
    normalizing_errors.append(idx)
    output.append("<b>{t}</b>".format(t=datum["title"]))
    output.append("<table>")
    already_mapped_ings=set()
    for i in datum["ingredients"]:
        n=normalize_ingredient(i)
        if n in already_mapped_ings:
            n="<font color=\"red\">{n}</font>".format(n=n)
        output.append("<tr><td>{i}</td><td>{n}</td></tr>".format(i=i,n=n))
        already_mapped_ings.add(n)
    output.append("</table>")
display_html("<br>".join(output))

0,1
1/2 cup butter,butter
3 tablespoons minced garlic,garlic
3 tablespoons soy sauce,soy sauce
1/4 teaspoon black pepper,pepper
1 tablespoon dried parsley,parsley
"6 boneless chicken thighs, with skin","chicken thighs, with skin"
"dried parsley, to taste",parsley

0,1
5 green bell peppers,green bell peppers
2 pounds ground beef,ground beef
1 1/2 cups chopped onion,onion
"1 green bell pepper, chopped",bell pepper
"6 cloves garlic, minced",garlic
1 (16 ounce) jar chunky pasta sauce,jar chunky pasta sauce
1/2 cup chopped onion,onion
1 (6 ounce) package tomato lentil couscous mix,tomato lentil couscous mix
8 ounces shredded sharp Cheddar cheese,sharpdar cheese

0,1
2 tablespoons chili powder,chili powder
1 tablespoon salt,salt
1 tablespoon garlic powder,garlic powder
1 tablespoon onion powder,onion powder
1 tablespoon ground black pepper,pepper
1 tablespoon white sugar,sugar
1 tablespoon seasoned salt,salt
1 (4 pound) beef brisket,beef brisket
1 1/2 cups beef broth,broth

0,1
1 cup Italian-seasoned bread crumbs,italian-seasoned bread crumbs
1/2 cup grated Parmesan cheese,parmesan cheese
1/2 teaspoon salt,salt
1/4 teaspoon ground black pepper,pepper
1/8 teaspoon garlic powder,garlic powder
1 egg,egg
1/4 cup prepared ranch dressing,ranch dressing
"1 pound skinless, boneless chicken breasts, cut into 1-inch cubes","chicken breasts, cut into 1- cubes"
1/4 cup ranch dressing,ranch dressing

0,1
1 tablespoon butter,butter
"4 large sweet onions, sliced",sweet onions
"1/2 pound Swiss cheese, shredded",swiss cheese
10 slices Italian bread,italian bread
"1/4 cup butter, softened",butter
1/2 cup milk,milk
1 (10.75 ounce) can condensed cream of mushroom soup,cream of mushroom soup
3 teaspoons soy sauce,soy sauce
salt and pepper to taste,salt

0,1
1 pound ground beef,ground beef
"2 links chorizo sausage, casings removed","links chorizo sausage, casings"
1 cup plain bread crumbs,plain bread crumbs
1/2 cup prepared pasta sauce,pasta sauce
"1 egg, beaten","egg, beaten"
2 teaspoons jarred minced garlic,garlic
1 teaspoon Italian seasoning,italian seasoning
salt and ground black pepper to taste,salt and ground black pepper
1 tablespoon prepared pasta sauce,pasta sauce

0,1
1 cup mayonnaise,mayonnaise
2 teaspoons dried minced onion,onion
2 teaspoons dry mustard,mustard
1 cup crushed buttery round cracker crumbs,buttery round cracker crumbs
1/2 cup sesame seeds,sesames
"2 pounds skinless, boneless chicken breast halves",chicken breast halves
SAUCE:,sauce:
1 cup mayonnaise,mayonnaise
2 tablespoons honey,honey

0,1
1 pound beef sirloin steaks,beef sirloin steaks
1 tablespoon olive oil,olive oil
2/3 cup cocktail sauce,cocktail sauce
1/4 cup honey,honey
3 tablespoons soy sauce,soy sauce
"3 cloves garlic, crushed",garlic
"1/4 teaspoon seasoning salt, or to taste","seasoning salt, or"
1 tablespoon olive oil,olive oil
8 ounces sliced fresh mushrooms,mushroom

0,1
2 tablespoons reduced-fat mayonnaise,fat mayonnaise
1 1/2 teaspoons wasabi paste,wasabi paste
1 teaspoon Chinese five-spice powder,chinese five-spice powder
1 tablespoon low-sodium soy sauce,low-sodium soy sauce
4 (6 ounce) yellowfin tuna fillets,yellowfin tuna fillets
1 tablespoon rice vinegar,vinegar
1 tablespoon low-sodium soy sauce,low-sodium soy sauce
2 tablespoons toasted sesame seeds,sesames

0,1
1 egg,egg
1/4 cup milk,milk
1 pound ground beef,ground beef
1/4 cup dry cream of wheat cereal,dry cream of wheat cereal
1/4 cup minced onion,onion
1 (10.75 ounce) can condensed cream of chicken soup,cream of chicken soup
1 (10.75 ounce) can condensed cream of mushroom soup,cream of mushroom soup
1 (12 fluid ounce) can evaporated milk,milk
1 tablespoon chopped fresh parsley,parsley

0,1
1 (8 ounce) package wide egg noodles,wide egg noodles
2 cups sour cream,sour cream
1/4 cup grated Parmesan cheese,parmesan cheese
1 tablespoon chopped fresh chives,chives
1 teaspoon salt,salt
1/8 teaspoon ground black pepper,pepper
1 clove crushed garlic,garlic
2 tablespoons butter,butter
1/4 cup grated Parmesan cheese,parmesan cheese

0,1
1/2 cup chopped celery,celery
1/2 cup chopped onion,onion
1/2 tablespoon butter,butter
"15 slices day-old bread, torn into small pieces","day-old bread, torn into pieces"
1/2 tablespoon Greek-style seasoning,greek-style seasoning
1 (14 ounce) can chicken broth,chicken broth
6 pork chops,pork chops
1 cup packed brown sugar,sugar
"1/2 cup butter, melted",butter

0,1
1 pound chicken parts,chicken parts
1 large onion,onion
"3 stalks celery, including some leaves","stalks celery, including some leaves"
1 large carrot,carrot
1 1/2 teaspoons salt,salt
3 whole cloves,whole cloves
6 cups water,water
1/4 cup cold water (optional),water
1 egg,egg

0,1
1 pound ground beef,ground beef
3 tablespoons dry bread crumbs,bread crumbs
1 egg,egg
"3 green onions, chopped",green onion
1 tablespoon Cajun seasoning,cajun seasoning
1 tablespoon prepared mustard,mustard
1/4 cup barbeque sauce,barbeque sauce
1 teaspoon Cajun seasoning,cajun seasoning
4 slices Cheddar cheese,dar cheese

0,1
"24 chicken wings, split and tips discarded","chicken wings, split and tips"
3/4 cup packed brown sugar,sugar
"5 cloves garlic, minced",garlic
1 teaspoon minced fresh ginger root,ginger root
2 1/2 cups water,water
5 tablespoons honey,honey
1/4 cup reduced-sodium soy sauce,sodium soy sauce
3 tablespoons cornstarch,cornstarch
3/4 cup water,water

0,1
"1 onion, chopped",onion
"1 red bell pepper, chopped",bell pepper
"1 green bell pepper, chopped",bell pepper
"1 pound skinless, boneless chicken tenders",chicken tenders
1 (18 ounce) bottle barbeque sauce,bottle barbeque sauce
1 cup chili sauce,chili sauce
2 tablespoons minced garlic,garlic
"1 (8 ounce) can pineapple chunks, drained",pineapple chunks

0,1
2 pounds beef brisket,beef brisket
"15 gingersnap cookies, crushed",gingersnap cookies
2 (1 ounce) packages dry onion soup mix,dry onion soup mix
2 cups water,water
1 (12 fluid ounce) can or bottle chili sauce,or bottle chili sauce
1 pound baby carrots,baby carrots
"15 small red potatoes, cubed",red potatoes
"1 onion, chopped",onion
2 cups water,water

0,1
"2 (8 ounce) skinless, boneless chicken breasts",chicken breasts
1/4 cup minced fresh cilantro,cilantro
2 tablespoons lime juice,lime juice
1 tablespoon low-sodium soy sauce,low-sodium soy sauce
1 tablespoon olive oil,olive oil
"2 cloves garlic, minced",garlic
1/4 cup sweet chili sauce,sweet chili sauce
1 teaspoon minced fresh cilantro,cilantro

0,1
"1 pound skinless, boneless chicken breast halves",chicken breast halves
1 tablespoon butter,butter
"1 small onion, chopped",onion
1 (10.75 ounce) can cream of celery soup,cream of celery soup
1 (5 ounce) can evaporated milk,milk
1 cup chicken broth,chicken broth
1/4 cup melted butter,butter
8 ounces corn bread stuffing mix,corn bread stuffing mix


In [15]:
for idx in reversed(normalizing_errors):
    del normalized_data[idx]
normalizing_errors=[]

In [16]:
sorted(map(at("title"),normalized_data))

['Alice Chicken',
 'All Protein Meatloaf',
 "Amber's Super Stuffing",
 "Amber's Super Stuffing",
 'Amish Casserole',
 'Amish Yumazuti',
 "Ann's Sister's Meatloaf Recipe",
 'Apple and Orange Chicken',
 'Asian Barbecue Burgers',
 'Asian Crock Pot',
 'Asian Marinated Pork Chops',
 'Asian Sugar Snap Pea Appetizer',
 'Asian-Inspired Vegetable Noodle Bowl ',
 'Asparagus Beef Lo Mein',
 "Aunt Ro's Baked Beans",
 'Australian Deep Fried Chicken Wings',
 'BBQ Beef Brisket Sandwiches',
 'BBQ Fried Chicken',
 'Backyard Cooper Burgers',
 'Bacon Pork Tenderloin',
 'Baked Aloha Chicken',
 'Baked Asian-Style Honey Chicken',
 'Baked Coconut Cayenne Chicken Nuggets',
 'Baked Mushroom Thighs',
 'Baked Penne with Italian Sausage',
 'Baked Turkey Meatballs',
 'Baked Zesty Carrots',
 'Bar Stool Pretzels',
 'Barbecued Meatballs',
 'Barbeque Beef Casserole',
 'Barbeque Pork Fajitas',
 'Barbequed Marinated Flank Steak',
 'Basic Salisbury Steaks',
 'Bat Wings',
 "Becca's Barbequed Beans",
 "Becki's Oven Barbecu

In [17]:
random.choice(normalized_data)

{'ingredients': ['1/2 cup uncooked brown rice',
  '1 3/8 cups water',
  '1 pound ground beef',
  '1 (10.75 ounce) can condensed cream of mushroom soup',
  '1/2 cup chopped green onion',
  '1 tablespoon Worcestershire sauce',
  '1 teaspoon soy sauce',
  '1/4 teaspoon ground black pepper'],
 'instructions': ['In a medium saucepan, bring water and brown rice to a boil. Reduce heat, cover and simmer for 45 to 50 minutes.',
  'Meanwhile, in a medium skillet over high heat, brown the ground beef and drain fat.',
  'Stir the mushroom soup, green onion, Worcestershire sauce, soy sauce and pepper into the skillet with the beef. Simmer on medium low heat for 10 minutes. Serve over the rice.'],
 'title': "Poor Man's Shepherd's Pie",
 'photo_url': 'http://images.media-allrecipes.com/userphotos/560x315/1610513.jpg',
 'url': 'http://allrecipes.com/Recipe/26639/',
 'normalized_ingredients': {'IeLJalKYK': 'brown rice',
  'IlGA3C7DK': 'water',
  'INzU_lxpu': 'ground beef',
  'ICvNMkhaQ': 'cream of mush

In [18]:
!rm -rf ../annotations
annot_path.mkdir()
for datum in tqdm(normalized_data):
    id = get_id(datum)
    with (annot_path/ f"{id}.json").open('w') as f:
        json.dump(datum, f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=505.0), HTML(value='')))


