In [1]:
import json, random, re,collections, itertools
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd
from operator import itemgetter as at
data_path = Path("../data")
annot_path = Path("../annotations")

# Normalize ingredients

In [2]:
fname = data_path / "normalized_top_ingredients.csv"
normalized_top_ingredients = dict()
if fname.exists():
    normalized_top_ingredients = dict(pd.read_csv(str(fname)).dropna()[["ingredient", "normalized"]].values)

In [3]:
units=["teaspoon", "tablespoon", "ounce", "inch", "inche", "cup", "pound", "kg", "tbsp", "ml", "pint", "oz", "large", "small", "medium", "fresh"]
no_units_pattern = re.compile(r"^[\d\s(/)]*(?:{units})?[\s(/)]*(.+)$".format(units="|".join([u+'s' for u in units]+units)))

def remove_parenthesis(s):
    s=s.translate({ord("\n"):" ", ord("½"): "1", ord("¾"): "1", ord("¼"): "1"})
    return re.sub(r"\([^()]+\)","", s)

def normalize_ingredient(s):
    ret = no_units_pattern.findall(remove_parenthesis(s))
    if len(ret)==0:
        return None
    ret=ret[0].rsplit(",", 1)[0]
    return normalized_top_ingredients.get(ret,ret)


def normalize_ingredients(lst):
    ret = [normalize_ingredient(s) for s in lst]
    ret = [s for s in ret if s is not None]
    return ret

# Create sample

In [4]:
with (data_path / "allrecipes-recipes.jsonl").open('r') as f:
    data = [json.loads(l) for l in f]

In [5]:
def get_id(recipe):
    return int(recipe["url"].rstrip('/').split('/')[-1])

In [6]:
lookup_ingredients = {"beef", "pork", "chicken", "tuna", "salmon", "lamb", "egg", "butter", "garlic", "onion", "avocado", "tomato", "soy", "honey", "chili", "feta", "corn"}
sampled_data = [d for d in data if (not d["error"]) and "®" not in "".join(d["ingredients"])]
sampled_data = [d for d in sampled_data if 3<len(d["ingredients"])<10]
sampled_data = [d for d in sampled_data if len(lookup_ingredients&set(" ".join(d["ingredients"]).lower().split()))>3]
sampled_data = [random.choice(list(recipes)) for author, recipes in itertools.groupby(sorted(sampled_data, key=at("author")), at("author"))]
len(sampled_data)

306

In [7]:
normalized_data = []
ingredients_map = set()
for datum in tqdm(sampled_data):
    d = dict()
    for k in ["ingredients", "instructions", "title", "photo_url", "url"]:
        d[k] = datum[k]
    d["normalized_ingredients"] = normalize_ingredients(datum["ingredients"])
    ingredients_map |= set(d["normalized_ingredients"])
    d["status"]=0
    d["labels"]=[[] for _ in range(len(datum["instructions"]))]
    normalized_data.append(d)
    
ingredients_map=sorted(ingredients_map)
for datum in normalized_data:
    datum["normalized_ingredients"]=dict(zip(map(ingredients_map.index,d["normalized_ingredients"]),
                                        d["normalized_ingredients"]))
    
with (data_path / "ingredients_map.json").open('w') as f:
    json.dump(ingredients_map, f)
print ("We have {n} recipes, with {m} ingredients".format(n=len(normalized_data), m=len(ingredients_map)))

HBox(children=(FloatProgress(value=0.0, max=306.0), HTML(value='')))


We have 306 recipes, with 774 ingredients


In [8]:
sorted(map(at("title"),normalized_data))

['A Good Easy Garlic Chicken',
 'A Homemade San Francisco Treat: Chicken Vermicelli Rice',
 'All Protein Meatloaf',
 'Amazing Pork Tenderloin in the Slow Cooker',
 "Amber's Super Stuffing",
 "Amber's Super Stuffing",
 'Amish Casserole',
 'Apple and Orange Chicken',
 'Asian Citrus-Grilled Salmon',
 'Asian Ginger Catfish',
 'Asian Ginger Grill Marinade',
 'Asian Turkey Burgers',
 "Aunt Ro's Baked Beans",
 'Bacon Pork Tenderloin',
 'Baked Asian-Style Honey Chicken',
 'Barbeque Sauce for Meat Sandwiches',
 'Barbeque Shredded Beef',
 'Basic Beef Starter',
 'Beef Noodle Bake',
 'Beefy Tomato Soup',
 'Best Ever Brisket',
 'Best Ever Meatloaf I',
 'Best-Ever Cornbread-Sausage Stuffing',
 "Bev's Orange Chicken",
 "Big Ray's Tropical Island Chicken",
 'Bourbon Chicken',
 'Braised Pork Chops',
 'Breaded Chicken Wings',
 'Breaded Pork Chops',
 'Breakfast Sausage',
 'Broccoli-Beef Noodles',
 'Broccoli-Beef Noodles',
 'Brown Sugar Glazed Pork Chops',
 'Bunkhouse Beans',
 'Cajun Buttered Corn',
 'Cam

In [9]:
random.choice(normalized_data)

{'ingredients': ['1 pound chicken wings',
  '2 tablespoons white sugar',
  '1/2 cup soy sauce',
  '1/4 cup rice wine',
  '3 tablespoons chili garlic sauce',
  '1 tablespoon sesame oil',
  '2 cloves garlic, minced',
  '1/4 cup water'],
 'instructions': ['Rinse and pat dry the chicken wings. Whisk together the sugar, soy sauce, rice wine, chili garlic sauce, sesame oil, and garlic in a large bowl, then add the chicken wings and toss to evenly coat.',
  'Heat a lightly oiled skillet over medium heat. Stir in the chicken wings, sauce, and water. Cover and cook until the chicken wings are no longer pink at the bone, turning the chicken wings occasionally, about 15 minutes. Remove lid and continue cooking until the sauce has thickened, 5 to 10 minutes.'],
 'title': 'Spicy Chinese Chicken Wings',
 'photo_url': 'http://images.media-allrecipes.com/userphotos/560x315/3686140.jpg',
 'url': 'http://allrecipes.com/Recipe/104600/',
 'normalized_ingredients': {519: 'onion',
  732: 'tomato sauce',
  7

In [10]:
!rm -rf ../annotations
annot_path.mkdir()
for datum in tqdm(normalized_data):
    id = get_id(datum)
    with (annot_path/ f"{id}.json").open('w') as f:
        json.dump(datum, f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=306.0), HTML(value='')))


