In [1]:
import collections
import itertools
import json
import random
import re
import shutil
from operator import itemgetter as at
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm
# Directory the input files are read from (relative to the notebook's working directory).
data_path = Path("../data")
# Directory the per-recipe annotation JSON files are written to.
annot_path = Path("../annotations")

# Normalize ingredients

In [2]:
# Optional lookup table (column "ingredient" -> column "normalized") applied as the last
# step of normalize_ingredient. When the CSV is absent the mapping stays empty.
fname = data_path / "normalized_top_ingredients.csv"
normalized_top_ingredients = {}
if fname.exists():
    # dropna() runs on the whole frame, so a row with a missing value in any column is discarded.
    mapping_rows = pd.read_csv(str(fname)).dropna()
    normalized_top_ingredients = dict(mapping_rows[["ingredient", "normalized"]].values)

In [3]:
# Measurement words and size/freshness adjectives stripped from the start of an ingredient line.
# "inche" is listed so that the generated plural ("inche" + "s") covers "inches".
units=["teaspoon", "tablespoon", "ounce", "inch", "inche", "cup", "pound", "kg", "tbsp", "ml", "pint", "oz", "large", "small", "medium", "fresh"]
# Group 1 captures what is left after the leading quantity characters (digits, spaces,
# slashes, parentheses) and at most one unit word.
# - Plural forms come first in the alternation so "cups" is consumed whole.
# - The \b after the unit keeps a unit from matching as a mere prefix of a longer word;
#   without it "freshly ground pepper" became "ly ground pepper" and "pinto beans" became "o beans".
no_units_pattern = re.compile(
    r"^[\d\s(/)]*(?:(?:{units})\b)?[\s(/)]*(.+)$".format(
        units="|".join(re.escape(u) for u in [u+'s' for u in units]+units)
    )
)

def remove_parenthesis(s):
    """Flatten an ingredient line and drop its parenthesised parts.

    Newlines become spaces and the fraction glyphs "½", "¾" and "¼" are each
    replaced by the digit "1". Every innermost non-empty "(...)" group is then
    removed in a single pass, so an enclosing pair of a nested group and an
    empty "()" are left in place.
    """
    char_table = str.maketrans({"\n": " ", "½": "1", "¾": "1", "¼": "1"})
    flattened = s.translate(char_table)
    return re.sub(r"\([^()]+\)", "", flattened)

def normalize_ingredient(s):
    """Reduce one raw ingredient line to a normalized ingredient name.

    The line is cleaned with remove_parenthesis, matched against
    no_units_pattern, and the captured remainder is cut at its last comma
    (text after that comma is discarded). The result is finally looked up in
    normalized_top_ingredients, falling back to itself when there is no entry.

    Returns None when the pattern does not match (e.g. for an empty string).
    """
    match = no_units_pattern.search(remove_parenthesis(s))
    if match is None:
        return None
    remainder = match.group(1)
    # Keep everything before the last comma; with no comma keep the whole remainder.
    head, comma, _ = remainder.rpartition(",")
    name = head if comma else remainder
    return normalized_top_ingredients.get(name, name)


def normalize_ingredients(lst):
    """Normalize every ingredient line in `lst`, preserving order.

    Lines for which normalize_ingredient returns None are left out, so the
    result may be shorter than the input.
    """
    normalized = []
    for raw_line in lst:
        name = normalize_ingredient(raw_line)
        if name is not None:
            normalized.append(name)
    return normalized

# Create sample

In [4]:
# Load the scraped recipes: one JSON object per line (JSON Lines).
# The encoding is explicit because open() otherwise uses the platform's locale encoding,
# and the recipe text may contain non-ASCII characters such as "®" or "½".
with (data_path / "allrecipes-recipes.jsonl").open('r', encoding="utf-8") as f:
    # Whitespace-only lines (e.g. a trailing blank line) are skipped rather than fed to json.loads.
    data = [json.loads(l) for l in f if l.strip()]

In [5]:
def get_id(recipe):
    """Return the integer id stored in the last path segment of recipe["url"].

    Trailing slashes are ignored. Raises ValueError if that segment is not an
    integer literal.
    """
    trimmed_url = recipe["url"].rstrip("/")
    _, _, last_segment = trimmed_url.rpartition("/")
    return int(last_segment)

In [6]:
# Ingredient words a recipe is screened for.
lookup_ingredients = {"beef", "pork", "chicken", "tuna", "salmon", "lamb", "egg", "butter", "garlic", "onion", "avocado", "tomato", "soy", "honey", "chili", "feta", "corn"}
# Dedicated, seeded generator so the per-author draw below selects the same recipes on
# every run of this cell (the global `random` state is left untouched).
sample_rng = random.Random(0)

# Step 1: keep recipes without a scrape error and without a "®" in any ingredient line.
# NOTE: this reads d["error"], which the normalization cell further down removes in place,
# so this cell has to run on freshly loaded `data`.
sampled_data = [d for d in data if (not d["error"]) and "®" not in "".join(d["ingredients"])]
# Step 2: keep recipes with 4 to 9 ingredient lines.
sampled_data = [d for d in sampled_data if 3<len(d["ingredients"])<10]
# Step 3: keep recipes whose lower-cased, whitespace-split ingredient text contains more
# than 3 distinct lookup words (exact token matches only).
sampled_data = [d for d in sampled_data if len(lookup_ingredients&set(" ".join(d["ingredients"]).lower().split()))>3]
# Step 4: one recipe per author. groupby only merges adjacent items, hence the sort on the same key.
sampled_data = [
    sample_rng.choice(list(recipes))
    for _, recipes in itertools.groupby(sorted(sampled_data, key=at("author")), at("author"))
]
len(sampled_data)

306

In [7]:
# Attach the normalized ingredient list to every recipe and drop the "error" field.
# `data` is mutated in place; the dicts in `sampled_data` are the same objects, so they
# receive the new key as well.
for datum in tqdm(data):
    datum["normalized_ingredients"] = normalize_ingredients(datum["ingredients"])
    # pop() with a default instead of `del`, so re-running this cell does not raise KeyError.
    datum.pop("error", None)

HBox(children=(FloatProgress(value=0.0, max=225602.0), HTML(value='')))




In [11]:
# Titles of the sampled recipes in sorted order.
sorted(recipe["title"] for recipe in sampled_data)

['A Good Easy Garlic Chicken',
 'A Homemade San Francisco Treat: Chicken Vermicelli Rice',
 'All Protein Meatloaf',
 'Amazing Pork Tenderloin in the Slow Cooker',
 "Amber's Super Stuffing",
 "Amber's Super Stuffing",
 'Amish Casserole',
 'Amish Yumazuti',
 'Apple and Orange Chicken',
 'Asian Citrus-Grilled Salmon',
 'Asian Ginger Catfish',
 'Asian Ginger Grill Marinade',
 'Asian Turkey Burgers',
 "Aunt Ro's Baked Beans",
 'Bacon Pork Tenderloin',
 'Barbeque Sauce for Meat Sandwiches',
 'Barbeque Shredded Beef',
 'Basic Beef Starter',
 'Beef Noodle Bake',
 'Beefy Tomato Soup',
 'Best Ever Brisket',
 'Best Ever Meatloaf I',
 'Best-Ever Cornbread-Sausage Stuffing',
 "Bev's Orange Chicken",
 "Big Ray's Tropical Island Chicken",
 'Bourbon Chicken',
 'Braised Pork Chops',
 'Breaded Chicken Wings',
 'Breakfast Sausage',
 'Broccoli-Beef Noodles',
 'Broiled Salmon with Corn Relish',
 'Brown Sugar Glazed Pork Chops',
 'Bunkhouse Beans',
 'Cajun Buttered Corn',
 'Campfire Stew',
 'Candied Curried

In [13]:
# Spot check: display one sampled recipe. The global `random` state is not seeded here,
# so the recipe shown can differ between runs.
random.choice(sampled_data)

{'author': 'JRich664',
 'cook_time_minutes': 10,
 'description': 'This recipe makes a wonderful chili sauce for hot dogs. Just place a grilled hot dog in a bun and top with chili, grated Cheddar cheese, and diced onions. Serve immediately. If halving this recipe, cover the pot for half of the two hour simmer.',
 'footnotes': [],
 'ingredients': ['2 pounds ground beef',
  '2 (14.5 ounce) cans reduced-sodium beef broth',
  '1 (28 ounce) can crushed tomatoes with puree',
  '2 tablespoons chili powder',
  '1 tablespoon paprika',
  '1 teaspoon onion powder',
  '1 teaspoon garlic powder',
  '1 teaspoon kosher salt',
  '1/4 teaspoon cayenne pepper'],
 'instructions': ['Heat a large skillet over medium-high heat. Cook and stir beef in the hot skillet until browned and crumbly, 5 to 7 minutes; drain and discard grease. Add beef broth to ground beef; bring to a boil. Reduce heat to medium-low and simmer until liquid is slightly reduced, about 30 minutes.',
  'Mix tomatoes with puree, chili powde

In [10]:
# Recreate the annotations directory from scratch so files from an earlier sample don't linger.
# WARNING: this deletes everything currently inside `annot_path`.
# shutil.rmtree on `annot_path` replaces the shell `rm -rf` with a hard-coded copy of the
# path: it cannot drift from the configured directory and works without a POSIX shell.
shutil.rmtree(annot_path, ignore_errors=True)
annot_path.mkdir()
# Write one pretty-printed JSON file per sampled recipe, named after its numeric id.
for datum in tqdm(sampled_data):
    recipe_id = get_id(datum)  # renamed from `id` to avoid shadowing the builtin
    with (annot_path / f"{recipe_id}.json").open('w', encoding="utf-8") as f:
        json.dump(datum, f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=306.0), HTML(value='')))


