In [41]:
import json 
import pprint
import requests
from collections import defaultdict
cleaning_stats = defaultdict(int)

In [42]:
# Clean function to remove unwanted characters and validate the recipe
def clean_recipe(raw):
    if not isinstance(raw, dict):
        return None
    
    title = str(raw.get("title") or "").strip()
    ingredients = raw.get("ingredients", [])
    instructions = str(raw.get("instructions") or "").strip()

    # Check if title and instructions are present
    if not title:
        cleaning_stats["missing_title"] += 1
        return None
    if not instructions:
        cleaning_stats["missing_instructions"] += 1
        return None
    if not isinstance(ingredients, list):
        cleaning_stats["invalid_ingredients_type"] += 1
        return None

    
    # Clean up the ingredients
    cleaned_ingredients = [
        i.strip().replace("ADVERTISEMENT", "") 
        for i in ingredients 
        if isinstance(i, str) and i.strip()
    ]
    if len(cleaned_ingredients) < 2:
        cleaning_stats["too_few_ingredients"] += 1
        return None

    return {
        "title": title,
        "ingredients": cleaned_ingredients,
        "instructions": instructions
    }

In [43]:
# Read the raw recipe data from JSON files
with open("../data/recipes_raw_nosource_ar.json") as f1, \
     open("../data/recipes_raw_nosource_fn.json") as f2, \
     open("../data/recipes_raw_nosource_epi.json") as f3:
    
    raw_ar = list(json.load(f1).values())
    raw_fn = list(json.load(f2).values())
    raw_epi = list(json.load(f3).values())

print(f"Loaded {len(raw_ar)} + {len(raw_fn)} + {len(raw_epi)} recipes")

Loaded 39802 + 60039 + 25323 recipes


In [46]:
cleaned_all = []
for r in raw_ar + raw_fn + raw_epi:
    result = clean_recipe(r)
    if result:
        cleaned_all.append(result)

print(f"clean {len(cleaned_all)} cookbooks")
print("Cleaning summary:")
for reason, count in cleaning_stats.items():
    print(f" - {reason}: {count} recipes removed")


clean 122755 cookbooks
Cleaning summary:
 - missing_title: 1138 recipes removed
 - too_few_ingredients: 3390 recipes removed
 - missing_instructions: 290 recipes removed


In [48]:
import pprint
print(cleaned_all[0])

{'title': 'Slow Cooker Chicken and Dumplings', 'ingredients': ['4 skinless, boneless chicken breast halves ', '2 tablespoons butter ', '2 (10.75 ounce) cans condensed cream of chicken soup ', '1 onion, finely diced ', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ', ''], 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.'}


In [49]:
import json

with open("cleaned_recipes.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_all, f, indent=2, ensure_ascii=False)
