In [7]:
import json 
import pprint
import requests
import re
from collections import defaultdict
cleaning_stats = defaultdict(int)

In [42]:
# Clean function to remove unwanted characters and validate the recipe
def clean_recipe(raw):
    if not isinstance(raw, dict):
        return None
    
    title = str(raw.get("title") or "").strip()
    ingredients = raw.get("ingredients", [])
    instructions = str(raw.get("instructions") or "").strip()

    # Check if title and instructions are present
    if not title:
        cleaning_stats["missing_title"] += 1
        return None
    if not instructions:
        cleaning_stats["missing_instructions"] += 1
        return None
    if not isinstance(ingredients, list):
        cleaning_stats["invalid_ingredients_type"] += 1
        return None

    
    # Clean up the ingredients
    cleaned_ingredients = [
        i.strip().replace("ADVERTISEMENT", "") 
        for i in ingredients 
        if isinstance(i, str) and i.strip()
    ]
    if len(cleaned_ingredients) < 2:
        cleaning_stats["too_few_ingredients"] += 1
        return None

    return {
        "title": title,
        "ingredients": cleaned_ingredients,
        "instructions": instructions
    }

In [43]:
# Read the raw recipe data from JSON files
with open("../data/recipes_raw_nosource_ar.json") as f1, \
     open("../data/recipes_raw_nosource_fn.json") as f2, \
     open("../data/recipes_raw_nosource_epi.json") as f3:
    
    raw_ar = list(json.load(f1).values())
    raw_fn = list(json.load(f2).values())
    raw_epi = list(json.load(f3).values())

print(f"Loaded {len(raw_ar)} + {len(raw_fn)} + {len(raw_epi)} recipes")

Loaded 39802 + 60039 + 25323 recipes


In [46]:
cleaned_all = []
for r in raw_ar + raw_fn + raw_epi:
    result = clean_recipe(r)
    if result:
        cleaned_all.append(result)

print(f"clean {len(cleaned_all)} cookbooks")
print("Cleaning summary:")
for reason, count in cleaning_stats.items():
    print(f" - {reason}: {count} recipes removed")


clean 122755 cookbooks
Cleaning summary:
 - missing_title: 1138 recipes removed
 - too_few_ingredients: 3390 recipes removed
 - missing_instructions: 290 recipes removed


In [48]:
import pprint
print(cleaned_all[0])

{'title': 'Slow Cooker Chicken and Dumplings', 'ingredients': ['4 skinless, boneless chicken breast halves ', '2 tablespoons butter ', '2 (10.75 ounce) cans condensed cream of chicken soup ', '1 onion, finely diced ', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ', ''], 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.'}


In [49]:
import json

with open("cleaned_recipes.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_all, f, indent=2, ensure_ascii=False)


In [5]:

with open("cleaned_recipes.json") as f:
    recipes = json.load(f)

recipes_200 = recipes[:200]

with open("recipes_200.json", "w", encoding="utf-8") as f:
    json.dump(recipes_200, f, indent=2)

In [16]:
# cleaning data for embedding 
def clean_ingredient(raw_ingredient: str) -> str:
    if not raw_ingredient:
        return ""
    cleaned = re.sub(r"\([^)]*\)", "", raw_ingredient)


    cleaned = re.sub(
        r"""^\s*
            (\d+(\.\d+)?|\d+\s+\d+/\d+|\d+/\d+)?      
            \s*
            (cup|cups|tablespoons?|tbsp|teaspoons?|tsp|
             grams?|g|kg|ml|liters?|oz|ounces?|lbs?|pounds?|
             cans?|packages?|slices?|cloves?|sticks?)?     
            \s*
        """,
        "", cleaned.strip(),
        flags=re.IGNORECASE | re.VERBOSE
    )

    # clean numbers
    cleaned = re.sub(r"\b\d+([\/\.]\d+)?\b", "", cleaned)

    cleaned = re.sub(
        r"\b(cup|cups|tablespoons?|tbsp|teaspoons?|tsp|grams?|g|kg|ml|liters?|oz|ounces?|lbs?|pounds?|cans?|packages?|slices?|cloves?|sticks?)\b",
        "", cleaned
    )
    cleaned = re.sub(r"[\s,.\-]+", " ", cleaned)
    
    cleaned = re.sub(r"[^\w\s]", "", cleaned)

    descriptive_words = r"\b(fresh|organic|chopped|diced|minced|sliced|peeled|grated|crushed)\b"

    cleaned = re.sub(descriptive_words, "", cleaned, flags=re.IGNORECASE)

    return cleaned.strip(" ,.-").lower()

In [9]:
def build_embedding_input(recipe: dict) -> str:
    title = recipe.get("title", "").strip()
    ingredients = [
        clean_ingredient(i) for i in recipe.get("ingredients", []) if i.strip()
    ]
    ingredients_str = ", ".join(ingredients)
    instructions = recipe.get("instructions", "").replace("\n", " ").strip()

    return f"""Recipe title: {title}
Ingredients: {ingredients_str}
Instructions: {instructions}"""



In [18]:
with open("recipes_200.json", "r") as f:
    recipes = json.load(f)
for recipe in recipes:
    recipe["embedding_input"] = build_embedding_input(recipe)

for r in recipes[:3]:
    print(r["embedding_input"])
    print("="*80)

with open("recipes_with_embedding_input.json", "w") as f:
    json.dump(recipes, f, indent=2)

Recipe title: Slow Cooker Chicken and Dumplings
Ingredients: skinless boneless chicken breast halves, butter, condensed cream of chicken soup, onion finely, refrigerated biscuit dough torn into pieces
Instructions: Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover. Cover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.
Recipe title: Awesome Slow Cooker Pot Roast
Ingredients: condensed cream of mushroom soup, dry onion soup mix, water, pot roast
Instructions: In a slow cooker, mix cream of mushroom soup, dry onion soup mix and water. Place pot roast in slow cooker and coat with soup mixture. Cook on High setting for 3 to 4 hours, or on Low setting for 8 to 9 hours.
Recipe title: Brown Sugar Meatloaf
Ingredients: packed brown sugar, ketchup, lean ground beef, milk, eggs, salt, ground black pepper, small onion, ground ging

In [20]:

# load the recipes with embedding input
with open('recipes_with_embedding_input.json', 'r') as f:
    data = json.load(f)

# add features to the recipes
for recipe in data:
    title = recipe['title'].lower()
    features = {}

    # cusinie
    if 'lasagna' in title or 'italian' in title:
        features['cuisine'] = 'Italian'
    elif 'chili' in title or 'taco' in title:
        features['cuisine'] = 'Mexican'
    elif 'chicken' in title or 'meatloaf' in title or 'mac and cheese' in title:
        features['cuisine'] = 'American'
    elif 'irish' in title:
        features['cuisine'] = 'Irish'
    elif 'tilapia' in title or 'teriyaki' in title:
        features['cuisine'] = 'Asian'
    else:
        features['cuisine'] = 'Other'


    # hard or easy
    instr = recipe.get('instructions', '').lower()
    if len(instr.split()) < 50:
        features['difficulty'] = 'easy'
    elif len(instr.split()) < 120:
        features['difficulty'] = 'medium'
    else:
        features['difficulty'] = 'hard'

    # add features
    recipe['features'] = features

# save the updated recipes with features
with open('recipes_with_features.json', 'w') as f:
    json.dump(data, f, indent=2)


In [None]:
import time
from voyageai import Client
from tqdm import tqdm
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("VOYAGER_API")

# 载入你刚刚添加 features 的文件
with open("recipes_with_features.json", "r") as f:
    recipes = json.load(f)

# 初始化 Voyage 客户端（确保 .env 设置了 VOYAGE_API_KEY）
client = Client(api_key=api_key)

for recipe in tqdm(recipes, desc="Generating Embeddings"):
    if "voyage_embedding" in recipe:
        continue

    content = recipe.get("embedding_input", "")
    if not content.strip():
        continue

    try:
        result = client.embed(
            [content],
            model="voyage-3.5",
            input_type="document"
        )
        recipe["voyage_embedding"] = result.embeddings[0]
        time.sleep(1.5)  # 防止 API 被限速
    except Exception as e:
        print(f"Error embedding recipe: {recipe['title']} -> {e}")

# 保存成新文件
with open("recipes_with_embeddings_final.json", "w") as f:
    json.dump(recipes, f, indent=2)

AuthenticationError: No API key provided. You can set your API key in code using 'voyageai.api_key = <API-KEY>', or set the environment variable VOYAGE_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the voyageai module at it with 'voyageai.api_key_path = <PATH>', or set the environment variable VOYAGE_API_KEY_PATH=<PATH>. API keys can be generated in Voyage AI's dashboard (https://dash.voyageai.com).