In [2]:
import pandas as pd
from pathlib import Path
# Lift pandas' column-count display limit so wide frames are shown in full.
pd.set_option("display.max_columns", None)

In [None]:
# Directory the CSV inputs are read from; the relative path is resolved
# against the kernel's current working directory.
DATA_DIR = Path("../data")

In [5]:
# Read every CSV table from DATA_DIR into its own DataFrame.
# The file names below are listed in the same order as the variables they fill.
(
    ingredients,
    recipes,
    recipe_ingredients,
    pantry,
    ingredient_aliases,
    recipe_feedback,
) = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in (
        "ingredients.csv",
        "recipes.csv",
        "recipe_ingredients.csv",
        "pantry.csv",
        "ingredient_aliases.csv",
        "recipe_feedback.csv",
    )
)

In [6]:
# Registry of the loaded tables keyed by table name, so they can be iterated
# over or looked up by name.
datasets = dict(
    ingredients=ingredients,
    recipes=recipes,
    recipe_ingredients=recipe_ingredients,
    pantry=pantry,
    ingredient_aliases=ingredient_aliases,
    recipe_feedback=recipe_feedback,
)

# Preview each table: upper-cased name, (rows, columns) shape, first three rows.
for table_name in datasets:
    table_df = datasets[table_name]
    print(f"\n{table_name.upper()}")
    print(f"Shape: {table_df.shape}")
    display(table_df.head(3))


INGREDIENTS
Shape: (10, 4)


Unnamed: 0,ingredient_id,name,category,common_unit
0,1,onion,vegetable,pieces
1,2,tomato,vegetable,pieces
2,3,rice,grain,grams



RECIPES
Shape: (4, 8)


Unnamed: 0,recipe_id,name,cuisine,dish_type,requires_airfryer,requires_soaking,meal_prep_type,video_url
0,1,Vegetable Upma,Indian,breakfast,False,False,good_for_2_days,https://youtube.com/xxx
1,2,Chana Salad,Indian,salad,False,True,prep_components,https://youtube.com/yyy
2,3,Airfryer Tikkis,Indian,snack,True,False,freeze_friendly,https://youtube.com/zzz



RECIPE_INGREDIENTS
Shape: (11, 5)


Unnamed: 0,recipe_id,ingredient_id,quantity,unit,is_optional
0,1,1,1.0,pieces,False
1,1,2,1.0,pieces,False
2,1,3,100.0,grams,False



PANTRY
Shape: (5, 6)


Unnamed: 0,pantry_id,ingredient_id,quantity,unit,location,expiry_date
0,1,1,3,pieces,fridge,2025-01-20
1,2,3,1000,grams,pantry,2025-06-01
2,3,6,500,grams,pantry,2025-05-15



INGREDIENT_ALIASES
Shape: (5, 2)


Unnamed: 0,alias,ingredient_id
0,jeera,4
1,dahi,5
2,curd,5



RECIPE_FEEDBACK
Shape: (2, 7)


Unnamed: 0,feedback_id,recipe_id,rating,liked,comments,cooked_on,would_make_again
0,1,1,4.5,True,"Quick, filling breakfast",2025-01-10,True
1,2,2,2.0,False,"Too bland, needs spice",2025-01-12,False


In [7]:
# Referential-integrity check: every recipe_ingredients.ingredient_id must
# also appear in ingredients.ingredient_id.
# `invalid_recipe_ingredients` keeps the rows whose ingredient_id has no match.
invalid_recipe_ingredients = recipe_ingredients[
    ~recipe_ingredients["ingredient_id"].isin(ingredients["ingredient_id"])
]

# The message (only built on failure) lists the offending ids so the bad rows
# can be located without re-running the filter by hand.
assert invalid_recipe_ingredients.empty, (
    "❌ Orphaned ingredient_id found in recipe_ingredients: "
    f"{invalid_recipe_ingredients['ingredient_id'].unique().tolist()}"
)

print("✅ All recipe_ingredients ingredient_ids are valid")

✅ All recipe_ingredients ingredient_ids are valid


In [8]:
# Referential-integrity check: every pantry.ingredient_id must also appear in
# ingredients.ingredient_id.
# `invalid_pantry_ingredients` keeps the rows whose ingredient_id has no match.
invalid_pantry_ingredients = pantry[
    ~pantry["ingredient_id"].isin(ingredients["ingredient_id"])
]

# The message (only built on failure) lists the offending ids so the bad rows
# can be located without re-running the filter by hand.
assert invalid_pantry_ingredients.empty, (
    "❌ Orphaned ingredient_id found in pantry: "
    f"{invalid_pantry_ingredients['ingredient_id'].unique().tolist()}"
)

print("✅ All pantry ingredient_ids are valid")

✅ All pantry ingredient_ids are valid


In [9]:
# Referential-integrity check: every ingredient_aliases.ingredient_id must
# also appear in ingredients.ingredient_id.
# `invalid_aliases` keeps the alias rows whose ingredient_id has no match.
invalid_aliases = ingredient_aliases[
    ~ingredient_aliases["ingredient_id"].isin(ingredients["ingredient_id"])
]

# The message (only built on failure) lists the offending ids so the bad rows
# can be located without re-running the filter by hand.
assert invalid_aliases.empty, (
    "❌ Orphaned ingredient_id found in ingredient_aliases: "
    f"{invalid_aliases['ingredient_id'].unique().tolist()}"
)

print("✅ All ingredient_aliases ingredient_ids are valid")


✅ All ingredient_aliases ingredient_ids are valid


In [10]:
# Referential-integrity check: every recipe_feedback.recipe_id must also
# appear in recipes.recipe_id.
# `invalid_feedback_recipes` keeps the feedback rows whose recipe_id has no match.
invalid_feedback_recipes = recipe_feedback[
    ~recipe_feedback["recipe_id"].isin(recipes["recipe_id"])
]

# The message (only built on failure) lists the offending ids so the bad rows
# can be located without re-running the filter by hand.
assert invalid_feedback_recipes.empty, (
    "❌ Orphaned recipe_id found in recipe_feedback: "
    f"{invalid_feedback_recipes['recipe_id'].unique().tolist()}"
)

print("✅ All recipe_feedback recipe_ids are valid")


✅ All recipe_feedback recipe_ids are valid


In [11]:
# Primary-key check: each table's own id column must not contain duplicates.
assert ingredients["ingredient_id"].is_unique, "❌ Duplicate ingredient_id found"
assert recipes["recipe_id"].is_unique, "❌ Duplicate recipe_id found"
assert pantry["pantry_id"].is_unique, "❌ Duplicate pantry_id found"
# recipe_feedback was previously left out even though the summary line below
# says "all". feedback_id is presumably its primary key (by analogy with
# pantry_id) — confirm against the schema.
assert recipe_feedback["feedback_id"].is_unique, "❌ Duplicate feedback_id found"

print("✅ All primary keys are unique")

✅ All primary keys are unique


Sample Joins

Join Recipes ↔ Ingredients (via recipe_ingredients)

In [13]:
# Attach recipe and ingredient attributes to every recipe-ingredient row.
# validate="many_to_one" makes pandas raise MergeError if recipe_id or
# ingredient_id is duplicated in the lookup table, instead of silently
# multiplying rows in the joined result.
recipe_ingredient_join = (
    recipe_ingredients
    .merge(recipes, on="recipe_id", how="left", validate="many_to_one")
    .merge(ingredients, on="ingredient_id", how="left", validate="many_to_one")
)

# recipes and ingredients both have a "name" column, so the second merge applies
# pandas' default suffixes: name_x is the recipe name, name_y the ingredient name.
display(
    recipe_ingredient_join[
        ["recipe_id", "name_x", "ingredient_id", "name_y", "quantity", "unit"]
    ].head(10)
)

# Confirms names + quantities line up correctly

Unnamed: 0,recipe_id,name_x,ingredient_id,name_y,quantity,unit
0,1,Vegetable Upma,1,onion,1.0,pieces
1,1,Vegetable Upma,2,tomato,1.0,pieces
2,1,Vegetable Upma,3,rice,100.0,grams
3,1,Vegetable Upma,4,cumin seeds,0.5,tsp
4,1,Vegetable Upma,8,salt,0.5,tsp
5,2,Chana Salad,6,chickpeas,150.0,grams
6,2,Chana Salad,1,onion,0.5,pieces
7,2,Chana Salad,7,olive oil,1.0,tbsp
8,4,Overnight Oats,9,oats,50.0,grams
9,4,Overnight Oats,5,curd,100.0,ml


Join Pantry ↔ Ingredients

In [15]:
# Attach ingredient attributes (e.g. name) to every pantry row.
# validate="many_to_one" makes pandas raise MergeError if ingredient_id is
# duplicated in `ingredients`, instead of silently multiplying pantry rows.
pantry_join = pantry.merge(
    ingredients,
    on="ingredient_id",
    how="left",
    validate="many_to_one",
)

# Show the pantry contents with the ingredient name in place of its id.
display(
    pantry_join[
        ["pantry_id", "name", "quantity", "unit", "location", "expiry_date"]
    ]
)

Unnamed: 0,pantry_id,name,quantity,unit,location,expiry_date
0,1,onion,3,pieces,fridge,2025-01-20
1,2,rice,1000,grams,pantry,2025-06-01
2,3,chickpeas,500,grams,pantry,2025-05-15
3,4,oats,250,grams,pantry,2025-03-10
4,5,curd,500,ml,fridge,2025-01-18


In [16]:
# Row counts for every loaded table, printed for a quick visual check.
print("📊 SANITY CHECKS")

print(f"Total ingredients: {len(ingredients)}")
print(f"Total recipes: {len(recipes)}")
print(f"Total recipe-ingredient mappings: {len(recipe_ingredients)}")
print(f"Total pantry items: {len(pantry)}")
print(f"Total aliases: {len(ingredient_aliases)}")
print(f"Total feedback entries: {len(recipe_feedback)}")

📊 SANITY CHECKS
Total ingredients: 10
Total recipes: 4
Total recipe-ingredient mappings: 11
Total pantry items: 5
Total aliases: 5
Total feedback entries: 2


In [17]:
# Columns that must contain no nulls, grouped by table name.
# The keys are used to look tables up in `datasets`.
critical_columns = {
    "ingredients": ["ingredient_id", "name"],
    "recipes": ["recipe_id", "name", "dish_type"],
    "recipe_ingredients": ["recipe_id", "ingredient_id", "quantity"],
    "pantry": ["pantry_id", "ingredient_id", "quantity"],
}

# Stop at the first critical column that contains a null, naming the table
# and column in the assertion message.
for table, cols in critical_columns.items():
    df = datasets[table]
    for col in cols:
        assert df[col].notna().all(), f"❌ Null values in {table}.{col}"

print("✅ No nulls in critical columns")

✅ No nulls in critical columns
