# Imports


In [22]:
import os
import ast
import re
import json
import unicodedata
import pandas as pd
import sqlite3
import kagglehub


# Load the Dataset

In [23]:
# Download the Kaggle dataset
path = kagglehub.dataset_download("josephmdev/recipes")
print("Downloaded to:", path)


#checking to see the files in path
os.listdir(path)

Using Colab cache for faster access to the 'recipes' dataset.
Downloaded to: /kaggle/input/recipes


['5k-recipes.db']

In [24]:
# Connect to the .db file
conn = sqlite3.connect(path + "/5k-recipes.db")

# Load the tables inside .db file and show data
df = pd.read_sql_query("SELECT * FROM recipes;", conn)
df.head()

Unnamed: 0,id,Title,Ingredients,Instructions
0,1,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ..."
1,2,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...
2,3,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...
3,4,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...
4,5,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...


# Performing EDA:

In [25]:
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])
print("Column names:", df.columns)

Number of rows: 5000
Number of columns: 4
Column names: Index(['id', 'Title', 'Ingredients', 'Instructions'], dtype='object')


In [26]:
#Check to see if there's missing data
df.isnull().sum()

Unnamed: 0,0
id,0
Title,0
Ingredients,0
Instructions,0


No missing data, great!

In [27]:
# We don't need the id column so we can drop it.
df.drop('id', axis=1, inplace=True)
print("Dataset loaded:", df.shape, "rows")
df.head()


Dataset loaded: (5000, 3) rows


Unnamed: 0,Title,Ingredients,Instructions
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ..."
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...


Data looks clean, we can move to preprocessing

Create a Python script preprocessing.py:

Load dataset

Lowercase everything

Remove symbols

Tokenize ingredients

Store cleaned result as cleaned_recipes.json

Data format example:

{
  "recipe_name": "Chicken Fried Rice",
  "ingredients": ["chicken", "rice", "egg", "soy sauce"],
  "tags": ["easy", "asian", "30 minutes"]

# Preprocessing:

1. Ingredient Parsing + Normalization

In [28]:
def ensure_ingredient_list(value):
    """
    Returns a Python list of ingredient strings.
    Accepts:
        - list
        - string list repr "['a','b']"
        - comma separated string "a, b, c"
    """
    if isinstance(value, list):
        return value
    if value is None:
        return []
    if isinstance(value, str):
        s = value.strip()

        # Try literal list "[...]"
        try:
            parsed = ast.literal_eval(s)
            if isinstance(parsed, list):
                return [str(x) for x in parsed]
        except:
            pass

        # fallback: split by commas
        return [p.strip() for p in re.split(r",|\n|;", s) if p.strip()]

    return [str(value)]

df["ingredients_raw"] = df["Ingredients"]   # backup

# Allowed chars (keep fractions, slash, hyphen, parentheses, units)
_allowed_chars_re = re.compile(r"[^0-9a-zA-Z\s\.,\-/()¼½¾⅓⅔⅛⅜⅝⅞%°–—']")

def normalize_ingredient_item(text):
    """
    Lowercase, normalize unicode, and remove unwanted symbols
    while keeping quantities (½, 1/3), slashes, hyphens, etc.
    """
    if text is None:
        return ""

    text = unicodedata.normalize("NFKC", str(text)).lower()
    text = _allowed_chars_re.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Extract ingredient names (strip quantity + unit)
UNITS = {
    "cup","cups","c","tbsp","tbsp.","tablespoon","tablespoons",
    "tbs","tbs.","tsp","tsp.","teaspoon","teaspoons","t",
    "oz","oz.","ounce","ounces","lb","lb.","lbs","lbs.","pound","pounds",
    "g","g.","gram","grams","kg","kg.","kilogram","kilograms",
    "ml","ml.","l","l.","liter","liters","pinch","dash","clove","cloves",
    "slice","slices","package","packages","can","cans","stick","sticks",
    "bunch","sprig","sprigs","piece","pieces","bag","bags","box","boxes",
    "quart","quart.","pint","pint.","large","small","medium","jar","jars",
    "container","containers","fillet","fillets","pkg","pkgs"
}

_num_re = re.compile(r"^\d+([.,]\d+)?$")
_fraction_re = re.compile(r"^\d+\/\d+$")
_unicode_frac_re = re.compile(r"^[¼½¾⅓⅔⅛⅜⅝⅞]+$")
_range_re = re.compile(r"^\d+[-–—]\d+$")
_digitstart_re = re.compile(r"^\d")

_strip_punct = re.compile(r"^[\W_]+|[\W_]+$")


def extract_ingredient_name(ingredient):
    """
    Remove quantities + units from the beginning of an ingredient string.
    Example:
        '1 1/2 cups chopped onions' → 'chopped onions'
        '3–4 lb. pork shoulder' → 'pork shoulder'
    """
    if not ingredient:
        return ""

    tokens = ingredient.split()
    i = 0
    n = len(tokens)

    while i < n:
        tok = tokens[i].strip(",.()").lower()

        # Numbers / decimals / fractions / unicode fractions / ranges
        if (_num_re.match(tok)
            or _fraction_re.match(tok)
            or _unicode_frac_re.match(tok)
            or _range_re.match(tok)
            or _digitstart_re.match(tok)):
            i += 1
            continue

        # Units (cup, tbsp, lb, etc.)
        if tok.rstrip(".") in UNITS:
            i += 1
            continue

        # Found ingredient name
        break

    name = " ".join(tokens[i:]).strip()
    name = _strip_punct.sub("", name)
    return name.lower()


2. Cleaning up the Dataframe

In [29]:
df["ingredient_list"] = df["Ingredients"].apply(ensure_ingredient_list)

df["ingredient_list"] = df["ingredient_list"].apply(
    lambda lst: [normalize_ingredient_item(x) for x in lst]
)

df["ingredient_tokens"] = df["ingredient_list"].apply(
    lambda lst: [extract_ingredient_name(x) for x in lst]
)

df["recipe_name"] = df["Title"].apply(lambda x: unicodedata.normalize("NFKC", str(x)).lower().strip())
df["instructions_clean"] = df["Instructions"].apply(
    lambda x: unicodedata.normalize("NFKC", str(x)).lower().strip()
)

3. Saving clean dataset into a final json file

In [30]:
records = []
for _, row in df.iterrows():
    records.append({
        "recipe_name": row["recipe_name"],
        "ingredients_raw": row["ingredients_raw"],
        "ingredient_tokens": row["ingredient_tokens"],
        "instructions": row["instructions_clean"],
    })

output_path = "cleaned_recipes.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print("Saved cleaned dataset to:", output_path)

Saved cleaned dataset to: cleaned_recipes.json


# Final JSON File with Cleaned Dataset

In [31]:
#verify the file is saved

os.listdir()

['.config', 'cleaned_recipes.json', 'sample_data']

In [34]:
#View the dataset
df = pd.read_json("cleaned_recipes.json")

print("Shape of cleaned dataset:", df.shape)
df.head()

Shape of cleaned dataset: (5000, 4)


Unnamed: 0,recipe_name,ingredients_raw,ingredient_tokens,instructions
0,miso-butter roast chicken with acorn squash pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","[whole chicken, kosher salt, divided, plus mor...","pat chicken dry with paper towels, season all ..."
1,crispy salt and pepper potatoes,"['2 large egg whites', '1 pound new potatoes (...","[egg whites, new potatoes (about 1 inch in dia...",preheat oven to 400°f and line a rimmed baking...
2,thanksgiving mac and cheese,"['1 cup evaporated milk', '1 cup whole milk', ...","[evaporated milk, whole milk, garlic powder, o...",place a rack in middle of oven; preheat to 400...
3,italian sausage and bread stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...","[to 1-pound) round italian loaf, cut into 1-in...",preheat oven to 350°f with rack in middle. gen...
4,newton's law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...","[dark brown sugar, hot water, bourbon, fresh l...",stir together brown sugar and hot water in a c...
