In [1]:
import pandas as pd
import numpy as np

# Load the parsed recipes table; the relative path is resolved against the
# notebook's working directory.
recipes = pd.read_csv('parsed_recipes.csv')

In [2]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [3]:
# Preview the first rows to check that the file loaded as expected.
recipes.head()

Unnamed: 0,id,name,description,ingredients_raw,steps,servings,serving_size,tags,ingredients,amounts,amount_gram,serving_size_numeric,predicted_total,actual_total,approx_rate,total_recipe_weight,recipe_energy_per100g,recipe_carbohydrates_per100g,recipe_proteins_per100g,recipe_fat_per100g,recipe_energy_kcal_per100g,recipe_energy_per_serving,recipe_carbohydrates_per_serving,recipe_proteins_per_serving,recipe_fat_per_serving,recipe_energy_kcal_per_serving,steps_parsed,steps_validated
0,76133,Reuben and Swiss Casserole Bake,I think this is even better than a reuben sand...,"['1/2-1 lb corned beef, cooked and choppe...","[""Set oven to 350 degrees F."", ""Butter a 9 x 1...",4.0,1 (207 g),"['60-minutes-or-less', 'time-to-make', 'course...","['corned beef', 'thousand island dressing', 's...","[{'unit': 'pound', 'amount_min': 0.5, 'amount_...","['226.8-453.6', 60.0, 453.6, 226.8, 150.0, 53.9]",207.0,1284.5,828.0,0.644609,1284.5,837.891086,8.031474,12.64761,13.490992,200.260776,1734.434548,16.625151,26.180553,27.926353,414.539806,"['Set oven to 350 degrees F.', 'Butter a 9 x 1...",True
1,392934,Safe to Eat Raw Chocolate Chip Oreo Cookie &qu...,I was searching the web for something like thi...,"['1/2 cup butter, room temperature ', '1/2...","[""Cream butter and sugars together."", ""Blend i...",24.0,1 (26 g),"['15-minutes-or-less', 'time-to-make', 'course...","['butter', 'brown sugar', 'granulated sugar', ...","[{'unit': 'cup', 'amount': 0.5}, {'unit': 'cup...","[107.8, 100.0, 41.4, 60.0, 3.5, 360.0, 1.6, 21...",26.0,943.3,624.0,0.661507,943.3,1736.646273,62.849303,5.789453,17.069299,415.068421,451.528031,16.340819,1.505258,4.438018,107.917789,"['Cream butter and sugars together.', 'Blend i...",True
2,489452,Teriyaki Pork Chops,I made these on a whim and they are my husband...,"['1 (16 ounce) bottle teriyaki sauce', '4 ...","[""I like to marinade them overnight in a ziplo...",4.0,1 (313 g),"['weeknight', '15-minutes-or-less', 'time-to-m...","['teriyaki sauce', 'pork chops']","[{'unit': 'ounce', 'amount': 16.0}, {'unit': '...","[453.6, 920.0]",313.0,1373.6,1252.0,0.911474,1373.6,850.683285,9.387697,14.16645,10.807455,203.318185,2662.638681,29.383492,44.340987,33.827334,636.385918,['I like to marinade them overnight in a ziplo...,True
3,318331,Granny's Butter Rolls,"Recipe from Crystal Van Poppe in her booklet ""...","['2 1/4 cups biscuit mix', '2/3 cup wa...","[""Mix biscuit mix and water until a soft dough...",12.0,1 (91 g),"['60-minutes-or-less', 'time-to-make', 'course...","['biscuit mix', 'water', 'granulated sugar', '...","[{'unit': 'cup', 'amount': 2.25}, {'unit': 'cu...","[270.0, 151.3, 82.8, 53.9, 480.0, 4.9]",91.0,1042.9,1092.0,1.04708,1042.9,813.073064,30.019476,3.678463,6.370956,194.329126,739.896488,27.317723,3.347401,5.79757,176.839505,['Mix biscuit mix and water until a soft dough...,True
4,384761,Chocolate Mousse Cupcakes,From The Cupcake Deck by Elinor Klivans.The ad...,"['3/4 cup heavy whipping cream', '2 tabl...","[""FOR CHOCOLATE GANACHE FROSTING (Makes about ...",12.0,1 (155 g),"['time-to-make', 'course', 'preparation', 'cup...","['heavy whipping cream', 'unsalted butter', 's...","[{'unit': 'cup', 'amount': 0.75}, {'unit': 'ta...","[180.0, 29.6, 255.1, 2.3, 180.0, 300.0, 29.6, ...",155.0,1918.5,1860.0,0.969507,1918.5,1587.32007,41.946135,2.892101,22.968018,379.378602,2460.346109,65.016509,4.482757,35.600429,588.036833,"['\\""FOR CHOCOLATE GANACHE FROSTING (Makes abo...",True


In [4]:
# Convert stringified columns into an actual list
import ast

def safe_ast_parse(s):
    """
    Parse a stringified Python list literal, returning None on failure.

    Parameters
    ----------
    s : str
        Text expected to hold a list literal, e.g. "['step 1', 'step 2']".
        Non-string values (None, NaN, ...) are rejected and yield None.

    Returns
    -------
    list or None
        The parsed list, or None when `s` cannot be evaluated as a literal
        or evaluates to something that is not a list.
    """
    try:
        parsed = ast.literal_eval(s)
    # Only swallow the errors ast.literal_eval is documented to raise for
    # malformed input; anything else is a genuine bug and should surface.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return None
    return parsed if isinstance(parsed, list) else None
    
# Try to turn every stringified step list into a real list (None on failure),
# then flag the rows for which that worked.
recipes['parsed_steps'] = recipes['steps'].apply(safe_ast_parse)
recipes['validated_steps'] = recipes['parsed_steps'].notna()

# Count the successes once and derive the failures from it.
parsed_ok_count = recipes['validated_steps'].sum()
print(f"Successfully parsed: {parsed_ok_count}")
print(f"Failed: {len(recipes) - parsed_ok_count}")



Successfully parsed: 113784
Failed: 20370




In [5]:
# Keep only the rows whose steps parsed into a list, and discard the two
# helper columns in the same chain (drop returns a new frame, so nothing is
# modified in place).
recipes = (
    recipes
    .loc[recipes['validated_steps']]
    .drop(columns=['parsed_steps', 'validated_steps'])
)

# Report how many recipes survived the filter
print(f"Remaining rows after dropping invalid ones: {len(recipes)}")

Remaining rows after dropping invalid ones: 113784


In [6]:
# Turn the stringified ingredient lists into real lists. Only strings are
# parsed, so re-running this cell on already-parsed values is a no-op
# instead of a ValueError.
recipes['ingredients'] = recipes['ingredients'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [7]:
# Turn the stringified gram-amount lists into real lists. Only strings are
# parsed, so re-running this cell on already-parsed values is a no-op
# instead of a ValueError.
recipes['amount_gram'] = recipes['amount_gram'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [8]:
# Turn the stringified amount records into real Python objects. Only strings
# are parsed, so re-running this cell on already-parsed values is a no-op
# instead of a ValueError.
recipes['amounts'] = recipes['amounts'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [9]:
# Turn the stringified tag lists into real lists. Only strings are parsed,
# so re-running this cell on already-parsed values is a no-op instead of a
# ValueError.
recipes['tags'] = recipes['tags'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [10]:
# Flatten the per-recipe tag lists into one long list of tag occurrences
all_tags = [label for tag_list in recipes['tags'] for label in tag_list]

# De-duplicate and order alphabetically in a single step
unique_tags = sorted(set(all_tags))

# Display
print(f"Total unique tags: {len(unique_tags)}")
print(unique_tags)

Total unique tags: 533
['', '1-day-or-more', '15-minutes-or-less', '3-steps-or-less', '30-minutes-or-less', '4-hours-or-less', '5-ingredients-or-less', '60-minutes-or-less', 'a1-sauce', 'african', 'american', 'amish-mennonite', 'angolan', 'appetizers', 'apples', 'april-fools-day', 'argentine', 'artichoke', 'asian', 'asparagus', 'australian', 'austrian', 'avocado', 'bacon', 'baja', 'baking', 'bananas', 'bar-cookies', 'barbecue', 'bass', 'bath-beauty', 'bean-soup', 'beans', 'beans-soups', 'bear', 'beef', 'beef-liver', 'beef-organ-meats', 'beef-ribs', 'beef-sauces', 'beef-sausage', 'beginner-cook', 'beijing', 'belgian', 'berries', 'beverages', 'birthday', 'biscotti', 'bisques-cream-soups', 'bizarre', 'black-beans', 'blueberries', 'bok-choys', 'brazilian', 'bread-machine', 'bread-pudding', 'breads', 'breakfast', 'breakfast-eggs', 'brewing', 'brisket', 'british-columbian', 'broccoli', 'broil', 'brown-bag', 'brown-rice', 'brownies', 'brunch', 'burgers', 'cabbage', 'cajun', 'cake-fillings-and

In [11]:
# Summarise column dtypes and non-null counts after the parsing steps above.
recipes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113784 entries, 0 to 134153
Data columns (total 28 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   id                                113784 non-null  int64  
 1   name                              113784 non-null  object 
 2   description                       113181 non-null  object 
 3   ingredients_raw                   113784 non-null  object 
 4   steps                             113784 non-null  object 
 5   servings                          113784 non-null  float64
 6   serving_size                      113784 non-null  object 
 7   tags                              113784 non-null  object 
 8   ingredients                       113784 non-null  object 
 9   amounts                           113784 non-null  object 
 10  amount_gram                       113784 non-null  object 
 11  serving_size_numeric              113784 non-null  float6

In [12]:
# Do not truncate long cell contents when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [13]:
# Inspect a single recipe in full, now that column width is unlimited.
recipes.head(1)

Unnamed: 0,id,name,description,ingredients_raw,steps,servings,serving_size,tags,ingredients,amounts,amount_gram,serving_size_numeric,predicted_total,actual_total,approx_rate,total_recipe_weight,recipe_energy_per100g,recipe_carbohydrates_per100g,recipe_proteins_per100g,recipe_fat_per100g,recipe_energy_kcal_per100g,recipe_energy_per_serving,recipe_carbohydrates_per_serving,recipe_proteins_per_serving,recipe_fat_per_serving,recipe_energy_kcal_per_serving,steps_parsed,steps_validated
0,76133,Reuben and Swiss Casserole Bake,"I think this is even better than a reuben sandwich, I bet you will probably eat the whole casserole by yourself! :)","['1/2-1 lb corned beef, cooked and chopped ', '1/4 cup thousand island dressing (Kraft is best)', '1 (16 ounce) can sauerkraut, drained and rinsed with cold water ', '1/2 lb swiss cheese, shredded ', '6 slices rye bread, crumbled into very small pieces ', '1/4 cup butter, melted ']","[""Set oven to 350 degrees F."", ""Butter a 9 x 13-inch casserole dish."", ""Place the corned beef in the bottom on the casserole dish then dot all over with the dressing."", ""Spread the sauerkraut over the top of the dressing then top with the grated Swiss cheese."", ""Toss the breadcrumbs with the butter in a bowl then sprinkle evenly over the the cheese."", ""Bake for 25-30 mins, or until bubbly."", ""NOTE; if desired the rye bread topping can be doubled.""]",4.0,1 (207 g),"[60-minutes-or-less, time-to-make, course, main-ingredient, preparation, casseroles, main-dish, eggs-dairy, oven, cheese, dietary, equipment]","[corned beef, thousand island dressing, sauerkraut, swiss cheese, rye bread, butter]","[{'unit': 'pound', 'amount_min': 0.5, 'amount_max': 1.0}, {'unit': 'cup', 'amount': 0.25}, {'unit': 'ounce', 'amount': 16.0}, {'unit': 'pound', 'amount': 0.5}, {'unit': 'slices', 'amount': 6.0}, {'unit': 'cup', 'amount': 0.25}]","[226.8-453.6, 60.0, 453.6, 226.8, 150.0, 53.9]",207.0,1284.5,828.0,0.644609,1284.5,837.891086,8.031474,12.64761,13.490992,200.260776,1734.434548,16.625151,26.180553,27.926353,414.539806,"['Set oven to 350 degrees F.', 'Butter a 9 x 13-inch casserole dish.', 'Place the corned beef in the bottom on the casserole dish then dot all over with the dressing.', 'Spread the sauerkraut over the top of the dressing then top with the grated Swiss cheese.', 'Toss the breadcrumbs with the butter in a bowl then sprinkle evenly over the the cheese.', 'Bake for 25-30 mins, or until bubbly.', 'NOTE; if desired the rye 
bread topping can be doubled.']",True


In [14]:
# Persist the cleaned frame without the index column.
# NOTE(review): CSV is text, so the list-valued columns parsed above are
# written as strings and must be re-parsed after reloading this file.
recipes.to_csv("recipes_revisited.csv",index=False)

In [1]:
import pandas as pd
import numpy as np
# Reload the cleaned recipes from recipes_revisited.csv (relative to the
# notebook's working directory).
recipes = pd.read_csv("recipes_revisited.csv")

In [2]:
import ast
# The CSV round-trip leaves the ingredient lists as strings; parse them back
# into lists. Only strings are parsed, so re-running this cell on
# already-parsed values is a no-op instead of a ValueError.
recipes['ingredients'] = recipes['ingredients'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [4]:
from collections import Counter

# Gather every ingredient occurrence across all recipes, ignoring any row
# whose 'ingredients' value is not a list.
all_ingredients = [
    item
    for entry in recipes['ingredients']
    if isinstance(entry, list)
    for item in entry
]

# Tally how often each distinct ingredient name appears
ingredient_counts = Counter(all_ingredients)

print(f"Total ingredient occurrences: {len(all_ingredients)}")
print(f"Unique ingredients: {len(ingredient_counts)}")

# Tabulate the counts, most frequent ingredient first
ingredients_df = pd.DataFrame(
    ingredient_counts.items(),
    columns=['ingredient', 'frequency'],
)
ingredients_df = ingredients_df.sort_values('frequency', ascending=False)


Total ingredient occurrences: 968758
Unique ingredients: 24533


In [6]:
# Show both ends of the frequency table: the most common names, and the
# rarest ones (where misspellings and one-off variants tend to collect).
most_common = ingredients_df.head(20)
least_common = ingredients_df.tail(20)

print("\nTop 20 most common ingredients:", most_common, sep="\n")
print("\nLeast common ingredients (potential misspellings):", least_common, sep="\n")


Top 20 most common ingredients:
            ingredient  frequency
11                salt      48042
43               sugar      30782
5               butter      28939
44                eggs      21790
62              garlic      19350
17               water      18116
29               onion      17199
10               flour      14872
8                 milk      14435
57              pepper      13454
6          brown sugar      12147
51           olive oil      11989
27   all-purpose flour      11938
40       baking powder      11856
41         baking soda      10203
121                egg      10103
18             vanilla       9726
131           cinnamon       7920
9      vanilla extract       7441
34         lemon juice       7255

Least common ingredients (potential misspellings):
                                   ingredient  frequency
13456        no-sugar-added crushed pineapple          1
13458              frozen whole-grain waffles          1
13460                        b

In [7]:
from fuzzywuzzy import fuzz
from collections import defaultdict
import time
import json

def cluster_ingredients_optimized(ingredients_df, similarity_threshold=85,
                                  prefilter_threshold=70, partial_threshold=90):
    """
    Greedily cluster ingredient names that look like variants of each other.

    Ingredients are visited from most to least frequent. Each name that is
    not yet assigned opens a new cluster and becomes its canonical name;
    every later unassigned name that is similar enough is added to that
    cluster and is never considered again.

    Two names are "similar enough" when their `fuzz.ratio` exceeds
    `prefilter_threshold` AND at least one of these holds:
      * `fuzz.ratio` exceeds `similarity_threshold`,
      * `fuzz.partial_ratio` exceeds `partial_threshold`,
      * both names are longer than 4 characters and one contains the other.
    All comparisons are case-insensitive. Pairs whose lengths differ by more
    than a factor of two are skipped without scoring.

    Parameters
    ----------
    ingredients_df : pandas.DataFrame
        Must have an 'ingredient' column (strings) and a 'frequency' column.
    similarity_threshold : int, default 85
        `fuzz.ratio` score above which two names match outright.
    prefilter_threshold : int, default 70
        Minimum `fuzz.ratio` score required before the partial-ratio and
        containment checks are evaluated at all.
    partial_threshold : int, default 90
        `fuzz.partial_ratio` score above which two names match.

    Returns
    -------
    list of dict
        One dict per cluster with more than one member, with keys
        'canonical', 'variants' (canonical first), 'frequencies'
        (variant -> frequency) and 'total_frequency'. Progress and timing
        are printed as a side effect.
    """
    print(f"Starting clustering of {len(ingredients_df)} ingredients...")
    start_time = time.time()

    # Sort by frequency - process common ingredients first
    ingredients_df = ingredients_df.sort_values('frequency', ascending=False)
    all_ingredients = ingredients_df.to_dict('records')

    # Lower-case every name once up front instead of on every comparison.
    lowered = [record['ingredient'].lower() for record in all_ingredients]

    clusters = []
    processed = set()

    for i, ing1 in enumerate(all_ingredients):
        if i % 1000 == 0:
            elapsed = time.time() - start_time
            print(f"Processed {i}/{len(all_ingredients)} ingredients ({elapsed/60:.1f} minutes)")

        name1 = ing1['ingredient']
        if name1 in processed:
            continue

        lower1 = lowered[i]
        len1 = len(name1)

        # Start new cluster
        cluster = {
            'canonical': name1,
            'variants': [name1],
            'frequencies': {name1: ing1['frequency']}
        }
        processed.add(name1)

        # Only check ingredients that haven't been processed
        for j in range(i + 1, len(all_ingredients)):
            ing2 = all_ingredients[j]
            name2 = ing2['ingredient']

            if name2 in processed:
                continue

            # Quick length check - very different lengths unlikely to match.
            # Integer form of "len1/len2 outside [0.5, 2.0]"; unlike the
            # division it cannot raise ZeroDivisionError on an empty name.
            len2 = len(name2)
            if 2 * len1 < len2 or len1 > 2 * len2:
                continue

            lower2 = lowered[j]

            # Calculate similarity
            ratio = fuzz.ratio(lower1, lower2)

            # If close enough, do more expensive checks
            if ratio > prefilter_threshold:
                partial = fuzz.partial_ratio(lower1, lower2)

                # Check containment for ingredients > 4 chars
                contains = (
                    len1 > 4 and len2 > 4
                    and (lower1 in lower2 or lower2 in lower1)
                )

                if ratio > similarity_threshold or partial > partial_threshold or contains:
                    cluster['variants'].append(name2)
                    cluster['frequencies'][name2] = ing2['frequency']
                    processed.add(name2)

        # Only keep clusters with multiple items
        if len(cluster['variants']) > 1:
            # Calculate total frequency
            cluster['total_frequency'] = sum(cluster['frequencies'].values())
            clusters.append(cluster)

    total_time = time.time() - start_time
    print(f"\nClustering completed in {total_time/60:.1f} minutes")

    return clusters

# Cluster the full ingredient table (slow: every unassigned pair is compared)
print("Running ingredient clustering...")
clusters = cluster_ingredients_optimized(ingredients_df)

# Put the highest-impact clusters (largest combined frequency) first
clusters.sort(key=lambda entry: entry['total_frequency'], reverse=True)

# Persist the full cluster records
with open('ingredient_clusters.json', 'w', encoding='utf-8') as clusters_file:
    json.dump(clusters, clusters_file, indent=2, ensure_ascii=False)

# Preview the 20 largest clusters
print(f"\nFound {len(clusters)} ingredient clusters")
print("\nTop 20 clusters by total frequency:")
for rank, entry in enumerate(clusters[:20], start=1):
    print(f"\n{rank}. {entry['canonical']} (total frequency: {entry['total_frequency']})")
    print(f"   Variants: {', '.join(entry['variants'])}")

# Lookup table: every variant name -> the full variant list of its cluster
ingredient_mapping = {
    variant: entry['variants']
    for entry in clusters
    for variant in entry['variants']
}

# Persist the lookup table
with open('ingredient_mapping.json', 'w', encoding='utf-8') as mapping_file:
    json.dump(ingredient_mapping, mapping_file, indent=2, ensure_ascii=False)

print("\nSaved to:")
print("- ingredient_clusters.json (full cluster data)")
print("- ingredient_mapping.json (variant lookup)")

Running ingredient clustering...
Starting clustering of 24533 ingredients...
Processed 0/24533 ingredients (0.0 minutes)
Processed 1000/24533 ingredients (0.5 minutes)
Processed 2000/24533 ingredients (0.9 minutes)
Processed 3000/24533 ingredients (1.2 minutes)
Processed 4000/24533 ingredients (1.5 minutes)
Processed 5000/24533 ingredients (1.7 minutes)
Processed 6000/24533 ingredients (1.9 minutes)
Processed 7000/24533 ingredients (2.1 minutes)
Processed 8000/24533 ingredients (2.3 minutes)
Processed 9000/24533 ingredients (2.5 minutes)
Processed 10000/24533 ingredients (2.6 minutes)
Processed 11000/24533 ingredients (2.8 minutes)
Processed 12000/24533 ingredients (2.9 minutes)
Processed 13000/24533 ingredients (3.0 minutes)
Processed 14000/24533 ingredients (3.1 minutes)
Processed 15000/24533 ingredients (3.3 minutes)
Processed 16000/24533 ingredients (3.3 minutes)
Processed 17000/24533 ingredients (3.4 minutes)
Processed 18000/24533 ingredients (3.5 minutes)
Processed 19000/24533 in