In [1]:
import numpy as np 
import pandas as pd 
import ast

from collections import Counter

# Dataset preparation

In [2]:
df_interact = pd.read_csv("/kaggle/input/food-com-recipes-and-user-interactions/RAW_interactions.csv")
df_recipes = pd.read_csv("/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv")

print(f"Interactions: {df_interact.shape}")
print(f"Recipes: {df_recipes.shape}")
df_interact.head()

Interactions: (1132367, 5)
Recipes: (231637, 12)


Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


## Ratings

In [3]:
grouped = (
    df_interact
    .groupby("recipe_id")["rating"]
    .agg(avg_rating="mean", n_reviews="count")
    .reset_index()
)

grouped.head()

Unnamed: 0,recipe_id,avg_rating,n_reviews
0,38,4.25,4
1,39,3.0,1
2,40,4.333333,9
3,41,4.5,2
4,43,1.0,1


In [4]:
df_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [5]:
df_recipes = df_recipes.rename(columns={"id": "recipe_id"})

cols = [
    "recipe_id",   
    "name",
    "minutes",
    "tags",
    "ingredients",
    "n_ingredients",
    "n_steps",
    "nutrition",
    "description"
]


df = df_recipes[cols]

df = df.merge(grouped, on="recipe_id", how="left")

print(f"Shape: {df.shape}")
df.head()

Shape: (231637, 11)


Unnamed: 0,recipe_id,name,minutes,tags,ingredients,n_ingredients,n_steps,nutrition,description,avg_rating,n_reviews
0,137739,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...","['winter squash', 'mexican seasoning', 'mixed ...",7,11,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",autumn is my favorite time of year to cook! th...,5.0,3
1,31490,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...","['prepared pizza crust', 'sausage patty', 'egg...",6,9,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",this recipe calls for the crust to be prebaked...,3.5,4
2,112140,all in the kitchen chili,130,"['time-to-make', 'course', 'preparation', 'mai...","['ground beef', 'yellow onions', 'diced tomato...",13,6,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",this modified version of 'mom's' chili was a h...,4.0,1
3,59389,alouette potatoes,45,"['60-minutes-or-less', 'time-to-make', 'course...","['spreadable cheese with garlic and herbs', 'n...",11,11,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]","this is a super easy, great tasting, make ahea...",4.5,2
4,44061,amish tomato ketchup for canning,190,"['weeknight', 'time-to-make', 'course', 'main-...","['tomato juice', 'apple cider vinegar', 'sugar...",8,5,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",my dh's amish mother raised him on this recipe...,5.0,1


In [6]:
df["n_reviews"].value_counts()

n_reviews
1       91953
2       45120
3       25259
4       15712
5       10709
        ...  
252         1
298         1
455         1
1305        1
410         1
Name: count, Length: 362, dtype: int64

In [7]:
df.isna().sum()

recipe_id           0
name                1
minutes             0
tags                0
ingredients         0
n_ingredients       0
n_steps             0
nutrition           0
description      4979
avg_rating          0
n_reviews           0
dtype: int64

In [8]:
df = df[~df["name"].isna()]

df["description"] = df["description"].fillna("")

#df = df[df["n_reviews"] >= 3]

In [9]:
print(df["minutes"].describe())
print(df["avg_rating"].describe())

count    2.316360e+05
mean     9.398587e+03
std      4.461973e+06
min      0.000000e+00
25%      2.000000e+01
50%      4.000000e+01
75%      6.500000e+01
max      2.147484e+09
Name: minutes, dtype: float64
count    231636.000000
mean          4.346265
std           0.990767
min           0.000000
25%           4.000000
50%           4.714286
75%           5.000000
max           5.000000
Name: avg_rating, dtype: float64


In [10]:
upper = df["minutes"].quantile(0.99)  
df = df[(df["minutes"] > 0) & (df["minutes"] <= upper) & (df["n_reviews"] >= 3)]

print(df["minutes"].describe())
print(df["avg_rating"].describe())

count    93254.000000
mean        63.486210
std         92.564268
min          1.000000
25%         20.000000
50%         36.000000
75%         65.000000
max        900.000000
Name: minutes, dtype: float64
count    93254.000000
mean         4.419236
std          0.644096
min          0.000000
25%          4.166667
50%          4.625000
75%          4.909091
max          5.000000
Name: avg_rating, dtype: float64


## Tags

In [11]:
def parse_tags(x):
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []

tags_lists = df["tags"].apply(parse_tags)

counter = Counter()
for tags in tags_lists:
    counter.update(t.lower() for t in tags)

most_common_tags = counter.most_common(500)

tags_only = [tag for tag, count in most_common_tags]
print(", ".join(tags_only))

preparation, time-to-make, course, dietary, main-ingredient, easy, occasion, cuisine, low-in-something, equipment, main-dish, 60-minutes-or-less, number-of-servings, taste-mood, meat, vegetables, 30-minutes-or-less, north-american, 4-hours-or-less, low-carb, 15-minutes-or-less, low-sodium, healthy, desserts, 3-steps-or-less, dinner-party, low-cholesterol, low-calorie, beginner-cook, oven, vegetarian, holiday-event, 5-ingredients-or-less, inexpensive, low-protein, american, low-saturated-fat, eggs-dairy, kid-friendly, comfort-food, fruit, side-dishes, pasta-rice-and-grains, healthy-2, presentation, stove-top, european, lunch, poultry, low-fat, for-1-or-2, seasonal, to-go, weeknight, chicken, brunch, appetizers, beef, one-dish-meal, cheese, for-large-groups, breads, breakfast, salads, seafood, asian, pork, pasta, potluck, served-hot, high-in-something, technique, free-of-something, cookies-and-brownies, condiments-etc, potatoes, beverages, sweet, savory, vegan, soups-stews, summer, very-

In [12]:
ETHNICITY_MAP = {
    # Italian
    "italian": "italian",
    "sicilian": "italian",
    "tuscan": "italian",

    # American / North American
    "american": "american",
    "north-american": "american",
    "southern-united-states": "american",
    "southwestern-united-states": "american",
    "northeastern-united-states": "american",
    "midwestern": "american",
    "californian": "american",
    "pacific-northwest": "american",
    "native-american": "american",

    # Mexican / Latin
    "mexican": "mexican",
    "tex-mex": "mexican",
    "central-american": "latin_american",
    "brazilian": "latin_american",
    "argentine": "latin_american",
    "chilean": "latin_american",
    "peruvian": "latin_american",
    "ecuadorean": "latin_american",
    "venezuelan": "latin_american",
    "cuban": "latin_american",
    "caribbean": "latin_american",
    "baja": "latin_american",

    # European
    "european": "european",
    "greek": "greek",
    "french": "french",
    "spanish": "spanish",
    "german": "german",
    "scandinavian": "scandinavian",
    "irish": "irish",
    "english": "british",
    "scottish": "british",
    "swiss": "european",
    "dutch": "european",
    "portuguese": "european",
    "polish": "european",
    "hungarian": "european",
    "russian": "european",
    "austrian": "european",
    "belgian": "european",
    "czech": "european",
    "finnish": "european",

    # Asian
    "asian": "asian",
    "indian": "indian",
    "chinese": "chinese",
    "szechuan": "chinese",
    "cantonese": "chinese",
    "japanese": "japanese",
    "korean": "korean",
    "thai": "thai",
    "vietnamese": "asian",
    "indonesian": "asian",
    "malaysian": "asian",
    "filipino": "asian",
    "laotian": "asian",
    "mongolian": "asian",
    "cambodian": "asian",
    "pakistani": "asian",
    "hunan": "chinese",
    "beijing": "chinese",

    # Middle Eastern / North African
    "middle-eastern": "middle_eastern",
    "lebanese": "middle_eastern",
    "moroccan": "middle_eastern",
    "turkish": "middle_eastern",
    "saudi-arabian": "middle_eastern",
    "iranian-persian": "middle_eastern",
    "egyptian": "middle_eastern",
    "palestinian": "middle_eastern",

    # African
    "african": "african",
    "south-african": "african",
    "ethiopian": "african",
    "nigerian": "african",
    "angolan": "african",
    "somalian": "african",

    # Pacific
    "hawaiian": "pacific",
    "polynesian": "pacific",
    "micro-melanesia": "pacific",
    "new-zealand": "pacific",
    "australian": "pacific",
}

In [13]:
def extract_cuisine(tags_raw):
    tags = parse_tags(tags_raw)
    for t in tags:
        t_lower = t.lower()
        if t_lower in ETHNICITY_MAP:
            return ETHNICITY_MAP[t_lower]
    return "unknown"

df["cuisine"] = df["tags"].apply(extract_cuisine)


print(df["cuisine"].value_counts().head(20))

df.head()

cuisine
unknown           54382
american          21551
european           5641
asian              5034
pacific            1394
african            1237
greek              1046
french              809
german              553
scandinavian        483
spanish             376
latin_american      253
middle_eastern      178
mexican             130
indian              103
chinese              84
Name: count, dtype: int64


Unnamed: 0,recipe_id,name,minutes,tags,ingredients,n_ingredients,n_steps,nutrition,description,avg_rating,n_reviews,cuisine
0,137739,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...","['winter squash', 'mexican seasoning', 'mixed ...",7,11,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",autumn is my favorite time of year to cook! th...,5.0,3,american
1,31490,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...","['prepared pizza crust', 'sausage patty', 'egg...",6,9,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",this recipe calls for the crust to be prebaked...,3.5,4,american
9,75452,beat this banana bread,70,"['weeknight', 'time-to-make', 'course', 'main-...","['sugar', 'unsalted butter', 'bananas', 'eggs'...",9,12,"[2669.3, 160.0, 976.0, 107.0, 62.0, 310.0, 138.0]",from ann hodgman's,4.4,5,unknown
15,63986,chicken lickin good pork chops,500,"['weeknight', 'time-to-make', 'course', 'main-...","['lean pork chops', 'flour', 'salt', 'dry must...",7,5,"[105.7, 8.0, 0.0, 26.0, 5.0, 4.0, 3.0]",here's and old standby i enjoy from time to ti...,4.368421,19,unknown
16,43026,chile rellenos,45,"['60-minutes-or-less', 'time-to-make', 'course...","['egg roll wrap', 'whole green chilies', 'chee...",5,9,"[94.0, 10.0, 0.0, 11.0, 11.0, 21.0, 0.0]",a favorite from a local restaurant no longer i...,4.045455,22,american


## Allergens

In [14]:
ALLERGEN_KEYWORDS = {
    "milk": [
        "milk", "butter", "cream", "cheese", "yogurt", "mozzarella", "parmesan",
        "gouda", "cheddar", "buttermilk", "whey"
    ],
    "egg": [
        "egg", "egg yolk", "egg white", "mayonnaise"
    ],
    "nuts": [
        "almond", "walnut", "hazelnut", "cashew", "pistachio", "pecan",
        "macadamia", "brazil nut"
    ],
    "peanut": [
        "peanut", "peanut butter"
    ],
    "fish": [
        "salmon", "tuna", "cod", "trout", "halibut", "anchovy", "sardine"
    ],
    "shellfish": [
        "shrimp", "prawn", "lobster", "crab", "scallop", "mussel", "clam", "oyster"
    ],
    "wheat": [
        "wheat", "flour", "bread", "pasta", "noodles", "semolina"
    ],
    "soy": [
        "soy", "soybean", "tofu", "tempeh", "edamame", "soy sauce"
    ],
    "sesame": [
        "sesame", "tahini"
    ],
}

def parse_ingredients(x):
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return []


def detect_allergens(ingredients_raw):
    ingredients = parse_ingredients(ingredients_raw)
    ingredients_lower = [ing.lower() for ing in ingredients]

    found = set()

    for allergen, keywords in ALLERGEN_KEYWORDS.items():
        for ing in ingredients_lower:
            if any(kw in ing for kw in keywords):
                found.add(allergen)
                break  

    return list(found)

df["allergens"] = df["ingredients"].apply(detect_allergens)

df.head()

Unnamed: 0,recipe_id,name,minutes,tags,ingredients,n_ingredients,n_steps,nutrition,description,avg_rating,n_reviews,cuisine,allergens
0,137739,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...","['winter squash', 'mexican seasoning', 'mixed ...",7,11,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",autumn is my favorite time of year to cook! th...,5.0,3,american,[milk]
1,31490,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...","['prepared pizza crust', 'sausage patty', 'egg...",6,9,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",this recipe calls for the crust to be prebaked...,3.5,4,american,"[milk, egg]"
9,75452,beat this banana bread,70,"['weeknight', 'time-to-make', 'course', 'main-...","['sugar', 'unsalted butter', 'bananas', 'eggs'...",9,12,"[2669.3, 160.0, 976.0, 107.0, 62.0, 310.0, 138.0]",from ann hodgman's,4.4,5,unknown,"[milk, wheat, egg]"
15,63986,chicken lickin good pork chops,500,"['weeknight', 'time-to-make', 'course', 'main-...","['lean pork chops', 'flour', 'salt', 'dry must...",7,5,"[105.7, 8.0, 0.0, 26.0, 5.0, 4.0, 3.0]",here's and old standby i enjoy from time to ti...,4.368421,19,unknown,[wheat]
16,43026,chile rellenos,45,"['60-minutes-or-less', 'time-to-make', 'course...","['egg roll wrap', 'whole green chilies', 'chee...",5,9,"[94.0, 10.0, 0.0, 11.0, 11.0, 21.0, 0.0]",a favorite from a local restaurant no longer i...,4.045455,22,american,"[milk, egg]"


## Calories

In [15]:
def get_calories(x):
    try:
        vals = ast.literal_eval(x) if isinstance(x, str) else x
    except (ValueError, SyntaxError):
        return np.nan
    if not isinstance(vals, (list, tuple)) or len(vals) == 0:
        return np.nan
    return vals[0]  

df["calories"] = df["nutrition"].apply(get_calories)
df = df.drop(columns=["nutrition"])

df.head(20)

Unnamed: 0,recipe_id,name,minutes,tags,ingredients,n_ingredients,n_steps,description,avg_rating,n_reviews,cuisine,allergens,calories
0,137739,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...","['winter squash', 'mexican seasoning', 'mixed ...",7,11,autumn is my favorite time of year to cook! th...,5.0,3,american,[milk],51.5
1,31490,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...","['prepared pizza crust', 'sausage patty', 'egg...",6,9,this recipe calls for the crust to be prebaked...,3.5,4,american,"[milk, egg]",173.4
9,75452,beat this banana bread,70,"['weeknight', 'time-to-make', 'course', 'main-...","['sugar', 'unsalted butter', 'bananas', 'eggs'...",9,12,from ann hodgman's,4.4,5,unknown,"[milk, wheat, egg]",2669.3
15,63986,chicken lickin good pork chops,500,"['weeknight', 'time-to-make', 'course', 'main-...","['lean pork chops', 'flour', 'salt', 'dry must...",7,5,here's and old standby i enjoy from time to ti...,4.368421,19,unknown,[wheat],105.7
16,43026,chile rellenos,45,"['60-minutes-or-less', 'time-to-make', 'course...","['egg roll wrap', 'whole green chilies', 'chee...",5,9,a favorite from a local restaurant no longer i...,4.045455,22,american,"[milk, egg]",94.0
17,23933,chinese candy,15,"['15-minutes-or-less', 'time-to-make', 'course...","['butterscotch chips', 'chinese noodles', 'sal...",3,4,"a little different, and oh so good. i include ...",4.833333,12,unknown,"[milk, peanut, wheat]",232.7
18,8559,chinese chop suey,70,"['weeknight', 'time-to-make', 'course', 'main-...","['celery', 'onion', 'ground pork', 'soy sauce'...",7,8,easy one-pot dinner.,2.0,4,american,[soy],395.4
20,76808,cream of spinach soup,45,"['60-minutes-or-less', 'time-to-make', 'course...","['water', 'salt', 'boiling potatoes', 'fresh s...",8,9,"wonderful comfort food from rozanne gold, a fa...",4.666667,3,unknown,[milk],126.0
22,83873,crispy crunchy chicken,35,"['60-minutes-or-less', 'time-to-make', 'course...","['boneless skinless chicken breast halves', 'c...",10,8,"delicious, crunchy fried chicken. this recipe ...",3.0,3,unknown,"[milk, wheat, egg]",335.8
24,49262,easiest ever hollandaise sauce,25,"['30-minutes-or-less', 'time-to-make', 'course...","['butter', 'lemon, juice of', 'salt', 'white p...",5,7,the secret to this easy hollandaise sauce is i...,3.5,4,unknown,"[milk, egg]",1290.4


In [16]:
df.to_csv("recipes.csv", index=False)
df.to_parquet("recipes.parquet")