<a href="https://www.kaggle.com/code/tommasofacchin/02-model-training-foodcom?scriptVersionId=298020653" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import ast

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

The dataset used here was created in the notebook [01_data_preparation_foodcom](https://www.kaggle.com/code/tommasofacchin/01-data-preparation-foodcom/).

## Dataset preparation

In [2]:
# Load recipes.csv from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/datasets/tommasofacchin/recipes-food-com/recipes.csv"
)

# Print the (rows, columns) shape, then show the first five rows as the cell output.
print(df.shape)
df.head(n=5)

(93254, 13)


Unnamed: 0,recipe_id,name,minutes,tags,ingredients,n_ingredients,n_steps,description,avg_rating,n_reviews,cuisine,allergens,calories
0,137739,arriba baked winter squash mexican style,55,"['60-minutes-or-less', 'time-to-make', 'course...","['winter squash', 'mexican seasoning', 'mixed ...",7,11,autumn is my favorite time of year to cook! th...,5.0,3,american,['milk'],51.5
1,31490,a bit different breakfast pizza,30,"['30-minutes-or-less', 'time-to-make', 'course...","['prepared pizza crust', 'sausage patty', 'egg...",6,9,this recipe calls for the crust to be prebaked...,3.5,4,american,"['egg', 'milk']",173.4
2,75452,beat this banana bread,70,"['weeknight', 'time-to-make', 'course', 'main-...","['sugar', 'unsalted butter', 'bananas', 'eggs'...",9,12,from ann hodgman's,4.4,5,unknown,"['wheat', 'egg', 'milk']",2669.3
3,63986,chicken lickin good pork chops,500,"['weeknight', 'time-to-make', 'course', 'main-...","['lean pork chops', 'flour', 'salt', 'dry must...",7,5,here's and old standby i enjoy from time to ti...,4.368421,19,unknown,['wheat'],105.7
4,43026,chile rellenos,45,"['60-minutes-or-less', 'time-to-make', 'course...","['egg roll wrap', 'whole green chilies', 'chee...",5,9,a favorite from a local restaurant no longer i...,4.045455,22,american,"['egg', 'milk']",94.0


In [3]:
# Add log(1 + n_reviews) to df as an extra numeric column.
df["log_n_reviews"] = np.log1p(df["n_reviews"])

# Feature column names, grouped by kind.
cat_cols = ["cuisine"]
num_cols = ["minutes", "n_ingredients", "n_steps", "calories", "log_n_reviews"]

# Column subsets for each group.
X_num, X_cat = df[num_cols], df[cat_cols]

# Regression target.
y = df["avg_rating"]

In [4]:
# Numeric columns pass through unchanged; the categorical column is one-hot
# encoded, with categories not seen during fit ignored at transform time.
preprocess = ColumnTransformer(
    [
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# Feature matrix: numeric columns followed by the categorical ones.
X = df[[*num_cols, *cat_cols]]

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model

In [5]:
# Random forest: 200 trees, no depth limit, fixed seed, all available cores.
rf = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1)

# Chain preprocessing and the regressor so they are fitted and applied together.
model = Pipeline([("preprocess", preprocess), ("regressor", rf)])

# Fit on the training split; the fitted pipeline is the cell output.
model.fit(X_train, y_train)

In [6]:
# Predict on the held-out validation rows.
y_pred = model.predict(X_val)

# Error metrics on the validation split.
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


MSE: 0.44783587466960384
RMSE: 0.6692054054396183
MAE: 0.49956137494787295


In [7]:
# Save the fitted pipeline to a file in the current working directory.
joblib.dump(value=model, filename="smart_pantry_model.joblib")

['smart_pantry_model.joblib']

## Test

In [8]:
# Example user preferences driving the recommendation filter below.
# NOTE(review): none of the allergen labels visible in this notebook's outputs
# contain "nuts" ("peanut" is listed on its own) - confirm the dataset's
# allergen vocabulary so this entry actually excludes something.
user_allergens = ["milk", "nuts"]  # allergens to exclude
user_max_time = 40  # upper bound compared against the "minutes" column
user_cuisines = ["asian"]  # accepted values of the "cuisine" column
top_k = 5  # number of top-ranked recipes to keep

def has_forbidden_allergen(allergens_list, forbidden):
    """Return True if any allergen in ``forbidden`` appears in ``allergens_list``.

    Parameters
    ----------
    allergens_list : list, tuple, set or str
        Allergens of one recipe. A string holding a Python list literal
        (e.g. ``"['egg', 'milk']"``, which is how a list column comes back
        from ``pd.read_csv``) is parsed into a real list first.
    forbidden : iterable of str
        Allergen labels the user must avoid.

    Returns
    -------
    bool
        True when at least one forbidden label is an element of the recipe's
        allergens. Matching is exact, so labels must use the dataset's
        vocabulary.

    Notes
    -----
    Without parsing, ``a in allergens_list`` on the raw CSV string is a
    substring test, so ``"nut"`` would match ``"peanut"`` and ``"fish"`` would
    match ``"shellfish"``. A string that cannot be parsed as a list literal
    keeps the previous substring behaviour, so such rows are never filtered
    less strictly than before.
    """
    if isinstance(allergens_list, str):
        try:
            parsed = ast.literal_eval(allergens_list)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None
        # Only swap in the parsed value when it really is a collection of labels.
        if isinstance(parsed, (list, tuple, set, frozenset)):
            allergens_list = parsed
    return any(a in allergens_list for a in forbidden)

# Keep recipes that fit the time budget, belong to an accepted cuisine and
# contain none of the user's forbidden allergens.
mask = (
    (df["minutes"] <= user_max_time) &
    (df["cuisine"].isin(user_cuisines)) &
    (~df["allergens"].apply(lambda lst: has_forbidden_allergen(lst, user_allergens)))
)

# Work on a copy so adding the prediction column does not touch df.
df_candidates = df[mask].copy()

# Feature columns passed to the model for scoring.
cols = ["minutes", "n_ingredients", "n_steps", "calories", "log_n_reviews", "cuisine"]

# Defensive: derive the log-review feature if it is not already on the frame.
if "log_n_reviews" not in df_candidates.columns:
    df_candidates["log_n_reviews"] = np.log1p(df_candidates["n_reviews"])

X_cand = df_candidates[cols]

# scikit-learn raises ValueError on zero-row input, so when no recipe passes
# the filters assign an empty prediction column instead of calling predict.
if len(X_cand) == 0:
    df_candidates["predicted_rating"] = np.array([], dtype=float)
else:
    df_candidates["predicted_rating"] = model.predict(X_cand)

# Rank by predicted rating, breaking ties by review count (both descending).
df_candidates = df_candidates.sort_values(
    ["predicted_rating", "n_reviews"],
    ascending=[False, False]
)

# Despite the name, this holds the first top_k rows.
top5 = df_candidates.head(top_k)

top5[["recipe_id", "name", "cuisine", "minutes",
      "avg_rating", "predicted_rating", "allergens"]]

Unnamed: 0,recipe_id,name,cuisine,minutes,avg_rating,predicted_rating,allergens
60569,419406,orange chicken style tofu,asian,30,5.0,4.932083,['soy']
75648,241412,simple sweet and sour chicken,asian,25,5.0,4.917083,"['wheat', 'soy']"
48549,230831,korean pasta salad,asian,20,5.0,4.910417,"['wheat', 'sesame', 'soy', 'egg']"
3430,424216,aromatic vegetable fried rice su cai chao fan,asian,25,5.0,4.908417,"['sesame', 'peanut', 'soy']"
8388,173469,beef and scallops,asian,20,5.0,4.905,"['shellfish', 'peanut', 'soy']"
