In [1]:
import ast
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Directory that holds the downloaded dataset files; edit to match your local copy
root = "data"

In [2]:
# Load the pre-split interaction tables: training, validation, and test
train = pd.read_csv(f"{root}/interactions_train.csv")
valid = pd.read_csv(f"{root}/interactions_validation.csv")
test = pd.read_csv(f"{root}/interactions_test.csv")

In [3]:
# Preview the training interactions table loaded from interactions_train.csv
train

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723
...,...,...,...,...,...,...
698896,926904,457971,2018-12-18,5.0,13681,141067
698897,2002312797,27208,2018-12-18,5.0,14897,99787
698898,1290903,131607,2018-12-18,5.0,11605,76163
698899,226867,363072,2018-12-18,5.0,3604,29101


In [4]:
# Load the preprocessed recipe and user tables, plus the ingredient map
recipes = pd.read_csv(f"{root}/PP_recipes.csv")
users = pd.read_csv(f"{root}/PP_users.csv")
# NOTE(review): unpickling can execute arbitrary code; only load ingr_map.pkl from a trusted source
ingredients = pd.read_pickle(f"{root}/ingr_map.pkl")

In [5]:
# First two rows of the preprocessed recipes table
recipes.head(2)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."


In [6]:
# First two rows of the preprocessed users table
users.head(2)

Unnamed: 0,u,techniques,items,n_items,ratings,n_ratings
0,0,"[8, 0, 0, 5, 6, 0, 0, 1, 0, 9, 1, 0, 0, 0, 1, ...","[1118, 27680, 32541, 137353, 16428, 28815, 658...",31,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...",31
1,1,"[11, 0, 0, 2, 12, 0, 0, 0, 0, 14, 5, 0, 0, 0, ...","[122140, 77036, 156817, 76957, 68818, 155600, ...",39,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",39


In [7]:
# First two rows of the ingredient map
ingredients.head(2)

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308


In [8]:
# Load the raw tables: per-review ratings/text and the full recipe details
interactions = pd.read_csv(f"{root}/RAW_interactions.csv")
recipe_info = pd.read_csv(f"{root}/RAW_recipes.csv")

In [9]:
# First two rows of the raw interactions (rating plus review text)
interactions.head(2)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."


In [10]:
# First two rows of the raw recipe details
recipe_info.head(2)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6


In [11]:
# Pull the review text out of the raw interactions as a plain list;
# missing reviews are replaced with empty strings
reviews = interactions['review'].fillna('').tolist()

In [12]:
# Vectorize the review text with TF-IDF
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    min_df=5,
    max_df=0.5,
)
X = vectorizer.fit_transform(reviews)

In [13]:
# Regression target: the rating attached to each interaction, as a plain list
y = interactions["rating"].to_list()

In [14]:
# Inspect the layout of the preprocessed recipes table: column names, shape, first rows
print(f"Recipes columns: {recipes.columns.tolist()}")
print(f"\nRecipes shape: {recipes.shape}")
print("\nFirst few rows:")
recipes.head(5)

Recipes columns: ['id', 'i', 'name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'calorie_level', 'ingredient_ids']

Recipes shape: (178265, 8)

First few rows:


Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,168258,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"


In [15]:
# Feature matrix for recipes: sparse ingredient columns plus the calorie level

# ingredient_ids may arrive as the string form of a list; parse it once.
# The isinstance guard keeps this cell safe to re-run, and the length check
# avoids indexing into an empty table.
if len(recipes) > 0 and isinstance(recipes['ingredient_ids'].iloc[0], str):
    recipes['ingredient_ids'] = recipes['ingredient_ids'].apply(ast.literal_eval)

ingredient_lists = recipes['ingredient_ids'].tolist()

# One column per ingredient id, so the width is the largest id + 1.
# default=-1 yields a zero-width matrix instead of raising when no recipe
# lists any ingredient.
max_ingr = max((max(ids) for ids in ingredient_lists if len(ids) > 0), default=-1) + 1

# Coordinate triplets: one entry of 1 per (recipe row, ingredient id) pair
rows = [row for row, ids in enumerate(ingredient_lists) for _ in ids]
cols = [ingr for ids in ingredient_lists for ingr in ids]
data = [1] * len(cols)

ingredient_matrix = csr_matrix((data, (rows, cols)), shape=(len(recipes), max_ingr))

# calorie_level becomes a single extra column next to the ingredient columns
calorie_features = recipes['calorie_level'].to_numpy().reshape(-1, 1)
calorie_matrix = csr_matrix(calorie_features)

# format='csr' makes the result row-indexable no matter which format hstack
# would pick by default; later cells select rows from recipe_features
recipe_features = hstack([ingredient_matrix, calorie_matrix], format='csr')

In [16]:
# Align interactions with recipe feature rows

# Map each recipe id to its *positional* row in `recipes` / `recipe_features`.
# enumerate() is used rather than the index labels so the mapping stays correct
# even when `recipes` does not carry a default 0..n-1 index.
recipe_id_to_row = {rid: pos for pos, rid in enumerate(recipes['id'])}

recipe_rows = interactions['recipe_id'].map(recipe_id_to_row)
valid_mask = recipe_rows.notna()

# Filter interactions to only keep recipes that exist in the preprocessed recipes
interactions = interactions[valid_mask]
recipe_rows = recipe_rows[valid_mask].astype(int)

# Re-extract reviews and rebuild the TF-IDF matrix so its rows match the
# filtered interactions one-to-one
reviews = interactions['review'].fillna('').tolist()
vectorizer = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_df=0.5)
X = vectorizer.fit_transform(reviews)

# Gather one recipe feature row per interaction. tocsr() guarantees the matrix
# supports row indexing, and a plain integer array is passed as the index
# instead of a pandas Series.
R = recipe_features.tocsr()[recipe_rows.to_numpy()]

# Text features and recipe metadata must line up row-for-row with the interactions
assert X.shape[0] == R.shape[0] == len(interactions)

In [17]:
# Stack the two feature blocks side by side:
#   X - TF-IDF matrix of the review text
#   R - recipe metadata matrix
X_full = hstack([X, R])

# Regression target as a numpy array
y = interactions['rating'].to_numpy()

In [18]:
# Train a ridge regression model and report its error on a held-out split

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=42,
)

model = Ridge(alpha=1.0, solver="lsqr").fit(X_train, y_train)

# Mean absolute error of the predicted ratings on the held-out rows
preds = model.predict(X_test)
print("MAE:", np.abs(preds - y_test).mean())


MAE: 0.653308323283204


In [19]:
# Look up user 2046 and show a few of the recipes they have reviewed;
# if that user is absent, fall back to the user of the first interaction
user_interactions = interactions.loc[interactions['user_id'] == 2046]
print(f"User 2046 has {len(user_interactions)} interactions")
if user_interactions.empty:
    print("User 2046 not found. Let's find a valid user:")
    sample_user = interactions['user_id'].iloc[0]
    print(f"Using user {sample_user} instead")
else:
    print("Sample interactions:")
    print(user_interactions[['recipe_id', 'rating']].head())

User 2046 has 6 interactions
Sample interactions:
        recipe_id  rating
284222       4684       5
481291       4038       2
557921        517       5
613784       4523       2
645705       3431       5


In [None]:
from scipy.sparse import hstack

def recommend_recipes_for_user(user_id, top_k=10, preference=None):
    """Rank the recipes a user has not reviewed by the model's predicted rating.

    Candidate rows use the same column layout as the training matrix:
    [TF-IDF block | ingredient block | calorie_level]. Candidates carry no
    review text, so their TF-IDF block is all zeros; as a consequence
    ``preference="reviews"`` scales only zeros and cannot change the ranking.

    Relies on the notebook globals ``X``, ``ingredient_matrix``,
    ``interactions``, ``recipes``, ``recipe_id_to_row``, ``recipe_features``
    and ``model``.

    Parameters
    ----------
    user_id
        Value matched against ``interactions['user_id']``.
    top_k : int, default 10
        Number of recommendations to print and return.
    preference : str or None, default None
        ``"reviews"`` / ``"ingredients"`` multiply that feature block by 3.
        ``"nutrition"`` / ``"calories"`` multiply the calorie_level column by 3.
        ``"low_calorie"`` / ``"high_calorie"`` set the calorie_level weight to 3
        and then rescale the predicted scores (x1.3 for calorie_level <= 1 or
        >= 2 respectively, x0.85 for the remaining candidates).
        Any other non-None value prints a warning and is ignored.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``id``, ``pred_rating``, ``calorie_level`` for the top_k
        candidates, best first. ``None`` when the user has no interactions or
        has already reviewed every recipe.
    """
    # Multiplier applied to a feature block when it is the stated preference
    weights = {
        "reviews": 3.0,
        "ingredients": 3.0,
        "nutrition": 3.0,
        "calories": 3.0,
    }
    calorie_mode_weight = 3.0  # calorie_level weight for low_calorie / high_calorie
    favoured_scale = 1.3       # score multiplier for the favoured calorie bins
    other_scale = 0.85         # score multiplier for every other candidate

    # Block dimensions of the full feature vector
    tfidf_dim = X.shape[1]
    ingredient_dim = ingredient_matrix.shape[1]
    nutrition_dim = 1  # only calorie level

    # Per-column weight vector; 1.0 leaves a feature untouched
    w = np.ones(tfidf_dim + ingredient_dim + nutrition_dim)

    if preference == "reviews":
        w[:tfidf_dim] *= weights["reviews"]
    elif preference == "ingredients":
        w[tfidf_dim:tfidf_dim + ingredient_dim] *= weights["ingredients"]
    elif preference in ("nutrition", "calories"):
        # calorie_level is the last column of the full feature vector
        w[-1] *= weights[preference]
    elif preference in ("low_calorie", "high_calorie"):
        w[-1] = calorie_mode_weight
    elif preference is not None:
        print(f"Unknown preference '{preference}'. Ignoring.")

    # Recipes the user already reviewed are excluded from the candidates
    user_interactions = interactions[interactions['user_id'] == user_id]
    if len(user_interactions) == 0:
        print(f"User {user_id} not found")
        return None

    reviewed = set(user_interactions['recipe_id'])
    candidates = recipes[~recipes['id'].isin(reviewed)].copy()
    if len(candidates) == 0:
        return None

    # Candidates have no review text. An explicit all-zero block is what the
    # vectorizer would produce for empty strings, without tokenizing one empty
    # document per candidate.
    X_cand = csr_matrix((len(candidates), tfidf_dim), dtype=np.float64)

    # Candidate metadata (ingredients + calorie); tocsr() guarantees row indexing
    cand_rows = [recipe_id_to_row[rid] for rid in candidates['id']]
    R_cand = recipe_features.tocsr()[cand_rows]

    # Combined sparse feature matrix, weighted column-wise without densifying
    full_cand = hstack([X_cand, R_cand])
    full_cand_weighted = full_cand.multiply(w)

    scores = model.predict(full_cand_weighted)

    # Extra calorie adjustment. Plain numpy masks are used so that indexing
    # `scores` never depends on the (non-contiguous) pandas index of `candidates`.
    calorie_level = candidates['calorie_level'].to_numpy()
    if preference == "low_calorie":
        favoured = calorie_level <= 1
    elif preference == "high_calorie":
        favoured = calorie_level >= 2
    else:
        favoured = None

    if favoured is not None:
        scores[favoured] *= favoured_scale
        scores[~favoured] *= other_scale

    candidates['pred_rating'] = scores

    recommendations = candidates.sort_values('pred_rating', ascending=False).head(top_k)
    result = recommendations[['id', 'pred_rating', 'calorie_level']]

    print(f"\nTop {top_k} recipe recommendations for user {user_id} (preference={preference}):")
    print("=" * 60)
    for rank, (rid, pred, cal) in enumerate(result.itertuples(index=False, name=None), 1):
        print(f"{rank}. Recipe ID: {rid}")
        print(f"   Predicted Rating: {pred:.2f}")
        print(f"   Calorie Level: {cal}")
        print("-" * 40)

    return result



# Example: use the user of the first interaction so the lookup is guaranteed to succeed
sample_user = interactions['user_id'].iloc[0]
# `preference` accepts ingredients, calories, high_calorie, low_calorie, nutrition,
# reviews, or can be omitted; switching between them does not require retraining the model
recommendations = recommend_recipes_for_user(user_id=sample_user, top_k=5)


Top 5 recipe recommendations for user 38094 (preference=None):
1. Recipe ID: 514885
   Predicted Rating: 5.75
   Calorie Level: 0
----------------------------------------
2. Recipe ID: 293914
   Predicted Rating: 4.96
   Calorie Level: 2
----------------------------------------
3. Recipe ID: 503144
   Predicted Rating: 4.96
   Calorie Level: 0
----------------------------------------
4. Recipe ID: 326127
   Predicted Rating: 4.93
   Calorie Level: 2
----------------------------------------
5. Recipe ID: 250205
   Predicted Rating: 4.93
   Calorie Level: 1
----------------------------------------
