In [2]:
pip install fastFM



In [116]:
import pandas as pd
import numpy as np
from collections import defaultdict
from fastFM import als
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import scipy
import ast

In [70]:
low_cal_interactions = pd.read_csv('low_cal_interactions.csv')
quick_interactions = pd.read_csv('quick_interactions.csv')
low_cal_recipes = pd.read_csv('low_cal_recipes.csv')
quick_recipes = pd.read_csv('quick_recipes.csv')

In [71]:
len(low_cal_interactions), len(quick_interactions), len(low_cal_recipes), len(quick_recipes)

(549777, 536170, 112172, 112145)

In [72]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

### Factorization Model - Quick Recipes

In [73]:
quick_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,8937,44394,4
1,126440,85009,5
2,57222,85009,5
3,52282,120345,4
4,124416,120345,0


In [74]:
userIDs,recipeIDs = {},{}

for _, row in quick_interactions.iterrows():
    u,i = row['user_id'],row['recipe_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in recipeIDs: recipeIDs[i] = len(recipeIDs)

nUsers,nRecipes = len(userIDs),len(recipeIDs)

In [75]:
minMinute = min([row['minutes'] for _, row in quick_recipes.iterrows()])
maxMinute = max([row['minutes'] for _, row in quick_recipes.iterrows()])

merged_data = pd.merge(quick_interactions, quick_recipes, left_on="recipe_id", right_on="id")
merged_data = merged_data[["user_id", "recipe_id", "minutes", "rating"]]

# Normalizing the minute values to a 0-1 range
def normalize_minute(minute, minMinute, maxMinute):
    return (minute - minMinute) / (maxMinute - minMinute)

nUsers, nItems = len(userIDs), len(recipeIDs)
X = scipy.sparse.lil_matrix((len(quick_interactions), nUsers + nItems + 1))  # One column for normalized minute data

for i, row in merged_data.iterrows():
    user = userIDs[row['user_id']]
    item = recipeIDs[row['recipe_id']]
    minute = row['minutes']

    # Normalize the minute value
    normalized_minute = normalize_minute(minute, minMinute, maxMinute)

    # Populate the matrix
    X[i, user] = 1  # One-hot encoding of user
    X[i, nUsers + item] = 1  # One-hot encoding of item
    X[i, nUsers + nItems] = normalized_minute  # Store the normalized minute as a continuous value



In [76]:
y = np.array([row['rating'] for _, row in merged_data.iterrows()])

In [77]:
split_index = int(0.8 * X.shape[0])
valid_index = int(0.9 * X.shape[0])

X_train, y_train, data_train = X[:split_index], y[:split_index], merged_data[:split_index]
X_valid, y_valid, data_valid = X[split_index:valid_index], y[split_index:valid_index], merged_data[split_index:valid_index]
X_test, y_test, data_test = X[valid_index:], y[valid_index:], merged_data[valid_index:]

In [78]:
# NOTE: This code takes ~2 hours to run, so to prevent it from being run every time the kernel is restarted, I have included a bypass (the i condition in the inner-most loop).
# The hyperparameters chosen below were based on the best outcome from tuning
# If you want to run this code yourself, please comment out the "if (i < 50)" block.
# Define the hyperparameters to be tuned and their possible values
n_iter_values = [500, 1000]
rank_values = [5, 10, 15]
l2_reg_w_values = [0.1, 0.5]
l2_reg_V_values = [0.1, 0.5]

# Store the best parameters and best performance
best_params = None
best_score = float('inf')

i = 0

# Grid search
for n_iter in n_iter_values:
    for rank in rank_values:
        for l2_reg_w in l2_reg_w_values:
            for l2_reg_V in l2_reg_V_values:
                if (i < 50):
                  i = i + 1
                  continue
                # Initialize the model with the current combination of hyperparameters
                fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=rank, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V)

                # Fit the model
                fm.fit(X_train, y_train)

                # Predict on the test data
                y_pred = fm.predict(X_valid)

                # Evaluate the model using mean squared error (or another suitable metric)
                score = mean_squared_error(y_valid, y_pred)

                # Check if the current model is better than the best so far
                if score < best_score:
                    best_score = score
                    best_params = {
                        'n_iter': n_iter,
                        'rank': rank,
                        'l2_reg_w': l2_reg_w,
                        'l2_reg_V': l2_reg_V
                    }
                print(i)
                print(score)
                i = i+1
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: None
Best Score: inf


In [79]:
quick_fm = als.FMRegression(n_iter=500, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)

In [80]:
quick_fm.fit(X_train, y_train)

In [96]:
y_pred_with_features = quick_fm.predict(X_test)
mse = MSE(y_pred_with_features, y_test)
print(mse)

1.392954861982341


In [100]:
def recommend_quick_recipes(user_id, model, all_items, quick_recipes, user_interactions, userIDs, recipeIDs, feature_size, top_n=5):
    # Get recipes already interacted with by the user
    already_rated = user_interactions[user_interactions['user_id'] == user_id]['recipe_id'].unique()
    to_predict = [item for item in all_items if item not in already_rated]

    # Map the user_id and recipe_ids to their indices
    user_index = userIDs[user_id]

    # Build the feature matrix for the recipes to predict
    X_predict = scipy.sparse.lil_matrix((len(to_predict), feature_size))  # Ensure the feature size matches training
    for i, recipe_id in enumerate(to_predict):
        item_index = recipeIDs[recipe_id]
        X_predict[i, user_index] = 1  # One-hot encoding for the user
        X_predict[i, len(userIDs) + item_index] = 1  # One-hot encoding for the item
    X_predict = X_predict.tocsc()

    # Predict ratings for all recipes in `to_predict`
    predicted_ratings = model.predict(X_predict)
    min_pred = np.min(predicted_ratings)
    max_pred = np.max(predicted_ratings)
    predicted_ratings = 5 * (predicted_ratings - min_pred) / (max_pred - min_pred)
    recommendations = pd.DataFrame({
        'id': to_predict,
        'predicted_rating': predicted_ratings
    })
    recommendations = recommendations.merge(
        quick_recipes[['id', 'name', 'minutes']],
        on='id',
        how='inner'
    )

    recommendations = recommendations.sort_values(by='predicted_rating', ascending=False).head(top_n)

    return recommendations

# Select a random user ID
random_user_id = quick_interactions['user_id'].sample(1).iloc[0]

# Prepare all_items
all_items = quick_recipes['id'].unique()

# Determine feature size from training data
feature_size = quick_fm.w_.shape[0]

# Get top 5 recommendations
top_recommendations = recommend_quick_recipes(
    random_user_id,
    quick_fm,
    all_items,
    quick_recipes,
    quick_interactions,
    userIDs,
    recipeIDs,
    feature_size,
    top_n=5
)

# Display the top recommendations
print(f"Top 5 recommendations for user {random_user_id}:")
print(top_recommendations[['id', 'name', 'minutes']])

Top 5 recommendations for user 268202:
           id                          name  minutes
87584  353442        scallops and wild rice       13
89212  192967           shrimp and tomatoes       30
51531  339690  honey mustard chicken strips       20
20314  389374     chicken and rice burritos       30
32652  109091         deep fried vegetables       13


### Factorization Model - Low Calorie Recipes

In [102]:
low_cal_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,202555,225241,5
4,353579,225241,5


In [103]:
userIDs,recipeIDs = {},{}

for _, row in low_cal_interactions.iterrows():
    u,i = row['user_id'],row['recipe_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in recipeIDs: recipeIDs[i] = len(recipeIDs)

nUsers,nRecipes = len(userIDs),len(recipeIDs)

In [104]:
minCalories = min([row['calories'] for _, row in low_cal_recipes.iterrows()])
maxCalories = max([row['calories'] for _, row in low_cal_recipes.iterrows()])

merged_data = pd.merge(low_cal_interactions, low_cal_recipes, left_on="recipe_id", right_on="id")
merged_data = merged_data[["user_id", "recipe_id", "calories", "rating"]]

# Normalizing the calorie values to a 0-1 range
def normalize_calories(calories, minCalories, maxCalories):
    return (calories - minCalories) / (maxCalories - minCalories)

nUsers, nItems = len(userIDs), len(recipeIDs)
X = scipy.sparse.lil_matrix((len(low_cal_interactions), nUsers + nItems + 1))  # One column for normalized calorie data

for i, row in merged_data.iterrows():
    user = userIDs[row['user_id']]
    item = recipeIDs[row['recipe_id']]
    calories = row['calories']

    # Normalize the calories value
    normalized_calories = normalize_calories(calories, minCalories, maxCalories)

    # Populate the matrix
    X[i, user] = 1  # One-hot encoding of user
    X[i, nUsers + item] = 1  # One-hot encoding of item
    X[i, nUsers + nItems] = normalized_calories  # Store the normalized calories as a continuous value


In [105]:
y = np.array([row['rating'] for _, row in merged_data.iterrows()])

In [106]:
split_index = int(0.8 * X.shape[0])
valid_index = int(0.9 * X.shape[0])

X_train, y_train, data_train = X[:split_index], y[:split_index], merged_data[:split_index]
X_valid, y_valid, data_valid = X[split_index:valid_index], y[split_index:valid_index], merged_data[split_index:valid_index]
X_test, y_test, data_test = X[valid_index:], y[valid_index:], merged_data[valid_index:]

In [107]:
# NOTE: This code takes ~2 hours to run, so to prevent it from being run every time the kernel is restarted, I have included a bypass (the i condition in the inner-most loop).
# The hyperparameters chosen below were based on the best outcome from tuning
# If you want to run this code yourself, please comment out the "if (i < 50)" block.
# Define the hyperparameters to be tuned and their possible values
n_iter_values = [500, 1000]
rank_values = [5, 10, 15]
l2_reg_w_values = [0.1, 0.5]
l2_reg_V_values = [0.1, 0.5]

# Store the best parameters and best performance
best_params = None
best_score = float('inf')

i = 0

# Grid search
for n_iter in n_iter_values:
    for rank in rank_values:
        for l2_reg_w in l2_reg_w_values:
            for l2_reg_V in l2_reg_V_values:
                if (i < 50):
                  i = i + 1
                  continue
                # Initialize the model with the current combination of hyperparameters
                fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=rank, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V)

                # Fit the model
                fm.fit(X_train, y_train)

                # Predict on the test data
                y_pred = fm.predict(X_valid)

                # Evaluate the model using mean squared error (or another suitable metric)
                score = mean_squared_error(y_valid, y_pred)

                # Check if the current model is better than the best so far
                if score < best_score:
                    best_score = score
                    best_params = {
                        'n_iter': n_iter,
                        'rank': rank,
                        'l2_reg_w': l2_reg_w,
                        'l2_reg_V': l2_reg_V
                    }
                print(i)
                print(score)
                i = i+1
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: None
Best Score: inf


In [108]:
fm = als.FMRegression(n_iter=500, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)

In [109]:
fm.fit(X_train, y_train)

In [110]:
y_pred = fm.predict(X_test)
mse = MSE(y_pred, y_test)
print(mse)

1.4723965873209226


In [111]:
from scipy.sparse import lil_matrix

def recommend_low_calorie_recipes(user_id, model, all_items, quick_recipes, user_interactions, userIDs, recipeIDs, feature_size, top_n=5):
    # Get recipes already interacted with by the user
    already_rated = user_interactions[user_interactions['user_id'] == user_id]['recipe_id'].unique()
    to_predict = [item for item in all_items if item not in already_rated]

    # Map the user_id and recipe_ids to their indices
    user_index = userIDs[user_id]

    # Build the feature matrix for the recipes to predict
    X_predict = lil_matrix((len(to_predict), feature_size))  # Ensure the feature size matches training
    for i, recipe_id in enumerate(to_predict):
        item_index = recipeIDs[recipe_id]
        X_predict[i, user_index] = 1  # One-hot encoding for the user
        X_predict[i, len(userIDs) + item_index] = 1  # One-hot encoding for the item

    # Convert to CSC format for FastFM
    X_predict = X_predict.tocsc()

    # Predict ratings for all recipes in `to_predict`
    predicted_ratings = model.predict(X_predict)
    min_pred = np.min(predicted_ratings)
    max_pred = np.max(predicted_ratings)
    predicted_ratings = 5 * (predicted_ratings - min_pred) / (max_pred - min_pred)

    # Create a DataFrame with predictions
    recommendations = pd.DataFrame({
        'id': to_predict,
        'predicted_rating': predicted_ratings
    })

    # Merge with recipe details and filter for top N low-calorie recipes
    recommendations = recommendations.merge(
        quick_recipes[['id', 'name', 'calories']],
        on='id',
        how='inner'
    )

    recommendations = recommendations.sort_values(by='predicted_rating', ascending=False).head(top_n)

    return recommendations

# Select a random user ID
random_user_id = low_cal_interactions['user_id'].sample(1).iloc[0]

# Prepare all_items
all_items = low_cal_recipes['id'].unique()

# Determine feature size from training data
feature_size = fm.w_.shape[0]

# Get top 5 recommendations
top_recommendations = recommend_low_calorie_recipes(
    random_user_id,
    fm,
    all_items,
    low_cal_recipes,
    low_cal_interactions,
    userIDs,
    recipeIDs,
    feature_size,
    top_n=5
)

# Display the top recommendations
print(f"Top 5 recommendations for user {random_user_id}:")
print(top_recommendations[['id', 'name', 'calories']])

Top 5 recommendations for user 197258:
           id                                               name  calories
69681  320444                                     no knead bread     228.0
61560  429120                      lower sugar chocolate squares     119.6
17597  173288                                      carrot bisque     212.9
77393  483807                     pineapple black bean guacamole     121.3
80383  400579  pumpkin roulade with ginger buttercream  pumpk...     281.7


### Factorization Model - Overall

In [117]:
# Dataframe for all interactions with relevant columns
raw_interactions = pd.read_csv("RAW_interactions.csv")
columns_to_keep = ["user_id", "recipe_id", "rating"]
raw_interactions = raw_interactions[columns_to_keep]

# Dataframe for all recipes with relevant columns
raw_recipes = pd.read_csv("RAW_recipes.csv")
columns_to_keep = ["name", "id", "minutes", "nutrition"]
raw_recipes = raw_recipes[columns_to_keep]
raw_recipes["calories"] = raw_recipes["nutrition"].apply(lambda x: ast.literal_eval(x)[0] if pd.notna(x) else None)
raw_recipes = raw_recipes.drop(columns=["nutrition"])

# Filter out rows of recipes where 10 < "calories" < 2000
initial_count = len(raw_recipes)
raw_recipes = raw_recipes[raw_recipes["calories"] <= 2000]
raw_recipes = raw_recipes[raw_recipes["calories"] > 10]
final_count = len(raw_recipes)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")

# Filter out rows of recipes where "minutes" > 300
initial_count = len(raw_recipes)
raw_recipes = raw_recipes[raw_recipes["minutes"] <= 300]
all_recipes = raw_recipes[raw_recipes["minutes"] > 0]
final_count = len(raw_recipes)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")


Number of rows dropped: 7326
Number of rows dropped: 9593


In [118]:
raw_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,126440,85009,5
4,57222,85009,5


In [119]:
userIDs,recipeIDs = {},{}

for _, row in raw_interactions.iterrows():
    u,i = row['user_id'],row['recipe_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in recipeIDs: recipeIDs[i] = len(recipeIDs)

nUsers,nRecipes = len(userIDs),len(recipeIDs)

In [122]:
X = scipy.sparse.lil_matrix((len(raw_interactions), nUsers + nRecipes))
for i, row in raw_interactions.iterrows():
    user = userIDs[row['user_id']]
    item = recipeIDs[row['recipe_id']]
    X[i, user] = 1  # One-hot encoding of user
    X[i, nUsers + item] = 1  # One-hot encoding of item

y = np.array([row['rating'] for _, row in raw_interactions.iterrows()])

In [123]:
split_index = int(0.8 * X.shape[0])
valid_index = int(0.9 * X.shape[0])

X_train, y_train, data_train = X[:split_index], y[:split_index], raw_interactions[:split_index]
X_valid, y_valid, data_valid = X[split_index:valid_index], y[split_index:valid_index], raw_interactions[split_index:valid_index]
X_test, y_test, data_test = X[valid_index:], y[valid_index:], raw_interactions[valid_index:]

In [124]:
# NOTE: This code takes ~2 hours to run, so to prevent it from being run every time the kernel is restarted, I have included a bypass (the i condition in the inner-most loop).
# The hyperparameters chosen below were based on the best outcome from tuning
# If you want to run this code yourself, please comment out the "if (i < 50)" block.
# Define the hyperparameters to be tuned and their possible values
n_iter_values = [400, 500]
rank_values = [5, 10, 15]
l2_reg_w_values = [0.1, 0.2, 0.5]
l2_reg_V_values = [0.1, 0.2, 0.5]

# Store the best parameters and best performance
best_params = None
best_score = float('inf')

i = 0

# Grid search
for n_iter in n_iter_values:
    for rank in rank_values:
        for l2_reg_w in l2_reg_w_values:
            for l2_reg_V in l2_reg_V_values:
                if (i < 50):
                  i = i + 1
                  continue
                # Initialize the model with the current combination of hyperparameters
                fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=rank, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V)

                # Fit the model
                fm.fit(X_train, y_train)

                # Predict on the test data
                y_pred = fm.predict(X_valid)

                # Evaluate the model using mean squared error (or another suitable metric)
                score = mean_squared_error(y_valid, y_pred)

                # Check if the current model is better than the best so far
                if score < best_score:
                    best_score = score
                    best_params = {
                        'n_iter': n_iter,
                        'rank': rank,
                        'l2_reg_w': l2_reg_w,
                        'l2_reg_V': l2_reg_V
                    }
                print(i)
                print(score)
                i = i+1
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: None
Best Score: inf


In [127]:
fm = als.FMRegression(n_iter=400, init_stdev=0.1, rank=5, l2_reg_w=0.2, l2_reg_V=0.5)

In [128]:
fm.fit(X_train, y_train)

In [129]:
y_pred = fm.predict(X_test)
mse = MSE(y_pred, y_test)
print(mse)

1.4496614346436125


In [131]:
from scipy.sparse import lil_matrix

def recommend_low_calorie_recipes(user_id, model, all_items, quick_recipes, user_interactions, userIDs, recipeIDs, feature_size, top_n=5):
    # Get recipes already interacted with by the user
    already_rated = user_interactions[user_interactions['user_id'] == user_id]['recipe_id'].unique()
    to_predict = [item for item in all_items if item not in already_rated]

    # Map the user_id and recipe_ids to their indices
    user_index = userIDs[user_id]

    # Build the feature matrix for the recipes to predict
    X_predict = lil_matrix((len(to_predict), feature_size))  # Ensure the feature size matches training
    for i, recipe_id in enumerate(to_predict):
        item_index = recipeIDs[recipe_id]
        X_predict[i, user_index] = 1  # One-hot encoding for the user
        X_predict[i, len(userIDs) + item_index] = 1  # One-hot encoding for the item

    # Convert to CSC format for FastFM
    X_predict = X_predict.tocsc()

    # Predict ratings for all recipes in `to_predict`
    predicted_ratings = model.predict(X_predict)
    min_pred = np.min(predicted_ratings)
    max_pred = np.max(predicted_ratings)
    predicted_ratings = 5 * (predicted_ratings - min_pred) / (max_pred - min_pred)

    # Create a DataFrame with predictions
    recommendations = pd.DataFrame({
        'id': to_predict,
        'predicted_rating': predicted_ratings
    })

    # Merge with recipe details and filter for top N low-calorie recipes
    recommendations = recommendations.merge(
        quick_recipes[['id', 'name', 'calories']],
        on='id',
        how='inner'
    )

    recommendations = recommendations.sort_values(by='predicted_rating', ascending=False).head(top_n)

    return recommendations

# Select a random user ID
random_user_id = raw_interactions['user_id'].sample(1).iloc[0]

# Prepare all_items
all_items = raw_recipes['id'].unique()

# Determine feature size from training data
feature_size = fm.w_.shape[0]

# Get top 5 recommendations
top_recommendations = recommend_low_calorie_recipes(
    random_user_id,
    fm,
    all_items,
    raw_recipes,
    raw_interactions,
    userIDs,
    recipeIDs,
    feature_size,
    top_n=5
)

# Display the top recommendations
print(f"Top 5 recommendations for user {random_user_id}:")
print(top_recommendations[['id', 'name']])

Top 5 recommendations for user 183964:
            id                                      name
33526   411501                     carb free cloud bread
189656  487039    sweet   sassy peachy keen tilapia  rsc
115553  505862   loaded baked potato   chicken casserole
146256  112889                     perfect boiled shrimp
92749   135048  guacamole   real authentic mexican  guac
