In [34]:
import pandas as pd
from sklearn import linear_model
import pickle
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy
import ast

In [None]:
df_low_cal_interactions = pd.read_csv('low_cal_interactions.csv')
df_quick_interactions = pd.read_csv('quick_interactions.csv')
df_low_cal_recipes = pd.read_csv('low_cal_recipes.csv')
df_quick_recipes = pd.read_csv('quick_recipes.csv')

In [5]:
len(df_low_cal_interactions), len(df_quick_interactions), len(df_low_cal_recipes), len(df_quick_recipes)

(549777, 536170, 112172, 112145)

### 1. Latent Factor Model- Quick Recipes

In [6]:
df_quick_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,8937,44394,4
1,126440,85009,5
2,57222,85009,5
3,52282,120345,4
4,124416,120345,0


In [8]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df_quick_interactions[['user_id', 'recipe_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x26f3f9dd7c0>

In [10]:
predictions = model.test(testset)
test_mse = accuracy.mse(predictions, verbose=True)
print(f"Test MSE: {test_mse}")

MSE: 1.4099
Test MSE: 1.409913341875453


In [26]:
def recommend_quick_recipes(user_id, model, all_items, df_quick_recipes, df_user_interactions, top_n=5):
    already_rated = df_user_interactions[df_user_interactions['user_id'] == user_id]['recipe_id']
    to_predict = [item for item in all_items if item not in already_rated]
    recommendations = [(item, model.predict(user_id, item).est) for item in to_predict]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
    recommendations_df = pd.DataFrame(recommendations, columns=['id', 'predicted_rating'])
    recommendations_df = recommendations_df.merge(
        df_quick_recipes[['id', 'name', 'minutes']],
        on='id',
        how='inner'
    )
    return recommendations_df


all_items = df_quick_recipes['id'].unique()
user_id = 1

# Get top 5 recommendations
top_recommendations = recommend_quick_recipes(user_id, model, all_items, df_quick_recipes, df_quick_interactions, top_n=5)

# Display the top recommendations
print("Top 5 recommendations:")
print(top_recommendations)


Top 5 recommendations:
       id  predicted_rating  \
0  486267          4.924893   
1   55309          4.854845   
2  146320          4.851148   
3   25735          4.844417   
4  113193          4.839765   

                                                name  minutes  
0                summer vegetable grill packets  rsc       35  
1  caprese salad tomatoes  italian marinated toma...       10  
2                        oprah s pomegranate martini        2  
3                               autumn chicken salad       20  
4        sweet  buttery rolls   bread machine recipe       30  


### 2. Latent Factor Model- Low Calorie Recipes

In [18]:
df_low_cal_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,202555,225241,5
4,353579,225241,5


In [19]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df_low_cal_interactions[['user_id', 'recipe_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x26f3f0bbec0>

In [20]:
predictions = model.test(testset)
test_mse = accuracy.mse(predictions, verbose=True)
print(f"Test MSE: {test_mse}")

MSE: 1.4525
Test MSE: 1.4524933521186694


In [31]:
def recommend_low_calorie_recipes(user_id, model, all_items, df_quick_recipes, df_user_interactions, top_n=5):
    already_rated = df_user_interactions[df_user_interactions['user_id'] == user_id]['recipe_id']
    to_predict = [item for item in all_items if item not in already_rated]
    recommendations = [(item, model.predict(user_id, item).est) for item in to_predict]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
    recommendations_df = pd.DataFrame(recommendations, columns=['id', 'predicted_rating'])
    recommendations_df = recommendations_df.merge(
        df_quick_recipes[['id', 'name','calories']],
        on='id',
        how='inner'
    )
    return recommendations_df


all_items = df_low_cal_recipes['id'].unique()
user_id = 1

# Get top 5 recommendations
top_recommendations = recommend_low_calorie_recipes(user_id, model, all_items, df_low_cal_recipes, df_low_cal_interactions, top_n=5)

# Display the top recommendations
print("Top 5 recommendations:")
print(top_recommendations)


Top 5 recommendations:
       id  predicted_rating  \
0  486267          4.924893   
1  474520          4.872249   
2   33995          4.864184   
3   55309          4.854845   
4  146320          4.851148   

                                                name  calories  
0                summer vegetable grill packets  rsc     232.1  
1                   company s coming mashed potatoes     185.3  
2                                     vegan truffles      11.8  
3  caprese salad tomatoes  italian marinated toma...     137.5  
4                        oprah s pomegranate martini     166.3  


### 3. Latent Factor Model- All Data

##### 3.1 Initial Preprocessing for All Data

In [43]:
# Dataframe for all recipes with relevant columns
raw_recipes = pd.read_csv("RAW_recipes.csv")
columns_to_keep = ["name", "id", "minutes", "nutrition"]
raw_recipes = raw_recipes[columns_to_keep]
raw_recipes["calories"] = raw_recipes["nutrition"].apply(lambda x: ast.literal_eval(x)[0] if pd.notna(x) else None)
raw_recipes = raw_recipes.drop(columns=["nutrition"])

# Dataframe for all interactions with relevant columns
raw_interactions = pd.read_csv("RAW_interactions.csv")
columns_to_keep = ["user_id", "recipe_id", "rating"]
raw_interactions = raw_interactions[columns_to_keep]

In [44]:
# Filter out rows of recipes where 10 < "calories" < 2000
initial_count = len(raw_recipes)
raw_recipes = raw_recipes[raw_recipes["calories"] <= 2000]
raw_recipes = raw_recipes[raw_recipes["calories"] > 10]
final_count = len(raw_recipes)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")

Number of rows dropped: 7326


In [45]:
# Filter out rows of recipes where "minutes" > 300
initial_count = len(raw_recipes)
raw_recipes = raw_recipes[raw_recipes["minutes"] <= 300]
all_recipes = raw_recipes[raw_recipes["minutes"] > 0]
final_count = len(raw_recipes)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")

Number of rows dropped: 9593


In [41]:
raw_recipes.head()

Unnamed: 0,name,id,minutes,calories
0,arriba baked winter squash mexican style,137739,55,51.5
1,a bit different breakfast pizza,31490,30,173.4
2,all in the kitchen chili,112140,130,269.8
3,alouette potatoes,59389,45,368.1
4,amish tomato ketchup for canning,44061,190,352.9


In [46]:
# Filter out interactions where "recipe_id" is in current "raw_recipes"
all_recipe_ids = set(raw_recipes["id"])  # faster lookup
all_interactions = raw_interactions[raw_interactions["recipe_id"].isin(all_recipe_ids)]

In [47]:
len(all_recipes), len(all_interactions)

(213751, 1044830)

##### 3.2 Actual Model

In [48]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(all_interactions[['user_id', 'recipe_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x26f3f9b7530>

In [49]:
predictions = model.test(testset)
test_mse = accuracy.mse(predictions, verbose=True)
print(f"Test MSE: {test_mse}")

MSE: 1.4597
Test MSE: 1.4596957871784184


In [51]:
def recommend_good_recipes(user_id, model, all_items, df_quick_recipes, df_user_interactions, top_n=5):
    already_rated = df_user_interactions[df_user_interactions['user_id'] == user_id]['recipe_id']
    to_predict = [item for item in all_items if item not in already_rated]
    recommendations = [(item, model.predict(user_id, item).est) for item in to_predict]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
    recommendations_df = pd.DataFrame(recommendations, columns=['id', 'predicted_rating'])
    recommendations_df = recommendations_df.merge(
        df_quick_recipes[['id', 'name','calories', 'minutes']],
        on='id',
        how='inner'
    )
    return recommendations_df

all_items = all_recipes['id'].unique()
user_id = 1

# Get top 5 recommendations
top_recommendations = recommend_good_recipes(user_id, model, all_items, all_recipes, all_interactions, top_n=5)

# Display the top recommendations
print("Top 5 recommendations:")
print(top_recommendations)

Top 5 recommendations:
       id  predicted_rating  \
0  486261          4.960660   
1  524863          4.942449   
2  487111          4.938791   
3  518151          4.897692   
4  100658          4.881906   

                                                name  calories  minutes  
0                              mexican stack up  rsc     793.0       75  
1                                   ragu shuka  ragu     337.6       40  
2                back porch bayou shrimp   corn  rsc     404.4       45  
3  a 1  pot stickers with chili pineapple dipping...     271.3       45  
4                         peppercorn pork tenderloin     678.4       30  
