In [19]:
import pandas as pd
from sklearn import linear_model
import pickle
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from surprise import accuracy
import ast

In [5]:
df_low_cal_interactions = pd.read_csv('low_cal_interactions.csv')
df_quick_interactions = pd.read_csv('quick_interactions.csv')
df_low_cal_recipes = pd.read_csv('low_cal_recipes.csv')
df_quick_recipes = pd.read_csv('quick_recipes.csv')

In [6]:
len(df_low_cal_interactions), len(df_quick_interactions), len(df_low_cal_recipes), len(df_quick_recipes)

(549777, 536170, 112172, 112145)

In [7]:
def save_model(model, filename='svd_recommendation_model.pkl'):
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved to {filename}")

def load_model(filename='svd_recommendation_model.pkl'):
    with open(filename, 'rb') as file:
        loaded_model = pickle.load(file)
    print(f"Model loaded from {filename}")
    return loaded_model

### 1. Latent Factor Model- Quick Recipes

In [8]:
df_quick_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,8937,44394,4
1,126440,85009,5
2,57222,85009,5
3,52282,120345,4
4,124416,120345,0


In [12]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df_quick_interactions[['user_id', 'recipe_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
param_distributions = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30, 50],
    'lr_all': [0.005, 0.01, 0.02],
    'reg_all': [0.02, 0.1, 0.4]
}

# Specify a lower number of iterations
rs = RandomizedSearchCV(
    SVD, 
    param_distributions, 
    measures=['rmse'], 
    cv=3, 
    n_iter=10
)
rs.fit(data)

In [12]:
print("Best RMSE score:", rs.best_score['rmse'])
print("Best parameters:", rs.best_params['rmse'])

Best RMSE score: 1.180469332097383
Best parameters: {'n_factors': 100, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}


In [13]:
best_params = rs.best_params['rmse']
model1 = SVD(**best_params)
model1.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x207b5abfbc0>

In [13]:
model1 = load_model('svd_quick_recommendation.pkl')

Model loaded from svd_quick_recommendation.pkl


In [14]:
predictions = model1.test(testset)
test_mse = accuracy.mse(predictions, verbose=True)
print(f"Test MSE: {test_mse}")

MSE: 1.4038
Test MSE: 1.4038390344299168


In [None]:
save_model(model1, 'svd_quick_recommendation.pkl')

Model saved to svd_quick_recommendation.pkl


In [15]:
def recommend_quick_recipes(user_id, model, all_items, df_quick_recipes, df_user_interactions, top_n=5):
    already_rated = df_user_interactions[df_user_interactions['user_id'] == user_id]['recipe_id']
    to_predict = [item for item in all_items if item not in already_rated]
    recommendations = [(item, model.predict(user_id, item).est) for item in to_predict]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
    recommendations_df = pd.DataFrame(recommendations, columns=['id', 'predicted_rating'])
    recommendations_df = recommendations_df.merge(
        df_quick_recipes[['id', 'name', 'minutes']],
        on='id',
        how='inner'
    )
    return recommendations_df


all_items = df_quick_recipes['id'].unique()
user_id = 1

# Get top 5 recommendations
top_recommendations = recommend_quick_recipes(user_id, model1, all_items, df_quick_recipes, df_quick_interactions, top_n=5)

# Display the top recommendations
print("Top 5 recommendations:")
print(top_recommendations)


Top 5 recommendations:
       id  predicted_rating                                 name  minutes
0   67425          4.935989             low carb root beer float        2
1  259937          4.923255    honey browned trout   candy trout       35
2  158082          4.915897  femmes  revenge caribbean curry mix       10
3  160432          4.913225    peanut butter and pickle sandwich        5
4   61245          4.903694                              fatoosh       30


### 2. Latent Factor Model- Low Calorie Recipes

In [16]:
df_low_cal_interactions.head()

Unnamed: 0,user_id,recipe_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,202555,225241,5
4,353579,225241,5


In [17]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df_low_cal_interactions[['user_id', 'recipe_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [21]:
param_distributions = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30, 50],
    'lr_all': [0.005, 0.01, 0.02],
    'reg_all': [0.02, 0.1, 0.4]
}

# Specify a lower number of iterations
rs = RandomizedSearchCV(
    SVD, 
    param_distributions, 
    measures=['rmse'], 
    cv=3
)
rs.fit(data)
print("Best RMSE score:", rs.best_score['rmse'])
print("Best parameters:", rs.best_params['rmse'])

Best RMSE score: 1.2045276875514885
Best parameters: {'n_factors': 150, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}


In [22]:
best_params = rs.best_params['rmse']
model2 = SVD(**best_params)
model2.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x216812df680>

In [23]:
predictions = model2.test(testset)
test_mse = accuracy.mse(predictions, verbose=True)
print(f"Test MSE: {test_mse}")

MSE: 1.4420
Test MSE: 1.4420082438585737


In [24]:
save_model(model1, 'svd_low_calorie_recommendation.pkl')

Model saved to svd_low_calorie_recommendation.pkl


In [25]:
def recommend_low_calorie_recipes(user_id, model, all_items, df_quick_recipes, df_user_interactions, top_n=5):
    already_rated = df_user_interactions[df_user_interactions['user_id'] == user_id]['recipe_id']
    to_predict = [item for item in all_items if item not in already_rated]
    recommendations = [(item, model.predict(user_id, item).est) for item in to_predict]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
    recommendations_df = pd.DataFrame(recommendations, columns=['id', 'predicted_rating'])
    recommendations_df = recommendations_df.merge(
        df_quick_recipes[['id', 'name','calories']],
        on='id',
        how='inner'
    )
    return recommendations_df


all_items = df_low_cal_recipes['id'].unique()
user_id = 1

# Get top 5 recommendations
top_recommendations = recommend_low_calorie_recipes(user_id, model2, all_items, df_low_cal_recipes, df_low_cal_interactions, top_n=5)

# Display the top recommendations
print("Top 5 recommendations:")
print(top_recommendations)


Top 5 recommendations:
       id  predicted_rating                                 name  calories
0   64895          4.791600               the best brownies ever     203.4
1  486267          4.790720  summer vegetable grill packets  rsc     232.1
2   16603          4.789224                     preserved lemons      16.8
3   33995          4.779742                       vegan truffles      11.8
4   51351          4.775772      tangy bbq pork roast  crock pot     243.7


### 3. Latent Factor Model- All Data

##### 3.1 Initial Preprocessing for All Data

In [26]:
# Dataframe for all recipes with relevant columns
raw_recipes = pd.read_csv("RAW_recipes.csv")
columns_to_keep = ["name", "id", "minutes", "nutrition"]
raw_recipes = raw_recipes[columns_to_keep]
raw_recipes["calories"] = raw_recipes["nutrition"].apply(lambda x: ast.literal_eval(x)[0] if pd.notna(x) else None)
raw_recipes = raw_recipes.drop(columns=["nutrition"])

# Dataframe for all interactions with relevant columns
raw_interactions = pd.read_csv("RAW_interactions.csv")
columns_to_keep = ["user_id", "recipe_id", "rating"]
raw_interactions = raw_interactions[columns_to_keep]

In [27]:
# Filter out rows of recipes where 10 < "calories" < 2000
initial_count = len(raw_recipes)
raw_recipes = raw_recipes[raw_recipes["calories"] <= 2000]
raw_recipes = raw_recipes[raw_recipes["calories"] > 10]
final_count = len(raw_recipes)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")

Number of rows dropped: 7326


In [28]:
# Filter out rows of recipes where "minutes" > 300
initial_count = len(raw_recipes)
raw_recipes = raw_recipes[raw_recipes["minutes"] <= 300]
all_recipes = raw_recipes[raw_recipes["minutes"] > 0]
final_count = len(raw_recipes)
rows_dropped = initial_count - final_count
print(f"Number of rows dropped: {rows_dropped}")

Number of rows dropped: 9593


In [29]:
raw_recipes.head()

Unnamed: 0,name,id,minutes,calories
0,arriba baked winter squash mexican style,137739,55,51.5
1,a bit different breakfast pizza,31490,30,173.4
2,all in the kitchen chili,112140,130,269.8
3,alouette potatoes,59389,45,368.1
4,amish tomato ketchup for canning,44061,190,352.9


In [30]:
# Filter out interactions where "recipe_id" is in current "raw_recipes"
all_recipe_ids = set(raw_recipes["id"])  # faster lookup
all_interactions = raw_interactions[raw_interactions["recipe_id"].isin(all_recipe_ids)]

In [31]:
len(all_recipes), len(all_interactions)

(213751, 1044830)

##### 3.2 Actual Model

In [32]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(all_interactions[['user_id', 'recipe_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [33]:
param_distributions = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30, 50],
    'lr_all': [0.005, 0.01, 0.02],
    'reg_all': [0.02, 0.1, 0.4]
}

# Specify a lower number of iterations
rs = RandomizedSearchCV(
    SVD, 
    param_distributions, 
    measures=['rmse'], 
    cv=3,
    n_iter=10
)
rs.fit(data)
print("Best RMSE score:", rs.best_score['rmse'])
print("Best parameters:", rs.best_params['rmse'])

Best RMSE score: 1.2075001456080559
Best parameters: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}


In [34]:
best_params = rs.best_params['rmse']
model3 = SVD(**best_params)
model3.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21681b14b90>

In [35]:
predictions = model3.test(testset)
test_mse = accuracy.mse(predictions, verbose=True)
print(f"Test MSE: {test_mse}")

MSE: 1.4429
Test MSE: 1.4429245345548554


In [36]:
save_model(model1, 'svd_overall_recommendation.pkl')

Model saved to svd_overall_recommendation.pkl


In [37]:
def recommend_good_recipes(user_id, model, all_items, df_quick_recipes, df_user_interactions, top_n=5):
    already_rated = df_user_interactions[df_user_interactions['user_id'] == user_id]['recipe_id']
    to_predict = [item for item in all_items if item not in already_rated]
    recommendations = [(item, model.predict(user_id, item).est) for item in to_predict]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]
    recommendations_df = pd.DataFrame(recommendations, columns=['id', 'predicted_rating'])
    recommendations_df = recommendations_df.merge(
        df_quick_recipes[['id', 'name','calories', 'minutes']],
        on='id',
        how='inner'
    )
    return recommendations_df

all_items = all_recipes['id'].unique()
user_id = 1

# Get top 5 recommendations
top_recommendations = recommend_good_recipes(user_id, model3, all_items, all_recipes, all_interactions, top_n=5)

# Display the top recommendations
print("Top 5 recommendations:")
print(top_recommendations)

Top 5 recommendations:
       id  predicted_rating                                 name  calories  \
0  486261          4.918660                mexican stack up  rsc     793.0   
1  524863          4.918125                     ragu shuka  ragu     337.6   
2  487111          4.912318  back porch bayou shrimp   corn  rsc     404.4   
3  495202          4.880247          crunchy valley chicken  rsc     598.4   
4   33995          4.863819                       vegan truffles      11.8   

   minutes  
0       75  
1       40  
2       45  
3       55  
4      140  
