In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import gzip
from collections import defaultdict
import numpy as np

In [11]:
# Load the raw interaction log and the recipe metadata from the local data/ directory.
interactions = pd.read_csv('data/RAW_interactions.csv')
recipes = pd.read_csv('data/RAW_recipes.csv')

In [19]:
# Display the interaction log to inspect its columns; the rendered table shows
# user_id, recipe_id, date, rating and review for each row.
interactions

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...,...,...
1132362,116593,72730,2003-12-09,0,Another approach is to start making sauce with...
1132363,583662,386618,2009-09-29,5,These were so delicious! My husband and I tru...
1132364,157126,78003,2008-06-23,5,WOW! Sometimes I don't take the time to rate ...
1132365,53932,78003,2009-01-11,4,Very good! I used regular port as well. The ...


In [53]:
# Shuffle the interaction rows with a fixed seed, then cut them 80/10/10 into
# train / validation / test partitions.
np.random.seed(1)
interaction_array = interactions.to_numpy()
np.random.shuffle(interaction_array)

n_interactions = len(interaction_array)
train_end = n_interactions * 8 // 10
valid_end = n_interactions * 9 // 10
interaction_train = interaction_array[:train_end]
interaction_valid = interaction_array[train_end:valid_end]
interaction_test = interaction_array[valid_end:]

# Lookup tables built from the training partition only:
#   ratingsPerUser[user]     -> list of (recipe, rating)
#   ratingsPerRecipe[recipe] -> list of (user, rating)
#   recipesPerUser[user]     -> set of recipes the user interacted with
#   usersPerRecipe[recipe]   -> set of users who interacted with the recipe
#   all_ratings_per_pair     -> rating stored for each (user, recipe) pair
ratingsPerUser, ratingsPerRecipe = defaultdict(list), defaultdict(list)
recipesPerUser, usersPerRecipe = defaultdict(set), defaultdict(set)
all_ratings_per_pair = defaultdict(int)
for user_id, recipe_id, date, rating, review in interaction_train:
    all_ratings_per_pair[(user_id, recipe_id)] = rating
    recipesPerUser[user_id].add(recipe_id)
    usersPerRecipe[recipe_id].add(user_id)
    ratingsPerUser[user_id].append((recipe_id, rating))
    ratingsPerRecipe[recipe_id].append((user_id, rating))

In [54]:
# ratingsTrain = allRatings[:190000]
# ratingsValid = allRatings[190000:]
# ratingsPerUser = defaultdict(list)
# ratingsPerItem = defaultdict(list)
# itemsPerUser = defaultdict(set)
# usersPerItem = defaultdict(set)
# all_ratings_per_pair = defaultdict(int)
# for u,b,r in ratingsTrain:
#     ratingsPerUser[u].append((b,r))
#     ratingsPerItem[b].append((u,r))
#     itemsPerUser[u].add(b)
#     usersPerItem[b].add(u)
#     all_ratings_per_pair[(u,b)] = r

In [55]:
def train_latent_factor(reg, threshold, max_iterations=1000):
    """Fit the bias-only rating model ``alpha + beta_user + beta_recipe``.

    Each iteration applies closed-form updates to every recipe offset, then
    every user offset, then the global offset, and afterwards re-evaluates the
    regularised sum of squared errors on the training rows.  Training stops
    once that loss changes by at most ``threshold`` between two consecutive
    iterations, or after ``max_iterations`` iterations.

    The function reads the notebook-level ``interaction_train``,
    ``interaction_valid``, ``ratingsPerRecipe``, ``ratingsPerUser`` and
    ``all_ratings_per_pair`` objects.

    Parameters
    ----------
    reg : float
        Regularisation strength applied to the user and recipe offsets.
    threshold : float
        Minimum absolute change of the regularised training loss required to
        run another iteration.
    max_iterations : int or None, optional
        Upper bound on the number of iterations; ``None`` disables the bound.

    Returns
    -------
    tuple
        ``([global_alpha, user_beta_param, recipe_beta_param],
        training_iteration, all_losses, all_valid_mses)`` where the two lists
        hold the training loss and validation MSE after every iteration.
    """
    user_beta_param = defaultdict(int)
    recipe_beta_param = defaultdict(int)
    global_alpha = 0

    def predict(user_id, recipe_id):
        # .get() instead of indexing: indexing a defaultdict inserts the key,
        # which would add ids that only occur in the evaluated dataset to the
        # parameter tables.  Unknown ids simply contribute an offset of 0.
        return global_alpha + user_beta_param.get(user_id, 0) + recipe_beta_param.get(recipe_id, 0)

    def squared_errors(dataset):
        return [np.square(predict(user_id, recipe_id) - rating)
                for user_id, recipe_id, date, rating, review in dataset]

    def calculate_mse(dataset):
        return np.mean(squared_errors(dataset))

    def calculate_sse_reg(dataset, reg):
        penalty = reg * (np.sum(np.square(list(recipe_beta_param.values())))
                         + np.sum(np.square(list(user_beta_param.values()))))
        return np.sum(squared_errors(dataset)) + penalty

    training_iteration = 0
    all_losses = []
    all_valid_mses = []
    prev_loss = float('inf')
    current_loss = calculate_sse_reg(interaction_train, reg)
    while np.abs(prev_loss - current_loss) > threshold:
        if max_iterations is not None and training_iteration >= max_iterations:
            # Guard against a loss that never settles within the threshold.
            print(f"Stopped after {training_iteration} iterations without reaching the threshold.")
            break
        training_iteration += 1

        # Recipe offsets, holding the global and user offsets fixed.
        for recipe, users_rating_for_this_recipe in ratingsPerRecipe.items():
            denominator = reg + len(users_rating_for_this_recipe)
            residual_sum = np.sum([rating - (global_alpha + user_beta_param.get(user, 0))
                                   for user, rating in users_rating_for_this_recipe])
            recipe_beta_param[recipe] = residual_sum / denominator

        # User offsets, holding the global and recipe offsets fixed.
        for user, recipes_rating_for_this_user in ratingsPerUser.items():
            denominator = reg + len(recipes_rating_for_this_user)
            residual_sum = np.sum([rating - (global_alpha + recipe_beta_param.get(recipe, 0))
                                   for recipe, rating in recipes_rating_for_this_user])
            user_beta_param[user] = residual_sum / denominator

        # Global offset: mean residual over the stored (user, recipe) pairs.
        global_alpha = np.sum([all_ratings_per_pair[(user, recipe)] - (user_beta_param[user] + recipe_beta_param[recipe])
                               for user, recipe in all_ratings_per_pair]) / len(all_ratings_per_pair)

        prev_loss = current_loss
        current_loss = calculate_sse_reg(interaction_train, reg)
        valid_mse = calculate_mse(interaction_valid)
        all_losses.append(current_loss)
        all_valid_mses.append(valid_mse)
        if training_iteration % 5 == 1:
            print(f"Iteration {training_iteration}:")
            print(f"SSE Loss: {current_loss}")
            print(f"Valid MSE: {valid_mse}")
    return [global_alpha, user_beta_param, recipe_beta_param], training_iteration, all_losses, all_valid_mses

In [56]:
# Train one model per regularisation strength (2.5, 3.0, ..., 4.5) and keep
# every run's full result tuple so the best one can be selected afterwards.
# NOTE(review): in the recorded output the final validation MSE is still
# decreasing at the largest value tried, so a wider grid may be worth checking.
reg_params = [tenths / 10 for tenths in range(25, 50, 5)]
latent_factor_infos = []
for reg_param in reg_params:
    print(f"Reg Param: {reg_param}")
    run_info = train_latent_factor(reg_param, 1000)
    latent_factor_infos.append(run_info)
    print()
# Final validation MSE of each run, in grid order.
[info[-1][-1] for info in latent_factor_infos]

Reg Param: 2.5
Iteration 1:
SSE Loss: 4233501.717655072
Valid MSE: 3.012183452625178
Iteration 6:
SSE Loss: 1563273.0266886025
Valid MSE: 1.8965204279018224
Iteration 11:
SSE Loss: 1127214.0393039365
Valid MSE: 1.5485434153958948
Iteration 16:
SSE Loss: 994292.0075342997
Valid MSE: 1.4460280180900087
Iteration 21:
SSE Loss: 961933.638751481
Valid MSE: 1.4286149711305491

Reg Param: 3.0
Iteration 1:
SSE Loss: 4480432.577671228
Valid MSE: 3.0252189527515227
Iteration 6:
SSE Loss: 1567105.716484117
Valid MSE: 1.8496071111695445
Iteration 11:
SSE Loss: 1125715.016334013
Valid MSE: 1.5095013436262013
Iteration 16:
SSE Loss: 1014031.528723923
Valid MSE: 1.4320332960031255
Iteration 21:
SSE Loss: 991617.8316501476
Valid MSE: 1.423608756614058

Reg Param: 3.5
Iteration 1:
SSE Loss: 4680744.825050358
Valid MSE: 3.0400836324619753
Iteration 6:
SSE Loss: 1563448.0999027495
Valid MSE: 1.8081420284707415
Iteration 11:
SSE Loss: 1126124.1127629024
Valid MSE: 1.48276547761154
Iteration 16:
SSE Loss: 

[1.4283450485066866,
 1.4244431684470615,
 1.4215352684714353,
 1.4199220546082472,
 1.418888557027016]

In [57]:
# Pick the run whose last validation MSE is lowest and unpack its results.
all_mses = [info[-1][-1] for info in latent_factor_infos]
best_index = np.argmin(all_mses)
lamb = reg_params[best_index]
(best_params,
 best_num_training_iteration,
 best_all_losses,
 best_all_valid_mses) = latent_factor_infos[best_index]
validMSE = best_all_valid_mses[-1]

In [58]:
def calculate_mse(dataset, alpha, user_beta, recipe_beta):
    """Return the mean squared error of ``alpha + user_beta + recipe_beta``.

    Parameters
    ----------
    dataset : iterable
        Rows that unpack as ``(user_id, recipe_id, date, rating, review)``.
    alpha : float
        Global offset added to every prediction.
    user_beta : mapping
        Per-user offsets keyed by user id.
    recipe_beta : mapping
        Per-recipe offsets keyed by recipe id.

    Returns
    -------
    float
        Mean of the squared differences between prediction and rating.

    Ids missing from ``user_beta`` / ``recipe_beta`` contribute an offset of 0.
    They are looked up with ``.get`` so that a plain ``dict`` does not raise
    ``KeyError`` and a ``defaultdict`` is not grown as a side effect.
    """
    squared_errors = [
        np.square(alpha + user_beta.get(user_id, 0) + recipe_beta.get(recipe_id, 0) - rating)
        for user_id, recipe_id, date, rating, review in dataset
    ]
    return np.mean(squared_errors)

In [59]:
# Evaluate the selected model on the held-out test partition; the cell's last
# expression is the test MSE.
predictions = []
(best_alpha,
 best_user_beta_param,
 best_recipe_beta_param) = best_params
calculate_mse(
    interaction_test,
    alpha=best_alpha,
    user_beta=best_user_beta_param,
    recipe_beta=best_recipe_beta_param,
)

1.4411919882421653