# Data Preparation

In [None]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from collections import defaultdict
from surprise.model_selection import KFold
import numpy as np

In [None]:
recipes = pd.read_csv("data/RAW_recipes.csv")
ratings = pd.read_csv("data/RAW_interactions.csv")

print(f"recipes shape: {recipes.shape} and headings: {recipes.columns}")
print(f"ratings shape: {ratings.shape} and headings: {ratings.columns}")

Create the ratings matrix R

- each recipe has an id
for each recipe make a new row
- each review is associated with a recipe_id and was made by a user
for each review, find the row and store the rating in the column for the user. if the column doesn't exist then make a new one.

There's a lot of looking up so hopefully a dictionary like structure can be used with fast random access.

In [None]:
# Create CF ratings matrix R
df_UI = ratings.merge(recipes, how='left', left_on='recipe_id', right_on = 'id')

df_UI = df_UI[['user_id', 'recipe_id', 'rating']]

df_UI.head()

# Matrix Formation

In [None]:
reader = Reader(rating_scale=(0, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_UI[['user_id', 'recipe_id', 'rating']], reader)

# Matrix Factorisation

## Helper Functions

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

## SVD

In [None]:
model = SVD(verbose=True)

In [None]:
kf = KFold(n_splits=5)
precision_kfold = []
recall_kfold = []

for trainset, testset in kf.split(data):
    model.fit(trainset)
    predictions = model.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=3.5)

    # Precision and recall can then be averaged over all users
    precision_kfold.append(sum(prec for prec in precisions.values()) / len(precisions))
    recall_kfold.append(sum(rec for rec in recalls.values()) / len(recalls))

# Producing Recommendations

In [None]:
recipes_names = dict([(rep_id,name) for name,rep_id in zip(recipes.name,recipes.id)])

# print(recipes_names.keys())

In [35]:
r_ids = [23, 56, 34, 111]; uid = 226571; n = 10

uid = 3

# def get_n_predictions(iids, uid, model, n = 10):
iid_to_test = [r_id for r_id in range(231637) if r_id not in r_ids]
test_set = [[uid,r_id,4.] for r_id in iid_to_test]

predictions = model.test(test_set)
pred_ratings = [pred.est for pred in predictions]

# return top_n indices
top_n = np.argpartition(pred_ratings,1)[-n:]
print(top_n)
# results = [recipes_names[i] for i in top_n]
# return results

[231623 231624 231625 231626 231627 231628 231629 231630 231631 231632]


In [36]:
type(predictions)
predictions[100]

Prediction(uid=3, iid=103, r_ui=4.0, est=4.411162895438098, details={'was_impossible': False})