## Imports and data loading

In [None]:
import math
import random
import time

import pandas as pd
import numpy as np

from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity

from sparselsh import LSH

## Content-based, regression

In [None]:
# Load the food.com dataset: the user-recipe ratings and the recipe table.
directory = 'data/food.com'
ratings_csv = f'{directory}/recipe_ratings.csv'
recipes_csv = f'{directory}/recipe.csv'
df_recipe_rating = pd.read_csv(ratings_csv)
df_recipe = pd.read_csv(recipes_csv)

In [None]:
# preview the first rows of the recipe table
df_recipe.head()

In [None]:
# display the user-recipe ratings table
df_recipe_rating

In [None]:
# EXERCISE: Build a content-based recommender system that uses linear regression
#           to predict ratings.
#           Try it out on users with a high number of ratings.
#           Use a train-test split to evaluate performance.

# select the ratings of a specific user and attach the recipe columns to each rated recipe
target_id = ...  # placeholder: replace with the 'user_id' value of the user to model
df_user = df_recipe_rating[df_recipe_rating['user_id']==target_id][['recipe_id','rating']]
# inner join: keeps only rated recipes that also appear in df_recipe
df_rec = pd.merge(df_user, df_recipe, on='recipe_id', how='inner')

# define features (placeholder; presumably the df_rec columns used as regression inputs)
features = [...]

# split df_rec into a training and a test set
...

# fit the regression on the training set, evaluate it on the test set
...

## Content-based, KNN (with LSH)

In [None]:
# MovieLens dataset location (the small variant by default)
directory = 'data/movielens/ml-latest-small'
#directory = 'data/movielens/ml-latest' #change into this one for the full dataset (slow)

df_movies = pd.read_csv(f'{directory}/movies.csv')
df_ratings = pd.read_csv(f'{directory}/ratings.csv')
df_tags = pd.read_csv(f'{directory}/tags.csv')


def _normalize_tag(raw_tag):
    """Return the tag as a lower-case, single-word token (spaces become underscores)."""
    return str(raw_tag).lower().replace(' ', '_')


# normalise every tag so that a multi-word tag is later treated as one token
df_tags['tag'] = df_tags['tag'].map(_normalize_tag)

In [None]:
# preview the first rows of the movies table
df_movies.head()

In [None]:
# preview the first rows of the (normalised) tags table
df_tags.head()

### Step 1: Calculate item profiles

In [None]:
# Build the lexicon of the most frequent tags.
tag_frequency_threshold = 5 # increase this number to filter out more tags
df_lexicon = ... # placeholder: get a dataframe with tags and respective counts

# placeholder: discard movies with no tags
...

# placeholder: drop the userId and timestamp columns of df_tags, because we don't care
# who assigned the tag and when
...

In [None]:
# Calculate the sparse feature vector of every movie from the TF-IDF of its tags.
# The TF-IDF vectors are saved as sparse (1 x vocabulary size) matrices in the dataframe.

# One "document" per movie: all of its tags joined by spaces.
# Only the 'tag' column is aggregated, so this also works when other columns
# (e.g. userId, timestamp) are still present in df_tags.
df_features = df_tags.groupby('movieId')['tag'].agg(' '.join).reset_index()

# Tags are already single tokens, so tokenising is a plain split on spaces.
# token_pattern=None avoids sklearn's warning that the default pattern is unused
# when a custom tokenizer is supplied. The document order does not affect the
# fitted vocabulary or IDF weights, so no sorting is needed before fitting.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(' '), token_pattern=None).fit(df_features['tag'])

# Transform all documents in a single call (much faster than one call per movie),
# then store each row as its own 1 x vocabulary sparse matrix.
tfidf_matrix = vectorizer.transform(df_features['tag'])
feature_vectors = np.empty(tfidf_matrix.shape[0], dtype=object)
for row_idx in range(tfidf_matrix.shape[0]):
    feature_vectors[row_idx] = tfidf_matrix[row_idx:row_idx + 1]  # slice keeps the row 2-D
df_features['feature_vector'] = feature_vectors
df_features

### Step 2: Index item profiles into LSH

In [None]:
# index all item vectors into LSH
# (placeholder: fill in the LSH constructor arguments, then index the movie feature vectors)
lsh = LSH(...)

# run an example query to the LSH (placeholder: fill in the query arguments)
lsh.query(...)

### Step 3: Calculate user profile

In [None]:
# Optionally restrict the ratings to the most popular movies
# (not needed for the content-based approach).
numratings_threshold = 0 #increase this number if you want to filter

# number of ratings per movie, most-rated movies first
df_item_popularity = (
    df_ratings.groupby('movieId')['rating']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
is_popular = df_item_popularity['count'] >= numratings_threshold
df_item_popularity = df_item_popularity[is_popular]
print(f'Number of movies reduced from {len(df_ratings.movieId.unique())} to {len(df_item_popularity.movieId.unique())}')

# keep only the ratings of the retained movies, ordered by user
df_ratings = (
    df_ratings.merge(df_item_popularity, on='movieId', how='inner')
    [['userId', 'movieId', 'rating']]
    .sort_values(by='userId')
)

#rescale the ratings by the user's individual average (placeholder, to be filled in)
df_ratings['rating_scaled'] = ...

df_ratings.head()

In [None]:
# join ratings with movie feature vectors
# (pd.merge defaults to an inner join, so ratings of movies without a feature vector are dropped)
df_profile = pd.merge(df_ratings, df_features[['movieId','feature_vector']],
              on='movieId')
# scale each movie's feature vector by the user's rescaled rating (this will take a few minutes)
# NOTE: requires 'rating_scaled' to hold numeric values, i.e. the placeholder above must be filled in
df_profile['feature_vector_scaled'] = df_profile['rating_scaled'] * df_profile['feature_vector']
df_profile

In [None]:
def _mean_of_stored_entries(stacked):
    """Average the rows of a stacked sparse matrix column by column.

    Only stored (non-zero) entries count towards a column's average; columns
    without any stored entry get 0. Returns a 1 x n_columns csr_matrix.
    """
    # Columns with no stored entry produce 0/0 = NaN; silence the warning and
    # replace those NaNs explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        column_means = stacked.sum(axis=0) / stacked.getnnz(axis=0)
    # nan= must be passed by keyword: the second positional argument of
    # np.nan_to_num is `copy`, not the replacement value.
    return csr_matrix(np.nan_to_num(column_means, nan=0.0))


# perf_counter is a monotonic clock, which is the appropriate one for measuring elapsed time
start = time.perf_counter()
#stack all sparse vectors of user's movies
df_user_vectors = df_profile[['userId', 'feature_vector_scaled']].groupby('userId').agg(sparse.vstack).reset_index()
#compute the average of the vectors without considering the zero entries (this will take a while)
df_user_vectors['feature_vector_scaled'] = df_user_vectors['feature_vector_scaled'].apply(_mean_of_stored_entries)
end = time.perf_counter()
print(end - start)
df_user_vectors

### Step 4: Rank potential recommendation candidates

In [None]:
#pick a target user to provide recommendations to
idx = 42
target_userId = df_user_vectors.iloc[idx].userId

In [None]:
#get user rating history
df_user_history = ...

#select candidate recommendations to user
df_recommendation = ...

In [None]:
# display the candidate recommendations
df_recommendation

In [None]:
# first 10 rows of the target user's rating history
df_user_history.head(10)

In [None]:
# last 10 rows of the target user's rating history
df_user_history.tail(10)

### Step 5: Predict ratings of candidate items

In [None]:
#index all user vectors into LSH
df_usr = df_profile[df_profile['userId'] == target_userId]
lsh_usr = LSH(...)
lsh_usr.index(..., extra_data=[...]) # repeat for all users. Insert movieid and rating as extra data for future retrieval
lsh_usr

In [None]:
# compute the recommendations (placeholder, to be filled in)
# NOTE: this rebinds df_recommendation, replacing the candidate frame assigned earlier in the notebook
df_recommendation = ...

## Collaborative filtering

In [None]:
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.knns import KNNBasic

In [None]:
# Reload the raw MovieLens ratings for the collaborative-filtering part;
# this rebinds df_ratings to the unmodified contents of the CSV file.
directory = 'data/movielens/ml-latest-small'
#directory = 'data/movielens/ml-latest' #change into this one for the full dataset (slow)
ratings_csv = f'{directory}/ratings.csv'
df_ratings = pd.read_csv(ratings_csv)
df_ratings.head()

In [None]:
# initialize a data reader
# The rating scale is derived from the data instead of being hard-coded to (1, 5):
# MovieLens ratings use half-star steps and can be as low as 0.5, and surprise
# uses the declared scale to bound its predictions.
rating_scale = (df_ratings['rating'].min(), df_ratings['rating'].max())
reader = Reader(rating_scale=rating_scale)
# provide a dataset with user id, item id, and rating, in this order
data = Dataset.load_from_df(df_ratings[['userId','movieId','rating']], reader)

# surprise has also some built-in datasets that can be imported directly
#data = Dataset.load_builtin('ml-100k')

In [None]:
# initialize a user-based K nearest neighbors implementation
# (placeholder; KNNBasic is already imported from surprise)
...
# execute 5-fold cross-validation and measure RMSE and MAE
# (placeholder; cross_validate is already imported from surprise.model_selection)
...