## Imports and data loading

In [378]:
import math
import random
import time

import pandas as pd
import numpy as np

from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity

from sparselsh import LSH

## Content-based, regression

In [379]:
# load food.com data
directory = 'data/food.com'
df_recipe_rating = pd.read_csv(f'{directory}/recipe_ratings.csv')
df_recipe = pd.read_csv(f'{directory}/recipe.csv')

In [380]:
df_recipe.head()

Unnamed: 0,recipe_id,name,minutes,n_steps,n_ingredients,calories,fat,sugar,sodium,protein,saturated,carbs
0,137739,arriba baked winter squash mexican style,55,11,7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,a bit different breakfast pizza,30,9,6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,all in the kitchen chili,130,6,13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,alouette potatoes,45,11,11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,amish tomato ketchup for canning,190,5,8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [381]:
df_recipe_rating

Unnamed: 0,user_id,recipe_id,rating
0,2046,4684,5.0
1,2046,517,5.0
2,1773,7435,5.0
3,1773,278,4.0
4,2046,3431,5.0
...,...,...,...
698896,926904,457971,5.0
698897,2002312797,27208,5.0
698898,1290903,131607,5.0
698899,226867,363072,5.0


In [382]:
#EXERCISE: Build a content-based recommender system that uses linear regression 
#          to predict ratings.
#          Try it out on users with a high number of ratings.
#          Try some train-test split to evaluate performance.

#select the ratings of a specific user 


sorted_users = df_recipe_rating.groupby(["user_id"]).agg(count=("user_id", "count")).sort_values("count", ascending=False)
target_id = sorted_users.index[0]
df_user = df_recipe_rating[df_recipe_rating['user_id']==target_id][['recipe_id','rating']]
df_rec = pd.merge(df_user, df_recipe, on='recipe_id', how='inner')



# define features
features = ["minutes", 
            "n_steps", 
            "n_ingredients", 
            "calories", 
            "fat", 
            "sugar",
            "sodium",
            "protein",
            "saturated",
            "carbs"
        ] 

# split training and test
x = df_rec.drop(["recipe_id" , "name"], axis=1)
y = df_rec["rating"]
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True)


model = LinearRegression();

predicted = model.fit(x_train, y_train).predict(x_test)


expected = []
for index, row in x_test.iterrows():
    expected.append(row['rating'])

mse = (np.square(expected - predicted)).mean()

print(f"mse:  {mse}")

# ...

mse:  5.967977230266869e-31


## Content-based, KNN (with LSH)

In [383]:
directory = 'data/movielens/ml-latest-small'
#directory = 'data/movielens/ml-latest' #change into this one for the full dataset (slow)

df_movies = pd.read_csv(f'{directory}/movies.csv')
df_ratings = pd.read_csv(f'{directory}/ratings.csv')
df_tags = pd.read_csv(f'{directory}/tags.csv')

#transform tags such that they are lower-case, single-word tokens
df_tags['tag'] = df_tags['tag'].apply(lambda x: str(x).lower().replace(' ', '_'))

In [384]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [385]:
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,highly_quotable,1445714996
2,2,60756,will_ferrell,1445714992
3,2,89774,boxing_story,1445715207
4,2,89774,mma,1445715200


### Step1: Calculate item profiles

In [386]:
# calculates the lexicon of most frequent tags.
tag_frequency_threshold = 5 # increase number to filter
# sorted_users = df_recipe_rating.groupby(["user_id"]).agg(count=("user_id", "count")).sort_values("count", ascending=False)
df_lexicon = df_tags.groupby("tag").agg(count=("tag", "count")) # get a dataframe with tags and respective counts
# discard movies with no tags
item_profiles = df_movies[df_movies["movieId"].isin(df_tags["movieId"])]
item_profiles
df_lexicon
# you can drop the userId and timestamp columns because we don't care who assigned the tag and when
# ...

Unnamed: 0_level_0,count
tag,Unnamed: 1_level_1
"""artsy""",1
06_oscar_nominated_best_movie_-_animation,3
1900s,1
1920s,2
1950s,2
...,...
younger_men,1
zither,1
zoe_kazan,1
zombies,6


In [387]:
#calculate the sparse feature vector based on the TF-IDF of words in documents
#the TF-IDF vectors are saved as sparse representations into the dataframe
df_features = df_tags.groupby('movieId').agg(lambda x: ' '.join(x)).reset_index()
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(' ')).fit(sorted(df_features['tag']))
vectorizer.vocabulary_
df_features['feature_vector'] = df_features['tag'].apply(lambda x : vectorizer.transform([x]))
df_features

  df_features = df_tags.groupby('movieId').agg(lambda x: ' '.join(x)).reset_index()


Unnamed: 0,movieId,tag,feature_vector
0,1,pixar pixar fun,"(0, 1016)\t0.8992312620852461\n (0, 513)\t0..."
1,2,fantasy magic_board_game robin_williams game,"(0, 1129)\t0.48431677463057304\n (0, 812)\t..."
2,3,moldy old,"(0, 967)\t0.7071067811865475\n (0, 876)\t0...."
3,5,pregnancy remake,"(0, 1106)\t0.6808321878901952\n (0, 1044)\t..."
4,7,remake,"(0, 1106)\t1.0"
...,...,...,...
1567,183611,comedy funny rachel_mcadams,"(0, 1084)\t0.7153847877740599\n (0, 515)\t0..."
1568,184471,adventure alicia_vikander video_game_adaptation,"(0, 1418)\t0.6173008972335793\n (0, 46)\t0...."
1569,187593,josh_brolin ryan_reynolds sarcasm,"(0, 1156)\t0.5453455400986055\n (0, 1147)\t..."
1570,187595,emilia_clarke star_wars,"(0, 1266)\t0.6876494487057803\n (0, 433)\t0..."


### Step2: Index item profiles into LSH

In [388]:
m = sparse.vstack(df_features["feature_vector"])
#index all item vectors into LSH
lsh = LSH(4, m.shape[1], num_hashtables=2, storage_config={"dict":None})
#run an example query to the LSH
print(df_features["movieId"].shape)
print(m.shape)
lsh.index(m, df_features["movieId"])
query = np.zeros(m.shape[1])
query[0] = 1
lsh.query(csr_matrix(query))

(1572,)
(1572, 1475)


[((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 9 stored elements in Compressed Sparse Row format>,
   1258),
  1.9999999999999996),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 3 stored elements in Compressed Sparse Row format>,
   8783),
  1.9999999999999996),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 7 stored elements in Compressed Sparse Row format>,
   135133),
  1.9999999999999996),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 5 stored elements in Compressed Sparse Row format>,
   6669),
  1.9999999999999996),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 2 stored elements in Compressed Sparse Row format>,
   152077),
  1.9999999999999996),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 2 stored elements in Compressed Sparse Row format>,
   4164),
  1.9999999999999998),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 2 stored e

### Step 3: Calculate user profile

In [389]:
# restricts the ratings to the set of most popular movies (optional, not needed for content-based)
numratings_threshold = 0 #increase this number if you want to filter
df_item_popularity = df_ratings[['movieId','rating']].groupby('movieId').count().reset_index()
df_item_popularity.columns = ['movieId','count'] 
df_item_popularity = df_item_popularity.sort_values(by='count', ascending=False)
df_item_popularity = df_item_popularity[df_item_popularity['count'] >= numratings_threshold]
print(f'Number of movies reduced from {len(df_ratings.movieId.unique())} to {len(df_item_popularity.movieId.unique())}')
df_ratings = pd.merge(df_ratings, df_item_popularity, on='movieId', how='inner')[['userId', 'movieId', 'rating']]
df_ratings = df_ratings.sort_values(by='userId')

# ----- SLIDE 12 FROM LECTURE 06 ------
# rescale the ratings by the user's individual average 
# We do this by taking the average of the users rating and subtract the average with that movie rating. In other words we do a simple normalization
mu = df_ratings.groupby("userId").agg(avg=("rating", "mean"), stdev=("rating","std")).reset_index()
df_ratings = df_ratings.merge(mu, on='userId')
df_ratings['rating_scaled'] = (df_ratings["rating"] - df_ratings["avg"]) / df_ratings["stdev"]
df_ratings = df_ratings.drop(columns=["avg", "stdev"]) 

preprocessing.minmax_scale(df_ratings)
df_ratings.pivot_table(index='userId', columns='movieId', values='rating')

Number of movies reduced from 9724 to 9724


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [390]:
# join ratings with movie feature vectors
df_profile = pd.merge(df_ratings, df_features[['movieId','feature_vector']],
              on='movieId')
#scaling feature vector by rating (this will take a few minutes)
df_profile['feature_vector_scaled'] = df_profile['rating_scaled'] * df_profile['feature_vector']
df_profile

Unnamed: 0,userId,movieId,rating,rating_scaled,feature_vector,feature_vector_scaled
0,1,1,4.0,-0.457947,"(0, 1016)\t0.8992312620852461\n (0, 513)\t0...","(0, 1016)\t-0.411799929981753\n (0, 513)\t-..."
1,5,1,4.0,0.367146,"(0, 1016)\t0.8992312620852461\n (0, 513)\t0...","(0, 1016)\t0.33014922576704736\n (0, 513)\t..."
2,7,1,4.5,0.954981,"(0, 1016)\t0.8992312620852461\n (0, 513)\t0...","(0, 1016)\t0.8587487567457724\n (0, 513)\t0..."
3,15,1,2.5,-0.836549,"(0, 1016)\t0.8992312620852461\n (0, 513)\t0...","(0, 1016)\t-0.7522511071309106\n (0, 513)\t..."
4,17,1,4.5,0.571252,"(0, 1016)\t0.8992312620852461\n (0, 513)\t0...","(0, 1016)\t0.5136879287900765\n (0, 513)\t0..."
...,...,...,...,...,...,...
48282,610,104944,4.0,0.363233,"(0, 1367)\t0.4686023497730619\n (0, 848)\t0...","(0, 1367)\t0.17021181211818398\n (0, 848)\t..."
48283,567,117877,3.0,0.617935,"(0, 1379)\t0.772855402005405\n (0, 1009)\t0...","(0, 1379)\t0.47757417357918713\n (0, 1009)\..."
48284,594,7023,4.5,0.423521,"(0, 660)\t1.0","(0, 660)\t0.4235214565103843"
48285,606,6107,4.0,0.473127,"(0, 1466)\t1.0","(0, 1466)\t0.4731269122480648"


In [391]:
start = time.time()
#stack all sparse vectors of user's movies
df_user_vectors = df_profile[['userId', 'feature_vector_scaled']].groupby('userId').agg(sparse.vstack).reset_index()
#compute the average of the vectors without considering the zero entries (this will take a while)
df_user_vectors['feature_vector_scaled'] = df_user_vectors['feature_vector_scaled'].apply(lambda x: csr_matrix(np.nan_to_num(x.sum(axis=0)/x.getnnz(axis=0), 0)))
end = time.time()
print(end - start)
df_user_vectors

0.7831761837005615


Unnamed: 0,userId,feature_vector_scaled
0,1,"(0, 7)\t-0.4579466343583536\n (0, 8)\t-0.13..."
1,2,"(0, 23)\t-0.05716786218232697\n (0, 52)\t0...."
2,3,"(0, 5)\t-0.5820277420256257\n (0, 15)\t-0.9..."
3,4,"(0, 3)\t0.7927255654862778\n (0, 4)\t0.8544..."
4,5,"(0, 8)\t0.10538570667994816\n (0, 21)\t0.10..."
...,...,...
605,606,"(0, 0)\t-0.10938483543421161\n (0, 1)\t0.33..."
606,607,"(0, 3)\t0.15976445552714327\n (0, 8)\t-0.06..."
607,608,"(0, 7)\t-0.5876012286900908\n (0, 8)\t0.132..."
608,609,"(0, 8)\t0.12406344272438276\n (0, 21)\t0.12..."


### Step 4: Rank potential recommendation candidates

In [392]:
#pick a target user to provide recommendations to
idx = 42
target_userId = df_user_vectors.iloc[idx].userId

In [393]:
#get user rating history
df_user_history = df_ratings.loc[df_ratings["userId"] == target_userId]
#select candidate recommendations to user

# ----- SLIDE 14 FROM LECTURE 06 ------
query = df_user_vectors.merge(df_user_history, on="userId")["feature_vector_scaled"][0]

def my_filter(x):
    (point, label), dist = x
    return not df_user_history.loc[df_user_history["movieId"] == label]["movieId"].any()
    
df_recommendation = list(filter(my_filter, lsh.query(query)))

# 5
# 610,Heavy Metal (1981),Action|Adventure|Animation|Horror|Sci-Fi
# 828,"Adventures of Pinocchio, The (1996)",Adventure|Children
# 367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
# 261,Little Women (1994),Drama
# 158,Casper (1995),Adventure|Children
# 4
# 208,Waterworld (1995),Action|Adventure|Sci-Fi
# 2572,10 Things I Hate About You (1999),Comedy|Romance

In [394]:
df_recommendation

[((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 1 stored elements in Compressed Sparse Row format>,
   2355),
  56.778411120497694),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 7 stored elements in Compressed Sparse Row format>,
   3114),
  57.031468311336916),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 2 stored elements in Compressed Sparse Row format>,
   4546),
  57.04121143243113),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 2 stored elements in Compressed Sparse Row format>,
   1010),
  57.16259861148298),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 1 stored elements in Compressed Sparse Row format>,
   907),
  57.22853647412561),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 1 stored elements in Compressed Sparse Row format>,
   8424),
  57.22853647412561),
 ((<1x1475 sparse matrix of type '<class 'numpy.float64'>'
   	with 1 stored elements i

In [395]:
df_user_history.head(10)

Unnamed: 0,userId,movieId,rating,rating_scaled
6315,43,344,4.0,-0.813215
6316,43,592,5.0,0.658317
6317,43,810,5.0,0.658317
6318,43,442,5.0,0.658317
6319,43,185,4.0,-0.813215
6320,43,292,5.0,0.658317
6321,43,165,5.0,0.658317
6322,43,1084,5.0,0.658317
6323,43,500,5.0,0.658317
6324,43,590,4.0,-0.813215


In [396]:
df_user_history.tail(10)

Unnamed: 0,userId,movieId,rating,rating_scaled
6419,43,11,4.0,-0.813215
6420,43,1105,5.0,0.658317
6421,43,47,4.0,-0.813215
6422,43,610,5.0,0.658317
6423,43,596,5.0,0.658317
6424,43,588,5.0,0.658317
6425,43,595,5.0,0.658317
6426,43,532,4.0,-0.813215
6427,43,454,4.0,-0.813215
6428,43,271,5.0,0.658317


### Step 5: Predict ratings of candidate items

In [397]:
#index all user vectors into LSH
df_usr = df_profile[df_profile['userId'] == target_userId]
m = sparse.vstack(df_usr["feature_vector_scaled"])
lsh_usr = LSH(6, m.shape[1], num_hashtables=2,  storage_config={"dict":None})
for i in range(len(df_usr)):
    lsh_usr.index(m[i], extra_data=[df_usr.iloc[i]["movieId"], df_usr.iloc[i]["rating"]]) # repeat for all users. Insert movieid and rating as extra data for future retrieval

lsh_usr

<sparselsh.lsh.LSH at 0x7fc0e65a9ed0>

In [398]:
# compute recommendation
# (point, label), dist = df_recommendation[0]
predicted_ratings = []
movie_id_dict = {}

def getMean(x):
    (point, label), dist = x
    return label[1]
def getMovId(x):
    (point, label), dist = x
    return label[0]
for i in range(len(df_recommendation)):
    (point, label), dist = df_recommendation[i]
    query_res = lsh_usr.query(point)
    if len(query_res) != 0 and type(query_res[0][0][1]) == list:
        ratings = list(map(getMean, query_res))
        movieIds = list(map(getMovId, query_res))
        predicted_ratings.append(np.mean(ratings))
        for id in movieIds:
            movie_id_dict[id] = id

# df_recommendation
print(f"Mean for predicted rating for predicted movies is {np.mean(predicted_ratings)}")
print(f"Recommended movies are {list(movie_id_dict.values())}")

Mean for predicted rating for predicted movies is 4.524965986394557
Recommended movies are [1, 110, 288, 29, 150, 161, 587, 5, 317, 596, 595, 588, 594, 616, 329, 300, 377, 364, 589, 648, 457, 1356, 3, 480, 351, 318, 261, 454, 350, 7, 107, 539, 413, 1084, 356, 277, 590, 185, 410, 316, 349, 500, 592, 153, 380, 34, 47, 339, 597, 11]


## Collaborative filtering

In [399]:
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.knns import KNNBasic

In [400]:
directory = 'data/movielens/ml-latest-small'
#directory = 'data/movielens/ml-latest' #change into this one for the full dataset (slow)
df_ratings = pd.read_csv(f'{directory}/ratings.csv')
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [401]:
# initialize a data reader
reader = Reader(rating_scale=(1, 5))
# provide a dataset with userid, itemtid, and rating in order
data = Dataset.load_from_df(df_ratings[['userId','movieId','rating']], reader)

# surprise has also some built-in datasets that can be imported directly
#data = Dataset.load_builtin('ml-100k')

In [402]:
# initialize a user-based K nearest neighbors implementation
anti_set = data.build_full_trainset().build_anti_testset()
algo = KNNBasic()

# execute 5-fold cross-validation and measure RMSE and MAE
_dict = cross_validate(algo, data, measures=["rmse", "mae"], cv=5)
rmse = _dict["test_rmse"]
mae = _dict["test_mae"]
print(f'rmse {rmse}')
print(f'mae {mae}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
