# Content-based recommendation

# Exercise 1
Based on the TF-IDF vectors obtained in Exercise 2 of Session 4, represent each user in the same vector space. Among other feasible solutions, you can represent a user (the user profile) by computing the weighted mean of the item vectors. Compute the cosine similarity between user 'A39WWMBA0299ZF' and all products in the training set not rated by the user. What are the top-5 recommended items for user 'A39WWMBA0299ZF'? Print out the top-5 items and their similarity scores.

In [10]:
import import_ipynb
from collections import defaultdict
import pandas as pd
import numpy as np
    
from Session_4 import tf_idf_array, df
from Session_1 import training_data, test_data

# Users x items rating matrix: one row per reviewerID, one column per asin,
# cell = 'overall' rating (NaN where the user has not rated the item).
# NOTE: DataFrame.pivot only accepts keyword arguments from pandas 2.0 on, so
# the arguments are named explicitly.
user_item = training_data.pivot(index='reviewerID', columns='asin', values='overall')

# Item vectors (tf_idf_array) labelled by asin so an item can be looked up by id.
combine_matrix = pd.DataFrame(index=df.asin.values, data=tf_idf_array)

# Gather every user's rated items in a single pass instead of re-scanning
# training_data once per user. Row order inside each group is preserved.
items_by_user = training_data.groupby('reviewerID', sort=False)['asin'].agg(list)

# For each user: the vectors of the items they rated plus an 'overall' column
# holding the rating the user gave to each of those items.
same_vector_space = defaultdict(pd.DataFrame)
for user in user_item.index.values:
    items = items_by_user[user]
    ratings = user_item.loc[user, items].values
    same_vector_space[user] = combine_matrix.loc[items].assign(overall=ratings)

# build user profile
# Each item vector is scaled by the user's rating and the scaled vectors are
# averaged. This divides by the number of rated items rather than by the sum
# of the ratings; the two differ only by a scalar factor, to which the cosine
# similarity computed from these profiles is insensitive.
user_profile = defaultdict()
for user, vector in same_vector_space.items():
    item_feature = vector.drop(columns=['overall'])
    rating = vector['overall'].values
    user_profile[user] = item_feature.multiply(rating, axis=0).values.mean(axis=0)

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# For every user, score all items the user has NOT rated in the training set
# by the cosine similarity between the user's profile and the item's vector.
# Result: {user: DataFrame indexed by asin with one 'cosine_similarity' column}.
similarity = defaultdict()
for user, vector in same_vector_space.items():
    # same_vector_space[user] is indexed by the items the user rated, so its
    # index can be dropped directly; no need to re-scan training_data per user.
    not_rated_by = combine_matrix.drop(vector.index)
    # cosine_similarity expects 2-D inputs: wrap the single profile in a list
    # and read the scores back from row 0 of the (1, n_items) result.
    scores = cosine_similarity([user_profile[user]], not_rated_by)
    similarity[user] = pd.DataFrame(
        {'cosine_similarity': scores[0]},
        index=not_rated_by.index,
    )


In [4]:
# Rank the target user's unseen items by similarity, best first, and show the
# five highest-scoring ones (DataFrame.head() defaults to 5 rows).
print("Top-5 recommended items for user 'A39WWMBA0299ZF':")
ranked_candidates = similarity['A39WWMBA0299ZF'].sort_values(
    by=['cosine_similarity'],
    ascending=False,
)
ranked_candidates.head()

Top-5 recommended items for user 'A39WWMBA0299ZF':


Unnamed: 0,cosine_similarity
B019FWRG3C,0.396049
B00W259T7G,0.176053
B0006O10P4,0.145249
B00IJHY54S,0.106674
B00006L9LC,0.088709


# Exercise 2



Compute the system's hit rate based on the top-5, top-10 and top-20 recommendations, averaged over the total number of users. Remember that, as we are evaluating the system, you should compute the hit rate over the test set. How well or badly does this content-based approach perform compared to collaborative filtering?

In [5]:
# Users x items matrix of test-set ratings; pairs that do not occur in the
# test set are filled with 0 so they never pass a relevance threshold.
# NOTE: DataFrame.pivot only accepts keyword arguments from pandas 2.0 on.
user_item_test = test_data.pivot(
    index='reviewerID', columns='asin', values='overall'
).fillna(0)

# # user_item_test.loc['A18HENNBJ25817']
# user_item.loc['A18HENNBJ25817']
def transfer_to_user_item_rating_relevant(similarity):
    """Label every scored candidate item with its relevance in the test set.

    Reads the module-level ``user_item_test`` matrix (users x items test
    ratings).

    Args:
        similarity: Mapping ``user id -> DataFrame`` indexed by item id; the
            row belonging to an item holds its estimated score.

    Returns:
        ``defaultdict(list)`` of the form ``{uid: [(iid, est, relevant)]}``
        where ``est`` is ``sim.loc[iid].values`` (the row's values as an
        array) and ``relevant`` is 1 when the user's test rating for the item
        is >= 4.0, else 0. Items that are not a column of ``user_item_test``
        are treated as rating 0. Users that are not in the index of
        ``user_item_test`` are skipped entirely.
    """
    # Build the membership sets once. Rebuilding list(index) per user and
    # list(columns) per (user, item) pair made every lookup a linear scan.
    test_users = set(user_item_test.index)
    test_items = set(user_item_test.columns)

    # map the predictions to each user.
    user_item_rating = defaultdict(list)
    for user, sim in similarity.items():
        if user not in test_users:  # very important part: only users that can be evaluated
            continue
        for iid in sim.index.values:
            true_rating = user_item_test.loc[user, iid] if iid in test_items else 0
            relevant = 1 if true_rating >= 4.0 else 0
            user_item_rating[user].append((iid, sim.loc[iid].values, relevant))
    # {uid: [(iid, est, relevant)]}
    return user_item_rating

# compute HR@k for one user
def HR_at_k(k, user_ratings):
    """Return 1.0 if any of the first ``k`` entries is relevant, else 0.0.

    Args:
        k: Number of leading entries of ``user_ratings`` to inspect.
        user_ratings: Sequence of ``(item_id, estimate, relevant)`` tuples.
            The function looks at them in the given order, so the caller is
            expected to pass them ranked best-first.

    Returns:
        1.0 when at least one inspected entry has ``relevant == 1``, otherwise
        0.0. Also 0.0 when ``k <= 0`` or the sequence is empty.
    """
    # Slice instead of indexing 0..k-1: a user with fewer than k candidates
    # must count as a miss rather than raise IndexError. max() keeps a negative
    # k from turning into an "all but the last" slice.
    top_k = user_ratings[:max(k, 0)]
    if any(relevant == 1 for _, _, relevant in top_k):
        return 1.0
    return 0.0

def compute_hit_rate(k, user_item_rating):
    """Mean hit rate at cut-off ``k`` over all users in ``user_item_rating``.

    Args:
        k: Length of the recommendation list to evaluate.
        user_item_rating: Mapping ``uid -> [(iid, est, relevant)]``. Each
            user's list is sorted IN PLACE by ``est``, highest first.

    Returns:
        The average of the per-user ``HR_at_k`` values, or 0.0 when there are
        no users to evaluate.
    """
    # No evaluable users: report 0.0 instead of dividing by zero below.
    if not user_item_rating:
        return 0.0

    hr = dict()  # hit rate per user
    for uid, user_ratings in user_item_rating.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        hr[uid] = HR_at_k(k, user_ratings)

    return sum(hr.values()) / len(hr)

# Attach test-set relevance labels to every candidate item of every user that
# can be evaluated.
user_item_rating_cb = transfer_to_user_item_rating_relevant(similarity)

# Report the mean hit rate for each length of the recommendation list.
k_set = [5, 10, 20]
mean_hit_rates = {k: compute_hit_rate(k, user_item_rating_cb) for k in k_set}
for k, mhr_at_k_nb in mean_hit_rates.items():
    print(f"Hit Rate (top-{k}):", round(mhr_at_k_nb, 3))

Hit Rate (top-5): 0.426
Hit Rate (top-10): 0.448
Hit Rate (top-20): 0.514
