In [1]:
#Making use of the imports
import numpy as np
import scipy
import pandas as pd
from pandas import read_csv
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
Ratings=pd.read_csv('C:\\Users\\Romell\\Desktop\\MovieLensTest\\ratings.csv', delimiter=',', encoding="latin-1")
Movies=pd.read_csv('C:\\Users\\Romell\\Desktop\\MovieLensTest\\movies.csv', delimiter=',', encoding="latin-1")
Tags=pd.read_csv('C:\\Users\\Romell\\Desktop\\MovieLensTest\\tags.csv', delimiter=',', encoding="latin-1")

In [2]:
#Displaying the movies Dataframe
Movies.head

<bound method NDFrame.head of       movieId                                              title  \
0           1                                   Toy Story (1995)   
1           2                                     Jumanji (1995)   
2           3                            Grumpier Old Men (1995)   
3           4                           Waiting to Exhale (1995)   
4           5                 Father of the Bride Part II (1995)   
5           6                                        Heat (1995)   
6           7                                     Sabrina (1995)   
7           9                                Sudden Death (1995)   
8          10                                   GoldenEye (1995)   
9          11                     American President, The (1995)   
10         12                 Dracula: Dead and Loving It (1995)   
11         13                                       Balto (1995)   
12         14                                       Nixon (1995)   
13         15     

In [3]:
#Creating training and test data for evaluation(80/20 split)
Ratings_train_df, Ratings_test_df = train_test_split(Ratings,
                                   stratify=Ratings['userId'], 
                                   test_size=0.20,
                                   random_state=42)

print('# Ratings on Train set: %d' % len(Ratings_train_df))
print('# Ratings on Test set: %d' % len(Ratings_test_df))

# Ratings on Train set: 211604
# Ratings on Test set: 52901


In [4]:
#Indexing by personId to speed up the searches during evaluation
Ratings_full_indexed_df = Ratings.set_index('userId')
Ratings_train_indexed_df = Ratings_train_df.set_index('userId')
Ratings_test_indexed_df = Ratings_test_df.set_index('userId')

In [5]:
#Making use of the corpus for making use of stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Romell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
#Content based filtering
#Ignoring stopwords (words with no semantics) from English and Portuguese (as we have a corpus with mixed languages)
stopwords_list = stopwords.words('english') + stopwords.words('portuguese')

#Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000)
item_ids = Movies['movieId'].tolist()
tfidf_matrix = vectorizer.fit_transform( Movies['genres'])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<2500x92 sparse matrix of type '<class 'numpy.float64'>'
	with 10863 stored elements in Compressed Sparse Row format>

In [7]:
# Creating the item profiles 
def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles


In [8]:
#Creating user profiles
def build_users_profile(userId, Ratings_indexed):
    Ratings_df = Ratings_indexed.loc[userId]
    user_item_profiles = get_item_profiles(Ratings_df['movieId'])
    
    user_item_strengths = np.array(Ratings_df['rating']).reshape(-1,1)
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_users_profiles(): 
    Ratings_indexed = Ratings[Ratings['movieId'] \
                              .isin(Movies['movieId'])].set_index('userId')
    user_profiles = {}
    for userId in Ratings_indexed.index.unique():
        user_profiles[userId] = build_users_profile(userId, Ratings_indexed)
    return user_profiles

In [9]:
#Calculating the user profiles adn displaying the length of the user profiles
user_profiles = build_users_profiles()
len(user_profiles)

862

In [10]:
#Taking as a test user whose id is 121987 and thus calcualting the relevance of each genre for the test user
myprofile = user_profiles[121987][3]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles[121987].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 92)


Unnamed: 0,token,relevance
0,action,0.376249
1,adventure,0.362931
2,action adventure,0.301529
3,comedy,0.268494
4,fantasy,0.221679
5,fi,0.217771
6,sci,0.217771
7,sci fi,0.217771
8,animation,0.213784
9,children,0.207788


In [11]:
#This part is the core of the content based model and contains the definition of the model
class ContentBasedRecommender:[3]
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, userId, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[userId], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['movieId', 'rating']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'movieId', 
                                                          right_on = 'movieId')[['rating', 'movieId', 'title']]


        return recommendations_df
    
    
    
content_based_recommender_model = ContentBasedRecommender(Movies)

In [14]:
# for evaluation of model we use top-N accuracy
def get_items_interacted(userId, Ratings):
    # Get the user's data and merge in the movie information.
    interacted_items = Ratings.loc[userId]['movieId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [15]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, userId, sample_size, seed=42):
        interacted_items = get_items_interacted(userId, Ratings_full_indexed_df)
        all_items = set(Movies['movieId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, movieId, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == movieId)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, userId):
        #Getting the items in test set
        Ratings_values_testset = Ratings_test_indexed_df.loc[userId]
        if type(Ratings_values_testset['movieId']) == pd.Series:
            user_interacted_movies_testset = set(Ratings_values_testset['movieId'])
        else:
            user_interacted_movies_testset = set([int(Ratings_values_testset['movieId'])])  
        movies_items_count_testset = len(user_interacted_movies_testset) 

        #Getting a ranked recommendation list from a model for a given user
        user_recs_df = model.recommend_items(userId, 
                                               items_to_ignore=get_items_interacted(userId, 
                                                                                    Ratings_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for movieId in user_interacted_movies_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(userId, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=movieId%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([movieId]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = user_recs_df[user_recs_df['movieId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['movieId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(movieId, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(movieId, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(movies_items_count_testset)
        recall_at_10 = hits_at_10_count / float(movies_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': movies_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}  [1] [2]
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, userId in enumerate(list(Ratings_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, userId)  
            person_metrics['_person_id'] = userId
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(), [1]
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()   

In [16]:
#Thus making use of the previous function printing the various global metrics that decide the performance of the model
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
861 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.10115120697151282, 'recall@10': 0.17621595054913897}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
110,76630,32,14,380,0.084211,0.036842
65,133811,25,10,311,0.080386,0.032154
263,70201,31,15,300,0.103333,0.05
104,79531,37,26,299,0.123746,0.086957
291,42096,33,14,282,0.117021,0.049645
29,21391,46,25,276,0.166667,0.09058
69,117144,23,10,267,0.086142,0.037453
114,34587,33,26,261,0.126437,0.099617
93,113806,39,13,251,0.155378,0.051793
180,35227,23,13,251,0.091633,0.051793


In [None]:
Refernces 
1)(https://medium.com/@m_n_malaeb/recall-and-precision-at-k-for-recommender-systems-618483226c54)
2)(https://www.quora.com/What-is-Hit-ratio-HR-n-in-evaluations-recommendation-system-is-it-same-with-A-B-testing)
3)(https://medium.com/@tomar.ankur287/content-based-recommender-system-in-python-2e8e94b16b9e)


Metrics Description
1)Recall@k(Recall@5 and Recall @10)
We can interpret that the maximum is 15% (That is the maximum)percent of the relevant items were recommended in the top-10 items
We can interpret that the maximum is 9% (That is the maximum)percent of the relevant items were recommended in the top-5items
2)Hits@n(Hits@5 and Hits @10)  
Is  a way of calculating how many "hits" you have in an n-sized list of ranked items
3)Interacted_count
Number of interactions of user with the items.

#Drawbacks
1)Does not take into account the relationship between the users.
2)Accuracy provided by the model does not seem to be satisfactory

#Improvements
1)CNan improve the model taking into account other features that describes the movies other than the genre
