In [1]:
import pandas as pd

## The Data Class

This class is specific for downloading different collaborative-filtering datasets, and building new ways of simulating user ratings for the data used in this research.

In [2]:
class Data:
    
    '''
    The datasets for collaborative filtering must be:
        - The dataframe containing the ratings. 
        - It must have three columns, corresponding to the user (raw) ids, 
          the item (raw) ids, and the ratings, in this order.   
    '''
    
    def __init__(self):
        
        '''
        The databases (ml_100k, ml_1m and jester) are built-in the surprise package for
        collaborative-filtering
        '''
        
        self.available_databases=['ml_100k', 'ml_1m','jester', 'lda_topics', 'lda_rankings', 'uniform']

    def show_available_databases(self):
        print('The avaliable database are:')
        for i,database in enumerate(self.available_databases):
            print(str(i)+': '+database)            
        
    def read_data(self,database_name):
        self.database_name=database_name
        self.the_data_reader= getattr(self, 'read_'+database_name.lower())
        self.the_data_reader()   

    def read_ml_100k(self):
        
        '''
        Please search the surprise package for the documentation of this dataset.
        https://grouplens.org/datasets/movielens/
        '''
        
        from surprise import Dataset
        data = Dataset.load_builtin('ml-100k')
        self.df = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id','item_id','rating','timestamp'])
        self.df.drop(columns=['timestamp'],inplace=True)
        self.df.rename({'user_id':'userID','item_id':'itemID'},axis=1,inplace=True)

    def read_ml_1m(self):
        
        '''
        Please search the surprise package for the documentation of this dataset.
        https://grouplens.org/datasets/movielens/
        '''
        
        from surprise import Dataset
        data = Dataset.load_builtin('ml-1m')
        self.df = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id','item_id','rating','timestamp'])
        self.df.drop(columns=['timestamp'],inplace=True)
        self.df.rename({'user_id':'userID','item_id':'itemID'},axis=1,inplace=True)

    def read_jester(self):
        
        '''
        Please search the surprise package for the documentation of this dataset.
        https://eigentaste.berkeley.edu/dataset/
        '''
        
        from surprise import Dataset
        data = Dataset.load_builtin('jester')
        self.df = pd.DataFrame(data.__dict__['raw_ratings'], columns=['user_id','item_id','rating','timestamp'])
        self.df.drop(columns=['timestamp'],inplace=True)
        self.df.rename({'user_id':'userID','item_id':'itemID'},axis=1,inplace=True)
        
    def read_uniform(self):
        
        '''
        Hyperparameters -
        `n_users` : number of simulated users in the database;
        `n_ratings` : number of simulated rating events in the database.
        
        This is a fictional dataset based in the choice of an uniformly distributed random rating 
        (from 1 to 5) for one of the simulated users of the recommender-system that is being designed in
        this research project.
        '''
        n_users = 20
        n_ratings = 10000
        
        import random
        
        opo = pd.read_csv('../oportunidades.csv')
        df = [(random.randrange(n_users), random.randrange(len(opo)), random.randrange(1,5)) for i in range(n_ratings)]
        self.df = pd.DataFrame(df, columns = ['userID', 'itemID', 'rating'])
        
    def read_lda_topics(self):
        
        '''
        Hyperparameters -
        `n_users` : number of simulated users in the database;
        `n_ratings` : number of simulated rating events in the database.
        
        This first LDA based dataset builds a model with K = `n_users` topics. LDA topics
        are used as proxies for simulated users with different clusters of interest. At first
        a random opportunity is chosen, than the amount of a randomly chosen topic inside the description
        is multiplied by five. The ceiling operation of this result is the rating that the fictional user
        will give to that opportunity.
        Because the amount of each topic predicted by the model is disollved among various topics,
        it is very rare to find an opportunity that has a higher LDA value. The consequence is that this dataset
        has really low volatility and the major part of ratings are equal to 1.
        '''
        
        n_users = 20
        n_ratings = 10000
        
        import gensim
        import random
        import math
        
        opo = pd.read_csv('../oportunidades.csv')
        try:
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')
        except:
            import generate_users
            generate_users.gen_model(n_users)
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')

        df = []
        for i in range(n_ratings):
            opo_n = random.randrange(len(opo))
            txt = opo.loc[opo_n,'opo_texto']
            opo_bow = lda_model.id2word.doc2bow(txt.split())
            topics = lda_model.get_document_topics(opo_bow)
            topics = {topic[0]:topic[1] for topic in topics}
            user = random.sample(topics.keys(), 1)[0]
            rating = math.ceil(topics[user]*5)
            df.append((user, opo_n, rating))

        self.df = pd.DataFrame(df, columns = ['userID', 'itemID', 'rating'])
        
    def read_lda_rankings(self):
        
        '''
        Hyperparameters -
        `n_users` : number of simulated users in the database;
        `n_ratings` : number of simulated rating events in the database.
        
        This second LDA based dataset builds a model with K = `n_users` topics. LDA topics
        are used as proxies for simulated users with different clusters of interest. At first
        a random opportunity is chosen, than the amounts of the topics are sorted, and the rating 
        value is equal to the quintile in which the topic is located.
        Because the ratings will be equal to the relative significance of that topic with respect
        to the overall opportunity description, this dataset has a considerably larger volatility
        and ratings are relatively equally occurrent.
        '''
        
        n_users = 20
        n_ratings = 10000
        
        import gensim
        import random
        import math
        
        opo = pd.read_csv('../oportunidades.csv')
        try:
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')
        except:
            import generate_users
            generate_users.gen_model(n_users)
            lda_model = gensim.models.ldamodel.LdaModel.load(f'models/lda_model{n_users}.model')
        
        df = []
        for i in range(n_ratings):
            opo_n = random.randrange(len(opo))
            txt = opo.loc[opo_n,'opo_texto']
            opo_bow = lda_model.id2word.doc2bow(txt.split())
            topics = lda_model.get_document_topics(opo_bow)
            topics = {topic[0]:topic[1] for topic in topics}

            prop = pd.DataFrame([topics], index=['prop']).T.sort_values('prop', ascending=True)
            prop['rating'] = range(1, len(prop)+1)
            prop['rating'] = prop['rating']/len(prop)
            prop['rating'] = prop['rating'].apply(lambda x: math.ceil(x*5))
            prop.reset_index(inplace=True)

            prop = prop.sample(1)

            df.append((prop['index'].values[0], opo_n, prop['rating'].values[0]))

        self.df = pd.DataFrame(df, columns = ['userID', 'itemID', 'rating'])
        

### Example:

In [13]:
data=Data()
data.read_data('lda_rankings')
data.df

Unnamed: 0,userID,itemID,rating
0,13,736,2
1,17,254,3
2,11,429,3
3,8,82,1
4,19,501,1
...,...,...,...
9995,5,520,4
9996,5,92,4
9997,9,87,4
9998,10,612,5


## The Method class

The *surprise* library provides 11 classifier models that try to predict the classification of training data based on several different *collaborative-filtering* techniques. The models provided with a brief explanation in English are mentioned below, for more information please refer to the [package documentation.](https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html)

*random_pred.NormalPredictor*:
Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.

*baseline_only.BaselineOnly*:
Algorithm predicting the baseline estimate for given user and item.

*knns.KNNBasic*:
A basic collaborative filtering algorithm.

*knns.KNNWithMeans*:
A basic collaborative filtering algorithm, taking into account the mean ratings of each user.

*knns.KNNWithZScore*:
A basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

*knns.KNNBaseline*:
A basic collaborative filtering algorithm taking into account a baseline rating.

*matrix_factorization.SVD*:
The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize.

*matrix_factorization.SVDpp*:
The SVD++ algorithm, an extension of SVD taking into account implicit ratings.

*matrix_factorization.NMF*:
A collaborative filtering algorithm based on Non-negative Matrix Factorization.

*slope_one.SlopeOne*:
A simple yet accurate collaborative filtering algorithm.

*co_clustering.CoClustering*:
A collaborative filtering algorithm based on co-clustering.

It is possible to pass a custom dataframe as an argument to this class. The dataframe in question needs to have 3 columns with the following name: ['userID', 'itemID', 'rating'].

In [4]:
class Method:
    def __init__(self,df):
        
        self.df=df
        self.available_methods=[
            'surprise.NormalPredictor',
            'surprise.BaselineOnly',
            'surprise.KNNBasic',
            'surprise.KNNWithMeans',
            'surprise.KNNWithZScore',
            'surprise.KNNBaseline',
            'surprise.SVD',
            'surprise.SVDpp',
            'surprise.NMF',
            'surprise.SlopeOne',
            'surprise.CoClustering',
        ]        
        
    def show_methods(self):
        print('The avaliable methods are:')
        for i,method in enumerate(self.available_methods):
            print(str(i)+': '+method)



    def run(self,the_method):
        self.the_method=the_method
        if(self.the_method[0:8]=='surprise'):
            self.run_surprise()
        elif(self.the_method[0:6]=='Gensim'):
            self.run_gensim()
        elif(self.the_method[0:13]=='Transformers-'):
            self.run_transformers()
        else:
            print('This method is not defined! Try another one.')

    def run_surprise(self):
        from surprise import Reader
        from surprise import Dataset
        from surprise.model_selection import train_test_split
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(self.df[['userID', 'itemID', 'rating']], reader)        
        trainset, testset = train_test_split(data, test_size=.30)
        the_method=self.the_method.replace("surprise.", "")
        eval(f"exec('from surprise import {the_method}')")
        the_algorithm=locals()[the_method]()
        the_algorithm.fit(trainset)
        self.predictions=the_algorithm.test(testset)
        list_predictions=[(uid,iid,r_ui,est) for uid,iid,r_ui,est,_ in self.predictions]        
        self.predictions_df = pd.DataFrame(list_predictions, columns =['user_id', 'item_id', 'rating','predicted_rating'])

## The Evaluator class

The *surprise* library provides 4 different methods to assess the accuracy of the ratings prediction. For further discussion on each metric please visit the [package documentation](https://surprise.readthedocs.io/en/stable/accuracy.html).

In [5]:
class Evaluator:

    def __init__(self,predictions_df):

        self.available_evaluators=['surprise.rmse','surprise.mse',
                                   'surprise.mae','surprise.fcp']
        self.predictions_df=predictions_df
        
    def show_evaluators(self):
        print('The avaliable evaluators are:')
        for i,evaluator in enumerate(self.available_evaluators):
            print(str(i)+': '+evaluator)
        


    def run(self,the_evaluator):        
        self.the_evaluator=the_evaluator
        if(self.the_evaluator[0:8]=='surprise'):
            self.run_surprise()
        else:
            print('This evaluator is not available!')

    def run_surprise(self):
        import surprise
        from surprise import accuracy
        predictions=[surprise.prediction_algorithms.predictions.Prediction(row['user_id'],row['item_id'],row['rating'],row['predicted_rating'],{}) for index,row in self.predictions_df.iterrows()]
        self.predictions=predictions
        self.the_evaluator= 'accuracy.' + self.the_evaluator.replace("surprise.", "")
        self.acc = eval(f'{self.the_evaluator}(predictions,verbose=True)')

## Experiment:

### Example

In [6]:
data=Data()
data.show_available_databases()
data.read_data('ml_100k')
method=Method(data.df)  
method.show_methods()
method.run('surprise.KNNWithMeans')
predictions_df=method.predictions_df
evaluator=Evaluator(predictions_df)
evaluator.show_evaluators()
evaluator.run('surprise.mse')

The avaliable database are:
0: ml_100k
1: ml_1m
2: jester
3: lda_topics
4: lda_rankings
5: uniform
The avaliable methods are:
0: surprise.NormalPredictor
1: surprise.BaselineOnly
2: surprise.KNNBasic
3: surprise.KNNWithMeans
4: surprise.KNNWithZScore
5: surprise.KNNBaseline
6: surprise.SVD
7: surprise.SVDpp
8: surprise.NMF
9: surprise.SlopeOne
10: surprise.CoClustering
Computing the msd similarity matrix...
Done computing similarity matrix.
The avaliable evaluators are:
0: surprise.rmse
1: surprise.mse
2: surprise.mae
3: surprise.fcp
MSE: 0.1512


In [7]:
def model_table(label):
    
    '''
    Code that builds the table with the accuracy metrics for all rating prediction
    models built-in the surprise package. The expected return of this function
    is a pandas dataframe (11x4) corresponding to the 11 classifier models and
    4 different accuracy metrics.
    '''
    
    table = pd.DataFrame()
    
    data=Data()
    data.read_data(label)
    
    method=Method(data.df)
    
    for m in method.available_methods:
        print(m)
        method.run(m)
        predictions_df=method.predictions_df
        evaluator=Evaluator(predictions_df)
        
        metrics = []
        
        for e in evaluator.available_evaluators:
            evaluator.run(e)
            metrics.append(evaluator.acc)
            
        table = table.append(dict(zip(evaluator.available_evaluators,metrics)),ignore_index=True)
        
    table.index = [x[9:] for x in method.available_methods]
            
    return table


import sys, os

sys.stdout = open(os.devnull, 'w') # Codigo para desativar os prints

uniform = model_table('uniform')  
topics = model_table('lda_topics')
ranking = model_table('lda_rankings')

sys.stdout = sys.__stdout__ # Codigo para reativar os prints

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [8]:
uniform

Unnamed: 0,surprise.rmse,surprise.mse,surprise.mae,surprise.fcp
NormalPredictor,1.534804,2.355623,1.250746,0.491641
BaselineOnly,1.134006,1.285971,1.001062,0.491694
KNNBasic,1.202139,1.445138,1.031629,0.493996
KNNWithMeans,1.215825,1.478229,1.038545,0.492914
KNNWithZScore,1.207519,1.458102,1.037799,0.503013
KNNBaseline,1.202906,1.446983,1.028633,0.493364
SVD,1.19328,1.423917,1.028005,0.489487
SVDpp,1.200286,1.440688,1.030948,0.497485
NMF,1.341951,1.800833,1.125911,0.491918
SlopeOne,1.197672,1.434417,1.017256,0.50068


In [9]:
topics

Unnamed: 0,surprise.rmse,surprise.mse,surprise.mae,surprise.fcp
NormalPredictor,0.649374,0.421686,0.427919,0.435448
BaselineOnly,0.421905,0.178004,0.248629,0.734866
KNNBasic,0.444912,0.197947,0.245562,0.745807
KNNWithMeans,0.398414,0.158734,0.227852,0.76761
KNNWithZScore,0.38646,0.149351,0.20717,0.76154
KNNBaseline,0.385385,0.148521,0.221078,0.819626
SVD,0.375885,0.14129,0.211979,0.804881
SVDpp,0.359489,0.129233,0.206338,0.783897
NMF,0.29813,0.088882,0.117041,0.835527
SlopeOne,0.338028,0.114263,0.194162,0.832792


In [10]:
ranking

Unnamed: 0,surprise.rmse,surprise.mse,surprise.mae,surprise.fcp
NormalPredictor,1.841699,3.391856,1.509347,0.505111
BaselineOnly,1.067423,1.139391,0.858431,0.57234
KNNBasic,1.123141,1.261446,0.891902,0.694179
KNNWithMeans,0.934017,0.872388,0.703772,0.731334
KNNWithZScore,0.947603,0.897952,0.71569,0.718407
KNNBaseline,0.912495,0.832646,0.69032,0.722609
SVD,0.829436,0.687964,0.605397,0.808765
SVDpp,0.836658,0.699996,0.602738,0.779652
NMF,0.859389,0.73855,0.554761,0.781849
SlopeOne,0.892899,0.797268,0.671398,0.735438
