In [1]:
import numpy as np
import pandas as pd
import random
from surprise import Dataset, Reader, AlgoBase
from surprise.model_selection import train_test_split
import surprise as sp
from collections import defaultdict

In [2]:
# Define the popularity-based algorithm
class PopularityAlgorithm(AlgoBase):
    def __init__(self):
        # Call the base class constructor
        AlgoBase.__init__(self)

    def fit(self, trainset,k=10):
        '''
        This method computes the popularity based on the trainset, and stores the popularity as a member variable.
        Then it returns the list of top k popular games to recommend.
        '''
        # Store the trainset as a member variable
        self.trainset = trainset

        # Compute item popularity based on training set
        item_popularity = np.zeros(trainset.n_items)
        for _, item_id, _ in trainset.all_ratings():
            item_popularity[item_id] += 1

        # Store the item popularity as a member variable
        self.item_popularity = item_popularity
        top_items = np.argsort(self.item_popularity)[::-1]

        # Get the top 10 popular games
        top_popular_games = top_items

        liste = []
        # Print the top 10 popular games
        for iid in top_popular_games:
            item_id = trainset.to_raw_iid(iid)
            liste.append(item_id)
        self.liste = liste

        return self

    def estimate(self, k=False):
        '''Recommends the top k popular games right away regardless of 
        customer's unique preferences.'''
        if k:
            # Return the item popularity as the estimated rating
            return self.liste[:k]
        else:
            return self.liste
            
    def mapk(self, testset, k):
        '''Returns the mean average precision at k based on the testset
        and the given k value. It is important to use it in a range of k values'''
        top_k = k  # Number of top recommendations to consider
        num_users = 0
        precision_sum = 0
        self.fit(self.trainset, k=top_k)
        # Create a dictionary to store ground truth items for each user
        ground_truth_items_dict = {}
        for uid, item_id, rating in testset:
            if rating == 1.0:
                ground_truth_items_dict.setdefault(uid, []).append(item_id)

        #as far as this is a popularity based algorithm, the recommendations are the same for all users
        recommended_items = self.estimate(top_k)
        # Iterate over each user-item pair in the test set
        for user_id, item_id, rating in testset:
            ground_truth_items = ground_truth_items_dict.get(user_id, [])
            
            # Compute precision at k
            num_correct = len(set(recommended_items).intersection(set(ground_truth_items)))
            precision = num_correct / top_k
            precision_sum += precision
            num_users += 1

        # Compute the average precision at k
        precision_at_k = precision_sum / num_users
        # Print the precision at k
        print("Precision at", top_k, ":", precision_at_k)
        
        return precision_at_k
    
    def recall_at_k(self, testset,k=10):
        '''Returns the recall based on the testset. If a k value is given, it returns the average recall at k. Else, it returns the average recall
        based on each user's test recommendations list length'''
        liste = self.estimate(k)
        # Create a dictionary to store ground truth items for each user
        ground_truth_items_dict = {}
        
        for uid, item_id, rating in testset:
            if rating == 1.0:
                ground_truth_items_dict.setdefault(uid, []).append(item_id)
        
        recall_sum = 0
        num_users = 0
        for user_id, item_id, rating in testset:
            ground_truth_items = ground_truth_items_dict.get(user_id, [])
            num_correct = len(set(liste).intersection(set(ground_truth_items)))
            num_users += 1
            if len(ground_truth_items) == 0:
                recall = 0
            else:
                recall = num_correct / len(ground_truth_items)
            recall_sum += recall
        average_recall = recall_sum / num_users
        print("Average recall at {}:".format(k), average_recall)
        return average_recall
    
    def f1_score(self, testset, k=10):
        '''Returns the f1 score based on the precision and recall'''
        precision = self.mapk(testset,k=k)
        recall = self.recall_at_k(testset,k=k)
        f1_score = 2 * ((precision * recall) / (precision + recall))
        print("F1 score at k:", f1_score)
        return f1_score


In [3]:
my_seed = 1
random.seed(my_seed)
np.random.seed(my_seed)

In [4]:
def generate_int_id(df, id_col_name):
    new_df=df.assign(
        int_id_col_name=np.arange(len(df))
        ).reset_index(drop=True)
    return new_df.rename(columns={'int_id_col_name': id_col_name})

In [5]:
recommendations = pd.read_csv("recommendations.csv")
games = pd.read_csv("games.csv")
users = pd.read_csv("users.csv")
games_metadata = pd.read_json("games_metadata.json", lines=True)
user_app_ratings = pd.merge(users.loc[users['reviews'] >= 20], recommendations, how="inner", on=["user_id"])
user_app_ratings['is_recommended'] = user_app_ratings['is_recommended'].map({False: 0, True: 1})
df_games = generate_int_id(games, 'app_id_num')
df_games = pd.merge(df_games, games_metadata, how='left', on='app_id')

In [6]:
# Load the dataset
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(user_app_ratings[["user_id", "app_id", "is_recommended"]], reader)

# Split the dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)

In [7]:
# Create an instance of the popularity-based algorithm
algo = PopularityAlgorithm()

# Fit the algorithm on the training set
algo.fit(trainset)

<__main__.PopularityAlgorithm at 0x7f99a719ee90>

In [8]:
# K validates the number of recommendations to be made based on popularity. Recommends k most popular games to all users.
for i in range(1, 11):
    algo.mapk(testset, i)


Precision at 1 : 0.06036886191690939
Precision at 2 : 0.06009754819716711
Precision at 3 : 0.0557534101327795
Precision at 4 : 0.05253662735216521
Precision at 5 : 0.05327853004095471
Precision at 6 : 0.053349009237158
Precision at 7 : 0.052904838828943056
Precision at 8 : 0.0521460291521914
Precision at 9 : 0.05084758683256865
Precision at 10 : 0.04949385957803083


In [9]:
algo.f1_score(testset, k=10)

Precision at 10 : 0.04949385957803083
Average recall at 10: 0.0687880156383848
F1 score at k: 0.05756730488806292


0.05756730488806292

In [10]:
algo.recall_at_k(testset, k=10)

Average recall at 10: 0.0687880156383848


0.0687880156383848