In [1]:
import os
import sys

sys.path.insert(0, f'{os.environ.get("HOME")}/workspace/recommendation-study')

In [3]:
from util.data import DataLoader
from util.models import RecommendResult, Dataset
from recommend.base import BaseRecommender
from collections import defaultdict
import numpy as np
import implicit
from scipy.sparse import csr_matrix
np.random.seed(0)

In [4]:
dataset = DataLoader().load()

In [6]:
factors = 10
minimum_num_rating = 0
n_epochs = 50

train_dataset = dataset.train.groupby('movie_id').filter(lambda x: len(x['movie_id']) >= minimum_num_rating)
high_rating_dataset = train_dataset[dataset.train.rating >= 4]
unique_user_ids = list(sorted(high_rating_dataset.user_id.unique()))
unique_movie_ids = list(sorted(high_rating_dataset.movie_id.unique()))
user_id2index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
movie_id2index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

matrix = csr_matrix((len(unique_user_ids), len(unique_movie_ids)))

for i, row in high_rating_dataset.iterrows():
    user_index = user_id2index[row['user_id']]
    movie_index = movie_id2index[row['movie_id']]
    matrix[user_index, movie_index] = 1.0

model = implicit.bpr.BayesianPersonalizedRanking(
    factors=factors, iterations=n_epochs
)

model.fit(matrix)

recommendations = model.recommend_all(matrix)
print(recommendations.shape)
print(recommendations)

  self._set_intXint(row, col, x.flat[0])
100%|██████████| 50/50 [00:00<00:00, 118.51it/s, train_auc=82.88%, skipped=17.25%]


(997, 10)
[[ 566  119  318 ...  517  481  592]
 [ 483  480  296 ...  433  392  119]
 [1235 2045 1984 ...  433  262 1667]
 ...
 [ 262  296  433 ...  487   32  375]
 [ 296  262 1235 ...  188 1942   91]
 [1830 1902  953 ... 2141 1536 1425]]


In [7]:
class BPRRecommender(BaseRecommender):
    def recommend(self, dataset: Dataset, k: int, **kwargs) -> RecommendResult:
        factors = kwargs.get('factors', 10)
        minimum_num_rating = kwargs.get('minimum_num_rating', 0)
        n_epochs = kwargs.get('n_epochs', 50)

        train_dataset = dataset.train.groupby('movie_id').filter(lambda x: len(x['movie_id']) >= minimum_num_rating)
        high_rating_dataset = train_dataset[dataset.train.rating >= 4]
        unique_user_ids = list(sorted(high_rating_dataset.user_id.unique()))
        unique_movie_ids = list(sorted(high_rating_dataset.movie_id.unique()))
        user_id2index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
        movie_id2index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

        matrix = csr_matrix((len(unique_user_ids), len(unique_movie_ids)))

        for i, row in high_rating_dataset.iterrows():
            user_index = user_id2index[row['user_id']]
            movie_index = movie_id2index[row['movie_id']]
            matrix[user_index, movie_index] = 1.0

        model = implicit.bpr.BayesianPersonalizedRanking(
            factors=factors, iterations=n_epochs
        )

        model.fit(matrix)

        recommendations = model.recommend_all(matrix)

        pred_user2items = defaultdict(list)

        for user_id, user_index in user_id2index.items():
            movie_indices = recommendations[user_index, :]
            pred_user2items[user_id] = [unique_movie_ids[movie_index] for movie_index in movie_indices]
        
        return RecommendResult(dataset.test.rating, pred_user2items)

In [9]:
recommender = BPRRecommender()
metrics = recommender.run_sample()
print(metrics)

  self._set_intXint(row, col, x.flat[0])
100%|██████████| 50/50 [00:00<00:00, 121.50it/s, train_auc=82.81%, skipped=17.44%]


rmse: 0.000, precision@K: 0.021, recall@K: 0.067


In [10]:
recommender = BPRRecommender()
metrics = recommender.run_sample(minimum_num_rating=100)
print(metrics)

  high_rating_dataset = train_dataset[dataset.train.rating >= 4]
  self._set_intXint(row, col, x.flat[0])
100%|██████████| 50/50 [00:00<00:00, 402.12it/s, train_auc=80.72%, skipped=26.73%]

rmse: 0.000, precision@K: 0.021, recall@K: 0.064



