In [4]:
import os
import sys

# Make the local project checkout importable (`util`, `recommend`, ...).
# `os.path.expanduser('~')` honours $HOME when it is set but does not turn an
# unset $HOME into the literal string "None", and the membership guard keeps
# re-running this cell from stacking duplicate entries on sys.path.
_PROJECT_ROOT = os.path.join(os.path.expanduser('~'), 'workspace', 'recommendation-study')
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

In [5]:
from util.models import Dataset, RecommendResult
from recommend.base import BaseRecommender
from collections import defaultdict
import logging
from gensim.corpora.dictionary import Dictionary
import gensim
import numpy as np

# Fix the state of NumPy's legacy global RNG so repeated "Restart & Run All"
# executions start from the same random stream.
# NOTE(review): gensim presumably falls back to this global RNG when no
# random_state is given to LdaModel — confirm against the gensim docs.
np.random.seed(seed=0)

In [8]:
class LdaCollaborativeFilteringRecommender(BaseRecommender):
    """Top-k movie recommender that runs LDA over users' highly rated movies.

    Every user with at least one training rating >= 4 is turned into one
    "document" whose tokens are the stringified ``movie_id`` values of those
    ratings. An LDA model is trained on these documents; each user is then
    assigned the single topic with the highest probability, and that topic's
    terms (movies) are recommended in the order ``get_topic_terms`` returns
    them, skipping movies the user already rated in the training data.
    """

    def recommend(self, dataset: Dataset, k: int, **kwargs) -> RecommendResult:
        """Train LDA on ``dataset.train`` and build per-user top-``k`` lists.

        Args:
            dataset: Object exposing ``train`` (a frame with ``user_id``,
                ``movie_id`` and ``rating`` columns), ``test`` (its ``rating``
                column is passed through) and ``item_content`` (only its
                length is used, as the number of topic terms to request).
            k: Maximum number of movies to recommend per user. ``k <= 0``
                yields empty lists.
            **kwargs: Optional settings:
                ``n_topics`` (default 50) -> ``LdaModel(num_topics=...)``;
                ``n_epochs`` (default 30) -> ``LdaModel(passes=...)``;
                ``random_state`` (default ``None``) -> forwarded to
                ``LdaModel(random_state=...)`` for reproducible training.

        Returns:
            ``RecommendResult(dataset.test.rating, pred_user2items)`` where
            ``pred_user2items`` maps each user that has at least one training
            rating >= 4 to a list of recommended movie ids. Users without such
            a rating get no entry. The first argument is the test ratings
            themselves: this recommender produces no rating predictions, so
            any rating-error metric derived from it is presumably trivial
            (TODO confirm against ``RecommendResult``).
        """
        n_topics = kwargs.get('n_topics', 50)
        n_epochs = kwargs.get('n_epochs', 30)
        random_state = kwargs.get('random_state')

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        # One document per user: the movies that user rated 4 or higher.
        # user_ids[i] is kept alongside lda_data[i] so users and corpus rows
        # stay aligned without relying on a second, identical groupby pass.
        high_rating = dataset.train[dataset.train.rating >= 4]
        user_ids = []
        lda_data = []
        for user_id, data in high_rating.groupby('user_id'):
            user_ids.append(user_id)
            lda_data.append(data['movie_id'].apply(str).tolist())

        common_dictionary = Dictionary(lda_data)
        common_corpus = [common_dictionary.doc2bow(doc) for doc in lda_data]

        lda_model = gensim.models.LdaModel(
            corpus=common_corpus,
            num_topics=n_topics,
            id2word=common_dictionary,
            passes=n_epochs,
            random_state=random_state,
        )

        # Every movie a user rated in train (any rating) is excluded from that
        # user's recommendations; sets make the membership test O(1).
        user_evaluated_movies = {
            user_id: set(movie_ids)
            for user_id, movie_ids in dataset.train.groupby('user_id')['movie_id']
        }

        n_candidates = len(dataset.item_content)
        # Ranked movie ids per topic, computed once per topic instead of once
        # per user (many users share the same dominant topic).
        topic_movie_ids = {}

        pred_user2items = defaultdict(list)

        for user_id, bow in zip(user_ids, common_corpus):
            # minimum_probability=0.0 keeps topics the model's default cutoff
            # would drop, so the list is never empty even for large n_topics.
            doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
            user_topic = max(doc_topics, key=lambda topic: topic[1])[0]

            if user_topic not in topic_movie_ids:
                # gensim fills `Dictionary.id2token` lazily, so look tokens up
                # through `common_dictionary[term_id]`, which builds the
                # reverse map on demand, rather than reading the attribute.
                topic_movie_ids[user_topic] = [
                    int(common_dictionary[term_id])
                    for term_id, _score in lda_model.get_topic_terms(user_topic, topn=n_candidates)
                ]

            seen = user_evaluated_movies[user_id]
            # Touching the defaultdict guarantees an entry (possibly empty)
            # for every user that has a document.
            recommended = pred_user2items[user_id]
            for movie_id in topic_movie_ids[user_topic]:
                # Check before appending so that k <= 0 yields an empty list.
                if len(recommended) >= k:
                    break
                if movie_id not in seen:
                    recommended.append(movie_id)

        return RecommendResult(dataset.test.rating, pred_user2items)

In [9]:
# Evaluate the recommender defined above. `run_sample()` is not defined in this
# notebook (presumably inherited from BaseRecommender — TODO confirm which data
# it loads and which k it passes to `recommend`).
recommender = LdaCollaborativeFilteringRecommender()
metrics = recommender.run_sample()
# NOTE(review): `recommend` hands `dataset.test.rating` back in the rating slot
# of RecommendResult, which is presumably why the printed rmse is 0.000 — only
# the ranking metrics (precision@K / recall@K) are informative here.
print(metrics)

2023-08-07 22:28:17,845 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-08-07 22:28:17,882 : INFO : built Dictionary<4987 unique tokens: ['185', '231', '292', '316', '329']...> from 997 documents (total 67731 corpus positions)
2023-08-07 22:28:17,882 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<4987 unique tokens: ['185', '231', '292', '316', '329']...> from 997 documents (total 67731 corpus positions)", 'datetime': '2023-08-07T22:28:17.882551', 'gensim': '4.3.1', 'python': '3.9.9 (main, May 31 2023, 19:14:41) \n[Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.2.1-arm64-arm-64bit', 'event': 'created'}
2023-08-07 22:28:17,904 : INFO : using symmetric alpha at 0.02
2023-08-07 22:28:17,905 : INFO : using symmetric eta at 0.02
2023-08-07 22:28:17,905 : INFO : using serial LDA version on this node
2023-08-07 22:28:17,915 : INFO : running online (multi-pass) LDA training, 50 topics, 30 passes over the supplied corpus of 997 documents, updatin

rmse: 0.000, precision@K: 0.020, recall@K: 0.065
