암시적 평가에 대한 행렬 분해는 평가값이 명시적(ex. 넷플릭스의 별점, 상품 별점 등)이지 않은 경우에 대한 행렬 분해이다.
암시적 평가의 예시로는 커머스의 상품 클릭, 구매 혹은 동영상 스트리밍에서 유저의 체류시간 등이 있다.

암시적 평가 데이터를 활용한 MF 모델의 손실 함수는 명시적 피드백의 경우와 다름. 수식은 세우기 나름인데, 예를 들어 클릭 횟수가 1번 이상인 경우 평가값이 1, 아니면 0과 같으면서, 이러한 평가값에 대한 신뢰도를 모델링하는 항이 존재함. 클릭이 여러번일수록 신뢰도가 올라가고 유저의 해당 아이템에 대한 예측값이 1에 더 가까워지도록 모델링 하는게 일반적임. 이외에 정규화나 bias 등의 항은 추가하기 나름

In [2]:
import os
import sys

sys.path.insert(0, f'{os.environ.get("HOME")}/workspace/recommendation-study')

In [22]:
from util.models import Dataset, RecommendResult
from util.data import DataLoader
from recommend.base import BaseRecommender
from collections import defaultdict
import numpy as np
import implicit
from scipy.sparse import csr_matrix
np.random.seed(0)

In [10]:
dataset = DataLoader().load()

In [23]:
factors = 10
minimum_num_rating = 0
n_epochs = 50
alpha = 1.0

train_data = dataset.train.groupby('movie_id').filter(lambda x: len(x['movie_id']) >= minimum_num_rating)
high_rating_train_data = train_data[dataset.train.rating >= 4]

unique_user_ids = high_rating_train_data.user_id.unique()
unique_movie_ids = high_rating_train_data.movie_id.unique()

user_id2index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
movie_id2index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

matrix = csr_matrix((len(unique_user_ids), len(unique_movie_ids)))

for i, row in high_rating_train_data.iterrows():
    user_id = row['user_id']
    movie_id = row['movie_id']
    user_index = user_id2index[user_id]
    movie_index = movie_id2index[movie_id]

    matrix[user_index, movie_index] = 1.0 * alpha

print(matrix)

  self._set_intXint(row, col, x.flat[0])


  (0, 0)	1.0
  (0, 3)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 11)	1.0
  (0, 12)	1.0
  (0, 14)	1.0
  (0, 17)	1.0
  (0, 18)	1.0
  (0, 19)	1.0
  (0, 22)	1.0
  (0, 23)	1.0
  (0, 24)	1.0
  (0, 25)	1.0
  (0, 27)	1.0
  (0, 29)	1.0
  (0, 34)	1.0
  (0, 37)	1.0
  (0, 39)	1.0
  (0, 42)	1.0
  (0, 44)	1.0
  (0, 45)	1.0
  (0, 46)	1.0
  (0, 48)	1.0
  (0, 50)	1.0
  :	:
  (992, 1771)	1.0
  (992, 3571)	1.0
  (993, 694)	1.0
  (993, 961)	1.0
  (993, 1218)	1.0
  (993, 1362)	1.0
  (993, 2050)	1.0
  (993, 2178)	1.0
  (993, 2425)	1.0
  (994, 860)	1.0
  (994, 3082)	1.0
  (994, 3399)	1.0
  (994, 3548)	1.0
  (995, 1580)	1.0
  (995, 3253)	1.0
  (995, 3473)	1.0
  (995, 3549)	1.0
  (995, 4076)	1.0
  (995, 4670)	1.0
  (995, 4813)	1.0
  (995, 4814)	1.0
  (996, 1815)	1.0
  (996, 3394)	1.0
  (996, 3415)	1.0
  (996, 3637)	1.0


In [26]:
model = implicit.als.AlternatingLeastSquares(
    factors=factors,
    iterations=n_epochs,
    calculate_training_loss=True,
    random_state=1,
)
model.fit(matrix)

recommendations = model.recommend_all(matrix)
print(recommendations.shape)
print(recommendations)

100%|██████████| 50/50 [00:01<00:00, 43.16it/s, loss=0.0096] 


(997, 10)
[[1354    4   75 ...  618   30  508]
 [   7   73  132 ...   11 1322   18]
 [  22  132  593 ...  136   34   14]
 ...
 [  64  766   60 ...   56  278  529]
 [1232 1082  933 ...   43  479  481]
 [  64  766   63 ...  852   56 2740]]


In [33]:
class IMFRecommender(BaseRecommender):
    def recommend(self, dataset: Dataset, k: int, **kwargs) -> RecommendResult:
        factors = kwargs.get('factors', 10)
        minimum_num_rating = kwargs.get('minimum_num_rating', 0)
        n_epochs = kwargs.get('n_epochs', 50)
        alpha = kwargs.get('alpha', 1.0)

        train_data = dataset.train.groupby('movie_id').filter(lambda x: len(x['movie_id']) >= minimum_num_rating)
        high_rating_train_data = train_data[dataset.train.rating >= 4]

        unique_user_ids = high_rating_train_data.user_id.unique()
        unique_movie_ids = high_rating_train_data.movie_id.unique()

        user_id2index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
        movie_id2index = {movie_id: index for index, movie_id in enumerate(unique_movie_ids)}

        matrix = csr_matrix((len(unique_user_ids), len(unique_movie_ids)))

        for i, row in high_rating_train_data.iterrows():
            user_id = row['user_id']
            movie_id = row['movie_id']
            user_index = user_id2index[user_id]
            movie_index = movie_id2index[movie_id]

            matrix[user_index, movie_index] = 1.0 * alpha

        model = implicit.als.AlternatingLeastSquares(
            factors=factors,
            iterations=n_epochs,
            calculate_training_loss=True,
            random_state=1,
        )
        model.fit(matrix)

        recommendations = model.recommend_all(matrix)
        pred_user2items = defaultdict(list)
        for user_id, user_index in user_id2index.items():
            movie_indices = recommendations[user_index]
            pred_user2items[user_id] = [unique_movie_ids[movie_index] for movie_index in movie_indices]

        return RecommendResult(dataset.test.rating, pred_user2items)

In [34]:
recommender = IMFRecommender()
recommender.run_sample()

  self._set_intXint(row, col, x.flat[0])
100%|██████████| 50/50 [00:01<00:00, 49.09it/s, loss=0.0096] 


rmse: 0.000, precision@K: 0.024, recall@K: 0.075