In [1]:
import os
import sys

# Put the local project checkout at the front of the import path; the later
# `util.*` / `recommend.*` imports are presumably resolved from this directory.
# `os.path.expanduser("~")` is used instead of `os.environ.get("HOME")` so that
# an unset HOME cannot silently produce "None/workspace/recommendation-study".
sys.path.insert(0, f'{os.path.expanduser("~")}/workspace/recommendation-study')

In [2]:
from util.data import DataLoader

# Load the dataset through the project's DataLoader; later cells read
# `dataset.train` and `dataset.test` from the returned object.
dataset = DataLoader().load()

In [3]:
# Reshape the training ratings into a user x movie table (rows: user_id,
# columns: movie_id); pairs without a rating become NaN.
user_movie_matrix = dataset.train.pivot(index='user_id', columns='movie_id', values='rating')
user_num, item_num = user_movie_matrix.shape
non_null_num = user_movie_matrix.notna().sum().sum()

# Density = share of cells in the table that actually hold a rating.
non_null_ratio = non_null_num / user_movie_matrix.size

print(f'사용자 수={user_num}, 아이템 수={item_num}, 밀도={non_null_ratio:.3f}')

사용자 수=1000, 아이템 수=6673, 밀도=0.019


1. 행렬분해 방식으로 SVD를 적용
2. SVD는 결손값이 있는 행렬에 적용할 수 없으므로, 분해 전에 결손값을 0 혹은 평가값의 평균으로 치환

In [4]:
import scipy
import numpy as np

# Replace every missing rating with the mean of all training ratings, then
# convert the table to a dense NumPy array (displayed as the cell output).
matrix = user_movie_matrix.fillna(value=dataset.train['rating'].mean()).to_numpy()
matrix

array([[3.57479074, 3.57479074, 3.57479074, ..., 3.57479074, 3.57479074,
        3.57479074],
       [3.57479074, 3.57479074, 3.57479074, ..., 3.57479074, 3.57479074,
        3.57479074],
       [3.57479074, 3.57479074, 3.57479074, ..., 3.57479074, 3.57479074,
        3.57479074],
       ...,
       [5.        , 3.57479074, 3.        , ..., 3.57479074, 3.57479074,
        3.57479074],
       [3.57479074, 3.57479074, 3.57479074, ..., 3.57479074, 3.57479074,
        3.57479074],
       [5.        , 3.57479074, 3.57479074, ..., 3.57479074, 3.57479074,
        3.57479074]])

In [5]:
# Import `svds` explicitly: a bare `import scipy` is not guaranteed to expose the
# `scipy.sparse.linalg` submodule as an attribute on every SciPy version.
from scipy.sparse.linalg import svds

# Truncated SVD with 5 latent factors: user factors (P), singular values (S)
# and item factors (Qt).
P, S, Qt = svds(matrix, k=5)

# Rebuild the dense rank-5 approximation; scaling the columns of P by S is
# equivalent to P @ diag(S) without materialising the diagonal matrix.
pred_matrix = (P * S) @ Qt

print(f'P: {P.shape}, S: {S.shape}, Qt: {Qt.shape}, pred_matrix: {pred_matrix.shape}')

P: (1000, 5), S: (5,), Qt: (5, 6673), pred_matrix: (1000, 6673)


In [6]:
from pandas import DataFrame
from sklearn.decomposition import NMF

# Singular Value Decomposition
def svd(matrix: np.ndarray, factors: int):
    """Return the rank-`factors` truncated-SVD approximation of `matrix`.

    Args:
        matrix: Dense 2-D array-like without missing values (the recommender
            in this notebook passes a NumPy array). Non-floating input is cast
            to float because `svds` only accepts floating or complex data.
        factors: Number of singular values/vectors to keep. With the default
            `svds` solver this must satisfy 1 <= factors < min(matrix.shape).

    Returns:
        Dense array with the same shape as `matrix`, computed as (P * S) @ Qt.
    """
    # Imported explicitly: a bare `import scipy` is not guaranteed to expose the
    # `scipy.sparse.linalg` submodule as an attribute on every SciPy version.
    from scipy.sparse.linalg import svds

    dense = np.asarray(matrix)
    if not np.issubdtype(dense.dtype, np.inexact):
        dense = dense.astype(float)

    P, S, Qt = svds(dense, k=factors)
    # Scaling the columns of P by S is equivalent to P @ diag(S).
    return (P * S) @ Qt

# Nonnegative Matrix Factorization
def nmf(matrix: np.ndarray, factors: int):
    """Return the rank-`factors` NMF approximation P @ Q of `matrix`.

    Args:
        matrix: Dense 2-D array-like without missing values (the recommender
            in this notebook passes a NumPy array). scikit-learn's NMF expects
            non-negative entries.
        factors: Number of components to factorize into.

    Returns:
        Dense array with the same shape as `matrix`: the product of the
        transformed data P and the learned components Q.
    """
    model = NMF(n_components=factors)
    # fit_transform() both fits the model and returns P; a separate fit()
    # call beforehand would only repeat the whole optimisation.
    P = model.fit_transform(matrix)
    Q = model.components_
    return np.dot(P, Q)

In [7]:
from collections import defaultdict
from util.models import RecommendResult, Dataset
from recommend.base import BaseRecommender

class MFRecommender(BaseRecommender):
    """Recommender that predicts ratings from a low-rank factorization of the
    user x movie rating matrix."""

    def recommend(self, dataset: Dataset, k: int, **kwargs) -> RecommendResult:
        """Predict test ratings and build a top-`k` list for every training user.

        Args:
            dataset: Object exposing `train` (columns `user_id`, `movie_id`,
                `rating`) and `test` (columns `user_id`, `movie_id`) DataFrames.
            k: Maximum number of movies to recommend per user. For `k <= 0`
                every user gets an empty list.
            **kwargs:
                factorizer: Required callable `(matrix, factors=...)` returning
                    a dense prediction matrix with the same shape as `matrix`
                    (e.g. `svd` or `nmf`).
                factors: Number of latent factors passed to the factorizer
                    (default 10).
                fillna_with_zero: If True (default) missing ratings are filled
                    with 0 before factorizing, otherwise with the mean
                    training rating.

        Returns:
            RecommendResult built from the predicted ratings for the test rows
            and a dict mapping each training user_id to recommended movie_ids.

        Raises:
            TypeError: If `factorizer` is missing or not callable.
        """
        fillna_with_zero = kwargs.get('fillna_with_zero', True)
        factors = kwargs.get('factors', 10)
        factorizer = kwargs.get('factorizer')
        if not callable(factorizer):
            # Fail early with a clear message instead of the opaque
            # "'NoneType' object is not callable" raised further down.
            raise TypeError(
                "MFRecommender.recommend() requires a callable 'factorizer' keyword "
                "argument taking (matrix, factors=...), e.g. svd or nmf"
            )

        user_movie_matrix = dataset.train.pivot(index='user_id', columns='movie_id', values='rating')
        # Lookup tables between ids and row/column positions of the matrix.
        user_id2index = {user_id: index for index, user_id in enumerate(user_movie_matrix.index)}
        movie_id2index = {movie_id: index for index, movie_id in enumerate(user_movie_matrix.columns)}
        movie_index2id = {index: movie_id for movie_id, index in movie_id2index.items()}

        average_score = dataset.train.rating.mean()
        fill_value = 0 if fillna_with_zero else average_score
        matrix = user_movie_matrix.fillna(fill_value).to_numpy()

        pred_matrix = factorizer(matrix, factors=factors)

        # Rating prediction for the test rows; users or movies that do not
        # appear in the training matrix fall back to the mean training rating.
        movie_rating_predict = dataset.test.copy()
        pred_results = []
        for user_id, movie_id in zip(dataset.test['user_id'], dataset.test['movie_id']):
            user_index = user_id2index.get(user_id)
            movie_index = movie_id2index.get(movie_id)
            if user_index is None or movie_index is None:
                pred_results.append(average_score)
            else:
                pred_results.append(pred_matrix[user_index, movie_index])

        movie_rating_predict['rating_pred'] = pred_results

        # Top-k recommendation: walk the movies in descending predicted score
        # and keep the first k that the user has not rated in training.
        pred_user2items = defaultdict(list)
        evaluated_lists = dataset.train.groupby('user_id').agg({'movie_id': list})['movie_id'].to_dict()
        # Sets give O(1) membership tests inside the per-movie loop below.
        user_evaluated_movies = {user_id: set(movies) for user_id, movies in evaluated_lists.items()}

        for user_id in dataset.train.user_id.unique():
            user_index = user_id2index[user_id]
            seen_movies = user_evaluated_movies[user_id]
            recommended = pred_user2items[user_id]  # creates the entry even when it stays empty
            if k <= 0:
                continue
            for movie_index in np.argsort(-pred_matrix[user_index, :]):
                movie_id = movie_index2id[movie_index]
                if movie_id in seen_movies:
                    continue
                recommended.append(movie_id)
                if len(recommended) >= k:
                    break

        return RecommendResult(movie_rating_predict.rating_pred, pred_user2items)

In [8]:
# Experiment: SVD factorizer, fillna_with_zero=True (missing ratings -> 0), 10 factors.
# `run_sample` is not defined on MFRecommender, so it comes from BaseRecommender
# (not shown); presumably it forwards these keyword arguments to recommend().
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=True, factors=10, factorizer=svd)
print(metrics)

rmse: 3.274, precision@K: 0.027, recall@K: 0.084


In [9]:
# Experiment: SVD factorizer, fillna_with_zero=False (missing ratings -> mean
# training rating), 10 factors.
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=False, factors=10, factorizer=svd)
print(metrics)

rmse: 1.042, precision@K: 0.021, recall@K: 0.067


In [10]:
# Experiment: SVD factorizer, fillna_with_zero=True (missing ratings -> 0), 100 factors.
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=True, factors=100, factorizer=svd)
print(metrics)

rmse: 3.343, precision@K: 0.029, recall@K: 0.093


In [11]:
# Experiment: SVD factorizer, fillna_with_zero=False (missing ratings -> mean
# training rating), 100 factors.
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=False, factors=100, factorizer=svd)
print(metrics)

rmse: 1.044, precision@K: 0.023, recall@K: 0.074


In [12]:
# Experiment: NMF factorizer, fillna_with_zero=True (missing ratings -> 0), 10 factors.
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=True, factors=10, factorizer=nmf)
print(metrics)

rmse: 3.290, precision@K: 0.025, recall@K: 0.080


In [13]:
# Experiment: NMF factorizer, fillna_with_zero=False (missing ratings -> mean
# training rating), 10 factors.
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=False, factors=10, factorizer=nmf)
print(metrics)

rmse: 1.053, precision@K: 0.016, recall@K: 0.050


In [14]:
# Experiment: NMF factorizer, fillna_with_zero=True (missing ratings -> 0), 100 factors.
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=True, factors=100, factorizer=nmf)
print(metrics)



rmse: 3.306, precision@K: 0.028, recall@K: 0.091


In [15]:
# Experiment: NMF factorizer, fillna_with_zero=False (missing ratings -> mean
# training rating), 100 factors.
recommender = MFRecommender()
metrics = recommender.run_sample(fillna_with_zero=False, factors=100, factorizer=nmf)
print(metrics)

rmse: 1.081, precision@K: 0.013, recall@K: 0.042
