In [1]:
import os
import sys

# Make the local study repository importable (presumably where the `util` and
# `base` modules imported by later cells live -- confirm against your checkout).
# expanduser('~') resolves the home directory even when $HOME is unset, where
# os.environ.get("HOME") would return None and yield the bogus path 'None/workspace/...'.
PROJECT_ROOT = os.path.join(os.path.expanduser('~'), 'workspace', 'recommendation-study')

# Guard so that re-running this cell does not keep prepending duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [2]:
from util.data import DataLoader

# Load the study dataset; its `train` and `test` rating frames are used by the cells below.
loader = DataLoader()
dataset = loader.load()

### 메모리 기반 협업 필터링

1. 기존의 평가값들을 사용해 사용자 사이의 유사도를 계산한다
2. 기호가 비슷한(유사도가 큰) 사용자가 이미 평가한 아이템들을 사용해 대상 사용자의 예측 평가값을 계산해서 채워넣는다
3. 예측 평가값이 높은 후보들을 추천한다

In [3]:
import numpy as np

# Pearson correlation similarity
def pearson_coefficient(u: np.ndarray, v: np.ndarray):
    """Return the Pearson correlation coefficient between two rating vectors.

    Args:
        u: Ratings of the first user.
        v: Ratings of the second user, aligned element-wise with ``u``.

    Returns:
        The dot product of the mean-centered vectors divided by the product of
        their Euclidean norms, or ``0`` when either centered vector has zero
        norm (for example when all of one user's ratings are identical).
    """
    centered_u = u - np.mean(u)
    centered_v = v - np.mean(v)

    norm_u = np.sqrt(np.sum(centered_u * centered_u))
    norm_v = np.sqrt(np.sum(centered_v * centered_v))

    # A zero norm means the correlation is undefined; treat it as "no similarity".
    degenerate = norm_u == 0 or norm_v == 0
    return 0 if degenerate else np.dot(centered_u, centered_v) / (norm_u * norm_v)

In [10]:
# Pivot the training ratings into a user x movie matrix; a cell is NaN where the
# user has not rated the movie.
user_movie_matrix = dataset.train.pivot(index='user_id', columns='movie_id', values='rating')
user_id2index = {user_id: index for index, user_id in enumerate(user_movie_matrix.index)}
movie_id2index = {movie_id: index for index, movie_id in enumerate(user_movie_matrix.columns)}

# Plain ndarray copy of the matrix: positional row/column access is much cheaper
# than repeated label-based `.loc` lookups inside the nested loops below.
rating_matrix = user_movie_matrix.to_numpy()

movie_rating_predict = dataset.test.copy()

# Fallback for test users that have no training ratings (no row in the matrix).
global_avg = dataset.train.rating.mean()

test_user_ids = movie_rating_predict.user_id.unique()

for user1_id in test_user_ids:
    user1_rows = movie_rating_predict['user_id'] == user1_id

    if user1_id not in user_id2index:
        # Cold-start user: looking this user up in the matrix would raise KeyError,
        # so predict the global mean training rating instead.
        movie_rating_predict.loc[user1_rows, 'rating_pred'] = global_avg
        continue

    # The target user's ratings do not depend on the candidate neighbour,
    # so fetch them once instead of once per candidate.
    user1_ratings = rating_matrix[user_id2index[user1_id]]
    user1_rated = ~np.isnan(user1_ratings)

    similar_user_rows = []
    similarities = []
    avgs = []

    # Find users with similar taste: positive Pearson correlation on co-rated movies.
    for user2_id, user2_row in user_id2index.items():
        if user1_id == user2_id:
            continue

        user2_ratings = rating_matrix[user2_row]
        common_evaluations = user1_rated & ~np.isnan(user2_ratings)

        if not common_evaluations.any():
            continue

        user2_common = user2_ratings[common_evaluations]
        similarity = pearson_coefficient(user1_ratings[common_evaluations], user2_common)

        if similarity > 0:
            similar_user_rows.append(user2_row)
            similarities.append(similarity)
            # Neighbour mean is taken over the co-rated movies only.
            avgs.append(np.mean(user2_common))

    # Pre-fill every prediction for this user with their mean training rating.
    user1_avg = np.mean(user1_ratings[user1_rated])
    movie_rating_predict.loc[user1_rows, 'rating_pred'] = user1_avg

    if not similar_user_rows:
        continue

    similarity_weights = np.array(similarities)
    neighbour_means = np.array(avgs)
    test_movie_ids = movie_rating_predict.loc[user1_rows, 'movie_id'].values

    for movie_id in test_movie_ids:
        if movie_id not in movie_id2index:
            # Movie never seen in training: keep the user-mean default.
            continue

        similar_user_ratings = rating_matrix[similar_user_rows, movie_id2index[movie_id]]
        exists = ~np.isnan(similar_user_ratings)

        if not exists.any():
            continue

        exist_ratings = similar_user_ratings[exists]
        exist_similarities = similarity_weights[exists]
        exist_averages = neighbour_means[exists]

        # User mean plus the similarity-weighted average of the neighbours'
        # deviations from their own means. All weights are > 0, so the sum is non-zero.
        pred = user1_avg + np.dot(exist_similarities, exist_ratings - exist_averages) / np.sum(exist_similarities)

        movie_rating_predict.loc[user1_rows & (movie_rating_predict['movie_id'] == movie_id), 'rating_pred'] = pred

In [13]:
from collections import defaultdict
from util.models import Dataset, RecommendResult
from base import BaseRecommender
from surprise import KNNWithMeans, Reader
from surprise import Dataset as SurpriseDataset

# Fix NumPy's global seed so repeated runs behave the same.
np.random.seed(0)

# The Reader is configured for ratings on a 0.5-5 scale.
reader = Reader(rating_scale=(0.5, 5))
train_ratings = dataset.train[['user_id', 'movie_id', 'rating']]
surprise_data = SurpriseDataset.load_from_df(train_ratings, reader)
data_train = surprise_data.build_full_trainset()

print(data_train)

<surprise.trainset.Trainset object at 0x137537cd0>


In [33]:

        
class UMCFRecommender(BaseRecommender):
    """User-based memory collaborative filtering built on surprise's ``KNNWithMeans``.

    The model is fitted once in ``__init__`` on the notebook-level ``dataset``;
    ``recommend`` then only filters and ranks the precomputed predictions.
    NOTE(review): ``recommend`` receives its own ``dataset`` argument but reuses
    the model trained on the global one -- confirm both refer to the same split.
    """

    def __init__(self):
        """Fit the KNN model and precompute predictions for every unrated (user, movie) pair."""
        reader = Reader(rating_scale=(0.5, 5))
        data_train = SurpriseDataset.load_from_df(
            dataset.train[['user_id', 'movie_id', 'rating']], reader
        ).build_full_trainset()

        sim_options = {
            'name': 'pearson',
            'user_based': True,
        }

        self.knn = KNNWithMeans(k=30, min_k=1, sim_options=sim_options)
        self.knn.fit(data_train)

        # The anti-testset holds the pairs absent from training; scoring all of
        # them up front gives the candidate pool that recommend() ranks.
        data_test = data_train.build_anti_testset(None)
        self.predictions = self.knn.test(data_test)

    def recommend(self, dataset: Dataset, k: int, **kwargs) -> RecommendResult:
        """Predict test ratings and build a top-``k`` movie list per user.

        Args:
            dataset: Provides the ``train`` and ``test`` rating frames.
            k: Maximum number of movies recommended to each user.
            **kwargs: ``min_rating_size`` (default 200) -- only movies with at
                least this many training ratings may be recommended.

        Returns:
            A ``RecommendResult`` built from the predicted rating of every test
            row and a mapping from user id to the ranked movie ids.
        """
        # Only membership is needed here, so collect the ids directly instead
        # of materialising a full user x movie pivot table.
        known_user_ids = set(dataset.train['user_id'].unique())
        known_movie_ids = set(dataset.train['movie_id'].unique())

        # Number of training ratings per movie, used as a popularity filter.
        min_rating_size = kwargs.get('min_rating_size', 200)
        rating_counts = dataset.train.groupby('movie_id')['rating'].size()
        at_least_movie_ids = set(rating_counts[rating_counts >= min_rating_size].index)

        # Group the precomputed estimates of sufficiently-rated movies by user.
        candidates = defaultdict(list)
        for uid, iid, _true_r, est, _details in self.predictions:
            if iid in at_least_movie_ids:
                candidates[uid].append((iid, est))

        # Keep the k best-scored movies per user (previously hard-coded to 10,
        # which silently ignored the `k` argument).
        pred_user2items = defaultdict(list)
        for uid, scored in candidates.items():
            scored.sort(key=lambda pair: pair[1], reverse=True)
            pred_user2items[uid] = [iid for iid, _ in scored[:k]]

        # Rating prediction for each test row; users or movies unseen in
        # training fall back to the global mean training rating.
        average_score = dataset.train.rating.mean()
        pred_results = []
        for user_id, movie_id in zip(dataset.test['user_id'], dataset.test['movie_id']):
            if user_id in known_user_ids and movie_id in known_movie_ids:
                pred_results.append(self.knn.predict(user_id, movie_id).est)
            else:
                pred_results.append(average_score)

        movie_rating_predict = dataset.test.copy()
        movie_rating_predict['rating_pred'] = pred_results

        return RecommendResult(movie_rating_predict.rating_pred, pred_user2items)

In [34]:
# Fit the model once, then evaluate it under increasingly strict popularity thresholds.
recommender = UMCFRecommender()
rating_size_thresholds = (0, 100, 200, 300)

for min_rating_size in rating_size_thresholds:
    metrics = recommender.run_sample(min_rating_size=min_rating_size)
    print(f'min_rating_size={min_rating_size} => {metrics}')

Computing the pearson similarity matrix...
Done computing similarity matrix.
min_rating_size=0 => rmse: 0.962, precision@K: 0.002, recall@K: 0.004
min_rating_size=100 => rmse: 0.962, precision@K: 0.009, recall@K: 0.027
min_rating_size=200 => rmse: 0.962, precision@K: 0.013, recall@K: 0.041
min_rating_size=300 => rmse: 0.962, precision@K: 0.015, recall@K: 0.050


일정 수 이상 평가된 영화로 제한하면 precision, recall이 좋아진다. (대신 추천의 다양성은 점점 줄어든다.)