In [1]:
import pandas as pd
import numpy as np

# NOTE(review): unpickling can execute arbitrary code, so 'data/review.pkl'
# must come from a trusted source.
dataframes = pd.read_pickle('data/review.pkl')

# Join each review to its store (stores.id == reviews.store) and keep only the
# columns needed for category-level scoring.
category_score = pd.merge(
    dataframes["stores"], dataframes["reviews"], left_on="id", right_on="store"
)[["category", "user", "score"]]

# A store's categories are stored as one '|'-separated string. The vectorised
# .str.split keeps missing values as NaN instead of raising like a per-row
# str.split call would, and .assign avoids writing into a column-sliced frame.
category_score = category_score.assign(
    category=category_score["category"].str.split("|")
)

# One row per (category, user, score) so scores can be grouped by category.
category_explode = category_score.explode("category")

In [2]:
# Mean score each user gave to each category, as a flat
# (category, user, score) table.
category_scores = (
    category_explode
    .groupby(['category', 'user'])['score']
    .mean()
    .reset_index()
)

# Rows whose category is the empty string are kept as-is. To exclude them:
# category_scores = category_scores[category_scores['category'] != '']

In [12]:
# Full user x category matrix: rows are users, columns are categories, and each
# cell holds that user's mean score for the category (NaN when absent).
score_matrix = category_scores.pivot(
    index='user',
    columns='category',
    values='score',
)

from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of users, computed with scikit-learn.
# Missing scores are replaced by 0 first so each user row is a dense vector.
matrix_dummy = score_matrix.fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=score_matrix.index,
    columns=score_matrix.index,
)
print(user_similarity)

user    7       15      23      62      64      74      105     137     \
user                                                                     
7          1.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
15         0.0     1.0     0.0     0.0     0.0     0.0     0.0     0.0   
23         0.0     0.0     1.0     0.0     0.0     0.0     0.0     0.0   
62         0.0     0.0     0.0     1.0     0.0     0.0     0.0     0.0   
64         0.0     0.0     0.0     0.0     1.0     0.0     0.0     0.0   
...        ...     ...     ...     ...     ...     ...     ...     ...   
949946     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
949951     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
950224     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
950225     0.0     0.0     0.0     1.0     0.0     0.0     0.0     0.0   
950331     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

user      147     156     ...  949436

In [4]:
def CF_simple(user, category):
    """Predict ``user``'s score for ``category`` with basic collaborative filtering.

    The prediction is the mean of the scores in ``score_matrix[category]``,
    weighted by the matching entries of ``user_similarity[user]``. Users with
    no score for the category are left out of both vectors.

    Returns 3.0 when ``category`` is not a column of ``score_matrix`` or when
    the remaining similarities sum to 1 or less.
    """
    fallback = 3.0

    # A category that was never scored has no column to learn from.
    if category not in score_matrix:
        return fallback

    # Mean scores of every user for this category (NaN where unscored).
    ratings = score_matrix[category]

    # Similarities of `user` to the others, minus users who did not score it.
    unrated_users = ratings.index[ratings.isnull().values]
    weights = user_similarity[user].drop(unrated_users)

    # Keep only the existing scores so both vectors line up position by position.
    ratings = ratings.dropna()

    weight_total = weights.sum()
    if weight_total > 1:
        # Similarity-weighted mean: dot(similarities, scores) / sum(similarities).
        return np.dot(weights, ratings) / weight_total
    return fallback

def recommend_simple(user, n_items):
    """Return the top ``n_items`` predicted categories that ``user`` has not scored.

    Parameters
    ----------
    user : label of a row in ``score_matrix``
        Raises ``KeyError`` if the user has no row.
    n_items : int
        Maximum number of categories to return.

    Returns
    -------
    pandas.Series
        Predicted scores from ``CF_simple``, indexed by category and sorted in
        descending order. Categories the user already scored are excluded, so
        the result may hold fewer than ``n_items`` entries.
    """
    # The user's mean score per category; NaN marks categories not yet scored.
    user_category = score_matrix.loc[user]

    # Only unscored categories are candidates. Already-scored ones are dropped
    # outright rather than given a placeholder score of 0, so they can never
    # appear in the result even when n_items exceeds the number of candidates.
    unrated = user_category[user_category.isnull()].index

    predictions = pd.Series(
        [CF_simple(user, category) for category in unrated],
        index=unrated,
        name=user,
        dtype=float,
    )
    return predictions.sort_values(ascending=False).head(n_items)

In [5]:
# Show the top-5 category recommendations for two sample users.
for header, sample_user in (('7번 유저 추천', 7), ('15번 유저 추천', 15)):
    print(header)
    print(recommend_simple(user=sample_user, n_items=5))

7번 유저 추천
category
고려음식      4.858061
칠면조       4.848691
우럭조림      4.668416
소고기해장국    4.651163
보말칼국수     4.592455
Name: 7, dtype: float64
15번 유저 추천
category
중국음식     4.662956
프렌치      4.658387
초두부      4.635693
시오라멘     4.550817
사시미정식    4.544322
Name: 15, dtype: float64


In [6]:
from sklearn.model_selection import train_test_split

x = category_scores.copy()
y = category_scores['user']
# Hold out 25% of the (category, user, score) rows for evaluation. The fixed
# random_state makes the split, and every RMSE computed from it, reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [7]:
# Root-mean-square error helper.
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``.

    Both arguments are converted to NumPy arrays and subtracted element-wise.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

# RMSE of a prediction model on the held-out rows.
def score(model):
    """Return the RMSE of ``model`` over every (user, category) row of ``x_test``.

    ``model`` is called as ``model(user, category)`` and must return a
    predicted score, which is compared with the row's ``score`` column.
    """
    predictions = [
        model(user, category)
        for user, category in zip(x_test['user'], x_test['category'])
    ]
    actual = np.array(x_test['score'])
    return RMSE(actual, np.array(predictions))

# Rebuild the user x category matrix from the training rows only.
# This rebinds the module-level `score_matrix` name.
score_matrix = x_train.pivot(
    index='user',
    columns='category',
    values='score',
)

In [8]:
# Every user and category in the full data set, including those that only
# occur in the test rows.
index = x['user'].unique()
columns = x['category'].unique()

# Expand the training matrix to the full user x category grid. Cells absent
# from the training data stay NaN. reindex keeps the float dtype, whereas
# filling an empty object-dtype frame would rely on fillna downcasting.
train_matrix = score_matrix.reindex(index=index, columns=columns)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# User-user cosine similarity on the training matrix, with missing scores
# treated as 0. Rebinds the module-level `matrix_dummy` and `user_similarity`.
matrix_dummy = train_matrix.fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=train_matrix.index,
    columns=train_matrix.index,
)

In [10]:
import numpy as np
def CF_simple(user, category):
    """Predict ``user``'s score for ``category`` from the training matrix.

    The prediction is the mean of the scores in ``train_matrix[category]``,
    weighted by the matching entries of ``user_similarity[user]``. Users with
    no training score for the category are left out of both vectors.

    Returns 3.0 when ``category`` is not a column of ``train_matrix`` or when
    the remaining similarities sum to 1 or less.
    """
    fallback = 3.0
    if category not in train_matrix:
        return fallback

    ratings = train_matrix[category]
    # Users without a score for this category contribute nothing.
    unrated_users = ratings.index[ratings.isnull().values]
    weights = user_similarity[user].drop(unrated_users)
    ratings = ratings.dropna()

    weight_total = weights.sum()
    if weight_total > 1:
        return np.dot(weights, ratings) / weight_total
    return fallback

# Hold-out RMSE of the plain similarity-weighted predictor.
simple_rmse = score(CF_simple)
print(simple_rmse)

1.1606569292785525


In [11]:
# RMSE of a prediction model on the held-out rows.
def score(model, neighbor_size=None):
    """Return the RMSE of ``model`` over every (user, category) row of ``x_test``.

    Parameters
    ----------
    model : callable
        Called as ``model(user, category, neighbor_size)`` when
        ``neighbor_size`` is given, otherwise as ``model(user, category)``.
    neighbor_size : int, optional
        Neighbourhood size forwarded to ``model``. It is optional so that this
        definition, which replaces the earlier one-argument ``score``, still
        accepts models that take no neighbour size (e.g. ``score(CF_simple)``).

    Returns
    -------
    float
        Root-mean-square error between predictions and ``x_test['score']``.
    """
    id_pairs = zip(x_test['user'], x_test['category'])
    if neighbor_size is None:
        y_pred = np.array([model(user, category) for (user, category) in id_pairs])
    else:
        y_pred = np.array(
            [model(user, category, neighbor_size) for (user, category) in id_pairs]
        )
    y_true = np.array(x_test['score'])
    return RMSE(y_true, y_pred)

# Prediction restricted to the most similar neighbours.
def cf_knn(user, category, neighbor_size=0):
    """Predict ``user``'s score for ``category`` from the most similar users.

    Candidate neighbours are the users with a score in
    ``train_matrix[category]``; their weights come from
    ``user_similarity[user]``.

    * ``neighbor_size == 0``: every candidate is used.
    * otherwise: only the ``neighbor_size`` candidates with the highest
      similarity are used (all of them if there are fewer). Two or more
      candidates are required.

    Returns the similarity-weighted mean of the selected scores, or 3.0 when
    the category is not a column of ``train_matrix``, when there are too few
    candidates, or when the selected similarities sum to 1 or less.
    """
    fallback = 3.0
    if category not in train_matrix:
        return fallback

    ratings = train_matrix[category]
    # Users without a score for this category cannot act as neighbours.
    unrated_users = ratings.index[ratings.isnull().values]
    weights = user_similarity[user].drop(unrated_users)
    ratings = ratings.dropna()

    if neighbor_size != 0:
        # A neighbour size was requested: keep only the top-k most similar users.
        if len(weights) <= 1:
            return fallback
        k = min(neighbor_size, len(weights))
        weights = np.array(weights)
        ratings = np.array(ratings)
        # argsort is ascending, so the last k positions hold the largest weights.
        order = np.argsort(weights)
        weights = weights[order][-k:]
        ratings = ratings[order][-k:]

    weight_total = weights.sum()
    if weight_total > 1:
        return np.dot(weights, ratings) / weight_total
    return fallback

# Compare hold-out RMSE across a range of neighbourhood sizes.
for neighbor_size in range(10, 70, 10):
    knn_rmse = score(cf_knn, neighbor_size)
    print("이웃 크기: {0}, RMSE: {1:.4f}".format(neighbor_size, knn_rmse))

이웃 크기: 10, RMSE: 1.1890
이웃 크기: 20, RMSE: 1.1698
이웃 크기: 30, RMSE: 1.1650
이웃 크기: 40, RMSE: 1.1632


KeyboardInterrupt: 

In [None]:
def recommend_knn(user, n_items, neighbor_size=30):
    """Return the top ``n_items`` predicted categories that ``user`` has not scored.

    Parameters
    ----------
    user : label of a row in ``score_matrix``
        Raises ``KeyError`` if the user has no row.
    n_items : int
        Maximum number of categories to return.
    neighbor_size : int, default 30
        Neighbourhood size forwarded to ``cf_knn`` for every prediction.

    Returns
    -------
    pandas.Series
        Predicted scores from ``cf_knn``, indexed by category and sorted in
        descending order. Categories the user already scored are excluded, so
        the result may hold fewer than ``n_items`` entries.
    """
    # The user's mean score per category; NaN marks categories not yet scored.
    user_category = score_matrix.loc[user]

    # Only unscored categories are candidates. Already-scored ones are dropped
    # outright rather than given a placeholder score of 0, so they can never
    # appear in the result even when n_items exceeds the number of candidates.
    unrated = user_category[user_category.isnull()].index

    predictions = pd.Series(
        [cf_knn(user, category, neighbor_size) for category in unrated],
        index=unrated,
        name=user,
        dtype=float,
    )
    return predictions.sort_values(ascending=False).head(n_items)

# Show the top-10 neighbour-based recommendations for two sample users.
for header, sample_user in (('7번 유저 추천', 7), ('15번 유저 추천', 15)):
    print(header)
    print(recommend_knn(user=sample_user, n_items=10))