In [62]:
import pandas as pd

dataframes = pd.read_pickle('data/review.pkl')

category_score = pd.merge(
    dataframes["stores"], dataframes["reviews"], left_on="id", right_on="store"
)[["category", "user", "score"]]

category_score["category"] = category_score["category"].apply(lambda c: c.split("|"))

category_explode = category_score.explode("category").set_index('user')
print(category_explode)

       category  score
user                  
68632    호주레스토랑      5
389728     샌드위치      5
389728       쥬스      5
68716     전주비빔밥      4
774353     굴비정식      2
...         ...    ...
190766      소고기      5
201564     고기뷔페      4
201564      소고기      4
611078     고기뷔페      5
611078      고기집      5

[172565 rows x 2 columns]


In [63]:
category_scores = (category_explode.groupby(["category"])["score"]
                  .agg(["mean", "size"])
                  .reset_index()
                  .drop([0]))

category_scores_mean = category_scores[category_scores["size"] >= 5].set_index(["category"])["mean"]
print(category_scores_mean)

category
bar       4.090909
cafe      3.806283
la갈비      3.545455
lp바       4.400000
pasta     3.886364
            ...   
흑염소       3.526316
흑우        3.500000
흑임자팥빙수    3.214286
히레까스      4.040000
히츠마부시     4.272727
Name: mean, Length: 2385, dtype: float64


In [64]:
category_user_score = category_explode[category_explode["category"].isin(category_scores_mean.index)].reset_index()
category_user_score = category_user_score.groupby(["user","category"])["score"].mean().reset_index()
print(category_user_score)

rating_matrix = pd.pivot(data=category_user_score, index="user", columns="category", values="score")
print(rating_matrix)

          user category  score
0            7       국수    5.0
1            7      삼겹살    4.0
2            7    소고기국밥    4.0
3            7    소고기국수    5.0
4            7     쇠고기국    4.0
...        ...      ...    ...
142352  950224    치즈닭갈비    1.0
142353  950224      해산물    5.0
142354  950224      흑돼지    5.0
142355  950331       김치    5.0
142356  950331    김치돈까스    5.0

[142357 rows x 3 columns]
category  bar  cafe  la갈비  lp바  pasta  pizza  pub  가라아게  가리국밥  가리비  ...  \
user                                                                ...   
7         NaN   NaN   NaN  NaN    NaN    NaN  NaN   NaN   NaN  NaN  ...   
15        NaN   NaN   NaN  NaN    NaN    NaN  NaN   NaN   NaN  NaN  ...   
23        NaN   NaN   NaN  NaN    NaN    NaN  NaN   NaN   NaN  NaN  ...   
64        NaN   NaN   NaN  NaN    NaN    NaN  NaN   NaN   NaN  NaN  ...   
74        NaN   NaN   NaN  NaN    NaN    NaN  NaN   NaN   NaN  NaN  ...   
...       ...   ...   ...  ...    ...    ...  ...   ...   ...  ...  ...   
9

In [4]:
import numpy as np

#RMSE 계산해주는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)- np.array(y_pred)) ** 2))

# 모델별 RMSE 계산
def score(model):
    id_pairs = zip(x_test['user'], x_test['category'])
    y_pred = np.array([model(user, category) for (user, category) in id_pairs])
    y_true = np.array(x_test['score'])
    return RMSE(y_true, y_pred)

In [12]:
def recommend_category(user, n_items):
    user_category = rating_matrix.loc[user]
    for category in rating_matrix:
        if pd.isnull(user_category.loc[category]):
            user_category.loc[category] = best_seller(user, category)
    category_sort = user_category.sort_values(ascending=False)[:n_items]
    return category_sort

# 전체 평균으로 예측치를 계산하는 기본 모델
def best_seller(user, category):
    try:
        score = category_mean[category]
    except:
        score = category_explode['score'].mean()
    return score

category_mean = category_explode.groupby(['category'])['score'].mean()
print(recommend_category(7, 10))

category
해산물뷔페     5.000000
철판제육볶음    5.000000
국수        5.000000
소고기국수     5.000000
떡볶이돈까스    5.000000
고려음식      4.863636
칠면조       4.857143
오리초밥      4.833333
루프트탑      4.833333
문화카페      4.833333
Name: 7, dtype: float64


In [57]:
users = dataframes['users'].rename(columns={'id': 'user'}).set_index('user')

merged_scores = pd.merge(category_explode, users, left_index=True, right_index=True)

gender_scores = (merged_scores.groupby(['category', 'gender'])['score']
                 .agg(["size", "mean"])
                 .reset_index()
                 .drop([0, 1]))

gender_scores = gender_scores[gender_scores['size'] >= 5]
gender_scores = gender_scores.set_index(['category', 'gender'])
category_mean = category_explode.groupby(['category'])['score'].mean()
print(gender_scores.loc['cafe'])

        size      mean
gender                
남         67  3.537313
여        124  3.951613


In [69]:
def recommend_gender_category(user, n_items):
    user_category = rating_matrix.loc[user]
    for category in rating_matrix:
        if pd.isnull(user_category.loc[category]):
            user_category.loc[category] = best_seller(user, category)
    category_sort = user_category.sort_values(ascending=False)[:n_items]
    return category_sort

def cf_gender(user, category):
    if category in rating_matrix:
        gender = users.loc[user]['gender']
        if gender in gender_scores.loc[category]:
            gender_rating = gender_scores.loc[category, gender]
        else:
            gender_rating = category_mean[category]
    else:
        gender_rating = 3.0
    return gender_rating

print(recommend_gender_category(5666, 30))

category
모밀       5.0
나시고랭     5.0
중식당      5.0
랍스타      5.0
태국음식     5.0
        ... 
돈카츠      3.0
야채고로케    3.0
대구탕      3.0
낙지볶음     2.0
포장마차     2.0
Name: 5666, Length: 206, dtype: float64
