In [85]:
import pandas as pd

dataframes = pd.read_pickle('data/review.pkl')

In [86]:
category_score = pd.merge(
    dataframes["stores"], dataframes["reviews"], left_on="id", right_on="store"
)[["category", "user", "score"]]

# 기존에 저장된 category 문자열을 | 기준으로 나누어서 리스트로 저장해줍니다.
category_score["category"] = category_score["category"].apply(lambda c: c.split("|"))
# print(category_score.explode('category'))

# DataFrame.explode를 쓰면 리스트를 가진 row들을 분리해줍니다.
category_explode = category_score.explode("category").set_index('user')
# print 찍어보세요!
# print(category_explode)

In [87]:
def recom_category(n_items):
    return category_mean.sort_values(ascending=False)[:n_items]

category_mean = category_explode.groupby(['category'])['score'].mean()
# print(category_explode.groupby(['category'])['score'].mean())

# # 계층적 색인으로 저장된 걸 풀어주었습니다. (Column을 각각 가져오고 싶습니다.)
category_scores = category_explode.groupby(["user", "category"])['score'].agg(["mean", "size"]).rename(columns={'mean':'score'}).reset_index()
# print(category_scores)
category_scores = category_scores[category_scores['size'] > 20]

# print(pd.pivot(data=category_scores, index='user', columns='category', values='score'))

In [12]:
import numpy as np

#RMSE 계산해주는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)- np.array(y_pred)) ** 2))

# 모델별 RMSE 계산
def score(model):
    id_pairs = zip(x_test['user'], x_test['category'])
    y_pred = np.array([model(user, category) for (user, category) in id_pairs])
    y_true = np.array(x_test['score'])
    return RMSE(y_true, y_pred)

In [13]:
# 전체 데이터를 활용한 예측 - 전체 유저의 리뷰 평점을 예측값으로 두고 실제 값과 오차 계산
rmse = []
for user in set(category_explode.index):
    y_true = category_explode.loc[user]['score']
    y_pred = category_mean[category_explode.loc[user]['category']]
    accuracy = RMSE(y_true, y_pred)
    rmse.append(accuracy)
print(np.mean(rmse))

1.2220800583865947


In [88]:
from sklearn.model_selection import train_test_split

x = category_scores.copy()
y = category_scores['user']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
# 75%는 train set / 25%는 test set

# train 데이터로 Full Matrix 
rating_matrix = x_train.pivot(index='user', columns='category', values='score')

In [70]:
# 전체 평균으로 예측치를 계산하는 기본 모델
def best_seller(user, category):
    try:
        score = train_mean[category]
    except:
        score = 3.0
    return score

train_mean = x_train.groupby(['category'])['score'].mean()
score(best_seller)

1.127945982397897

In [89]:
# Full Matrix를 사용자 데이터와 Merge
users = dataframes['users'].rename(columns={'id': 'user'}).set_index('user')
merged_ratings = pd.merge(x_train, users, left_on='user', right_index=True)

# gender별 평균평점 계산
g_mean = merged_ratings[['category', 'gender', 'score']].groupby(['category', 'gender'])['score'].mean()
print(g_mean)
# age별 평균 평점 계산
a_mean = merged_ratings[['category', 'age', 'score']].groupby(['category', 'age'])['score'].mean()

category  gender
          여         4.000000
냉면        남         4.047619
중국집       남         3.480000
치킨        남         3.380952
카페        남         3.960043
          여         3.889776
커피        여         3.800000
평양냉면      남         4.322581
Name: score, dtype: float64


In [17]:
########## Gender 기준 추천 ############
# gender 별 평균을 예측치로 돌려주는 함수
def cf_gender(user, category):
    if category in rating_matrix:
        gender = users.loc[user]['gender']
        if gender in g_mean[category]:
            gender_rating = g_mean[category][gender]
        else:
            gender_rating = train_mean[category]
    else:
        gender_rating = 3.0
    return gender_rating

score(cf_gender)

1.144324779731331

In [18]:
# age 별 평균을 예측치로 돌려주는 함수
def cf_age(user, category):
    if category in rating_matrix:
        age = users.loc[user]['age']
        if age in a_mean[category]:
            age_rating = a_mean[category][age]
        else:
            age_rating = train_mean[category]
    else:
        age_rating = 3.0
    return age_rating

score(cf_age)

1.2495723961651406

In [19]:
users2 = users.copy()
users2['age'] = users2['age'].apply(lambda x: x//10)

merged_ratings2 = pd.merge(x_train, users2, left_on='user', right_index=True)

a_mean2 = merged_ratings2[['category', 'age', 'score']].groupby(['category', 'age'])['score'].mean()
# age 별 평균을 예측치로 돌려주는 함수
def cf_age2(user, category):
    if category in rating_matrix:
        age = users2.loc[user]['age']
        if age in a_mean2[category]:
            age_rating = a_mean2[category][age]
        else:
            age_rating = 3.0
    else:
        age_rating = 3.0
    return age_rating

score(cf_age2)

1.1693291200441522

In [20]:
users2 = users.copy()
users2['age'] = users2['age'].apply(lambda x: x//10)

merged_ratings3 = pd.merge(x_train, users2, left_on='user', right_index=True)

total_mean = merged_ratings3[['category', 'age', 'gender', 'score']].groupby(['category', 'gender', 'age'])['score'].mean()

def cf_age3(user, category):
    if category in rating_matrix:
        age = users2.loc[user]['age']
        gender = users2.loc[user]['gender']
        if gender in total_mean[category]:
            if age in total_mean[category][gender]:
                avg_rating = total_mean[category][gender][age]
            else:
                # age 데이터가 없으면 => gender 평균값 넣어줌
                avg_rating = g_mean[category][gender]
        else:
            # gender 데이터가 없으면
            # 만약 age 데이터가 있으면 => age 평균값 넣어줌
            if age in total_mean.loc[category, :]:
                avg_rating = a_mean2[category][age]
            else:
                avg_rating = train_mean[category]
    else:
        avg_rating = 3.0
    return avg_rating

score(cf_age3)

1.188234240613462

In [83]:
def recommend_category(user, n_items):
    user_category = rating_matrix.loc[user]
    for category in rating_matrix:
        if pd.isnull(user_category.loc[category]):
            user_category.loc[category] = best_seller(user, category)
    category_sort = user_category.sort_values(ascending=False)[:n_items]
    return category_sort

# 전체 평균으로 예측치를 계산하는 기본 모델
def best_seller(user, category):
    try:
        score = train_mean[category]
    except:
        score = 3.0
    return score

print(category_scores)
# recommend_category(62, 5)

          user category     score  size
3109      5666       카페  3.840000    25
4286      8647       카페  3.851852    27
6829     23284      중국집  3.480000    25
9287     28600       카페  3.565217    23
10520    32143       카페  3.233333    90
20443    69019       카페  3.727273    22
21184    70650       카페  3.636364    22
21745    71632       카페  4.480000    25
23479    74999       카페  3.886525   141
23482    74999       커피  3.800000    30
37242   129333       카페  3.451613    31
38762   132504       카페  3.727273    22
38951   132643       카페  4.440000    25
42607   145101       카페  3.913043    23
44711   151946       카페  3.846154    26
46791   156952       카페  4.318182    22
53078   179719      삼겹살  3.434783    23
53199   179719       치킨  3.380952    21
53921   180519       카페  3.756757    37
56442   191568       카페  3.755102    49
63319   217350       카페  3.931034    29
63907   218901       카페  3.939394    66
64475   220831       카페  3.782609    23
67146   237668       카페  4.423077    26


In [75]:
def recommend_category2(user, n_items):
    user_category = rating_matrix.loc[user]
    for category in rating_matrix:
        if pd.isnull(user_category.loc[category]):
            user_category.loc[category] = best_seller(user, category)
    category_sort = user_category.sort_values(ascending=False)[:n_items]
    return category_sort

recommend_category2(62, 5)

category
블루베리막걸리    5.0
똥집구이       5.0
된장술밥       5.0
된장찌게       5.0
유부         5.0
Name: 62, dtype: float64

In [71]:
test_df = category_score = pd.merge(
    dataframes["stores"], dataframes["reviews"], left_on="id", right_on="store"
)[["category", "user", "score"]]

print(test_df[test_df['user']==68632])

         category   user  score
0          호주레스토랑  68632      5
325                68632      5
1412        피자|맥주  68632      5
1527      팬케이크|카페  68632      5
2300   오리불고기|유황오리  68632      5
...           ...    ...    ...
87932       카페|홍차  68632      5
88311              68632      4
88564   닭볶음탕|꼬막무침  68632      5
88749  능이버섯백숙|토종닭  68632      5
90365     보쌈정식|보쌈  68632      5

[121 rows x 3 columns]
