In [434]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import math 

from collections import Counter
pd.options.mode.chained_assignment = None  # default='warn'


In [207]:
rating_df = pd.read_csv("data/ratings_small.csv")

In [9]:
rating_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [18]:
c = Counter(rating_df.movieId)

# 데이터 분석

- [x] 그래프 그리기 
  - x : 유저가 평가한 영화 수, y : 유저수 
  - x : 영화당 평가받은 레이팅 수, y : 영화수 
  - x : 유저가 평가한 평균 레이팅 값, y : 유저수 
  - x : 영화당 평가받은 평균 레이팅 값, y : 영화수 
- [x] 테스트 유저 뽑기 (k >= 5, k = 평가한 아이템 수)
  - 테스트 유저의 공개 레이팅 / 비공개 레이팅 뽑기 



#### 유저의 점수 분포

In [122]:
rating_df.groupby("userId").mean().rating.describe()

count    671.000000
mean       3.657587
std        0.471339
min        1.333333
25%        3.396193
50%        3.675000
75%        3.984026
max        4.948718
Name: rating, dtype: float64

#### 영화의 점수 분포

In [123]:
rating_df.groupby("movieId").mean().rating.describe()

count    9066.000000
mean        3.292054
std         0.881968
min         0.500000
25%         2.843750
50%         3.500000
75%         3.966250
max         5.000000
Name: rating, dtype: float64

#### 유저의 리뷰수 분포

In [124]:
rating_df.groupby("userId").count().movieId.describe()

count     671.000000
mean      149.037258
std       231.226948
min        20.000000
25%        37.000000
50%        71.000000
75%       161.000000
max      2391.000000
Name: movieId, dtype: float64

#### 영화의 리뷰수 분포

In [125]:
rating_df.groupby("movieId").count().userId.describe()

count    9066.000000
mean       11.030664
std        24.050800
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       341.000000
Name: userId, dtype: float64

### 테스트 유저 뽑기

In [236]:
grouped = rating_df.groupby("userId").count().reset_index()

In [482]:
idx = (grouped.movieId > 20).mul(grouped.movieId < 30, axis=0)
test_users = grouped[idx].sample(n=1, random_state=2).userId.tolist()

In [498]:
test_users

[368]

In [483]:
test_user_rating_df = rating_df[rating_df.userId.isin(test_users)]
closed_rating_df = test_user_rating_df.sample(n=10, random_state=1)
open_rating_df = test_user_rating_df[~test_user_rating_df.index.isin(test_user_close_rating_df.index)]

train_rating_df = rating_df[~rating_df.index.isin(closed_rating_df.index)]



## Model : 1.0 Pearson 기반 User Similarity

Q. 유사도는 어떻게 측정할 것인가?

- **Similarity Function: Pearson Correlation**  
- 유저 셋을 U.
- 유저 a와 b가 모두 평가한 영화 아이템셋을 P.   
- sim(a, b) =  $\frac{\Sigma_{p \in P} (r_{a, p} - \bar{r}_{a})(r_{b, p} - \bar{r}_{b})}{\sqrt{\Sigma_{p \in P}(r_{a, p} - \bar{r}_a)^2}\sqrt{\Sigma_{p \in P}(r_{b, p} - \bar{r}_b)^2}}$ 


Q. 아이템에 대해 예상 점수는 어떻게 측정할 것인가?

- **Prediction(a, p)** = 유저 a의 평균 평가 점수 +  (모든 유저 b에 대해서 (유저 b의 아이템 p에 대한 평가 점수 -  유저 b의 평균 평가 점수) *  (유저 a와 b의 유사도) 의 합)/(각 유저와 유저 a와의 유사도 합)


### 데이터 전처리 - Training Set

In [484]:
movie_ids = list(set(train_rating_df.movieId))
movie_id2idx = {v:k for k, v in enumerate(movie_ids)}
user_ids = list(set(train_rating_df.userId))
user_id2idx = {v:k for k, v in enumerate(user_ids)}
train_rating_df["movie_idx"] = train_rating_df.apply(lambda x : movie_id2idx[x.movieId], axis=1)
train_rating_df["user_idx"] = train_rating_df.apply(lambda x : user_id2idx[x.userId], axis=1)

In [485]:
user_idxes = train_rating_df.user_idx.tolist()
ratings = train_rating_df.rating.tolist()
movie_idxes = train_rating_df.movie_idx.tolist()

In [486]:
user_rating_book = [dict() for _ in range(len(user_idxes))]

for user_idx, rating, movie_idx in zip(user_idxes, ratings, movie_idxes):
    user_rating_book[user_idx][movie_idx] = rating

### 데이터 전처리 - Test Set

In [487]:
open_rating_df["movie_idx"] = open_rating_df.apply(lambda x : movie_id2idx.get(x.movieId, -1), axis=1)
open_rating_df["user_idx"] = open_rating_df.apply(lambda x : user_id2idx.get(x.userId, -1), axis=1)

closed_rating_df["movie_idx"] = closed_rating_df.apply(lambda x : movie_id2idx.get(x.movieId, -1), axis=1)
closed_rating_df["user_idx"] = closed_rating_df.apply(lambda x : user_id2idx.get(x.userId, -1), axis=1)



In [None]:
test_user_idx = 

In [488]:
open_rating_dict = {}
for index, row in open_rating_df.iterrows():
    open_rating_dict[row["movie_idx"]] = row["rating"]

r_a = np.mean(list(open_rating_dict.values()))

### 유저 유사도 평가

In [489]:
user_sim_dict = {}

In [490]:
for user_idx, rating_dict in enumerate(user_rating_book):
    numerator = 0

    denominator_a = 0
    denominator_b = 0
    r_b = np.mean(list(rating_dict.values()))
    for movie_idx, rating in rating_dict.items():
        if movie_idx in open_rating_dict.keys():
            b_rating_diff = (rating - r_b)
            a_rating_diff = (open_rating_dict[movie_idx] - r_a)
            numerator +=  a_rating_diff * b_rating_diff
            denominator_a += math.pow(a_rating_diff, 2)
            denominator_b += math.pow(b_rating_diff, 2)
    denominator = np.sqrt(denominator_a) * np.sqrt(denominator_b)
    if denominator != 0:
        sim = numerator / denominator
    else:
        sim = 0
    user_sim_dict[user_idx] = sim
    

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


### 예측

In [497]:
errors = [] 
for row_idx in range(len(closed_rating_df)):
    test_user_idx = closed_rating_df.iloc[row_idx].user_idx
    test_item_idx = closed_rating_df.iloc[row_idx].movie_idx
    test_item_rating = closed_rating_df.iloc[row_idx].rating
    r_a = closed_rating_df.rating.mean()

    numerator = 0
    denominator = 0
    for user_idx, rating_dict in enumerate(user_rating_book):
        if test_item_idx in rating_dict.keys():
            r_b_p = rating_dict[test_item_idx]
            r_b = np.mean(list(rating_dict.values()))
            sim = user_sim_dict[user_idx]
            if sim >= 0.5:
                rating_diff = r_b_p - r_b
                numerator += rating_diff * sim
                denominator += sim

    if denominator != 0:
        preference_score = numerator / denominator
    else:
        preference_score = 0
    predicted_rating = r_a + preference_score
    error = test_item_rating - predicted_rating
    errors.append(error)
    print("movie_idx = {}, rated_score = {:.2f}, predicted score = {:.2f}, error = {:.2f}".format(test_item_idx, test_item_rating, predicted_rating, error))

mean_error = np.mean(errors)
print("Mean Error : {}".format(mean_error))

movie_idx = 2521.0, rated_score = 5.00, predicted score = 4.78, error = 0.22
movie_idx = 2284.0, rated_score = 3.00, predicted score = 3.12, error = -0.12
movie_idx = 563.0, rated_score = 4.00, predicted score = 4.68, error = -0.68
movie_idx = 2800.0, rated_score = 5.00, predicted score = 4.51, error = 0.49
movie_idx = 2799.0, rated_score = 3.00, predicted score = 3.37, error = -0.37
movie_idx = 2419.0, rated_score = 4.00, predicted score = 4.36, error = -0.36
movie_idx = 2738.0, rated_score = 3.00, predicted score = 3.27, error = -0.27
movie_idx = 819.0, rated_score = 3.00, predicted score = 3.34, error = -0.34
movie_idx = 519.0, rated_score = 5.00, predicted score = 4.73, error = 0.27
movie_idx = 964.0, rated_score = 4.00, predicted score = 4.47, error = -0.47
Mean Error : -0.164588592234007


In [501]:
closed_rating_df.movieId.tolist()

[2858, 2581, 593, 3176, 3175, 2739, 3101, 920, 541, 1077]

In [None]:
## Model : 1.1 MF 





In [None]:
## Model : 2.0 Pearson 기반 Item Similarity

- Item에 대한 Similarity 
