# 사용자 기반 협업 필터링(User-based collaborative filtering)

- 참고: https://assaeunji.github.io/machine%20learning/2020-11-29-implicitfeedback/
- SVD 참고: https://darkpgmr.tistory.com/106, https://angeloyeo.github.io/2019/08/01/SVD.html, https://suhak.tistory.com/1203, https://deep-learning-study.tistory.com/481

In [12]:
import numpy as np
import pandas as pd

In [13]:
from surprise import Dataset
# Load the built-in 'ml-100k' dataset and wrap its raw rating tuples in a
# DataFrame with named columns so the first rows can be inspected.
data = Dataset.load_builtin('ml-100k', prompt=False)
rating_columns = ['user-id', 'movie-id', 'rating', 'timestamp']
df = pd.DataFrame(data=data.raw_ratings, columns=rating_columns)
df.head()

Unnamed: 0,user-id,movie-id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [14]:
# Sanity check: (number of rating rows, number of columns) of the DataFrame.
df.shape

(100000, 4)

특정 사용자(행)가 특정 영화(열)에 평점을 몇 점 매겼는지를 행렬(NumPy 배열)로 만들어서 분석하기

- Adjacency Matrix(인접 행렬) 생성
    - 행(row) == 사용자
    - 열(column) == 영화
    - 내용 == 평점

In [15]:
# Python indices start at 0 while the dataset IDs start at 1, so shift the
# user-id (column 0) and movie-id (column 1) down by one; after this they can
# be used directly as row/column indices.
raw_data = np.array(data.raw_ratings, dtype=int)
raw_data[:, 0] -= 1
raw_data[:, 1] -= 1
raw_data[:5]

array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       [      243,        50,         2, 880606923],
       [      165,       345,         1, 886397596]])

In [22]:
# Size both axes by the largest ID, not by the number of distinct IDs: rows
# and columns are addressed by (id - 1), so if any ID were missing from the
# data, nunique() would produce a matrix that is too small and indexing would
# raise IndexError. The IDs are stored as strings, hence the astype(int).
n_users = int(df['user-id'].astype(int).max())
n_movies = int(df['movie-id'].astype(int).max())
n_users, n_movies

(943, 1682)

- adj_matrix: 이진값(0/1) — 평점을 매긴(본) 영화는 1, 그렇지 않으면 0

In [24]:
# Binary interaction matrix: 1 for a movie the user has rated (watched),
# 0 otherwise. Every (user, movie) pair in raw_data is set in one indexed
# assignment instead of a Python loop.
adj_matrix = np.zeros((n_users, n_movies), dtype=int)
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = 1
adj_matrix[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

- 벡터의 내적 참고: https://waraccc.tistory.com/17, https://novemberfirst.tistory.com/24?category=788587

In [27]:
# Similarity as the dot product of two binary vectors: it counts the movies
# that both users have watched.
my_id = 0
my_vector = adj_matrix[my_id]

my_vector @ adj_matrix[1], my_vector @ adj_matrix[2]

(18, 8)

In [31]:
# Who is most similar to me (the user at index my_id)?
# Score every user at once: row i of the product is dot(adj_matrix[i], my_vector).
overlap = adj_matrix @ my_vector
# Exclude myself explicitly instead of relying on the loop starting at 1,
# which was only correct when my_id happened to be 0.
overlap[my_id] = -1
# argmax returns the first maximum, matching the original strict ">" scan.
best_match_id = int(np.argmax(overlap))
best_score = overlap[best_match_id]

best_score, best_match_id

(183, 275)

In [35]:
# In both my_vector and best_vector: 0 -> movie not watched, 1 -> movie watched.
best_vector = adj_matrix[best_match_id, :]
my_vector[0:10], best_vector[0:10]

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0]))

In [36]:
# Recommend the movies I have not watched but the best-matching user has:
# positions where my_vector is 0 and best_vector is at least 1.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_vector))
    if mine == 0 and theirs >= 1
]

len(recommend_list), recommend_list[:10]

(335, [272, 273, 275, 280, 281, 283, 287, 288, 289, 290])

- adj_matrix: 평점 점수

In [38]:
# Rating matrix: each cell holds the rating the user gave the movie;
# cells for movies the user never rated stay 0.
user_idx, movie_idx, ratings = raw_data[:, 0], raw_data[:, 1], raw_data[:, 2]
adj_matrix = np.zeros((n_users, n_movies), dtype=int)
adj_matrix[user_idx, movie_idx] = ratings
adj_matrix[:5]

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 3, 0, ..., 0, 0, 0]])

- 거리 기반 유사도(유클리드 거리)와 각도 기반 유사도(코사인 유사도) 참고: https://specialscene.tistory.com/48, https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=cjh226&logNo=220810613028, https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=bsw2428&logNo=221388885007

In [40]:
# Who is most similar to me (the user at index my_id)? - Euclidean distance
# NOTE: unrated movies are stored as 0 in adj_matrix, so they enter the
# distance as if they had been rated 0.
my_id = 0
my_vector = adj_matrix[my_id]

# Distance from me to every user in one shot (broadcast over the rows).
distances = np.sqrt(np.square(adj_matrix - my_vector).sum(axis=1))
# My own distance is 0; mask it with +inf so I can never be my own best match.
# Using inf also replaces the arbitrary 100000 starting sentinel, which would
# have silently produced no match if every distance exceeded it.
distances[my_id] = np.inf
# argmin returns the first minimum, matching the original strict "<" scan.
best_match_id = int(np.argmin(distances))
best_score = distances[best_match_id]

best_score, best_match_id

(55.06359959174482, 737)

In [41]:
# Recommend the movies I have not rated (0) that the best-matching user has
# rated (>= 1).
best_vector = adj_matrix[best_match_id]
unseen_but_matched = (my_vector == 0) & (best_vector >= 1)
recommend_list = np.flatnonzero(unseen_but_matched).tolist()

len(recommend_list), recommend_list[:10]

(48, [297, 312, 317, 342, 356, 366, 379, 384, 392, 402])

- 코사인 유사도

In [42]:
def compute_cos_sim(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``. When either vector is all
        zeros the angle is undefined; ``0.0`` is returned instead of the NaN
        (plus RuntimeWarning) that a 0/0 division would produce.
    """
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    # dot(v, v) is the squared Euclidean norm of v.
    denom = np.sqrt(np.dot(v1, v1)) * np.sqrt(np.dot(v2, v2))
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

In [44]:
# Who is most similar to me (user index 0)? - cosine similarity
my_vector = adj_matrix[0]
best_score, best_match_id = -1, 0

# Walk over every other user (rows 1..end), keeping the first row that
# reaches the highest similarity.
for other_id, other_vector in enumerate(adj_matrix[1:], start=1):
    similarity = compute_cos_sim(my_vector, other_vector)
    if similarity > best_score:
        best_score, best_match_id = similarity, other_id

best_score, best_match_id

(0.569065731527988, 915)

In [45]:
# Recommend the movies I have not rated (0) that the best-matching user has
# rated (>= 1).
best_vector = adj_matrix[best_match_id]
not_rated_by_me = my_vector == 0
rated_by_match = best_vector >= 1
recommend_list = [
    movie_idx
    for movie_idx, wanted in enumerate(not_rated_by_me & rated_by_match)
    if wanted
]

len(recommend_list), recommend_list[:10]

(162, [272, 275, 279, 280, 283, 285, 289, 294, 297, 316])