# UBCF: 사용자 기반 협업 필터링

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error

#### 데이터 읽기

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


## 추천 프로세스
<img align='left' src='http://drive.google.com/uc?export=view&id=10QS0xBx21NahiKdlstDoh0gkQRyrC2vR'>

#### 입력데이터 구성: Ratings Matrix 만들기

In [3]:
# title 컬럼을 얻기 이해 movies와 조인 수행
rating_movies = pd.merge(ratings, movies, on='movieId')

# 행은 userID, 열은 title로 pivoting 수행. rating이 없는 값(NaN)은 모두 0으로 변환
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title', fill_value=0)

print(ratings_matrix.shape)
ratings_matrix.head()

(610, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,4.0,0
2,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
3,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
4,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
5,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0


#### 유사집단 탐색: 사용자 간 유사도 산출

In [4]:
# 추천 알고리즘 간에 실행시간을 비교하기 위한 코드 
import time
startTime = time.time()

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity()는 행을 기준으로 유사도를 계산하기 때문에 ratings_matrix를 transpose한 후 적용
user_sim = cosine_similarity(ratings_matrix, ratings_matrix)

# cosine_similarity()로 반환된 numpy 행렬에 영화명을 매핑하기 위해 DataFrame으로 변환
user_sim = pd.DataFrame(user_sim, ratings_matrix.index, ratings_matrix.index)

print(user_sim.shape)
user_sim.head()

(610, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [6]:
# Id=9인 사용자와 유사도가 높은 상위 5명 리스트
user_sim.loc[9].sort_values(ascending=False)[1:6]
# Item Base로 할때보다 유사도가 떨어지는 것을 볼 수 있음. 
# CF의 컨셉은 나와 유사한 User를 찾는것. 그러나 User를 기준으로 하면 비어있는 값이 많이 때문에 값이 작다. 

userId
508    0.209600
165    0.199900
399    0.168494
77     0.167616
407    0.149953
Name: 9, dtype: float64

#### 추천 상품 결정:  개인화된 영화 추천

$$
\Large \hat{R}_{u,j} = \frac{\sum_{i= 1}^{K}{S_{u,i}*R_{i,j}}} {\sum_{i = 1}^{K}{\left|S_{u,i}\right|}} 
$$

IBCF와 차이점은 K가 Item이냐 User냐

영화 평점 예측

In [7]:
# 유사도가 가장 높은 이웃의 수 설정
K = 20

In [8]:
# 위의 평점예측 수식을 아래와 같이 구현함
R, S = ratings_matrix.values, user_sim.values
# 사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
ratings_pred = np.zeros(R.shape)

# 사용자-아이템 평점 행렬의 열 크기만큼 Loop 수행. 
for u in range(R.shape[0]):
    # 유사도 행렬에서 유사도가 큰 순으로 n개 데이터 행렬의 index 반환
    top_k = np.argsort(S[:, u])[::-1][1:K+1]
    # 개인화된 예측 평점을 계산
    for i in range(R.shape[1]):
        ratings_pred[u, i] = S[u, :][top_k].dot(R[:,i][top_k].T) 
        ratings_pred[u, i] /= np.sum(np.abs(S[u, :][top_k]))
        
ratings_pred = pd.DataFrame(ratings_pred, ratings_matrix.index, ratings_matrix.columns)
ratings_pred

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.761716,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.199473,0.000000,0.000000,0.689137,0.000000
2,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.140703,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.0,0.21164,0.0,0.000000,0.000000,0.767511,0.000000,0.000000,0.000000,...,0.000000,0.139740,0.000000,0.000000,0.0,0.229173,0.000000,0.000000,0.810881,0.059676
4,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.048357,0.145070,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.847966,0.000000,0.000000,0.672517,0.000000
5,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.653661,0.597786,0.000000,0.365720,0.000000
8,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.0,0.00000,0.0,0.000000,0.000000,0.280824,0.000000,0.394431,0.321076,...,0.000000,0.000000,0.000000,0.000000,0.0,0.258942,0.283626,0.000000,0.149530,0.000000
10,0.000000,0.0,0.00000,0.0,0.000000,0.072782,0.000000,0.000000,2.126009,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.323719,0.000000,0.198658,0.000000


In [9]:
# 추천 알고리즘 간에 실행시간을 비교하기 위한 코드 
endTime = time.time() - startTime

Top-N 영화 추천

In [10]:
# 추천 영화의 수 설정
N = 10
uid = 9

In [11]:
# id로 지정된 사용자의 모든 영화정보 추출하여 Series로 반환함
# 반환된 user_rating은 영화명(title)을 index로 가지는 Series 객체임 
user_rating = ratings_matrix.loc[uid,:]
    
# user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list로 만듬
already_seen = user_rating[user_rating > 0].index.tolist()
   
# list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함
unseen_list = [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]
    
# unseen_list에서 가장 평점이 높은 N개의 영화를 추천함 
recomm_items = ratings_pred.loc[uid, unseen_list].sort_values(ascending=False)[:N]
list(recomm_items.index)

['Matrix, The (1999)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Indiana Jones and the Last Crusade (1989)',
 'Saving Private Ryan (1998)',
 'Forrest Gump (1994)',
 'Terminator 2: Judgment Day (1991)',
 'Jurassic Park (1993)']

추천성능 평가

In [12]:
# 사용자가 평점을 부여한 영화에 대해서만 예측 성능 평가 RMSE를 구함
actual, pred = ratings_matrix.values, ratings_pred.values     
pred = pred[actual.nonzero()].flatten()
actual = actual[actual.nonzero()].flatten()
rmse = np.sqrt(mean_squared_error(pred, actual))
print('RMSE of UBCF@K=20: ', rmse)
print(f'Speed of UBCF@K=20: {endTime} sec.')

RMSE of UBCF@K=20:  2.52393545743251
Speed of UBCF@K=20: 49.30130076408386 sec.


# End