# 아이템 기반 협업 추천 시스템

In [1]:
import pandas as pd
import numpy as np
import os
os.chdir('/Users/younghun/Desktop/gitrepo/data/ml-latest-small')
movies = pd.read_csv('movies.csv', encoding='utf-8')
ratings = pd.read_csv('ratings.csv', encoding='utf-8')
print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## User-item 행렬 만들기

In [4]:
# 우선 두 데이터프레임의 공통 key는 movieId이기 때문에 merge해주기
merge_df = pd.merge(movies, ratings, on='movieId')
merge_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [5]:
# pivot_table이용해서 User-item 행렬 만들기
user_item = merge_df.pivot_table(index='userId', columns='title', values='rating')

In [6]:
user_item.isnull().sum()

title
'71 (2014)                                   609
'Hellboy': The Seeds of Creation (2004)      609
'Round Midnight (1986)                       608
'Salem's Lot (2004)                          609
'Til There Was You (1997)                    608
                                            ... 
eXistenZ (1999)                              588
xXx (2002)                                   586
xXx: State of the Union (2005)               605
¡Three Amigos! (1986)                        584
À nous la liberté (Freedom for Us) (1931)    609
Length: 9719, dtype: int64

In [7]:
# 결측치는 평점을 안매긴 것이므로 0으로 대체하기
user_item = user_item.fillna(0)

In [8]:
user_item.shape

(610, 9719)

In [9]:
# user_item을 Transpose 해서 Item-user 행렬 만들기
item_user = user_item.T

## Item끼리 유사도 행렬 만들기 

- 코사인 유사도 계산 위해 ``행 벡터``끼리 계산하기 때문에 ``item-user 행렬``사용!

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

item_sim_matrix = cosine_similarity(item_user.values, item_user.values)
item_sim_matrix.shape

(9719, 9719)

## 개인화된 평점 만들기

- 개인화된 평점 : ``user-item 행렬의 행 벡터(아이템에 대한 사용자들의 평가)``과 ``item 유사도 행렬의 열 벡터(아이템들끼리의 유사도)`` 내적

In [11]:
custom_weighted_matrix = np.dot(user_item, item_sim_matrix)

In [12]:
# 사용자가 각 아이템에 대한 개인화된 평점!
custom_weighted_matrix.shape

(610, 9719)

In [13]:
# 데이터프레임화 시키기
custom_weighted_df = pd.DataFrame(custom_weighted_matrix, index=user_item.index, columns=user_item.columns)
custom_weighted_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,35.740115,26.27582,23.441206,4.200928,7.826024,20.971658,178.026535,85.085722,144.541322,124.028198,...,79.163865,97.238015,63.291507,79.402481,0.399589,190.692213,181.057175,71.105265,267.848707,30.052376
2,9.277418,1.943638,1.37436,0.0,0.0,3.878789,9.55578,1.919737,29.640982,10.345093,...,10.898454,11.158151,9.505257,9.723388,3.232324,13.376131,20.28667,12.821697,16.057925,0.0
3,6.038167,1.376814,4.695395,0.069598,0.141748,0.293269,10.4194,1.730442,5.213261,4.365703,...,4.823781,6.241044,5.57505,7.549412,0.0,7.367676,6.585123,4.824222,9.526851,3.52534
4,24.969347,12.624132,11.691495,3.827885,11.708575,4.562202,92.651498,96.619034,91.817113,67.814082,...,35.725332,40.690702,26.251057,33.433302,0.539544,143.18925,94.732936,32.542315,133.537401,9.644971
5,3.697901,3.044372,3.051645,0.256805,0.939378,1.965523,18.812218,15.493783,19.866791,13.113867,...,6.75167,11.902812,6.312127,7.643937,0.0,23.234232,22.254188,9.415772,25.722053,2.182547


In [15]:
# 개인 사용자 Id를 입력했을 때 상위 5개의 추천 영화 추출하도록 하는 함수 정의
def recommend_similar_movies(weighted_df, item_user_df, user_Id, top_n=10):
    user_idx = user_Id
    score_idx = weighted_df.iloc[user_idx].values.argsort()[::-1]
    top_score_idx = score_idx[:top_n]
    score = weighted_df.iloc[user_idx].values[top_score_idx]
    top_score_movies = item_user_df.index[top_score_idx].values
    print(f"User{user_Id}에게 추천하는 상위 {top_n}개 영화&점수")
    top_movies_df = pd.DataFrame({'영화 제목': top_score_movies,
                                  '점수': score})
    return top_movies_df

In [16]:
user_1 = recommend_similar_movies(custom_weighted_df, item_user, user_Id=1, top_n=10)

User1에게 추천하는 상위 10개 영화&점수


In [17]:
user_1

Unnamed: 0,영화 제목,점수
0,Inception (2010),49.689342
1,"Dark Knight, The (2008)",49.028531
2,Django Unchained (2012),47.239525
3,Inglourious Basterds (2009),46.891931
4,Shutter Island (2010),46.265496
5,"Dark Knight Rises, The (2012)",46.109695
6,"Departed, The (2006)",45.991602
7,Mad Max: Fury Road (2015),45.854069
8,"Wolf of Wall Street, The (2013)",44.673436
9,Kill Bill: Vol. 1 (2003),44.134111
