# 모듈, 라이브러리 불러오기


In [2]:
import pandas as pd # data analysis library
import numpy as np # numerical computing library
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # statistical visualization library built on top of matplotlib (a separate package, not a matplotlib submodule)
from ast import literal_eval # parses a string holding a Python literal (e.g. a list) back into the actual object
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF weighted counterpart of CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer # vectorizes text by token occurrence counts
from sklearn.metrics.pairwise import cosine_similarity # pairwise cosine similarity

# 데이터 전처리

본 예제에서는 사용자 평점을 기준으로 코사인 유사도를 계산하여 추천 시스템을 구현한다.
<br>데이터는 Kaggle에서 가져온 약 10만 건 규모의 영화 및 평점 데이터를 사용한다.

In [3]:
# Load the audience rating records (one row per user/movie rating).
rating_data = pd.read_csv(filepath_or_buffer='./ratings.csv')
rating_data.head(n=5)  # preview the first five rows

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Load the movie metadata (movieId, title, pipe-separated genres).
movie_data = pd.read_csv(filepath_or_buffer='./movies.csv')
movie_data.head(n=5)  # preview the first five rows

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


사용자 평점 데이터와 영화 데이터를 movieId 요소로 합친다.

In [5]:
# Drop the rating timestamp; it is not needed for the rating-based similarity.
# A non-mutating drop with errors='ignore' keeps this cell safe to re-run:
# the original in-place drop raised KeyError the second time it executed
# because the column was already gone.
rating_data = rating_data.drop(columns='timestamp', errors='ignore')
# Attach each movie's title and genres to every rating via the shared movieId
# key. The inner join (pandas' default, spelled out here) keeps only ratings
# whose movieId also appears in movie_data.
user_movie_rating = pd.merge(rating_data, movie_data, on='movieId', how='inner')
user_movie_rating.head(10)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,31,2.5,Dangerous Minds (1995),Drama
1,7,31,3.0,Dangerous Minds (1995),Drama
2,31,31,4.0,Dangerous Minds (1995),Drama
3,32,31,4.0,Dangerous Minds (1995),Drama
4,36,31,3.0,Dangerous Minds (1995),Drama
5,39,31,3.0,Dangerous Minds (1995),Drama
6,73,31,3.5,Dangerous Minds (1995),Drama
7,88,31,3.0,Dangerous Minds (1995),Drama
8,96,31,2.5,Dangerous Minds (1995),Drama
9,110,31,4.0,Dangerous Minds (1995),Drama


각 userId(관객)별로 영화에 매긴 평점(rating)을 확인해 볼 수 있다.

In [6]:
# Movie x user rating matrix: one row per title, one column per userId.
# pivot_table averages duplicate (title, userId) entries by default; pairs
# with no rating come out as NaN and are replaced with 0.
movie_user_rating = user_movie_rating.pivot_table(
    values='rating', index='title', columns='userId'
).fillna(0)
movie_user_rating.head(10)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
$9.99 (2008),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Neath the Arizona Skies (1934),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'night Mother (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# User x movie rating matrix: one row per userId, one column per title,
# with missing ratings replaced by 0.
# NOTE: this rebinds user_movie_rating from the merged long table to the wide
# pivot, so any cell that pivots on the 'rating' column of user_movie_rating
# must run before this one (or the merge has to be re-run first).
user_movie_rating = user_movie_rating.pivot_table(
    values='rating', index='userId', columns='title'
).fillna(0)
user_movie_rating.head(10)

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


영화-사용자 테이블과 사용자-영화 테이블도 구현해 본다.




# 아이템 기반 협업 필터링 구현

코사인 유사도를 사용하여 영화-사용자 평점 행렬에 대한 아이템 기반 협업 필터링을 구현해 본다.
<br>아이템 기반 협업 필터링은 "A 영화를 본 사용자는 A와 유사한(유사도 점수가 높은) B 영화도 볼 확률이 높다"는 가정에 기반한 알고리즘이다.
<br>이 알고리즘은 평점 간의 유사도를 바탕으로 구현된다.
<br>따라서 두 영화 간의 유사도 점수가 높을수록 아이템이 서로 유사하다고 판단할 수 있다.

In [8]:
# Cosine similarity between every pair of rows of movie_user_rating, i.e.
# between the per-user rating vectors of every pair of movies.
item_based_collabor = cosine_similarity(movie_user_rating)
# Label both axes with the movie titles so similarities can be looked up by name.
item_based_results = pd.DataFrame(
    item_based_collabor,
    index=movie_user_rating.index,
    columns=movie_user_rating.index,
)
item_based_results

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",1.000000,0.000000,0.0,0.164399,0.020391,0.0,0.014046,0.000000,0.000000,0.003166,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0000
$9.99 (2008),0.000000,1.000000,0.0,0.000000,0.000000,0.0,0.000000,0.079474,0.000000,0.156330,...,0.000000,0.000000,0.0,0.000000,0.0,0.013899,0.000000,0.058218,0.0,0.0000
'Hellboy': The Seeds of Creation (2004),0.000000,0.000000,1.0,0.000000,0.000000,1.0,0.000000,0.217357,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0000
'Neath the Arizona Skies (1934),0.164399,0.000000,0.0,1.000000,0.124035,0.0,0.085436,0.000000,0.000000,0.019259,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0000
'Round Midnight (1986),0.020391,0.000000,0.0,0.124035,1.000000,0.0,0.010597,0.143786,0.000000,0.136163,...,0.000000,0.000000,0.0,0.121567,0.0,0.000000,0.000000,0.000000,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xXx (2002),0.000000,0.013899,0.0,0.000000,0.000000,0.0,0.000000,0.123940,0.000000,0.144961,...,0.161281,0.076029,0.0,0.017465,0.0,1.000000,0.152057,0.140222,0.0,0.2661
xXx: State of the Union (2005),0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.134815,...,0.000000,0.000000,0.0,0.000000,0.0,0.152057,1.000000,0.000000,0.0,0.0000
¡Three Amigos! (1986),0.000000,0.058218,0.0,0.000000,0.000000,0.0,0.081620,0.331663,0.214498,0.064908,...,0.112588,0.159223,0.0,0.166622,0.0,0.140222,0.000000,1.000000,0.0,0.0000
À nous la liberté (Freedom for Us) (1931),0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,1.0,0.0000


위처럼 아이템 기반 협업 필터링으로 아이템 유사도를 구할 수 있다.

In [9]:
# The 14 movies most similar to "Lion King, The (1994)", best match first.
# The movie's own entry is removed by label instead of by skipping the first
# sorted position: another movie can tie with it at similarity 1.0 (or edge
# ahead through floating-point round-off), in which case the positional slice
# would keep the movie itself and discard a genuine neighbour.
(
    item_based_results["Lion King, The (1994)"]
    .drop("Lion King, The (1994)")
    .sort_values(ascending=False)
    .head(14)
)

title
Aladdin (1992)                       0.696639
Beauty and the Beast (1991)          0.693582
Mrs. Doubtfire (1993)                0.661286
Jurassic Park (1993)                 0.629787
Mask, The (1994)                     0.605931
Forrest Gump (1994)                  0.602429
Home Alone (1990)                    0.580264
Ace Ventura: Pet Detective (1994)    0.576034
Dances with Wolves (1990)            0.574302
Speed (1994)                         0.573047
Apollo 13 (1995)                     0.556272
True Lies (1994)                     0.554366
Pretty Woman (1990)                  0.551990
Batman (1989)                        0.550762
Name: Lion King, The (1994), dtype: float64

# 사용자 기반 협업 필터링 구현

이는 위의 아이템 기반 협업 필터링에서 행렬의 행과 열을 바꿔주기만 하면 된다.
실제로는 사용자 기반보다 아이템 기반 필터링의 결과가 실제 선호와 더 유사하게 나타난다.


In [10]:
# Cosine similarity between every pair of rows of user_movie_rating, i.e.
# between the per-movie rating vectors of every pair of users.
user_based_collabor = cosine_similarity(user_movie_rating)
# Label both axes with the userIds so similarities can be looked up by user.
user_based_results = pd.DataFrame(
    user_based_collabor,
    index=user_movie_rating.index,
    columns=user_movie_rating.index,
)
user_based_results.head(20)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.074482,0.016818,0.0,0.083884,0.0,0.012843,0.0,...,0.0,0.0,0.014481,0.043719,0.0,0.0,0.0,0.062917,0.0,0.017466
2,0.0,1.0,0.124295,0.118821,0.103646,0.0,0.212985,0.11319,0.113333,0.043213,...,0.477306,0.063202,0.077784,0.164162,0.466281,0.425462,0.084646,0.02414,0.170595,0.113175
3,0.0,0.124295,1.0,0.08164,0.151531,0.060691,0.154714,0.249781,0.134475,0.114672,...,0.161205,0.064198,0.176222,0.158357,0.177098,0.124562,0.124911,0.080984,0.136606,0.170193
4,0.074482,0.118821,0.08164,1.0,0.130649,0.079648,0.319745,0.191013,0.030417,0.137186,...,0.114319,0.047228,0.136647,0.25403,0.121905,0.088735,0.068483,0.104309,0.054512,0.211609
5,0.016818,0.103646,0.151531,0.130649,1.0,0.063796,0.095888,0.165712,0.086616,0.03237,...,0.191029,0.021142,0.146246,0.224245,0.139721,0.058252,0.042926,0.038358,0.062642,0.225086
6,0.0,0.0,0.060691,0.079648,0.063796,1.0,0.0,0.128502,0.021745,0.045264,...,0.012962,0.009033,0.12454,0.082602,0.0,0.0,0.019563,0.024583,0.019465,0.087705
7,0.083884,0.212985,0.154714,0.319745,0.095888,0.0,1.0,0.149572,0.059728,0.186493,...,0.205832,0.077539,0.134861,0.147643,0.168489,0.232051,0.058773,0.073151,0.09624,0.268672
8,0.0,0.11319,0.249781,0.191013,0.165712,0.128502,0.149572,1.0,0.157356,0.162724,...,0.108371,0.085964,0.27452,0.231523,0.122108,0.069005,0.112366,0.055143,0.247687,0.406414
9,0.012843,0.113333,0.134475,0.030417,0.086616,0.021745,0.059728,0.157356,1.0,0.127341,...,0.078187,0.104944,0.077584,0.155774,0.06069,0.066412,0.194493,0.029291,0.384429,0.168497
10,0.0,0.043213,0.114672,0.137186,0.03237,0.045264,0.186493,0.162724,0.127341,1.0,...,0.03773,0.040454,0.126497,0.102269,0.035319,0.032653,0.098561,0.060549,0.15865,0.189703


In [12]:
# The 14 users most similar to user 5, best match first.
# User 5's own entry is dropped by label rather than by slicing off the first
# sorted position: a tie at similarity 1.0 (or floating-point round-off) could
# put another user first, and an integer slice on an integer-labelled Series
# is easy to misread as label-based. head(14) is explicitly positional.
(
    user_based_results[5]
    .drop(5)
    .sort_values(ascending=False)
    .head(14)
)

userId
313    0.338180
500    0.336893
292    0.320923
442    0.312862
654    0.310034
125    0.304973
561    0.302773
93     0.299025
187    0.293121
69     0.291611
88     0.290647
461    0.290427
562    0.288513
295    0.288450
Name: 5, dtype: float64

이는 5번 사용자와 유사도가 높은 사용자의 목록을 나타낸 것이다. 이를 통해 이론적으로 5번 사용자가 본 영화를 토대로 유사한 다른 사용자에게 영화를 추천해 줄 수 있다.
<br>하지만 실제 결과와 크게 차이가 나기에 잘 쓰이지 않는 협업 필터링 방식이다.
