# EXPLORATION_SBA 3.  Movielens 영화 추천 실습

이번에 활용할 데이터셋은 추천시스템의 MNIST라고 부를만한 Movielens 데이터입니다.
---
* 유저가 영화에 대해 평점을 매긴 데이터가 데이터 크기 별로 있습니다. MovieLens 1M Dataset 사용을 권장합니다.
* 별점 데이터는 대표적인 explicit 데이터입니다. 하지만 implicit 데이터로 간주하고 테스트해볼 수 있습니다.
* 별점을 시청횟수로 해석해서 생각하겠습니다.
* 또한 유저가 3점 미만으로 준 데이터는 선호하지 않는다고 가정하고 제외하겠습니다.

# Import Library (step.01)

In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.display.max_rows=150
%matplotlib inline
import os

# Load Data (step.02)

In [2]:
# Read the MovieLens ratings file. Fields are separated by the multi-character
# token '::', and the file has no header row, so column names are supplied.
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    rating_file_path,
    names=ratings_cols,
    sep='::',
    engine='python',
)
# Row count before any filtering (the misspelled name is referenced by later cells).
orginal_data_size = ratings.shape[0]
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Data Processing (step.03)

In [3]:
# Keep only the rows whose rating is 3 or higher.
keep_mask = ratings['rating'] >= 3
ratings = ratings[keep_mask]
filtered_data_size = ratings.shape[0]

# Report how much of the data survived the filter.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# Rename the 'rating' column to 'play_count'; the result is rebound to
# `ratings` instead of mutating the frame in place.
ratings = ratings.rename(columns={'rating': 'play_count'})
print(ratings.shape)
ratings.head()

(836478, 4)


Unnamed: 0,user_id,movie_id,play_count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# Load the movie metadata (id, title, genre) so titles can be displayed.
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre']
# movies.dat is not UTF-8: without an explicit encoding, accented titles are
# decoded as U+FFFD replacement characters (or decoding fails outright on
# newer pandas). Read it as ISO-8859-1 (Latin-1) so titles decode correctly.
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
print(movies.shape)
movies.head()

(3883, 3)


Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach title and genre to every rating row by joining on 'movie_id'.
df = ratings.merge(movies, on='movie_id')
print(df.shape)
df.head()

(836478, 6)


Unnamed: 0,user_id,movie_id,play_count,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


# EDA (step.02)

In [7]:
## Data Profiling
#profile = df.profile_report()
#profile

### Columns 분석 (step. 2-1)

* user_id: 각 유저의 고유한 id 값입니다. 총 6,039명입니다.
* movie_id: 각 영화의 고유한 id 값입니다. 총 3,628편입니다.
* play_count: 각 영화에 대한 유저의 별점입니다. 이번 프로젝트에서는 재생 횟수로 해석하여 사용합니다. 3점 미만의 값은 제외했으며,   
  4(41.7% / 348,971번), 3(31.2% / 261,196번), 5(27.1% / 226,310번) 순으로 분포되어 있습니다.
* timestamp: 1970년 1월 1일(Universal Time (UTC) of January 1, 1970) 이후의 시간 정보
* title: 각 영화의 제목
* genre: 각 영화의 장르

### 가장 인기있는 영화 순위 (step. 2-2)

In [8]:
# Total play_count per (title, movie_id), most played first.
top_mv_df = (
    df.pivot_table(index=['title', 'movie_id'], values='play_count', aggfunc=np.sum)
    .sort_values(by='play_count', ascending=False)
)
top_mv_df.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,play_count
title,movie_id,Unnamed: 2_level_1
American Beauty (1999),2858,14449
Star Wars: Episode IV - A New Hope (1977),260,13178
Star Wars: Episode V - The Empire Strikes Back (1980),1196,12648
Saving Private Ryan (1998),2028,11348
Star Wars: Episode VI - Return of the Jedi (1983),1210,11303
Raiders of the Lost Ark (1981),1198,11179
"Silence of the Lambs, The (1991)",593,11096
"Matrix, The (1999)",2571,10903
"Sixth Sense, The (1999)",2762,10703
Terminator 2: Judgment Day (1991),589,10513


## Build Recommender System (step. 3)

### 임의의 유저 데이터 만들기 (step. 3-1)

In [9]:
# Five favourite movies, given by movie_id.
my_favorite = [2028, 318, 356, 1704, 1721]

# For each id, collect the distinct title(s) found in df.
all_movie_name = []
for i in my_favorite:
    title_column = df.loc[df['movie_id'] == i, 'title']
    movie_name = list(title_column.unique())
    all_movie_name.append(movie_name)
all_movie_name

[['Saving Private Ryan (1998)'],
 ['Shawshank Redemption, The (1994)'],
 ['Forrest Gump (1994)'],
 ['Good Will Hunting (1997)'],
 ['Titanic (1997)']]

In [10]:
# Assume a user with user_id 'yg' played each of the five favourites 5 times.
yg_rows = {
    'user_id': ['yg'] * 5,
    'movie_id': my_favorite,
    'play_count': [5] * 5,
}
my_movielist = pd.DataFrame(yg_rows)

my_movielist

Unnamed: 0,user_id,movie_id,play_count
0,yg,2028,5
1,yg,318,5
2,yg,356,5
3,yg,1704,5
4,yg,1721,5


In [11]:
# Add the hand-made my_movielist rows to ratings.
# The guard makes the cell safe to re-run: rows are added only when no row
# with user_id 'yg' is present yet.
if not ratings['user_id'].isin(['yg']).any():
    # DataFrame.append was deprecated and later removed from pandas; pd.concat
    # is the supported equivalent. As with append, the index is not reset and
    # columns missing from my_movielist (timestamp) are filled with NaN.
    ratings = pd.concat([ratings, my_movielist])

print(ratings.shape)
ratings[ratings['user_id'] == 'yg']

(836483, 4)


Unnamed: 0,user_id,movie_id,play_count,timestamp
0,yg,2028,5,
1,yg,318,5,
2,yg,356,5,
3,yg,1704,5,
4,yg,1721,5,


In [12]:
# Rebuild df from the updated ratings by joining the movie metadata on 'movie_id'.
df = ratings.merge(movies, on='movie_id')
print(df.shape)
df.head()

(836483, 6)


Unnamed: 0,user_id,movie_id,play_count,timestamp,title,genre
0,1,1193,5,978300760.0,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413.0,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179.0,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279.0,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471.0,One Flew Over the Cuckoo's Nest (1975),Drama


### CSR matrix (step. 3-2)

In [13]:
# The user_id / movie_id values have gaps (some movies never appear in the
# play counts), and building the CSR matrix from them fails with
# 'row index exceeds matrix dimensions'. So every distinct id is assigned a
# fresh consecutive index, in order of first appearance in df.
#
# Despite their names, both dicts are keyed by the ORIGINAL id and return the
# NEW index.
user_idx_to_unique = {orig_id: new_idx for new_idx, orig_id in enumerate(df.user_id.unique())}
movie_idx_to_unique = {orig_id: new_idx for new_idx, orig_id in enumerate(df.movie_id.unique())}

In [14]:
# Translate each original id in df into its re-assigned index; ids with no
# entry in the dict come back as missing values and are dropped.
temp_user_data = df['user_id'].map(user_idx_to_unique.get).dropna()
temp_movie_data = df['movie_id'].map(movie_idx_to_unique.get).dropna()

In [15]:
# Overwrite the id columns of df with the re-assigned indices. From here on
# df['user_id'] and df['movie_id'] no longer hold the original ids.
df['user_id'] = temp_user_data
df['movie_id'] = temp_movie_data

In [16]:
from scipy.sparse import csr_matrix

# Matrix shape: one row per distinct user index, one column per distinct movie index.
num_user = df['user_id'].nunique()
num_movie = df['movie_id'].nunique()
matrix_shape = (num_user, num_movie)

# Values are play counts, placed at (user index, movie index).
coordinates = (df.user_id, df.movie_id)
csr_data = csr_matrix((df.play_count, coordinates), shape=matrix_shape)
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

### MF model (step. 3-3)

In [44]:
from implicit.als import AlternatingLeastSquares

# Environment settings for the numeric backends: request a single thread from
# OpenBLAS and MKL, and set KMP_DUPLICATE_LIB_OK.
# NOTE(review): these are set after numpy (and implicit) were imported; some
# BLAS builds read such variables only when the library is loaded -- confirm
# that they take effect here.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [45]:
# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    iterations=15,
    use_gpu=False,
    dtype=np.float32,
)

# The model is trained on an (item x user) matrix here, so the (user x item)
# matrix is transposed first.
# NOTE(review): the expected orientation depends on the installed implicit
# version -- confirm before upgrading the library.
csr_data_transpose = csr_data.T

# Train the model.
als_model.fit(csr_data_transpose)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [46]:
# Look up key 9 in movie_idx_to_unique. Despite its name, the dict is keyed by
# the original movie_id and returns the re-assigned index.
movie_idx_to_unique[9]

1941

In [47]:
# NOTE(review): scratch cell. `i` is not assigned in this cell; it is whatever
# value an earlier loop left behind, so the result depends on execution order.
# df['movie_id'] also holds re-assigned indices at this point, not original ids.
list(df[df['movie_id'] == i]['title'].unique())

[]

In [48]:
# Select five favourite movies (the same five movie_id values used earlier).
# NOTE(review): df['movie_id'] was overwritten with re-assigned indices
# (df.movie_id = temp_movie_data), so comparing it against these original ids
# picks up unrelated titles. Look movies up by title, or translate the ids
# through movie_idx_to_unique first.
my_favorite = [2028, 318, 356, 1704,1721]

all_movie_name = []
for i in my_favorite:
    # Distinct title(s) of the rows whose df['movie_id'] equals i.
    movie_name = list(df[df['movie_id'] == i]['title'].unique())
    all_movie_name.append(movie_name)
all_movie_name

[['Jean de Florette (1986)'],
 ['Rosencrantz and Guildenstern Are Dead (1990)'],
 ['Legends of the Fall (1994)'],
 ['Karate Kid, Part II, The (1986)'],
 ["Man Bites Dog (C'est arriv� pr�s de chez vous) (1992)"]]

In [49]:
# Titles of the five favourite movies; they are used to find the movie_id
# values currently stored in df.
my_favorite = [
    'Saving Private Ryan (1998)',
    'Shawshank Redemption, The (1994)',
    'Forrest Gump (1994)',
    'Good Will Hunting (1997)',
    'Titanic (1997)',
]

In [50]:
# Pick one of the five favourites and one other movie, and see what preference
# the trained model predicts for each.

# Find the movie_id value stored in df for every favourite title.
my_favorite = [
    'Saving Private Ryan (1998)',
    'Shawshank Redemption, The (1994)',
    'Forrest Gump (1994)',
    'Good Will Hunting (1997)',
    'Titanic (1997)',
]

# One inner list of distinct movie_id values per title.
all_movie_id = [
    list(df.loc[df['title'] == title, 'movie_id'].unique())
    for title in my_favorite
]
all_movie_id

[[48], [157], [160], [248], [27]]

In [51]:
# Fetch the factor vectors the trained model holds for user 'yg' and for
# 'Saving Private Ryan (1998)' (hard-coded here as index 48).
yg = user_idx_to_unique['yg']
movie_48 = 48
yg_vector = als_model.user_factors[yg]
movie_48_vector = als_model.item_factors[movie_48]

In [52]:
# Display the user factor vector taken from als_model.user_factors for 'yg'.
yg_vector

array([ 0.7056371 ,  0.01878149, -0.6062156 ,  0.24621682, -1.0354201 ,
       -0.62352437, -0.38490838, -0.2927726 ,  0.3726907 ,  0.03147785,
        1.2144166 ,  0.14594801,  0.13477975,  0.7436724 ,  0.19090833,
        0.04729424, -0.7352171 , -0.16699345,  0.7663506 ,  0.72034013,
        0.32039195, -0.6981637 , -0.2126419 ,  0.0833796 ,  0.07509883,
        0.315905  ,  0.29925123,  1.2491441 , -0.274804  , -0.43643996,
       -0.08533525,  0.44366524,  0.77571386,  0.4692348 , -0.59908444,
        0.20558098, -0.26049158, -0.15914434, -0.5019346 ,  0.41659722,
       -0.06663659,  0.20074189, -0.4628734 , -0.5808983 , -1.5754986 ,
       -0.46126136, -0.5474153 , -0.07387185,  0.05066966, -0.05717289,
        0.25771812, -0.4621772 ,  0.68580794,  0.19390503, -0.0493006 ,
        0.16248679, -0.36225525, -0.58672434, -0.3406426 , -0.2776535 ,
       -0.5017942 ,  1.0440989 ,  0.09267075, -0.1388248 , -0.42532215,
       -0.76017797,  1.0658377 ,  0.62706363, -1.1421101 ,  0.78

In [53]:
# Display the item factor vector taken from als_model.item_factors at index 48.
movie_48_vector

array([-0.00788096, -0.01702356,  0.00445285,  0.03500699, -0.0438008 ,
       -0.00368862, -0.00532075, -0.00082582, -0.00153709, -0.00149346,
        0.06771555,  0.01899575, -0.00145737, -0.00294687,  0.0289432 ,
        0.01639673,  0.0051317 , -0.00842828,  0.0369563 ,  0.02233652,
        0.00225186, -0.00606768, -0.02160966, -0.0212702 ,  0.02491342,
        0.01597309, -0.008429  ,  0.01901294, -0.03184878,  0.01149685,
       -0.00618435,  0.01337116,  0.02102888,  0.01741194, -0.01086812,
        0.02056763, -0.01055825,  0.00518152,  0.00094959,  0.03740155,
        0.00061631,  0.00472488, -0.00699317,  0.00677558, -0.02593982,
       -0.02356737, -0.02425963,  0.00012563,  0.01887548, -0.00216725,
        0.00783248, -0.02089721,  0.01218198,  0.00326959,  0.01751155,
        0.04910197,  0.00198627,  0.00712294, -0.0090694 ,  0.01962757,
       -0.00876296,  0.02348642,  0.02956719, -0.02224245,  0.01197876,
       -0.04580485,  0.0112141 ,  0.00926991, -0.0184399 ,  0.04

In [54]:
# Dot product of the 'yg' vector and the 'Saving Private Ryan (1998)' vector
# (the recorded run gave about 0.75).
yg_vector.dot(movie_48_vector)

0.7552569

In [55]:
# Fetch the factor vectors needed to score 'Toy Story (1995)' for user 'yg'.

# First distinct movie_id value stored in df for the title.
toy_story = df.loc[df['title'] == 'Toy Story (1995)', 'movie_id'].unique()[0]
yg = user_idx_to_unique['yg']
yg_vector = als_model.user_factors[yg]
toy_story_vector = als_model.item_factors[toy_story]

In [56]:
# Dot product of the 'yg' vector and the 'Toy Story (1995)' vector
# (the recorded run gave about 0.21).
yg_vector.dot(toy_story_vector)

0.21786147

### 선호하는 영화와 비슷한 영화 추천하기 (step. 3-4)

In [30]:
# Ask the model for the 15 items most similar to the chosen favourite.
favorite_movie = 'Saving Private Ryan (1998)'
# First distinct movie_id value stored in df for that title.
movie_id = df.loc[df['title'] == favorite_movie, 'movie_id'].unique()[0]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(48, 0.99999994),
 (87, 0.68024397),
 (23, 0.6421364),
 (157, 0.53234047),
 (487, 0.47492933),
 (141, 0.40434355),
 (121, 0.40262982),
 (248, 0.35293058),
 (3499, 0.3300507),
 (269, 0.3244954),
 (117, 0.3212908),
 (120, 0.3195239),
 (222, 0.31543338),
 (124, 0.31337205),
 (44, 0.30448502)]

In [31]:
# Match each returned movie_id with its title.
# The first element of every (id, score) entry is the movie_id.
similar_movie_id = [entry[0] for entry in similar_movie]

# One inner list of distinct titles per id, in the same order.
all_movie_name = [
    list(df.loc[df['movie_id'] == movie_idx, 'title'].unique())
    for movie_idx in similar_movie_id
]
all_movie_name

[['Saving Private Ryan (1998)'],
 ['Braveheart (1995)'],
 ["Schindler's List (1993)"],
 ['Shawshank Redemption, The (1994)'],
 ['Boat, The (Das Boot) (1981)'],
 ['Fugitive, The (1993)'],
 ['Silence of the Lambs, The (1991)'],
 ['Good Will Hunting (1997)'],
 ['Simon Sez (1999)'],
 ['GoodFellas (1990)'],
 ['Star Wars: Episode V - The Empire Strikes Back (1980)'],
 ['Raiders of the Lost Ark (1981)'],
 ['Pulp Fiction (1994)'],
 ['Matrix, The (1999)'],
 ['Star Wars: Episode IV - A New Hope (1977)']]

### 'yg' 유저가 가장 좋아할 만한 영화들 추천 하기(step. 3-5)

In [32]:
# Row index of user 'yg' in the user x item matrix.
user = user_idx_to_unique['yg']
# recommend() is given the (user x item) CSR matrix; items the user already
# has are filtered out of the 20 results.
movie_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
movie_recommended

[(23, 0.66614723),
 (87, 0.5395366),
 (121, 0.5369754),
 (39, 0.34662545),
 (222, 0.338398),
 (154, 0.3372633),
 (269, 0.33590078),
 (38, 0.32747293),
 (384, 0.31035423),
 (51, 0.30029082),
 (323, 0.28430736),
 (99, 0.26584828),
 (110, 0.26479778),
 (22, 0.25117236),
 (116, 0.24190584),
 (472, 0.2335277),
 (385, 0.21754341),
 (487, 0.20821421),
 (59, 0.20220762),
 (141, 0.20010799)]

In [33]:
# Match each recommended movie_id with its title.
# The first element of every (id, score) entry is the movie_id.
movie_recommended_id = [entry[0] for entry in movie_recommended]

# One inner list of distinct titles per id, in the same order.
all_movie_name = [
    list(df.loc[df['movie_id'] == movie_idx, 'title'].unique())
    for movie_idx in movie_recommended_id
]
all_movie_name

[["Schindler's List (1993)"],
 ['Braveheart (1995)'],
 ['Silence of the Lambs, The (1991)'],
 ['Apollo 13 (1995)'],
 ['Pulp Fiction (1994)'],
 ['As Good As It Gets (1997)'],
 ['GoodFellas (1990)'],
 ['Sixth Sense, The (1999)'],
 ['Jerry Maguire (1996)'],
 ['Fargo (1996)'],
 ['Dead Man Walking (1995)'],
 ['American Beauty (1999)'],
 ['Groundhog Day (1993)'],
 ['Back to the Future (1985)'],
 ['Dances with Wolves (1990)'],
 ['Sling Blade (1996)'],
 ['Truman Show, The (1998)'],
 ['Boat, The (Das Boot) (1981)'],
 ['Few Good Men, A (1992)'],
 ['Fugitive, The (1993)']]

In [58]:
# See how much each of the user's movies contributed to a recommendation.
# First distinct movie_id value stored in df for the title.
saving_private_ryan = df.loc[df['title'] == 'Saving Private Ryan (1998)', 'movie_id'].unique()[0]
explain = als_model.explain(user, csr_data, itemid=saving_private_ryan)
explain

(0.7420856188407351,
 [(48, 0.47318643200681354),
  (157, 0.12166177741263898),
  (248, 0.07349396989801349),
  (160, 0.04043885317642806),
  (27, 0.03330458634684107)],
 (array([[ 0.59376028,  0.14999778,  0.09537975, ...,  0.14850649,
           0.1432202 ,  0.11533698],
         [ 0.08906272,  0.6165167 ,  0.08863428, ...,  0.1102795 ,
           0.08741323,  0.11858134],
         [ 0.05663271,  0.06895126,  0.58788961, ...,  0.03843249,
           0.05449925,  0.10376137],
         ...,
         [ 0.08817725,  0.0902648 ,  0.04653311, ...,  0.52194639,
           0.02961199, -0.00459651],
         [ 0.08503846,  0.07537443,  0.05344766, ...,  0.07533781,
           0.52793425,  0.01158399],
         [ 0.06848252,  0.09040767,  0.08251141, ...,  0.05767719,
           0.0625157 ,  0.52504539]]),
  False))