# Preprocessing

이전 스텝에서 배운 MF 모델 학습 방법을 토대로, 내가 좋아할 만한 영화 추천 시스템을 제작해 보겠습니다.

이번에 활용할 데이터셋은 추천 시스템의 MNIST라고 부를만한 Movielens 데이터입니다.

- 유저가 영화에 대해 평점을 매긴 데이터가 데이터 크기 별로 있습니다. MovieLens 1M Dataset 사용을 권장합니다.
- 별점 데이터는 대표적인 explicit 데이터입니다. 하지만 implicit 데이터로 간주하고 테스트해 볼 수 있습니다.
- 별점을 시청횟수로 해석해서 생각하겠습니다.
- 또한 유저가 3점 미만으로 준 데이터는 선호하지 않는다고 가정하고 제외하겠습니다.

1) 데이터 준비와 전처리

Movielens 데이터는 rating.dat 안에 이미 인덱싱까지 완료된 사용자-영화-평점 데이터가 깔끔하게 정리되어 있습니다.

In [1]:
# load libraries
import numpy as np
import scipy
import implicit
import os
import pandas as pd

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [2]:
# load rating data
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [3]:
# load movie data
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
# filter df to include only 3 or above
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# change column name 'raitings' to 'counts'
ratings.rename(columns={'ratings':'counts'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [6]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [7]:
# ratings = pd.merge(left=ratings, right=movies, how='left')
# ratings

# Analyzing data

In [8]:
# num of unique movie
ratings['movie_id'].nunique()

3628

In [9]:
# num of unique user id
ratings['user_id'].nunique()

6039

In [40]:
# view 30 most popular movies
movie_count = ratings.groupby(['movie_id']).count()['user_id'].sort_values(ascending=False).head(30)
movie_count.index

Int64Index([ 99,  44, 117,  64,  48,  92, 121, 120,  22, 124, 107,  38,  51,
             87, 175,  23,   5, 126, 224, 157, 607, 110,  26, 170, 243, 222,
            160, 200,  40, 141],
           dtype='int64', name='movie_id')

In [41]:
# view 30 most popular movies
for i in movie_count.index:
    if i == movies['movie_id']:
        

ValueError: ('Lengths must match to compare', (3883,), (30,))

In [12]:
# 유저별 몇 명의 아티스트를 듣고 있는지에 대한 통계
user_count = ratings.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

In [13]:
# 유저별 play횟수 중앙값에 대한 통계
user_median = ratings.groupby('user_id')['counts'].median()
user_median.describe()

count    6039.000000
mean        4.055970
std         0.432143
min         3.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: counts, dtype: float64

In [14]:
# 본인이 좋아하시는 아티스트 데이터로 바꿔서 추가하셔도 됩니다! 단, 이름은 꼭 데이터셋에 있는 것과 동일하게 맞춰주세요. 
my_movies = ['Doctor Strange (2016)', 'The Dark Knight (2008)', 'Iron Man (2008)',
             'The Lord of the Rings: The Return of the King (2003)', "Schindler's List (1993)"]

# 'zimin'이라는 user_id가 위 아티스트의 노래를 30회씩 들었다고 가정하겠습니다.
my_list = pd.DataFrame({'user_id': ['hs']*5, 'movie_id': my_movies, 'counts':[4, 4, 4, 3, 4]})

if not ratings.isin({'user_id':['hs']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    ratings = ratings.append(my_list)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,hs,Doctor Strange (2016),4,
1,hs,The Dark Knight (2008),4,
2,hs,Iron Man (2008),4,
3,hs,The Lord of the Rings: The Return of the King ...,3,
4,hs,Schindler's List (1993),4,


In [15]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = ratings['user_id'].unique()
movie_id_unique = ratings['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_id_unique)}

In [16]:
# 인덱싱이 잘 되었는지 확인해 봅니다. 
print(user_to_idx['hs'])
print(movie_to_idx['Iron Man (2008)'])

6039
3630


In [17]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

user_id column indexing OK!!
movie_id column indexing OK!!


In [18]:
# create CSR matrix
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3633 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [19]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=1000, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)

In [20]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3633x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [21]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

In [22]:
hs, doctor_strange = user_to_idx['hs'], movie_to_idx['Doctor Strange (2016)']
hs_vector, doctor_strange_vector = als_model.user_factors[hs], als_model.item_factors[doctor_strange]

In [23]:
hs_vector

array([ 9.23022330e-02,  1.23292417e-03, -4.35956046e-02, -1.06458943e-02,
        4.72441316e-02,  1.14092575e-02, -5.14143705e-02, -8.82835463e-02,
       -8.47442597e-02, -3.41473557e-02,  4.73482795e-02,  3.24178971e-02,
       -2.36582477e-02, -5.50643280e-02, -9.41396784e-03, -1.87734533e-02,
       -2.85526030e-02, -1.27397627e-02,  2.22976487e-02,  2.54312810e-02,
        3.11184973e-02, -7.23873973e-02,  9.65329725e-03, -5.79080079e-03,
       -4.06565927e-02, -1.26329541e-01,  1.45100402e-02,  6.00431785e-02,
        8.18528980e-02, -2.38366369e-02, -7.72132203e-02,  4.59056580e-03,
       -3.00217588e-02,  2.87096179e-03,  9.37044993e-03, -3.91430855e-02,
       -9.04945582e-02,  7.47295767e-02, -2.92751882e-02, -3.89610939e-02,
        3.32251005e-02, -8.84914547e-02,  6.73743784e-02,  3.20324190e-02,
        7.12637305e-02, -6.32394478e-02,  1.98921114e-02, -4.36083302e-02,
       -2.65479535e-02,  5.40115386e-02,  4.48223948e-02, -2.05640718e-02,
       -6.67066500e-03,  

In [24]:
doctor_strange_vector

array([0.00631036, 0.00537979, 0.0038249 , 0.00574582, 0.00625711,
       0.00561804, 0.00414592, 0.00403448, 0.00332502, 0.00497408,
       0.00598338, 0.00493255, 0.00461014, 0.00290521, 0.00494242,
       0.00441153, 0.00491627, 0.0038134 , 0.00480262, 0.00544471,
       0.00585497, 0.00294183, 0.00470106, 0.00440628, 0.00389937,
       0.00334714, 0.00659983, 0.00642046, 0.00630962, 0.00511922,
       0.00380163, 0.00577417, 0.00432266, 0.00482015, 0.00456772,
       0.00365405, 0.00457093, 0.00623215, 0.00444344, 0.00430339,
       0.00542743, 0.00209883, 0.0056208 , 0.00507693, 0.00695195,
       0.00424041, 0.00578326, 0.0038936 , 0.00498707, 0.00489764,
       0.00567025, 0.0038201 , 0.00498191, 0.00541972, 0.00471243,
       0.0058847 , 0.00339221, 0.00525638, 0.00549244, 0.0044165 ,
       0.00447131, 0.0055059 , 0.00453611, 0.00441981, 0.00456822,
       0.00450888, 0.00429468, 0.00435553, 0.00493595, 0.00385517,
       0.00454082, 0.00516167, 0.00374585, 0.0043529 , 0.00472

In [25]:
# zimin과 black_eyed_peas를 내적하는 코드
np.dot(hs_vector, doctor_strange_vector) # 0.00028537068 -> 0.021859618

0.027714748

In [26]:
dark_knight = movie_to_idx['The Dark Knight (2008)']
dark_knight_vector = als_model.item_factors[dark_knight]
np.dot(hs_vector, dark_knight_vector) # 0.0002455473 -> 0.021536825

0.0277807

In [27]:
lady_bird = movie_to_idx[260]
lady_bird_vector = als_model.item_factors[lady_bird]
np.dot(hs_vector, lady_bird_vector) # -0.00081391656 -> -0.005021919

-0.0033781074

In [28]:
favorite_movie = 'Doctor Strange (2016)'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(3628, 1.0000001),
 (3631, 0.99991363),
 (3629, 0.99991095),
 (3632, 0.9998717),
 (3630, 0.99986964),
 (3615, 0.99740565),
 (3035, 0.99730086),
 (3534, 0.99715436),
 (3531, 0.99707705),
 (3088, 0.99671805),
 (3570, 0.9967022),
 (3549, 0.99666584),
 (3589, 0.9966391),
 (3053, 0.9965271),
 (3156, 0.99649054)]

In [29]:
#movie_to_idx 를 뒤집어, index로부터 movie 이름을 얻는 dict를 생성합니다. 
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['Doctor Strange (2016)',
 'The Lord of the Rings: The Return of the King (2003)',
 'The Dark Knight (2008)',
 "Schindler's List (1993)",
 'Iron Man (2008)',
 3323,
 1579,
 3779,
 872,
 402,
 1107,
 3842,
 3290,
 1062,
 3647]

In [30]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [31]:
get_similar_movie('Iron Man (2008)')

['Iron Man (2008)',
 'The Lord of the Rings: The Return of the King (2003)',
 "Schindler's List (1993)",
 'The Dark Knight (2008)',
 'Doctor Strange (2016)',
 3323,
 1579,
 3779,
 872,
 402]

In [32]:
get_similar_movie('Alien')

KeyError: 'Alien'

In [None]:
user = user_to_idx['hs']
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

In [None]:
[idx_to_movie[i[0]] for i in movie_recommended]

In [None]:
alien = movie_to_idx['alien']
explain = als_model.explain(user, csr_data, itemid=alien)

In [None]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

이후에는 이전 스텝에 소개했던 것과 동일한 방식으로 MF model을 구성하여 내가 좋아할 만한 영화를 추천해 볼 수 있습니다.

4) CSR matrix를 직접 만들어 봅시다.

5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.