# [EX_14] 프로젝트 - Movielens 영화 추천 실습

# 1) 데이터 준비와 전처리

In [1]:
import pandas as pd
import os
import numpy as np

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [4]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [5]:
del ratings['timestamp']

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genre     3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


# 2) 분석해 봅시다.

* ratings에 있는 유니크한 영화 개수
* ratings에 있는 유니크한 사용자 수
* 가장 인기 있는 영화 30개(인기순)

In [8]:
ratings['movie_id'].nunique()

3628

In [9]:
ratings['user_id'].nunique()

6039

In [10]:
# 인기 많은 영화
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

In [11]:
movies[movies['movie_id']==2858]

Unnamed: 0,movie_id,title,genre
2789,2858,American Beauty (1999),Comedy|Drama


In [12]:
movies[movies['movie_id']==260]

Unnamed: 0,movie_id,title,genre
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi


In [13]:
movies[movies['movie_id']==1196]

Unnamed: 0,movie_id,title,genre
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War


# 3) 내가 선호하는 영화를 5가지 골라서 ratings에 추가해 줍시다.

In [14]:
movies['title'] = movies['title'].str.lower()
movies['genre'] = movies['genre'].str.lower()

In [15]:
movies[movies['title'].str.contains('before sunrise')]

Unnamed: 0,movie_id,title,genre
213,215,before sunrise (1995),drama|romance


In [16]:
movies[movies['title'].str.contains('notting hill')]

Unnamed: 0,movie_id,title,genre
2602,2671,notting hill (1999),comedy|romance


In [17]:
movies[movies['title'].str.contains('speed')]

Unnamed: 0,movie_id,title,genre
373,377,speed (1994),action|romance|thriller
1517,1556,speed 2: cruise control (1997),action|romance|thriller
1675,1724,full speed (1996),drama


In [18]:
movies[movies['title'].str.contains('truman show')]

Unnamed: 0,movie_id,title,genre
1636,1682,"truman show, the (1998)",drama


In [19]:
movies[movies['title'].str.contains('forrest gump')]

Unnamed: 0,movie_id,title,genre
352,356,forrest gump (1994),comedy|romance|war


In [20]:
my_favorite_title = ['before sunrise (1995)' , 'notting hill (1999)' ,'speed (1994)' ,'truman show, the (1998)' ,'forrest gump (1994)']
my_favorite = [215, 2671, 377, 1682, 356]

# 'jkm'이라는 user_id가 위 영화의 별점을 5점씩 주었다고 가정 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': ['jkm']*5, 'movie_id': my_favorite, 'counts':[5]*5})

if not ratings.isin({'user_id':['jkm']})['user_id'].any():  # user_id에 'jkm'이라는 데이터가 없다면
    ratings = ratings.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,counts
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
0,jkm,215,5
1,jkm,2671,5
2,jkm,377,5
3,jkm,1682,5
4,jkm,356,5


In [21]:
ratings = pd.merge(ratings, movies)
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,title,genre
836473,5494,3530,4,smoking/no smoking (1993),comedy
836474,5556,2198,3,modulations (1998),documentary
836475,5949,2198,5,modulations (1998),documentary
836476,5675,2703,3,broken vessels (1998),drama
836477,5717,2258,4,master ninja i (1984),action
836478,5851,3607,5,one little indian (1973),comedy|drama|western
836479,5854,3026,4,slaughterhouse (1987),horror
836480,5854,690,3,"promise, the (versprechen, das) (1994)",romance
836481,5938,2909,4,"five wives, three secretaries and me (1998)",documentary
836482,5948,1360,5,identification of a woman (identificazione di ...,drama


In [22]:
check = ratings[ratings['user_id']=='jkm']
check

Unnamed: 0,user_id,movie_id,counts,title,genre
154344,jkm,356,5,forrest gump (1994),comedy|romance|war
199500,jkm,215,5,before sunrise (1995),drama|romance
214705,jkm,377,5,speed (1994),action|romance|thriller
298495,jkm,1682,5,"truman show, the (1998)",drama
622897,jkm,2671,5,notting hill (1999),comedy|romance


In [23]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = ratings['user_id'].unique()
title_unique = ratings['title'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
title_to_idx = {v:k for k,v in enumerate(title_unique)}

In [24]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# title_to_idx을 통해 title 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_title_data = ratings['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) == len(ratings):
    print('title column indexing OK!!')
    ratings['title'] = temp_title_data
else:
    print('title column indexing Fail!!')

ratings

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,title,genre
0,0,1193,5,0,drama
1,1,1193,5,0,drama
2,2,1193,4,0,drama
3,3,1193,4,0,drama
4,4,1193,5,0,drama
...,...,...,...,...,...
836478,1621,3607,5,3623,comedy|drama|western
836479,3481,3026,4,3624,horror
836480,3481,690,3,3625,romance
836481,4159,2909,4,3626,documentary


In [25]:
ratings[154344:154345]

Unnamed: 0,user_id,movie_id,counts,title,genre
154344,6022,356,5,160,comedy|romance|war


# 4) CSR matrix를 직접 만들어 봅시다.

In [26]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_title = ratings['title'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.title)), shape= (num_user, num_title))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# 5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [27]:
from implicit.als import AlternatingLeastSquares

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [35]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [36]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [37]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

# 6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

In [38]:
jkm, before_sunrise = user_to_idx['jkm'], title_to_idx['before sunrise (1995)']
jkm_vector, before_sunrise_vector = als_model.user_factors[jkm], als_model.item_factors[before_sunrise]

In [39]:
jkm_vector

array([ 6.7161024e-02, -2.8864270e-02,  7.1996704e-02,  4.3278456e-01,
        1.2645477e+00,  4.8168918e-01,  4.0344214e-01, -4.6367094e-01,
        5.8477527e-01, -1.1569973e-02,  5.9715426e-01, -6.2981814e-01,
       -5.4119951e-01, -2.1036556e-01,  7.7049518e-01, -3.3206972e-01,
        2.3165464e-01,  6.6070372e-01, -2.5676352e-01, -5.3195632e-01,
       -8.9827023e-02,  2.1227957e-01,  4.9853814e-01, -1.6093746e-01,
        2.7108511e-01,  3.9139390e-01,  1.3390206e-01,  4.5429063e-01,
       -4.6284392e-01,  3.2052755e-01, -7.0108312e-01,  3.9459157e-01,
       -8.9429390e-01, -6.6397786e-02, -2.4912640e-01,  3.1020954e-01,
       -5.8772832e-01, -6.8034135e-02,  5.0106341e-01,  1.9193724e-01,
       -2.8376463e-01, -3.3998847e-01, -4.4340196e-01, -8.9136157e-03,
       -6.2171304e-01,  8.0343790e-02, -9.0217955e-02,  1.2613122e-01,
       -4.8585248e-01,  6.6189331e-01,  1.3647260e-02,  5.3271331e-05,
       -2.0157461e-01,  1.4434196e-01,  9.3388200e-01, -3.0254120e-01,
      

In [40]:
before_sunrise_vector

array([ 1.51423039e-02,  1.32873915e-02,  1.27061587e-02,  5.32309338e-03,
        9.25560296e-03, -6.89565763e-03,  1.39595242e-02, -7.74855586e-03,
        2.65614036e-02,  1.71031002e-02,  1.67471301e-02,  2.95381690e-03,
       -3.63025768e-03,  1.03865061e-02,  1.24856818e-03, -4.33352352e-06,
        1.45924743e-02,  9.22925491e-03,  1.25999446e-04, -2.71464931e-03,
        1.78397205e-02,  9.09768697e-03,  8.63874611e-03, -1.38753362e-03,
        2.22981645e-04,  7.29834288e-03,  7.35008391e-03,  4.13678645e-04,
       -6.19436207e-04, -2.02395860e-03,  1.51869177e-03,  1.48323737e-02,
        7.22858915e-03, -8.84370238e-04,  9.34964791e-03,  1.93699487e-02,
       -4.52796137e-03,  1.17802834e-02,  6.94235507e-03,  1.76178198e-02,
        2.82800174e-03,  4.13594674e-03,  1.36197833e-02,  8.93607736e-03,
       -7.45443907e-03, -7.44391326e-03,  1.44475047e-02,  1.69166774e-02,
        1.40013974e-02, -1.66371968e-02,  8.85288406e-04,  8.44297837e-03,
       -5.70391584e-03, -

In [41]:
np.dot(jkm_vector, before_sunrise_vector)

0.14249186

내가 평점을 높게 준 영화인데 스코어가 너무 낮게 나온다

In [42]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=500, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [43]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [44]:
jkm_vector, before_sunrise_vector = als_model.user_factors[jkm], als_model.item_factors[before_sunrise]
np.dot(jkm_vector, before_sunrise_vector)

0.48228255

스코어가 조금은 높아졌지만 아직 0.5가 넘지 않는다.

In [45]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=1000, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)

In [46]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

In [47]:
jkm_vector, before_sunrise_vector = als_model.user_factors[jkm], als_model.item_factors[before_sunrise]
np.dot(jkm_vector, before_sunrise_vector)

0.6958496

스코어가 약 0.7에 달하므로 만족스럽지만, 조금 더 개선이 가능할 것 같다.

In [48]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=1000, regularization=0.01, use_gpu=False, iterations=50, dtype=np.float32)

In [49]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/50 [00:00<?, ?it/s]

In [50]:
jkm_vector, before_sunrise_vector = als_model.user_factors[jkm], als_model.item_factors[before_sunrise]
np.dot(jkm_vector, before_sunrise_vector)

0.7014602

epoch만 높이는 것은 큰 개선이 되지 않는것 같다.

In [51]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=1500, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)

# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

In [52]:
jkm_vector, before_sunrise_vector = als_model.user_factors[jkm], als_model.item_factors[before_sunrise]
np.dot(jkm_vector, before_sunrise_vector)

0.8614682

역시나 factor 값을 올리니 스코어가 훨씬 높아졌다.  
만족스러우므로 다른 영화도 확인해보자.

In [53]:
notting_hill = title_to_idx['notting hill (1999)']
notting_hill_vector = als_model.item_factors[notting_hill]
np.dot(jkm_vector, notting_hill_vector)

0.9777871

In [54]:
speed = title_to_idx['speed (1994)']
speed_vector = als_model.item_factors[speed]
np.dot(jkm_vector, speed_vector)

0.99146086

In [55]:
truman_show = title_to_idx['truman show, the (1998)']
truman_show_vector = als_model.item_factors[truman_show]
np.dot(jkm_vector, truman_show_vector)

0.9879666

In [56]:
forrest_gump = title_to_idx['forrest gump (1994)']
forrest_gump_vector = als_model.item_factors[forrest_gump]
np.dot(jkm_vector, forrest_gump_vector)

0.99452615

스코어가 너무 높게 나오는것을 보니 불안하다.  
overfitting인가...??  
일단 평점을 주지 않은 다른 영화에 대해 스코어를 확인해보자!

In [57]:
toy_story = title_to_idx['toy story (1995)']
toy_story_vector = als_model.item_factors[toy_story]
np.dot(jkm_vector, toy_story_vector)

0.0028973932

In [58]:
american_beauty = title_to_idx['american beauty (1999)']
american_beauty_vector = als_model.item_factors[american_beauty]
np.dot(jkm_vector, american_beauty_vector)

-0.0047357483

스코어가 터무니없이 낮게 나온것을 보니 학습이 제대로 된 것 같지 않다.

In [59]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model_0 = AlternatingLeastSquares(factors=1000, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

# 모델 훈련
als_model_0.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [60]:
jkm_vector_0, before_sunrise_vector_0 = als_model_0.user_factors[jkm], als_model_0.item_factors[before_sunrise]
np.dot(jkm_vector_0, before_sunrise_vector_0)

0.7139281

In [61]:
toy_story_vector_0 = als_model_0.item_factors[toy_story]
np.dot(jkm_vector_0, toy_story_vector_0)

0.009165236

일단 영화 추천 받아보자!

# 7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

In [62]:
favorite_title = 'before sunrise (1995)'
title_id = title_to_idx[favorite_title]
similar_title = als_model.similar_items(title_id, N=10)
similar_title

[(213, 0.99999976),
 (3609, 0.59217805),
 (3455, 0.5889652),
 (3454, 0.5878187),
 (3324, 0.5874258),
 (2371, 0.5865199),
 (3477, 0.58556956),
 (3567, 0.58525395),
 (3489, 0.5850325),
 (3606, 0.58499295)]

In [63]:
#title_to_idx 를 뒤집어, index로부터 title 이름을 얻는 dict를 생성합니다. 
idx_to_title = {v:k for k,v in title_to_idx.items()}
[idx_to_title[i[0]] for i in similar_title]

['before sunrise (1995)',
 '24-hour woman (1998)',
 'sunset park (1996)',
 'blood and sand (sangre y arena) (1989)',
 'race the sun (1996)',
 'gate of heavenly peace, the (1995)',
 'joyriders, the (1999)',
 'bye-bye (1995)',
 'angela (1995)',
 'truce, the (1996)']

In [64]:
def get_similar_title(title_name: str):
    title_id = title_to_idx[title_name]
    similar_title = als_model.similar_items(title_id)
    similar_title = [idx_to_title[i[0]] for i in similar_title]
    return similar_title

In [66]:
get_similar_title('notting hill (1999)')

['notting hill (1999)',
 'woo (1998)',
 'five wives, three secretaries and me (1998)',
 'snows of kilimanjaro, the (1952)',
 'an unforgettable summer (1994)',
 'master ninja i (1984)',
 'i, worst of all (yo, la peor de todas) (1990)',
 'mascara (1999)',
 'bittersweet motel (2000)',
 'late bloomers (1996)']

# 8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [67]:
user = user_to_idx['jkm']
# recommend에서는 user*item CSR Matrix를 받습니다.
title_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
title_recommended

[(319, 0.06325967),
 (2759, 0.05742381),
 (2726, 0.05689116),
 (241, 0.055702254),
 (2136, 0.054745235),
 (1228, 0.051868934),
 (2887, 0.051728215),
 (1561, 0.051348887),
 (2317, 0.050847962),
 (1967, 0.050710276),
 (2692, 0.05004067),
 (1464, 0.04965952),
 (439, 0.04832155),
 (296, 0.04775565),
 (309, 0.04739391),
 (2573, 0.046833813),
 (894, 0.046723098),
 (2269, 0.04672057),
 (2372, 0.046190266),
 (2702, 0.045143872)]

In [68]:
[idx_to_title[i[0]] for i in title_recommended]

['high art (1998)',
 'indian summer (a.k.a. alive & kicking) (1996)',
 'daytrippers, the (1996)',
 'pillow book, the (1995)',
 'song of the south (1946)',
 'it could happen to you (1994)',
 'guinevere (1999)',
 'object of my affection, the (1998)',
 'burnt by the sun (utomlyonnye solntsem) (1994)',
 'bloodsport (1988)',
 'amateur (1994)',
 'home fries (1998)',
 'my crazy life (mi vida loca) (1993)',
 'afterglow (1997)',
 'cruise, the (1998)',
 'van, the (1996)',
 'laura (1944)',
 "young poisoner's handbook, the (1995)",
 '42 up (1998)',
 'breakfast of champions (1999)']

값이 너무 낮은걸 보니 문제가 있다.  
factor를 낮추고 epoch을 늘려보자.

In [69]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model_1 = AlternatingLeastSquares(factors=500, regularization=0.01, use_gpu=False, iterations=100, dtype=np.float32)

# 모델 훈련
als_model_1.fit(csr_data_transpose)

  0%|          | 0/100 [00:00<?, ?it/s]

In [70]:
jkm_vector, before_sunrise_vector = als_model_1.user_factors[jkm], als_model_1.item_factors[before_sunrise]
np.dot(jkm_vector, before_sunrise_vector)

0.46240532

In [96]:
def get_similar_title_1(title_name: str):
    title_id = title_to_idx[title_name]
    similar_title = als_model_1.similar_items(title_id)
    similar_title = [[idx_to_title[i[0]], "score: "+ str(i[1])] for i in similar_title]
    return similar_title

In [97]:
get_similar_title_1('before sunrise (1995)')

[['before sunrise (1995)', 'score: 1.0'],
 ['sunset park (1996)', 'score: 0.45140925'],
 ['blood and sand (sangre y arena) (1989)', 'score: 0.45066008'],
 ['24-hour woman (1998)', 'score: 0.44889316'],
 ['truce, the (1996)', 'score: 0.4439045'],
 ['joyriders, the (1999)', 'score: 0.43962327'],
 ['naked in new york (1994)', 'score: 0.43924743'],
 ['1-900 (1994)', 'score: 0.43726677'],
 ['301, 302 (1995)', 'score: 0.43724307'],
 ['horror hotel (a.k.a. the city of the dead) (1960)', 'score: 0.43626896']]

In [100]:
title_recommended_1 = als_model_1.recommend(user, csr_data, N=20, filter_already_liked_items=True)
[[idx_to_title[i[0]], 'score: '+ str(i[1])] for i in title_recommended_1]

[['enemy of the state (1998)', 'score: 0.18063974'],
 ['ghost (1990)', 'score: 0.14897114'],
 ['payback (1999)', 'score: 0.14731687'],
 ['die hard (1988)', 'score: 0.14299801'],
 ['four weddings and a funeral (1994)', 'score: 0.12997118'],
 ['101 dalmatians (1961)', 'score: 0.12879157'],
 ['cider house rules, the (1999)', 'score: 0.1282058'],
 ['daytrippers, the (1996)', 'score: 0.12531233'],
 ['man in the iron mask, the (1998)', 'score: 0.12234634'],
 ['october sky (1999)', 'score: 0.11957215'],
 ['wag the dog (1997)', 'score: 0.11930028'],
 ['sliding doors (1998)', 'score: 0.11486637'],
 ['piano, the (1993)', 'score: 0.11223497'],
 ['runaway bride (1999)', 'score: 0.11180338'],
 ['little women (1994)', 'score: 0.11028056'],
 ['fried green tomatoes (1991)', 'score: 0.10944751'],
 ['brady bunch movie, the (1995)', 'score: 0.106386386'],
 ['do the right thing (1989)', 'score: 0.10635714'],
 ['pretty woman (1990)', 'score: 0.10566403'],
 ['johnny mnemonic (1995)', 'score: 0.104895286']]

그래도 스코어가 너무 낮다....학습을 다시 한 번 해보자

In [101]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model_2 = AlternatingLeastSquares(factors=300, regularization=0.01, use_gpu=False, iterations=200, dtype=np.float32)

# 모델 훈련
als_model_2.fit(csr_data_transpose)

  0%|          | 0/200 [00:00<?, ?it/s]

In [102]:
jkm_vector, before_sunrise_vector = als_model_2.user_factors[jkm], als_model_2.item_factors[before_sunrise]
np.dot(jkm_vector, before_sunrise_vector)

0.3328108

In [103]:
def get_similar_title_2(title_name: str):
    title_id = title_to_idx[title_name]
    similar_title = als_model_2.similar_items(title_id)
    similar_title = [[idx_to_title[i[0]], "score: "+ str(i[1])] for i in similar_title]
    return similar_title

In [104]:
get_similar_title_2('before sunrise (1995)')

[['before sunrise (1995)', 'score: 1.0'],
 ['heavy (1995)', 'score: 0.40456784'],
 ['pillow book, the (1995)', 'score: 0.38540584'],
 ['guinevere (1999)', 'score: 0.35095745'],
 ['amateur (1994)', 'score: 0.35077566'],
 ['whole wide world, the (1996)', 'score: 0.34817305'],
 ['bitter moon (1992)', 'score: 0.34675047'],
 ['naked in new york (1994)', 'score: 0.33168024'],
 ['career girls (1997)', 'score: 0.3273006'],
 ['leading man, the (1996)', 'score: 0.32708752']]

In [105]:
title_recommended_2 = als_model_2.recommend(user, csr_data, N=20, filter_already_liked_items=True)
[[idx_to_title[i[0]], 'score: '+ str(i[1])] for i in title_recommended_2]

[['pretty woman (1990)', 'score: 0.24704778'],
 ['sliding doors (1998)', 'score: 0.23318802'],
 ['jerry maguire (1996)', 'score: 0.20774767'],
 ['october sky (1999)', 'score: 0.19897364'],
 ['ghost (1990)', 'score: 0.19265862'],
 ['four weddings and a funeral (1994)', 'score: 0.18301342'],
 ['enemy of the state (1998)', 'score: 0.16463971'],
 ['mask, the (1994)', 'score: 0.16135544'],
 ['apollo 13 (1995)', 'score: 0.15650007'],
 ['star wars: episode iv - a new hope (1977)', 'score: 0.15482706'],
 ['runaway bride (1999)', 'score: 0.1546706'],
 ['twister (1996)', 'score: 0.15181325'],
 ['boat, the (das boot) (1981)', 'score: 0.15159811'],
 ['spartacus (1960)', 'score: 0.14044496'],
 ['piano, the (1993)', 'score: 0.13891043'],
 ['back to the future (1985)', 'score: 0.13865188'],
 ['while you were sleeping (1995)', 'score: 0.13499326'],
 ['me, myself and irene (2000)', 'score: 0.13329956'],
 ['shadowlands (1993)', 'score: 0.13229819'],
 ['strangers on a train (1951)', 'score: 0.13202818']]

선호하는 영화로 로맨스 영화를 많이 넣었는데, 스코어는 낮지만 추천 영화로 로맨스 영화들이 상위권에 위치했다.  
학습이 잘 된 것으로 보인다.

---

# 회고

* 해당 프로젝트를 나중에 진행하다보니 MF와 CSR을 접한 것이 이번 노드인지 그 전인지 잘 생각이 나지 않는다.  
행렬 연산을 통해 서로의 연관성을 찾아 스코어를 나열하여 맞춤 상품을 추천하는 시스템을 구성해보니, 생각보다 잘 동작했다.  
또한, factor가 커질수록 행렬 자체가 매우 커지기에 학습에 적합하지 않게 되는데 이 때 CSR을 이용하니 크기도 작아지고 훨씬 효율적으로 동작했다.

* 이렇게 데이터를 다루는 프로젝트를 여러번 진행해왔는데 그때마다 느끼는 것은 데이터를 분석하고 정제하는 과정이 정말 중요하다는 것이다.  
정말 중요한 과정이면서도 정말 어려운 과정이기에 데이터를 다루는 연습이 필요해보인다.  
데이터톤에서 제대로 연습이 될 것 같다.