# Movielens 영화추천실습   
- 별점 데이터는 explicit data이지만, implicit data로 간주하고 테스트해 볼 수 있음   
- 별점 = 시청횟수로 해석하기   
- 별점 3점 미만 = 선호하지 않는 영화로 가정  

# 1. 데이터 준비 및 전처리

In [19]:
import pandas as pd
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [20]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


3점 이상만 남긴 데이터 수 = 1000209
전체 데이터 중 약 84%가 남은 것을 볼 수 있다

In [21]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [22]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [23]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
movies["title" ] = movies["title"].str.lower()

영화 제목을 추가하기 위해 두 데이터 프레임 병합하기

In [25]:
movie_re = pd.merge(ratings, movies)
movie_re.head()

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,one flew over the cuckoo's nest (1975),Drama
1,2,1193,5,978298413,one flew over the cuckoo's nest (1975),Drama
2,12,1193,4,978220179,one flew over the cuckoo's nest (1975),Drama
3,15,1193,4,978199279,one flew over the cuckoo's nest (1975),Drama
4,17,1193,5,978158471,one flew over the cuckoo's nest (1975),Drama


In [26]:
# 사용하는 컬럼만 남겨줍니다.
using_cols = [ 'user_id','title', 'counts']
movie_re = movie_re[using_cols]
movie_re.head(10)

Unnamed: 0,user_id,title,counts
0,1,one flew over the cuckoo's nest (1975),5
1,2,one flew over the cuckoo's nest (1975),5
2,12,one flew over the cuckoo's nest (1975),4
3,15,one flew over the cuckoo's nest (1975),4
4,17,one flew over the cuckoo's nest (1975),5
5,18,one flew over the cuckoo's nest (1975),4
6,19,one flew over the cuckoo's nest (1975),5
7,24,one flew over the cuckoo's nest (1975),5
8,28,one flew over the cuckoo's nest (1975),3
9,33,one flew over the cuckoo's nest (1975),5


# 2. 데이터 분석하기

- ratigns에 있는 유니크한 영화 개수

In [27]:
uniq_movie = movie_re["title"].nunique()
uniq_movie 

3628

In [28]:
uniq_user = movie_re["user_id"].nunique()
uniq_user

6039

In [29]:
pop_movie = movie_re.groupby("title")['user_id'].count()
pop_movie.sort_values(ascending=False).head(30)

title
american beauty (1999)                                   3211
star wars: episode iv - a new hope (1977)                2910
star wars: episode v - the empire strikes back (1980)    2885
star wars: episode vi - return of the jedi (1983)        2716
saving private ryan (1998)                               2561
terminator 2: judgment day (1991)                        2509
silence of the lambs, the (1991)                         2498
raiders of the lost ark (1981)                           2473
back to the future (1985)                                2460
matrix, the (1999)                                       2434
jurassic park (1993)                                     2413
sixth sense, the (1999)                                  2385
fargo (1996)                                             2371
braveheart (1995)                                        2314
men in black (1997)                                      2297
schindler's list (1993)                                  2257
pr

In [30]:
pop_movie.sort_values(ascending=True).head(100)

title
brother's kiss, a (1997)                                              1
ten benny (1997)                                                      1
condition red (1995)                                                  1
telling you (1998)                                                    1
jeanne and the perfect guy (jeanne et le garçon formidable) (1998)    1
                                                                     ..
mirage (1995)                                                         1
24-hour woman (1998)                                                  1
voyage to the beginning of the world (1997)                           1
wend kuuni (god's gift) (1982)                                        1
last of the high kings, the (a.k.a. summer fling) (1996)              1
Name: user_id, Length: 100, dtype: int64

In [31]:
movie_re[movie_re["title"].str.contains("little")]

Unnamed: 0,user_id,title,counts
181297,3,"little mermaid, the (1989)",4
181298,6,"little mermaid, the (1989)",4
181299,10,"little mermaid, the (1989)",5
181300,18,"little mermaid, the (1989)",5
181301,19,"little mermaid, the (1989)",3
...,...,...,...
832470,3363,little city (1998),3
832471,4771,little city (1998),4
835724,1298,little men (1998),4
835725,5411,little men (1998),3


In [32]:
#[[ your code ]]
# 본인이 좋아하시는 아티스트 데이터로 바꿔서 추가하셔도 됩니다! 단, 이름은 꼭 데이터셋에 있는 것과 동일하게 맞춰주세요. 
my_favorite = ['toy story (1995)' , 'for ever mozart (1996)' ,'e.t. the extra-terrestrial (1982)' ,'terminator 2: judgment day (1991)' ,'little mermaid, the (1989)']

# 'zimin'이라는 user_id가 위 아티스트의 노래를 30회씩 들었다고 가정하겠습니다.
my_list = pd.DataFrame({'user_id': ['mini']*5, 'title': my_favorite, 'counts':[5]*5})

if not movie_re.isin({'user_id':['mini']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    movie_re = movie_re.append(my_list)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

movie_re.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,title,counts
836473,5851,one little indian (1973),5
836474,5854,slaughterhouse (1987),4
836475,5854,"promise, the (versprechen, das) (1994)",3
836476,5938,"five wives, three secretaries and me (1998)",4
836477,5948,identification of a woman (identificazione di ...,5
0,mini,toy story (1995),5
1,mini,for ever mozart (1996),5
2,mini,e.t. the extra-terrestrial (1982),5
3,mini,terminator 2: judgment day (1991),5
4,mini,"little mermaid, the (1989)",5


In [33]:
# 사용하는 컬럼만 남겨줍니다.
using_cols2 = [ 'user_id','title', 'counts']
movie_re = movie_re[using_cols2]
movie_re.tail(10)

Unnamed: 0,user_id,title,counts
836473,5851,one little indian (1973),5
836474,5854,slaughterhouse (1987),4
836475,5854,"promise, the (versprechen, das) (1994)",3
836476,5938,"five wives, three secretaries and me (1998)",4
836477,5948,identification of a woman (identificazione di ...,5
0,mini,toy story (1995),5
1,mini,for ever mozart (1996),5
2,mini,e.t. the extra-terrestrial (1982),5
3,mini,terminator 2: judgment day (1991),5
4,mini,"little mermaid, the (1989)",5


In [34]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = movie_re['user_id'].unique()
movie_unique = movie_re['title'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [35]:
# 인덱싱이 잘 되었는지 확인해 봅니다. 
print(user_to_idx['mini'])    # 358869명의 유저 중 마지막으로 추가된 유저이니 358868이 나와야 합니다. 
print(movie_to_idx['little mermaid, the (1989)'])

6039
191


In [36]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = movie_re['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(movie_re):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    movie_re['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = movie_re['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(movie_re):
    print('movie column indexing OK!!')
    movie_re['title'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

movie_re

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,title,counts
0,0,0,5
1,1,0,5
2,2,0,4
3,3,0,4
4,4,0,5
...,...,...,...
0,6039,40,5
1,6039,3035,5
2,6039,26,5
3,6039,92,5


# 3. CSR matrix 구성

In [37]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = movie_re['user_id'].nunique()
num_movie = movie_re['title'].nunique()

csr_data = csr_matrix((movie_re.counts, (movie_re.user_id, movie_re.title)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# 4. 모델 구성 및 훈련

In [38]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1' 

In [39]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [40]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [41]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [42]:
mini,little = user_to_idx['mini'], movie_to_idx['little mermaid, the (1989)']
mini_vector, little_vector = als_model.user_factors[mini], als_model.item_factors[little]


In [43]:
mini_vector

array([ 4.31043744e-01,  3.68831486e-01, -1.07675724e-01,  4.16129455e-02,
       -1.10987496e+00, -8.84745240e-01, -2.09481880e-01,  2.77805448e-01,
       -1.55255437e-01,  2.33900592e-01,  5.91272354e-01,  5.05392015e-01,
        3.82106513e-01, -3.49589922e-02,  5.36230266e-01, -5.95348239e-01,
       -1.39432460e-01,  6.93272054e-01,  2.72825539e-01, -5.43092668e-01,
       -9.19260979e-02, -6.38065441e-03, -2.47853532e-01, -4.44707990e-01,
        3.57816279e-01,  3.54888886e-01,  7.77294397e-01,  3.64490747e-01,
       -2.81501412e-01, -1.15324169e-01, -1.57895148e-01,  7.74566948e-01,
       -5.97460866e-01,  1.97174996e-02, -7.58245029e-03,  5.46392739e-01,
       -7.04477072e-01,  3.25940460e-01,  2.72210568e-01, -1.36940062e-01,
        4.20936853e-01, -9.70191732e-02, -4.31795806e-01,  2.84237266e-01,
        8.16184044e-01, -1.93379790e-01,  6.59483790e-01, -2.55303949e-01,
       -5.30446351e-01, -9.44138914e-02,  3.03707659e-01,  5.86431205e-01,
       -3.98098975e-01, -

In [44]:
little_vector

array([ 0.01316386, -0.00407214,  0.01320155,  0.01186037, -0.01338487,
       -0.0148115 ,  0.00265931,  0.02620631, -0.00768043, -0.01635449,
        0.02740661, -0.00670404,  0.01544064,  0.0177283 ,  0.01574926,
        0.00032589,  0.00968682, -0.00186111,  0.01703179,  0.00910158,
        0.00365762,  0.01961848, -0.0023017 , -0.00412474,  0.00993684,
       -0.00159167,  0.00199245,  0.02782474,  0.01553477,  0.02900633,
       -0.01589931,  0.01406799, -0.02350472, -0.01377625,  0.01104388,
        0.04079806,  0.00719654,  0.03537108, -0.01521204,  0.02215276,
        0.00905927, -0.03093028, -0.00075862,  0.00543465,  0.01302404,
        0.01747687,  0.00432728,  0.00954858, -0.00587326, -0.00839543,
        0.01832992,  0.02044401,  0.00602061, -0.01626082,  0.0151612 ,
       -0.01236966,  0.01316287,  0.00065466,  0.02717439, -0.00086403,
       -0.00353547,  0.01008864,  0.0010096 ,  0.03025719, -0.00799519,
       -0.02018746, -0.00191376, -0.02775017, -0.01667346,  0.01

In [45]:
#user와 name 내적하는 코드
np.dot(mini_vector, little_vector)

0.30543673

내적 수치가 생각보다 많이 적게 나왔다.

In [46]:
another = movie_to_idx['one little indian (1973)']
another_vector = als_model.item_factors[another]
np.dot(mini_vector, another_vector)

-0.00013411883

In [69]:
als_model2 = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=100, dtype=np.float32)

In [70]:
als_model2.fit(csr_data_transpose)

  0%|          | 0/100 [00:00<?, ?it/s]

In [71]:
mini2,little2 = user_to_idx['mini'], movie_to_idx['little mermaid, the (1989)']
mini_vector2, little_vector2 = als_model2.user_factors[mini2], als_model2.item_factors[little2]

In [72]:
np.dot(mini_vector2, little_vector2)

0.3175292

In [73]:
another = movie_to_idx['one little indian (1973)']
another_vector2 = als_model2.item_factors[another]
np.dot(mini_vector2, another_vector2)

-0.00088033173

# 5. 비슷한 영화 찾기

In [56]:
favorite_movie = 'toy story (1995)'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(40, 1.0000001),
 (50, 0.78540003),
 (322, 0.57455534),
 (33, 0.5607696),
 (4, 0.5548322),
 (110, 0.5288671),
 (330, 0.43347067),
 (10, 0.42545444),
 (255, 0.41222253),
 (20, 0.4098301),
 (126, 0.34136403),
 (34, 0.32006544),
 (545, 0.315854),
 (851, 0.31390914),
 (160, 0.31089428)]

In [57]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['toy story (1995)',
 'toy story 2 (1999)',
 'babe (1995)',
 'aladdin (1992)',
 "bug's life, a (1998)",
 'groundhog day (1993)',
 'lion king, the (1994)',
 'beauty and the beast (1991)',
 "there's something about mary (1998)",
 'pleasantville (1998)',
 'shakespeare in love (1998)',
 'mulan (1998)',
 'santa clause, the (1994)',
 'iron giant, the (1999)',
 'forrest gump (1994)']

In [75]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [76]:
get_similar_movie('mulan (1998)')

['mulan (1998)',
 'tarzan (1999)',
 'hunchback of notre dame, the (1996)',
 'antz (1998)',
 'hercules (1997)',
 'rescuers down under, the (1990)',
 'anastasia (1997)',
 'aladdin and the king of thieves (1996)',
 'pocahontas (1995)',
 'lion king, the (1994)']

In [74]:
def get_similar_movie2(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model2.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [77]:
get_similar_movie2('mulan (1998)')

['mulan (1998)',
 'tarzan (1999)',
 'antz (1998)',
 'hunchback of notre dame, the (1996)',
 'hercules (1997)',
 'rescuers down under, the (1990)',
 'anastasia (1997)',
 'lion king, the (1994)',
 'balto (1995)',
 'beauty and the beast (1991)']

intertation을 다르게 한 두 모델의 추천 영화

# 6. 유저에게 영화 추천하기

In [64]:
user = user_to_idx['mini']
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(33, 0.38440764),
 (50, 0.36181855),
 (107, 0.34719315),
 (124, 0.3433588),
 (322, 0.3283199),
 (10, 0.31946912),
 (330, 0.30543864),
 (4, 0.3023451),
 (22, 0.2943802),
 (175, 0.2800238),
 (200, 0.26661175),
 (141, 0.21951684),
 (284, 0.21645984),
 (62, 0.21110693),
 (126, 0.20190541),
 (9, 0.20080891),
 (317, 0.19897652),
 (45, 0.1970222),
 (117, 0.19386618),
 (87, 0.19196722)]

In [65]:
[idx_to_movie[i[0]] for i in movie_recommended]

['aladdin (1992)',
 'toy story 2 (1999)',
 'jurassic park (1993)',
 'matrix, the (1999)',
 'babe (1995)',
 'beauty and the beast (1991)',
 'lion king, the (1994)',
 "bug's life, a (1998)",
 'back to the future (1985)',
 'men in black (1997)',
 'terminator, the (1984)',
 'fugitive, the (1993)',
 'nightmare before christmas, the (1993)',
 'total recall (1990)',
 'shakespeare in love (1998)',
 'wizard of oz, the (1939)',
 'twelve monkeys (1995)',
 'mary poppins (1964)',
 'star wars: episode v - the empire strikes back (1980)',
 'braveheart (1995)']

- explain으로 추천 기여한 정도 확인하기

In [67]:
aladdin  = movie_to_idx['aladdin (1992)']
explain = als_model.explain(user, csr_data, itemid=aladdin)

In [68]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[('toy story (1995)', 0.17511690360080726),
 ('little mermaid, the (1989)', 0.1639233562339709),
 ('e.t. the extra-terrestrial (1982)', 0.04793775031650059),
 ('for ever mozart (1996)', 0.0015378684020498916),
 ('terminator 2: judgment day (1991)', -0.010038441911593799)]

In [82]:
jurassic  = movie_to_idx['jurassic park (1993)']
explain = als_model.explain(user, csr_data, itemid=jurassic)

In [83]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[('terminator 2: judgment day (1991)', 0.2988325976019768),
 ('e.t. the extra-terrestrial (1982)', 0.043728044882865136),
 ('little mermaid, the (1989)', 0.03065957055642098),
 ('for ever mozart (1996)', 0.0017713742302065886),
 ('toy story (1995)', -0.03189149830141909)]

In [78]:
movie_recommended2 = als_model2.recommend(user, csr_data, N=20, filter_already_liked_items=True)

In [79]:
[idx_to_movie[i[0]] for i in movie_recommended2]

['aladdin (1992)',
 'toy story 2 (1999)',
 'matrix, the (1999)',
 "bug's life, a (1998)",
 'beauty and the beast (1991)',
 'lion king, the (1994)',
 'babe (1995)',
 'back to the future (1985)',
 'terminator, the (1984)',
 'jurassic park (1993)',
 'twelve monkeys (1995)',
 'men in black (1997)',
 'fugitive, the (1993)',
 'nightmare before christmas, the (1993)',
 'mary poppins (1964)',
 'total recall (1990)',
 'braveheart (1995)',
 'lady and the tramp (1955)',
 'star wars: episode v - the empire strikes back (1980)',
 'snow white and the seven dwarfs (1937)']

In [80]:
matrix  = movie_to_idx['matrix, the (1999)']
explain = als_model.explain(user, csr_data, itemid=matrix)

In [81]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[('terminator 2: judgment day (1991)', 0.2698045368834172),
 ('toy story (1995)', 0.06597178995586467),
 ('e.t. the extra-terrestrial (1982)', 0.012687327682412913),
 ('for ever mozart (1996)', 0.00041047245696604936),
 ('little mermaid, the (1989)', -0.01168290852412787)]