In [13]:
import os
import pandas as pd
from implicit.als import AlternatingLeastSquares
import numpy as np
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [14]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [15]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [16]:
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 836478, dtype: int64

In [17]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
ratings.head(10)

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


유니크한 영화 개수 출력하기 

In [19]:
print("영화 개수: ", ratings['movie_id'].nunique())

영화 개수:  3628


유저 수 .출력하기 

In [20]:
print("유저 수: ", ratings['user_id'].nunique())

유저 수:  6039


인기있는 영화 30개뽑기 

In [22]:
count_movies = merge_df.groupby('title')['user_id'].count()
count_movies = count_movies.sort_values(ascending=False)
count_movies.head(30)

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

데이터 합치기 

In [23]:

cols = ['user_id', 'title', 'count']
movie_data = merge_df[cols]
movie_data.head()

Unnamed: 0,user_id,title,count
0,1,One Flew Over the Cuckoo's Nest (1975),5
1,2,One Flew Over the Cuckoo's Nest (1975),5
2,12,One Flew Over the Cuckoo's Nest (1975),4
3,15,One Flew Over the Cuckoo's Nest (1975),4
4,17,One Flew Over the Cuckoo's Nest (1975),5


In [24]:
# 내가 선호하는 영화 5개
my_fav = [
    'Closer (2005)',
    'The wolf of wall Street (2013)',
    'Legend (2015)',
    'Black Swan (2010)',
    'gattaca (1998)'
]

# 유저 id, 영화 제목, 평점
my_rating = pd.DataFrame({'user_id' : [6041] * 5, 'title' : my_fav, 'count' : [5, 5, 5, 5, 5]})
my_rating



Unnamed: 0,user_id,title,count
0,6041,Closer (2005),5
1,6041,The wolf of wall Street (2013),5
2,6041,Legend (2015),5
3,6041,Black Swan (2010),5
4,6041,gattaca (1998),5


In [25]:
movie_data = movie_data.append(my_rating)
movie_data[movie_data['user_id'] == 6041]

Unnamed: 0,user_id,title,count
0,6041,Closer (2005),5
1,6041,The wolf of wall Street (2013),5
2,6041,Legend (2015),5
3,6041,Black Swan (2010),5
4,6041,gattaca (1998),5


CSR

In [34]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = movie_data['user_id'].unique()
title_unique = movie_data['title'].unique()

# 유저, 아티스트 indexing 
user_to_idx = {v:k for k,v in enumerate(user_unique)}
title_to_idx = {v:k for k,v in enumerate(title_unique)}

In [35]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해본다
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거해준다
temp_user_data = movie_data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(movie_data): # 모든 row가 정상적으로 인덱싱되었다면
    print("user_id column indexing OK!!")
    movie_data['user_id'] = temp_user_data # movie_data['user_id']을 인덱싱된 Series로 교체해준다
else:
    print("user_id column indexing Fail!!")

# title_to_idx을 통해 title 컬럼도 동일한 방식으로 인덱싱 해준다
temp_title_data = movie_data['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) == len(movie_data):
    print("title column indexing OK!!")
    movie_data['title'] = temp_title_data
else:
    print("title column indexing Fail!!")

movie_data

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,title,count
0,0,0,5
1,1,0,5
2,2,0,4
3,3,0,4
4,4,0,5
...,...,...,...
0,6039,3628,5
1,6039,3629,5
2,6039,3630,5
3,6039,3631,5


In [36]:
from scipy.sparse import csr_matrix

num_user = movie_data['user_id'].nunique()
num_title = movie_data['title'].nunique()

csr_data = csr_matrix((movie_data['count'], (movie_data.user_id, movie_data.title)), shape=(num_user, num_title))
csr_data

<6040x3633 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [37]:

from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [38]:

als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [39]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3633x6040 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [40]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [48]:
my_id = user_to_idx[6041]

closer=title_to_idx['Closer (2005)']
my_vector, closer_vector = als_model.user_factors[my_id], als_model.item_factors[closer]

In [51]:
print("클로져  : ", np.dot(my_vector, closer_vector))

클로져  :  0.0004851844


In [50]:

my_id = user_to_idx[6041]
fight_club = title_to_idx['Fight Club (1999)']

my_vector, fight_club_vector = als_model.user_factors[my_id], als_model.item_factors[fight_club]

In [52]:
print("파이트 클럽: ", np.dot(my_vector, fight_club_vector))

파이트 클럽:  0.0030610438


좋아하는 영화 추천받기 

In [55]:
idx_to_title = {v:k for k, v in title_to_idx.items()} # 영화 이름 : 영화 idx

def get_similar_movie(movie_title: str):
    title_id = title_to_idx[movie_title]
    similar_movie = als_model.similar_items(title_id)
    similar_movie = [idx_to_title[i[0]] for i in similar_movie]
    return similar_movie

In [56]:
get_similar_movie('Closer (2005)')

['Closer (2005)',
 'Black Swan (2010)',
 'The wolf of wall Street (2013)',
 'Legend (2015)',
 'gattaca (1998)',
 'Century (1993)',
 "I Don't Want to Talk About It (De eso no se habla) (1993)",
 'Last of the High Kings, The (a.k.a. Summer Fling) (1996)',
 'Neon Bible, The (1995)',
 "Brother's Kiss, A (1997)"]

In [58]:
movie_recommended = als_model.recommend(my_id, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(107, 0.006264089),
 (232, 0.0056588706),
 (399, 0.005324651),
 (1521, 0.0045765787),
 (841, 0.004515371),
 (670, 0.0044173105),
 (1843, 0.004360865),
 (326, 0.004356163),
 (221, 0.0043416945),
 (2229, 0.004340453),
 (470, 0.004273564),
 (2154, 0.0042665456),
 (2129, 0.0042547304),
 (2894, 0.0042146184),
 (122, 0.0041885544),
 (2772, 0.004176708),
 (2484, 0.004134761),
 (223, 0.004114782),
 (1570, 0.004104431),
 (2593, 0.0040209796)]

In [60]:

[idx_to_title[i[0]] for i in movie_recommended]

['Jurassic Park (1993)',
 'Beautiful Thing (1996)',
 'Anna and the King (1999)',
 'Pork Chop Hill (1959)',
 'Prince of Egypt, The (1998)',
 'Galaxy Quest (1999)',
 'Run Silent, Run Deep (1958)',
 'Top Gun (1986)',
 'South Park: Bigger, Longer and Uncut (1999)',
 'Thieves (Voleurs, Les) (1996)',
 'Swingers (1996)',
 'Mummy, The (1959)',
 'Character (Karakter) (1997)',
 'When Night Is Falling (1995)',
 'Waking Ned Devine (1998)',
 'Dorado, El (1967)',
 'Beyond the Mat (2000)',
 'Celluloid Closet, The (1995)',
 'Dance with Me (1998)',
 'My Favorite Season (1993)']

#회고 

추천 점수에 따라 가로로 늘어뜨려 배열로 저장하는 CSR 방식을 처음봐서 신기했던 것 같다. https://bkshin.tistory.com/entry/NLP-7-%ED%9D%AC%EC%86%8C-%ED%96%89%EB%A0%AC-Sparse-Matrix-COO-%ED%98%95%EC%8B%9D-CSR-%ED%98%95%EC%8B%9D 

그리고 내가 저장하지 않은 파이트 클럽 영화에 대해서도 vector값이 그렇게 차이가 나지 않았다는 점에서 추천시스템이 제대로 작동하는 것 같아 신기했다. 

구현하는 방식은 그렇게 어렵진 않았던 것 같은데 원리를 이해하는 데에는 많은 시간이 필요할 것 같다.. 