# 데이터 전처리

In [1]:
import numpy as np
import pandas as pd
import scipy
import implicit
import os
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

In [2]:
# Load the ml-1m ratings file: '::'-separated fields, no header row.
# expanduser('~') still resolves a home directory when $HOME is unset,
# whereas os.getenv('HOME') + '...' raises TypeError on None.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
# A multi-character separator is only supported by the python parsing engine.
ratings = pd.read_csv(
    rating_file_path,
    sep='::',
    names=ratings_cols,
    engine='python',
    encoding='ISO-8859-1',
)
# Row count before any filtering, kept for the later ratio report.
original_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Keep only rows rated 3 or higher, then report how much of the data is left.
ratings = ratings.loc[ratings['ratings'] >= 3]
filtered_data_size = len(ratings)

print(f'original_data_size: {original_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / original_data_size: .2%}')

original_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is  83.63%


In [4]:
# Rename the 'ratings' column to 'counts'. The renamed frame is reassigned
# rather than modified with inplace=True, because `ratings` is a filtered
# selection and in-place mutation of it can raise SettingWithCopyWarning.
ratings = ratings.rename(columns={'ratings': 'counts'})

In [5]:
# Display the 'counts' column to confirm the column name is in place.
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
# Load the ml-1m movies file: '::'-separated fields, no header row.
# expanduser('~') still resolves a home directory when $HOME is unset,
# whereas os.getenv('HOME') + '...' raises TypeError on None.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
cols = ['movie_id', 'title', 'genre']
# A multi-character separator is only supported by the python parsing engine.
movies = pd.read_csv(
    movie_file_path,
    sep='::',
    names=cols,
    engine='python',
    encoding='ISO-8859-1',
)
# Lowercase the text columns so later title lookups use a single spelling.
movies['title'] = movies['title'].str.lower()
movies['genre'] = movies['genre'].str.lower()
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy
2,3,grumpier old men (1995),comedy|romance
3,4,waiting to exhale (1995),comedy|drama
4,5,father of the bride part ii (1995),comedy


In [7]:
# Merge the movies and ratings data. The join key is named explicitly so the
# merge cannot silently widen to any other column the two frames may come to
# share; 'movie_id' is the only column they have in common here.
movies_ratings = pd.merge(movies, ratings, on='movie_id')
movies_ratings.head()

Unnamed: 0,movie_id,title,genre,user_id,counts,timestamp
0,1,toy story (1995),animation|children's|comedy,1,5,978824268
1,1,toy story (1995),animation|children's|comedy,6,4,978237008
2,1,toy story (1995),animation|children's|comedy,8,4,978233496
3,1,toy story (1995),animation|children's|comedy,9,5,978225952
4,1,toy story (1995),animation|children's|comedy,10,5,978226474


In [8]:
# Count missing values per column of the merged frame.
movies_ratings.isnull().sum()

movie_id     0
title        0
genre        0
user_id      0
counts       0
timestamp    0
dtype: int64

In [9]:
# Narrow the merged frame to the three columns used from here on.
using_cols = ['user_id', 'title', 'counts']
movies_ratings = movies_ratings.loc[:, using_cols]
movies_ratings.head()

Unnamed: 0,user_id,title,counts
0,1,toy story (1995),5
1,6,toy story (1995),4
2,8,toy story (1995),4
3,9,toy story (1995),5
4,10,toy story (1995),5


# 데이터 탐색

In [10]:
# Number of distinct users in the filtered ratings.
ratings['user_id'].nunique()

6039

In [11]:
# Number of distinct movies in the filtered ratings.
ratings['movie_id'].nunique()

3628

In [12]:
# Count the rating rows each title received (one row per user/title pair).
popular = movies_ratings.groupby('title')['user_id'].count()
# Show the 30 titles with the highest counts.
popular.sort_values(ascending = False).head(30)

title
american beauty (1999)                                   3211
star wars: episode iv - a new hope (1977)                2910
star wars: episode v - the empire strikes back (1980)    2885
star wars: episode vi - return of the jedi (1983)        2716
saving private ryan (1998)                               2561
terminator 2: judgment day (1991)                        2509
silence of the lambs, the (1991)                         2498
raiders of the lost ark (1981)                           2473
back to the future (1985)                                2460
matrix, the (1999)                                       2434
jurassic park (1993)                                     2413
sixth sense, the (1999)                                  2385
fargo (1996)                                             2371
braveheart (1995)                                        2314
men in black (1997)                                      2297
schindler's list (1993)                                  2257
pr

# 내가 선호하는 영화 5가지 

In [13]:
# Titles I like, written in lowercase to match the lowercased 'title' column.
my_favorite = [
    'toy story (1995)',
    'men in black (1997)',
    'ghostbusters (1984)',
    'back to the future (1985)',
    'terminator 2: judgment day (1991)',
]

# One row per favourite under the pseudo user id 'me', each scored 5.0.
# The row count follows the list length instead of a hard-coded 5.
my_movies = pd.DataFrame({
    'user_id': ['me'] * len(my_favorite),
    'title': my_favorite,
    'counts': [5.0] * len(my_favorite),
})

# Only add the rows if 'me' is not present yet, so re-running the cell does
# not duplicate them. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0; like append, it keeps
# the original index labels of both frames.
if not movies_ratings['user_id'].isin(['me']).any():
    movies_ratings = pd.concat([movies_ratings, my_movies])
movies_ratings.tail(10)

Unnamed: 0,user_id,title,counts
836473,5682,"contender, the (2000)",3.0
836474,5812,"contender, the (2000)",4.0
836475,5831,"contender, the (2000)",3.0
836476,5837,"contender, the (2000)",4.0
836477,5998,"contender, the (2000)",4.0
0,me,toy story (1995),5.0
1,me,men in black (1997),5.0
2,me,ghostbusters (1984),5.0
3,me,back to the future (1985),5.0
4,me,terminator 2: judgment day (1991),5.0


# 모델에 활용하기 위한 전처리

In [14]:
# Distinct user ids and titles present in the data.
user_unique = movies_ratings['user_id'].unique()
title_unique = movies_ratings['title'].unique()

# Map every distinct user id / title to a consecutive integer starting at 0.
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
title_to_idx = dict(zip(title_unique, range(len(title_unique))))

In [15]:
# Translate user ids to integer indices; values with no mapping are dropped,
# so a shorter result means some value failed to map.
temp_user_data = movies_ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(movies_ratings):
    print('user_id column indexing failed')
else:
    print('user_id column indexing complete')
    movies_ratings['user_id'] = temp_user_data

# Same procedure for the title column.
temp_title_data = movies_ratings['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) != len(movies_ratings):
    print('title column indexing failed')
else:
    print('title column indexing complete')
    movies_ratings['title'] = temp_title_data

movies_ratings

user_id column indexing complete
title column indexing complete


Unnamed: 0,user_id,title,counts
0,0,0,5.0
1,1,0,4.0
2,2,0,4.0
3,3,0,5.0
4,4,0,5.0
...,...,...,...
0,6039,0,5.0
1,6039,1419,5.0
2,6039,2462,5.0
3,6039,1152,5.0


# CSR matrix

In [16]:
# Matrix dimensions: one row per distinct user index, one column per distinct title index.
num_user = movies_ratings['user_id'].nunique()
num_movies = movies_ratings['title'].nunique()

# Build the sparse matrix from (value, (row, column)) triples:
# rows are user indices, columns are title indices, values are the counts.
csr_data = csr_matrix(
    (
        movies_ratings['counts'],
        (movies_ratings['user_id'], movies_ratings['title']),
    ),
    shape=(num_user, num_movies),
)
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.float64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# MF Model

In [17]:
# Environment settings for the numeric back-ends: a thread count of '1' for
# OpenBLAS and MKL, plus KMP_DUPLICATE_LIB_OK (presumably to tolerate a
# duplicate OpenMP runtime - confirm).
# NOTE(review): numpy and implicit are imported in the first cell, before this
# runs; thread-count variables are usually read at library load time, so
# confirm these settings actually take effect.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [18]:
# ALS matrix-factorization model: 100 latent factors, 15 training iterations,
# CPU only, float32 factors.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

In [19]:
# Flip the user-by-title matrix into title-by-user form for model fitting.
csr_data_transpose = csr_data.transpose()

In [20]:
# Train the model on the transposed (title-by-user) matrix.
# NOTE(review): passing an item-by-user matrix looks like the older implicit
# API (before 0.5); newer releases appear to expect user-by-item - confirm
# against the installed version.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [25]:
# Look up my user index and the title index of one of my favourites,
# then fetch the learned latent vector for each.
me = user_to_idx['me']
men_in_black = title_to_idx['men in black (1997)']
me_vector = als_model.user_factors[me]
men_in_black_vector = als_model.item_factors[men_in_black]

In [26]:
# Display my learned user-factor vector.
me_vector

array([ 0.9101659 , -0.46187508,  0.25003645,  0.7540665 , -1.011798  ,
        0.42528528, -0.6494303 ,  0.22614776,  1.2160629 , -0.7044644 ,
        0.04920909, -0.08783229,  0.76130587,  0.77802336,  0.36848179,
       -0.16996133, -0.04073583,  0.35000318,  0.02652938,  0.4925242 ,
       -0.6699269 ,  0.5983576 , -0.41531205,  0.02157569,  0.10248125,
        0.8286313 , -0.15255055,  0.45832625,  0.66576964, -0.41337094,
       -0.5623208 , -0.35281077,  0.36196992,  0.09485107,  0.7544501 ,
        0.09128311,  0.22352478, -0.8392337 ,  0.2072167 ,  0.92586476,
        0.10508924,  0.65701467,  0.8921608 ,  0.39073604,  0.2961508 ,
       -0.20269468,  0.42129004, -1.0859919 ,  0.23301192, -0.14434293,
       -0.5793404 , -0.46173277, -0.539462  , -0.13725935, -0.19534579,
        0.42072797,  0.20966074, -0.6169136 , -0.6870163 ,  0.5871776 ,
       -0.92921454, -0.5292167 , -0.44835263, -0.20907432,  0.6080913 ,
        0.273948  ,  0.18929476, -0.8744162 ,  0.7591356 ,  0.18

In [28]:
# Display the learned item-factor vector for 'men in black (1997)'.
men_in_black_vector

array([ 2.98045147e-02,  1.75657552e-02,  1.28001673e-02,  8.68000183e-03,
       -2.49843597e-02,  7.48545025e-03, -9.02388617e-03, -3.16941366e-03,
        3.00477375e-03, -3.16096283e-02, -2.37446018e-02, -3.19427587e-02,
        1.02120526e-02,  2.16342863e-02,  1.00226430e-02,  1.00233182e-02,
       -7.26802554e-03,  2.92859343e-03,  7.74391787e-03,  2.29529645e-02,
       -2.72795483e-02,  2.04738174e-02,  4.34892811e-03,  4.31256630e-02,
        3.06978691e-02,  1.82777010e-02, -1.21688191e-02,  2.44697314e-02,
        3.49072032e-02,  3.20670963e-03, -1.35661820e-02, -1.20612178e-02,
        2.77210977e-02, -7.07753189e-03,  3.56369987e-02,  4.73493536e-04,
        6.61325874e-03, -2.21508592e-02,  2.62359884e-02,  1.54263759e-03,
        9.62277781e-03,  2.62172688e-02,  1.35259219e-02,  1.08950520e-02,
        2.21691187e-02, -1.14996154e-02, -1.08734285e-02, -2.21318230e-02,
        1.78699382e-02, -1.41346361e-02, -5.68419369e-03, -2.51271017e-03,
       -1.65741555e-02, -

# 훈련된 모델이 예측한 나의 선호도

In [29]:
# Predicted preference = inner product of my user vector and the movie's item vector.
me_vector.dot(men_in_black_vector)

0.57341343

In [30]:
# Predicted preference for a title that is not in my favourites list.
star_war = title_to_idx['star wars: episode v - the empire strikes back (1980)']
star_war_vector = als_model.item_factors[star_war]
me_vector.dot(star_war_vector)

0.21939692

In [31]:
# Predicted preference for 'toy story (1995)', another of my favourites.
toy = title_to_idx['toy story (1995)']
toy_vector = als_model.item_factors[toy]
me_vector.dot(toy_vector)

0.46888375

# 내가 좋아하는 영화와 비슷한 영화 추천받기

In [32]:
# Ask the model for the 15 items closest to one favourite title.
favorite_movie = 'back to the future (1985)'
title_id = title_to_idx[favorite_movie]
# The recorded output shows a list of (title index, similarity score) pairs,
# with the query title itself in first place.
similar_movie = als_model.similar_items(title_id, N = 15)
similar_movie

[(1152, 1.0000002),
 (2659, 0.63346046),
 (1793, 0.55942464),
 (1189, 0.54025877),
 (2462, 0.51182723),
 (2542, 0.5002374),
 (985, 0.4822584),
 (2166, 0.4810528),
 (1003, 0.45723435),
 (1954, 0.43187565),
 (1267, 0.4287266),
 (3066, 0.41440427),
 (523, 0.4066815),
 (1080, 0.4061891),
 (3460, 0.40371782)]

In [33]:
# Reverse lookup table: integer title index -> title string.
idx_to_title = {idx: title for title, idx in title_to_idx.items()}
# Translate the (index, score) pairs back to readable titles.
[idx_to_title[match[0]] for match in similar_movie]

['back to the future (1985)',
 "ferris bueller's day off (1986)",
 'back to the future part ii (1989)',
 'when harry met sally... (1989)',
 'ghostbusters (1984)',
 'big (1988)',
 'fish called wanda, a (1988)',
 'cocoon (1985)',
 'e.t. the extra-terrestrial (1982)',
 'beetlejuice (1988)',
 'raising arizona (1987)',
 'bull durham (1988)',
 'blade runner (1982)',
 'star wars: episode v - the empire strikes back (1980)',
 'project moon base (1953)']

In [36]:
def get_similar_movie(title: str, n: int = 10) -> list:
    """Return the titles the trained model considers most similar to ``title``.

    Args:
        title: Movie title exactly as it appears as a key of ``title_to_idx``.
        n: Number of neighbours to request from the model (forwarded as
            ``N``). Defaults to 10, which keeps the result length that the
            previous no-argument call produced.

    Returns:
        A list of title strings in the order the model returned them. In the
        recorded outputs the queried title itself comes first.

    Raises:
        KeyError: If ``title`` is not a key of ``title_to_idx``.
    """
    title_id = title_to_idx[title]
    similar_movie = als_model.similar_items(title_id, N=n)
    # Each entry is indexed as (title index, score); only the index is needed.
    return [idx_to_title[match[0]] for match in similar_movie]

In [37]:
# Try the helper on a title outside my favourites list.
get_similar_movie('star wars: episode v - the empire strikes back (1980)')

['star wars: episode v - the empire strikes back (1980)',
 'star wars: episode vi - return of the jedi (1983)',
 'star wars: episode iv - a new hope (1977)',
 'raiders of the lost ark (1981)',
 'terminator, the (1984)',
 'e.t. the extra-terrestrial (1982)',
 'indiana jones and the last crusade (1989)',
 'aliens (1986)',
 'star wars: episode i - the phantom menace (1999)',
 'back to the future (1985)']

# 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [38]:
# Request 20 recommendations for my user index, excluding titles I already rated.
user = user_to_idx['me']
# csr_data is the user-by-title matrix built earlier.
# NOTE(review): this call signature and the list-of-(index, score) result look
# like the older implicit API (before 0.5) - confirm against the installed version.
movie_recommended = als_model.recommend(user, csr_data, N = 20, filter_already_liked_items = True)
movie_recommended

[(462, 0.64601195),
 (2325, 0.47798312),
 (2845, 0.38599002),
 (106, 0.3762676),
 (2657, 0.34673014),
 (2536, 0.32125264),
 (2114, 0.30479616),
 (1147, 0.29175904),
 (1003, 0.29118848),
 (439, 0.2890051),
 (1810, 0.2721449),
 (2600, 0.2690285),
 (2507, 0.26735762),
 (342, 0.26349103),
 (2659, 0.25257948),
 (249, 0.2505327),
 (1094, 0.24399066),
 (2542, 0.24200448),
 (1954, 0.24037875),
 (2724, 0.23650491)]

In [41]:
# Translate the recommended (index, score) pairs into titles.
[idx_to_title[rec[0]] for rec in movie_recommended]

['jurassic park (1993)',
 'matrix, the (1999)',
 'toy story 2 (1999)',
 'braveheart (1995)',
 'total recall (1990)',
 'airplane! (1980)',
 "bug's life, a (1998)",
 'groundhog day (1993)',
 'e.t. the extra-terrestrial (1982)',
 'fugitive, the (1993)',
 'saving private ryan (1998)',
 'american beauty (1999)',
 'sixth sense, the (1999)',
 'forrest gump (1994)',
 "ferris bueller's day off (1986)",
 'star wars: episode iv - a new hope (1977)',
 'star wars: episode vi - return of the jedi (1983)',
 'big (1988)',
 'beetlejuice (1988)',
 'who framed roger rabbit? (1988)']

In [42]:
# Ask the model why it recommended 'jurassic park (1993)' to me.
jurassic_park = title_to_idx['jurassic park (1993)']
# The second element of the result (explain[1]) is read in the next cell as
# (title index, contribution) pairs.
explain = als_model.explain(user, csr_data, itemid = jurassic_park)

In [43]:
# Show each contributing title next to its contribution to the recommendation.
[(idx_to_title[entry[0]], entry[1]) for entry in explain[1]]

[('men in black (1997)', 0.3118876890256918),
 ('terminator 2: judgment day (1991)', 0.23157724314965253),
 ('back to the future (1985)', 0.08085572323500627),
 ('ghostbusters (1984)', 0.03301199659990118),
 ('toy story (1995)', -0.023736384549490477)]