# Preprocessing

In [1]:
# load libraries
import numpy as np
import scipy
import implicit
import os
import pandas as pd

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# recommended steps from implicit API for implicit library
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [2]:
# load rating data
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [3]:
# load movie data
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
# filter df to include rating of 3 or above
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# change column name 'raitings' to 'counts'
ratings.rename(columns={'ratings':'counts'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [6]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [7]:
# merge ratings and movies df
ratings = pd.merge(left=ratings, right=movies, how='left')
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy
...,...,...,...,...,...,...
836473,6040,1090,3,956715518,Platoon (1986),Drama|War
836474,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War
836475,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama
836476,6040,1096,4,956715648,Sophie's Choice (1982),Drama


# Analyzing data

In [8]:
# num of unique movie
ratings['movie_id'].nunique()

3628

In [9]:
# num of unique user id
ratings['user_id'].nunique()

6039

In [10]:
# view 30 most popular movies
movie_count = ratings.groupby(['movie_id', 'title']).count()['user_id'].sort_values(ascending=False).head(30)
movie_count

movie_id  title                                                
2858      American Beauty (1999)                                   3211
260       Star Wars: Episode IV - A New Hope (1977)                2910
1196      Star Wars: Episode V - The Empire Strikes Back (1980)    2885
1210      Star Wars: Episode VI - Return of the Jedi (1983)        2716
2028      Saving Private Ryan (1998)                               2561
589       Terminator 2: Judgment Day (1991)                        2509
593       Silence of the Lambs, The (1991)                         2498
1198      Raiders of the Lost Ark (1981)                           2473
1270      Back to the Future (1985)                                2460
2571      Matrix, The (1999)                                       2434
480       Jurassic Park (1993)                                     2413
2762      Sixth Sense, The (1999)                                  2385
608       Fargo (1996)                                             2371


In [11]:
# stats of movie count by user
user_count = ratings.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

In [12]:
# stats of ratings by user
user_median = ratings.groupby('user_id')['counts'].median()
user_median.describe()

count    6039.000000
mean        4.055970
std         0.432143
min         3.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: counts, dtype: float64

In [13]:
# add my movies manually
my_movies = ['Doctor Strange (2016)', 'The Dark Knight (2008)', 'Iron Man (2008)',
             'The Lord of the Rings: The Return of the King (2003)', "Schindler's List (1993)"]

# add my movie ratings manually
my_list = pd.DataFrame({'user_id': ['hs']*5, 'title': my_movies, 'counts':[4, 4, 4, 3, 4]})

if not ratings.isin({'user_id':['hs']})['user_id'].any():       # user_id doesn't have 'hs'
    ratings = ratings.append(my_list)                           # append my_movies data

ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
836473,6040,1090.0,3,956715518.0,Platoon (1986),Drama|War
836474,6040,1094.0,5,956704887.0,"Crying Game, The (1992)",Drama|Romance|War
836475,6040,562.0,5,956704746.0,Welcome to the Dollhouse (1995),Comedy|Drama
836476,6040,1096.0,4,956715648.0,Sophie's Choice (1982),Drama
836477,6040,1097.0,4,956715569.0,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
0,hs,,4,,Doctor Strange (2016),
1,hs,,4,,The Dark Knight (2008),
2,hs,,4,,Iron Man (2008),
3,hs,,3,,The Lord of the Rings: The Return of the King ...,
4,hs,,4,,Schindler's List (1993),


In [14]:
# unique nums of user and movies
user_unique = ratings['user_id'].unique()
movie_unique = ratings['title'].unique()

In [15]:
# user, movie indexing code; idx is abbreviation for index
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [16]:
# cross-check idx 
print(user_to_idx['hs'])
print(movie_to_idx['Iron Man (2008)'])

6039
3630


In [17]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['title'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

user_id column indexing OK!!
movie_id column indexing OK!!


In [18]:
# create CSR matrix
num_user = ratings['user_id'].nunique()
num_movie = ratings['title'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.title)), shape= (num_user, num_movie))
csr_data

<6040x3632 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [19]:
# declare Implicit AlternatingLeastSquares
als_model = AlternatingLeastSquares(factors=1000, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)

In [20]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3632x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [21]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/30 [00:00<?, ?it/s]

In [22]:
hs, doctor_strange = user_to_idx['hs'], movie_to_idx['Doctor Strange (2016)']
hs_vector, doctor_strange_vector = als_model.user_factors[hs], als_model.item_factors[doctor_strange]

In [23]:
hs_vector

array([ 1.41779453e-01,  1.18649369e-02,  5.07242456e-02, -1.52026922e-01,
       -1.49777621e-01, -1.03295326e-01,  4.70314035e-03,  2.17673387e-02,
        9.46322381e-02, -1.45774886e-01, -1.72707792e-02,  9.68178064e-02,
       -9.40428674e-02, -7.54192099e-02,  1.25120476e-01,  6.58759624e-02,
        5.95640875e-02, -2.71612983e-02, -3.73011120e-02,  9.72664952e-02,
        3.09791137e-02, -1.36448860e-01, -1.43715832e-02, -1.12358816e-01,
        2.75077410e-02, -2.78560836e-02,  6.01092763e-02, -6.96489736e-02,
        9.48058069e-02,  2.08959673e-02,  2.14542896e-02, -5.54025061e-02,
       -1.28018707e-01,  1.86452731e-01,  7.87447393e-02,  6.50196970e-02,
        7.13174343e-02, -1.64444104e-01, -1.17318228e-01,  4.02094349e-02,
       -6.58099949e-02,  2.14372035e-02, -2.85990145e-02, -1.13738455e-01,
        3.80515829e-02,  2.37710178e-02, -8.57482012e-03,  1.72578588e-01,
       -4.64098603e-02,  1.35946751e-01,  1.21157490e-01,  7.89724756e-04,
        7.25470781e-02, -

In [24]:
doctor_strange_vector

array([0.00608696, 0.00514721, 0.00576909, 0.00314528, 0.0046696 ,
       0.00507861, 0.00466142, 0.00498031, 0.00572361, 0.00468136,
       0.00468868, 0.00529078, 0.00517832, 0.0045062 , 0.00533844,
       0.00378601, 0.00474906, 0.00523123, 0.00401212, 0.00454191,
       0.00520069, 0.0048508 , 0.00541529, 0.00528601, 0.00490138,
       0.00684008, 0.00618329, 0.00373121, 0.00698311, 0.00548135,
       0.00559969, 0.00384565, 0.00544349, 0.00622286, 0.00469122,
       0.0058062 , 0.00533489, 0.00396375, 0.00358069, 0.00483543,
       0.00420041, 0.00368831, 0.00407765, 0.00343515, 0.0052852 ,
       0.00557462, 0.00522394, 0.00540634, 0.00558233, 0.00548256,
       0.00518012, 0.00352598, 0.00596603, 0.00473872, 0.00491811,
       0.00269768, 0.00644508, 0.00415153, 0.00444749, 0.00429507,
       0.00446929, 0.00470727, 0.00427879, 0.00565851, 0.00486237,
       0.00496592, 0.00463155, 0.00458383, 0.00622348, 0.00426433,
       0.00467779, 0.00423048, 0.00442469, 0.00633924, 0.00345

In [25]:
# zimin과 black_eyed_peas를 내적하는 코드
np.dot(hs_vector, doctor_strange_vector) # 0.00028537068 -> 0.021859618

0.026661593

In [26]:
dark_knight = movie_to_idx['The Dark Knight (2008)']
dark_knight_vector = als_model.item_factors[dark_knight]
np.dot(hs_vector, dark_knight_vector) # 0.0002455473 -> 0.021536825

0.027139897

In [28]:
iron_man = movie_to_idx['Iron Man (2008)']
iron_man_vector = als_model.item_factors[iron_man]
np.dot(hs_vector, iron_man_vector) # -0.00081391656 -> -0.005021919

0.026883025

In [29]:
favorite_movie = 'Doctor Strange (2016)'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(3628, 1.0000001),
 (3629, 0.99988157),
 (3631, 0.99986964),
 (3630, 0.99986476),
 (3615, 0.9975538),
 (3531, 0.9973943),
 (3534, 0.9972694),
 (3035, 0.997181),
 (3088, 0.9968074),
 (3570, 0.9966827),
 (3549, 0.9966735),
 (3589, 0.99640006),
 (3053, 0.99630237),
 (3156, 0.9960763),
 (3519, 0.99582887)]

In [30]:
#movie_to_idx 를 뒤집어, index로부터 movie 이름을 얻는 dict를 생성합니다. 
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['Doctor Strange (2016)',
 'The Dark Knight (2008)',
 'The Lord of the Rings: The Return of the King (2003)',
 'Iron Man (2008)',
 'Chain of Fools (2000)',
 'Aiqing wansui (1994)',
 'Project Moon Base (1953)',
 'For Ever Mozart (1996)',
 'Open Season (1996)',
 'Loser (1991)',
 'Make Them Die Slowly (Cannibal Ferox) (1980)',
 'Soft Toilet Seats (1999)',
 'Sunchaser, The (1996)',
 'Running Free (2000)',
 '1-900 (1994)']

In [31]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [32]:
get_similar_movie('Iron Man (2008)')

['Iron Man (2008)',
 'Doctor Strange (2016)',
 'The Lord of the Rings: The Return of the King (2003)',
 'The Dark Knight (2008)',
 'Chain of Fools (2000)',
 'Aiqing wansui (1994)',
 'Project Moon Base (1953)',
 'For Ever Mozart (1996)',
 'Open Season (1996)',
 'Make Them Die Slowly (Cannibal Ferox) (1980)']

In [38]:
get_similar_movie('E.T. the Extra-Terrestrial (1982)')

['E.T. the Extra-Terrestrial (1982)',
 'Paralyzing Fear: The Story of Polio in America, A (1998)',
 'Smashing Time (1967)',
 'Lured (1947)',
 'Held Up (2000)',
 'Boys (1996)',
 'Boys, The (1997)',
 'Slappy and the Stinkers (1998)',
 'Second Jungle Book: Mowgli & Baloo, The (1997)',
 'Yankee Zulu (1994)']

In [39]:
user = user_to_idx['hs']
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(2210, 0.027317002),
 (1560, 0.026977206),
 (3493, 0.026414588),
 (1006, 0.025515314),
 (3541, 0.025311947),
 (2178, 0.024594799),
 (3122, 0.024042122),
 (3500, 0.023923185),
 (714, 0.023875048),
 (3591, 0.02369555),
 (2999, 0.023526907),
 (151, 0.023456372),
 (3296, 0.02323399),
 (2824, 0.023149695),
 (725, 0.023138883),
 (3517, 0.023130128),
 (3416, 0.023112755),
 (2269, 0.022932604),
 (2313, 0.02292992),
 (3534, 0.022662647)]

In [40]:
[idx_to_movie[i[0]] for i in movie_recommended]

['Brighton Beach Memoirs (1986)',
 "Dead Men Don't Wear Plaid (1982)",
 'Held Up (2000)',
 'Highlander: Endgame (2000)',
 'Smashing Time (1967)',
 'Twelfth Night (1996)',
 'Belizaire the Cajun (1986)',
 'Journey of August King, The (1995)',
 'Waterworld (1995)',
 "Heaven's Burning (1997)",
 'Grateful Dead (1995)',
 'Mr. Jones (1993)',
 'Window to Paris (1994)',
 'Anna Karenina (1997)',
 'Billy Madison (1995)',
 'Paralyzing Fear: The Story of Polio in America, A (1998)',
 'Firelight (1997)',
 "Young Poisoner's Handbook, The (1995)",
 '20 Dates (1998)',
 'Project Moon Base (1953)']

In [41]:
e_t = movie_to_idx['E.T. the Extra-Terrestrial (1982)']
explain = als_model.explain(user, csr_data, itemid=e_t)

In [42]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[("Schindler's List (1993)", 0.0017544294786417695),
 ('The Dark Knight (2008)', 0.0003644123664457667),
 ('Doctor Strange (2016)', 0.0003500241908909584),
 ('Iron Man (2008)', 0.00023226869666919357),
 ('The Lord of the Rings: The Return of the King (2003)',
  0.00016537630447906122)]

이후에는 이전 스텝에 소개했던 것과 동일한 방식으로 MF model을 구성하여 내가 좋아할 만한 영화를 추천해 볼 수 있습니다.

4) CSR matrix를 직접 만들어 봅시다.

5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.