# Preprocessing

In [1]:
# load libraries
import numpy as np
import scipy
import implicit
import os
import pandas as pd

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

# recommended steps from implicit API for implicit library
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [2]:
# load rating data
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [3]:
# load movie data
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
# filter df to include rating of 3 or above
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# change column name 'raitings' to 'counts'
ratings.rename(columns={'ratings':'counts'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [6]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [7]:
# merge ratings and movies df
ratings = pd.merge(left=ratings, right=movies, how='left')
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy
...,...,...,...,...,...,...
836473,6040,1090,3,956715518,Platoon (1986),Drama|War
836474,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War
836475,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama
836476,6040,1096,4,956715648,Sophie's Choice (1982),Drama


# Analyzing data

In [8]:
# num of unique movie
ratings['movie_id'].nunique()

3628

In [9]:
# num of unique user id
ratings['user_id'].nunique()

6039

In [10]:
# view 30 most popular movies
movie_count = ratings.groupby(['movie_id', 'title']).count()['user_id'].sort_values(ascending=False).head(30)
movie_count

movie_id  title                                                
2858      American Beauty (1999)                                   3211
260       Star Wars: Episode IV - A New Hope (1977)                2910
1196      Star Wars: Episode V - The Empire Strikes Back (1980)    2885
1210      Star Wars: Episode VI - Return of the Jedi (1983)        2716
2028      Saving Private Ryan (1998)                               2561
589       Terminator 2: Judgment Day (1991)                        2509
593       Silence of the Lambs, The (1991)                         2498
1198      Raiders of the Lost Ark (1981)                           2473
1270      Back to the Future (1985)                                2460
2571      Matrix, The (1999)                                       2434
480       Jurassic Park (1993)                                     2413
2762      Sixth Sense, The (1999)                                  2385
608       Fargo (1996)                                             2371


In [11]:
# stats of movie count by user
user_count = ratings.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

In [12]:
# stats of ratings by user
user_median = ratings.groupby('user_id')['counts'].median()
user_median.describe()

count    6039.000000
mean        4.055970
std         0.432143
min         3.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: counts, dtype: float64

In [13]:
# add my movies manually
my_movies = ['Doctor Strange (2016)', 'The Dark Knight (2008)', 'Iron Man (2008)',
             'The Lord of the Rings: The Return of the King (2003)', "Schindler's List (1993)"]

# add my movie ratings manually
my_list = pd.DataFrame({'user_id': ['hs']*5, 'title': my_movies, 'counts':[4, 4, 4, 3, 4]})

if not ratings.isin({'user_id':['hs']})['user_id'].any():       # user_id doesn't have 'hs'
    ratings = ratings.append(my_list)                           # append my_movies data

ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
836473,6040,1090.0,3,956715518.0,Platoon (1986),Drama|War
836474,6040,1094.0,5,956704887.0,"Crying Game, The (1992)",Drama|Romance|War
836475,6040,562.0,5,956704746.0,Welcome to the Dollhouse (1995),Comedy|Drama
836476,6040,1096.0,4,956715648.0,Sophie's Choice (1982),Drama
836477,6040,1097.0,4,956715569.0,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
0,hs,,4,,Doctor Strange (2016),
1,hs,,4,,The Dark Knight (2008),
2,hs,,4,,Iron Man (2008),
3,hs,,3,,The Lord of the Rings: The Return of the King ...,
4,hs,,4,,Schindler's List (1993),


In [14]:
# unique nums of user and movies
user_unique = ratings['user_id'].unique()
movie_unique = ratings['title'].unique()

In [15]:
# user, movie indexing code; idx is abbreviation for index
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [16]:
# cross-check idx 
print(user_to_idx['hs'])
print(movie_to_idx['Iron Man (2008)'])

6039
3630


In [17]:
# change values in data columns via indexing
# refer to https://wikidocs.net/16 for dictionary.get function

# use user_to_idx.get to get all indexed series from user_id column
# for any anomaly rows, idx becomes NaN, so remove with dropna()
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # if all rows are correctly indexed
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # swap data['user_id'] to indexed Series
else:
    print('user_id column indexing Fail!!')

# use movie_to_idx to do a same process for ratings['title'] column
temp_movie_data = ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['title'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

user_id column indexing OK!!
movie_id column indexing OK!!


# CSR matrix and ALS model train

In [18]:
# create CSR matrix
num_user = ratings['user_id'].nunique()
num_movie = ratings['title'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.title)), shape= (num_user, num_movie))
csr_data

<6040x3632 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [19]:
# declare Implicit AlternatingLeastSquares
als_model = AlternatingLeastSquares(factors=2000, regularization=0.01, use_gpu=False, iterations=50, dtype=np.float32)

In [20]:
# als model receives (item x user) matrix format as input, so need to transpose
csr_data_transpose = csr_data.T
csr_data_transpose

<3632x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [21]:
# train model
als_model.fit(csr_data_transpose)

  0%|          | 0/50 [00:00<?, ?it/s]

In [22]:
# assign idx of hs and doctor strange, then create hs and doctor_strange vectors
hs, doctor_strange = user_to_idx['hs'], movie_to_idx['Doctor Strange (2016)']
hs_vector, doctor_strange_vector = als_model.user_factors[hs], als_model.item_factors[doctor_strange]

In [23]:
hs_vector

array([-0.04923557,  0.00198815, -0.24138722, ...,  0.29524797,
       -0.27099293, -0.38688943], dtype=float32)

In [24]:
doctor_strange_vector

array([0.0045209 , 0.00691634, 0.00357258, ..., 0.00634889, 0.00331839,
       0.00229167], dtype=float32)

# Find preferences

In [25]:
# dot product of hs and dotor strange
np.dot(hs_vector, doctor_strange_vector) # 0.026514249

0.43893155

In [26]:
# dot product of hs and dark knight
dark_knight = movie_to_idx['The Lord of the Rings: The Return of the King (2003)']
dark_knight_vector = als_model.item_factors[dark_knight]
np.dot(hs_vector, dark_knight_vector) # 0.026508378

0.3582708

In [27]:
# dot product of hs and iron man
iron_man = movie_to_idx['Iron Man (2008)']
iron_man_vector = als_model.item_factors[iron_man]
np.dot(hs_vector, iron_man_vector) # 0.02691814

0.42138603

In [28]:
american_beauty = movie_to_idx['American Beauty (1999)']
american_beauty_vector = als_model.item_factors[american_beauty]
np.dot(hs_vector, american_beauty_vector) # 0.005823412

0.00605844

In [29]:
# find similar movies to doctor strange using als_model.similar_items
favorite_movie = 'Doctor Strange (2016)'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(3628, 1.0000001),
 (3630, 0.9971928),
 (3629, 0.9957552),
 (3631, 0.99496996),
 (3573, 0.96312493),
 (3548, 0.962191),
 (2618, 0.962115),
 (3579, 0.96199065),
 (3583, 0.9619904),
 (3578, 0.9617172),
 (3441, 0.96157974),
 (3577, 0.961563),
 (3574, 0.9613773),
 (3575, 0.960772),
 (3472, 0.96076375)]

In [30]:
# flip movie_to_idx to generate dict to retrieve movie titles based on index
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['Doctor Strange (2016)',
 'Iron Man (2008)',
 'The Dark Knight (2008)',
 'The Lord of the Rings: The Return of the King (2003)',
 'Male and Female (1919)',
 'Horror Hotel (a.k.a. The City of the Dead) (1960)',
 "Wend Kuuni (God's Gift) (1982)",
 'Last of the High Kings, The (a.k.a. Summer Fling) (1996)',
 'Neon Bible, The (1995)',
 'Number Seventeen (1932)',
 'Tokyo Fist (1995)',
 'Daens (1992)',
 'Century (1993)',
 "Brother's Kiss, A (1997)",
 'Show, The (1995)']

In [31]:
# function to get similar movies when inputting a movie title
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [32]:
# similar movies of iron man
get_similar_movie('Iron Man (2008)')

['Iron Man (2008)',
 'Doctor Strange (2016)',
 'The Dark Knight (2008)',
 'The Lord of the Rings: The Return of the King (2003)',
 'Male and Female (1919)',
 'Number Seventeen (1932)',
 'Horror Hotel (a.k.a. The City of the Dead) (1960)',
 'Neon Bible, The (1995)',
 'Last of the High Kings, The (a.k.a. Summer Fling) (1996)',
 'Daens (1992)']

In [33]:
# similar movies of ET
get_similar_movie('E.T. the Extra-Terrestrial (1982)')

['E.T. the Extra-Terrestrial (1982)',
 'Neon Bible, The (1995)',
 'Century (1993)',
 'Male and Female (1919)',
 "Brother's Kiss, A (1997)",
 'Paralyzing Fear: The Story of Polio in America, A (1998)',
 'Daens (1992)',
 'Last of the High Kings, The (a.k.a. Summer Fling) (1996)',
 "Another Man's Poison (1952)",
 'Number Seventeen (1932)']

In [34]:
user = user_to_idx['hs']

# recommend user a movie based on CSR matrix
# 'als_model.recommend' receives user*item CSR Matrix format
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(2339, 0.121371895),
 (2437, 0.11086334),
 (2049, 0.105467364),
 (2247, 0.10515568),
 (2836, 0.09991087),
 (776, 0.09865465),
 (1987, 0.096904084),
 (3049, 0.09442625),
 (2393, 0.094006315),
 (1985, 0.09365745),
 (2108, 0.09317894),
 (2666, 0.09018628),
 (2185, 0.0889992),
 (1939, 0.08725676),
 (2687, 0.08697872),
 (1677, 0.08650948),
 (1184, 0.08618368),
 (823, 0.085060365),
 (2727, 0.08429104),
 (2842, 0.084256865)]

In [35]:
# movies recommended by the system
[idx_to_movie[i[0]] for i in movie_recommended]

['Alaska (1996)',
 'Fanny and Alexander (1982)',
 'Eyes of Tammy Faye, The (2000)',
 'Bless the Child (2000)',
 'All the Vermeers in New York (1990)',
 'Big Blue, The (Le Grand Bleu) (1988)',
 'Pokémon: The First Movie (1998)',
 'Unforgettable (1996)',
 'Cecil B. Demented (2000)',
 'NeverEnding Story II: The Next Chapter, The (1990)',
 'Herbie Rides Again (1974)',
 'Trial and Error (1997)',
 'Tales from the Darkside: The Movie (1990)',
 'Evita (1996)',
 'Air Up There, The (1994)',
 'Mediterraneo (1991)',
 'Exit to Eden (1994)',
 'One True Thing (1998)',
 'Innocents, The (1961)',
 'Clean Slate (Coup de Torchon) (1981)']

In [36]:
# als.explain() 
held_up = movie_to_idx['Held Up (2000)']
explain = als_model.explain(user, csr_data, itemid=held_up)

In [37]:
# get movie title based on explain
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[('The Lord of the Rings: The Return of the King (2003)', 0.00816515304405117),
 ('Iron Man (2008)', 0.008007130107981784),
 ('Doctor Strange (2016)', 0.006593840989423652),
 ("Schindler's List (1993)", 0.006342623660097645),
 ('The Dark Knight (2008)', 0.0008366071628630881)]

이후에는 이전 스텝에 소개했던 것과 동일한 방식으로 MF model을 구성하여 내가 좋아할 만한 영화를 추천해 볼 수 있습니다.

4) CSR matrix를 직접 만들어 봅시다.

5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

# 회고

1회차
- CSR matrix와 als_model을 이용하여 추천 시스템 모델을 구현해 보았다.
- 선호하는 영화 3개(doctor strange, iron man, lord of the rings)와 원본 데이터에 있는 영화 하나(american beauty)를 골라 선호도도 파악해 보았다.
    - 다만 수치가 그렇게 높게 나오지 않았다.
    - 여기선 hyperparameter를 높게 조정하여 충분한 학습을 이루게 하는 방향으로 진행했다.
- 가장 좋아하는 영화와 비슷한 영화를 추천받아 봤고 가장 좋아할 만한 영화도 받아봤는데, 그렇게 비슷한 류의 영화가 나오지 않았다.
    - 충분한 학습이 이뤄지지 않아서 그렇다고 가정하고 hyperparameter 조정하는 방향으로 프로젝트를 진행했다.

2회차
- hyperparameter를 높여 모델을 학습시켜 보았다.
    - 확실히 조정을 해주니, 2%에 불과했던 선호도가 40% 가까이 되는 선호도로 변했다.
    - 원본 데이터에 있는 영화는 변하지 않는 6%에 가까운 선호도를 보였다.
- 다만 선호하는 영화와 비슷한 영화는 그리 많이 보이지 않았다.
- 또한 좋아할만한 영화도 추천 받았지만, 들어보지도 못한 영화나 들어는 봤지만 보지 못한 영화를 추천해주었다.
- 예측하는 이유는 정말 디테일하고 다양한 데이터가 있어야 더 나은 추천이 가능한데, 영화를 원본 데이터에 추가할 때, 장르를 집어넣지 않고 오직 영화 이름, 년도, 그리고 점수만 추가했기 때문이다.
- 확실히 추천 시스템은 모델을 어떻게 학습하냐부터, 데이터의 디테일, 그리고 시간, 순서, 등등 여러가지 복합적인 요인이 있어야 분석 목표에 따른 결과를 도출해 낼 수 있다는 것을 알게되었고 흥미로웠다.