<a href="https://colab.research.google.com/github/twelvesense/first-repository/blob/master/ExplNode15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 프로젝트: Movielens 영화 추천 실습

# 1: 기본 환경

## 1.1: 모듈

In [19]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Using cached implicit-0.5.2-cp37-cp37m-manylinux2014_x86_64.whl (18.5 MB)
Installing collected packages: implicit
Successfully installed implicit-0.5.2


In [69]:
import numpy as np
import scipy
import implicit
import os
import pandas as pd
from implicit.als import AlternatingLeastSquares

# implicit 라이브러리에서 권장사항
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

## 1.2: 데이터

In [3]:
# 디렉토리
from google.colab import drive
drive.mount('/content/drive')
colab_dir = '/content/drive/MyDrive/colab/ExplNode15'

lastfm_dir = colab_dir + '/data/lastfm-dataset-360K'
ml_dir = colab_dir + '/data/ml-1m'

Mounted at /content/drive


##1.3: Movielens 데이터의 전제조건


  * MovieLens 1M Dataset 사용
  * 별점 데이터는 explicit 데이터이지만 implicit 데이터로 간주하고 테스트
  * 별점을 시청횟수로 해석
  * 유저가 3점 미만으로 준 데이터는 선호하지 않는다고 가정하고 제외


#2: 데이터 준비와 전처리

In [227]:
rating_file_path= colab_dir + '/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [228]:
# 3점 이상만 남김
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [229]:
# ratings 컬럼의 이름을 counts로 바꿈
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [230]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [231]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옴
movie_file_path= colab_dir + '/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [232]:
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [233]:
movies_title = movies.iloc[:,0:2]
movies_title_indexed = movies_title.set_index('movie_id')
movies_title_indexed

Unnamed: 0_level_0,title
movie_id,Unnamed: 1_level_1
1,Toy Story (1995)
2,Jumanji (1995)
3,Grumpier Old Men (1995)
4,Waiting to Exhale (1995)
5,Father of the Bride Part II (1995)
...,...
3948,Meet the Parents (2000)
3949,Requiem for a Dream (2000)
3950,Tigerland (2000)
3951,Two Family House (2000)


In [234]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


#3: 분석하기

In [235]:
# unique한 영화 수
num_movies = ratings['movie_id'].nunique()
print('영화 수: ', num_movies)

영화 수:  3628


In [236]:
# unique한 사용자 수
num_users = ratings['user_id'].nunique()
print('사용자 수: ', num_users)

사용자 수:  6039


In [237]:
# 인기있는 영화 (인기순)
ratings_movies = ratings.groupby('movie_id')['user_id'].count()
# ratings_movies
sorted_ratings_movieid_count = ratings_movies.sort_values(ascending=False)
sorted_ratings_movieid_count
# sorted_ratings_movieid_count.index[0]          # 2858
# sorted_ratings_movieid_count[2858]             # 3211

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
        ... 
1553       1
1548       1
2486       1
138        1
3876       1
Name: user_id, Length: 3628, dtype: int64

In [249]:
sorted_ratings_movieid_count.head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

In [238]:
# 가장 인기있는 영화 30개 (인기순)
count = 0
for i in range(0, len(sorted_ratings_movieid_count)):
  count += 1
  print(movies_title_indexed.loc[sorted_ratings_movieid_count.index[i], 'title'])
  if count==30:break

American Beauty (1999)
Star Wars: Episode IV - A New Hope (1977)
Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode VI - Return of the Jedi (1983)
Saving Private Ryan (1998)
Terminator 2: Judgment Day (1991)
Silence of the Lambs, The (1991)
Raiders of the Lost Ark (1981)
Back to the Future (1985)
Matrix, The (1999)
Jurassic Park (1993)
Sixth Sense, The (1999)
Fargo (1996)
Braveheart (1995)
Men in Black (1997)
Schindler's List (1993)
Princess Bride, The (1987)
Shakespeare in Love (1998)
L.A. Confidential (1997)
Shawshank Redemption, The (1994)
Godfather, The (1972)
Groundhog Day (1993)
E.T. the Extra-Terrestrial (1982)
Being John Malkovich (1999)
Ghostbusters (1984)
Pulp Fiction (1994)
Forrest Gump (1994)
Terminator, The (1984)
Toy Story (1995)
Fugitive, The (1993)


#4: 내가 선호하는 영화를 5가지 골라서 ratings에 추가하기

In [239]:
my_favorite = ['Toy Story (1995)', 'Pulp Fiction (1994)' ,'L.A. Confidential (1997)','Braveheart (1995)','Back to the Future (1985)']
my_favorite_id = [movies[movies['title'] == name]['movie_id'].values[0] for name in my_favorite]

new_user_id = max(ratings['user_id']) + 1
my_playlist = pd.DataFrame({'user_id': [new_user_id]*5, 'movie_id': my_favorite_id, 'counts':[5]*5})

if not ratings.isin({'user_id':[new_user_id]})['user_id'].any():
    ratings = ratings.append(my_playlist)

In [240]:
ratings.tail(10)  

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,6041,1,5,
1,6041,296,5,
2,6041,1617,5,
3,6041,110,5,
4,6041,1270,5,


#5: CSR matrix를 직접 만들기

In [241]:
# unique 데이터
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

In [242]:
# csr 행렬
csr_data = csr_matrix((ratings['counts'], (ratings['user_id'], ratings['movie_id'])))

In [243]:
csr_data

<6042x3953 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

#6: als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시키기

In [244]:
# Implicit AlternatingLeastSquares 모델 구성
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [245]:
# 모델 학습
als_model.fit(csr_data)

  0%|          | 0/15 [00:00<?, ?it/s]

#7: 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보기

* 나의 선호 영화: Pulp Fiction (1994)
* 그외 영화 한편: Forrest Gump (1994)

In [251]:
user_vector = als_model.user_factors[6041]
pulip_vector = als_model.item_factors[296]
other_vector = als_model.item_factors[356]

In [254]:
print("Pulp Fiction (1994):", round(np.dot(user_vector, pulip_vector), 4))
print("Forrest Gump (1994):", round(np.dot(user_vector, other_vector), 4))

Pulp Fiction (1994): 0.4584
Forrest Gump (1994): 0.2678


#8: 내가 좋아하는 영화와 비슷한 영화를 추천받아보기

* 내가 좋아 하는 영화: Pulp Fiction (1994)

In [283]:
similar_movies_id, similar_movies1_per = als_model.similar_items(296, N=15)
similar_movies_id, similar_movies1_per

(array([ 296, 1213,  608,  318,   50,  593, 1089, 1704, 1617,   47,  527,
        1729, 1358,  778, 2268], dtype=int32),
 array([1.0000001 , 0.8319239 , 0.8003547 , 0.7348194 , 0.7108695 ,
        0.65971214, 0.65900546, 0.50917375, 0.49789196, 0.47013003,
        0.47006026, 0.45662928, 0.38152212, 0.38146597, 0.37553132],
       dtype=float32))

In [285]:
for i in similar_movies_id:
    print(movies_title_indexed.loc[i])

title    Pulp Fiction (1994)
Name: 296, dtype: object
title    GoodFellas (1990)
Name: 1213, dtype: object
title    Fargo (1996)
Name: 608, dtype: object
title    Shawshank Redemption, The (1994)
Name: 318, dtype: object
title    Usual Suspects, The (1995)
Name: 50, dtype: object
title    Silence of the Lambs, The (1991)
Name: 593, dtype: object
title    Reservoir Dogs (1992)
Name: 1089, dtype: object
title    Good Will Hunting (1997)
Name: 1704, dtype: object
title    L.A. Confidential (1997)
Name: 1617, dtype: object
title    Seven (Se7en) (1995)
Name: 47, dtype: object
title    Schindler's List (1993)
Name: 527, dtype: object
title    Jackie Brown (1997)
Name: 1729, dtype: object
title    Sling Blade (1996)
Name: 1358, dtype: object
title    Trainspotting (1996)
Name: 778, dtype: object
title    Few Good Men, A (1992)
Name: 2268, dtype: object


#9: 내가 가장 좋아할 만한 영화들을 추천받아보기

In [289]:
movie_recommended_id, movie_recommended_per = als_model.recommend(6041, csr_data, N=15, filter_already_liked_items=False)
movie_recommended_id

array([1617,  110, 2028,    1, 1270,  296,  593, 2858,  608,  318, 3114,
        527,   50, 1213,  589], dtype=int32)

In [290]:
for i in movie_recommended_id:
    print(movies_title_indexed.loc[i])

title    L.A. Confidential (1997)
Name: 1617, dtype: object
title    Braveheart (1995)
Name: 110, dtype: object
title    Saving Private Ryan (1998)
Name: 2028, dtype: object
title    Toy Story (1995)
Name: 1, dtype: object
title    Back to the Future (1985)
Name: 1270, dtype: object
title    Pulp Fiction (1994)
Name: 296, dtype: object
title    Silence of the Lambs, The (1991)
Name: 593, dtype: object
title    American Beauty (1999)
Name: 2858, dtype: object
title    Fargo (1996)
Name: 608, dtype: object
title    Shawshank Redemption, The (1994)
Name: 318, dtype: object
title    Toy Story 2 (1999)
Name: 3114, dtype: object
title    Schindler's List (1993)
Name: 527, dtype: object
title    Usual Suspects, The (1995)
Name: 50, dtype: object
title    GoodFellas (1990)
Name: 1213, dtype: object
title    Terminator 2: Judgment Day (1991)
Name: 589, dtype: object
