# 잠재요인 협업 필터링 - SVD 활용

In [5]:
import pandas as pd

ratings = pd.read_csv("./data/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
ratings.shape

(100836, 4)

In [8]:
# 사용자수, 영화수
ratings.userId.nunique(), ratings.movieId.nunique()

(610, 9724)

In [9]:
from surprise import SVD, Reader
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format = 'user item rating timestamp', sep = ',', rating_scale=(0.5, 5))
data_folds = DatasetAutoFolds('./data/ratings_noh.csv', reader = reader)

In [10]:
# 전체 데이터를 학습 데이터로 사용
trainset = data_folds.build_full_trainset()

In [13]:
# 모델 생성 및 학습
model = SVD(n_epochs = 20, n_factors = 50, random_state = 2022)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8a3b6221c0>

- 사용자 Id: 9, 영화 Id: 42 (Dead Presidents (1995))

In [18]:
# 영화 정보
mdf = pd.read_csv('./data/movies.csv')
mdf.head(40).tail(5)

Unnamed: 0,movieId,title,genres
35,39,Clueless (1995),Comedy|Romance
36,40,"Cry, the Beloved Country (1995)",Drama
37,41,Richard III (1995),Drama|War
38,42,Dead Presidents (1995),Action|Crime|Drama
39,43,Restoration (1995),Drama


In [24]:
# 사용자 9번이 42번 영화를 보았는지 확인
movieIds = ratings[ratings.userId == 9]['movieId']
movieIds[movieIds == 42].count()

0

In [28]:
# 사용자 9번의 42번 영화에 대한 예상 평점
uid, mid = str(9), str(42)
pred = model.predict(uid, mid, verbose = True)

user: 9          item: 42         r_ui = None   est = 3.25   {'was_impossible': False}


In [29]:
pred

Prediction(uid='9', iid='42', r_ui=None, est=3.2499243773395388, details={'was_impossible': False})

- 사용자 9번이 보지 않은 영화중에서 예상점수가 가장 높은 Top 10

In [30]:
seen_movies = ratings[ratings.userId == 9]['movieId'].tolist()
total_movies = mdf.movieId.tolist()
unseen_movies = [movie for movie in total_movies if movie not in seen_movies]
len(seen_movies), len(unseen_movies)

(46, 9696)

In [31]:
uid = str(9)
# predictions = [model.predict(uid, str(mid)) for mid in unseen_movies]
predictions = []
for mid in unseen_movies:
    pred = model.predict(uid, str(mid))
    predictions.append(pred)
predictions[:5]

[Prediction(uid='9', iid='1', r_ui=None, est=3.7029223474247126, details={'was_impossible': False}),
 Prediction(uid='9', iid='2', r_ui=None, est=3.2274451421980417, details={'was_impossible': False}),
 Prediction(uid='9', iid='3', r_ui=None, est=3.0342513115122123, details={'was_impossible': False}),
 Prediction(uid='9', iid='4', r_ui=None, est=2.661778597408914, details={'was_impossible': False}),
 Prediction(uid='9', iid='5', r_ui=None, est=2.689490348191407, details={'was_impossible': False})]

In [34]:
def sortkey_est(pred):
    return pred.est

In [35]:
predictions.sort(key = sortkey_est, reverse = True)
predictions[:5]

[Prediction(uid='9', iid='318', r_ui=None, est=4.070330794979969, details={'was_impossible': False}),
 Prediction(uid='9', iid='1217', r_ui=None, est=4.063731956995098, details={'was_impossible': False}),
 Prediction(uid='9', iid='1261', r_ui=None, est=4.051908410348555, details={'was_impossible': False}),
 Prediction(uid='9', iid='1204', r_ui=None, est=4.0227662213503805, details={'was_impossible': False}),
 Prediction(uid='9', iid='3275', r_ui=None, est=4.011500870494227, details={'was_impossible': False})]

In [40]:
top_movie_ids = [int(pred.iid) for pred in predictions[:10]]
top_movie_ratings = [pred.est for pred in predictions[:10]]
top_movie_titles = [mdf[mdf.movieId == mid]['title'] for mid in top_movie_ids]

In [41]:
top_df = pd.DataFrame({
    '영화명': top_movie_titles,
    '예상평점': top_movie_ratings
})
top_df.index.name = 'mdf_index'
top_df

Unnamed: 0_level_0,영화명,예상평점
mdf_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"277 Shawshank Redemption, The (1994) Name: ...",4.070331
1,"918 Ran (1985) Name: title, dtype: object",4.063732
2,960 Evil Dead II (Dead by Dawn) (1987) Name...,4.051908
3,"906 Lawrence of Arabia (1962) Name: title, ...",4.022766
4,"2462 Boondock Saints, The (2000) Name: titl...",4.011501
5,3622 Amelie (Fabuleux destin d'Amélie Poula...,3.999696
6,"9071 Spotlight (2015) Name: title, dtype: o...",3.985348
7,"1258 Boogie Nights (1997) Name: title, dtyp...",3.979885
8,"46 Usual Suspects, The (1995) Name: title, ...",3.978625
9,"680 Philadelphia Story, The (1940) Name: ti...",3.978415
