## 3. Collaborative Filtering (협업 필터링: 사용자 리뷰 기반)

In [1]:
# Show the installed Surprise version so the results below can be tied to a library release.
import surprise
surprise.__version__

'1.1.4'

In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [4]:
# Load the ratings table from a CSV file in the working directory and preview the first rows.
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Declare the rating scale for Surprise: lowest possible rating 0.5, highest 5.
reader = Reader(rating_scale=(0.5, 5))

In [6]:
# Build a Surprise dataset from the userId, movieId and rating columns (timestamp is left out).
# NOTE(review): load_from_df is documented to read the columns positionally as
# (user, item, rating) — keep this column order.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x109256ca0>

In [7]:
# SVD recommender created with a fixed random_state so repeated runs use the same seed.
svd = SVD(random_state=0)

In [8]:
# Evaluate SVD with 5-fold cross-validation, reporting RMSE and MAE for each fold.
# A bare `cv=5` builds an unseeded KFold that shuffles with NumPy's global RNG, so the
# fold assignment (and therefore every score) would change from run to run even though
# the SVD itself is seeded. Passing a seeded KFold keeps the evaluation reproducible.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0, shuffle=True),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8950  0.9040  0.8973  0.8980  0.8943  0.8977  0.0034  
MAE (testset)     0.6877  0.6957  0.6921  0.6913  0.6867  0.6907  0.0032  
Fit time          0.65    0.65    0.65    0.64    0.63    0.64    0.01    
Test time         0.08    0.10    0.10    0.07    0.07    0.09    0.01    


{'test_rmse': array([0.89501829, 0.90402428, 0.89733886, 0.89800675, 0.894324  ]),
 'test_mae': array([0.6877458 , 0.69569138, 0.69211615, 0.69128799, 0.68666552]),
 'fit_time': (0.6493160724639893,
  0.6464838981628418,
  0.6493680477142334,
  0.635472297668457,
  0.6291120052337646),
 'test_time': (0.07710123062133789,
  0.1038060188293457,
  0.10411882400512695,
  0.07313704490661621,
  0.07191109657287598)}

In [9]:
# Fit the model on the full training set built from `data` (no held-out fold)
# before asking it for individual predictions.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10b454b80>

In [10]:
# Show the rows of `ratings` that belong to user 1.
ratings.loc[ratings['userId'].eq(1)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [11]:
# Estimate the rating for user 1 on movie 302 (raw userId and movieId values).
svd.predict(uid=1, iid=302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7142061734434044, details={'was_impossible': False})

In [14]:
# User 1's actual rating for movie 1029 is 3; pass it as r_ui so the returned
# Prediction shows the true rating next to the estimated one.
svd.predict(uid=1, iid=1029, r_ui=3)

Prediction(uid=1, iid=1029, r_ui=3, est=2.8814455446761933, details={'was_impossible': False})

In [15]:
# Show the rows of `ratings` that belong to user 100.
ratings.loc[ratings['userId'].eq(100)]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [19]:
# Estimate the rating for user 100 on movie 1029 (raw userId and movieId values).
svd.predict(uid=100, iid=1029)

Prediction(uid=100, iid=1029, r_ui=None, est=3.7705476478414846, details={'was_impossible': False})

실제 응용 예: 특정 사용자가 지금까지 본 영화와 아직 보지 않은 영화를 구분한다. 본 영화에 대한 평점 데이터셋을 만들어 ratings에 추가한 뒤 모델을 다시 학습시키고, 보지 않은 영화들의 예상 평점을 계산해서 상위 10편(top 10)을 뽑아 본다.