# Surprise

- 추천 시스템 개발을 위한 라이브러리
- 다양한 모델과 데이터 제공
- scikit-learn과 유사한 사용 방법

In [1]:
# Install the scikit-surprise package (shell command issued from the notebook).
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
     |████████████████████████████████| 11.8 MB 9.3 MB/s
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... done
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl size=751453 sha256=f109a0d91186855badfaf0055c9cff21293e5ba55c432a5c7bdaa4c8f07188f2
  Stored in directory: /Users/sum/Library/Caches/pip/wheels/6b/10/c9/7f607c8cb522ef378844f41e63b30d7181a6495d2c1ae514e9
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [2]:
import numpy as np
from surprise import Dataset

In [3]:
# Load the built-in 'ml-100k' dataset (prompt=False: do not ask for
# confirmation before downloading it).
data = Dataset.load_builtin('ml-100k', prompt=False)

# Peek at the first 10 raw records; each one is unpacked later in this
# notebook as (user_id, movie_id, rating, time).
data.raw_ratings[:10]

Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/sum/.surprise_data/ml-100k


[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [4]:
# Convert the raw rating records into a 2-D integer array so the id columns
# can be used for indexing (string ids and float ratings are cast to int).
raw_data = np.array(data.raw_ratings, dtype=int)

In [5]:
# Shift the user-id and movie-id columns (columns 0 and 1) down by one so
# that they can be used directly as zero-based array indices.
raw_data[:, :2] -= 1

In [6]:
# Display the re-indexed rating array.
raw_data

array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       ...,
       [      275,      1089,         1, 874795795],
       [       12,       224,         2, 882399156],
       [       11,       202,         3, 879959583]])

In [7]:
# NOTE: despite their names, n_users / n_movies hold the largest zero-based
# user / movie index, so the matrix needs one extra row and one extra column.
n_users, n_movies = raw_data[:, :2].max(axis=0)
shape = (n_users + 1, n_movies + 1)
shape

(943, 1682)

In [8]:
# Binary <user, movie> adjacency matrix: 1 where the user rated the movie,
# 0 everywhere else.
# np.zeros is required here: np.ndarray(shape) returns *uninitialized* memory,
# so entries that are never assigned could contain arbitrary garbage values.
adj_matrix = np.zeros(shape, dtype=int)
# Column 0 of raw_data is the user index, column 1 the movie index.
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = 1
adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

# User-Based Collaborative Filtering의 직접 구현

In [9]:
# Treat user 0 as "me" and search for the other user whose row of the
# adjacency matrix is closest to mine in Euclidean distance.
my_id = 0
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # never compare me with myself
    diff = my_vector - user_vector
    euclidean_dist = np.sqrt(np.sum(diff ** 2))
    # Strict '<' keeps the first user found at the minimum distance.
    if euclidean_dist < best_match:
        best_match, best_match_id, best_match_vector = (
            euclidean_dist, user_id, user_vector)

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 14.832396974191326, Best Match ID: 737


In [10]:
# Show my vector next to the best match's vector for a visual comparison.
print(my_vector, best_match_vector)

[1 1 1 ... 0 0 0] [1 1 0 ... 0 0 0]


In [11]:
# Recommend the movies that user 737 (the best match) has watched
# and that I have not watched.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1. and theirs > 0.
]
print(recommend_list)

[297, 312, 317, 342, 356, 366, 379, 384, 392, 402, 404, 407, 417, 422, 428, 433, 448, 454, 469, 473, 495, 510, 516, 526, 527, 549, 567, 602, 635, 649, 650, 654, 658, 661, 664, 696, 731, 746, 750, 754, 915, 918, 925, 929, 950, 968, 1015, 1046]


## 코사인 유사도를 사용해 추천

In [12]:
def compute_cos_similarity(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. If either vector has zero norm
        the similarity is undefined; 0.0 is returned in that case instead of
        NaN, so that ``>`` comparisons against the result stay meaningful.
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        # Guard against division by zero for an all-zero vector.
        return 0.0
    return np.dot(v1, v2) / denom

In [13]:
# Assume my user id is 0, then search for the other user whose vector has
# the highest cosine similarity to mine.
my_id = 0
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # skip myself
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    # Strict '>' keeps the first user found at the maximum similarity.
    if cos_similarity > best_match:
        best_match, best_match_id, best_match_vector = (
            cos_similarity, user_id, user_vector)

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.5278586163659506, Best Match ID: 915


In [14]:
# Recommend the movies that user 915 (the best match) has watched
# and that I have not watched.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1. and theirs > 0.
]
print(recommend_list)

[272, 275, 279, 280, 283, 285, 289, 294, 297, 316, 317, 355, 365, 366, 368, 379, 380, 381, 384, 386, 392, 398, 401, 404, 416, 420, 422, 424, 426, 427, 430, 432, 450, 460, 461, 466, 469, 471, 473, 474, 475, 479, 482, 483, 497, 505, 508, 510, 511, 522, 526, 527, 529, 530, 534, 536, 540, 545, 548, 549, 556, 557, 558, 560, 565, 567, 568, 569, 577, 580, 581, 582, 592, 596, 630, 635, 639, 641, 649, 651, 654, 673, 677, 678, 683, 684, 692, 696, 701, 703, 707, 708, 709, 712, 714, 719, 720, 726, 731, 734, 736, 738, 740, 745, 747, 754, 755, 761, 762, 763, 766, 780, 789, 791, 805, 819, 823, 824, 830, 843, 862, 865, 918, 929, 930, 938, 942, 943, 947, 958, 959, 960, 970, 977, 1004, 1008, 1009, 1010, 1013, 1041, 1045, 1069, 1072, 1073, 1078, 1097, 1100, 1108, 1112, 1118, 1134, 1193, 1205, 1207, 1216, 1219, 1267, 1334, 1400, 1427, 1596, 1681]


# 명시적 피드백을 추가

기존 방법에 명시적 피드백(사용자가 평가한 영화 점수)을 추가해 실험

In [15]:
# Build the <user, movie> adjacency matrix with explicit feedback: each entry
# holds the rating the user gave the movie, and 0 means "not rated".
# np.zeros is required here: np.ndarray(shape) returns *uninitialized* memory,
# so entries that are never assigned could contain arbitrary garbage values.
adj_matrix = np.zeros(shape, dtype=int)
# Columns of raw_data: 0 = user index, 1 = movie index, 2 = rating.
adj_matrix[raw_data[:, 0], raw_data[:, 1]] = raw_data[:, 2]
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [16]:
# Assume my user id is 0 and repeat the Euclidean nearest-user search, this
# time on the rating-valued adjacency matrix.
my_id = 0
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # skip myself
    diff = my_vector - user_vector
    euclidean_dist = np.sqrt(np.sum(diff ** 2))
    # Strict '<' keeps the first user found at the minimum distance.
    if euclidean_dist < best_match:
        best_match, best_match_id, best_match_vector = (
            euclidean_dist, user_id, user_vector)

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 55.06359959174482, Best Match ID: 737


In [17]:
# Assume my user id is 0 and repeat the cosine-similarity search, this time
# on the rating-valued adjacency matrix.
my_id = 0
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # skip myself
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    # Strict '>' keeps the first user found at the maximum similarity.
    if cos_similarity > best_match:
        best_match, best_match_id, best_match_vector = (
            cos_similarity, user_id, user_vector)

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.569065731527988, Best Match ID: 915


## surprise 라이브러리

In [18]:
from surprise import KNNBasic, SVD
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

## Nearest Neighbor CF

- KNNBasic()

In [19]:
# Nearest-neighbor collaborative filtering model with default settings.
model_nn = KNNBasic()
# 5-fold cross-validation reporting RMSE and MAE, using 4 parallel jobs.
# The trailing ';' keeps the notebook from echoing the returned value.
cross_validate(model_nn, data, measures=['rmse', 'mae'], cv=5, n_jobs=4, verbose=True);

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9776  0.9755  0.9818  0.9792  0.9746  0.9777  0.0026  
MAE (testset)     0.7725  0.7683  0.7757  0.7724  0.7713  0.7720  0.0024  
Fit time          0.31    0.31    0.31    0.30    0.22    0.29    0.04    
Test time         2.02    2.01    2.01    2.01    1.79    1.97    0.09    


## Latent Factor CF

- SVD()

In [20]:
# Latent-factor collaborative filtering model (SVD) with default settings.
model_lf = SVD()
# 5-fold cross-validation reporting RMSE and MAE, using 4 parallel jobs.
# The trailing ';' keeps the notebook from echoing the returned value.
cross_validate(model_lf, data, measures=['rmse', 'mae'], cv=5, n_jobs=4, verbose=True);

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9394  0.9277  0.9389  0.9393  0.9316  0.9354  0.0048  
MAE (testset)     0.7396  0.7324  0.7398  0.7408  0.7326  0.7370  0.0037  
Fit time          5.05    5.04    5.35    5.07    4.59    5.02    0.24    
Test time         0.15    0.16    0.13    0.13    0.13    0.14    0.01    


## SVD를 사용하여 추천

In [21]:
# Use surprise's train_test_split(): hold out 25% of the ratings as the test
# set, with random_state=0 so the split is reproducible.
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

In [22]:
# Train the SVD model on the training split only.
model_lf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff66e750b80>

In [23]:
# Query the model for user 196 and item 302; the ids are passed as strings.
uid = '196'
iid = '302'

# Predicted rating for this (user, item) pair.
pred = model_lf.predict(uid, iid)
pred

Prediction(uid='196', iid='302', r_ui=None, est=4.078736194861334, details={'was_impossible': False})

In [24]:
# Run the fitted model over every entry of the held-out test set.
pred_test = model_lf.test(testset)

# Report the container type and the number of predictions, then show the first five.
print('prediction type:', type(pred_test), ' size:', len(pred_test))
pred_test[:5]

prediction type: <class 'list'>  size: 25000


[Prediction(uid='120', iid='282', r_ui=4.0, est=3.660648964183838, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=3.7713469068740237, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=4.279324328989259, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.746938603779123, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.451698148718369, details={'was_impossible': False})]

In [25]:
# RMSE over the test-set predictions; accuracy.rmse also prints the value.
pred_accuracy = accuracy.rmse(pred_test)

RMSE: 0.9495
