In [1]:
!pip install scikit-surprise



In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [3]:
data = Dataset.load_builtin('ml-100k',prompt=False)
data.raw_ratings[:10]
# Field order of each raw rating tuple: user, item, rating, timestamp

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [4]:
# Instantiate Surprise's SVD algorithm with its default parameters.
model = SVD()

In [5]:
# 5-fold cross-validation of the model, measuring RMSE and MAE on each fold;
# verbose=True prints the per-fold summary table.
cross_validate(model, data, measures=['rmse', 'mae'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9303  0.9370  0.9393  0.9403  0.9375  0.9369  0.0035  
MAE (testset)     0.7311  0.7396  0.7420  0.7406  0.7395  0.7385  0.0038  
Fit time          0.78    0.98    0.83    0.95    0.80    0.87    0.08    
Test time         0.16    0.16    0.12    0.14    0.16    0.15    0.02    


{'test_rmse': array([0.9302526 , 0.9370349 , 0.93925841, 0.94033877, 0.93754317]),
 'test_mae': array([0.73114582, 0.73957983, 0.74196767, 0.74055938, 0.73946205]),
 'fit_time': (0.7848398685455322,
  0.9841327667236328,
  0.8308084011077881,
  0.9475579261779785,
  0.7990143299102783),
 'test_time': (0.16409969329833984,
  0.1639406681060791,
  0.11507916450500488,
  0.14399981498718262,
  0.1550002098083496)}

### 콘텐츠 기반 필터링
사용자의 이전 행동과 명시적 피드백을 바탕으로, 사용자가 좋아하는 것과 유사한 항목을 추천한다.

In [6]:
import numpy as np
from surprise import Dataset

In [7]:
# Load the built-in 'ml-100k' dataset again for the NumPy-based experiments.
data = Dataset.load_builtin('ml-100k', prompt=False)
# Cast every field of each (user, item, rating, timestamp) tuple to int,
# producing a 2-D integer array with one row per rating.
raw_data = np.array(data.raw_ratings, dtype=int)

In [8]:
# Shift the user and movie id columns down by one so they can be used
# directly as zero-based array indices.
# NOTE: this mutates raw_data in place, so re-running this cell subtracts one again.
raw_data[:, :2] -= 1

In [9]:
# Largest zero-based user / movie index that occurs in the ratings.
n_users = raw_data[:, 0].max()
n_movies = raw_data[:, 1].max()

# One row per user and one column per movie (largest index + 1).
shape = (n_users + 1, n_movies + 1)
shape

(943, 1682)

In [10]:
# Adjacency matrix: adj_matrix[user, movie] == 1 when the user has rated the
# movie, 0 otherwise.
# np.zeros is required here: np.ndarray(shape) only allocates memory without
# initialising it, so entries never written by the loop could hold garbage.
adj_matrix = np.zeros(shape, dtype=int)
for user_id, movie_id, _rating, _timestamp in raw_data:
    adj_matrix[user_id, movie_id] = 1
adj_matrix

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [11]:
# Find the user whose interaction vector has the largest dot product with
# the target user's vector (user 0).
my_id = 0
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # do not compare the target user with themselves
    similarity = my_vector @ user_vector
    if similarity > best_match:
        # Strict '>' keeps the first user encountered on ties.
        best_match, best_match_id, best_match_vector = similarity, user_id, user_vector

print("Best Match:{}, Best Match ID:{}".format(best_match, best_match_id))

In [13]:
# Indices of movies the best match has an entry for (> 0) while I do not (< 1).
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0.
]
print(recommend_list)

[272, 273, 275, 280, 281, 283, 287, 288, 289, 290, 292, 293, 297, 299, 300, 301, 302, 306, 312, 314, 315, 316, 317, 321, 322, 323, 324, 327, 330, 331, 332, 333, 339, 342, 345, 346, 353, 354, 355, 356, 357, 363, 364, 365, 366, 372, 374, 378, 379, 381, 382, 383, 384, 385, 386, 387, 390, 391, 392, 394, 395, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 414, 416, 417, 418, 419, 420, 422, 424, 425, 426, 427, 428, 430, 431, 432, 435, 442, 446, 447, 448, 449, 450, 451, 452, 454, 455, 457, 460, 461, 462, 468, 469, 470, 471, 472, 473, 474, 478, 495, 500, 507, 517, 522, 525, 530, 539, 540, 543, 545, 546, 548, 549, 550, 551, 553, 557, 558, 560, 561, 562, 563, 565, 566, 567, 568, 570, 571, 574, 575, 576, 577, 580, 581, 582, 585, 587, 589, 590, 594, 596, 602, 623, 626, 627, 630, 633, 635, 639, 646, 648, 651, 652, 654, 657, 664, 668, 671, 677, 678, 681, 683, 684, 685, 690, 691, 692, 695, 696, 708, 709, 714, 718, 719, 720, 724, 726, 727, 731, 733, 734, 736, 738, 741, 742, 745,

### 유클리드 거리 활용하기

In [14]:
# Nearest neighbour of user 0 by Euclidean distance between interaction
# vectors (smaller distance = more similar).
my_id = 0
my_vector = adj_matrix[my_id]
# 9999 is the "no match yet" sentinel; presumably larger than any real distance here.
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # skip the target user
    diff = my_vector - user_vector
    euclidean_dist = np.sqrt(np.square(diff).sum())
    if euclidean_dist < best_match:
        best_match, best_match_id, best_match_vector = euclidean_dist, user_id, user_vector

print("Best Match:{}, Best Match ID:{}".format(best_match, best_match_id))

Best Match:14.832396974191326, Best Match ID:737


In [15]:
# Movies present in the best match's vector (> 0) but absent from mine (< 1).
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0.
]
print(recommend_list)

[297, 312, 317, 342, 356, 366, 379, 384, 392, 402, 404, 407, 417, 422, 428, 433, 448, 454, 469, 473, 495, 510, 516, 526, 527, 549, 567, 602, 635, 649, 650, 654, 658, 661, 664, 696, 731, 746, 750, 754, 915, 918, 925, 929, 950, 968, 1015, 1046]


In [16]:
def compute_cos_similarity(v1,v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the ratio is undefined, so ``0.0`` is returned instead of NaN.
    """
    # Compute in float64 so narrow integer dtypes (e.g. int8) cannot overflow
    # inside np.dot / np.square.
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    norm1 = np.sqrt(np.sum(np.square(v1)))  # Euclidean norm: sqrt of the sum of squares
    norm2 = np.sqrt(np.sum(np.square(v2)))
    denom = norm1 * norm2
    if denom == 0:
        # At least one all-zero vector: avoid dividing by zero.
        return 0.0
    return np.dot(v1, v2) / denom

In [17]:
# Find the user whose interaction vector has the highest cosine similarity
# to the target user's vector (user 0).
my_id = 0
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # skip the target user
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
        best_match, best_match_id, best_match_vector = cos_similarity, user_id, user_vector

print("Best Match:{}, Best Match ID:{}".format(best_match, best_match_id))

Best Match:0.5278586163659506, Best Match ID:915


In [18]:
# Recommend movies that user 915 (the best match) has seen but I have not.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_match_vector))
    if mine < 1 and theirs > 0.
]
print(recommend_list)

[272, 275, 279, 280, 283, 285, 289, 294, 297, 316, 317, 355, 365, 366, 368, 379, 380, 381, 384, 386, 392, 398, 401, 404, 416, 420, 422, 424, 426, 427, 430, 432, 450, 460, 461, 466, 469, 471, 473, 474, 475, 479, 482, 483, 497, 505, 508, 510, 511, 522, 526, 527, 529, 530, 534, 536, 540, 545, 548, 549, 556, 557, 558, 560, 565, 567, 568, 569, 577, 580, 581, 582, 592, 596, 630, 635, 639, 641, 649, 651, 654, 673, 677, 678, 683, 684, 692, 696, 701, 703, 707, 708, 709, 712, 714, 719, 720, 726, 731, 734, 736, 738, 740, 745, 747, 754, 755, 761, 762, 763, 766, 780, 789, 791, 805, 819, 823, 824, 830, 843, 862, 865, 918, 929, 930, 938, 942, 943, 947, 958, 959, 960, 970, 977, 1004, 1008, 1009, 1010, 1013, 1041, 1045, 1069, 1072, 1073, 1078, 1097, 1100, 1108, 1112, 1118, 1134, 1193, 1205, 1207, 1216, 1219, 1267, 1334, 1400, 1427, 1596, 1681]


In [19]:
# Rebuild the user x movie matrix, this time storing the rating itself
# instead of a 0/1 flag; unrated pairs stay 0.
# np.zeros is required here: np.ndarray(shape) only allocates memory without
# initialising it, so entries never written by the loop could hold garbage.
adj_matrix = np.zeros(shape, dtype=int)
for user_id, movie_id, rating, _timestamp in raw_data:
    adj_matrix[user_id, movie_id] = rating
adj_matrix

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 0, ..., 0, 0, 0]])

In [20]:
# Nearest neighbour of user 0 by Euclidean distance, now computed on the
# rating-valued matrix.
my_id = 0
my_vector = adj_matrix[my_id]
# 9999 is the "no match yet" sentinel; presumably larger than any real distance here.
best_match, best_match_id, best_match_vector = 9999, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # skip the target user
    delta = my_vector - user_vector
    euclidean_dist = np.sqrt(np.square(delta).sum())
    if euclidean_dist < best_match:
        best_match, best_match_id, best_match_vector = euclidean_dist, user_id, user_vector

print("Best Match:{}, Best Match ID:{}".format(best_match, best_match_id))

Best Match:55.06359959174482, Best Match ID:737


In [21]:
# Most similar user to user 0 by cosine similarity, now computed on the
# rating-valued matrix.
my_id = 0
my_vector = adj_matrix[my_id]
best_match, best_match_id, best_match_vector = -1, -1, []

for user_id, user_vector in enumerate(adj_matrix):
    if user_id == my_id:
        continue  # skip the target user
    cos_similarity = compute_cos_similarity(my_vector, user_vector)
    if cos_similarity > best_match:
        best_match, best_match_id, best_match_vector = cos_similarity, user_id, user_vector

print("Best Match:{}, Best Match ID:{}".format(best_match, best_match_id))

Best Match:0.569065731527988, Best Match ID:915
