In [1]:
import numpy as np
from surprise import KNNBasic
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy


In [2]:
# 載入 movielens-100k dataset 第一次會需要下載.
data = Dataset.load_builtin('ml-100k')


In [3]:
# 切訓練集跟測試集 random_state 用跟我一樣的結果才會一樣
trainset, testset = train_test_split(data, test_size=.25, random_state=1)

In [4]:
list(trainset.all_ratings())

[(0, 0, 5.0),
 (0, 141, 3.0),
 (0, 166, 3.0),
 (0, 393, 4.0),
 (0, 67, 5.0),
 (0, 373, 5.0),
 (0, 195, 5.0),
 (0, 426, 4.0),
 (0, 450, 2.0),
 (0, 60, 4.0),
 (0, 294, 4.0),
 (0, 656, 5.0),
 (0, 764, 4.0),
 (0, 77, 5.0),
 (0, 66, 5.0),
 (0, 980, 3.0),
 (0, 262, 4.0),
 (0, 296, 4.0),
 (0, 150, 5.0),
 (0, 187, 3.0),
 (0, 78, 3.0),
 (0, 616, 3.0),
 (0, 109, 4.0),
 (0, 199, 5.0),
 (0, 297, 4.0),
 (0, 312, 4.0),
 (0, 133, 3.0),
 (0, 351, 3.0),
 (0, 94, 4.0),
 (0, 330, 2.0),
 (0, 76, 4.0),
 (0, 18, 5.0),
 (0, 201, 4.0),
 (0, 853, 2.0),
 (0, 654, 4.0),
 (0, 194, 4.0),
 (0, 370, 5.0),
 (0, 335, 3.0),
 (0, 389, 3.0),
 (0, 280, 5.0),
 (0, 711, 4.0),
 (0, 221, 5.0),
 (0, 1412, 4.0),
 (0, 514, 5.0),
 (0, 243, 3.0),
 (0, 50, 4.0),
 (0, 220, 3.0),
 (0, 763, 5.0),
 (0, 850, 3.0),
 (0, 52, 2.0),
 (0, 556, 5.0),
 (0, 191, 4.0),
 (0, 738, 5.0),
 (0, 575, 4.0),
 (0, 566, 4.0),
 (0, 258, 5.0),
 (0, 223, 5.0),
 (0, 28, 4.0),
 (0, 404, 5.0),
 (0, 311, 3.0),
 (0, 252, 2.0),
 (0, 332, 5.0),
 (0, 290, 4.0),
 (0,

In [5]:
# 原本的 user item id
print(trainset.to_raw_uid(0))
print(trainset.to_raw_iid(0))

508
185


In [6]:
testset

[('345', '715', 4.0),
 ('92', '998', 2.0),
 ('934', '195', 4.0),
 ('586', '423', 2.0),
 ('336', '383', 1.0),
 ('654', '678', 4.0),
 ('64', '511', 4.0),
 ('425', '209', 2.0),
 ('821', '132', 5.0),
 ('262', '559', 3.0),
 ('894', '32', 4.0),
 ('514', '200', 2.0),
 ('699', '20', 4.0),
 ('380', '433', 3.0),
 ('865', '328', 3.0),
 ('545', '665', 3.0),
 ('450', '832', 2.0),
 ('917', '473', 3.0),
 ('621', '420', 4.0),
 ('218', '269', 4.0),
 ('234', '116', 2.0),
 ('643', '447', 4.0),
 ('452', '491', 4.0),
 ('291', '418', 4.0),
 ('911', '313', 2.0),
 ('291', '924', 4.0),
 ('116', '596', 5.0),
 ('174', '571', 1.0),
 ('833', '203', 5.0),
 ('595', '289', 4.0),
 ('313', '461', 3.0),
 ('629', '137', 5.0),
 ('796', '29', 3.0),
 ('303', '1016', 3.0),
 ('589', '682', 4.0),
 ('128', '715', 4.0),
 ('566', '727', 4.0),
 ('233', '91', 3.0),
 ('725', '300', 4.0),
 ('346', '38', 3.0),
 ('184', '559', 3.0),
 ('784', '307', 4.0),
 ('207', '462', 3.0),
 ('709', '515', 4.0),
 ('537', '988', 1.0),
 ('57', '756', 3

# User CF

In [7]:
# 使用最基本的 User CF(KNN) 算法 
sim_options = {'name': 'cosine',
               'user_based': True
               }
algo = KNNBasic(k=100)
algo.fit(trainset)


Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fee60854828>

In [8]:
# 找出與 user id = 0 最相似的 10 個 user
algo.get_neighbors(0, k=10)

[14, 25, 46, 61, 195, 280, 304, 305, 344, 347]

In [9]:
algo.predict(trainset.to_raw_uid(0), trainset.to_raw_iid(141), r_ui=3)

Prediction(uid='508', iid='47', r_ui=3, est=3.7110128134359393, details={'actual_k': 40, 'was_impossible': False})

In [10]:
predictions = algo.test(testset)
# 看前三筆預測結果
predictions[:3]

[Prediction(uid='345', iid='715', r_ui=4.0, est=3.5575383652644708, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='92', iid='998', r_ui=2.0, est=2.4593313558455296, details={'actual_k': 11, 'was_impossible': False}),
 Prediction(uid='934', iid='195', r_ui=4.0, est=4.147515669728399, details={'actual_k': 40, 'was_impossible': False})]

In [11]:
# 算出每個使用者 top n 的推薦電影
def get_top_n(predictions, n=10):
    from collections import defaultdict
    top_n = defaultdict(list)
    # 把推薦電影塞到每個 user id 的 dict 裡面
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est, true_r))

    # 排序每個使用者的 n 個電影評分 
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [12]:
top_n = get_top_n(predictions, n=10)
for uid, user_ratings in top_n.items():
    print(uid, [(iid, est, true_r) for (iid, est, true_r) in user_ratings])

345 [('12', 4.702117395672621, 5.0), ('479', 4.457075200240884, 4.0), ('246', 4.190757943826343, 4.0), ('124', 4.1616224004143145, 5.0), ('69', 4.082923948477342, 4.0), ('196', 4.058207309778273, 5.0), ('378', 4.04058941201935, 4.0), ('234', 3.979933532552271, 4.0), ('269', 3.955969919736584, 5.0), ('210', 3.946781437594159, 4.0)]
92 [('408', 4.493585493696903, 4.0), ('528', 4.221134103840377, 4.0), ('223', 4.214252428264655, 5.0), ('135', 4.166390223919976, 4.0), ('504', 4.146615096413356, 3.0), ('173', 4.124607056692058, 3.0), ('203', 4.113591870549228, 4.0), ('663', 4.098933406383007, 4.0), ('100', 4.094761132777184, 5.0), ('179', 4.023062748877152, 5.0)]
934 [('474', 4.553181239463419, 4.0), ('183', 4.337578667503333, 2.0), ('316', 4.27917827946306, 4.0), ('315', 4.251893393966529, 4.0), ('195', 4.147515669728399, 4.0), ('661', 4.1469469540916055, 4.0), ('170', 4.1417463451340275, 4.0), ('423', 4.1336446660091335, 3.0), ('516', 4.091013508270193, 3.0), ('527', 4.0694337463046235, 3

In [13]:
accuracy.rmse(predictions)

RMSE: 0.9845


0.9844714102036581

In [14]:
def get_mrr(surprise_predictions, k_highest_scores=None):    
    from scipy import sparse
    
    uids = np.array([p.uid for p in surprise_predictions]).astype(np.int)
    iids = np.array([p.iid for p in surprise_predictions]).astype(np.int)

    # 三分以上判斷為相關 低於三分判斷為不相關
    r_uis = np.array([1 if p.r_ui >= 3 else 0 for p in surprise_predictions]).astype(np.float)
    r_uis = np.asarray(r_uis)        
    ests = np.array([p.est for p in surprise_predictions]).astype(np.float)

    sparse_preds = sparse.coo_matrix((ests, (uids ,iids)))
    sparse_vals = sparse.coo_matrix((r_uis, (uids ,iids)))
    
    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()  
    def cal_mrr(y_true, y_score):
        mrr_list = []
        for i in range(len(y_score)):
            rank = np.argsort(y_score[i])[::-1]
            '''
            rank = np.argsort([1,2,3,2,1])[::-1]
            print(rank)
            array([2, 3, 1, 4, 0])
            '''        
            pos = 0
            for r in rank:
                pos += 1
                if y_true[i][r] == 1:
                    rr = 1/pos
                    mrr_list.append(rr) 
                    break
        return sum(mrr_list)/len(mrr_list)
    return cal_mrr(y_true=dense_vals, y_score=dense_preds) 

In [15]:
get_mrr(predictions)

0.9719438521677329

In [16]:
def get_map(surprise_predictions, k_highest_scores=None):    
    from sklearn.metrics import label_ranking_average_precision_score
    from scipy import sparse
    
    uids = np.array([p.uid for p in surprise_predictions ]).astype(np.int)
    iids = np.array([p.iid for p in surprise_predictions ]).astype(np.int)
    # 三分以上判斷為相關 低於三分判斷為不相關
    r_uis = np.array([1 if p.r_ui >= 3 else 0  for p in surprise_predictions ]).astype(np.float)
    ests = np.array([p.est for p in surprise_predictions ]).astype(np.float)

    sparse_preds = sparse.coo_matrix((ests, (uids ,iids )))
    sparse_vals = sparse.coo_matrix((r_uis, (uids ,iids )))
    
    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()    
    return label_ranking_average_precision_score(y_true=dense_vals , y_score=dense_preds) 

In [17]:
get_map(predictions)

0.929360512381753

In [18]:
def get_ndcg(surprise_predictions, k_highest_scores=None):

    from sklearn.metrics import ndcg_score
    from scipy import sparse
    
    uids = np.array([p.uid for p in surprise_predictions ]).astype(np.int)
    iids = np.array([p.iid for p in surprise_predictions ]).astype(np.int)
    r_uis = np.array([p.r_ui for p in surprise_predictions ]).astype(np.float)
    ests = np.array([p.est for p in surprise_predictions ]).astype(np.float)

    sparse_preds = sparse.coo_matrix((ests, (uids ,iids )))
    sparse_vals = sparse.coo_matrix((r_uis, (uids ,iids )))

    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()
    
    return ndcg_score(y_true=dense_vals , y_score=dense_preds, k=k_highest_scores) 

In [19]:
get_ndcg(predictions)

0.9529628194243384

# Item CF

In [20]:
sim_options = {'name': 'cosine',
               'user_based': False  
               }
# 使用 Item CF 參考：https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measures-configuration
algo = KNNBasic(sim_options=sim_options)

algo.fit(trainset)
predictions = algo.test(testset)



Computing the cosine similarity matrix...
Done computing similarity matrix.


In [21]:
predictions

[Prediction(uid='345', iid='715', r_ui=4.0, est=3.9707501668953533, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='92', iid='998', r_ui=2.0, est=2.95, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='934', iid='195', r_ui=4.0, est=3.9996056220423477, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='586', iid='423', r_ui=2.0, est=3.3495852681629112, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='336', iid='383', r_ui=1.0, est=2.6747209662681706, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='654', iid='678', r_ui=4.0, est=4.122282100935417, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='64', iid='511', r_ui=4.0, est=3.8492557774826834, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='425', iid='209', r_ui=2.0, est=3.2243969934291714, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='821', iid='132', r_ui=5.0, est=4.4769301

In [22]:
accuracy.rmse(predictions)

RMSE: 1.0329


1.0328841239762445

In [23]:
get_mrr(predictions)

0.936558802625528

In [24]:
get_map(predictions)

0.8988611952109944

In [25]:
get_ndcg(predictions)

0.9356459749062643