In [1]:
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from collections import defaultdict

# Load the ratings table; the timestamp column is not used for rating prediction.
df = pd.read_csv('ratings_small.csv').drop(['timestamp'],axis=1)

# Take the rating scale from the data instead of hard-coding (1, 5): the ratings
# contain half-star values (e.g. 2.5, 3.5), so the true minimum may lie below 1,
# and Surprise clips every estimate to this range.
reader = Reader(rating_scale=(df['rating'].min(), df['rating'].max()))

# Build a Surprise dataset from the (user, item, rating) columns using the reader.
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)

In [2]:
# Hold out 25% of the ratings as the test set and train on the remaining 75%.
# A fixed random_state makes the split (and the user/item counts derived from
# it) reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

trainset是surprise.Trainset类型的对象，我们可以查看数据的基本信息

In [3]:
# Number of users in the training split.
trainset.n_users 

671

In [4]:
# Number of items (movies) in the training split.
trainset.n_items 

8244

训练模型，指定有100个隐含特征，使用训练集进行训练

In [5]:
# Fit an SVD model with 100 latent factors on the training set. The factors are
# randomly initialised, so random_state is fixed to get the same model each run.
model = SVD(n_factors=100, random_state=42)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1185cb150>

100个隐含特征是指，原本671*8244的矩阵会被拆分成671*100和100*8244的两个矩阵乘积，n_factors值可以任意指定只要不超过8244即可，但是设置不同的值将会拟合出不同的模型，需要选择使结果较优的值。
查看拆分出来的两个矩阵

In [6]:
# Shape of the learned user-factor matrix: one row per user, one column per latent factor.
model.pu.shape

(671, 100)

In [7]:
# Shape of the learned item-factor matrix: one row per item, one column per latent factor.
model.qi.shape 

(8244, 100)

predictions是调用surprise算法自带接口model.test()得到的输出值，其数据结构是一个由Prediction对象组成的列表。

In [8]:
# Estimate a rating for every (user, item) pair in the held-out test set.
predictions = model.test(testset)
# Preview only the first few predictions instead of dumping the whole list.
predictions[:10]

[Prediction(uid=388, iid=27846, r_ui=3.5, est=3.9386946226679975, details={u'was_impossible': False}),
 Prediction(uid=124, iid=6942, r_ui=3.0, est=3.629294929727777, details={u'was_impossible': False}),
 Prediction(uid=573, iid=595, r_ui=4.0, est=3.434209761122944, details={u'was_impossible': False}),
 Prediction(uid=472, iid=43396, r_ui=4.0, est=3.7480043657204325, details={u'was_impossible': False}),
 Prediction(uid=547, iid=141, r_ui=1.0, est=2.955209944068966, details={u'was_impossible': False}),
 Prediction(uid=58, iid=2475, r_ui=3.0, est=3.26030804943019, details={u'was_impossible': False}),
 Prediction(uid=457, iid=7445, r_ui=2.5, est=2.5934507364534687, details={u'was_impossible': False}),
 Prediction(uid=105, iid=1210, r_ui=3.0, est=3.6159719168762647, details={u'was_impossible': False}),
 Prediction(uid=468, iid=5444, r_ui=2.5, est=3.121701410553091, details={u'was_impossible': False}),
 Prediction(uid=195, iid=457, r_ui=4.0, est=2.9910640078596873, details={u'was_impossible

定义get_top_n()函数，它能根据predictions结果进行解析，获取top_n字典，该字典的key是user-id，value是该user打分（预测值）最高的n个item-id

In [9]:
def get_top_n(predictions, n=10):
    """Return the n highest-estimated items for each user.

    Args:
        predictions: Iterable of 5-field records unpacked as
            (user id, item id, true rating, estimated rating, details),
            such as the list returned by a Surprise algorithm's ``test()``.
        n: Maximum number of items to keep per user. Defaults to 10.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimated rating)`` tuples, ordered from the highest to
        the lowest estimate and truncated to at most ``n`` entries.
    """
    per_user = defaultdict(list)

    # Group the (item, estimate) pairs by the user they were predicted for.
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each user's items by estimated rating and keep only the best n.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

使用get_top_n()函数，获取测试集中每个用户预测得分最高的n（等于5）个item-id，并将其print出来，得到结果如下

In [10]:
# Collect the five best-scored test-set items for every user, then print each
# user id next to the ids of those items (the estimates themselves are dropped).
top_n = get_top_n(predictions, n=5)
for uid in top_n:
    recommended_ids = [iid for iid, _est in top_n[uid]]
    print(uid, recommended_ids)

(1, [1343, 31, 1405, 1129])
(2, [319, 720, 457, 47, 590])
(3, [2959, 7361, 296, 7153, 1210])
(4, [1219, 1198, 1270, 356, 1210])
(5, [40819, 3897, 356, 5816, 1997])
(6, [2019, 111, 1204, 2761, 596])
(7, [1196, 1210, 1, 1148, 541])
(8, [1198, 527, 4226, 2858, 2918])
(9, [318, 593, 2291, 1358, 47])
(10, [318, 1089, 1198, 1210, 1197])
(11, [296, 96079, 1201, 71211, 2596])
(12, [1077, 2959, 1230, 529, 3791])
(13, [7361, 1961, 69757, 47, 4306])
(14, [1196, 3175, 2861, 1721])
(15, [1198, 1252, 2959, 91542, 1060])
(16, [750, 2797, 6874, 2278])
(17, [7153, 527, 296, 5782, 1193])
(18, [85, 628, 62, 92, 32])
(19, [1198, 969, 318, 515, 912])
(20, [1148, 904, 110, 2762, 905])
(21, [1256, 527, 1254, 969, 1228])
(22, [858, 2571, 5952, 3868, 2502])
(23, [778, 1254, 1203, 3317, 3983])
(24, [6, 648, 165, 380, 316])
(25, [778, 1356, 1358, 1483, 1405])
(26, [58559, 260, 38038, 48516, 3578])
(27, [296, 111, 1704, 1610, 1584])
(28, [2019, 913, 2324, 1225, 899])
(29, [2571, 73268, 6934])
(30, [318, 1228, 127