In [10]:
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from collections import defaultdict
from surprise import NMF
from surprise.model_selection import GridSearchCV

df = pd.read_csv('ratings_small.csv').drop(['timestamp'],axis=1)
df.describe()

Unnamed: 0,userId,movieId,rating
count,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608
std,195.163838,26369.198969,1.058064
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,671.0,163949.0,5.0


In [2]:
#df = pd.read_csv('ratings_small.csv').drop(['timestamp'],axis=1)
reader = Reader(rating_scale=(1, 5))

#使用reader格式从文件中读取数据
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)

#拆分训练集与测试集，75%的样本作为训练集，25%的样本作为测试集
trainset, testset = train_test_split(data, test_size=.25)

trainset的类型是surprise.dataset.Trainset类型，我们可以查看数据的基本信息

In [3]:
trainset.n_users 

671

In [4]:
trainset.n_items 

8214

训练模型，指定有100个隐含特征，使用训练集进行训练

In [5]:
model = SVD(n_factors=100)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x118c7bd90>

100个隐含特征是指，原本671*8263的矩阵会被拆分成671*100和100*8263的两个矩阵乘积，n_factors值可以任意指定只要不超过8263即可，但是设置不同的值将会拟合出不同的模型，需要选择使结果较优的值。
查看拆分出来的两个矩阵

In [6]:
model.pu.shape

(671, 100)

In [7]:
model.qi.shape 

(8214, 100)

In [9]:
#GridSearchCV调参
#指定参数选择范围
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],'n_factors' : [100, 1000]}

#'reg_all': [0.4, 0.6]

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=4)
gs.fit(data)

# 打印最好的均方根误差RMSE
print(gs.best_score['rmse'])

# 打印取得最好RMSE的参数集合
print(gs.best_params['rmse'])

# 现在可以使用产生最佳RMSE的算法
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())


0.9041107360095156
{'lr_all': 0.005, 'n_factors': 100, 'n_epochs': 10}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x118c7b110>

In [15]:
#GridSearchCV调参
#指定参数选择范围
param_grid_nmf = {'n_epochs': [50, 100], 'reg_pu': [0.01, 0.10],'n_factors' : [10, 50]}

#'reg_all': [0.4, 0.6]

gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse', 'mae'], cv=4)
gs_nmf.fit(data)

# 打印最好的均方根误差RMSE
print(gs_nmf.best_score['rmse'])

# 打印取得最好RMSE的参数集合
print(gs_nmf.best_params['rmse'])

# 现在可以使用产生最佳RMSE的算法
algo_nmf = gs_nmf.best_estimator['rmse']
gs_nmf.fit(data.build_full_trainset())

0.9281112539053873
{'n_factors': 50, 'reg_pu': 0.1, 'n_epochs': 100}


AttributeError: Trainset instance has no attribute 'raw_ratings'

predictions的数据结构，是surprise中的算法自带接口model.test()的输出值。

In [None]:
predictions = algo.test(testset)
predictions

定义get_top_n()函数，它能根据predictions结果进行解析，获取top_n字典，该字典的key是user-id，value是该user打分（预测值）最高的n个item-id

In [None]:
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

使用get_top_n()函数，获取测试集中所有用户得分最高的n（等于2）个item-id，并将print出来，得到结果如下

In [None]:
top_n = get_top_n(predictions, n=5)
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

In [None]:
#使用NMF
algo_nmf.fit(trainset)
pred_nmf = algo.test(testset)
top_nmf_n = get_top_n(pred_nmf, n=5)
for uid, user_ratings in top_nmf_n.items():
    print(uid, [iid for (iid, _) in user_ratings])