In [None]:
'''
Author: yaobinsu 68414437+yaobinsu@users.noreply.github.com
Date: 2023-06-13 17:29:48
LastEditors: yaobinsu 68414437+yaobinsu@users.noreply.github.com
LastEditTime: 2023-06-13 17:29:59
FilePath: /wrs/wrs.ipynb
Description: Book-rating recommender exercises. Exercise 1 cleans the train/test rating data and reports basic statistics; Exercise 2 grid-searches KNNBasic and SVD models with the Surprise library.
'''


## Exercise 1


In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the rating files. The default is the original
# dev-container location; set the WRS_DATA_DIR environment variable to run
# the notebook elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get('WRS_DATA_DIR', '/workspaces/wrs/data'))

# Load the raw train/test rating tables and report their sizes before cleaning.
train = pd.read_csv(DATA_DIR / 'train.data')
test = pd.read_csv(DATA_DIR / 'test.data')
print('Original train data shape: ', train.shape)
print('Original test data shape: ', test.shape)

def _clean_ratings(ratings: pd.DataFrame) -> pd.DataFrame:
    """Drop unrated rows and keep one rating per (user_id, book_id) pair.

    Rows whose ``rating`` is missing are removed. The remaining rows are
    sorted by ``user_id``, ``book_id`` and ``timestamp`` (all ascending) so
    that ``keep='last'`` retains, for each pair, the row that sorts last by
    timestamp.

    Args:
        ratings: Frame with at least the columns ``user_id``, ``book_id``,
            ``rating`` and ``timestamp``.

    Returns:
        A new, de-duplicated frame; the input frame is not modified.
    """
    return (
        ratings
        .dropna(subset=['rating'])
        .sort_values(['user_id', 'book_id', 'timestamp'])
        .drop_duplicates(subset=['user_id', 'book_id'], keep='last')
    )


# Apply the identical cleaning steps to both splits.
train = _clean_ratings(train)
test = _clean_ratings(test)

# Keep only test rows whose user also appears in the training set.
test = test[test['user_id'].isin(train['user_id'])]

# Number of ratings per user and per book in the training set.
user_rating_count, item_rating_count = (
    train.groupby(key)['rating'].count() for key in ('user_id', 'book_id')
)

# Summary statistics of both count distributions.
for heading, counts in (
    ('User rating distribution: \n', user_rating_count),
    ('Item rating distribution: \n', item_rating_count),
):
    print(heading, counts.describe())

# The five books that received the most ratings.
top_5_popular_items = item_rating_count.nlargest(5)
print('Top 5 popular items: \n', top_5_popular_items)


Original train data shape:  (32335, 5)
Original test data shape:  (9158, 5)
User rating distribution: 
 count    5681.00000
mean        5.69178
std         7.94423
min         2.00000
25%         2.00000
50%         3.00000
75%         6.00000
max       158.00000
Name: rating, dtype: float64
Item rating distribution: 
 count    4462.000000
mean        7.246750
std        15.503312
min         1.000000
25%         3.000000
50%         4.000000
75%         6.000000
max       474.000000
Name: rating, dtype: float64
Top 5 popular items: 
 book_id
23513349    474
20821284    299
30075802    271
18263725    252
1420        191
Name: rating, dtype: int64


## Exercise 2

In [1]:
import random

import numpy as np
# Requires the `scikit-surprise` package (imported as `surprise`); the recorded
# run of this cell failed with ModuleNotFoundError because it was not installed
# in the kernel's environment.
from surprise import KNNBasic, SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV

# Seed both global RNGs up front. Per the Surprise FAQ, cross-validation folds
# and some algorithms' parameter initialisation are drawn from the `random` /
# NumPy generators, so without a seed every run can report different RMSEs and
# select different "best" hyper-parameters.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Wrap the training ratings in a Surprise Dataset.
# NOTE(review): rating_scale=(1, 5) assumes every rating lies in [1, 5] — confirm against the data.
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train[['user_id', 'book_id', 'rating']], reader)

# Grid-search each model family with 3-fold cross-validation, scored by RMSE.
# GridSearchCV takes the algorithm *class* and instantiates it itself for
# every parameter combination, so no model instances are created here.

# Neighbourhood-based model: neighbourhood size, similarity measure, and
# user-based vs. item-based similarity.
knn_param_grid = {
    'k': [10, 20, 30, 40, 50],
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson_baseline'],
        'user_based': [False, True],
    },
}
knn_gs = GridSearchCV(KNNBasic, knn_param_grid, measures=['rmse'], cv=3)
knn_gs.fit(train_data)

# Latent-factor model: number of training epochs and number of factors.
svd_param_grid = {'n_epochs': [5, 10, 20], 'n_factors': [50, 100, 150]}
svd_gs = GridSearchCV(SVD, svd_param_grid, measures=['rmse'], cv=3)
svd_gs.fit(train_data)

# Report the best hyper-parameters and the matching cross-validated RMSE
# for each grid search.
for header, search in (
    ('For KNNBasic model, optimal hyperparameters: ', knn_gs),
    ('For SVD model, optimal hyperparameters: ', svd_gs),
):
    print(header, search.best_params['rmse'])
    print('Corresponding RMSE: ', search.best_score['rmse'])

# Retrain each model with its best hyper-parameters on the complete training
# set. `best_estimator['rmse']` is the algorithm configured with the winning
# parameters; it is fitted explicitly below.
full_trainset = train_data.build_full_trainset()  # build once, shared by both models

knn_model = knn_gs.best_estimator['rmse']
svd_model = svd_gs.best_estimator['rmse']

knn_model.fit(full_trainset)
svd_model.fit(full_trainset)


ModuleNotFoundError: No module named 'surprise'