### 基于均值填充的方法
- 全局平均评分：
$\bar{r}=\sum_{u,i}y_{ui}r_{ui}/\sum_{u,i}y_{ui}$
- 用户u的平均评分：
$\bar{r}_u=\sum_iy_{ui}r_{ui}/\sum_iy_{ui}$
- 物品i的平均评分：
$\bar{r}_i=\sum_uy_{ui}r_{ui}/\sum_uy_{ui}$
- 用户u的偏置：
$b_u=\sum_iy_{ui}(r_{ui}-\bar{r}_i)/\sum_iy_{ui}$
- 物品i的偏置：
$b_i=\sum_uy_{ui}(r_{ui}-\bar{r}_u)/\sum_uy_{ui}$
- 其中，用户偏置表示该用户u是宽容的还是挑剔的（b_u<0）；物品的偏置表示该物品是受欢迎还是不受欢迎。从而：
- 用户平均：$\hat{r}_{ui}=\bar{r}_u$
- 物品平均：$\hat{r}_{ui}=\bar{r}_i$
- 用户平均和物品平均均值：$\hat{r}_{ui}=\bar{r}_u/2+\bar{r}_i/2$
- 用户偏置与物品平均之和：$\hat{r}_{ui}=b_u+\bar{r}_i$
- 用户平均与物品偏置之和：$\hat{r}_{ui}=\bar{r}_u+b_i$
- 全局平均、用户偏置、物品偏置的和：$\hat{r}_{ui}=\bar{r}+b_u+b_i$


In [4]:
import pandas as pd
import numpy as np
  
class AverageFillingRecommender:
    def __init__(self, train_file, test_file):
        # 读取训练集和测试集数据
        self.train_data = pd.read_csv(train_file, sep='\t', header=None, names=['user_id', 'item_id', 'rating'])
        self.test_data = pd.read_csv(test_file, sep='\t', header=None, names=['user_id', 'item_id', 'rating'])
        # 初始化变量
        self.user_mean = {}
        self.item_mean = {}
        self.global_mean = 0
        self.user_bias = {}
        self.item_bias = {}

    def calculate_means_and_biases(self):
        # 计算全局平均评分
        self.global_mean = self.train_data['rating'].mean()
        
        # 计算用户平均评分
        user_group = self.train_data.groupby('user_id')['rating']
        self.user_mean = user_group.mean().to_dict()

        # 计算物品平均评分
        item_group = self.train_data.groupby('item_id')['rating']
        self.item_mean = item_group.mean().to_dict()

        # 计算用户偏置
        for user in self.user_mean:
            item_ratings = self.train_data[self.train_data['user_id'] == user]
            self.user_bias[user] = (item_ratings['rating'] - item_ratings['item_id'].map(self.item_mean)).mean()

        # 计算物品偏置
        for item in self.item_mean:
            user_ratings = self.train_data[self.train_data['item_id'] == item]
            self.item_bias[item] = (user_ratings['rating'] - user_ratings['user_id'].map(self.user_mean)).mean()

    def predict_rating(self, user_id, item_id):
        # 使用不同的策略进行评分预测
        user_pred = self.user_mean.get(user_id, self.global_mean)
        item_pred = self.item_mean.get(item_id, self.global_mean)
        
        # 用户偏置与物品偏置
        user_bias = self.user_bias.get(user_id, 0)
        item_bias = self.item_bias.get(item_id, 0)

        # 预测评分：这里使用的是用户平均与物品偏置之和的公式
        return min(max(user_pred + item_bias, 1), 5)  # 限制在[1, 5]范围内

    def evaluate(self): # 在评估这里用到的test的数据
        # 评估模型的误差
        mae = 0
        mse = 0
        for _, row in self.test_data.iterrows():
            predicted = self.predict_rating(row['user_id'], row['item_id'])
            mae += abs(predicted - row['rating'])
            mse += (predicted - row['rating']) ** 2
        
        mae /= len(self.test_data)
        mse /= len(self.test_data)
        rmse = np.sqrt(mse)

        print(f'MAE: {mae}, RMSE: {rmse}')

if __name__ == "__main__":
    # 文件路径
    train_file = 'E:\\ml-100k\\ml-100k\\u1.base'  # 训练数据
    test_file = 'E:\\ml-100k\\ml-100k\\u1.test'   # 测试数据

    recommender = AverageFillingRecommender(train_file, test_file)
    recommender.calculate_means_and_biases()  # 计算均值和偏置
    recommender.evaluate()  # 评估模型


  mse += (predicted - row['rating']) ** 2


MAE: 883386378.83015, RMSE: 10189607.946008861
