### 基于均值填充的方法
- 全局平均评分：
$\bar{r}=\sum_{u,i}y_{ui}r_{ui}/\sum_{u,i}y_{ui}$
- 用户u的平均评分：
$\bar{r}_u=\sum_iy_{ui}r_{ui}/\sum_iy_{ui}$
- 物品i的平均评分：
$\bar{r}_i=\sum_uy_{ui}r_{ui}/\sum_uy_{ui}$
- 用户u的偏置：
$b_u=\sum_iy_{ui}(r_{ui}-\bar{r}_i)/\sum_iy_{ui}$
- 物品i的偏置：
$b_i=\sum_uy_{ui}(r_{ui}-\bar{r}_u)/\sum_uy_{ui}$
- 其中，$y_{ui}\in\{0,1\}$ 表示用户u是否对物品i有评分（即只对已观测到的评分求和）；用户偏置表示该用户u是宽容的（$b_u>0$）还是挑剔的（$b_u<0$）；物品偏置表示该物品i是受欢迎的（$b_i>0$）还是不受欢迎的（$b_i<0$）。从而得到以下几种预测规则：
- 用户平均：$\hat{r}_{ui}=\bar{r}_u$
- 物品平均：$\hat{r}_{ui}=\bar{r}_i$
- 用户平均和物品平均的均值：$\hat{r}_{ui}=\bar{r}_u/2+\bar{r}_i/2$
- 用户偏置与物品平均之和：$\hat{r}_{ui}=b_u+\bar{r}_i$
- 用户平均与物品偏置之和：$\hat{r}_{ui}=\bar{r}_u+b_i$
- 全局平均、用户偏置、物品偏置的和：$\hat{r}_{ui}=\bar{r}+b_u+b_i$


In [1]:
import numpy as np
import pandas as pd

# 读取数据
def load_data(train_file, test_file):
    """Read the training and test rating files into DataFrames.

    Both files are parsed as tab-separated text without a header row; the
    four columns are named ``user``, ``item``, ``rating`` and ``timestamp``.

    Args:
        train_file: Path of the training ratings file.
        test_file: Path of the test ratings file.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames.
    """
    columns = ['user', 'item', 'rating', 'timestamp']

    def read_split(path):
        # Both splits share exactly the same on-disk layout.
        return pd.read_csv(path, sep='\t', header=None, names=columns)

    return read_split(train_file), read_split(test_file)

# 计算全局平均评分、用户平均评分和物品平均评分
def calculate_means(train_data):
    """Compute the global, per-user and per-item mean ratings.

    Args:
        train_data: DataFrame with ``user``, ``item`` and ``rating`` columns.

    Returns:
        A ``(global_mean, user_means, item_means)`` tuple: the mean of the
        ``rating`` column, the mean rating grouped by ``user``, and the mean
        rating grouped by ``item`` (both as Series indexed by the group key).
    """
    def mean_rating_by(column):
        # Average of the ratings sharing the same value in ``column``.
        return train_data.groupby(column)['rating'].mean()

    overall = train_data['rating'].mean()
    return overall, mean_rating_by('user'), mean_rating_by('item')

# 计算用户偏置和物品偏置
def calculate_biases(train_data, user_means, item_means):
    """Compute the per-user and per-item rating biases.

    A user's bias is the average of ``rating - item mean`` over the ratings
    that user gave; an item's bias is the average of ``rating - user mean``
    over the ratings that item received. A rating whose item (or user) has
    no entry in the corresponding means contributes a deviation of 0.

    The deviations are computed column-wise instead of with a row-by-row
    ``DataFrame.apply``: that avoids running a Python lambda per rating,
    avoids the dtype upcast ``apply(axis=1)`` performs on every row (which
    can turn the lookup keys into floats), and leaves ``train_data``
    untouched without having to copy it.

    Args:
        train_data: DataFrame with ``user``, ``item`` and ``rating`` columns.
        user_means: Series of mean ratings indexed by user id.
        item_means: Series of mean ratings indexed by item id.

    Returns:
        A ``(user_biases, item_biases)`` tuple of Series, indexed by user id
        and item id and named ``user_bias`` and ``item_bias`` respectively.
    """
    ratings = train_data['rating']
    users = train_data['user']
    items = train_data['item']

    # Deviation of every rating from its item's mean; unknown items count as 0.
    dev_from_item = (ratings - items.map(item_means)).where(
        items.isin(item_means.index), 0.0
    )
    # Deviation of every rating from its user's mean; unknown users count as 0.
    dev_from_user = (ratings - users.map(user_means)).where(
        users.isin(user_means.index), 0.0
    )

    user_biases = dev_from_item.groupby(users).mean().rename('user_bias')
    item_biases = dev_from_user.groupby(items).mean().rename('item_bias')

    return user_biases, item_biases

# 预测评分
def predict_ratings(test_data, global_mean, user_means, item_means, user_biases, item_biases):
    """Predict every test rating with six mean-filling rules.

    The rules, keyed by method id, are:

    * 0 -- user mean
    * 1 -- item mean
    * 2 -- average of user mean and item mean
    * 3 -- item mean plus user bias
    * 4 -- user mean plus item bias
    * 5 -- global mean plus user bias plus item bias

    Users/items without an entry fall back to ``global_mean`` for the means
    and to 0 for the biases.

    The four per-row lookups are done once, column-wise, and then combined,
    instead of iterating over the test rows six times with ``iterrows``.

    Args:
        test_data: DataFrame with ``user`` and ``item`` columns.
        global_mean: Mean of all training ratings.
        user_means: Series of mean ratings indexed by user id.
        item_means: Series of mean ratings indexed by item id.
        user_biases: Series of user biases indexed by user id.
        item_biases: Series of item biases indexed by item id.

    Returns:
        Dict mapping each method id (0-5) to a list of predicted ratings,
        in the row order of ``test_data``.
    """
    users = test_data['user']
    items = test_data['item']

    def lookup(keys, table, default):
        # Value of ``table`` for every key, or ``default`` where the key has
        # no entry (the column-wise counterpart of ``table.get(key, default)``).
        return keys.map(table).where(keys.isin(table.index), default).astype(float)

    user_avg = lookup(users, user_means, global_mean)
    item_avg = lookup(items, item_means, global_mean)
    user_off = lookup(users, user_biases, 0)
    item_off = lookup(items, item_biases, 0)

    by_method = {
        0: user_avg,                             # user mean
        1: item_avg,                             # item mean
        2: (user_avg + item_avg) / 2,            # user and item mean
        3: item_avg + user_off,                  # item mean + user bias
        4: user_avg + item_off,                  # user mean + item bias
        5: global_mean + user_off + item_off,    # global mean + both biases
    }

    return {method: preds.tolist() for method, preds in by_method.items()}

# 计算RMSE和MAE
def calculate_rmse_mae(test_data, predictions):
    """Score each set of predictions against the true test ratings.

    Args:
        test_data: DataFrame whose ``rating`` column holds the true ratings.
        predictions: Dict mapping a method id to its predicted ratings, one
            per row of ``test_data``.

    Returns:
        Dict mapping each method id to a ``(rmse, mae)`` tuple.
    """
    actual = test_data['rating'].values

    def score(preds):
        # The residual is shared by both metrics.
        errors = actual - preds
        return np.sqrt(np.mean(errors ** 2)), np.mean(np.abs(errors))

    return {method: score(preds) for method, preds in predictions.items()}

# 主函数
def main(train_file, test_file):
    """Run the whole evaluation and print RMSE and MAE for every method.

    Loads both splits, derives the means and biases from the training split,
    predicts the test ratings with each rule and prints one line per method.

    Args:
        train_file: Path of the training ratings file.
        test_file: Path of the test ratings file.
    """
    train_data, test_data = load_data(train_file, test_file)

    # Statistics are estimated from the training split only.
    global_mean, user_means, item_means = calculate_means(train_data)
    biases = calculate_biases(train_data, user_means, item_means)

    predictions = predict_ratings(
        test_data, global_mean, user_means, item_means, *biases
    )

    for method, (rmse, mae) in calculate_rmse_mae(test_data, predictions).items():
        print(f'Method {method}: RMSE = {rmse:.4f}, MAE = {mae:.4f}')

# File paths: hard-coded absolute Windows locations of the training and test
# splits (presumably the MovieLens 100K "u1" split, judging by the directory
# and file names -- adjust to wherever the data lives locally).
train_file = 'E:\\ml-100k\\ml-100k\\u1.base'
test_file = 'E:\\ml-100k\\ml-100k\\u1.test'

# Run the main function: evaluates all six rules and prints their RMSE/MAE.
main(train_file, test_file)


Method 0: RMSE = 1.0630, MAE = 0.8502
Method 1: RMSE = 1.0334, MAE = 0.8276
Method 2: RMSE = 0.9985, MAE = 0.8085
Method 3: RMSE = 0.9602, MAE = 0.7574
Method 4: RMSE = 0.9758, MAE = 0.7696
Method 5: RMSE = 0.9623, MAE = 0.7613
