In [2]:
import os
import json

import pandas as pd
import numpy as np

# 基于用户的协同过滤

In [3]:
def load_data():
    """Return the toy rating table keyed by user.

    Returns:
        dict: ``{user_name: {item_id: rating}}``. "Alice" has no rating for
        item "E"; every other user has rated all five items.
    """
    item_ids = ("A", "B", "C", "D", "E")
    rating_rows = {
        "Alice": (5, 3, 4, 4),
        "user1": (3, 1, 2, 3, 3),
        "user2": (4, 3, 4, 3, 5),
        "user3": (3, 3, 1, 5, 4),
        "user4": (1, 5, 5, 2, 1),
    }
    # zip stops at the shorter sequence, so Alice's four ratings map to
    # A-D and she simply gets no entry for "E".
    return {name: dict(zip(item_ids, row)) for name, row in rating_rows.items()}


In [4]:
user_data = load_data()
# Start from the identity matrix: every user is perfectly similar to
# themselves (diagonal = 1.0); the off-diagonal entries start at 0.0.
similarity_matrix = pd.DataFrame(
    np.eye(len(user_data)),
    index=list(user_data),
    columns=list(user_data),
)
similarity_matrix

Unnamed: 0,Alice,user1,user2,user3,user4
Alice,1.0,0.0,0.0,0.0,0.0
user1,0.0,1.0,0.0,0.0,0.0
user2,0.0,0.0,1.0,0.0,0.0
user3,0.0,0.0,0.0,1.0,0.0
user4,0.0,0.0,0.0,0.0,1.0


In [8]:
# Fill every off-diagonal cell with the Pearson correlation between the two
# users, computed over the items that both of them have rated.
for u1, items1 in user_data.items():
    for u2, items2 in user_data.items():
        if u1 == u2:
            # The diagonal already holds 1.0 from the identity matrix.
            continue
        # Membership test instead of a -1 sentinel, so no rating value can
        # ever be mistaken for "not rated".
        common_items = [item for item in items1 if item in items2]
        if len(common_items) < 2:
            # Pearson correlation is undefined with fewer than two shared
            # ratings; np.corrcoef would return NaN and emit warnings, so
            # store NaN directly.
            similarity_matrix.loc[u2, u1] = np.nan
            continue
        # vec1 / vec2 hold the two users' ratings for the shared items,
        # aligned position by position.
        vec1 = [items1[item] for item in common_items]
        vec2 = [items2[item] for item in common_items]
        # np.corrcoef returns the 2x2 correlation matrix: the diagonal is each
        # vector's self-correlation (1), [0, 1] is the cross-correlation.
        # A single .loc write (row u2, column u1) replaces the chained
        # `similarity_matrix[u1][u2] = ...`, which does not update the frame
        # when pandas Copy-on-Write is active.
        similarity_matrix.loc[u2, u1] = np.corrcoef(vec1, vec2)[0, 1]

print(similarity_matrix)

          Alice     user1     user2     user3     user4
Alice  1.000000  0.852803  0.707107  0.000000 -0.792118
user1  0.852803  1.000000  0.467707  0.489956 -0.900149
user2  0.707107  0.467707  1.000000 -0.161165 -0.466569
user3  0.000000  0.489956 -0.161165  1.000000 -0.641503
user4 -0.792118 -0.900149 -0.466569 -0.641503  1.000000


In [10]:
target_user = "Alice"
num = 2
# Take the target user's similarity column, remove the user's own entry by
# label, then keep the `num` highest similarities. Dropping by label (rather
# than slicing off the first sorted row) stays correct even when another user
# ties with the self-similarity of 1.0.
sim_users = (
    similarity_matrix[target_user]
    .drop(target_user)
    .sort_values(ascending=False)
    .head(num)
    .index.tolist()
)
print(f'与用户{target_user}最相似的{num}个用户为：{sim_users}')

与用户Alice最相似的2个用户为：['user1', 'user2']


In [11]:
weighted_scores = 0.  # running sum of similarity * (neighbour rating - neighbour mean)
corr_values_sum = 0.  # running sum of |similarity|, used as the normaliser

target_item = "E"
# Predict the target user's rating from the top-K similar users, using their
# mean-centred ratings weighted by Pearson similarity.
for user in sim_users:
    neighbour_ratings = user_data[user]
    if target_item not in neighbour_ratings:
        # A neighbour who has not rated the item carries no information.
        continue
    # Similarity between this neighbour and the target user
    # (row = neighbour, column = target user).
    corr_value = similarity_matrix.loc[user, target_user]
    # Mean of all ratings given by this neighbour.
    user_mean_rating = np.mean(list(neighbour_ratings.values()))

    weighted_scores += corr_value * (neighbour_ratings[target_item] - user_mean_rating)
    # Normalise by the absolute similarity so that negatively correlated
    # neighbours cannot shrink or flip the sign of the denominator.
    corr_values_sum += abs(corr_value)

# Mean of all ratings given by the target user; this is the baseline.
target_user_mean_rating = np.mean(list(user_data[target_user].values()))
if corr_values_sum > 0:
    target_item_pred = target_user_mean_rating + weighted_scores / corr_values_sum
else:
    # No usable neighbour (none rated the item, or the weights are zero/NaN):
    # fall back to the target user's own mean rating.
    target_item_pred = target_user_mean_rating
print(f'用户{target_user}对物品{target_item}的预测评分为：{target_item_pred}')

用户Alice对物品E的预测评分为：4.871979899370592


# 基于物品的协同过滤

In [13]:
def load_data():
    """Return the toy rating table keyed by item.

    Unlike the earlier function of the same name, the outer key here is the
    item and the inner dict maps each user to that user's rating of the item.

    Returns:
        dict: ``{item_id: {user_name: rating}}`` with float ratings. Item "E"
        has no rating from "Alice".
    """
    raters = ("Alice", "user1", "user2", "user3", "user4")
    # One row per item, one column per rater; None marks a missing rating.
    rating_rows = {
        "A": (5.0, 3.0, 4.0, 3.0, 1.0),
        "B": (3.0, 1.0, 3.0, 3.0, 5.0),
        "C": (4.0, 2.0, 4.0, 1.0, 5.0),
        "D": (4.0, 3.0, 3.0, 5.0, 2.0),
        "E": (None, 3.0, 5.0, 4.0, 1.0),
    }
    return {
        item: {
            rater: rating
            for rater, rating in zip(raters, row)
            if rating is not None
        }
        for item, row in rating_rows.items()
    }

In [14]:
item_data = load_data()

# Start from the identity matrix: each item is perfectly similar to itself.
similarity_matrix = pd.DataFrame(
    np.identity(len(item_data)),
    index=list(item_data),
    columns=list(item_data),
)

# Fill every off-diagonal cell with the Pearson correlation between the two
# items, computed over the users who have rated both of them.
for i1, users1 in item_data.items():
    for i2, users2 in item_data.items():
        if i1 == i2:
            # The diagonal already holds 1.0 from the identity matrix.
            continue
        # Membership test instead of a -1 sentinel, so no rating value can
        # ever be mistaken for "not rated".
        common_users = [user for user in users1 if user in users2]
        if len(common_users) < 2:
            # Pearson correlation is undefined with fewer than two shared
            # raters; np.corrcoef would return NaN and emit warnings, so
            # store NaN directly.
            similarity_matrix.loc[i2, i1] = np.nan
            continue
        # vec1 / vec2 hold the shared users' ratings of the two items,
        # aligned position by position.
        vec1 = [users1[user] for user in common_users]
        vec2 = [users2[user] for user in common_users]
        # A single .loc write (row i2, column i1) replaces the chained
        # `similarity_matrix[i1][i2] = ...`, which does not update the frame
        # when pandas Copy-on-Write is active.
        similarity_matrix.loc[i2, i1] = np.corrcoef(vec1, vec2)[0, 1]

print(similarity_matrix)

          A         B         C         D         E
A  1.000000 -0.476731 -0.123091  0.532181  0.969458
B -0.476731  1.000000  0.645497 -0.310087 -0.478091
C -0.123091  0.645497  1.000000 -0.720577 -0.427618
D  0.532181 -0.310087 -0.720577  1.000000  0.581675
E  0.969458 -0.478091 -0.427618  0.581675  1.000000


In [16]:
target_user = "Alice"
target_item = "E"
num = 2

# Rank the other items by similarity to the target item, most similar first.
# The target item is dropped by label so it can never be chosen as its own
# neighbour (which would happen if the target user had already rated it).
sim_items_list = (
    similarity_matrix[target_item]
    .drop(target_item)
    .sort_values(ascending=False)
    .index.tolist()
)
# Keep only items the target user has rated, then take the `num` most similar.
# Slicing also handles num == 0 and the case of fewer than `num` candidates.
sim_items = [item for item in sim_items_list if target_user in item_data[item]][:num]
print(f'与物品{target_item}最相似的{num}个物品为：{sim_items}')

与物品E最相似的2个物品为：['A', 'D']


In [19]:
# Set the target item first so that every value below is derived from this
# cell's own assignment rather than from whatever an earlier cell left behind.
target_item = "E"
# Mean of all ratings the target item has received; this is the baseline.
target_item_mean_rating = np.mean(list(item_data[target_item].values()))
weighted_scores = 0.  # running sum of similarity * (rating - item mean)
corr_values_sum = 0.  # running sum of |similarity|, used as the normaliser

for item in sim_items:
    item_ratings = item_data[item]
    if target_user not in item_ratings:
        # An item the target user has not rated carries no information.
        continue
    # Similarity between this neighbour item and the target item
    # (row = neighbour item, column = target item).
    corr_value = similarity_matrix.loc[item, target_item]
    # Mean of all ratings this neighbour item has received.
    item_mean_rating = np.mean(list(item_ratings.values()))

    weighted_scores += corr_value * (item_ratings[target_user] - item_mean_rating)
    # Normalise by the absolute similarity so that negatively correlated
    # items cannot shrink or flip the sign of the denominator.
    corr_values_sum += abs(corr_value)

if corr_values_sum > 0:
    target_item_pred = target_item_mean_rating + weighted_scores / corr_values_sum
else:
    # No usable neighbour item (or the weights are zero/NaN): fall back to
    # the target item's mean rating.
    target_item_pred = target_item_mean_rating
print(f'用户{target_user}对物品{target_item}的预测评分为：{target_item_pred}')

用户Alice对物品E的预测评分为：4.6
