In [1]:
from itertools import combinations
import pandas as pd

In [2]:
# Smoothing term added to the size of two users' shared-item set in the
# Swing score denominator; swing_model reads it as a module-level global.
alpha = 0.5
# Module-level default for how many similar items to keep per item.
# NOTE: the __main__ block rebinds this name to 10 before it is used.
top_k = 20
def load_data(train_path):
    """
    Read the interaction log into a DataFrame.

    The file is parsed as tab-separated text without a header row; its three
    columns are labelled "userid", "itemid" and "rate" (user id, item id,
    rating). The first three rows are printed as a quick sanity check.

    :param train_path: path of the tab-separated ratings file
    :return: DataFrame with columns ["userid", "itemid", "rate"]
    """
    column_names = ["userid", "itemid", "rate"]
    # Load the user interaction records.
    interactions = pd.read_csv(
        train_path,
        sep="\t",
        engine="python",
        names=column_names,
    )
    preview = interactions.head(3)
    print(preview)
    return interactions

In [9]:
def get_uitems_iusers(train: pd.DataFrame):
    """
    Build the two interaction indexes used by the Swing computation.

    :param train: DataFrame with at least the columns "userid" and "itemid"
                  (one row per user-item interaction)
    :return: tuple ``(u_items, i_users)`` where
             ``u_items`` maps each user id to the set of item ids the user
             interacted with, and ``i_users`` maps each item id to the set of
             user ids that interacted with it. Keys appear in order of first
             occurrence in ``train``.
    """
    u_items = dict()  # key: user id, value: set of items the user interacted with
    i_users = dict()  # key: item id, value: set of users that interacted with the item

    # Iterate only the two id columns instead of DataFrame.iterrows():
    # iterrows() turns every row into a Series with a single dtype, so a float
    # rating column would silently turn integer ids into floats. tolist()
    # keeps each column's own type and hands back plain Python scalars.
    user_ids = train["userid"].tolist()
    item_ids = train["itemid"].tolist()
    for user_id, item_id in zip(user_ids, item_ids):
        u_items.setdefault(user_id, set()).add(item_id)  # all items seen by this user
        i_users.setdefault(item_id, set()).add(user_id)  # all users seen for this item

    print("使用的用户个数为：{}".format(len(u_items)))
    print("使用的item个数为：{}".format(len(i_users)))
    return u_items, i_users

In [12]:
def swing_model(u_items: dict, i_users: dict, alpha=None):
    """
    Compute Swing similarities for every pair of items.

    For an item pair (i, j), every pair of users (u, v) that both interacted
    with i and with j contributes 1 / (alpha + |N(u) ∩ N(v)|), where N(x) is
    the set of items user x interacted with:

        sim(i, j) = sum(1 / (alpha + |N(u) ∩ N(v)|)) for all u, v in N(i) ∩ N(j)

    :param u_items: maps user id -> set of item ids the user interacted with
    :param i_users: maps item id -> set of user ids that interacted with it
    :param alpha: smoothing term of the denominator. When omitted (None) the
                  module-level ``alpha`` is used, so existing two-argument
                  calls behave exactly as before.
    :return: nested dict ``{item_i: {item_j: score}}``. Each score is a string
             formatted with six decimals. A pair is stored only under the item
             that comes first in ``i_users`` key order (there is no mirrored
             ``[item_j][item_i]`` entry), and pairs whose score is 0 are
             omitted.
    """
    if alpha is None:
        # The parameter shadows the module-level name, so fetch the global
        # explicitly to keep the original default behaviour.
        alpha = globals()["alpha"]

    items = list(i_users)
    item_count = len(items)
    # Number of unordered pairs, computed directly so the O(n^2) pair list
    # never has to be held in memory just to be counted.
    print("item pairs length：{}".format(item_count * (item_count - 1) // 2))

    # Similarity table being built.
    item_sim_dict = dict()
    for i, j in combinations(items, 2):
        # Users that interacted with both item i and item j.
        common_users = i_users[i] & i_users[j]
        result = 0
        for u, v in combinations(common_users, 2):
            # Penalise user pairs that share many items overall.
            result += 1 / (alpha + len(u_items[u] & u_items[v]))
        if result != 0:
            # Keep six decimals; stored as a string, as callers already expect.
            item_sim_dict.setdefault(i, dict())[j] = format(result, ".6f")
    return item_sim_dict

In [None]:
def save_item_sims(item_sim_dict: dict, top_k: int, path: str):
    """
    Write each item's top_k most similar items to a text file.

    One line is written per item, in the form
    ``item_id:<item>\\t<dict of similar item -> score>``, with the similar
    items ordered from highest to lowest score.

    :param item_sim_dict: nested dict ``{item: {other_item: score}}``; scores
                          may be numbers or numeric strings
    :param top_k: maximum number of similar items kept per item
    :param path: output file path (overwritten, UTF-8)
    :return: None. Any error is printed instead of raised (best-effort save).
    """
    try:
        # The context manager guarantees the file is flushed and closed even
        # if writing fails part-way through.
        with open(path, 'w', encoding='utf-8') as writer:
            for item, sim_items in item_sim_dict.items():
                # Rank by numeric value: scores may be strings, and comparing
                # them as text would put '9.000000' above '10.000000'.
                ranked = sorted(
                    sim_items.items(),
                    key=lambda pair: float(pair[1]),
                    reverse=True,
                )
                top_sims = dict(ranked[:top_k])
                writer.write('item_id:%d\t%s\n' % (item, top_sims))
    except Exception as e:
        # Deliberately broad: saving is best-effort and must not abort the run.
        print(e.args)
    else:
        print("SUCCESS: top_{} item saved".format(top_k))

In [None]:
# Script entry point: load ratings, build the interaction indexes, compute
# Swing similarities and write the top-k similar items per item to disk.
if __name__ == "__main__":
    train_data_path = "./ratings_final.txt"  # input: tab-separated userid / itemid / rate
    item_sim_save_path = "./item_sim_dict.txt"  # output: one line per item
    top_k = 10 # number of most similar items kept per item (rebinds the module-level top_k)
    train = load_data(train_data_path)
    u_items, i_users = get_uitems_iusers(train)
    item_sim_dict = swing_model(u_items, i_users)
    save_item_sims(item_sim_dict, top_k, item_sim_save_path)