In [2]:
from collections.abc import Iterator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Presumably the step size for position corrections; not referenced in the cells shown here.
learning_rate = 0.001
# Length of each node's position vector; passed to get_nodes() when the nodes are built.
dimension = 40

In [4]:
def data_preprocess(data_path:str)->pd.DataFrame:
    """
    Load a ratings CSV and return it trimmed and shuffled.

    :param data_path: path of the CSV file to read
    :return: the first three columns of the file, with rows in a shuffled
        order that is reproducible (fixed seed 42); the original index
        labels are kept
    """
    return (
        pd.read_csv(data_path)
        .iloc[:, :3]                        # keep only the first three columns (drops the timestamp)
        .sample(frac=1, random_state=42)    # full-size sample == deterministic shuffle
    )

In [5]:
# Load the ratings file (first three columns, shuffled with a fixed seed) and preview the first rows.
data_raw = data_preprocess("ratings.csv")
data_raw.head()

Unnamed: 0,userId,movieId,rating
67037,432,77866,4.5
42175,288,474,3.0
93850,599,4351,3.0
6187,42,2987,4.0
12229,75,1610,4.0


In [6]:
# K-fold splitting
def k_fold_split(data:pd.DataFrame, k:int)->Iterator[tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Lazily yield the k (train, validation) splits of ``data``.

    Rows are split by position into k contiguous folds; no shuffling is done
    here. Fold ``i`` covers positions ``[i*n//k, (i+1)*n//k)``, so fold sizes
    differ by at most one row and every row lands in exactly one validation
    fold.

    :param data: frame to split
    :param k: number of folds, must be >= 1
    :return: generator of ``(train, validation)`` pairs; ``validation`` is a
        positional slice of ``data`` and ``train`` is the concatenation of the
        rows before and after that slice
    :raises ValueError: if ``k < 1`` (raised when iteration starts, because
        this is a generator)
    """
    if k < 1:
        # range(k) would be empty and the caller's loop body would silently never run.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    n_rows = len(data)
    for fold in range(k):
        # Integer arithmetic avoids float rounding in the fold boundaries.
        start = fold * n_rows // k
        end = (fold + 1) * n_rows // k
        yield pd.concat([data.iloc[:start], data.iloc[end:]]), data.iloc[start:end]

In [7]:
# Work with the first of the ten folds only. The validation fold is a
# positional slice of data_raw, so both frames are copied before use.
_train_set, _valid_set = next(k_fold_split(data_raw, 10))
train_set = _train_set.copy()
valid_set = _valid_set.copy()

len(train_set), len(valid_set)

(90753, 10083)

In [8]:
class Node:
    """A graph node: an id, a neighbor list and a randomly placed position vector."""

    def __init__(self, Id:int, neighbors:list[int], dimension:int=2):
        """
        :param Id: identifier of this node
        :param neighbors: ids of the nodes adjacent to this one (stored by
            reference, not copied)
        :param dimension: number of components of the position vector
        """
        self.Id, self.neighbors = Id, neighbors
        # Uniform samples in [0, 1), drawn from NumPy's global random state.
        self.position = np.random.rand(dimension)

In [9]:
def get_nodes(data:pd.DataFrame, data_train:pd.DataFrame, dimension:int=40)->tuple[dict, dict]:
    """
    Build one node per user and one node per movie.

    Every distinct ``userId`` / ``movieId`` found in ``data`` gets a node, but
    neighbor lists are filled from ``data_train`` only, so an id that never
    occurs in the training rows gets an empty neighbor list.

    :param data: frame with "userId" and "movieId" columns; defines which
        nodes exist
    :param data_train: frame with "userId" and "movieId" columns; defines the
        edges (one neighbor entry per training row)
    :param dimension: length of each node's position vector
    :return: ``(user_nodes, item_nodes)``, two dicts mapping an integer id to
        its ``Node``; both are filled in ascending id order, users first
    """
    # Columns are read by name so extra columns or a different column order
    # in data_train cannot silently mix up the ids, and ids stay integers.
    user_movies: dict[int, list[int]] = {}
    movie_users: dict[int, list[int]] = {}
    for raw_user, raw_movie in zip(data_train["userId"], data_train["movieId"]):
        user_id, movie_id = int(raw_user), int(raw_movie)
        user_movies.setdefault(user_id, []).append(movie_id)
        movie_users.setdefault(movie_id, []).append(user_id)

    def _sorted_ids(column: str) -> list[int]:
        # Distinct non-missing ids in ascending order (the order groupby would give).
        return sorted(int(value) for value in data[column].dropna().unique())

    # Users are created before items so the random positions are drawn in a
    # fixed order for a given global seed.
    user_nodes = {
        user_id: Node(user_id, user_movies.get(user_id, []), dimension)
        for user_id in _sorted_ids("userId")
    }
    item_nodes = {
        movie_id: Node(movie_id, movie_users.get(movie_id, []), dimension)
        for movie_id in _sorted_ids("movieId")
    }
    return user_nodes, item_nodes

In [10]:
# Nodes are created for every user/movie in the full data; neighbor lists come from the training fold only.
_user_nodes, _item_nodes = get_nodes(data_raw, train_set, dimension)

In [11]:
def correction_user(train_set:pd.DataFrame, user_node:"Node", item_nodes:"dict[int, Node]", maxR:float=5.0, minR:float=0.5, max_distance:float=100.0)->float:
    """
    Compute the distance-weighted rating error of one user against a set of rated movies.

    For every row, the rating is predicted from the Euclidean distance between
    the user and the movie (``maxR`` at distance 0, falling linearly to
    ``minR`` at ``max_distance``). The errors ``real - predicted`` are then
    averaged with weights proportional to ``1 / distance``, normalised over
    the rows so they sum to 1.

    This function only computes the value; it does not modify
    ``user_node.position``.

    :param train_set: rows to score, with "movieId" and "rating" columns
        (presumably only this user's ratings -- TODO confirm against the caller)
    :param user_node: node of the user; only its ``position`` is read
    :param item_nodes: movie nodes keyed by integer movie id
    :param maxR: rating predicted at distance 0
    :param minR: rating predicted at distance ``max_distance``
    :param max_distance: distance at which the prediction reaches ``minR``;
        must be non-zero
    :return: the weighted error, ``0.0`` when ``train_set`` has no rows
    :raises KeyError: if a movie id in ``train_set`` is missing from ``item_nodes``
    """
    if len(train_set) == 0:
        return 0.0

    real_ratings = train_set["rating"].to_numpy(dtype=float)
    # Each distance is computed once and reused for both the weights and the prediction.
    distances = np.array(
        [
            np.linalg.norm(user_node.position - item_nodes[int(movie_id)].position)
            for movie_id in train_set["movieId"]
        ],
        dtype=float,
    )

    # Coincident positions would give 1/0; clamping to machine epsilon keeps
    # the weights finite while leaving every non-zero distance untouched.
    weights = 1.0 / np.maximum(distances, np.finfo(float).eps)
    weights /= weights.sum()

    pred_ratings = maxR - (maxR - minR) * distances / max_distance
    return float(np.dot(real_ratings - pred_ratings, weights))

In [12]:
def correction_item(train_set:pd.DataFrame, item_node:Node, user_nodes:dict[int, Node], maxR:float=5.0, minR:float=0.5):
    """
    Accumulate a distance-weighted rating error for one item node.

    :param train_set: rows to score; each row must provide "userId" and "rating"
        (presumably only the ratings of this item -- TODO confirm against the caller)
    :param item_node: item node whose position is compared with each user's position
    :param user_nodes: user nodes keyed by integer user id
    :param maxR: rating predicted at distance 0
    :param minR: rating predicted at distance 100
    """
    # Pass 1: weight of each user = inverse Euclidean distance to the item.
    # The dict is keyed by user id, so repeated rows for one user share one entry.
    # NOTE(review): identical positions give a zero norm and a division by zero here.
    weight = {}
    for _, row in train_set.iterrows():
        user_id = int(row["userId"])
        user_node = user_nodes[user_id]
        weight[user_id] = 1 / np.linalg.norm(user_node.position - item_node.position)
    # Normalise so the weights of the distinct users sum to 1.
    sim_sum = sum(weight.values())
    for user_id in weight.keys():
        weight[user_id] /= sim_sum

    # Pass 2: weighted sum of (real rating - rating predicted from the distance).
    correction = 0
    for _, row in train_set.iterrows():
        user_id = int(row["userId"])
        user_node = user_nodes[user_id]
        real_rating = row["rating"]
        pred_distance = np.linalg.norm(user_node.position - item_node.position)
        # Linear map from distance to rating: maxR at distance 0, minR at distance 100.
        pred_rating = maxR - (maxR - minR) * pred_distance / 100
        error = real_rating - pred_rating
        # NOTE(review): in the lines shown, `correction` is only accumulated; it is
        # neither returned nor applied to item_node.position.
        correction += error * weight[user_id]