In [1]:
from collections.abc import Iterator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Step size used for each position update during training.
learning_rate = 0.01
# Number of coordinates in every user/item position vector.
dimension = 40
# Lower and upper bounds of the rating scale.
minR = 0.5
maxR = 5.0

In [3]:
def data_preprocess(data_path:str)->pd.DataFrame:
    """
    Load the ratings CSV and prepare it for splitting.

    :param data_path: path of the CSV file to read
    :return: the first three columns of the file with the rows shuffled
             using a fixed seed (42), so the order is reproducible
    """
    ratings = pd.read_csv(data_path)
    # Keeping only the first three columns is how the timestamp is dropped.
    first_three_columns = ratings.iloc[:, :3]
    return first_three_columns.sample(frac=1, random_state=42)

In [4]:
# Read and shuffle the ratings file, then preview the first rows.
data_raw = data_preprocess(data_path="ratings.csv")
data_raw.head()

Unnamed: 0,userId,movieId,rating
67037,432,77866,4.5
42175,288,474,3.0
93850,599,4351,3.0
6187,42,2987,4.0
12229,75,1610,4.0


In [5]:
# 划分k折
def k_fold_split(data:pd.DataFrame, k:int)->Iterator[tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Lazily split ``data`` into ``k`` contiguous folds.

    This is a generator: on each iteration one contiguous slice of rows is
    held out as the validation set and the remaining rows form the training
    set. Rows are taken in their current order; nothing is shuffled here.

    :param data: the frame to split
    :param k: number of folds, must be at least 1
    :return: an iterator of ``(train, valid)`` DataFrame pairs, one per fold
    :raises ValueError: if ``k`` is smaller than 1 (raised when iteration starts)
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    data_len = len(data)
    for i in range(k):
        # Integer floor division gives exact fold boundaries without going
        # through floating point.
        start = i * data_len // k
        end = (i + 1) * data_len // k
        # Training rows are everything outside [start, end).
        yield pd.concat([data.iloc[:start], data.iloc[end:]]), data.iloc[start:end]

In [6]:
# Only the first of the 10 folds is used: take it and stop iterating.
_train_set, _valid_set = next(k_fold_split(data_raw, 10))
# Copy the fold slices so train_set / valid_set are independent DataFrames.
train_set, valid_set = _train_set.copy(), _valid_set.copy()

len(train_set), len(valid_set)

(90753, 10083)

In [7]:
class Node:
    """A graph vertex holding an id, a neighbor list and a random position."""
    def __init__(self, Id:int, neighbors:list[int], dimension:int=2):
        """
        :param Id: identifier of this node
        :param neighbors: ids of the nodes adjacent to this one
        :param dimension: length of the position vector
        """
        self.Id, self.neighbors = Id, neighbors
        # The position starts as `dimension` uniform samples from [0, 1).
        self.position = np.random.rand(dimension)

In [8]:
class User(Node):
    """A user vertex.

    It adds nothing to Node: construction is inherited unchanged, so it takes
    the same ``(Id, neighbors, dimension=2)`` arguments. The subclass exists so
    user nodes can be told apart with ``isinstance``.
    """

class Item(Node):
    """An item vertex.

    It adds nothing to Node: construction is inherited unchanged, so it takes
    the same ``(Id, neighbors, dimension=2)`` arguments. The subclass exists so
    item nodes can be told apart with ``isinstance``.
    """

In [9]:
def get_nodes(data:pd.DataFrame, data_train:pd.DataFrame, dimension:int=40)->tuple[dict, dict]:
    """
    Build one User node per user and one Item node per movie found in ``data``.

    Neighbor lists come from ``data_train`` only. Users or movies that appear
    in ``data`` but not in ``data_train`` still get a node, with an empty
    neighbor list.

    :param data: frame with ``userId`` and ``movieId`` columns; defines which
                 nodes exist
    :param data_train: frame with ``userId`` and ``movieId`` columns; defines
                       which nodes are neighbors of each other
    :param dimension: length of each node's position vector
    :return: ``(user_nodes, item_nodes)``, two dicts keyed by integer id
    """
    user_movies: dict[int, list[int]] = {}   # user id -> movie ids rated in training
    movie_users: dict[int, list[int]] = {}   # movie id -> user ids who rated it in training

    # The id columns are selected by name, so extra columns in data_train
    # (e.g. a "predict" column) cannot break the unpacking. Both maps are
    # filled in one pass, with ids converted to int before they become keys.
    id_pairs = zip(data_train["userId"].to_numpy(), data_train["movieId"].to_numpy())
    for raw_user, raw_movie in id_pairs:
        user_id, movie_id = int(raw_user), int(raw_movie)
        user_movies.setdefault(user_id, []).append(movie_id)
        movie_users.setdefault(movie_id, []).append(user_id)

    # Ids are visited in sorted order, users before items, which matches the
    # order produced by iterating a groupby on the id column.
    user_nodes = {}
    for raw_user in sorted(data["userId"].dropna().unique()):
        user_id = int(raw_user)
        user_nodes[user_id] = User(user_id, user_movies.get(user_id, []), dimension)

    item_nodes = {}
    for raw_movie in sorted(data["movieId"].dropna().unique()):
        movie_id = int(raw_movie)
        item_nodes[movie_id] = Item(movie_id, movie_users.get(movie_id, []), dimension)

    return user_nodes, item_nodes

In [10]:
# Nodes are created for every id in data_raw; neighbors come from train_set only.
_user_nodes, _item_nodes = get_nodes(data=data_raw, data_train=train_set, dimension=dimension)

In [11]:
def predict(valid_set:pd.DataFrame, user_nodes:dict, item_nodes:dict, maxR:float=5., minR:float=.5, distance_scale:float=100.):
    """
    Predict a rating for every row of ``valid_set`` from node positions.

    The Euclidean distance between the user's and the item's position is
    mapped linearly onto the rating scale: distance 0 gives ``maxR`` and
    distance ``distance_scale`` gives ``minR``. The result is clipped to
    ``[minR, maxR]``.

    :param valid_set: frame with ``userId`` and ``movieId`` columns; it is
                      modified in place (a float ``predict`` column is added
                      or overwritten)
    :param user_nodes: dict of user id -> node with a ``position`` array
    :param item_nodes: dict of item id -> node with a ``position`` array
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :param distance_scale: distance that corresponds to the lowest rating
    :return: the same ``valid_set`` object, now carrying the ``predict`` column
    :raises KeyError: if a row refers to an id missing from the node dicts
    """
    id_pairs = zip(valid_set["userId"].to_numpy(), valid_set["movieId"].to_numpy())
    distances = np.fromiter(
        (
            np.linalg.norm(user_nodes[int(user)].position - item_nodes[int(item)].position)
            for user, item in id_pairs
        ),
        dtype=float,
        count=len(valid_set),
    )
    # The whole float column is assigned at once. Creating an integer column
    # and writing floats into it row by row triggers pandas'
    # incompatible-dtype deprecation and is far slower.
    valid_set["predict"] = np.clip(maxR - (maxR - minR) * distances / distance_scale, minR, maxR)
    return valid_set

In [15]:
def update(train_set:pd.DataFrame, user_nodes:dict, item_nodes:dict, learning_rate:float=0.001, maxR:float=5., minR:float=0.5, distance_scale:float=100.)->tuple[dict, dict]:
    """
    Perform one stochastic update step on a randomly chosen node.

    A node is drawn uniformly from all user and item nodes, then one of its
    neighbors is drawn uniformly. The rating between the two is converted to
    a target distance, and the chosen node's position is moved in place along
    the line through both positions. Only the chosen node moves; the neighbor
    is left untouched.

    :param train_set: frame with ``userId``, ``movieId`` and ``rating`` columns
    :param user_nodes: dict of user id -> User node
    :param item_nodes: dict of item id -> Item node
    :param learning_rate: step size of the position update
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :param distance_scale: distance that corresponds to the lowest rating
    :return: ``(user_nodes, item_nodes)``, the same dict objects that were passed in
    :raises IndexError: if the chosen pair has no rating in ``train_set``
    """
    nodes = list(user_nodes.values()) + list(item_nodes.values())
    if not nodes:
        return user_nodes, item_nodes
    # Draw an index rather than passing the objects to np.random.choice,
    # which would first convert the whole list into an object array.
    selected_node = nodes[np.random.choice(len(nodes))]

    neighbors = selected_node.neighbors
    if len(neighbors) == 0:
        # Nodes without training ratings cannot be updated.
        return user_nodes, item_nodes
    neighbor_id = np.random.choice(neighbors)

    # Work out which side of the (user, item) pair was selected.
    if isinstance(selected_node, User):
        neighbor_node = item_nodes[neighbor_id]
        user_id, item_id = selected_node.Id, neighbor_node.Id
    else:
        neighbor_node = user_nodes[neighbor_id]
        user_id, item_id = neighbor_node.Id, selected_node.Id

    is_pair = (train_set["userId"] == user_id) & (train_set["movieId"] == item_id)
    real_rating = train_set.loc[is_pair, "rating"].values[0]

    # Map the rating onto a target distance: maxR -> 0, minR -> distance_scale.
    real_distance = distance_scale * (maxR - real_rating) / (maxR - minR)
    direction = selected_node.position - neighbor_node.position
    pred_distance = np.linalg.norm(direction)
    error = real_distance - pred_distance
    # Positive error (too close) moves the node away from the neighbor,
    # negative error (too far) moves it closer. The direction vector is
    # deliberately left unnormalized, as in the original update rule.
    selected_node.position += learning_rate * error * direction

    return user_nodes, item_nodes

In [16]:
def train(train_set:pd.DataFrame, valid_set:pd.DataFrame, user_nodes:dict, item_nodes:dict, learning_rate:float=0.001, maxR:float=5., minR:float=0.5, epoch:int=10, eval_every:int=1000)->tuple[dict, dict]:
    """
    Run ``epoch`` single-node update steps and report validation error periodically.

    Each iteration calls ``update`` once, so ``epoch`` counts individual
    update steps rather than passes over the training set. Every
    ``eval_every`` steps (including step 0) the validation set is scored
    with ``predict`` and MAE / RMSE are printed.

    :param train_set: training ratings
    :param valid_set: validation ratings; ``predict`` adds a ``predict`` column to it
    :param user_nodes: dict of user id -> User node (positions are updated in place)
    :param item_nodes: dict of item id -> Item node (positions are updated in place)
    :param learning_rate: step size passed to ``update``
    :param maxR: highest possible rating
    :param minR: lowest possible rating
    :param epoch: number of update steps to run
    :param eval_every: number of steps between two validation reports
    :return: ``(user_nodes, item_nodes)`` after training
    :raises ValueError: if ``eval_every`` is smaller than 1
    """
    if eval_every < 1:
        raise ValueError(f"eval_every must be a positive integer, got {eval_every}")
    for i in range(epoch):
        user_nodes, item_nodes = update(train_set, user_nodes, item_nodes, learning_rate, maxR, minR)
        if i % eval_every == 0:
            print("epoch: ", i)
            valid_set = predict(valid_set, user_nodes, item_nodes, maxR, minR)
            # The residuals are computed once and shared by both metrics.
            residuals = valid_set["predict"] - valid_set["rating"]
            mae = np.mean(np.abs(residuals))
            rmse = np.sqrt(np.mean(np.square(residuals)))
            print("mae: ", mae, "rmse: ", rmse)
    return user_nodes, item_nodes

In [17]:
from copy import deepcopy

# Train on deep copies so the freshly initialised nodes in _user_nodes /
# _item_nodes stay untouched and the run can be repeated from the same start.
_user_nodes_trained, _item_nodes_trained = train(
    train_set,
    valid_set,
    deepcopy(_user_nodes),
    deepcopy(_item_nodes),
    learning_rate=learning_rate,
    maxR=maxR,
    minR=minR,
    epoch=100000,
)

epoch:  0
mae:  1.4221274381673448 rmse:  1.7439617101495613
epoch:  1000
mae:  1.4188961271541385 rmse:  1.7406194224679923
epoch:  2000
mae:  1.4161746696414015 rmse:  1.7378564229712739
epoch:  3000
mae:  1.4129694944145146 rmse:  1.734330904112993
epoch:  4000
mae:  1.4103052087458987 rmse:  1.7313785058260212
epoch:  5000
mae:  1.4042584598350658 rmse:  1.7246754985350827
epoch:  6000
mae:  1.3998293150087742 rmse:  1.7193994909291417
epoch:  7000
mae:  1.3961454211111004 rmse:  1.7152241914243003
epoch:  8000
mae:  1.3912604287674737 rmse:  1.7097461260792626
epoch:  9000
mae:  1.387132336887542 rmse:  1.7052377181624307
epoch:  10000
mae:  1.3828281524177033 rmse:  1.700309420650842
epoch:  11000
mae:  1.3791225702428855 rmse:  1.6961408820732529
epoch:  12000
mae:  1.3745964596177072 rmse:  1.6912676168915597
epoch:  13000
mae:  1.369835644080901 rmse:  1.6861772952593095
epoch:  14000
mae:  1.3647404532742349 rmse:  1.6807252495814542
epoch:  15000
mae:  1.3576755087223256 rms

In [None]:
# Display the user-node dict returned by train().
_user_nodes_trained

Unnamed: 0,userId,movieId,rating
76059,479,351,2.0
14430,91,2431,2.0
43498,292,111,2.0
73590,474,1784,3.5
19181,124,110,3.5
...,...,...,...
6265,42,4005,4.0
54886,364,141,4.0
76820,480,6867,4.0
860,6,981,3.0
