In [385]:
from collections.abc import Iterator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [386]:
# Step size multiplied into every position update of the training loop.
learning_rate = 0.001
# Length of the position vector created for each user and item node.
dimension = 40

In [387]:
def data_preprocess(data_path:str)->pd.DataFrame:
    """
    Load a ratings CSV and prepare it for splitting.

    :param data_path: path of the CSV file to read
    :return: the first three columns of the file (this drops the trailing
             timestamp column), with the rows shuffled using a fixed seed
    """
    return (
        pd.read_csv(data_path)                  # read the raw file
        .iloc[:, :3]                            # keep only the first three columns
        .sample(frac=1, random_state=42)        # deterministic shuffle of all rows
    )

In [388]:
# Load the ratings file through data_preprocess (first three columns, shuffled rows).
data_raw = data_preprocess("ratings.csv")
# Disabled experiment: add two columns per user holding that user's highest and lowest rating
# data_raw["max"] = data_raw.groupby("userId")["rating"].transform("max")
# data_raw["min"] = data_raw.groupby("userId")["rating"].transform("min")
data_raw.head()

Unnamed: 0,userId,movieId,rating
67037,432,77866,4.5
42175,288,474,3.0
93850,599,4351,3.0
6187,42,2987,4.0
12229,75,1610,4.0


In [389]:
# Split the data into k folds
def k_fold_split(data:pd.DataFrame, k:int)->Iterator[tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Lazily yield the k train/validation splits of a k-fold partition.

    The rows are cut into k contiguous slices in their current order (no
    shuffling happens here). Fold i uses slice i as the validation set and
    the concatenation of all remaining rows as the training set. When k is
    larger than len(data), some validation slices are empty.

    :param data: the full data set
    :param k: number of folds, must be >= 1
    :return: generator of (train_set, valid_set) pairs, one per fold
    :raises ValueError: if k < 1 (raised when iteration starts, because this
                        is a generator function)
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    data_len = len(data)
    for i in range(k):
        # Exact integer arithmetic; avoids the float round trip of int(i * n / k).
        start = i * data_len // k
        end = (i + 1) * data_len // k
        # Training rows are everything outside [start, end); validation rows are the slice itself.
        yield pd.concat([data.iloc[:start], data.iloc[end:]]), data.iloc[start:end]

In [390]:
# Take only the first of the 10 folds as a single train/validation split.
_train_set, _valid_set = next(k_fold_split(data_raw, 10))
# Work on copies so columns added later land on independent frames.
train_set = _train_set.copy()
valid_set = _valid_set.copy()

len(train_set), len(valid_set)

(90753, 10083)

In [391]:
class Node:
    """A graph node: a position vector plus the ids of its neighbours."""
    def __init__(self, neighbors:list[int], dimension:int=2):
        """
        :param neighbors: ids of the nodes on the other side of this node's edges
        :param dimension: length of the position vector
        """
        self.neighbor = neighbors
        # Random start position; np.random.rand draws uniformly from [0, 1).
        self.position = np.random.rand(dimension)

In [392]:
class User(Node):
    """User node: a Node that also stores the user's id and rating bounds."""
    def __init__(self, user_id:int, maxRating:float, minRating:float, neighbors: list[int], dimension: int=2):
        """
        :param user_id: id of the user
        :param maxRating: upper rating bound stored for this user (the notebook passes 5.0)
        :param minRating: lower rating bound stored for this user (the notebook passes 0.5)
        :param neighbors: ids of the items attached to this user
        :param dimension: length of the position vector
        """
        super().__init__(neighbors, dimension)
        self.user_id = user_id
        self.maxRating = maxRating
        self.minRating = minRating

class Item(Node):
    """Item node: a Node that also stores the item's id."""
    def __init__(self, item_id:int, neighbors: list[int], dimension: int=2):
        """
        :param item_id: id of the item
        :param neighbors: ids of the users attached to this item
        :param dimension: length of the position vector
        """
        super().__init__(neighbors=neighbors, dimension=dimension)
        self.item_id = item_id

In [393]:
# User nodes: one per userId, with the movies that user rated as neighbours.
# Every user gets the same fixed rating bounds (5.0 as maxRating, 0.5 as minRating).
user_nodes = {
    user_id: User(user_id, 5.0, 0.5, list(user_rows["movieId"]), dimension=dimension)
    for user_id, user_rows in data_raw.groupby("userId")
}

print(len(user_nodes))

# Item nodes: one per movieId, with the users who rated that movie as neighbours.
item_nodes = {
    movie_id: Item(movie_id, list(movie_rows["userId"]), dimension=dimension)
    for movie_id, movie_rows in data_raw.groupby("movieId")
}

print(len(item_nodes))

610
9724


In [394]:
user_nodes[1].__dict__

{'position': array([0.29087275, 0.51022472, 0.83051972, 0.3030798 , 0.97720333,
        0.30240388, 0.26097202, 0.87152515, 0.67894707, 0.84786611,
        0.06162842, 0.63779078, 0.82138329, 0.75132235, 0.2169713 ,
        0.50328695, 0.99417281, 0.26381474, 0.65075542, 0.78506351,
        0.66399167, 0.90364218, 0.08971231, 0.41233774, 0.11261357,
        0.85786972, 0.75289117, 0.0432998 , 0.95720576, 0.82880752,
        0.2437637 , 0.68637856, 0.40719269, 0.19576624, 0.10306772,
        0.94639306, 0.98808901, 0.2454836 , 0.86675132, 0.19174813]),
 'neighbor': [3740,
  2353,
  1278,
  596,
  2115,
  2529,
  1927,
  2078,
  3034,
  2899,
  1500,
  1198,
  1587,
  1029,
  423,
  2492,
  2048,
  2797,
  3578,
  2991,
  1580,
  2640,
  2387,
  1954,
  2459,
  2116,
  1644,
  1617,
  2329,
  673,
  1282,
  2193,
  2987,
  3639,
  151,
  1214,
  2700,
  3671,
  1445,
  2090,
  2427,
  1030,
  1793,
  2139,
  2993,
  3062,
  2366,
  3033,
  2949,
  2617,
  593,
  2161,
  3729,
  2033,
  2

In [395]:
user_nodes[1].__dict__

{'position': array([0.29087275, 0.51022472, 0.83051972, 0.3030798 , 0.97720333,
        0.30240388, 0.26097202, 0.87152515, 0.67894707, 0.84786611,
        0.06162842, 0.63779078, 0.82138329, 0.75132235, 0.2169713 ,
        0.50328695, 0.99417281, 0.26381474, 0.65075542, 0.78506351,
        0.66399167, 0.90364218, 0.08971231, 0.41233774, 0.11261357,
        0.85786972, 0.75289117, 0.0432998 , 0.95720576, 0.82880752,
        0.2437637 , 0.68637856, 0.40719269, 0.19576624, 0.10306772,
        0.94639306, 0.98808901, 0.2454836 , 0.86675132, 0.19174813]),
 'neighbor': [3740,
  2353,
  1278,
  596,
  2115,
  2529,
  1927,
  2078,
  3034,
  2899,
  1500,
  1198,
  1587,
  1029,
  423,
  2492,
  2048,
  2797,
  3578,
  2991,
  1580,
  2640,
  2387,
  1954,
  2459,
  2116,
  1644,
  1617,
  2329,
  673,
  1282,
  2193,
  2987,
  3639,
  151,
  1214,
  2700,
  3671,
  1445,
  2090,
  2427,
  1030,
  1793,
  2139,
  2993,
  3062,
  2366,
  3033,
  2949,
  2617,
  593,
  2161,
  3729,
  2033,
  2

In [396]:
def predict(valid_set:pd.DataFrame, user_nodes, item_nodes, *,
            max_rating:float=5.0, min_rating:float=0.5, max_distance:float=100.0):
    """
    Predict a rating for every row of the validation set and report the error.

    The predicted rating falls linearly with the Euclidean distance between
    the user's and the item's position: distance 0 maps to ``max_rating`` and
    distance ``max_distance`` maps to ``min_rating``. Results are clamped to
    ``[min_rating, max_rating]``.

    Side effects: writes a float column ``"predict"`` into ``valid_set``
    (in place) and prints the two metrics.

    :param valid_set: frame with "userId", "movieId" and "rating" columns
    :param user_nodes: mapping user id -> object with a ``position`` array
    :param item_nodes: mapping item id -> object with a ``position`` array
    :param max_rating: rating assigned to distance 0
    :param min_rating: rating assigned to distance ``max_distance``
    :param max_distance: distance that corresponds to ``min_rating``
    :return: tuple ``(RMSE, MAE)``; both are nan for an empty ``valid_set``
    :raises KeyError: if a user or item id has no node
    """
    user_ids = valid_set["userId"].astype(int)
    item_ids = valid_set["movieId"].astype(int)
    # Collect all distances in one pass and assign the column once. A
    # boolean-mask .loc write per row would rescan the whole index every time.
    distances = np.array(
        [
            np.linalg.norm(user_nodes[u].position - item_nodes[m].position)
            for u, m in zip(user_ids, item_ids)
        ],
        dtype=float,
    )
    raw_ratings = max_rating - (max_rating - min_rating) * distances / max_distance
    # Assigning a float array creates a float column directly.
    valid_set["predict"] = np.clip(raw_ratings, min_rating, max_rating)

    # RMSE and MAE over the validation rows
    errors = valid_set["rating"].to_numpy(dtype=float) - valid_set["predict"].to_numpy()
    RMSE = np.sqrt(np.sum(errors ** 2) / len(valid_set))
    MAE = np.sum(np.abs(errors)) / len(valid_set)
    print("RMSE: ", RMSE, "MAE: ", MAE)
    return RMSE, MAE

In [397]:
# Number of full passes over the training set.
N_EPOCHS = 20

for epoch in range(N_EPOCHS):
    print(epoch)
    # Iterate over the three columns directly; this avoids building a Series per row.
    for user, item, rating in zip(train_set["userId"], train_set["movieId"], train_set["rating"]):
        user_node = user_nodes[int(user)]
        item_node = item_nodes[int(item)]
        # Current Euclidean distance between the user and the item
        direction = user_node.position - item_node.position
        pred_distance = np.linalg.norm(direction)
        # Target distance: rating 5 -> 0, rating 0.5 -> 100
        real_distance = 100 * (5. - rating) / 4.5
        Eui = real_distance - pred_distance
        # Move both endpoints along the connecting vector by the same amount,
        # in opposite directions. `direction` is a separate array, so the first
        # in-place update does not affect the second.
        user_node.position += learning_rate * Eui * direction
        item_node.position -= learning_rate * Eui * direction

    # Report validation RMSE / MAE after every epoch.
    predict(valid_set, user_nodes, item_nodes)

0
RMSE:  1.0340214484425507 MAE:  0.7870888757727739
1
RMSE:  0.9606322857693612 MAE:  0.7375060147012694
2
RMSE:  0.9241297085296216 MAE:  0.713247936370887
3
RMSE:  0.9054561648805328 MAE:  0.7010483219411395
4
RMSE:  0.8954747683998775 MAE:  0.6946385765190867
5
RMSE:  0.8895478202516163 MAE:  0.6911249012685345
6
RMSE:  0.885705575038955 MAE:  0.6889961693285688
7
RMSE:  0.8830607666791414 MAE:  0.6875322242592005
8
RMSE:  0.8811739557919293 MAE:  0.6865465449686673
9
RMSE:  0.8798035452281386 MAE:  0.6858840185128032
10
RMSE:  0.8788031310263086 MAE:  0.6854242833576855
11
RMSE:  0.8780768640371935 MAE:  0.685126890804966
12
RMSE:  0.8775584175079163 MAE:  0.6849546060651388
13
RMSE:  0.8772000819056147 MAE:  0.6848630887391843
14
RMSE:  0.876966561002222 MAE:  0.6848526466207615
15
RMSE:  0.8768311760034907 MAE:  0.6848997191145745
16
RMSE:  0.8767734205525879 MAE:  0.6849824875719269
17
RMSE:  0.8767773224459043 MAE:  0.6851063513746645
18
RMSE:  0.876830306956642 MAE:  0.685273

In [182]:
# Add a float "predict" column holding the predicted rating of every validation row.
# All distances are collected in one pass and the column is assigned once. A
# boolean-mask .loc write per row would rescan the whole index for each row.
pred_distances = np.array(
    [
        np.linalg.norm(user_nodes[int(user)].position - item_nodes[int(item)].position)
        for user, item in zip(valid_set["userId"], valid_set["movieId"])
    ],
    dtype=float,
)
# Map distance to rating (distance 0 -> 5, distance 100 -> 0.5) and clamp to [0.5, 5].
valid_set["predict"] = np.clip(5 - 4.5 * pred_distances / 100, 0.5, 5)

In [183]:
valid_set

Unnamed: 0,userId,movieId,rating,predict
67037,432,77866,4.5,3.833696
42175,288,474,3.0,3.166308
93850,599,4351,3.0,2.626492
6187,42,2987,4.0,4.435266
12229,75,1610,4.0,3.100619
...,...,...,...,...
84545,543,1387,5.0,3.882738
52065,339,1580,2.0,3.674121
92269,597,1090,5.0,4.844735
18346,116,30749,4.5,3.733076


In [184]:
# Validation metrics: root-mean-square error and mean absolute error
residual = valid_set["rating"] - valid_set["predict"]
RMSE = np.sqrt(np.sum(residual ** 2) / len(valid_set))
MAE = np.sum(np.abs(residual)) / len(valid_set)

In [185]:
RMSE, MAE

(0.9033283566666056, 0.6942303296497739)

In [34]:
user_nodes[1].position

array([nan, nan])

In [17]:
np.linalg.norm(user_nodes[1].position - item_nodes[1].position)

20.83063921015371

In [18]:
item_nodes[1].position

array([-5.61923121e+00, -7.31550099e-01, -1.74715591e+00, -4.43138115e+00,
        1.39583135e+00, -3.65486287e+00,  3.18685414e+00,  2.41406895e+00,
        9.08762924e-01, -1.31323208e+00, -2.58347973e-01,  4.38207168e+00,
        5.57851417e+00,  6.25364790e-01, -2.98295620e-01,  7.93133254e-01,
        4.89112154e-01,  5.25342887e+00,  6.24336675e-01,  2.12356092e+00,
        4.93980350e+00,  1.92275419e+00, -4.80277491e+00,  1.47460657e-03,
       -4.88535009e-01, -7.65150218e+00, -3.91186451e+00, -7.21442364e-01,
        2.64892597e+00,  3.89667651e+00, -3.18415038e-01,  3.21398947e-01,
       -8.89185547e-01, -2.99047884e+00,  1.96450241e+00,  3.02716365e+00,
        2.35145342e+00, -8.35008843e-02,  1.45718614e-01, -2.67142450e+00])

In [171]:
user_nodes[1].position

array([ 0.64718048,  1.25690615, -0.84764463,  3.75270077, -1.36956327,
        2.44904117, -0.46442723, -3.27430339, -1.50755227,  0.57148712,
       -1.42965405, -0.77413474, -0.13065483,  3.7481099 , -7.07639679,
       -0.13976116,  0.36535661,  2.70232272, -3.08107649,  1.7862285 ,
       -0.25635535,  1.09844091, -2.10638754,  0.17597196, -1.15740362,
        3.53861097, -3.00506422,  1.40732534, -3.04742796,  2.7460193 ,
        4.77779781,  0.76392592,  0.19576007, -1.77575578, -1.66977127,
       -0.71062822, -1.66830196,  0.84474566, -1.69174288, -1.0550299 ])

In [41]:
from typing import Union
class Edge:
    """Edge between two nodes, holding the rating bounds and the predicted distance/rating."""
    def __init__(self, node1:Union[User, Item], node2:Union[User, Item]):
        """
        :param node1: first endpoint
        :param node2: second endpoint; when node1 is not a User, the rating
                      bounds are read from node2 instead
        """
        self._node1 = node1
        self._node2 = node2
        # The rating bounds come from whichever endpoint is the user.
        if isinstance(node1, User):
            self._maxRating = node1.maxRating
            self._minRating = node1.minRating
        else:
            self._maxRating = node2.maxRating
            self._minRating = node2.minRating
        # Distance between the two positions at construction time, and the rating it maps to.
        self.predict_distance = np.linalg.norm(self._node1.position - self._node2.position)
        self.predict_rating = self.distance2rating(self.predict_distance)

    def rating2distance(self, rating:float)->float:
        """
        Convert a rating to a distance on a 0..100 scale.

        The maximum rating maps to 0 and the minimum rating maps to 100.
        Returns 0 when the two bounds are equal, which avoids a division by zero.
        """
        if self._maxRating == self._minRating:
            return 0
        return 100 * abs(self._maxRating - rating) / (self._maxRating - self._minRating)

    def distance2rating(self, distance:float)->float:
        """
        Convert a distance to a rating.

        Distance 0 maps to the maximum rating and distance 100 to the minimum.
        The result is not clamped, so distances above 100 fall below the minimum.
        """
        return self._maxRating - (self._maxRating - self._minRating) * distance / 100

In [42]:
class TrainEdge(Edge):
    """Edge from the training set: an Edge that also knows the observed rating."""
    def __init__(self, node1:Union[User, Item], node2:Union[User, Item], rating:float):
        """
        :param node1: first endpoint (the one moved by ``update``)
        :param node2: second endpoint
        :param rating: observed rating for this pair
        """
        super().__init__(node1, node2)
        self.real_rating = rating
        self.real_distance = self.rating2distance(rating)

    def update(self, sigma:float=0.1):
        """
        Move node1 so that its distance to node2 approaches ``real_distance``.

        The predicted distance is recomputed from the current positions before
        the error is taken. The cached value can be stale, because updates of
        other edges sharing an endpoint move that endpoint between calls.
        Only node1 is moved; node2 is left where it is.

        :param sigma: step size of the update
        """
        direction = self._node1.position - self._node2.position
        self.predict_distance = np.linalg.norm(direction)
        Eui = self.real_distance - self.predict_distance
        # `direction` is a separate array, so the in-place update does not alter it.
        self._node1.position += sigma * Eui * direction
        # Refresh the cached prediction for the new position.
        self.predict_distance = np.linalg.norm(self._node1.position - self._node2.position)
        self.predict_rating = self.distance2rating(self.predict_distance)

In [43]:
# One training edge per row of the training set, always oriented user -> item.
edges = [
    TrainEdge(user_nodes[row["userId"]], item_nodes[row["movieId"]], row["rating"])
    for _, row in train_set.iterrows()
]

In [48]:
# A single pass over all training edges with step size 0.001.
for epoch in range(1):
    print(epoch)
    for train_edge in edges:
        train_edge.update(sigma=0.001)

0


In [49]:
edges[0].__dict__, edges[0]._node1.__dict__, edges[0]._node2.__dict__

({'_node1': <__main__.User at 0x1fb2093b820>,
  '_node2': <__main__.Item at 0x1fb255801f0>,
  '_maxRating': 5.0,
  '_minRating': 1.0,
  'predict_distance': 581.9621081893789,
  'predict_rating': -18.278484327575157,
  'real_rating': 2.0,
  'real_distance': 75.0},
 {'position': array([-74.09915111,  17.37961763, -12.78906693,  38.814973  ,
           1.41935101,  47.43889202,  -7.77860264,  10.52017942,
          49.2068886 , -18.32646637,  47.2582954 ,   9.90468075,
          17.29335434,  -7.43519228, -67.707011  , -32.3766146 ,
           4.73440474,  71.01198853,  26.51554251, -54.32877515,
          -8.31650498, -25.44215913,   5.26207309, -52.20485778,
          27.38593514,  -9.11298488,  32.23298145,  20.18311009,
         -30.46815308,   3.82385526,  -5.68234759, -34.43358802,
          17.36305828,  22.71632192, -33.5110674 ,  14.37208793,
          29.42007817,  -7.41457477,  22.67242754, -13.49508285]),
  'neighbor': [5568,
   3361,
   533,
   2701,
   1320,
   5651,
   3270

In [45]:
edges[0].__dict__, edges[0]._node1.__dict__, edges[0]._node2.__dict__

({'_node1': <__main__.User at 0x1fb2093b820>,
  '_node2': <__main__.Item at 0x1fb255801f0>,
  '_maxRating': 5.0,
  '_minRating': 1.0,
  'predict_distance': 833.5491352081989,
  'predict_rating': -28.341965408327958,
  'real_rating': 2.0,
  'real_distance': 75.0},
 {'position': array([ -74.17854483,  117.84303026,  -51.90189904,  101.79236815,
          -36.37197902,  -25.29833262,   91.09862921,  121.81818355,
           21.73080135,  -13.13555727,  -40.53408132,  -26.76107285,
           59.09596326, -212.32533988,  -50.94529141, -200.69662156,
          148.53327323,  110.43431443,   16.79432113, -239.60414595,
          122.05409481,   93.13598357,   45.76221022,  -18.6408554 ,
           32.9650201 ,  -34.85083219,   68.15471866,  -64.32666894,
          -51.3049116 ,  -67.32001611,  -67.43050424,   28.48414877,
          146.17013149,   12.26061045, -184.55693579,  -98.33326181,
           -8.20945998,  106.44553622,   54.23617751,   37.2856761 ]),
  'neighbor': [5568,
   3361,
  