In [None]:
import os
# Switch the working directory to the ml-100k project folder so that the
# relative CSV path used by the data-loading cell resolves.
# NOTE(review): the path looks like a mounted Google Drive inside Colab — it
# must be adapted when running anywhere else.
os.chdir("/content/drive/MyDrive/Colab Notebooks/推荐系统/推荐系统算法复现/ml-100k")
import numpy as np
import pandas as pd
import torch
from torch import nn

In [None]:
# Load the one-hot encoded user/item feature table.
data = pd.read_csv("./one-hot_user&item_features.csv")
# Discard the three leading columns (per the original notes: the row number,
# user_id and item_id), keeping the rating target plus the feature columns.
data = data.drop(columns=data.columns[[0,1,2]])
# After the drop, the first column (rating) is the prediction target.
data

Unnamed: 0,rating,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,gender_F,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
3,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
4,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
99996,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99998,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import numpy as np

# Fix the NumPy seed so the shuffle below (and hence the split) is repeatable.
np.random.seed(0)
# Row positions 0 .. len(data)-1, one per sample, shuffled in place.
arr = np.arange(data.shape[0])
np.random.shuffle(arr)
# Fraction of the rows that goes into the training set (the rest is the test set).
train_test_ratio = 0.9
# Position in the shuffled order where the training rows end and the test rows begin.
split_at = int(len(arr)*train_test_ratio)
train_index, test_index = arr[:split_at], arr[split_at:]
# Materialise both subsets by positional row indexing.
train_set, test_set = data.iloc[train_index,:], data.iloc[test_index, :]

设特征$x_i$对应的隐向量$w_i$是长度为$k$的向量，则FM模型的二阶交叉部分为

$$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle w_i, w_j \rangle x_i x_j = \frac{1}{2}\left[\sum_{i=1}^{n}\sum_{j=1}^{n} \langle w_i, w_j \rangle x_i x_j - \sum_{i=1}^{n} \langle w_i, w_i \rangle x_i x_i\right]$$

$$= \frac{1}{2}\left[\sum_{i=1}^{n}\sum_{j=1}^{n} \sum_{f=1}^{k} w_{if}w_{jf}x_i x_j - \sum_{i=1}^{n} \sum_{f=1}^{k} w_{if}w_{if} x_i x_i\right]$$

$$= \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n} w_{if}x_i\right)^2 - \sum_{i=1}^{n} w_{if}^2x_i^2\right]$$

$$= \frac{1}{2}\sum_{f=1}^{k}\left(\sum_{i=1}^{n} w_{if}x_i\right)^2 - \frac{1}{2}\sum_{f=1}^{k} \sum_{i=1}^{n} w_{if}^2x_i^2$$

In [None]:
from torch.nn import Parameter,init
class FM( nn.Module ):
  """Factorization Machine that outputs a probability per sample.

  The score is the sum of a linear term (``nn.Linear`` with bias) and the
  pairwise interaction term, where each feature ``i`` owns a latent vector
  ``weight[i]`` of length ``hidden_dim``. The score is passed through a sigmoid.

  Args:
    feature_num: Number of input features (size of the last input dimension).
    hidden_dim: Length of each feature's latent vector.
  """

  def __init__( self, feature_num, hidden_dim = 256 ):
    super().__init__()
    self.linear = nn.Linear(feature_num , 1)
    # Latent factor matrix, shape (feature_num, hidden_dim), Xavier-uniform initialised.
    self.weight = Parameter(torch.empty(feature_num, hidden_dim))
    init.xavier_uniform_(self.weight)

  def FMcross(self, x ):
    """Return the second-order interaction term for each sample.

    Uses the identity
    ``sum_{i<j} <w_i, w_j> x_i x_j
      = 0.5 * sum_f [ (sum_i w_if x_i)^2 - sum_i w_if^2 x_i^2 ]``
    so the cost is linear in the number of features.

    Args:
      x: Tensor of shape (..., feature_num).

    Returns:
      Tensor of shape (...), i.e. ``x`` with its last dimension reduced.
    """
    # (..., hidden_dim): square of the weighted feature sum, per latent factor.
    square_of_sum = (x @ self.weight) ** 2
    # (..., hidden_dim): weighted sum of squares, per latent factor.
    sum_of_square = (x ** 2) @ (self.weight ** 2)
    # Reduce over the latent dimension -> (...)
    return 0.5 * torch.sum(square_of_sum - sum_of_square, dim=-1)

  def forward(self, x):
    """Compute the predicted probability for each sample.

    Args:
      x: Tensor of shape (..., feature_num), typically (batch_size, feature_num).

    Returns:
      Tensor of shape (...) with values in (0, 1).
    """
    # Drop only the trailing size-1 output dimension of the linear layer. A bare
    # squeeze() would also remove other singleton dimensions, and the sum below
    # could then broadcast to the wrong shape.
    linear_sum = self.linear(x).squeeze(-1)
    quadratic_sum = self.FMcross(x)
    total_sum = linear_sum + quadratic_sum
    return torch.sigmoid(total_sum)

In [None]:
from sklearn.metrics import precision_score,recall_score,accuracy_score
def evaluation(y_pred, y_true):
  """Score binary predictions against the ground truth.

  Args:
    y_pred: Predicted binary labels.
    y_true: Ground-truth binary labels.

  Returns:
    A ``(precision, recall, accuracy)`` tuple.
  """
  scorers = (precision_score, recall_score, accuracy_score)
  # The sklearn scorers take the ground truth first and the predictions second.
  return tuple(score(y_true, y_pred) for score in scorers)

In [None]:
# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable across "Restart & Run All" (the NumPy seed does not cover torch).
torch.manual_seed(0)

# Every column except the first one (the rating target) is an input feature.
input_size = data.shape[1] - 1
model = FM(input_size)

# Loss: binary cross-entropy on probabilities (FM.forward already applies the sigmoid).
loss_fn = nn.BCELoss()

# Optimizer
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score,recall_score,accuracy_score

# Train the model
num_epochs = 10

# The held-out tensors never change, so build them once instead of once per epoch.
test_X = torch.tensor(test_set.iloc[:,1:].values).float()
test_y = test_set.iloc[:,0].values

for epoch in range(num_epochs):
  model.train()
  epoch_loss = 0.0
  epoch_preds, epoch_labels = [], []
  for X in DataLoader(train_set.values, batch_size=512, shuffle=True):
    # Column 0 is the binary target; the remaining columns are the features.
    train_X = X[:,1:].float()
    train_y = X[:,0].float()

    # Reset gradients
    optimizer.zero_grad()
    # Forward pass. reshape(-1) keeps the output 1-D even for a batch of one,
    # where a bare squeeze() would yield a 0-d tensor that BCELoss rejects.
    predictions = model(train_X).reshape(-1)
    # Loss
    loss = loss_fn(predictions, train_y)
    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Accumulate statistics over the whole epoch rather than reporting only
    # the last mini-batch.
    epoch_loss += loss.item() * len(train_y)
    epoch_preds.append(predictions.detach())
    epoch_labels.append(train_y)
  # Sample-weighted mean loss over the epoch.
  epoch_loss /= len(train_set)
  print(f"Epoch {epoch}, Loss: {epoch_loss}")

  # Training set: running metrics over all batches of this epoch (each batch is
  # scored with the weights in effect when it was processed).
  y_pred = (torch.cat(epoch_preds) >= 0.5).int().numpy()
  y_true = torch.cat(epoch_labels).int().numpy()
  precision, recall, acc = evaluation(y_pred, y_true)
  print('train: Precision {:.4f} | Recall {:.4f} | accuracy {:.4f}'.format(precision, recall, acc))

  # Test set: inference only, so skip autograd bookkeeping.
  model.eval()
  with torch.no_grad():
    predictions = model(test_X).reshape(-1)
  y_pred = (predictions >= 0.5).int().numpy()
  y_true = test_y
  precision, recall, acc = evaluation(y_pred, y_true)
  print('test: Precision {:.4f} | Recall {:.4f} | accuracy {:.4f}'.format(precision, recall, acc))
  print('----------------------------------------------------------------------------------------')

Epoch 0, Loss: 0.6581762433052063
train: Precision 0.6442 | Recall 0.7319 | accuracy 0.6050
test: Precision 0.6171 | Recall 0.7246 | accuracy 0.5986
----------------------------------------------------------------------------------------
Epoch 1, Loss: 0.6571202278137207
train: Precision 0.6154 | Recall 0.7500 | accuracy 0.5975
test: Precision 0.6222 | Recall 0.7399 | accuracy 0.6072
----------------------------------------------------------------------------------------
Epoch 2, Loss: 0.6678417921066284
train: Precision 0.6301 | Recall 0.6951 | accuracy 0.6025
test: Precision 0.6337 | Recall 0.6834 | accuracy 0.6060
----------------------------------------------------------------------------------------
Epoch 3, Loss: 0.6742897033691406
train: Precision 0.6122 | Recall 0.7031 | accuracy 0.5750
test: Precision 0.6197 | Recall 0.7349 | accuracy 0.6035
----------------------------------------------------------------------------------------
Epoch 4, Loss: 0.6289081573486328
train: Precisi