## FM, FFM, DFM, NFM, Wide & Deep 实现

这个 notebook 实现了下列论文中提出的模型：

- Factorization Machines
- Field-aware Factorization Machines for CTR Prediction
- DeepFM: A Factorization-Machine based Neural Network for CTR Prediction
- Wide & Deep Learning for Recommender Systems
- Neural Factorization Machines for Sparse Predictive Analytics

其中的代码大量参考了 [rixwew/pytorch-fm](https://github.com/rixwew/pytorch-fm)，感谢原作者。

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

## 构造 DataSet

这里使用 MovieLens 数据集，输入是 `(user_id, item_id)`，输出是对应的评分。`user_id` 和 `item_id` 都是从 1 开始各自独立编号的，取值范围相互重叠；而后面的模型让所有特征共用同一张 embedding 表，所以需要将 `item_id` 整体向后偏移（偏移量为 user 的数量），避免两者的下标冲突。

In [2]:
class MovieLensDataset(Dataset):
    """MovieLens ratings exposed as ``(feature_ids, rating)`` samples.

    The file at ``path`` is expected to be tab-separated and headerless, with
    the user id, item id and rating in its first three columns (the layout of
    ``ml-100k/u.data``).

    Attributes set by the constructor:
        items: int64 array of shape ``(n_samples, 2)`` holding
            ``[user_index, item_index]``. Both are 0-based and the item index
            is shifted by the number of users, so every value lies in
            ``[0, sum(field_dims))`` and users/items never share an index.
        ratings: float32 array of shape ``(n_samples,)``.
        field_dims: array ``[n_users, n_items]`` (largest raw id per column).
    """

    def __init__(self, path):
        # header=None: the ratings file has no header row, so the default
        # (header=0) would silently swallow the first rating as column names.
        data = pd.read_csv(path, sep='\t', header=None).values
        # np.int was removed from NumPy; int64 is also what nn.Embedding
        # expects. Raw ids start at 1, so shift them to 0-based here.
        self.items = data[:, :2].astype(np.int64) - 1
        self.ratings = data[:, 2].astype(np.float32)
        self.field_dims = np.max(self.items, axis=0) + 1
        # In-place add: the original expression computed the offset and threw
        # the result away, leaving user and item ids overlapping.
        self.items[:, 1] += self.field_dims[0]

    def __len__(self):
        """Return the number of rating records."""
        return self.items.shape[0]

    def __getitem__(self, index):
        """Return ``(items[index], ratings[index])`` for one record."""
        return self.items[index], self.ratings[index]

## 模型定义

先定义一些后面会用到的基础组件：

In [3]:
class FeaturesLinear(nn.Module):
    """First-order (linear) term: one learned scalar weight per feature id.

    Args:
        input_dim: total number of feature ids (sum of all field sizes).
    """

    def __init__(self, input_dim):
        super().__init__()
        # One extra row, matching FeaturesEmbedding and the FFM tables in this
        # file: with 1-based ids the largest index equals input_dim, which a
        # table of exactly input_dim rows cannot address.
        self.embedding = nn.Embedding(input_dim + 1, 1)
        self.bias = nn.Parameter(torch.zeros((1,)))

    def forward(self, x):
        """Sum the per-feature weights over dim 1 and add the bias.

        For an index tensor ``x`` of shape ``(batch, num_fields)`` the result
        has shape ``(batch, 1)``.
        """
        weights = self.embedding(x)
        return torch.sum(weights, dim=1) + self.bias

class FeaturesEmbedding(nn.Module):
    """Lookup table mapping each feature id to an ``embed_dim`` vector.

    The table is allocated with ``input_dim + 1`` rows.
    """

    def __init__(self, input_dim, embed_dim):
        super().__init__()
        num_rows = input_dim + 1
        self.embedding = nn.Embedding(num_rows, embed_dim)

    def forward(self, x):
        """Return the embedding vectors for the ids in ``x``."""
        return self.embedding(x)
    
class FeaturesCross(nn.Module):
    """Pairwise interaction term computed with the FM identity.

    Computes ``0.5 * ((sum_i v_i)^2 - sum_i v_i^2)`` where the sums run over
    dim 1 of the input.

    Args:
        reduce_sum: when true, additionally sum the result over dim 1
            (keeping that dim with size 1).
    """

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """Return the interaction term for the embedded features ``x``."""
        field_total = x.sum(dim=1)
        interaction = field_total.pow(2) - x.pow(2).sum(dim=1)

        if not self.reduce_sum:
            return 0.5 * interaction

        return 0.5 * interaction.sum(dim=1, keepdim=True)
    
class MultiLayerPerceptron(torch.nn.Module):
    """Stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` blocks.

    Args:
        input_dim: width of the input features.
        embed_dims: iterable with the width of each hidden block.
        dropout: dropout probability used after every hidden block.
        output_layer: when true, append a final ``Linear(..., 1)`` head.
    """

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
        super().__init__()
        stack = []
        in_features = input_dim
        for out_features in embed_dims:
            stack.extend([
                torch.nn.Linear(in_features, out_features),
                torch.nn.BatchNorm1d(out_features),
                torch.nn.ReLU(),
                torch.nn.Dropout(p=dropout),
            ])
            # The next block consumes what this one produced.
            in_features = out_features
        if output_layer:
            stack.append(torch.nn.Linear(in_features, 1))
        self.mlp = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Apply the whole stack to ``x``."""
        return self.mlp(x)

### Linear Regression

In [13]:
class LinearRegression(nn.Module):
    """Linear model over sparse feature ids: bias plus one weight per id.

    Args:
        input_dim: total number of feature ids (sum of all field sizes).
    """

    def __init__(self, input_dim):
        super().__init__()
        # One extra row so that index == input_dim (1-based ids) stays valid,
        # matching the other embedding tables in this file.
        self.embedding = nn.Embedding(int(input_dim) + 1, 1)
        self.bias = nn.Parameter(torch.zeros((1,)))

    def forward(self, x):
        """Return one prediction per row of ``x``.

        For ``x`` of shape ``(batch, num_fields)`` the result has shape
        ``(batch,)``.
        """
        w = self.embedding(x)
        # Reduce over the field axis only. A bare torch.sum(w) collapses the
        # whole batch into a single scalar, after which squeeze(1) fails.
        y = self.bias + torch.sum(w, dim=1)
        return y.squeeze(1)

### Factorization Machine

In [5]:
class FactorizationMachine(nn.Module):
    """Factorization Machine: linear term plus pairwise embedding interactions.

    Args:
        field_dims: number of distinct ids in each field.
        embed_dim: size of each feature's latent vector.
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        total_features = np.sum(field_dims)
        self.linear = FeaturesLinear(total_features)
        self.embedding = FeaturesEmbedding(total_features, embed_dim)
        self.cross = FeaturesCross()

    def forward(self, x):
        """Return the linear output plus the summed interaction, squeezed on dim 1."""
        second_order = self.cross(self.embedding(x))
        first_order = self.linear(x)
        return (first_order + second_order).squeeze(1)

### Field Aware Factorization Machine

In [6]:
class FieldAwareFactorizationMachine(torch.nn.Module):
    """Field-aware FM: every field owns a separate embedding table.

    Args:
        field_dims: number of distinct ids in each field.
        embed_dim: size of each latent vector.
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        total_features = sum(field_dims)
        self.num_fields = len(field_dims)
        self.linear = FeaturesLinear(total_features)
        tables = [
            torch.nn.Embedding(total_features + 1, embed_dim)
            for _ in range(self.num_fields)
        ]
        self.embeddings = torch.nn.ModuleList(tables)

    def forward(self, x):
        """Return the linear term plus the sum of all field-pair interactions."""
        # per_table[k] holds the embeddings of every column of x under table k.
        per_table = [table(x) for table in self.embeddings]
        n = self.num_fields
        # For each pair i < j: column i looked up in table j, times
        # column j looked up in table i.
        pair_terms = [
            per_table[j][:, i] * per_table[i][:, j]
            for i in range(n - 1)
            for j in range(i + 1, n)
        ]
        ffm = torch.stack(pair_terms, dim=1).sum(dim=(1, 2))
        return self.linear(x).squeeze(1) + ffm

### Neural Factorization Machine

In [21]:
class NeuralFactorizationMachine(nn.Module):
    """Neural FM: the un-reduced interaction vector is fed through an MLP.

    Args:
        field_dims: number of distinct ids in each field.
        embed_dim: size of each feature's latent vector.
        mlp_dims: hidden widths of the MLP.
        drop_rate: dropout probability for the interaction vector and the MLP.
    """

    def __init__(self, field_dims, embed_dim, mlp_dims, drop_rate=0.2):
        super().__init__()
        total_features = np.sum(field_dims)
        self.drop_rate = drop_rate
        self.linear = FeaturesLinear(total_features)
        self.embedding = FeaturesEmbedding(total_features, embed_dim)
        self.mlp = MultiLayerPerceptron(embed_dim, mlp_dims, drop_rate)

        # Interaction vector (kept at embed_dim width), then batch-norm and dropout.
        pooling_layers = [
            FeaturesCross(reduce_sum=False),
            nn.BatchNorm1d(embed_dim),
            nn.Dropout(drop_rate),
        ]
        self.cross = torch.nn.Sequential(*pooling_layers)

    def forward(self, x):
        """Return the linear term plus the MLP output, squeezed on dim 1."""
        pooled = self.cross(self.embedding(x))
        wide = self.linear(x)
        return (wide + self.mlp(pooled)).squeeze(1)

### Deep Factorization Machine

In [8]:
class DeepFactorizationMachine(torch.nn.Module):
    """DeepFM: an FM component and an MLP sharing one embedding table.

    The prediction is the sum of the first-order linear term, the FM
    second-order interaction term, and an MLP applied to the flattened
    field embeddings.

    Args:
        field_dims: number of distinct ids in each field.
        embed_dim: size of each feature's latent vector.
        mlp_dims: hidden widths of the MLP.
        drop_rate: dropout probability inside the MLP.
    """

    def __init__(self, field_dims, embed_dim, mlp_dims, drop_rate=0.2):
        super().__init__()
        input_dim = np.sum(field_dims)
        self.linear = FeaturesLinear(input_dim)
        self.embedding = FeaturesEmbedding(input_dim, embed_dim)
        self.cross = FeaturesCross()

        # The MLP sees all field embeddings concatenated into one vector.
        self.mlp_input_dim = len(field_dims) * embed_dim
        self.mlp = MultiLayerPerceptron(self.mlp_input_dim, mlp_dims, drop_rate)

    def forward(self, x):
        """Return linear + FM interaction + MLP output, squeezed on dim 1."""
        embed = self.embedding(x)
        deep = self.mlp(embed.view(-1, self.mlp_input_dim))
        # self.linear was constructed but never used before, which left the
        # FM part without its first-order term.
        out = self.linear(x) + self.cross(embed) + deep
        return out.squeeze(1)

### Wide & Deep

In [9]:
class WideAndDeep(torch.nn.Module):
    """Wide & Deep: a linear (wide) term plus an MLP over field embeddings.

    Args:
        field_dims: number of distinct ids in each field.
        embed_dim: size of each feature's latent vector.
        mlp_dims: hidden widths of the MLP.
        drop_rate: dropout probability inside the MLP.
    """

    def __init__(self, field_dims, embed_dim, mlp_dims, drop_rate=0.2):
        super().__init__()
        total_features = np.sum(field_dims)
        self.linear = FeaturesLinear(total_features)
        self.embedding = FeaturesEmbedding(total_features, embed_dim)

        # Width of the flattened embeddings handed to the MLP.
        self.mlp_input_dim = len(field_dims) * embed_dim
        self.mlp = MultiLayerPerceptron(self.mlp_input_dim, mlp_dims, drop_rate)

    def forward(self, x):
        """Return wide + deep outputs, squeezed on dim 1."""
        flat_embed = self.embedding(x).view(-1, self.mlp_input_dim)
        wide = self.linear(x)
        return (wide + self.mlp(flat_embed)).squeeze(1)

## 训练

In [10]:
from sklearn.metrics import mean_squared_error

def train(model, optimizer, data_loader, criterion, device, log_interval=1000):
    """Run one training pass over ``data_loader``.

    Args:
        model: module called as ``model(x)`` to produce predictions.
        optimizer: optimizer stepped once per batch.
        data_loader: iterable yielding ``(x, y)`` tensor batches.
        criterion: loss called as ``criterion(prediction, target)``.
        device: device both tensors of each batch are moved to.
        log_interval: number of batches between progress printouts.

    Every ``log_interval`` batches the mean of ``sqrt(batch loss)`` over that
    window is printed and the accumulator is reset. Returns ``None``.
    """
    model.train()
    total_loss = 0
    for i, (x, y) in enumerate(data_loader):
        x, y = x.to(device), y.to(device)
        y_hat = model(x)
        # Loss callables take (prediction, target). The swapped order only
        # happened to work because MSE is symmetric in its arguments.
        loss = criterion(y_hat, y)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += np.sqrt(loss.item())
        if (i + 1) % log_interval == 0:
            print('- loss:', total_loss / log_interval)
            total_loss = 0
            
def test(model, data_loader, device):
    """Return the RMSE of ``model`` over every batch in ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    """
    model.eval()
    targets, predicts = [], []
    with torch.no_grad():
        for x, y in data_loader:
            y_hat = model(x.to(device))
            targets += y.to(device).tolist()
            predicts += y_hat.tolist()
    return np.sqrt(mean_squared_error(targets, predicts))

In [25]:
# Fall back to the CPU so this cell also runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = MovieLensDataset("./data/ml-100k/u.data")

# 80% / 10% split; whatever is left after rounding goes to the test set.
train_length = int(len(dataset) * 0.8)
valid_length = int(len(dataset) * 0.1)
test_length = len(dataset) - train_length - valid_length

train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset, (train_length, valid_length, test_length))

# Reshuffle the training batches every epoch; evaluation order does not matter.
train_data_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# NOTE: valid_data_loader is built but not used below; the loop evaluates on
# the test split after every epoch.
valid_data_loader = DataLoader(valid_dataset, batch_size=32)
test_data_loader = DataLoader(test_dataset, batch_size=32)


# Uncomment exactly one line to choose the model to train.
# model = LinearRegression(np.sum(dataset.field_dims))
# model = FactorizationMachine(dataset.field_dims, 16)
# model = FieldAwareFactorizationMachine(dataset.field_dims, 8)
# model = DeepFactorizationMachine(dataset.field_dims, 16, [16, 16])
# model = NeuralFactorizationMachine(dataset.field_dims, 64, [64])
model = WideAndDeep(dataset.field_dims, 16, [16,16])
# Move to the selected device instead of unconditionally calling .cuda().
model.to(device)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

for epoch_i in range(5):
    train(model, optimizer, train_data_loader, criterion, device)
    rmse = test(model, test_data_loader, device)
    print('test rmse:', rmse)

- loss: 2.1442796721315647
- loss: 1.6825360619697156
test rmse: 1.4161099228544758
- loss: 1.4198498158365154
- loss: 1.3025001574838495
test rmse: 1.1402408731351026
- loss: 1.1773485173122928
- loss: 1.1275556205756883
test rmse: 1.0608862612025725
- loss: 1.0828320220567376
- loss: 1.062964973177104
test rmse: 1.0275087781282006
- loss: 1.0434619229686866
- loss: 1.0375892681606158
test rmse: 1.011924632138925


下面是各个模型跑出来的效果，还没有仔细调参：

|模型|RMSE|
|:---|:----------------|
|LR  |2.046919840803719|
|FM  |1.643816350960632|
|FFM |1.473343716268819|
|DFM |1.461163142731684|
|NFM |1.068538892587793|
|W&D |1.011924632138925|