# NeuralCF
NeuralCF 用"多层神经网络+输出层"的结构替代了矩阵分解模型中简单的内积操作。这样做可以让用户向量和物品向量做更充分的交叉，得到更多有价值的特征组合信息；同时引入更多的非线性特征，让模型的表达能力更强。

<img src="../../img/neuralcf.png" width="600" >

## PyTorch

In [1]:
import pandas as pd
import numpy as np
import json
import torch
from torch import nn
from torch.utils.data.dataset import Dataset
from collections import OrderedDict

In [2]:
# Training hyper-parameters: optimizer step size, mini-batch size, epoch count.
learning_rate, batch_size, num_epochs = 0.0001, 64, 5

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [3]:
# Load the feature description (JSON) produced for the MovieLens data.
# The encoding is passed explicitly: JSON is UTF-8, while a bare open() falls
# back to the platform's locale encoding, which is not UTF-8 everywhere.
with open('../data/movielens/feature_map.json', encoding='utf-8') as obj:
    feature_map = json.load(obj)
feature_map

{'dataset_id': 'movielens',
 'num_fields': 26,
 'feature_specs': {'movieId': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 935,
   'index': 0},
  'userId': {'source': 'user',
   'type': 'categorical',
   'vocab_size': 22540,
   'index': 1},
  'rating': {'source': 'user',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 2},
  'timestamp': {'source': 'user',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 3},
  'releaseYear': {'source': 'item',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 4},
  'movieGenre1': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 18,
   'index': 5},
  'movieGenre2': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 18,
   'index': 6},
  'movieGenre3': {'source': 'item',
   'type': 'categorical',
   'vocab_size': 15,
   'index': 7},
  'movieRatingCount': {'source': 'item',
   'type': 'numerical',
   'vocab_size': 1,
   'index': 8},
  'movieAvgRating': {'source': 'item',
   'type': 'numerical',
 

In [4]:
# Keep only the two ID features (movieId and userId) in the feature map and
# update the field count to match.
feature_map['feature_specs'] = {
    name: feature_map['feature_specs'][name]
    for name in ('movieId', 'userId')
}
feature_map['num_fields'] = 2

In [5]:
class MovielensDataset(Dataset):
    """Map-style dataset over a CSV file whose last column is the label.

    The CSV is read once. Every column except the last is treated as a
    feature and the last column as the label; both are converted to float32
    NumPy arrays up front so that ``__getitem__`` is a plain array lookup
    instead of a per-sample pandas ``iloc`` call plus dtype conversion.
    """

    def __init__(self, url):
        """Read the CSV at ``url`` (anything ``pd.read_csv`` accepts)."""
        self.df = pd.read_csv(url)
        # Convert once here rather than on every __getitem__ call.
        self.features = self.df.iloc[:, :-1].values.astype(np.float32)
        self.labels = self.df.iloc[:, -1].values.astype(np.float32)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 values."""
        return self.features[idx], self.labels[idx]

    def __len__(self):
        """Return the number of rows in the CSV."""
        return self.df.shape[0]

In [6]:
# Wrap the pre-split training and test CSV files as datasets.
train_dataset = MovielensDataset(url='../data/movielens/data_for_train.csv')
test_dataset = MovielensDataset(url='../data/movielens/data_for_test.csv')

In [7]:
# Mini-batch loaders: the training data is reshuffled, the test data is read
# in file order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
class NCF(nn.Module):
    """Neural collaborative filtering: per-feature embeddings fed into an MLP.

    Every feature listed in ``feature_map['feature_specs']`` is mapped to an
    ``embedding_dim``-sized vector (``nn.Embedding`` for categorical features,
    a bias-free ``nn.Linear(1, embedding_dim)`` for numerical ones). The
    vectors are concatenated and passed through a ReLU MLP that ends in a
    single sigmoid unit.

    Args:
        feature_map: dict with ``'num_fields'`` (number of features) and
            ``'feature_specs'``, a mapping from feature name to a spec holding
            ``'type'`` (``'numerical'`` or ``'categorical'``), ``'index'``
            (column of the input tensor) and, for categorical features,
            ``'vocab_size'`` and optionally ``'padding_idx'``.
        embedding_dim: size of each feature's embedding vector.
        hidden_units: sizes of the hidden layers; any sequence of ints.

    Raises:
        ValueError: if a feature spec has an unsupported ``'type'``.
    """

    def __init__(self,
                 feature_map,
                 embedding_dim=10,
                 hidden_units=(256, 128, 64)):
        super().__init__()
        self.feature_map = feature_map
        # Embedding
        self.embedding = nn.ModuleDict()
        for feature, feature_spec in feature_map['feature_specs'].items():
            feature_type = feature_spec['type']
            if feature_type == 'numerical':
                self.embedding[feature] = nn.Linear(1, embedding_dim, bias=False)
            elif feature_type == 'categorical':
                padding_idx = feature_spec.get('padding_idx', None)
                self.embedding[feature] = nn.Embedding(feature_spec['vocab_size'],
                                                       embedding_dim,
                                                       padding_idx=padding_idx)
            else:
                # Fail early: a silently skipped feature would only surface
                # later as a confusing error inside forward().
                raise ValueError(
                    'Unsupported feature type {!r} for feature {!r}'.format(
                        feature_type, feature))
        # DNN
        input_dim = feature_map['num_fields'] * embedding_dim
        # Build a fresh list so the caller's sequence (or the default) is
        # never mutated and tuples are accepted as well as lists.
        layer_sizes = [input_dim, *hidden_units]
        hidden_layers = []
        for in_dim, out_dim in zip(layer_sizes[:-1], layer_sizes[1:]):
            hidden_layers.append(nn.Linear(in_dim, out_dim))
            hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(layer_sizes[-1], 1))
        self.dnn = nn.Sequential(*hidden_layers)
        # Sigmoid
        self.output_activation = nn.Sigmoid()

    def forward(self, X):
        """Return one sigmoid score per row of ``X``.

        ``X`` is indexed as ``X[:, index]``, i.e. a 2-D tensor with one row
        per sample and one column per raw feature. The result is a 1-D tensor
        with one value per row.
        """
        feature_emb_list = []
        for feature, feature_spec in self.feature_map['feature_specs'].items():
            feature_type = feature_spec['type']
            if feature_type == 'numerical':
                # The linear "embedding" expects a trailing dimension of 1.
                raw_feature = X[:, feature_spec['index']].float().view(-1, 1)
            elif feature_type == 'categorical':
                # Embedding lookups require integer indices.
                raw_feature = X[:, feature_spec['index']].long()
            else:
                raise ValueError(
                    'Unsupported feature type {!r} for feature {!r}'.format(
                        feature_type, feature))
            embedding_vec = self.embedding[feature](raw_feature)
            feature_emb_list.append(embedding_vec)
        # (batch, num_features, embedding_dim) -> (batch, num_features * embedding_dim)
        feature_emb = torch.stack(feature_emb_list, dim=1)
        out = self.dnn(feature_emb.flatten(start_dim=1))
        y_pred = self.output_activation(out).squeeze(1)
        return y_pred

In [9]:
# Model, loss (binary cross-entropy on the sigmoid output) and Adam optimizer.
model = NCF(feature_map).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Mini-batch training over the whole training set, num_epochs times.
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device)

        # Clear old gradients, run the forward pass, then back-propagate
        # and apply the parameter update.
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        # Only report the current mini-batch loss every 300 steps.
        if (i + 1) % 300 != 0:
            continue
        print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(
            epoch + 1, num_epochs, i + 1, total_step, loss.item()))

Epoch [1/5], Step [300/1388] Loss: 0.7093
Epoch [1/5], Step [600/1388] Loss: 0.6798
Epoch [1/5], Step [900/1388] Loss: 0.6857
Epoch [1/5], Step [1200/1388] Loss: 0.6784
Epoch [2/5], Step [300/1388] Loss: 0.7194
Epoch [2/5], Step [600/1388] Loss: 0.6640
Epoch [2/5], Step [900/1388] Loss: 0.6624
Epoch [2/5], Step [1200/1388] Loss: 0.5992
Epoch [3/5], Step [300/1388] Loss: 0.5905
Epoch [3/5], Step [600/1388] Loss: 0.6283
Epoch [3/5], Step [900/1388] Loss: 0.5838
Epoch [3/5], Step [1200/1388] Loss: 0.6566
Epoch [4/5], Step [300/1388] Loss: 0.6355
Epoch [4/5], Step [600/1388] Loss: 0.6338
Epoch [4/5], Step [900/1388] Loss: 0.6081
Epoch [4/5], Step [1200/1388] Loss: 0.6274
Epoch [5/5], Step [300/1388] Loss: 0.5946
Epoch [5/5], Step [600/1388] Loss: 0.6243
Epoch [5/5], Step [900/1388] Loss: 0.5837
Epoch [5/5], Step [1200/1388] Loss: 0.6673


In [10]:
# Test the model: classification accuracy on the held-out test set.
model.eval()  # put the model in evaluation mode
with torch.no_grad():  # no gradients are needed for evaluation
    correct = 0
    total = 0
    for X, y in test_loader:
        X = X.to(device)
        y = y.to(device).bool()
        output = model(X)
        # Threshold the predicted probability at 0.5 to get a hard label.
        y_pred = output > 0.5
        total += y.shape[0]
        correct += (y_pred == y).sum().item()

    # The samples are rating records, not images, so the message says "test set".
    print('Accuracy of the model on the test set: {:.2f} %'.format(
        100 * correct / total))

Accuracy of the model on the test images: 63.83 %


## TensorFlow

### 读取数据

In [1]:
import tensorflow as tf

In [2]:
from pathlib import Path

# tf.keras.utils.get_file expects `origin` to be a URL, so each local CSV path
# is first turned into an absolute file:// URI. A bare relative path only
# works when the file already sits in the Keras cache.
# Training samples path, change to your local path
training_samples_file_path = tf.keras.utils.get_file(
    'trainingSamples.csv',
    Path('../data/trainingSamples.csv').resolve().as_uri())
# Test samples path, change to your local path
test_samples_file_path = tf.keras.utils.get_file(
    'testSamples.csv',
    Path('../data/testSamples.csv').resolve().as_uri())


# load sample as tf dataset
def get_dataset(file_path, batch_size=12):
    """Build a batched ``tf.data`` dataset from a CSV file.

    Args:
        file_path: path of the CSV file to read.
        batch_size: number of rows per batch (defaults to 12, the value that
            was previously hard-coded).

    Returns:
        The dataset created by ``tf.data.experimental.make_csv_dataset``,
        using the ``'label'`` column as the label, treating ``"0"`` as the
        NA marker, reading the file for a single epoch and ignoring rows
        that fail to parse.
    """
    dataset = tf.data.experimental.make_csv_dataset(file_path,
                                                    batch_size=batch_size,
                                                    label_name='label',
                                                    na_value="0",
                                                    num_epochs=1,
                                                    ignore_errors=True)
    return dataset


# Build the training and test datasets from their respective CSV files.
train_data, test_data = map(
    get_dataset, (training_samples_file_path, test_samples_file_path))

### 特征工程

In [3]:
# Item side: movieId as an identity categorical column, embedded into 10 dims.
movie_col = tf.feature_column.categorical_column_with_identity(
    key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, dimension=10)

# User side: userId as an identity categorical column, embedded into 10 dims.
user_col = tf.feature_column.categorical_column_with_identity(
    key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, dimension=10)

# Keras inputs: one scalar int32 input per ID feature, keyed by feature name.
inputs = {
    feature_name: tf.keras.layers.Input(
        name=feature_name, shape=(), dtype='int32')
    for feature_name in ('movieId', 'userId')
}

### 模型训练与预测

In [4]:
def neural_cf_model_1(feature_inputs, item_feature_columns,
                      user_feature_columns, hidden_units):
    """Build the concat + MLP NeuralCF variant.

    Each tower is only a ``DenseFeatures`` layer over its feature columns.
    The two tower outputs are concatenated and passed through one ReLU
    ``Dense`` layer per entry of ``hidden_units``, followed by a single
    sigmoid unit.

    Returns:
        A ``tf.keras.Model`` mapping ``feature_inputs`` to the sigmoid output.
    """
    item_vec = tf.keras.layers.DenseFeatures(item_feature_columns)(
        feature_inputs)
    user_vec = tf.keras.layers.DenseFeatures(user_feature_columns)(
        feature_inputs)

    # Interaction layers: an MLP over the concatenated item/user vectors.
    hidden = tf.keras.layers.concatenate([item_vec, user_vec])
    for width in hidden_units:
        hidden = tf.keras.layers.Dense(width, activation='relu')(hidden)

    score = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)
    return tf.keras.Model(feature_inputs, score)

# Instantiate the concat + MLP NeuralCF model with two 10-unit hidden layers.
model = neural_cf_model_1(feature_inputs=inputs,
                          item_feature_columns=[movie_emb_col],
                          user_feature_columns=[user_emb_col],
                          hidden_units=[10, 10])

# Binary cross-entropy with Adam; track accuracy, ROC AUC and PR AUC.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(curve='ROC'),
        tf.keras.metrics.AUC(curve='PR'),
    ],
)

# Fit on the training dataset for five epochs.
model.fit(train_data, epochs=5)

# evaluate() returns the loss followed by the metrics in compile() order.
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_data)
print(
    '\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(
        test_loss, test_accuracy, test_roc_auc, test_pr_auc))

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Test Loss 0.6640917658805847, Test Accuracy 0.6761586666107178, Test ROC AUC 0.7302051186561584, Test PR AUC 0.7565736770629883


In [5]:
# print some predict results
# Pull ONE batch and predict on exactly that batch, so every prediction is
# printed next to its own label. make_csv_dataset shuffles by default, so
# predicting on `test_data` and then re-iterating it for the labels would pair
# predictions with labels of different rows (and list(test_data) would read
# the whole dataset just to get its first batch).
sample_features, sample_labels = next(iter(test_data))
predictions = model.predict(sample_features)
for prediction, goodRating in zip(predictions[:12], sample_labels[:12]):
    print("Predicted good rating: {:.2%}".format(prediction[0]),
          " | Actual rating label: ",
          ("Good Rating" if bool(goodRating) else "Bad Rating"))

Predicted good rating: 9.89%  | Actual rating label:  Good Rating
Predicted good rating: 25.50%  | Actual rating label:  Good Rating
Predicted good rating: 34.29%  | Actual rating label:  Bad Rating
Predicted good rating: 17.61%  | Actual rating label:  Bad Rating
Predicted good rating: 92.53%  | Actual rating label:  Good Rating
Predicted good rating: 19.93%  | Actual rating label:  Good Rating
Predicted good rating: 86.79%  | Actual rating label:  Good Rating
Predicted good rating: 77.65%  | Actual rating label:  Good Rating
Predicted good rating: 55.82%  | Actual rating label:  Bad Rating
Predicted good rating: 63.80%  | Actual rating label:  Bad Rating
Predicted good rating: 6.24%  | Actual rating label:  Bad Rating
Predicted good rating: 70.57%  | Actual rating label:  Bad Rating


### 保存模型

In [6]:
# Persist the trained model (including optimizer state) under
# ./pretrained_model, overwriting any previous export. The remaining options
# are passed as None.
save_kwargs = dict(overwrite=True,
                   include_optimizer=True,
                   save_format=None,
                   signatures=None,
                   options=None)
tf.keras.models.save_model(model, "./pretrained_model", **save_kwargs)

INFO:tensorflow:Assets written to: ./pretrained_model\assets


## NeuralCF 混合模型
NeuralCF 混合模型整合了上面提出的原始 NeuralCF 模型和以元素积为互操作的广义矩阵分解模型，这让模型有了更强的特征组合和非线性能力。注意：下面的示例代码 `neural_cf_model_2` 实现的是双塔结构（每个塔为 Embedding+MLP，两个塔的输出做点积后再接输出层），并不是下图所示的完整混合模型。
<img src="../../img/neural_matrix_factorization_model.png" width="600" />

In [None]:
def neural_cf_model_2(feature_inputs, item_feature_columns,
                      user_feature_columns, hidden_units):
    """Build the two-tower NeuralCF variant with a dot-product interaction.

    Each tower is a ``DenseFeatures`` layer followed by one ReLU ``Dense``
    layer per entry of ``hidden_units``. The item and user tower outputs are
    combined with a dot product, which is then fed to a single sigmoid unit.

    Returns:
        A ``tf.keras.Model`` mapping ``feature_inputs`` to the sigmoid output.
    """

    def build_tower(feature_columns):
        # Embedding lookup for this side, followed by its own MLP.
        tower = tf.keras.layers.DenseFeatures(feature_columns)(feature_inputs)
        for width in hidden_units:
            tower = tf.keras.layers.Dense(width, activation='relu')(tower)
        return tower

    # The item tower is built before the user tower, as in the original code.
    item_tower = build_tower(item_feature_columns)
    user_tower = build_tower(user_feature_columns)

    similarity = tf.keras.layers.Dot(axes=1)([item_tower, user_tower])
    score = tf.keras.layers.Dense(1, activation='sigmoid')(similarity)

    return tf.keras.Model(feature_inputs, score)

NeuralCF 模型实际上提出了一个模型框架，它基于用户向量和物品向量这两个 Embedding 层，利用不同的互操作层进行特征的交叉组合，并且可以灵活地进行不同互操作层的拼接。

NeuralCF 模型也存在局限性：由于是基于协同过滤的思想进行构造的，NeuralCF 模型并没有引入更多其他类型的特征，这在实际应用中无疑浪费了其他有价值的信息。