# Neural Collaborative Filtering

论文作者在 GitHub 上开源了 NCF 的实现：[hexiangnan/neural_collaborative_filtering](https://github.com/hexiangnan/neural_collaborative_filtering)。下面代码中的模型部分大量借鉴了原作的实现。

In [2]:
import pandas as pd
import numpy as np

## 数据预处理

这里使用的是 MovieLens 100k 数据集，可以在此下载：http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [195]:
# Column names assigned to the tab-separated u.data file, in file order.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    './data/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [197]:
# The timestamp is not used by any model below. `errors='ignore'` keeps this
# cell idempotent: re-running it after the column is already gone is a no-op
# instead of a KeyError.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

In [199]:
# Preview the first five rows after dropping the timestamp column.
ratings.head(n=5)

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [200]:
from sklearn.model_selection import train_test_split

# Stratify on the user id so each user's ratings are divided between the two
# splits in (approximately) the same 75/25 proportion.
user_id = ratings['user_id']

# Only the frame itself is split. Passing `user_id` as a second array and
# unpacking it into `_` would overwrite IPython's last-output variable `_`
# for the rest of the session; the resulting partition is the same either way
# because it is determined by `stratify` and `random_state`.
ratings_train, ratings_test = train_test_split(
    ratings, test_size=0.25, stratify=user_id, random_state=42)

In [273]:
# (rows, columns) of the training and the test split, in that order.
tuple(split.shape for split in (ratings_train, ratings_test))

((75000, 3), (25000, 3))

## 构建数据集

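样本集由 `(user, item, label)` 三元组构成：user 和 item 是输入，label 是输出。评分数据中出现过的 (user, item) 组合作为正例（label 为 1）；对每个正例，再为同一用户随机采样 `num_negative` 个未出现过的 (user, item) 组合作为负例（label 为 0）。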

In [224]:
def build_instances_from_df(ratings, num_negative=4, seed=None):
    """Turn a ratings frame into shuffled (user, item, label) instances.

    Every row of ``ratings`` becomes a positive instance (label 1). For each
    positive row, ``num_negative`` negative instances (label 0) are created
    for the same user by drawing items from the ``movie_id`` column (so items
    that are rated more often are drawn more often) and rejecting every
    (user, item) pair that occurs in ``ratings``.

    Pairs are only checked against this frame: a sampled negative may still
    be a positive pair in a different frame (e.g. another data split).

    Args:
        ratings: DataFrame with ``user_id`` and ``movie_id`` columns.
        num_negative: number of negatives sampled per positive row. Values
            <= 0 yield the positives only.
        seed: optional seed for a private ``np.random.RandomState``. With
            ``None`` (default) the global NumPy random state is used.

    Returns:
        Tuple ``(users, items, labels)`` of 1-D arrays of equal length,
        shuffled jointly.

    Raises:
        ValueError: if negatives are requested but some user has rated every
            distinct item in ``ratings`` (no negative exists for that user,
            so rejection sampling would never terminate).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    per_positive = max(int(num_negative), 0)

    # Positive instances
    users = ratings['user_id'].values
    items = ratings['movie_id'].values
    labels = np.ones_like(users)

    user_item_pairs = set(zip(users.tolist(), items.tolist()))

    if per_positive and len(users):
        rated_per_user = ratings.groupby('user_id')['movie_id'].nunique()
        if rated_per_user.max() >= ratings['movie_id'].nunique():
            raise ValueError(
                'cannot sample negatives: at least one user has rated '
                'every distinct item in `ratings`')

    # Negative instances: `per_positive` slots per positive row, filled by
    # batched rejection sampling. Slots whose draw collides with an observed
    # pair are redrawn until none is left.
    negative_users = np.repeat(users, per_positive)
    negative_items = np.empty(negative_users.shape[0], dtype=items.dtype)

    pending = np.arange(negative_users.shape[0])
    while pending.size:
        draws = items[rng.randint(0, items.shape[0], size=pending.size)]
        negative_items[pending] = draws
        clashes = np.fromiter(
            (pair in user_item_pairs
             for pair in zip(negative_users[pending].tolist(), draws.tolist())),
            dtype=bool, count=pending.size)
        pending = pending[clashes]

    # Concatenating typed arrays (rather than Python lists) keeps the integer
    # dtype even when there are no negatives at all.
    users = np.concatenate([users, negative_users])
    items = np.concatenate([items, negative_items])
    labels = np.concatenate([labels, np.zeros_like(negative_users)])

    index = rng.permutation(users.shape[0])

    users = users[index]
    items = items[index]
    labels = labels[index]

    return users, items, labels
    
# Instances for training and for evaluation are built independently, each
# from its own split of the ratings.
users, items, labels = build_instances_from_df(ratings=ratings_train)
users_test, items_test, labels_test = build_instances_from_df(ratings=ratings_test)

## Generalized Matrix Factorization (GMF)

In [283]:
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Multiply
from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop

def build_GMF_model(num_users, num_items, latent_dim):
    """Assemble the Generalized Matrix Factorization (GMF) network.

    A user id and an item id are each mapped to a ``latent_dim``-sized
    embedding; the two embeddings are multiplied element-wise and a single
    sigmoid unit turns the product into the prediction.

    Args:
        num_users: number of rows of the user embedding table.
        num_items: number of rows of the item embedding table.
        latent_dim: size of both embeddings.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[user_input, item_input]``
        and one sigmoid output.
    """
    def latent_vector(id_tensor, table_rows, layer_name):
        # Embed a single id, then flatten away the length-1 sequence axis.
        table = Embedding(input_dim=table_rows, output_dim=latent_dim,
                          input_length=1, name=layer_name)
        return Flatten()(table(id_tensor))

    user_ids = Input(shape=(1,), dtype='int32', name='user_input')
    item_ids = Input(shape=(1,), dtype='int32', name='item_input')

    user_vec = latent_vector(user_ids, num_users, 'user_embedding')
    item_vec = latent_vector(item_ids, num_items, 'item_embedding')

    # Element-wise product of the two latent vectors
    interaction = Multiply()([user_vec, item_vec])

    # Final prediction layer
    output_layer = Dense(1, activation='sigmoid',
                         kernel_initializer=keras.initializers.lecun_uniform(),
                         name='prediction')
    score = output_layer(interaction)

    return Model(inputs=[user_ids, item_ids], outputs=score)

# The raw ids are fed straight into the embedding layers, so each table needs
# (largest id + 1) rows. Counting unique ids is only equivalent when the ids
# are contiguous; using the maximum is correct either way.
num_users = int(ratings['user_id'].max()) + 1
num_items = int(ratings['movie_id'].max()) + 1
learning_rate = 0.001

# `build_GMF_model` is the builder defined in this notebook; there is no
# `build_model`, so calling that name fails on a fresh kernel.
GMF_model = build_GMF_model(num_users, num_items, 16)
GMF_model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

In [284]:
# Train GMF; the last 20% of the instances are held out for validation.
GMF_model.fit(
    x=[np.array(users), np.array(items)],
    y=np.array(labels),
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Train on 300000 samples, validate on 75000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9cd4357080>

In [285]:
# Loss and accuracy of GMF on the test instances.
GMF_model.evaluate(
    x=[np.array(users_test), np.array(items_test)],
    y=np.array(labels_test),
)



[0.5451006612052918, 0.741792]

## Multi-Layer Perceptron (MLP)

In [277]:
def build_mlp_model(num_users, num_items, layer_units=[20,10], reg_layers=[0,0]):
    """Build the Multi-Layer Perceptron (MLP) recommender.

    User and item ids are embedded, the two embeddings are concatenated, and
    the result is passed through a stack of ReLU dense layers followed by a
    single sigmoid prediction unit.

    Args:
        num_users: number of rows of the user embedding table.
        num_items: number of rows of the item embedding table.
        layer_units: width of each hidden dense layer, in order. Each
            embedding has ``layer_units[0] // 2`` dimensions.
        reg_layers: L2 kernel-regularisation factor for the dense layer at the
            same position in ``layer_units``. A factor of 0, or a missing
            entry, leaves that layer unregularised.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[user_input, item_input]``
        and one sigmoid output.
    """
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    embedding_dim = layer_units[0] // 2
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding',
                               input_length=1)
    item_embedding = Embedding(input_dim=num_items, output_dim=embedding_dim, name='item_embedding',
                               input_length=1)

    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(user_embedding(user_input))
    item_latent = Flatten()(item_embedding(item_input))

    # The 0-th layer is the concatenation of embedding layers
    vector = keras.layers.Concatenate(axis=-1)([user_latent, item_latent])

    for i, unit in enumerate(layer_units):
        # Only attach a regulariser when a non-zero factor was requested, so
        # the default configuration builds exactly the plain dense stack.
        factor = reg_layers[i] if i < len(reg_layers) else 0
        regularizer = keras.regularizers.l2(factor) if factor else None
        layer = Dense(unit, activation='relu',
                      kernel_regularizer=regularizer,
                      name='layer_{}'.format(i))
        vector = layer(vector)

    # Final prediction layer
    prediction = Dense(1, activation='sigmoid',
                       kernel_initializer=keras.initializers.lecun_uniform(seed=None),
                       name='prediction')(vector)

    model = Model(inputs=[user_input, item_input], outputs=prediction)

    return model

# MLP with the default layer sizes, trained with the same optimiser settings
# as the GMF model.
mlp_model = build_mlp_model(num_users=num_users, num_items=num_items)
mlp_model.compile(
    optimizer=Adam(lr=learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [278]:
# Train the MLP; the last 20% of the instances are held out for validation.
mlp_model.fit(
    x=[np.array(users), np.array(items)],
    y=np.array(labels),
    batch_size=16,
    epochs=3,
    validation_split=0.2,
)

Train on 300000 samples, validate on 75000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f9bf9636cf8>

In [279]:
# Loss and accuracy of the MLP on the test instances.
mlp_model.evaluate(
    x=[np.array(users_test), np.array(items_test)],
    y=np.array(labels_test),
)



[0.46300480207824707, 0.796016]

## Neural matrix factorization model

In [286]:
def build_NeuMF_model(num_users, num_items, mf_dim=16, layer_units=[20]):
    """Assemble the Neural Matrix Factorization (NeuMF) network.

    Two branches share the same user/item id inputs but use separate
    embeddings: an MF branch (element-wise product of ``mf_dim``-sized
    embeddings) and an MLP branch (concatenated embeddings passed through
    ReLU dense layers). The branch outputs are concatenated and fed to a
    single sigmoid prediction unit.

    Args:
        num_users: number of rows of each user embedding table.
        num_items: number of rows of each item embedding table.
        mf_dim: embedding size of the MF branch.
        layer_units: widths of the MLP branch's dense layers, in order. Each
            MLP embedding has ``layer_units[0] // 2`` dimensions.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[user_input, item_input]``
        and one sigmoid output.
    """
    def latent_vector(id_tensor, table_rows, width, layer_name):
        # Embed a single id, then flatten away the length-1 sequence axis.
        table = Embedding(input_dim=table_rows, output_dim=width,
                          name=layer_name, input_length=1)
        return Flatten()(table(id_tensor))

    user_ids = Input(shape=(1,), dtype='int32', name='user_input')
    item_ids = Input(shape=(1,), dtype='int32', name='item_input')
    mlp_dim = layer_units[0] // 2

    # MF branch: element-wise product of the two embeddings.
    mf_user = latent_vector(user_ids, num_users, mf_dim, 'mf_user_embedding')
    mf_item = latent_vector(item_ids, num_items, mf_dim, 'mf_item_embedding')
    mf_branch = Multiply()([mf_user, mf_item])

    # MLP branch: concatenated embeddings followed by the dense stack.
    mlp_user = latent_vector(user_ids, num_users, mlp_dim, 'mlp_user_embedding')
    mlp_item = latent_vector(item_ids, num_items, mlp_dim, 'mlp_item_embedding')
    mlp_branch = keras.layers.Concatenate(axis=-1)([mlp_user, mlp_item])

    for depth, width in enumerate(layer_units):
        hidden = Dense(width, activation='relu', name='layer_{}'.format(depth))
        mlp_branch = hidden(mlp_branch)

    # Fuse both branches and predict.
    fused = keras.layers.Concatenate(axis=-1)([mf_branch, mlp_branch])

    output_layer = Dense(1, activation='sigmoid',
                         kernel_initializer=keras.initializers.lecun_uniform(seed=None),
                         name="prediction")
    score = output_layer(fused)

    return Model(inputs=[user_ids, item_ids], outputs=score)

# NeuMF with the default branch sizes, trained with the same optimiser
# settings as the other two models.
NeuMF_model = build_NeuMF_model(num_users=num_users, num_items=num_items)
NeuMF_model.compile(
    optimizer=Adam(lr=learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [287]:
# Train NeuMF; the last 20% of the instances are held out for validation.
NeuMF_model.fit(
    x=[np.array(users), np.array(items)],
    y=np.array(labels),
    batch_size=16,
    epochs=3,
    validation_split=0.2,
)

Train on 300000 samples, validate on 75000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f9be3f279b0>

In [288]:
# Loss and accuracy of NeuMF on the test instances.
NeuMF_model.evaluate(
    x=[np.array(users_test), np.array(items_test)],
    y=np.array(labels_test),
)



[0.5358014254379272, 0.74336]