# Neural Collaborative Filtering on MovieLens dataset
Neural Collaborative Filtering is a well known recommendation algorithm that generalizes the matrix factorization problem with multi-layer perceptron.

In [1]:
%load_ext autoreload
%autoreload 2

In [16]:
import sys
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, get_top_k_items, recall_at_k

print('System version: {}'.format(sys.version))
print('pandas version: {}'.format(pd.__version__))
print('tensor version: {}'.format(tf.__version__))


System version: 3.7.13 (default, Mar 29 2022, 02:18:16) 
[GCC 7.5.0]
pandas version: 1.3.5
tensor version: 2.7.3


In [3]:
top_k = 10
movielens_data_size = '100k'
epochs = 50
batch_size = 256
seed = 27

In [4]:
# 1. Download the dataset
df = movielens.load_pandas_df(size=movielens_data_size, header=['userID','itemID','rating','timestamp'])

INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|█████████████████████████████████████| 4.81k/4.81k [00:02<00:00, 1.81kKB/s]


In [5]:
# 2. Split the data using the Spark chronological splitter
train, test = python_chrono_split(df, 0.75)

test = test[test['userID'].isin(train['userID'].unique())]
test = test[test['itemID'].isin(train['itemID'].unique())]

train_file = "./train.csv"
test_file = "./test.csv"
train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)

# Generate an NCF dataset object from the data subsets
data = NCFDataset(train_file=train_file, test_file=test_file, seed=seed)

INFO:recommenders.models.ncf.dataset:Indexing ./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ./test.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file ./test_full.csv ...
100%|█████████████████████████████████████████| 943/943 [00:21<00:00, 44.82it/s]
INFO:recommenders.models.ncf.dataset:Indexing ./test_full.csv ...


In [6]:
# 3. Train the NCF model on the training data
# and get the top-k recommendations for testing data
model = NCF(n_users=data.n_users, n_items=data.n_items,
           model_type='NeuMF', n_factors=4,
           layer_sizes=[16,8,4], n_epochs=epochs, batch_size=batch_size,
           learning_rate=1e-3, verbose=10, seed=seed)

2022-06-22 15:15:06.059276: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-22 15:15:06.105662: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-22 15:15:06.130797: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-22 15:15:06.131062: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [7]:
with Timer() as train_time:
    model.fit(data)
    
print("Took {} seconds for training".format(train_time))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [3.81s]: train_loss = 0.262599 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [3.81s]: train_loss = 0.250173 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [3.83s]: train_loss = 0.242473 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [3.84s]: train_loss = 0.237200 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [3.84s]: train_loss = 0.234393 


Took 193.1260 seconds for training


In [13]:
with Timer() as test_time:
    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    print(item)
    
    for user in train.userID.unique():
        user = [user] * len(item)
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))
        
    all_predictions = pd.DataFrame(data={'userID': users, 'itemID':items, 'prediction':preds})
    
    merged = pd.merge(train, all_predictions, on=['userID', 'itemID'], how='outer')
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)
    
print('Took {} secs for prediction'.format(test_time))
    
    

[168, 172, 165, 156, 166, 196, 187, 14, 250, 127, 181, 117, 109, 1, 246, 257, 248, 50, 249, 253, 262, 93, 224, 124, 19, 123, 137, 146, 7, 235, 15, 245, 260, 24, 264, 126, 237, 13, 25, 121, 251, 236, 240, 118, 130, 65, 190, 47, 31, 28, 114, 39, 52, 238, 199, 183, 11, 69, 161, 95, 60, 83, 179, 22, 98, 64, 135, 163, 26, 202, 89, 8, 214, 182, 48, 99, 160, 175, 192, 180, 128, 143, 185, 68, 55, 204, 56, 96, 81, 79, 151, 212, 23, 70, 84, 94, 197, 191, 184, 134, 207, 145, 188, 186, 97, 85, 36, 144, 159, 17, 174, 252, 105, 148, 108, 147, 220, 106, 243, 122, 107, 104, 247, 120, 45, 268, 267, 259, 261, 263, 10, 150, 234, 92, 71, 42, 176, 91, 193, 217, 177, 216, 194, 73, 59, 133, 41, 195, 218, 170, 213, 157, 223, 27, 227, 80, 231, 67, 119, 200, 4, 215, 164, 2, 206, 77, 53, 136, 46, 40, 153, 269, 254, 115, 173, 211, 229, 155, 203, 62, 90, 219, 167, 230, 35, 162, 61, 265, 112, 57, 49, 30, 233, 131, 152, 82, 141, 72, 33, 158, 198, 225, 113, 21, 286, 258, 305, 307, 288, 312, 301, 306, 292, 303, 299, 3

Took 1.6633 secs for prediction


In [17]:
# 4. Evaluate how well NCF performs

eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=top_k)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=top_k)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=top_k)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=top_k)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.047500
NDCG:	0.197197
Precision@K:	0.179215
Recall@K:	0.100682


In [18]:
if is_jupyter():
    import papermill as pm
    import scrapbook as sb
    
    sb.glue('map', eval_map)
    sb.glue('ndcg', eval_ndcg)    
    sb.glue('precision', eval_precision)    
    sb.glue('recall', eval_recall)    
    sb.glue('train_time', train_time.interval)    
    sb.glue('test_time', test_time.interval)    