# Test the NCF module under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model (using integrated modules with compile and fit components, with gmf and mlp pretrain)

#### 4/20/2020, test with original paper's dataset

In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import (
    Adam,
    Adamax,
    Adagrad,
    SGD,
    RMSprop
)
from tensorflow.keras.layers import (
    Embedding, 
    Input,
    Flatten, 
    Multiply, 
    Concatenate,
    Dense
)

import sys
sys.path.append('../')
from cf_ec2 import (
    GMF,
    MLP,
    NCF,
    Data,
    evaluation,
    evaluation_grouped
)

#### check original paper's dataset

In [2]:
!tree ../metadata/original_dataset/

[01;34m../metadata/original_dataset/[00m
├── item_input
├── labels
├── testNegatives
├── testRatings
├── train
└── user_input

0 directories, 6 files


In [3]:
import pickle

In [4]:
with open('../metadata/original_dataset/train','rb') as fp:
    train = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/testRatings','rb') as fp:
    testRatings = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/testNegatives','rb') as fp:
    testNegatives = pickle.load(fp, encoding='latin1')

In [5]:
type(train), type(testRatings), type(testNegatives)

(scipy.sparse.dok.dok_matrix, list, list)

In [6]:
testRatings[:5]

[[0, 25], [1, 133], [2, 207], [3, 208], [4, 222]]

In [7]:
len(testRatings), len(testNegatives)

(6040, 6040)

In [8]:
with open('../metadata/original_dataset/user_input','rb') as fp:
    user_input = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/item_input','rb') as fp:
    item_input = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/labels','rb') as fp:
    labels = pickle.load(fp, encoding='latin1')    

In [9]:
type(user_input), type(item_input), type(labels)

(list, list, list)

#### reformat the test dataset

In [10]:
for idx,value in enumerate(testRatings):
    if idx<5:
        print('{}: {} //{}'.format(idx,value, testNegatives[idx][:5]))

0: [0, 25] //[1064, 174, 2791, 3373, 269]
1: [1, 133] //[1072, 3154, 3368, 3644, 549]
2: [2, 207] //[2216, 209, 2347, 3, 1652]
3: [3, 208] //[3023, 1489, 1916, 1706, 1221]
4: [4, 222] //[1794, 3535, 108, 593, 466]


In [11]:
%%time
user_test, item_test, labels_test = [],[],[]
for idx in range(len(testRatings)):
    user_test.extend(
        [testRatings[idx][0]]*(len(testNegatives[idx])+1)
    )
    item_test.append(testRatings[idx][1])
    item_test.extend(testNegatives[idx])
    labels_test.append(1)
    labels_test.extend([0]*len(testNegatives[idx]))

CPU times: user 28.3 ms, sys: 16 ms, total: 44.4 ms
Wall time: 43.2 ms


In [12]:
len(user_test)

604000

## step 1: create the model architecture and fit model with dataset from original paper

In [13]:
num_users, num_items = train.shape
num_users, num_items

(6040, 3706)

In [14]:
n_users = num_users
n_items = num_items
n_factors_gmf = 8
layers_mlp = [64,32,16,8]
# n_factors_gmf = 4
# layers_mlp = [16,8,4]
reg_gmf = 0.
reg_layers_mlp = [0.,0.,0.,0.]
learning_rate = 0.001
flg_pretrain = ''
filepath = ''
filepath_mlp_pretrain = ''
filepath_mlp_pretrain = ''
num_epochs = 20
batch_size = 256

ncf = NCF(
    n_users=n_users,
    n_items=n_items,
    n_factors_gmf=n_factors_gmf,
    layers_mlp=layers_mlp,
    reg_gmf=reg_gmf,
    reg_layers_mlp=reg_layers_mlp
)
ncf.create_model()

In [15]:
ncf.compile(learning_rate=learning_rate)

In [16]:
ncf.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_mlp_User (Embedding)  (None, 1, 32)        193280      user_input[0][0]                 
__________________________________________________________________________________________________
embedding_mlp_Item (Embedding)  (None, 1, 32)        118592      item_input[0][0]                 
______________________________________________________________________________________________

In [18]:
# def fit(self, dataset, 
batch_size=256
num_epochs=20
path_model_weights='/Users/xyin/Documents/work/projects/rec_utils/metadata/ncf4/ncf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5'
path_csvlog='/Users/xyin/Documents/work/projects/rec_utils/metadata/ncf4/ncf_log.csv'

## create the callback metrics
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath= path_model_weights, 
    monitor='val_loss',
    verbose=1, 
    save_best_only=True
)
csvlog = tf.keras.callbacks.CSVLogger(
    filename=path_csvlog, 
    separator=',', 
    append=False
)
earlystop = tf.keras.callbacks.EarlyStopping(patience=12)
lrreduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", 
    factor=0.3, 
    patience=4, 
    verbose=1
)  
metrics2 = evaluation_grouped.metricsCallback(batch_size=batch_size,log_steps=100)      
## fit the model
hist = ncf.model.fit(
    x = [
        np.array(user_input),
        np.array(item_input)
    ],
    y = np.array(labels),
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=2,
    shuffle=True,
    callbacks=[metrics2,checkpoint,csvlog,earlystop,lrreduce],
    validation_data=(
        [
            np.array(user_test),
            np.array(item_test)
        ],
        np.array(labels_test)
    )
)  

Train on 4970845 samples, validate on 604000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.16958, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf4/ncf-weights-improvement-01-0.1696.hdf5
4970845/4970845 - 57s - loss: 0.3219 - accuracy: 0.8570 - val_loss: 0.1696 - val_accuracy: 0.9391
Epoch 2/20

Epoch 00002: val_loss improved from 0.16958 to 0.15503, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf4/ncf-weights-improvement-02-0.1550.hdf5
4970845/4970845 - 56s - loss: 0.2744 - accuracy: 0.8802 - val_loss: 0.1550 - val_accuracy: 0.9410
Epoch 3/20

Epoch 00003: val_loss improved from 0.15503 to 0.15225, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf4/ncf-weights-improvement-03-0.1523.hdf5
4970845/4970845 - 53s - loss: 0.2627 - accuracy: 0.8858 - val_loss: 0.1523 - val_accuracy: 0.9412
Epoch 4/20

Epoch 00004: val_loss improved from 0.15225 to 0.14039, saving model to /Users/xyin/Documents/wo

#### try to load the parameters from the best model

In [19]:
ncf.model.load_weights('../metadata/ncf4/ncf-weights-improvement-04-0.1404.hdf5')

In [21]:
%%time 

scores = ncf.model.evaluate(
    x = [
        np.array(user_test),
        np.array(item_test)
    ],
    y = np.array(labels_test),
    verbose=0
)

CPU times: user 16.8 s, sys: 2.44 s, total: 19.2 s
Wall time: 12.3 s


In [22]:
scores

[0.14038968149246955, 0.945404]

In [23]:
ncf.model.save('../metadata/ncf4/ncf-best.hdf5')

In [14]:
model4 = keras.models.load_model('../metadata/ncf4/ncf-best.hdf5')

In [26]:
%%time 

evaluator = evaluation_grouped.metricsEval(
    model=model4,
    users=user_input,
    items=item_input
)
evaluator.getRecs()
rmse,auc,logloss = evaluator.getOverlapBasedMetrics(
    user_test,
    item_test,
    labels_test
)
rmse,auc,logloss

100%|██████████| 6040/6040 [09:03<00:00, 11.12it/s]


CPU times: user 11min 20s, sys: 1min 22s, total: 12min 43s
Wall time: 9min 29s


(0.2016294613947125, 0.8940297085975318, 0.1403896816127934)

In [28]:
%%time

recall,precision,ndcg,map2 = evaluator.getRankBasedMetrics(
    user_test,
    item_test,
    labels_test
)
recall,precision,ndcg,map2

CPU times: user 50.7 s, sys: 8.58 s, total: 59.3 s
Wall time: 1min 1s


(0.029635761589403974,
 0.002963576158940397,
 0.013684980463253551,
 0.008958596131609377)

#### metrics are still one order lower than reported numbers in the paper. Although loss function value is close to reported one

```sh
## original paper's model output
(base) xyin self/neural_collaborative_filtering [09:14:12]$ docker run --volume=$(pwd):/home ncf-keras-theano python NeuMF.py --dataset ml-1m --epochs 20 --batch_size 256 --num_factors 8 --layers [64,32,16,8] --reg_mf 0 --reg_layers [0,0,0,0] --num_neg 4 --lr 0.001 --learner adam --verbose 1 --out 1
Using Theano backend.
NeuMF arguments: Namespace(batch_size=256, dataset='ml-1m', epochs=20, layers='[64,32,16,8]', learner='adam', lr=0.001, mf_pretrain='', mlp_pretrain='', num_factors=8, num_neg=4, out=1, path='Data/', reg_layers='[0,0,0,0]', reg_mf=0.0, verbose=1)
Load data done [20.3 s]. #user=6040, #item=3706, #train=994169, #test=6040
Init: HR = 0.0957, NDCG = 0.0448
Iteration 0 [106.5 s]: HR = 0.6000, NDCG = 0.3403, loss = 0.3170 [4.2 s]
Iteration 1 [78.4 s]: HR = 0.6421, NDCG = 0.3704, loss = 0.2743 [3.7 s]
Iteration 2 [79.2 s]: HR = 0.6591, NDCG = 0.3832, loss = 0.2635 [3.6 s]
Iteration 3 [69.0 s]: HR = 0.6608, NDCG = 0.3865, loss = 0.2579 [3.6 s]
Iteration 4 [69.0 s]: HR = 0.6689, NDCG = 0.3948, loss = 0.2537 [3.6 s]
Iteration 5 [69.8 s]: HR = 0.6685, NDCG = 0.3929, loss = 0.2507 [3.6 s]
Iteration 6 [68.5 s]: HR = 0.6785, NDCG = 0.4003, loss = 0.2480 [3.6 s]
Iteration 7 [81.2 s]: HR = 0.6743, NDCG = 0.3992, loss = 0.2458 [3.6 s]
Iteration 8 [74.9 s]: HR = 0.6765, NDCG = 0.4013, loss = 0.2435 [3.6 s]
Iteration 9 [75.7 s]: HR = 0.6791, NDCG = 0.4042, loss = 0.2414 [3.7 s]
Iteration 10 [76.2 s]: HR = 0.6755, NDCG = 0.4040, loss = 0.2399 [3.9 s]
Iteration 11 [434.2 s]: HR = 0.6849, NDCG = 0.4100, loss = 0.2379 [4.0 s]
Iteration 12 [79.7 s]: HR = 0.6815, NDCG = 0.4098, loss = 0.2363 [3.6 s]
Iteration 13 [78.7 s]: HR = 0.6781, NDCG = 0.4059, loss = 0.2352 [3.6 s]
Iteration 14 [69.5 s]: HR = 0.6854, NDCG = 0.4078, loss = 0.2338 [3.6 s]
Iteration 15 [67.5 s]: HR = 0.6815, NDCG = 0.4110, loss = 0.2328 [3.6 s]
Iteration 16 [68.4 s]: HR = 0.6848, NDCG = 0.4089, loss = 0.2316 [3.6 s]
Iteration 17 [67.2 s]: HR = 0.6831, NDCG = 0.4098, loss = 0.2305 [3.6 s]
Iteration 18 [78.4 s]: HR = 0.6849, NDCG = 0.4088, loss = 0.2300 [3.6 s]
Iteration 19 [75.2 s]: HR = 0.6846, NDCG = 0.4076, loss = 0.2287 [3.6 s]
End. Best Iteration 14:  HR = 0.6854, NDCG = 0.4078.
The best NeuMF model is saved to Pretrain/ml-1m_NeuMF_8_[64,32,16,8]_1587302095.h5
```

#### probably our defined evluation modules have issues???

In [36]:
evaluator.all_predictions.head(15)

Unnamed: 0,userID,itemID,prediction
0,0,0,0.883521
1,0,1,0.731152
2,0,2,0.721472
3,0,3,0.819745
4,0,4,0.836016
5,0,5,0.933966
6,0,6,0.117248
7,0,7,0.831106
8,0,8,0.874568
9,0,9,0.967178


## step 2: borrow the original paper's evaluation logic

#### get topK predictions for each user

In [41]:
%%time

k=10
topKItems = evaluator.all_predictions.groupby('userID',as_index=False)\
    .apply(lambda items: items.nlargest(k,'prediction'))\
        .reset_index(drop=True)
## append rank
topKItems['rnk'] = topKItems.groupby('userID',sort=False).cumcount()+1

CPU times: user 10.3 s, sys: 958 ms, total: 11.2 s
Wall time: 11.3 s


In [42]:
topKItems.head(22)

Unnamed: 0,userID,itemID,prediction,rnk
0,0,26,0.990228,1
1,0,40,0.988341,2
2,0,167,0.974924,3
3,0,390,0.974696,4
4,0,381,0.973926,5
5,0,104,0.970118,6
6,0,280,0.968304,7
7,0,9,0.967178,8
8,0,124,0.966618,9
9,0,22,0.964157,10


#### check hit rate

In [45]:
user_test[:5], item_test[:5], labels_test[:5]

([0, 0, 0, 0, 0], [25, 1064, 174, 2791, 3373], [1, 0, 0, 0, 0])

In [47]:
rating_true = pd.DataFrame({
    'userID':user_test,
    'itemID':item_test,
    'rating':labels_test
})
rating_true = rating_true[rating_true.rating>0].reset_index(drop=True)
rating_true.head(3)

Unnamed: 0,userID,itemID,rating
0,0,25,1
1,1,133,1
2,2,207,1


In [48]:
rating_pred = topKItems.loc[:,topKItems.columns[:3]].copy()
rating_pred.head(3)

Unnamed: 0,userID,itemID,prediction
0,0,26,0.990228
1,0,40,0.988341
2,0,167,0.974924


In [63]:
%%time

overlap = pd.merge(
    rating_true,
    rating_pred,
    on=['userID','itemID']
)

CPU times: user 11.2 ms, sys: 3.15 ms, total: 14.3 ms
Wall time: 12.7 ms


In [67]:
overlap.shape[0]/n_users

0.029635761589403974

#### the original paper didn't use allUser-allItem combinations as the basis of hitRate and NDCG evaluation. Instead, it uses the positive sample in test and 99 negative samples as the basis. Therefore, the hit rate and NDCG are all INFLATED!!
#### let's reproduce the author's logic here

#### get prediction value for the test dataset (1 positive plus 99 negative)

In [15]:
testRatings[0]

[0, 25]

In [16]:
%%time

import heapq
idx = 0

rating = testRatings[idx]
items = testNegatives[idx]
u = rating[0]
gtItem = rating[1]
items.append(gtItem)
# Get prediction scores
map_item_score = {}
users = np.full(len(items), u, dtype = 'int32')
predictions = model4.predict([users, np.array(items)], 
                             batch_size=100, verbose=0)
for i in range(len(items)):
    item = items[i]
    map_item_score[item] = predictions[i]
items.pop()

# Evaluate top rank list
ranklist = heapq.nlargest(10, map_item_score, key=map_item_score.get)
ranklist

CPU times: user 111 ms, sys: 5.99 ms, total: 117 ms
Wall time: 115 ms


[128, 25, 174, 273, 464, 175, 1064, 1182, 487, 1331]

In [17]:
len(testRatings)

6040

In [27]:
%%time

from tqdm.notebook import trange

## create placeholders for user, item, pred
list_users, list_items, list_preds = [], [], []

## get predictions for each user-item pair
for idx in trange(len(testRatings)):
    user = user_test[idx*100:(idx+1)*100]
    item = item_test[idx:idx+100]
    list_users.extend(user)
    list_items.extend(item)
    list_preds.extend(
        model4.predict(
            x=[
                np.array(user),
                np.array(item)
            ]
        ).flatten()
    )
## create a pandas dataframe
all_predictions_test = pd.DataFrame(data={
    'userID': list_users,
    'itemID': list_items,
    'prediction': list_preds
})

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


CPU times: user 2min 19s, sys: 6.05 s, total: 2min 25s
Wall time: 2min 16s


In [28]:
all_predictions_test.head(3)

Unnamed: 0,userID,itemID,prediction
0,0,25,0.787151
1,0,1064,0.399748
2,0,174,0.732144


In [29]:
%%time

k=10
topKItems_test = all_predictions_test.groupby('userID',as_index=False)\
    .apply(lambda items: items.nlargest(k,'prediction'))\
        .reset_index(drop=True)
## append rank
topKItems_test['rnk'] = topKItems_test.groupby('userID',sort=False).cumcount()+1

CPU times: user 9.64 s, sys: 171 ms, total: 9.81 s
Wall time: 9.76 s


In [30]:
topKItems_test.head(3)

Unnamed: 0,userID,itemID,prediction,rnk
0,0,128,0.95447,1
1,0,25,0.787151,2
2,0,174,0.732144,3


In [43]:
rating_true = pd.DataFrame({
    'userID':user_test,
    'itemID':item_test,
    'rating':labels_test
})
rating_true = rating_true[rating_true.rating>0].reset_index(drop=True)
rating_true.head(3)

Unnamed: 0,userID,itemID,rating
0,0,25,1
1,1,133,1
2,2,207,1


In [44]:
rating_pred = topKItems_test.loc[:,topKItems_test.columns[:3]].copy()
rating_pred.head(3)

Unnamed: 0,userID,itemID,prediction
0,0,128,0.95447
1,0,25,0.787151
2,0,174,0.732144


In [48]:
overlap = pd.merge(
    rating_true,
    rating_pred,
    on=['userID','itemID']
)
(overlap.rating/10).sum()/len(testRatings) #### still much lower than reported number!!

0.0014238410596026487

#### try the paper's code

In [51]:
import math
import heapq # for retrieval topK
import multiprocessing
import numpy as np
from time import time
#from numba import jit, autojit

# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None

def evaluate_model(model, testRatings, testNegatives, K, num_thread):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation
    Return: score of each test rating.
    """
    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K
        
    hits, ndcgs = [],[]
    if(num_thread > 1): # Multi-thread
        pool = multiprocessing.Pool(processes=num_thread)
        res = pool.map(eval_one_rating, range(len(_testRatings)))
        pool.close()
        pool.join()
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (hits, ndcgs)
    # Single thread
    for idx in trange(len(_testRatings)):
        (hr,ndcg) = eval_one_rating(idx)
        hits.append(hr)
        ndcgs.append(ndcg)      
    return (hits, ndcgs)

def eval_one_rating(idx):
    rating = _testRatings[idx]
    items = _testNegatives[idx]
    u = rating[0]
    gtItem = rating[1]
    items.append(gtItem)
    # Get prediction scores
    map_item_score = {}
    users = np.full(len(items), u, dtype = 'int32')
    predictions = _model.predict([users, np.array(items)], 
                                 batch_size=100, verbose=0)
    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = predictions[i]
    items.pop()
    
    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

In [52]:
%%time

hrs,ndcgs = evaluate_model(model4, testRatings, testNegatives, 10, 1)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


CPU times: user 1min 49s, sys: 3.03 s, total: 1min 52s
Wall time: 1min 49s


In [53]:
hrs[:10]

[1, 1, 1, 1, 0, 1, 1, 1, 1, 0]

In [54]:
ndcgs[:10]

[0.6309297535714574,
 0.33333333333333337,
 0.33333333333333337,
 0.6309297535714574,
 0,
 1.0,
 0.5,
 1.0,
 0.2890648263178878,
 0]

In [56]:
hr, ndcg = np.array(hrs).mean(), np.array(ndcgs).mean()

In [57]:
hr

0.6504966887417218

In [58]:
ndcg

0.37691241349812515