# Test the NCF module in [cf_ec2](../cf_ec2) on the ml-1m dataset and save the best model (using the integrated module's `compile` and `fit` components, initialised from pretrained GMF and MLP models)

#### 3/21/2020

In [1]:
import numpy as np 
import pandas as pd
import keras
from keras import Model
from keras.regularizers import l2
from keras.optimizers import (
    Adam,
    Adamax,
    Adagrad,
    SGD,
    RMSprop
)
from keras.layers import (
    Embedding, 
    Input,
    Flatten, 
    Multiply, 
    Concatenate,
    Dense
)

import sys
sys.path.append('../')
from cf_ec2 import (
    GMF,
    MLP,
    NCF,
    Data,
    evaluation,
    evaluation_grouped
)

Using TensorFlow backend.


## step 1: load the data

In [2]:
# Both rating files are read as tab-separated, header-less tables with the same four columns.
read_opts = dict(sep='\t', header=None, names=['user', 'item', 'rating', 'event_ts'])
train = pd.read_csv('../data/ml-1m.train.rating', **read_opts)
test = pd.read_csv('../data/ml-1m.test.rating', **read_opts)

In [3]:
# Number of distinct users in the test frame, shown next to the frame's shape.
test.user.nunique(), test.shape

(6040, (6040, 4))

## step 2: prepare the data for NCF model training

In [4]:
# Column mapping and sampling options handed to cf_ec2.Data.
# NOTE(review): n_neg / n_neg_test presumably set how many negatives are sampled
# for train / test — confirm against the Data implementation.
data_options = dict(
    train=train,
    test=test,
    col_user='user',
    col_item='item',
    col_rating='rating',
    col_time='event_ts',
    binary=True,
    n_neg=4,
    n_neg_test=100,
)
dataset = Data(**data_options)

# Build the training inputs (with negative sampling switched on), then the
# ungrouped test inputs.
dataset.prepTrainDNN(negSample=True)
dataset.prepTestDNN(group=False)

In [5]:
# Peek at the first three rows of the training interaction table built by Data.
dataset.interaction_train.head(3)

Unnamed: 0,user,item_interacted,item_negative
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6..."
1,1,"{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...","{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15..."


#### prepare the test dataset: drop test rows whose item does not appear in the training items

In [6]:
# Remove every test row whose item is absent from dataset.items, keeping
# users_test / items_test / ratings_test aligned position by position.
newItems = set(dataset.items_test)-set(dataset.items)
idx2del = [
    idx
    for idx, item in enumerate(dataset.items_test)
    if item in newItems
]

# Membership tests against a list are linear; with three passes over the whole
# test set that becomes quadratic. A set makes each lookup constant-time.
idx2del_lookup = set(idx2del)

length_test_original = len(dataset.users_test)
# Compute the surviving positions once and reuse them for all three lists.
idx2keep = [
    idx for idx in range(length_test_original) if idx not in idx2del_lookup
]
dataset.users_test = [dataset.users_test[idx] for idx in idx2keep]
dataset.items_test = [dataset.items_test[idx] for idx in idx2keep]
dataset.ratings_test = [dataset.ratings_test[idx] for idx in idx2keep]

## step 3: create the model architecture and fit model with training data

In [7]:
# Load the previously saved GMF and MLP models; they are handed to
# ncf.load_pretrain_model further down.
pretrained_paths = {
    'gmf': '../metadata/gmf/gmf-best.hdf5',
    'mlp': '../metadata/mlp/mlp-best.hdf5',
}
gmf = keras.models.load_model(pretrained_paths['gmf'])
mlp = keras.models.load_model(pretrained_paths['mlp'])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [8]:
# --- model / training configuration ---
n_users = 6040                  # equals test.user.nunique() shown earlier in this notebook
n_items = 3704                  # presumably the number of distinct items — TODO confirm against dataset
n_factors_gmf = 32
layers_mlp = [64,32,16,8]
reg_gmf = 0.
reg_layers_mlp = [0.,0.,0.,0.]  # one entry per entry of layers_mlp
learning_rate = 0.01            # passed to ncf.compile
# NOTE(review): the four settings below are not referenced by any cell visible
# in this notebook; they are kept in case other code relies on them.
flg_pretrain = ''
filepath = ''
# The original cell assigned filepath_mlp_pretrain twice in a row; the first
# assignment was presumably meant for the GMF pretrain path.
filepath_gmf_pretrain = ''
filepath_mlp_pretrain = ''
num_epochs = 20                 # passed to ncf.fit
batch_size = 100                # passed to ncf.fit

# --- build the NCF model and initialise it from the pretrained GMF / MLP ---
ncf = NCF(
    n_users=n_users,
    n_items=n_items,
    n_factors_gmf=n_factors_gmf,
    layers_mlp=layers_mlp,
    reg_gmf=reg_gmf,
    reg_layers_mlp=reg_layers_mlp
)
ncf.create_model()
# Third argument is the number of entries in layers_mlp.
# NOTE(review): presumably the count of MLP layers whose weights are copied — confirm in cf_ec2.NCF.
ncf.load_pretrain_model(
    gmf,
    mlp,
    len(layers_mlp)
)

In [9]:
# Compile the NCF model with the learning rate chosen in the configuration cell.
ncf.compile(learning_rate=learning_rate)

In [10]:
# Train the compiled model. Outputs go to ../metadata/ncf2/ relative to the
# notebook — the same directory the later load_weights / save cells use —
# instead of a machine-specific absolute path, so the notebook runs on any checkout.
hist = ncf.fit(
    dataset=dataset,
    batch_size=batch_size,
    num_epochs=num_epochs,
    # {epoch:02d} and {val_loss:.4f} are left as literal placeholders; the
    # saved checkpoint names show them filled in per epoch.
    path_model_weights='../metadata/ncf2/ncf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5',
    path_csvlog='../metadata/ncf2/ncf_log.csv'
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4970845 samples, validate on 610038 samples
Epoch 1/20
 - 238s - loss: 0.2829 - accuracy: 0.8778 - val_loss: 0.1492 - val_accuracy: 0.9373

Epoch 00001: val_loss improved from inf to 0.14924, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf2/ncf-weights-improvement-01-0.1492.hdf5
Epoch 2/20
 - 230s - loss: 0.2830 - accuracy: 0.8801 - val_loss: 0.1617 - val_accuracy: 0.9321

Epoch 00002: val_loss did not improve from 0.14924
Epoch 3/20
 - 223s - loss: 0.2846 - accuracy: 0.8808 - val_loss: 0.1426 - val_accuracy: 0.9394

Epoch 00003: val_loss improved from 0.14924 to 0.14262, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf2/ncf-weights-improvement-03-0.1426.hdf5
Epoch 4/20
 - 3140s - loss: 0.2864 - accuracy: 0.8814 - val_loss: 0.1853 - val_accuracy: 0.9167

Epoch 00004: val_loss did not improve from 0.14262
Epoch 5/20
 - 553s - loss: 0.2877 - accuracy: 0.8818 - val_loss: 0.2796 - val_accuracy: 0.8675

Epoch 00005: val_loss did 

#### double check the current state of the trained model

In [11]:
# Wrap the in-memory model (as left by fit) in the grouped evaluator and
# generate its predictions.
model_after_fit = ncf.model
evaluator = evaluation_grouped.metricsEval(
    model=model_after_fit, users=dataset.users, items=dataset.items
)
evaluator.getRecs()

100%|██████████| 6040/6040 [02:50<00:00, 35.39it/s]


In [12]:
# First three rows of the prediction table produced by getRecs().
evaluator.all_predictions.head(3)

Unnamed: 0,userID,itemID,prediction
0,0,0,0.830965
1,0,1,0.978875
2,0,2,0.945869


In [13]:
# Compare the evaluator's predictions with the filtered test users / items / ratings.
test_triplet = (dataset.users_test, dataset.items_test, dataset.ratings_test)
rmse, auc, logloss = evaluator.getOverlapBasedMetrics(*test_triplet)

In [14]:
# Display the three overlap-based metrics computed in the previous cell.
rmse,auc,logloss

(0.20498470224471785, 0.8852133304743248, 0.13960937903580425)

This shows that the in-memory model is still in the state of the last training epoch, not the best checkpoint.

The same loss can also be obtained directly with Keras' `model.evaluate`:

In [15]:
# Evaluate the in-memory model on the filtered test set with Keras directly.
x_test = [np.array(dataset.users_test), np.array(dataset.items_test)]
y_test = np.array(dataset.ratings_test)
scores = ncf.model.evaluate(x=x_test, y=y_test, verbose=0)

In [16]:
# Values returned by model.evaluate; their order is given by ncf.model.metrics_names.
scores

[0.13960937837659357, 0.9420790076255798]

In [17]:
# Labels for the entries of `scores`.
ncf.model.metrics_names

['loss', 'accuracy']

#### load the weights from the best checkpoint

In [18]:
# The file name follows the checkpoint pattern used in the fit call
# (...-{epoch:02d}-{val_loss:.4f}.hdf5): epoch 13, val_loss 0.1296.
ncf.model.load_weights('../metadata/ncf2/ncf-weights-improvement-13-0.1296.hdf5')

In [19]:
# Re-run the Keras evaluation now that the checkpoint weights are loaded.
x_test = [np.array(dataset.users_test), np.array(dataset.items_test)]
y_test = np.array(dataset.ratings_test)
scores = ncf.model.evaluate(x=x_test, y=y_test, verbose=0)

In [20]:
# Values returned by model.evaluate after loading the checkpoint weights.
scores

[0.12962980293427456, 0.9457705616950989]

In [21]:
# Rebuild the evaluator around the model carrying the checkpoint weights,
# regenerate predictions, and recompute the overlap-based metrics.
model_with_best_weights = ncf.model
evaluator = evaluation_grouped.metricsEval(
    model=model_with_best_weights, users=dataset.users, items=dataset.items
)
evaluator.getRecs()

test_triplet = (dataset.users_test, dataset.items_test, dataset.ratings_test)
rmse, auc, logloss = evaluator.getOverlapBasedMetrics(*test_triplet)
rmse, auc, logloss

100%|██████████| 6040/6040 [02:52<00:00, 35.02it/s]


(0.19726011174002897, 0.8870137908313572, 0.1296010063946608)

#### try to save/load the complete model

In [10]:
# Save the whole model (not only its weights) so it can be restored with keras.models.load_model.
ncf.model.save('../metadata/ncf2/ncf-best.hdf5')

In [12]:
# Reload the complete model from the file written by the save cell.
model3 = keras.models.load_model('../metadata/ncf2/ncf-best.hdf5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [13]:
# Check that the reloaded model reproduces the metrics of the in-memory one.
evaluator = evaluation_grouped.metricsEval(
    model=model3, users=dataset.users, items=dataset.items
)
evaluator.getRecs()

test_triplet = (dataset.users_test, dataset.items_test, dataset.ratings_test)
rmse, auc, logloss = evaluator.getOverlapBasedMetrics(*test_triplet)
rmse, auc, logloss

100%|██████████| 6040/6040 [03:02<00:00, 33.12it/s]


(0.19752335551377526, 0.8868389141123876, 0.13001185821060054)

In [14]:
# Rank-based metrics for the reloaded model on the filtered test set.
test_triplet = (dataset.users_test, dataset.items_test, dataset.ratings_test)
recall, precision, ndcg, map2 = evaluator.getRankBasedMetrics(*test_triplet)
recall, precision, ndcg, map2

(0.02616760516727393,
 0.0026167605167273927,
 0.011779148923791428,
 0.007521740616308355)

## Result: these metrics are not as good as the numbers reported in the literature