# Test the NCF module in the [cf_ec2](../cf_ec2) folder on the ml-1m dataset and save the best model (using the integrated module with its `compile` and `fit` methods)

#### 3/21/2020

In [1]:
import numpy as np 
import pandas as pd
import keras
from keras import Model
from keras.regularizers import l2
from keras.optimizers import (
    Adam,
    Adamax,
    Adagrad,
    SGD,
    RMSprop
)
from keras.layers import (
    Embedding, 
    Input,
    Flatten, 
    Multiply, 
    Concatenate,
    Dense
)

import sys
sys.path.append('../')
from cf_ec2 import (
    GMF,
    MLP,
    NCF,
    Data,
    evaluation,
    evaluation_grouped
)

Using TensorFlow backend.


## step 1: load the data

In [2]:
# Both rating files are headerless, tab-separated, and share the same four columns.
rating_columns = ['user', 'item', 'rating', 'event_ts']
train = pd.read_csv(
    '../data/ml-1m.train.rating',
    sep='\t',
    header=None,
    names=rating_columns,
)
test = pd.read_csv(
    '../data/ml-1m.test.rating',
    sep='\t',
    header=None,
    names=rating_columns,
)

In [3]:
# Sanity check: compare the number of distinct test users with the test frame's shape.
test.user.nunique(), test.shape

(6040, (6040, 4))

## step 2: prepare the data for NCF model training

In [4]:
# Wrap the train/test frames in the project's Data helper and tell it which
# columns hold the user, item, rating and timestamp.
dataset = Data(
    train=train,
    test=test,
    col_user='user',
    col_item='item',
    col_rating='rating',
    col_time='event_ts',
    # NOTE(review): presumably converts ratings to implicit 0/1 feedback — confirm in cf_ec2.Data
    binary=True,
    # NOTE(review): presumably the number of negatives sampled per positive for training — confirm
    n_neg=4,
    # NOTE(review): presumably the number of negatives sampled per user for testing — confirm
    n_neg_test=100
)
# Build the training inputs with negative sampling enabled, then the test inputs
# with grouping disabled.
dataset.prepTrainDNN(negSample=True)
dataset.prepTestDNN(group=False)

In [5]:
# Peek at the per-user interaction table prepared for training.
dataset.interaction_train.head(3)

Unnamed: 0,user,item_interacted,item_negative
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6..."
1,1,"{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...","{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15..."


#### prepare the test dataset: drop test rows whose item never appears in the training data

In [6]:
# Items that occur in the test data but never in the training data.
newItems = set(dataset.items_test) - set(dataset.items)

# Positions of the test rows that reference one of those unseen items.
idx2del = [
    idx
    for idx, item in enumerate(dataset.items_test)
    if item in newItems
]
# Set copy for O(1) membership tests; a list lookup here would make the
# filtering below quadratic in the number of test rows.
idx2del_lookup = set(idx2del)

length_test_original = len(dataset.users_test)
# Compute the surviving positions once and reuse them for all three parallel
# lists so that users, items and ratings stay aligned.
idx2keep = [
    idx
    for idx in range(length_test_original)
    if idx not in idx2del_lookup
]
dataset.users_test = [dataset.users_test[idx] for idx in idx2keep]
dataset.items_test = [dataset.items_test[idx] for idx in idx2keep]
dataset.ratings_test = [dataset.ratings_test[idx] for idx in idx2keep]

## step 3: create the model architecture and fit model with training data

In [7]:
# Model and training hyper-parameters.
# NOTE(review): n_users / n_items are hard-coded; they presumably must match
# the id ranges produced by `dataset` — confirm before changing the data.
n_users = 6040
n_items = 3704
n_factors_gmf = 32
layers_mlp = [64,32,16,8]
reg_gmf = 0.
reg_layers_mlp = [0.,0.,0.,0.]
learning_rate = 0.01
# Empty placeholders; none of them is passed to NCF in this cell.
# NOTE(review): `filepath_mlp_pretrain` used to be assigned twice in a row; the
# redundant line was dropped. If a separate GMF pre-train path was intended,
# add it explicitly.
flg_pretrain = ''
filepath = ''
filepath_mlp_pretrain = ''
num_epochs = 20
batch_size = 100

# Instantiate the NCF wrapper with the settings above and build its model.
ncf = NCF(
    n_users=n_users,
    n_items=n_items,
    n_factors_gmf=n_factors_gmf,
    layers_mlp=layers_mlp,
    reg_gmf=reg_gmf,
    reg_layers_mlp=reg_layers_mlp
)
ncf.create_model()

In [8]:
# Compile the model with the configured learning rate.
ncf.compile(learning_rate=learning_rate)

In [9]:
# Train the model. Checkpoints and the CSV log are written under ../metadata/ncf
# using a path relative to the notebook (like the data and module paths), so
# the run does not depend on one user's home directory and matches the
# location the checkpoint is later loaded from.
# The {epoch:02d} / {val_loss:.4f} placeholders in the checkpoint name are
# filled in for each saved checkpoint.
hist = ncf.fit(
    dataset=dataset,
    batch_size=batch_size,
    num_epochs=num_epochs,
    path_model_weights='../metadata/ncf/ncf-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5',
    path_csvlog='../metadata/ncf/ncf_log.csv'
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4970845 samples, validate on 610038 samples
Epoch 1/20
 - 256s - loss: 0.3351 - accuracy: 0.8498 - val_loss: 0.1762 - val_accuracy: 0.9292

Epoch 00001: val_loss improved from inf to 0.17617, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf/ncf-weights-improvement-01-0.1762.hdf5
Epoch 2/20
 - 259s - loss: 0.3078 - accuracy: 0.8639 - val_loss: 0.1714 - val_accuracy: 0.9320

Epoch 00002: val_loss improved from 0.17617 to 0.17138, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf/ncf-weights-improvement-02-0.1714.hdf5
Epoch 3/20
 - 262s - loss: 0.3003 - accuracy: 0.8681 - val_loss: 0.1678 - val_accuracy: 0.9337

Epoch 00003: val_loss improved from 0.17138 to 0.16779, saving model to /Users/xyin/Documents/work/projects/rec_utils/metadata/ncf/ncf-weights-improvement-03-0.1678.hdf5
Epoch 4/20
 - 269s - loss: 0.2958 - accuracy: 0.8707 - val_loss: 0.1226 - val_accuracy: 0.9529

Epoch 00004: val_loss improved from 0.16779 to 0.12255, s

#### double check the current state of the trained model

In [10]:
# Build the grouped evaluator around the in-memory model as it stands right
# after training, using the users and items stored on `dataset`.
evaluator = evaluation_grouped.metricsEval(
    model=ncf.model,
    users=dataset.users,
    items=dataset.items
)
# Generate the predictions; this step is slow (see the progress bar below).
evaluator.getRecs()

100%|██████████| 6040/6040 [03:08<00:00, 32.07it/s]


In [11]:
# Peek at the prediction table held by the evaluator.
evaluator.all_predictions.head(3)

Unnamed: 0,userID,itemID,prediction
0,0,0,0.889615
1,0,1,0.955038
2,0,2,0.871819


In [12]:
# Compute RMSE, AUC and log loss against the (filtered) test users, items and ratings.
rmse,auc,logloss = evaluator.getOverlapBasedMetrics(
    dataset.users_test,
    dataset.items_test,
    dataset.ratings_test
)

In [13]:
# Display the three metrics as a tuple: (rmse, auc, logloss).
rmse,auc,logloss

(0.22037640459148028, 0.8818099626756808, 0.16317447215406755)

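This shows that, after `fit`, the in-memory model holds the weights of the last epoch rather than those of the best checkpoint: its test log loss (0.1632) differs from the best `val_loss` (0.1226) reported in the training log.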

The same check can be done directly with Keras' `model.evaluate`, which here returns `[loss, accuracy]`:

In [15]:
# Evaluate the in-memory model on the test set with Keras directly.
test_inputs = [
    np.array(dataset.users_test),
    np.array(dataset.items_test),
]
test_labels = np.array(dataset.ratings_test)
scores = ncf.model.evaluate(x=test_inputs, y=test_labels, verbose=0)

In [16]:
# Display the values returned by evaluate().
scores

[0.1631744735293154, 0.9339696764945984]

In [18]:
# Labels for the values returned by evaluate(), in the same order.
ncf.model.metrics_names

['loss', 'accuracy']

#### try to load the parameters from the best model

In [20]:
# Overwrite the in-memory weights with the epoch-4 checkpoint (val_loss 0.1226),
# which this notebook treats as the best model.
ncf.model.load_weights('../metadata/ncf/ncf-weights-improvement-04-0.1226.hdf5')

In [21]:
# Re-run the Keras evaluation now that the checkpoint weights are loaded.
test_inputs = [
    np.array(dataset.users_test),
    np.array(dataset.items_test),
]
test_labels = np.array(dataset.ratings_test)
scores = ncf.model.evaluate(x=test_inputs, y=test_labels, verbose=0)

In [22]:
# Display the values returned by evaluate() for the checkpoint weights.
scores

[0.12255288464253568, 0.9529061913490295]

In [23]:
# Recompute the evaluator-based metrics with the checkpoint weights loaded.
evaluator = evaluation_grouped.metricsEval(model=ncf.model, users=dataset.users, items=dataset.items)
evaluator.getRecs()

# The test users, items and ratings are passed positionally, in this order.
test_triplet = (dataset.users_test, dataset.items_test, dataset.ratings_test)
rmse, auc, logloss = evaluator.getOverlapBasedMetrics(*test_triplet)
rmse, auc, logloss

100%|██████████| 6040/6040 [03:07<00:00, 32.22it/s]


(0.1876459585482166, 0.8684167502067479, 0.12255288494877845)

#### try to save/load the complete model

In [24]:
# Save the complete model (not just a weights checkpoint) to a single HDF5 file.
ncf.model.save('../metadata/ncf/ncf-best.hdf5')

In [25]:
# Reload the saved model from disk into a separate object.
model3 = keras.models.load_model('../metadata/ncf/ncf-best.hdf5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [26]:
# Evaluate the model reloaded from disk; the metrics should match the ones
# obtained from the in-memory model with the checkpoint weights.
evaluator = evaluation_grouped.metricsEval(model=model3, users=dataset.users, items=dataset.items)
evaluator.getRecs()

# The test users, items and ratings are passed positionally, in this order.
test_triplet = (dataset.users_test, dataset.items_test, dataset.ratings_test)
rmse, auc, logloss = evaluator.getOverlapBasedMetrics(*test_triplet)
rmse, auc, logloss

100%|██████████| 6040/6040 [03:06<00:00, 32.40it/s]


(0.1876459585482166, 0.8684167502067479, 0.12255288494877845)