# Test the NCF modules in the [cf_ec2](../cf_ec2) folder on the ml-1m dataset and save the best model

In [1]:
import numpy as np 
import pandas as pd
import keras
from keras import Model
from keras.regularizers import l2
from keras.optimizers import (
    Adam,
    Adamax,
    Adagrad,
    SGD,
    RMSprop
)
from keras.layers import (
    Embedding, 
    Input,
    Flatten, 
    Multiply, 
    Concatenate,
    Dense
)

import os
import sys
sys.path.append('../')
from cf_ec2 import (
    GMF,
    MLP,
    NCF,
    Data,
    evaluation
)

Using TensorFlow backend.


## step 1: load the data

In [2]:
# Both rating files are tab-separated, have no header row, and share the same
# four columns, so the common reader options are spelled out once.
rating_columns = ['user','item','rating','event_ts']
reader_options = dict(sep='\t', header=None, names=rating_columns)
train = pd.read_csv('../data/ml-1m.train.rating', **reader_options)
test = pd.read_csv('../data/ml-1m.test.rating', **reader_options)

In [3]:
# Preview the first three rows of the training ratings.
train.head(3)

Unnamed: 0,user,item,rating,event_ts
0,0,32,4,978824330
1,0,34,4,978824330
2,0,4,5,978824291


In [4]:
# Preview the first three rows of the test ratings.
test.head(3)

Unnamed: 0,user,item,rating,event_ts
0,0,25,5,978824351
1,1,133,3,978300174
2,2,207,4,978298504


In [5]:
# Number of distinct test users next to the test frame's shape.
test.user.nunique(), test.shape

(6040, (6040, 4))

## step 2: prepare the data for ncf model training

In [6]:
# Collect the Data helper's keyword arguments in one mapping so the column
# names and sampling settings can be read at a glance.
# NOTE(review): n_neg / n_neg_test presumably set how many negatives are drawn
# per positive for train / test -- confirm against cf_ec2.Data.
data_options = dict(
    train=train,
    test=test,
    col_user='user',
    col_item='item',
    col_rating='rating',
    col_time='event_ts',
    binary=True,
    n_neg=4,
    n_neg_test=100,
)
dataset = Data(**data_options)

# Run the three preparation steps in the same order as before.
for prepare_step in (
    dataset.prepTrainDNN,
    dataset.prepTestDNN,
    dataset.negativeSampling,
):
    prepare_step()

In [7]:
# Number of prepared training samples next to the shape of the training frame.
len(dataset.users),train.shape

(4970845, (994169, 6))

In [8]:
# Number of prepared test samples next to the shape of the test frame.
len(dataset.users_test),test.shape

(610040, (6040, 6))

In [9]:
# Distinct users in the training and test frames.
train.user.nunique(), test.user.nunique()

(6040, 6040)

In [10]:
# Distinct items in the training and test frames.
train.item.nunique(), test.item.nunique()

(3704, 1921)

In [11]:
# Preview the first three rows of dataset.interaction_train.
dataset.interaction_train.head(3)

Unnamed: 0,user,item_interacted,item_negative
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6..."
1,1,"{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...","{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15..."


In [12]:
# Distinct users and items among the prepared training samples.
len(set(dataset.users)), len(set(dataset.items))

(6040, 3704)

In [13]:
# Distinct users and items among the prepared test samples.
len(set(dataset.users_test)), len(set(dataset.items_test))

(6040, 3706)

#### prepare the test dataset: drop test samples whose items never appear in training

In [14]:
# Remove every test sample whose item does not occur among the training items.
# The three parallel test sequences (users, items, ratings) are filtered with
# the same index list so they stay aligned.
newItems = set(dataset.items_test) - set(dataset.items)
length_test_original = len(dataset.users_test)

# Positions of the test samples that reference an item unseen in training.
idx2del = [
    idx
    for idx, item in enumerate(dataset.items_test)
    if item in newItems
]

# Use a set for the membership test: looking each index up in the list would
# scan the whole list for every one of the test positions.
idx2del_lookup = set(idx2del)
idx2keep = [
    idx
    for idx in range(length_test_original)
    if idx not in idx2del_lookup
]

dataset.users_test = [dataset.users_test[idx] for idx in idx2keep]
dataset.items_test = [dataset.items_test[idx] for idx in idx2keep]
dataset.ratings_test = [dataset.ratings_test[idx] for idx in idx2keep]

# Sanity check: exactly the flagged positions were dropped.
assert len(dataset.users_test) == length_test_original - len(idx2del)

## step 3: create the model architecture

In [15]:
#### hyper-parameters and configuration
# NOTE(review): n_users / n_items are hard-coded to the distinct user and item
# counts of the prepared training samples (6040 and 3704); update them if the
# data changes.
n_users = 6040
n_items = 3704
n_factors_gmf = 32
layers_mlp = [64,32,16,8]
reg_gmf = 0.
reg_layers_mlp = [0.,0.,0.,0.]
learning_rate = 0.01
flg_pretrain = ''
filepath = ''
filepath_gmf_pretrain = ''
filepath_mlp_pretrain = ''
num_epochs = 20
batch_size = 100

#### output locations for the best checkpoint and the per-epoch log
checkpoint_path = '../metadata/ncf/ncf_model_best'
log_path = '../metadata/ncf/ncf_log.csv'
# Create the output directories up front; otherwise the callbacks fail when
# they first try to write on a fresh checkout.
for output_path in (checkpoint_path, log_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

#### build the model
ncf = NCF(
    n_users=n_users,
    n_items=n_items,
    n_factors_gmf=n_factors_gmf,
    layers_mlp=layers_mlp,
    reg_gmf=reg_gmf,
    reg_layers_mlp=reg_layers_mlp
)
model = ncf.create_model()
#### compile the model
model.compile(
    optimizer=Adam(lr=learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
#### create the callback metrics
# Keep only the best model seen so far (Keras monitors val_loss by default).
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_best_only=True
)
# Write one row of metrics per epoch, overwriting any previous log.
csvlog = keras.callbacks.CSVLogger(
    log_path,
    separator=',',
    append=False
)
# Stop after 12 epochs without improvement.
earlystop = keras.callbacks.EarlyStopping(patience=12)
# Multiply the learning rate by 0.3 after 4 epochs without val_loss improvement.
lrreduce = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.3,
    patience=4,
    verbose=1
)

W1210 12:22:32.945915 4402009536 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1210 12:22:32.971755 4402009536 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4185: The name tf.truncated_normal is deprecated. Please use tf.random.truncated_normal instead.

W1210 12:22:33.030456 4402009536 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1210 12:22:33.069296 4402009536 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1210 12:22:33.08148

## step 4: train the model

In [16]:
#### train
# Convert the prepared sequences to arrays once, under descriptive names.
train_inputs = [np.array(dataset.users), np.array(dataset.items)]
train_labels = np.array(dataset.ratings)
val_inputs = [np.array(dataset.users_test), np.array(dataset.items_test)]
val_labels = np.array(dataset.ratings_test)

# NOTE(review): the test split is used as validation data, so checkpoint
# selection and early stopping are tuned on it; any later evaluation on the
# same split will be optimistic.
hist = model.fit(
    x=train_inputs,
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=2,
    shuffle=True,
    callbacks=[checkpoint, csvlog, earlystop, lrreduce],
    # Keras documents validation_data as a tuple (x_val, y_val); a list is
    # not handled consistently across Keras versions.
    validation_data=(val_inputs, val_labels)
)

Train on 4970845 samples, validate on 610038 samples
Epoch 1/20
 - 236s - loss: 0.3471 - acc: 0.8410 - val_loss: 0.1914 - val_acc: 0.9290

Epoch 00001: val_loss improved from inf to 0.19142, saving model to ../metadata/ncf/ncf_model_best
Epoch 2/20
 - 227s - loss: 0.3166 - acc: 0.8582 - val_loss: 0.2030 - val_acc: 0.9180

Epoch 00002: val_loss did not improve from 0.19142
Epoch 3/20
 - 235s - loss: 0.3121 - acc: 0.8618 - val_loss: 0.1759 - val_acc: 0.9246

Epoch 00003: val_loss improved from 0.19142 to 0.17587, saving model to ../metadata/ncf/ncf_model_best
Epoch 4/20
 - 229s - loss: 0.3064 - acc: 0.8668 - val_loss: 0.1885 - val_acc: 0.9198

Epoch 00004: val_loss did not improve from 0.17587
Epoch 5/20
 - 245s - loss: 0.3054 - acc: 0.8688 - val_loss: 0.2051 - val_acc: 0.9117

Epoch 00005: val_loss did not improve from 0.17587
Epoch 6/20
 - 249s - loss: 0.3056 - acc: 0.8704 - val_loss: 0.1262 - val_acc: 0.9487

Epoch 00006: val_loss improved from 0.17587 to 0.12620, saving model to ../m

In [26]:
# Per-epoch metric history recorded on the object returned by model.fit.
hist.history

{'val_loss': [0.19141636139927704,
  0.20298453419919416,
  0.17587084039748271,
  0.18854073413501876,
  0.20513475767298153,
  0.1262007889407188,
  0.15794206839426292,
  0.20882419177667863,
  0.16121499948453813,
  0.2056902889382691,
  0.15903311647938065,
  0.15154774740219953,
  0.1601451641549566,
  0.17826030145408625,
  0.16404884681243623,
  0.16573054983249905,
  0.16591598149803244,
  0.15554556764649174],
 'val_acc': [0.929022456397194,
  0.9179887170655134,
  0.9246227296610686,
  0.919778769265511,
  0.911733369180274,
  0.9486507422005938,
  0.9317780216616615,
  0.9074074084178875,
  0.9314354208890181,
  0.9122612052019553,
  0.9319993196483413,
  0.9358794061223048,
  0.9324599467023853,
  0.9244391350588043,
  0.9314370600486994,
  0.9304141729217235,
  0.9313813254221884,
  0.9364154384776286],
 'loss': [0.34709987770233347,
  0.3165626227382344,
  0.3121344553044487,
  0.30639764288683735,
  0.3054343348763331,
  0.3056224581690873,
  0.3045794017037246,
  0.303