# Test the NCF modules in the [cf_ec2](../cf_ec2) folder with the ml-1m dataset

In [1]:
import numpy as np 
import pandas as pd
import keras
from keras import Model
from keras.regularizers import l2
from keras.optimizers import (
    Adam,
    Adamax,
    Adagrad,
    SGD,
    RMSprop
)
from keras.layers import (
    Embedding, 
    Input,
    Flatten, 
    Multiply, 
    Concatenate,
    Dense
)

import sys
sys.path.append('../')
from cf_ec2 import (
    GMF,
    MLP,
    NCF,
    Data
)

Using TensorFlow backend.


## step 1: load the data

In [2]:
# Both rating files are headerless and tab-separated with the same four columns,
# so the parsing options are declared once and reused for each split.
rating_columns = ['user', 'item', 'rating', 'event_ts']
read_opts = dict(sep='\t', header=None, names=rating_columns)
train = pd.read_csv('../data/ml-1m.train.rating', **read_opts)
test = pd.read_csv('../data/ml-1m.test.rating', **read_opts)

In [3]:
# Preview the first three rows of the training ratings.
train.head(3)

Unnamed: 0,user,item,rating,event_ts
0,0,32,4,978824330
1,0,34,4,978824330
2,0,4,5,978824291


In [4]:
# Preview the first three rows of the test ratings.
test.head(3)

Unnamed: 0,user,item,rating,event_ts
0,0,25,5,978824351
1,1,133,3,978300174
2,2,207,4,978298504


In [5]:
# Distinct test users next to the test frame's shape; the recorded output shows
# as many rows as users, i.e. one test row per user.
test.user.nunique(), test.shape

(6040, (6040, 4))

## step 2: prepare the data for ncf model training

In [6]:
# Column mapping and sampling settings handed to the project's Data helper.
data_options = dict(
    train=train,
    test=test,
    col_user='user',
    col_item='item',
    col_rating='rating',
    col_time='event_ts',
    binary=True,
    n_neg=4,
    n_neg_test=100,
)
dataset = Data(**data_options)

# Same call order as before: train prep, test prep, then negative sampling.
# NOTE(review): no random seed is set in this notebook, so the sampled
# negatives presumably differ between runs -- confirm inside cf_ec2.Data.
dataset.prepTrainDNN()
dataset.prepTestDNN()
dataset.negativeSampling()

In [8]:
# Length of the sampled training user list next to the train frame's shape.
# Recorded output: 4970845 entries for 994169 rows (5 per row), consistent with
# one positive plus n_neg=4 negatives each. The frame also reports 6 columns
# although 4 were loaded -- presumably added by the Data helper; confirm.
len(dataset.users),train.shape

(4970845, (994169, 6))

In [9]:
# Length of the sampled test user list next to the test frame's shape.
# Recorded output: 610040 entries for 6040 rows (101 per row), consistent with
# one positive plus n_neg_test=100 negatives each.
len(dataset.users_test),test.shape

(610040, (6040, 6))

In [10]:
# Number of distinct users in the train and test frames.
train.user.nunique(), test.user.nunique()

(6040, 6040)

In [11]:
# Number of distinct items in the train and test frames (raw ratings only,
# before any sampled negatives).
train.item.nunique(), test.item.nunique()

(3704, 1921)

In [12]:
# First three rows of the per-user interaction table built by the Data helper
# (columns shown in the output: user, item_interacted, item_negative).
dataset.interaction_train.head(3)

Unnamed: 0,user,item_interacted,item_negative
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6..."
1,1,"{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...","{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15..."


In [14]:
# Distinct users and items present in the sampled training lists.
len(set(dataset.users)), len(set(dataset.items))

(6040, 3704)

In [15]:
# Distinct users and items present in the sampled test lists; the recorded
# output shows two more items here than in the training lists.
len(set(dataset.users_test)), len(set(dataset.items_test))

(6040, 3706)

## step 3: create the model architecture

In [16]:
# --- configuration -----------------------------------------------------------
# User / item counts passed to NCF. They equal the distinct users and items of
# the sampled training lists reported earlier in this notebook (6040, 3704).
n_users = 6040
n_items = 3704
n_factors_gmf = 32                  # passed to NCF as n_factors_gmf
layers_mlp = [64,32,16,8]           # passed to NCF as layers_mlp
reg_gmf = 0.                        # passed to NCF as reg_gmf
reg_layers_mlp = [0.,0.,0.,0.]      # one entry per element of layers_mlp
learning_rate = 0.001
# The pretrain flag and file paths below are left empty and are not referenced
# by any cell visible in this part of the notebook.
flg_pretrain = ''
filepath = ''
filepath_gmf_pretrain = ''
filepath_mlp_pretrain = ''
# Was 2, which disagreed with the training cell (it runs 3 epochs and its
# recorded log shows "Epoch 3/3"); the config now states what is trained.
num_epochs = 3
batch_size = 100

# --- model -------------------------------------------------------------------
ncf = NCF(
    n_users=n_users,
    n_items=n_items,
    n_factors_gmf=n_factors_gmf,
    layers_mlp=layers_mlp,
    reg_gmf=reg_gmf,
    reg_layers_mlp=reg_layers_mlp
)
model = ncf.create_model()
#### compile the model
# Binary cross-entropy matches the 0/1 labels produced with binary=True.
model.compile(
    optimizer=Adam(lr=learning_rate),
    loss='binary_crossentropy'
)

## step 4: train the model

In [17]:
#### train
# The epoch count is read from the config cell's `num_epochs`, the same way
# `batch_size` already is, instead of a separate hard-coded literal.
# NOTE(review): the recorded log below this cell shows a 3-epoch run; keep
# `num_epochs` in sync with it if that exact run is to be reproduced.
hist = model.fit(
    x = [
        np.array(dataset.users),
        np.array(dataset.items)
    ],
    y = np.array(dataset.ratings),
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=2,
    shuffle=True
)

Epoch 1/3
 - 218s - loss: 0.3085
Epoch 2/3
 - 216s - loss: 0.2543
Epoch 3/3
 - 211s - loss: 0.2391


In [19]:
# First five training labels (recorded output: one 1 followed by four 0s).
dataset.ratings[:5]

array([1., 0., 0., 0., 0.])

In [20]:
# Per-epoch training loss recorded by fit().
hist.history

{'loss': [0.30847480618763495, 0.2542681908473881, 0.2391488334154755]}

In [21]:
# Exploratory: list the attributes of the object returned by fit().
# NOTE(review): introspection leftover; consider dropping it (and its long
# output) from the finished notebook.
dir(hist)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'epoch',
 'history',
 'model',
 'on_batch_begin',
 'on_batch_end',
 'on_epoch_begin',
 'on_epoch_end',
 'on_train_begin',
 'on_train_end',
 'params',
 'set_model',
 'set_params',
 'validation_data']

#### predict on the train dataset

In [23]:
# Score every (user, item) pair of the sampled training data with the fitted
# model; the two id arrays are fed as the model's two inputs, in that order.
train_user_ids = np.array(dataset.users)
train_item_ids = np.array(dataset.items)
predictions_train = model.predict(
    x=[train_user_ids, train_item_ids],
    verbose=1,
)



In [28]:
# First five predicted scores; compare with the first five labels shown next.
predictions_train[:5]

array([[0.96609616],
       [0.00641885],
       [0.26378247],
       [0.00272253],
       [0.00409278]], dtype=float32)

In [29]:
# First five training labels, for comparison with the predictions above.
dataset.ratings[:5]

array([1., 0., 0., 0., 0.])

In [25]:
# Number of predictions returned for the training inputs.
len(predictions_train)

4970845

In [26]:
# Exploratory: print the docstring of model.predict.
# NOTE(review): introspection leftover; consider dropping it from the finished
# notebook.
help(model.predict)

Help on method predict in module keras.engine.training:

predict(x, batch_size=None, verbose=0, steps=None) method of keras.engine.training.Model instance
    Generates output predictions for the input samples.
    
    Computation is done in batches.
    
    # Arguments
        x: The input data, as a Numpy array
            (or list of Numpy arrays if the model has multiple inputs).
        batch_size: Integer. If unspecified, it will default to 32.
        verbose: Verbosity mode, 0 or 1.
        steps: Total number of steps (batches of samples)
            before declaring the prediction round finished.
            Ignored with the default value of `None`.
    
    # Returns
        Numpy array(s) of predictions.
    
    # Raises
        ValueError: In case of mismatch between the provided
            input data and the model's expectations,
            or in case a stateful model receives a number of samples
            that is not a multiple of the batch size.



In [27]:
# Record the installed keras version for reproducibility.
keras.__version__

'2.2.4'

In [31]:
# Distinct training items, total training samples and number of predictions;
# the last two should be equal (one prediction per sample).
len(set(dataset.items)), len(dataset.items), len(predictions_train)

(3704, 4970845, 4970845)

#### create a dataframe with both true and predicted outputs

In [37]:
# Put the training triples and their predicted score side by side.
# predict() returned a two-dimensional array with one score per row (see the
# recorded output of predictions_train[:5]), so column 0 is sliced out directly
# instead of unpacking millions of rows one by one in a Python comprehension.
df_predictions_train = pd.DataFrame({
    'user':dataset.users,
    'item':dataset.items,
    'rating':dataset.ratings,
    'prediction':predictions_train[:, 0]
})

In [39]:
# First five rows of the training prediction table.
df_predictions_train.head(5)

Unnamed: 0,user,item,rating,prediction
0,0,0,1.0,0.966096
1,0,2371,0.0,0.006419
2,0,204,0.0,0.263782
3,0,3619,0.0,0.002723
4,0,3243,0.0,0.004093


#### remove the items from the test dataset that never appear in the training dataset

In [42]:
# Distinct items in the sampled training lists vs. the sampled test lists.
len(set(dataset.items)), len(set(dataset.items_test))

(3704, 3706)

In [43]:
# Items that occur in the test lists but not in the training lists.
set(dataset.items_test)-set(dataset.items)

{3704, 3705}

In [44]:
# Items that occur in the training lists but not in the test lists.
set(dataset.items)-set(dataset.items_test)

set()

In [45]:
# Items seen only in the test lists, and the positions at which they occur.
newItems = set(dataset.items_test) - set(dataset.items)
idx2del = [
    position
    for position, test_item in enumerate(dataset.items_test)
    if test_item in newItems
]

In [46]:
# How many test entries refer to an item absent from training.
len(idx2del)

2

In [47]:
# Positions (within the test lists) of those entries.
idx2del

[4545, 182911]

In [49]:
# Labels of the test entries about to be dropped. The positions are taken from
# idx2del instead of being retyped as literals, so this stays correct if the
# sampled data (and therefore the positions) change.
tuple(dataset.ratings_test[idx] for idx in idx2del)

(1.0, 1.0)

In [50]:
# Drop every test triple whose item does not occur in the training lists
# (such ids are presumably outside what the model built with n_items can
# embed -- confirm in cf_ec2.NCF).
#
# The filter is a membership test against the training items rather than a
# list of stale positions, so re-running this cell is a no-op instead of
# deleting two more (valid) rows. The keep-mask is computed once and applied
# to all three parallel lists so they cannot drift apart.
length_test_original = len(dataset.users_test)
train_items = set(dataset.items)
keep_mask = [item in train_items for item in dataset.items_test]

dataset.users_test = [
    user for user, keep in zip(dataset.users_test, keep_mask) if keep
]
dataset.ratings_test = [
    rating for rating, keep in zip(dataset.ratings_test, keep_mask) if keep
]
# items_test is filtered last because the mask was derived from it.
dataset.items_test = [
    item for item, keep in zip(dataset.items_test, keep_mask) if keep
]

In [51]:
# Sanity check after filtering: no test item should be missing from training
# (expected: empty set).
set(dataset.items_test)-set(dataset.items)

set()

In [52]:
# Items present in training but absent from the filtered test lists.
set(dataset.items)-set(dataset.items_test)

set()

In [53]:
# The three parallel test lists must still have equal length after filtering.
len(dataset.users_test),len(dataset.items_test),len(dataset.ratings_test)

(610038, 610038, 610038)

#### predict on the test dataset

In [54]:
# Score every (user, item) pair of the filtered test data with the fitted
# model; the two id arrays are fed as the model's two inputs, in that order.
test_user_ids = np.array(dataset.users_test)
test_item_ids = np.array(dataset.items_test)
predictions_test = model.predict(
    x=[test_user_ids, test_item_ids],
    verbose=1,
)



In [55]:
# Put the test triples and their predicted score side by side.
# predict() is called here exactly as for the training data, where it returned
# one score per row in a two-dimensional array, so column 0 is sliced out
# directly instead of unpacking every row in a Python comprehension.
df_predictions_test = pd.DataFrame({
    'user':dataset.users_test,
    'item':dataset.items_test,
    'rating':dataset.ratings_test,
    'prediction':predictions_test[:, 0]
})
df_predictions_test.head(5)

Unnamed: 0,user,item,rating,prediction
0,0,398,1.0,0.937563
1,0,1557,0.0,0.025693
2,0,277,0.0,0.084378
3,0,853,0.0,0.010413
4,0,624,0.0,0.046614
