# Test the MLP modules under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model

In [1]:
import numpy as np 
import pandas as pd
import keras
from keras import Model
from keras.regularizers import l2
from keras.optimizers import (
    Adam,
    Adamax,
    Adagrad,
    SGD,
    RMSprop
)
from keras.layers import (
    Embedding, 
    Input,
    Flatten, 
    Multiply, 
    Concatenate,
    Dense
)

import sys
sys.path.append('../')
from cf_ec2 import (
    GMF,
    MLP,
    NCF,
    Data,
    evaluation
)

Using TensorFlow backend.


## step 1: load the data

In [2]:
# Both rating files are header-less, tab-separated tables with the same four
# columns, so the reader options are declared once and reused for each split.
rating_csv_options = dict(
    sep='\t',
    header=None,
    names=['user', 'item', 'rating', 'event_ts'],
)
train = pd.read_csv('../data/ml-1m.train.rating', **rating_csv_options)
test = pd.read_csv('../data/ml-1m.test.rating', **rating_csv_options)

In [3]:
# Preview the first three rows of the training ratings.
train.head(3)

Unnamed: 0,user,item,rating,event_ts
0,0,32,4,978824330
1,0,34,4,978824330
2,0,4,5,978824291


In [4]:
# Preview the first three rows of the test ratings.
test.head(3)

Unnamed: 0,user,item,rating,event_ts
0,0,25,5,978824351
1,1,133,3,978300174
2,2,207,4,978298504


In [5]:
# Distinct test users next to the test frame's shape; the recorded output
# (6040 users, 6040 rows) shows exactly one test row per user.
test.user.nunique(), test.shape

(6040, (6040, 4))

## step 2: prepare the data for MLP model training

In [6]:
# Wrap the train/test frames in the project's Data helper, telling it which
# columns hold the user, item, rating and timestamp values.
# NOTE(review): binary=True presumably turns ratings into implicit 0/1 labels,
# and n_neg / n_neg_test look like the number of sampled negatives per positive
# for the train / test split — confirm against cf_ec2.Data.
dataset = Data(
    train=train,
    test=test,
    col_user='user',
    col_item='item',
    col_rating='rating',
    col_time='event_ts',
    binary=True,
    n_neg=4,
    n_neg_test=100
)
# Build the training inputs with negative sampling switched on.
dataset.prepTrainDNN(negSample=True)
# Build the test inputs. Later cells read dataset.users_test / items_test /
# ratings_test and dataset.testGrouped, which are presumably populated here
# (group=True likely requests the per-user grouping) — TODO confirm.
dataset.prepTestDNN(group=True)

Method to save python object to disk for later use

```python
import bz2
import pickle

## pickle data
with open('../metadata/datasetMlp','wb') as fp:
    pickle.dump(dataset, fp)
## pickle data with compression (written next to the uncompressed copy, at the
## same path the read-back below uses)
with bz2.BZ2File('../metadata/datasetMlpSmaller', 'w') as fp:
    pickle.dump(dataset, fp)

## unpickle data
## NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('../metadata/datasetMlp','rb') as fp:
    dataset2 = pickle.load(fp)
with bz2.BZ2File('../metadata/datasetMlpSmaller', 'r') as fp:
    dataset2 = pickle.load(fp)
```

In [7]:
# Length of the prepared training user list next to the train frame's shape.
# The recorded output (4,970,845 vs 994,169 rows) is exactly 5x, which is
# consistent with n_neg=4 negatives added per positive — TODO confirm in Data.
len(dataset.users),train.shape

(4970845, (994169, 6))

In [8]:
# Length of the prepared test user list next to the test frame's shape.
# The recorded output (610,040 vs 6,040 rows) is exactly 101x, which is
# consistent with n_neg_test=100 negatives per held-out positive — TODO confirm.
len(dataset.users_test),test.shape

(610040, (6040, 6))

In [9]:
# Distinct users in each split.
train.user.nunique(), test.user.nunique()

(6040, 6040)

In [10]:
# Distinct items in each split.
train.item.nunique(), test.item.nunique()

(3704, 1921)

In [11]:
# Preview the per-user interaction table held by the Data helper; per the
# output it has one row per user with an item_interacted set and an
# item_negative set.
dataset.interaction_train.head(3)

Unnamed: 0,user,item_interacted,item_negative
0,0,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6..."
1,1,"{15, 22, 31, 34, 35, 42, 43, 52, 53, 54, 55, 5...","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"{2, 135, 136, 14, 18, 147, 159, 163, 36, 40, 1...","{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15..."


#### prepare the test dataset

In [12]:
# Items that appear in the prepared test lists but never in the prepared
# training lists. Their rows are removed from the three parallel test lists.
newItems = set(dataset.items_test) - set(dataset.items)

# Positions of the test rows that reference one of those unseen items.
idx2del = [
    idx for idx, item in enumerate(dataset.items_test) if item in newItems
]
# Set copy for O(1) membership tests; scanning the list for every test row
# made the filter cost grow with len(test) * len(idx2del).
idx2del_lookup = set(idx2del)

length_test_original = len(dataset.users_test)
# The surviving positions are computed once and shared by all three lists,
# which keeps users / items / ratings aligned by construction.
idx2keep = [
    idx for idx in range(length_test_original) if idx not in idx2del_lookup
]
dataset.users_test = [dataset.users_test[idx] for idx in idx2keep]
dataset.items_test = [dataset.items_test[idx] for idx in idx2keep]
dataset.ratings_test = [dataset.ratings_test[idx] for idx in idx2keep]

## step 3: create the model architecture

In [13]:
import os

#### hyper-parameters
# Embedding table sizes; they equal the distinct user / item counts reported
# for the training frame earlier in this notebook (6040 users, 3704 items).
n_users = 6040
n_items = 3704
# Widths of the MLP tower and a matching per-layer L2 factor (0. = disabled).
layers_mlp = [64,32,16,8]
reg_layers_mlp = [0.,0.,0.,0.]
learning_rate = 0.01
# The names below are not read anywhere in this cell; they are kept so that
# any other cell relying on them keeps working.
n_factors_gmf = 32
reg_gmf = 0.
flg_pretrain = ''
filepath_gmf_pretrain = ''
filepath_mlp_pretrain = ''
num_epochs = 20
batch_size = 100

#### build the model
mlp = MLP(
    n_users=n_users,
    n_items=n_items,
    layers_mlp=layers_mlp,
    reg_layers_mlp=reg_layers_mlp
)
model = mlp.create_model()

#### compile the model
# NOTE(review): newer Keras releases name this argument `learning_rate`;
# `lr` is kept because it is what this notebook was run with.
model.compile(
    optimizer=Adam(lr=learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

#### create the callback metrics
# {epoch} and {val_loss} are filled in by ModelCheckpoint for each saved file.
filepath="../metadata/mlp/mlp-weights-improvement-{epoch:02d}-{val_loss:.4f}.hdf5"
# The checkpoint files and the CSV log both live in this directory; create it
# up front so the callbacks do not fail on a fresh checkout.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# Save a checkpoint only when the monitored quantity improves.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    verbose=1,
    save_best_only=True
)
# Per-epoch metrics log; append=False starts a new file on every run.
csvlog = keras.callbacks.CSVLogger(
    '../metadata/mlp/mlp_log.csv',
    separator=',',
    append=False
)
# Stop after 12 epochs without improvement of the monitored quantity.
earlystop = keras.callbacks.EarlyStopping(patience=12)
# Multiply the learning rate by 0.3 after 4 epochs without val_loss improvement.
lrreduce = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.3,
    patience=4,
    verbose=1
)

In [14]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_mlp_User (Embedding)  (None, 1, 32)        193280      user_input[0][0]                 
__________________________________________________________________________________________________
embedding_mlp_Item (Embedding)  (None, 1, 32)        118528      item_input[0][0]                 
____________________________________________________________________________________________

## step 4: train the model

#### check if it's feasible to check recall and precision in each epoch

In [15]:
# Model inputs: parallel arrays of user ids and item ids from the test lists.
X_val = [
    np.array(dataset.users_test),
    np.array(dataset.items_test),
]
# Labels aligned row-by-row with the two input arrays.
y_val = np.array(dataset.ratings_test)
# (inputs, targets) pair, bundled under the name used for validation data.
validation_data = (X_val, y_val)

# Score every test pair. No fit call appears before this point in the
# notebook, so these are presumably scores from freshly initialised weights.
y_predict = model.predict(x=X_val)

In [16]:
# One score per test pair, returned as a column vector.
y_predict.shape

(610038, 1)

In [17]:
# The user and item input arrays must have the same length.
len(X_val[0]), len(X_val[1])

(610038, 610038)

In [18]:
# First five raw scores (still 2-D, one column).
y_predict[:5]

array([[0.50221723],
       [0.50498635],
       [0.4957168 ],
       [0.5048345 ],
       [0.4985314 ]], dtype=float32)

In [19]:
# Same five scores flattened to 1-D, the form used when collecting predictions later.
y_predict[:5].flatten()

array([0.50221723, 0.50498635, 0.4957168 , 0.5048345 , 0.4985314 ],
      dtype=float32)

#### get topK recommendation with multi-processing (multi-processing actually slows down the process)

In [20]:
from cf_ec2 import evaluation2

In [21]:
# Re-import evaluation2 so that edits to ../cf_ec2/evaluation2.py take effect
# without restarting the kernel. importlib.reload replaces imp.reload: the imp
# module is deprecated and was removed in Python 3.12.
import importlib
importlib.reload(evaluation2)

<module 'cf_ec2.evaluation2' from '../cf_ec2/evaluation2.py'>

In [22]:
import time
# Wall-clock timer; monotonic time is not affected by system clock adjustments.
t1 = time.monotonic()

# Single-user variant, left here for reference only.
# recOut = evaluator.getRecSingleUser(user=0)
# Ranked recommendations computed from `dataset` and `model` in one worker.
# NOTE(review): the positional 10 is presumably the list length K — confirm
# against evaluation2.getRecMultiUser.
recOut = evaluation2.getRecMultiUser(dataset, model, 10, num_thread=1)
print('Job finished in {} seconds'.format(time.monotonic()-t1))
#### single-thread takes 63 seconds

Job finished in 58.99633682699999 seconds


In [23]:
# First five recommendation rows. Judging by the output, each row is
# [user, item, score, rank] — TODO confirm against evaluation2.
recOut[:5]

[[0, 2624, 0.51136, 1],
 [0, 2310, 0.50886977, 2],
 [0, 629, 0.5077475, 3],
 [0, 3450, 0.50764644, 4],
 [0, 1497, 0.5075704, 5]]

#### check model quality/accuracy with single-thread process

In [24]:
#### iterate through all user-item combinations
import time
t1 = time.monotonic()
from tqdm.notebook import tqdm

# Flat, parallel columns of the final prediction table.
users, items, preds = [], [], []
# Candidate set: every distinct item in the prepared training lists.
# NOTE(review): this includes items a user already interacted with in
# training; whether they should be excluded depends on how the
# evaluation.*_at_k helpers treat them — confirm.
item = list(set(dataset.items))
# Loop-invariant work hoisted out of the per-user loop: the item-id array and
# the candidate count are the same for every user.
item_array = np.array(item)
n_candidates = len(item)
for user in tqdm(set(dataset.users)):
    # Repeat the user id so it lines up element-wise with the candidates.
    # A separate name is used so the loop variable is not rebound to a list.
    user_repeated = [user] * n_candidates
    users.extend(user_repeated)
    items.extend(item)
    preds.extend(model.predict(x = [np.array(user_repeated), item_array]).flatten())

all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})
print('Job finished in {} seconds'.format(time.monotonic()-t1))

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


Job finished in 190.29267233699997 seconds


In [25]:
# The evaluation calls further down pass col_pred='rating', so the score
# column is renamed to match. Re-running this cell is harmless: once the
# 'prediction' column is gone the rename has nothing left to change.
all_predictions = all_predictions.rename(columns={'prediction':'rating'})
all_predictions.head(10)

Unnamed: 0,userID,itemID,rating
0,0,0,0.498592
1,0,1,0.503127
2,0,2,0.504123
3,0,3,0.503084
4,0,4,0.497478
5,0,5,0.496434
6,0,6,0.503733
7,0,7,0.505111
8,0,8,0.500601
9,0,9,0.504715


In [26]:
#### format the test dataset

# Flat test triples under the userID / itemID / rating column names that the
# evaluation calls expect.
test_triples = pd.DataFrame(data={
    "userID": dataset.users_test,
    "itemID": dataset.items_test,
    "rating": dataset.ratings_test,
})
# Ground truth for the ranking metrics: only rows with a positive label.
is_positive = test_triples["rating"] > 0
all_test = test_triples.loc[is_positive].copy().reset_index(drop=True)
all_test.head(3)

Unnamed: 0,userID,itemID,rating
0,0,398,1.0
1,1,1215,1.0
2,2,418,1.0


In [27]:
t1 = time.monotonic()
#### check recall and precision @ top10
# Both metrics receive the same frames and column mapping, so the arguments
# are written once and unpacked into each call.
top10_args = dict(
    rating_true=all_test,
    rating_pred=all_predictions,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    col_pred='rating',
    k=10,
)
recall_10 = evaluation.recall_at_k(**top10_args)
precision_10 = evaluation.precision_at_k(**top10_args)
print('Job finished in {} seconds'.format(time.monotonic()-t1))

Job finished in 26.27661927700001 seconds


In [28]:
# Recall@10 and precision@10 for the current model.
# NOTE(review): no fit call appears before this point in the notebook, so
# near-zero values are presumably those of an untrained model.
recall_10, precision_10

(0.0028155018217952965, 0.0002815501821795297)

In [29]:
t1 = time.monotonic()
#### check NDCG and MAP @ top10
# Both metrics receive the same frames and column mapping, so the arguments
# are written once and unpacked into each call.
ranking_args = dict(
    rating_true=all_test,
    rating_pred=all_predictions,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    col_pred='rating',
    k=10,
)
ndcg_10 = evaluation.ndcg_at_k(**ranking_args)
map_10 = evaluation.map_at_k(**ranking_args)
print('Job finished in {} seconds'.format(time.monotonic()-t1))

Job finished in 25.80756768499998 seconds


In [30]:
# NDCG@10 and MAP@10 for the current model.
ndcg_10, map_10

(0.0011764915586316438, 0.0006950030757582927)

#### check model quality/accuracy with multi-process

In [31]:
import itertools
# One (key, K) tuple per key of the grouped test set, with K fixed at 10.
# Materialised as a list: itertools.product returns a one-shot iterator, and
# the preview in the next cell (list(testParam)[:5]) would otherwise exhaust
# it and leave nothing for later consumers.
testParam = list(itertools.product(
    dataset.testGrouped.keys(),
    [10]
))

In [32]:
# Peek at the first five parameter tuples.
# NOTE(review): if testParam is a one-shot iterator (e.g. a bare
# itertools.product), list() exhausts it and later consumers see no items.
list(testParam)[:5]

[(0, 10), (1, 10), (2, 10), (3, 10), (4, 10)]

In [33]:
# Re-import evaluation2 so that edits to ../cf_ec2/evaluation2.py take effect
# without restarting the kernel. importlib.reload replaces imp.reload: the imp
# module is deprecated and was removed in Python 3.12.
import importlib
importlib.reload(evaluation2)

<module 'cf_ec2.evaluation2' from '../cf_ec2/evaluation2.py'>

In [37]:
# Restart the wall-clock timer for this run.
t1 = time.monotonic()

# Single-user variant, left here for reference only.
# recOut = evaluator.getRecSingleUser(user=0)
# Same call as the earlier timing cell, repeated after reloading evaluation2.
# NOTE(review): the positional 10 is presumably the list length K — confirm.
recOut = evaluation2.getRecMultiUser(dataset, model, 10, num_thread=1)
print('Job finished in {} seconds'.format(time.monotonic()-t1))
#### single-thread takes 63 seconds

Job finished in 60.557847687999924 seconds


In [38]:
# Time the per-user metric computation for user 0 at K=10.
t1 = time.monotonic()

# NOTE(review): no dataset, model or recommendation list is passed in, so the
# function presumably reads module-level state inside evaluation2 — confirm.
recalls,precisions = evaluation2.evaluateSingleUser(user=0,K=10)
print('Job finished in {} seconds'.format(time.monotonic()-t1))

Job finished in 0.008146888999931434 seconds


In [39]:
# Time the all-users metric computation with a single worker process.
t1 = time.monotonic()

# NOTE(review): only num_process is passed, so the inputs presumably come from
# module-level state inside evaluation2. The recorded run finishes in ~0.05 s
# and prints a bare "10", which suggests it may not have processed all users —
# verify before trusting the returned values.
recalls,precisions = evaluation2.evaluateMultiUser(num_process=1)
print('Job finished in {} seconds'.format(time.monotonic()-t1))

10
Job finished in 0.051607107000108954 seconds


#### none of the multi-processing modules work here !!!