# Loading Libraries

In [31]:
# Pandas & Numpy
import pandas as pd
import numpy as np 

# Visualization
import seaborn as sns 
from matplotlib import pyplot as plt 

# Tensorflow
import tensorflow as tf
from tensorflow import keras
tf.get_logger().setLevel('ERROR') # only show error messages 

# Evaluation 
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
    ndcg_at_k,
    map_at_k, 
    get_top_k_items,
    rmse,
    mae,
    rsquared,
    exp_var
)
from sklearn.metrics import accuracy_score 

# Recommender Utilities
from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.utils.constants import SEED as DEFAULT_SEED

# System & OS
import sys
import os
import shutil
#import papermill as pm
import pandas as pd

# Turn off warnings for readability
import warnings
warnings.filterwarnings("ignore")

# Import & Read Datasets

In [32]:
# Load the train and test interaction tables; the first CSV column is used as the index
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../../00_Data/online_retail_train.csv",
        "../../00_Data/online_retail_test.csv",
    )
)

In [33]:
# Report (rows, columns) of both splits
train_shape, test_shape = train.shape, test.shape
print("Shape of Train:", train_shape)
print("Shape of Test: \t", test_shape)

Shape of Train: (206861, 3)
Shape of Test: 	 (51879, 3)


In [34]:
# Count the distinct users and items present in each split
for label, id_column in (
    ("Unique Users in Train:", train.CustomerID),
    ("Unique Users in Test:", test.CustomerID),
    ("Unique Items in Train:", train.StockCode),
    ("Unique Items in Test:", test.StockCode),
):
    print(label, id_column.nunique())

Unique Users in Train: 3690
Unique Users in Test: 3690
Unique Items in Train: 2746
Unique Items in Test: 2746


# Data Preparation

In [35]:
# Rename the retail columns to the userID / itemID / rating names used by the cells below
column_map = {'StockCode': 'itemID', 'CustomerID': 'userID', 'purchased': 'rating'}
train = train.rename(columns=column_map)
test = test.rename(columns=column_map)

In [36]:
# Persist both splits as CSV (without the index); the NCF dataset is built from these files
train_file = "./or_train.csv"
test_file = "./or_test.csv"
for frame, csv_path in ((train, train_file), (test, test_file)):
    frame.to_csv(csv_path, index=False)

In [37]:
# Build the NCF dataset from the two CSV files.
# overwrite_test_file_full=True regenerates the full leave-one-out test file
# (./or_test_full.csv in the log output), which takes a while.
data = NCFDataset(
    train_file=train_file,
    test_file=test_file,
    seed=1,
    overwrite_test_file_full=True,
)

INFO:recommenders.models.ncf.dataset:Indexing ./or_train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ./or_test.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file ./or_test_full.csv ...
100%|██████████| 3690/3690 [00:40<00:00, 90.94it/s] 
INFO:recommenders.models.ncf.dataset:Indexing ./or_test_full.csv ...


# Baseline Model

In [26]:
# Instantiate the baseline NeuMF model (no pretrained weights are loaded here).
# Hyperparameters are collected in one dict so the baseline setup is easy to read.
baseline_params = {
    "model_type": "NeuMF",
    "n_factors": 4,
    "layer_sizes": [16, 8, 4],
    "n_epochs": 50,
    "batch_size": 50,
    "learning_rate": 1e-3,
    "verbose": 10,  # the training log shows one line every 10 epochs
    "seed": 1,
}
model = NCF(n_users=data.n_users, n_items=data.n_items, **baseline_params)

2023-06-19 13:45:25.024116: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-19 13:45:25.035937: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:196] None of the MLIR optimization passes are enabled (registered 0 passes)


In [27]:
# Fit the model and time the whole training run
with Timer() as train_time:
    model.fit(data)

elapsed_train = train_time.interval
print("Took {} seconds for training.".format(elapsed_train))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [20.38s]: train_loss = 0.370194 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [21.03s]: train_loss = 0.364789 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [20.67s]: train_loss = 0.361750 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [21.02s]: train_loss = 0.359382 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [21.20s]: train_loss = 0.358611 


Took 1061.36715395801 seconds for training.


In [28]:
# Predict all user and item pairings in Test.
# The pairs are scored with ONE batched model.predict(..., is_list=True) call
# (the same form used for the all-pairs prediction further down) instead of
# one predict call per row through DataFrame.iterrows(), which is far slower.
predictions = test[['userID', 'itemID']].reset_index(drop=True)
predictions['prediction'] = model.predict(
    predictions['userID'].tolist(),
    predictions['itemID'].tolist(),
    is_list=True,
)

# Rows are in the same order as `test`; columns are userID, itemID, prediction
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,1,1249,0.367157
1,1,396,0.077538
2,1,2601,0.327272
3,1,912,0.46148
4,1,1032,0.273281


In [45]:
# Binarize the predicted scores: 1 if the score is strictly above 0.5, else 0.
# Vectorized comparison replaces the row-wise .apply(lambda ...) and yields the same labels.
# NOTE: this overwrites the raw scores in `predictions['prediction']`.
predictions['prediction'] = predictions['prediction'].gt(0.5).astype('int64')

In [47]:
# Share of test rows whose binarized prediction equals the true rating.
# Rows are compared positionally, so `predictions` must be in the same row order as `test`.
accuracy_score(y_true=test["rating"], y_pred=predictions["prediction"])

0.31667919582104515

In [29]:
# Score every (user, item) combination from train, then keep only the pairs
# that do NOT occur in train (their `rating` is null after the outer merge).
with Timer() as test_time:

    item = list(train.itemID.unique())
    n_items = len(item)
    users, items, preds = [], [], []
    for uid in train.userID.unique():
        # Repeat the user id once per item so both lists line up pairwise
        user = [uid] * n_items
        users += user
        items += item
        preds += list(model.predict(user, item, is_list=True))

    all_predictions = pd.DataFrame({"userID": users, "itemID": items, "prediction": preds})

    merged = train.merge(all_predictions, on=["userID", "itemID"], how="outer")
    unseen_mask = merged["rating"].isnull()
    all_predictions = merged.loc[unseen_mask].drop(columns="rating")

print("Took {} seconds for prediction.".format(test_time.interval))

Took 47.797705667006085 seconds for prediction.


In [30]:
# Evaluate Precision & Recall at 10 on the test set against the all-pairs predictions
top_k_args = dict(col_prediction='prediction', k=10)
eval_precision = precision_at_k(test, all_predictions, **top_k_args)
eval_recall = recall_at_k(test, all_predictions, **top_k_args)

print("Precision@K:\t%f" % eval_precision)
print("Recall@K:\t%f" % eval_recall)

Precision@K:	0.084282
Recall@K:	0.073742


# Hyperparameter Tuning 

In [40]:
# Candidate values for the hyperparameter search
factors = [4, 16, 50, 100]
layers = [
    [16, 8, 4],
    [32, 16, 8],
    [200, 100, 50],
    [400, 200, 100],
]
lr_rates = [1e-3, 5e-3, 7e-3]

# Empty accumulators, one per recorded metric / hyperparameter
accuracy, recall_10, recall_20 = [], [], []
precision_10, precision_20 = [], []
num_factors, layer_shapes, learning_rates = [], [], []




[1, 2, 4]
[4, 6, 8]
[7, 10, 15]
