# Loading Libraries

In [1]:
import pandas as pd
import numpy as np

import logging
import tensorflow as tf


#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
)


# Reading Data

In [2]:
# Load the train and test splits; the first CSV column is used as the index
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../../00_Data/online_retail_train.csv",
        "../../00_Data/online_retail_test.csv",
    )
)

In [3]:
# Report (rows, columns) of both splits
for label, frame in (("Shape of Train:", train), ("Shape of Test: \t", test)):
    print(label, frame.shape)

Shape of Train: (206861, 3)
Shape of Test: 	 (51879, 3)


In [4]:
# Count distinct users (CustomerID) and items (StockCode) in each split
for label, series in (
    ("Unique Users in Train:", train.CustomerID),
    ("Unique Users in Test:", test.CustomerID),
    ("Unique Items in Train:", train.StockCode),
    ("Unique Items in Test:", test.StockCode),
):
    print(label, series.nunique())

Unique Users in Train: 3690
Unique Users in Test: 3690
Unique Items in Train: 2746
Unique Items in Test: 2746


# Preparation of Datasets for Model

In [5]:
# Import Affinity Matrix
from recommenders.datasets.sparse import AffinityMatrix


# Standard column names, defined once and shared by both AffinityMatrix objects
# so the train and test configurations cannot drift apart.
header = {
    "col_user": "CustomerID",
    "col_item": "StockCode",
    "col_rating": "purchased",
}

# Instantiate the sparse matrix generators for the train and test splits
am_train = AffinityMatrix(df=train, **header)
am_test = AffinityMatrix(df=test, **header)

# Obtain the user x item matrices; the second and third return values are discarded.
# NOTE(review): am_train and am_test are built independently from different frames.
# Confirm that both assign the same row/column index to every CustomerID/StockCode;
# otherwise the columns of Xtr and Xtst would not refer to the same items.
Xtr, _, _ = am_train.gen_affinity_matrix()
Xtst, _, _ = am_test.gen_affinity_matrix()

In [6]:
# Sanity check: both matrices should report (n_users, n_items)
print(Xtr.shape, Xtst.shape, sep="\n")

(3690, 2746)
(3690, 2746)


In [7]:
# Sum of all entries in each matrix (train first, then test)
print(np.sum(Xtr), np.sum(Xtst), sep="\n")

206861
51879


# Baseline Model

## Prediction: Test Only

### Data Preparation for Test Only

In [8]:
# Sanity check: distinct values present in Xtr and Xtst before the 0/5 recoding
for matrix in (Xtr, Xtst):
    print(np.unique(matrix))

[0 1]
[0 1]


In [9]:
# Recode every positive entry to 5 (zeros stay untouched) for the test-only run;
# 5 is the upper value of the possible_ratings passed to the RBM below.
Xtr, Xtst = (np.where(matrix > 0, 5, matrix) for matrix in (Xtr, Xtst))

In [10]:
# Sanity check: after recoding, only 0 and 5 should remain in either matrix
print(np.unique(Xtr), np.unique(Xtst), sep="\n")

[0 5]
[0 5]


### Training for Test Only

In [11]:
# Drop a model left over from an earlier run of this cell, if any.
# Only NameError (no `model` defined yet) is expected; anything else should surface
# instead of being silently swallowed.
try:
    del model
except NameError:
    pass

# First we initialize the model class
model = RBM(
    possible_ratings=[1., 5.],
    visible_units=Xtr.shape[1],  # one visible unit per column of the training matrix
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42,
)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


2023-05-22 12:10:30.440231: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-22 12:10:30.485787: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:196] None of the MLIR optimization passes are enabled (registered 0 passes)


In [12]:
# Model Fit
# The Timer context wraps only the fit call, so the reported interval
# does not include model construction (done in the previous cell).
with Timer() as train_time:
    model.fit(Xtr)

# The interval is read after the with-block has exited.
print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 37.37 seconds for training.


### Test Set Prediction

In [13]:
# Make Predictions for entire Test Matrix
# Author's note: always use Xtst (not Xtr) as the predict input - better results.
# NOTE(review): Xtst is built from the same `test` frame that the predictions are
# later scored against, so the model is shown the held-out purchases as input.
# Confirm this is intended and not label leakage.
pred_test = model.predict(Xtst)

In [14]:
# Map Back Pred Test
# Converts the prediction matrix back through the test AffinityMatrix; later cells
# read the result's `prediction`, `StockCode` and `CustomerID` columns.
# NOTE: `pred_test` is rebound from the raw predict() output to the mapped-back
# result, so re-running this cell on its own would pass the mapped-back result
# into map_back_sparse. Re-run the predict cell first.
pred_test = am_test.map_back_sparse(pred_test, kind = 'prediction')

In [15]:
# Value counts in Pred Test: Entire Dataset
# Distribution of predicted values over every row of pred_test,
# not only the (user, item) pairs present in `test`.
pred_test.prediction.value_counts()

5.0    8070554
1.0    2062186
Name: prediction, dtype: int64

In [16]:
# Attach the model's prediction to every test row (left join keeps all test rows),
# then overwrite `purchased` with 5 so it uses the same scale as `prediction`.
test_prediction_am = (
    test
    .merge(pred_test, on=['StockCode', 'CustomerID'], how='left')
    .assign(purchased=5)
)

In [17]:
# Check first five rows of the merged test/prediction frame
test_prediction_am.head()

Unnamed: 0,StockCode,CustomerID,purchased,prediction
0,1249,1,5,5.0
1,396,1,5,5.0
2,2601,1,5,5.0
3,912,1,5,5.0
4,1032,1,5,5.0


In [18]:
# Check Distribution of Prediction Values in Test Set
# Restricted to the rows of `test` (the left side of the merge above).
test_prediction_am.prediction.value_counts()

5.0    51637
1.0      242
Name: prediction, dtype: int64

In [19]:
# Import Accuracy
from sklearn.metrics import accuracy_score

# Every row of test_prediction_am has purchased == 5 (set when the frame was built),
# so this score equals the share of test pairs whose prediction is 5.
accuracy_score(
    y_true=test_prediction_am["purchased"],
    y_pred=test_prediction_am["prediction"],
)

0.9953352994467897

## Prediction: Top N

### Data Preparation for TOP N

In [20]:
# Sanity check: values still held by Xtr and Xtst before they are regenerated
for matrix in (Xtr, Xtst):
    print(np.unique(matrix))

[0 5]
[0 5]


In [21]:
# Obtain the sparse matrix again
# Regenerates Xtr/Xtst from the existing AffinityMatrix objects, replacing the
# 0/5-valued arrays created earlier; the second and third return values are discarded.
Xtr, _, _ = am_train.gen_affinity_matrix()
Xtst, _, _ = am_test.gen_affinity_matrix()

In [22]:
# Recode every positive entry to 5 (zeros stay untouched) for Top N training
Xtr, Xtst = (np.where(matrix > 0, 5, matrix) for matrix in (Xtr, Xtst))

In [23]:
# Sanity check: after reloading and recoding, only 0 and 5 should remain
print(np.unique(Xtr), np.unique(Xtst), sep="\n")

[0 5]
[0 5]


### Training for TOP N

In [24]:
# Delete First Model
# Only NameError (no `model` defined in this kernel) is expected; any other
# error should surface instead of being silently swallowed.
try:
    del model
except NameError:
    pass

# First we initialize the model class
model = RBM(
    possible_ratings=[1., 5.],  # Author's note: always provide this range - way better results!
    visible_units=Xtr.shape[1],  # one visible unit per column of the training matrix
    hidden_units=600,
    training_epoch=30,
    minibatch_size=100,
    keep_prob=0.7,
    with_metrics=True,
    seed=42,
)

# Model Fit: the Timer context wraps only the fit call
with Timer() as train_time:
    model.fit(Xtr)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 37.19 seconds for training.


In [25]:
# Model prediction on the test set Xtst.
# Requests the top 20 items (top_k=20) with remove_seen=True; only this call is timed.
# NOTE(review): which interactions count as "seen" (the Xtst input or the training
# data) is decided inside recommend_k_items - confirm against the recommenders docs.
with Timer() as prediction_time:
    top_k_20 =  model.recommend_k_items(Xtst, top_k=20, remove_seen=True) # author's note: Xtst gave the best results

print("Took {:.2f} seconds for prediction.".format(prediction_time.interval))

Took 1.03 seconds for prediction.


### Evaluation

In [26]:
# Map Back Xtst & Top_k
# Both matrices are converted through the test AffinityMatrix (am_test):
# the recommendations as kind='prediction', the test matrix as kind='ratings'.
df_top_k_20 = am_test.map_back_sparse(top_k_20, kind = 'prediction')
df_test = am_test.map_back_sparse(Xtst, kind = 'ratings')

In [29]:
# Recall@20 of the top-k recommendations (df_top_k_20) against the mapped-back
# test interactions (df_test), using the "top_k" relevancy method.
eval_recall = recall_at_k(df_test, df_top_k_20, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
eval_recall

0.42423868005769544

In [86]:
# Manual check: keep only the (user, item) pairs that occur both in the test
# interactions and in the top-k recommendations.
eval_manual = pd.merge(
    df_test,
    df_top_k_20,
    on=['CustomerID', 'StockCode'],
    how='inner',
)
eval_manual

Unnamed: 0,CustomerID,StockCode,purchased,prediction
0,2,1306,5,5.0
1,2,914,5,5.0
2,2,917,5,5.0
3,2,1473,5,5.0
4,3,1805,5,5.0
...,...,...,...,...
14449,4370,1600,5,5.0
14450,4370,162,5,5.0
14451,4370,2607,5,5.0
14452,4370,1208,5,5.0


In [91]:
# Share of recommended rows that also appear in df_test (eval_manual holds the
# pairs common to both frames) - a manual, precision-style check.
eval_manual.shape[0] / df_top_k_20.shape[0]

0.19574756229685808

In [92]:
# Total number of recommendation rows (denominator of the ratio above)
df_top_k_20.shape[0]

73840

In [30]:
# Inspect the learning rate held by the model; it was not passed to RBM(...) in
# this notebook, so this is presumably the library default - TODO confirm.
model.learning_rate

0.004