# Loading Libraries

In [1]:
# Pandas, Numpy
import pandas as pd
import numpy as np 

# Tensorflow
import tensorflow as tf 

# RBM from Recommenders
from recommenders.models.rbm.rbm import RBM 
from recommenders.utils.timer import Timer 
from recommenders.utils.plot import line_graph 

# Affinity Matrix 
from recommenders.datasets.sparse import AffinityMatrix

# Evaluation 
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k
)
from sklearn.metrics import accuracy_score 

# Visualization
from matplotlib import pyplot as plt 

# Import & Read Dataset

In [2]:
# Read Train & Test Data
train = pd.read_csv("../../00_Data/online_retail_ratings_train.csv", index_col=0)
test = pd.read_csv("../../00_Data/online_retail_ratings_test.csv", index_col=0)

In [3]:
# Check Shapes
print("Shape of Train:", train.shape)
print("Shape of Test: \t", test.shape)

Shape of Train: (357692, 3)
Shape of Test: 	 (89479, 3)


In [4]:
# Check Number of Unique Items and User in Train & Test 
print("Unique Users in Train:", train.CustomerID.nunique())
print("Unique Users in Test:", test.CustomerID.nunique())
print("Unique Items in Train:", train.StockCode.nunique())
print("Unique Items in Test:", test.StockCode.nunique())

Unique Users in Train: 3696
Unique Users in Test: 3696
Unique Items in Train: 2769
Unique Items in Test: 2769


In [5]:
# Value Counts of Ratings in Train
train.purchased.value_counts()

 1    153304
-1    149998
 2     48666
-2      5724
Name: purchased, dtype: int64

In [6]:
# Value Counts of Ratings in Test
test.purchased.value_counts()

 1    38373
-1    37499
 2    12174
-2     1433
Name: purchased, dtype: int64

# Exclude negative Ratings for Modeling

In [7]:
# Exclude negative Ratings from Train  
train = train[train['purchased']>0]

# Show Impact 
train.purchased.value_counts()

1    153304
2     48666
Name: purchased, dtype: int64

In [8]:
# Exclude negative Ratings from test  
test = test[test['purchased']>0]

# Show Impact 
test.purchased.value_counts()

1    38373
2    12174
Name: purchased, dtype: int64

In [9]:
# Ensure that the same items and users are in both sets 
# Find the set of unique items in both the train and test sets
train_users = set(train['CustomerID'].unique())
test_users = set(test['CustomerID'].unique())

# Find the intersection of the sets from step 1
common_items = train_users.intersection(test_users)

# Filter the train and test sets to include only the rows with item IDs that are in the intersection set
train = train[train['CustomerID'].isin(common_items)]
test = test[test['CustomerID'].isin(common_items)]

In [15]:
# Ensure that the same items and users are in both sets 
# Find the set of unique items in both the train and test sets
train_items = set(train['StockCode'].unique())
test_items = set(test['StockCode'].unique())

# Find the intersection of the sets from step 1
common_items = train_items.intersection(test_items)

# Filter the train and test sets to include only the rows with item IDs that are in the intersection set
train = train[train['StockCode'].isin(common_items)]
test = test[test['StockCode'].isin(common_items)]

In [16]:
# Check Number of Unique Items and User in Train & Test 
print("Unique Users in Train:", train.CustomerID.nunique())
print("Unique Users in Test:", test.CustomerID.nunique())
print("Unique Items in Train:", train.StockCode.nunique())
print("Unique Items in Test:", test.StockCode.nunique())

Unique Users in Train: 3643
Unique Users in Test: 3643
Unique Items in Train: 2765
Unique Items in Test: 2765


# Preparation of Datasets for Modeling 

In [17]:
# Set standard names for the analysis 
header = {
        "col_user": "CustomerID",
        "col_item": "StockCode",
        "col_rating": "purchased",
    }

# Instantiate the sparse matrix generation  
am_train = AffinityMatrix(df = train, col_user='CustomerID', col_item='StockCode', col_rating='purchased')
am_test = AffinityMatrix(df = test, col_user='CustomerID', col_item='StockCode', col_rating='purchased')

# Obtain the sparse matrix 
Xtr, _, _ = am_train.gen_affinity_matrix()
Xtst, _, _ = am_test.gen_affinity_matrix()

In [18]:
# Sanity Check: Print Shape of Train & Test Matrix - Do numbers match nr of unique users and items from above 
print(Xtr.shape)
print(Xtst.shape) 

(3643, 2765)
(3643, 2765)


In [19]:
# Sanity check: Xtr & Xtst After reloading & Transformation
print(np.unique(Xtr))
print(np.unique(Xtst))

[0 1 2]
[0 1 2]


# Modeling

## Hyperparameter Tuning 1 

In [20]:
# Set up Parameters 
hidden_neurons = [500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500]
batch = [200, 300]
lrate = [0.002, 0.003, 0.004]

# Initiate Lists for Assessment
recall_10 = []
recall_20 = []
precision_10 = []
precision_20 = []
neurons = []
batches = []
learning_rates = []

# Loop through lists of parameters 

for n in hidden_neurons:
    for b in batch:
        for l in lrate:
            # Delete Model First
            try:
                del(model)
            except:
                pass
            
            # Define Model with Parameters
            model = RBM(
                possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])), # Always provide this range - way better results! 
                visible_units=Xtr.shape[1],
                hidden_units=n,
                training_epoch=30,
                minibatch_size=b,
                keep_prob=0.7,
                with_metrics=True,
                learning_rate=l,
                seed=42
            )
            
            # Fit Model 
            model.fit(Xtr)
            
            # Recommend top k
            top_k = model.recommend_k_items(Xtst, top_k=20, remove_seen=True)
            
            # Map Back Xtst & Top K
            top_k_df = am_test.map_back_sparse(top_k, kind = 'prediction')
            test_df  = am_test.map_back_sparse(Xtst, kind='ratings')
            
            # Evaluation 
            recall_at_ten  = recall_at_k(test_df, top_k_df, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=10)
            recall_at_twenty = recall_at_k(test_df, top_k_df, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k=20)
            precision_at_ten = precision_at_k(test_df, top_k_df, col_user='CustomerID', col_item='StockCode',
                                            col_rating ='purchased', col_prediction="prediction",
                                            relevancy_method='top_k', k = 10)
            precision_at_twenty = precision_at_k(test_df, top_k_df, col_user='CustomerID', col_item='StockCode',
                                col_rating ='purchased', col_prediction="prediction",
                                relevancy_method='top_k', k = 20)
            
            # Append lists 
            recall_10.append(recall_at_ten)
            recall_20.append(recall_at_twenty)
            precision_10.append(precision_at_ten)
            precision_20.append(precision_at_twenty)
            neurons.append(n)
            batches.append(b)
            learning_rates.append(l)
            
            # Print Results 
            print("Recall at 10 for ", n, "neurons:", recall_at_ten)

Recall at 10 for  500 neurons: 0.03296430067342124
Recall at 10 for  500 neurons: 0.03315551997038836
Recall at 10 for  500 neurons: 0.03302470214545035
Recall at 10 for  500 neurons: 0.032110649640947356
Recall at 10 for  500 neurons: 0.03147368098527063
Recall at 10 for  500 neurons: 0.033319262921085765
Recall at 10 for  600 neurons: 0.048356811914426076
Recall at 10 for  600 neurons: 0.04867442371518753
Recall at 10 for  600 neurons: 0.053020576134617685
Recall at 10 for  600 neurons: 0.04563459300633603
Recall at 10 for  600 neurons: 0.04498903991138204
Recall at 10 for  600 neurons: 0.04797919220137193
Recall at 10 for  700 neurons: 0.05484474999225608
Recall at 10 for  700 neurons: 0.0515469594906076
Recall at 10 for  700 neurons: 0.05018884205430742
Recall at 10 for  700 neurons: 0.058727707178232405
Recall at 10 for  700 neurons: 0.06016475560912275
Recall at 10 for  700 neurons: 0.05579373528837845
Recall at 10 for  800 neurons: 0.051207377042176455
Recall at 10 for  800 neur

In [21]:
# Create Dataframe out of the Results of the Hyperparameter Tuning 
df_tuning_1 = pd.DataFrame({
    "hidden_neurons": neurons,
    "minibatches": batches,
    "learning_rates":learning_rates,
    "recall@10": recall_10,
    "recall@20": recall_20,
    "precision@10":precision_10,
    "precision@20":precision_20
})
# Sort Values by recall@10 
df_tuning_1.sort_values(by='recall@10', ascending=False)

Unnamed: 0,hidden_neurons,minibatches,learning_rates,recall@10,recall@20,precision@10,precision@20
50,1300,200,0.004,0.171765,0.237026,0.206478,0.145745
38,1100,200,0.004,0.171297,0.238259,0.199780,0.142300
37,1100,200,0.003,0.163263,0.229461,0.189569,0.136248
49,1300,200,0.003,0.161478,0.226501,0.196102,0.140255
41,1100,300,0.004,0.147136,0.209359,0.168982,0.123634
...,...,...,...,...,...,...,...
1,500,200,0.003,0.033156,0.054704,0.043069,0.036152
2,500,200,0.004,0.033025,0.055081,0.043920,0.036453
0,500,200,0.002,0.032964,0.054278,0.042575,0.035232
3,500,300,0.002,0.032111,0.054122,0.041806,0.034271
