In [1]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Reading Data
df_online_retail = pd.read_csv('../../00_Data/data.csv', encoding= 'unicode_escape')
#df_online_retail_xlsx = pd.read_excel('../00_Data/online_retail.xlsx', sheet_name=0)
#df_online_retail_II = pd.read_excel('../00_Data/online_retail_II.xlsx', sheet_name=0)
#df_rec_sys_order = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=0)
#df_rec_sys_customer = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=1)
#df_rec_sys_product = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=2)

# 1. Online Retail Dataset

## 1.1 Data Split

In [3]:
# First check Data Types 
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [4]:
# Since we are going to encode the labels for User and Items, we need to make sure that we convert those to strings 
df_online_retail['CustomerID'] = df_online_retail['CustomerID'].astype(str)

# We can also convert Invoice Date into Date time
df_online_retail['InvoiceDate'] = pd.to_datetime(df_online_retail['InvoiceDate'])
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   541909 non-null  object        
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
# Last but not least, we will need a column, which indicates that an item was bought by a user. We are assuming, this column is binary - 1 means bought, 0 means no interaction
df_online_retail['purchased'] = 1
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1


In [6]:
# User & Item  Label Encoder 

# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Set up Objects for User and Item Column
le1 = LabelEncoder()
le2 = LabelEncoder()

# Fit the encoders to the columns in question
le1.fit(df_online_retail['StockCode'])
le2.fit(df_online_retail['CustomerID'])

# Transform dataframe with encoded labels
df_online_retail['StockCode'] = le1.transform(df_online_retail['StockCode'])
df_online_retail['CustomerID'] = le2.transform(df_online_retail['CustomerID'])

# Check Change 
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,3536,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,4048,United Kingdom,1
1,536365,2794,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
2,536365,3044,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,4048,United Kingdom,1
3,536365,2985,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
4,536365,2984,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1


In [7]:
from recommenders.datasets.sparse import AffinityMatrix

In [8]:
unique_items = df_online_retail['StockCode'].unique

In [9]:
#to use standard names across the analysis 
header = {
        "col_user": "CustomerID",
        "col_item": "StockCode",
        "col_rating": "purchased",
    }

#instantiate the sparse matrix generation  
am = AffinityMatrix(df = df_online_retail, col_user='CustomerID', col_item='StockCode', col_rating='purchased')

#obtain the sparse matrix 
X, _, _ = am.gen_affinity_matrix()

# check that the generated matrix has the correct dimensions
assert (X.shape[0] == df_online_retail.CustomerID.unique().shape[0]) & (
        X.shape[1] == df_online_retail.StockCode.unique().shape[0]
    )

In [10]:
from recommenders.datasets.python_splitters import numpy_stratified_split
Xtr, Xtst = numpy_stratified_split(X, ratio=0.8, seed=1)

In [11]:
print('train matrix size', Xtr.shape)
print('test matrix size', Xtst.shape)

train matrix size (4373, 4070)
test matrix size (4373, 4070)


In [12]:
count_nonzero_train = np.sum(Xtr != 0)
count_nonzero_test = np.sum(Xtst!=0)
print('Elements in train: \t', count_nonzero_train)
print('Elements in test:\t', count_nonzero_test)

Elements in train: 	 203609
Elements in test:	 67816


In [13]:
# Turn all values to 5 that are above 0 
Xtr = np.where(Xtr > 0, 5, Xtr)

In [14]:
# Turn all values to 5 that are above 0 
Xtst = np.where(Xtst > 0, 5, Xtst)

## 1.3. Model

In [35]:
import logging
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)

#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
    map_at_k
)


In [16]:
try:
    del(model)
except:
    pass

In [38]:
hidden_neurons = [500, 1000, 1500]
batch = [80, 90, 100]
K = 20

In [39]:
precision_k = []
recall_k = []
map_k = []
for hiddenlayers in hidden_neurons:
    for bsize in batch: 
        # Delete Model to avoid overwriting
        try:
            del(model)
        except:
            pass
        
        # Define Model with Parameters 
        model = RBM(
            possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
            visible_units=Xtr.shape[1],
            hidden_units=hiddenlayers,
            training_epoch=30,
            minibatch_size=bsize,
            keep_prob=0.9,
            with_metrics=True,
            seed=1
        )
        
        # Fit Model 
        model.fit(Xtr)
        
        # Model Prediction for top_k
        top_k = model.recommend_k_items(Xtst, top_k = K)
        
        # Map Back Xtst & Top_k 
        top_k_df = am.map_back_sparse(top_k, kind = 'prediction')
        test_df = am.map_back_sparse(Xtst, kind = 'ratings')
        
        # Drop Customer 0 
        top_k_df = top_k_df.drop(top_k_df[top_k_df['CustomerID'] == 0].index)
        
        # calculate precision 
        current_precision = precision_at_k(test_df, top_k_df, col_user="CustomerID",col_item="StockCode", col_rating="purchased", col_prediction="prediction", relevancy_method="top_k", k= K)
        
        print("precision_at_k for hidden units:", hiddenlayers, "and batchsize:", bsize, ":",current_precision)
        
        # calculate recall
        current_recall = recall_at_k(test_df, top_k_df, col_user="CustomerID",
                                    col_item="StockCode", col_rating="purchased", col_prediction="prediction",
                                    relevancy_method="top_k", k= K)
        
        print("recall_at_k for hidden units:", hiddenlayers, "and batchsize:", bsize, ":",current_recall)
        print("num missing values:", top_k_df.prediction.isna().sum()) 

precision_at_k for hidden units: 500 and batchsize: 80 : 0.0125772705658583
recall_at_k for hidden units: 500 and batchsize: 80 : 0.01682115855969835
precision_at_k for hidden units: 500 and batchsize: 90 : 0.0125772705658583
recall_at_k for hidden units: 500 and batchsize: 90 : 0.01682115855969835
precision_at_k for hidden units: 500 and batchsize: 100 : 0.0125772705658583
recall_at_k for hidden units: 500 and batchsize: 100 : 0.01682115855969835
precision_at_k for hidden units: 1000 and batchsize: 80 : 0.0125772705658583
recall_at_k for hidden units: 1000 and batchsize: 80 : 0.01682115855969835
precision_at_k for hidden units: 1000 and batchsize: 90 : 0.0125772705658583
recall_at_k for hidden units: 1000 and batchsize: 90 : 0.01682115855969835
precision_at_k for hidden units: 1000 and batchsize: 100 : 0.0125772705658583
recall_at_k for hidden units: 1000 and batchsize: 100 : 0.01682115855969835
precision_at_k for hidden units: 1500 and batchsize: 80 : 0.01238706609605326
recall_at_k 

In [42]:
top_k_df.prediction.isna().sum()

991

In [50]:
top_k_df[top_k_df['CustomerID']==3].sort_values('StockCode')

Unnamed: 0,CustomerID,StockCode,prediction
61,3,139,5.0
69,3,177,5.0
72,3,298,5.0
76,3,340,5.0
65,3,357,5.0
78,3,409,5.0
63,3,801,5.0
70,3,1306,5.0
71,3,1307,5.0
73,3,1635,5.0


In [51]:
test_df[test_df['CustomerID']==3].sort_values('StockCode')

Unnamed: 0,CustomerID,StockCode,purchased
45,3,155,5
46,3,282,5
44,3,369,5
38,3,467,5
49,3,669,5
37,3,1021,5
48,3,1483,5
41,3,1577,5
42,3,1739,5
33,3,2133,5
