In [591]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt

In [592]:
# Reading Data
df_online_retail = pd.read_csv('../../00_Data/data.csv', encoding= 'unicode_escape', na_values='nan')
#df_online_retail_xlsx = pd.read_excel('../00_Data/online_retail.xlsx', sheet_name=0)
#df_online_retail_II = pd.read_excel('../00_Data/online_retail_II.xlsx', sheet_name=0)
#df_rec_sys_order = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=0)
#df_rec_sys_customer = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=1)
#df_rec_sys_product = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=2)

# 1. Online Retail Dataset

## 1.1 Initial Data Check

In [593]:
# Find out number of missing values per column
df_online_retail.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [594]:
# We need to drop missing values in CustomerID
df_online_retail = df_online_retail.dropna(subset=['CustomerID'])

# Sanity check 
df_online_retail.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [595]:
# First check Data Types 
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    406829 non-null  object 
 1   StockCode    406829 non-null  object 
 2   Description  406829 non-null  object 
 3   Quantity     406829 non-null  int64  
 4   InvoiceDate  406829 non-null  object 
 5   UnitPrice    406829 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      406829 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 27.9+ MB


In [596]:
# Since we are going to encode the labels for User and Items, we need to make sure that we convert those to strings 
df_online_retail['CustomerID'] = df_online_retail['CustomerID'].astype(str)

# We can also convert Invoice Date into Date time
df_online_retail['InvoiceDate'] = pd.to_datetime(df_online_retail['InvoiceDate'])
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  object        
 7   Country      406829 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 27.9+ MB


In [597]:
# Last but not least, we will need a column, which indicates that an item was bought by a user. We are assuming, this column is binary - 1 means bought, 0 means no interaction
df_online_retail['purchased'] = 1
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1


In [598]:
df_online_retail.groupby('CustomerID').size().sort_values()

CustomerID
17846.0       1
17763.0       1
13366.0       1
16579.0       1
13391.0       1
           ... 
14606.0    2782
12748.0    4642
14096.0    5128
14911.0    5903
17841.0    7983
Length: 4372, dtype: int64

In [599]:
# User & Item  Label Encoder 

# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Set up Objects for User and Item Column
le1 = LabelEncoder()
le2 = LabelEncoder()

# Fit the encoders to the columns in question
le1.fit(df_online_retail['StockCode'])
le2.fit(df_online_retail['CustomerID'])

# Transform dataframe with encoded labels
df_online_retail['StockCode'] = le1.transform(df_online_retail['StockCode'])
df_online_retail['CustomerID'] = le2.transform(df_online_retail['CustomerID'])

# Check Change 
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,3249,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,4048,United Kingdom,1
1,536365,2649,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
2,536365,2855,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,4048,United Kingdom,1
3,536365,2803,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
4,536365,2802,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1


## 1.2. Filter out Users

In [600]:
# Count the number of rows per user
rows_per_user = df_online_retail.groupby('CustomerID').size()

# Create a new DataFrame with the desired columns and reset the index
df_rows_per_user = rows_per_user.reset_index(name='Purchases')

# Rename the column to match the original DataFrame
df_rows_per_user = df_rows_per_user.rename(columns={'CustomerID': 'CustomerID'})

# Display the new DataFrame
df_rows_per_user.sort_values(by='Purchases', ascending=False)

Unnamed: 0,CustomerID,Purchases
4042,4042,7983
1895,1895,5903
1300,1300,5128
330,330,4642
1674,1674,2782
...,...,...
1318,1318,1
1046,1046,1
2400,2400,1
2379,2379,1


In [601]:
# Find out a Threshold to cut
print("Users with less than 13 Purchases \t:", len(df_rows_per_user[df_rows_per_user['Purchases']<13]))
print("Users with less than 15 Purchases \t:", len(df_rows_per_user[df_rows_per_user['Purchases']<15]))
print("Users with less than 20 Purchases \t:", len(df_rows_per_user[df_rows_per_user['Purchases']<20]))
print("Users with less than 50 Purchases \t:", len(df_rows_per_user[df_rows_per_user['Purchases']<50]))

Users with less than 13 Purchases 	: 807
Users with less than 15 Purchases 	: 926
Users with less than 20 Purchases 	: 1227
Users with less than 50 Purchases 	: 2419


In [602]:
# Create a list of users with less than 20 Purchases
customer_less_20 = df_rows_per_user[df_rows_per_user['Purchases']<10]['CustomerID'].tolist()

# Filter out these users from the initial DataFrame df_online_retail
df_online_retail_20 = df_online_retail[~df_online_retail['CustomerID'].isin(customer_less_20)]

# Show shape of new DataFrame
df_online_retail_20.shape

(403837, 9)

In [603]:
# How many rows less?
df_online_retail.shape[0] - df_online_retail_20.shape[0]

2992

## 1.3. Data Split

In [604]:
from recommenders.datasets.sparse import AffinityMatrix

In [605]:
unique_items = df_online_retail_20['StockCode'].unique

In [606]:
#to use standard names across the analysis 
header = {
        "col_user": "CustomerID",
        "col_item": "StockCode",
        "col_rating": "purchased",
    }

#instantiate the sparse matrix generation  
am = AffinityMatrix(df = df_online_retail_20, col_user='CustomerID', col_item='StockCode', col_rating='purchased')

#obtain the sparse matrix 
X, _, _ = am.gen_affinity_matrix()

# check that the generated matrix has the correct dimensions
assert (X.shape[0] == df_online_retail_20.CustomerID.unique().shape[0]) & (
        X.shape[1] == df_online_retail_20.StockCode.unique().shape[0]
    )

In [607]:
from recommenders.datasets.python_splitters import numpy_stratified_split
Xtr, Xtst = numpy_stratified_split(X, ratio=0.8, seed=1)

In [608]:
print('train matrix size', Xtr.shape)
print('test matrix size', Xtst.shape)

train matrix size (3781, 3680)
test matrix size (3781, 3680)


In [609]:
count_nonzero_train = np.sum(Xtr != 0)
count_nonzero_test = np.sum(Xtst!=0)
print('Elements in train: \t', count_nonzero_train)
print('Elements in test:\t', count_nonzero_test)

Elements in train: 	 214497
Elements in test:	 50348


In [610]:
# Check unique values
np.unique(Xtr)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 40, 41, 42, 49, 54, 56, 65, 85])

In [611]:
# Normalize XTR
Xtr = (Xtr - 0) / (85 - 0) * 5
#Xtr = np.round(Xtr, decimals=2)

# Sanity check
np.unique(Xtr)

array([0.        , 0.05882353, 0.11764706, 0.17647059, 0.23529412,
       0.29411765, 0.35294118, 0.41176471, 0.47058824, 0.52941176,
       0.58823529, 0.64705882, 0.70588235, 0.76470588, 0.82352941,
       0.88235294, 0.94117647, 1.        , 1.05882353, 1.11764706,
       1.17647059, 1.23529412, 1.29411765, 1.35294118, 1.41176471,
       1.47058824, 1.52941176, 1.58823529, 1.64705882, 1.70588235,
       1.76470588, 1.82352941, 1.88235294, 1.94117647, 2.        ,
       2.05882353, 2.11764706, 2.17647059, 2.23529412, 2.35294118,
       2.41176471, 2.47058824, 2.88235294, 3.17647059, 3.29411765,
       3.82352941, 5.        ])

In [612]:
# Check unique values
np.unique(Xtst)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 33, 38])

In [613]:
# Normalize XTR
Xtst = (Xtst - 0) / (38- 0) * 5
#Xtst = np.round(Xtst, decimals=2)

# Sanity check
np.unique(Xtst)

array([0.        , 0.13157895, 0.26315789, 0.39473684, 0.52631579,
       0.65789474, 0.78947368, 0.92105263, 1.05263158, 1.18421053,
       1.31578947, 1.44736842, 1.57894737, 1.71052632, 1.84210526,
       1.97368421, 2.10526316, 2.23684211, 2.36842105, 2.5       ,
       2.63157895, 2.76315789, 2.89473684, 3.02631579, 3.15789474,
       3.28947368, 3.42105263, 3.55263158, 3.68421053, 3.94736842,
       4.34210526, 5.        ])

In [614]:
# Turn all values to 5 that are above 0 
#Xtr = np.where(Xtr > 0, 5, Xtr)
#Xtr = np.where(Xtr ==0, 1, Xtr)

In [615]:
# Turn all values to 5 that are above 0 
#Xtst = np.where(Xtst > 0, 5, Xtst)
#Xtst = np.where(Xtst == 0, 1, Xtst)

In [623]:
(Xtst>0).sum()

50348

## 1.3. Model

In [624]:
import logging
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)

#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
    map_at_k,
    auc
)


In [625]:
try:
    del(model)
except:
    pass

In [626]:
#First we initialize the model class
model = RBM(
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
    visible_units=Xtr.shape[1],
    hidden_units=100,
    training_epoch=30,
    minibatch_size=10,
    keep_prob=0.8,
    with_metrics=True,
    seed=1,
    learning_rate=0.0001
)

In [627]:
#Model Fit
with Timer() as train_time:
    model.fit(Xtr)

print("Took {:.2f} seconds for training.".format(train_time.interval))

Took 70.92 seconds for training.


In [628]:
# number of top score elements to be recommended  
K = 20

# Model prediction on the test set Xtst.
with Timer() as prediction_time:
    top_k =  model.recommend_k_items(Xtst, top_k=K)

print("Took {:.2f} seconds for prediction.".format(prediction_time.interval))

Took 7.98 seconds for prediction.


In [629]:
# Map Back Xtst & Top_k 
top_k_df = am.map_back_sparse(top_k, kind = 'prediction')
test_df = am.map_back_sparse(Xtst, kind = 'ratings')

In [630]:
# Drop Customer ID because non existent
top_k_df = top_k_df.drop(top_k_df[top_k_df['CustomerID'] == 0].index)

In [631]:
#collection of evaluation metrics for later use
def ranking_metrics(
    data_size,
    data_true,
    data_pred,
    K
):
    eval_precision = precision_at_k(data_true, data_pred, col_user="CustomerID", col_item="StockCode",
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k= K)

    eval_recall = recall_at_k(data_true, data_pred, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k= K)
    eval_map = map_at_k(data_true, data_pred, col_user="CustomerID", col_item="StockCode", 
                                    col_rating="purchased", col_prediction="prediction", 
                                    relevancy_method="top_k", k= K)

    df_result = pd.DataFrame(
        {   "Dataset": data_size,
            "K": K,
            "Precision@k": eval_precision,
            "Recall@k": eval_recall,
            "MAP": eval_map,
        }, 
        index=[0]
    )

    return df_result

In [632]:
eval_at_10= ranking_metrics(
    data_size="Online Retail",
    data_true=test_df,
    data_pred=top_k_df,
    K=20
)

eval_at_10

Unnamed: 0,Dataset,K,Precision@k,Recall@k,MAP
0,Online Retail,20,0.000661,0.000782,0.00019


In [633]:
top_k_df[top_k_df['CustomerID']==1].sort_values('StockCode')

Unnamed: 0,CustomerID,StockCode,prediction
4,1,97,3.4636
13,1,141,3.003859
12,1,244,3.100142
7,1,606,2.966516
3,1,774,2.570968
2,1,1728,2.396583
10,1,2428,3.657192
11,1,2501,2.044219
8,1,2517,3.122402
5,1,2960,2.51031


In [635]:
test_df[test_df['CustomerID']==1].sort_values('StockCode')

Unnamed: 0,CustomerID,StockCode,purchased
19,1,275,0.131579
18,1,320,0.263158
2,1,461,0.263158
0,1,1286,0.263158
6,1,1292,0.526316
16,1,1301,0.131579
12,1,1358,0.131579
8,1,1551,0.131579
15,1,1627,0.263158
13,1,1628,0.131579
