In [2]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt

In [79]:
# Reading Data
# Load the first sheet (index 0) of the recommender-system workbook; the path is
# relative to this notebook. Alternative sources previously listed here
# (data.csv, online_retail.xlsx, online_retail_II.xlsx, and sheets 1 and 2 of
# this workbook) are not loaded.
df_online_retail = pd.read_excel(io='../../00_Data/Rec_sys_data.xlsx', sheet_name=0)

# 1. Online Retail Dataset

## 1.1. Initial Data Check

In [80]:
# Find out the number of missing values per column
# (the recorded output shows 0 for every column).
df_online_retail.isna().sum()

InvoiceNo       0
StockCode       0
Quantity        0
InvoiceDate     0
DeliveryDate    0
Discount%       0
ShipMode        0
ShippingCost    0
CustomerID      0
dtype: int64

In [81]:
# Inspect the column dtypes and non-null counts before any type conversion
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272404 entries, 0 to 272403
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   InvoiceNo     272404 non-null  int64         
 1   StockCode     272404 non-null  object        
 2   Quantity      272404 non-null  int64         
 3   InvoiceDate   272404 non-null  datetime64[ns]
 4   DeliveryDate  272404 non-null  datetime64[ns]
 5   Discount%     272404 non-null  float64       
 6   ShipMode      272404 non-null  object        
 7   ShippingCost  272404 non-null  float64       
 8   CustomerID    272404 non-null  int64         
dtypes: datetime64[ns](2), float64(2), int64(3), object(2)
memory usage: 18.7+ MB


In [82]:
# The user and item identifiers are label-encoded further down, so both key
# columns are cast to strings first.
for key_column in ('CustomerID', 'StockCode'):
    df_online_retail[key_column] = df_online_retail[key_column].astype(str)

# Sanity check: inspect the dtypes again after the cast
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272404 entries, 0 to 272403
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   InvoiceNo     272404 non-null  int64         
 1   StockCode     272404 non-null  object        
 2   Quantity      272404 non-null  int64         
 3   InvoiceDate   272404 non-null  datetime64[ns]
 4   DeliveryDate  272404 non-null  datetime64[ns]
 5   Discount%     272404 non-null  float64       
 6   ShipMode      272404 non-null  object        
 7   ShippingCost  272404 non-null  float64       
 8   CustomerID    272404 non-null  object        
dtypes: datetime64[ns](2), float64(2), int64(2), object(3)
memory usage: 18.7+ MB


In [83]:
# Finally, add a column indicating that an item was bought by a user.
# The column is assumed to be binary: 1 means bought, 0 means no interaction.
# Every row in this frame is a purchase, so the whole column is set to 1.
df_online_retail['purchased'] = 1
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID,purchased
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850,1
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850,1
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850,1
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850,1
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850,1


In [84]:
# Number of rows per customer, sorted in ascending order
df_online_retail.groupby('CustomerID').size().sort_values()

CustomerID
12346       1
17846       1
14351       1
17763       1
13185       1
         ... 
15311    1892
14606    2185
12748    2388
14911    3648
17841    5095
Length: 3647, dtype: int64

In [85]:
# Label-encode the item and user identifiers into integer codes

# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# One encoder per column: le1 handles items (StockCode), le2 handles users (CustomerID)
le1 = LabelEncoder()
le2 = LabelEncoder()

# Fit each encoder on its column and replace the column with the encoded labels
df_online_retail['StockCode'] = le1.fit_transform(df_online_retail['StockCode'])
df_online_retail['CustomerID'] = le2.fit_transform(df_online_retail['CustomerID'])

# Check the change
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID,purchased
0,536365,2695,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,3381,1
1,536365,2548,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,3381,1
2,536365,750,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,3381,1
3,536365,2747,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,3381,1
4,536365,1599,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,3381,1


## 1.2. Filter out Users

In [86]:
# Count the number of rows per user
rows_per_user = df_online_retail.groupby('CustomerID').size()

# Turn the counts into a two-column DataFrame (CustomerID, Purchases).
# reset_index already names the key column 'CustomerID', so no rename is needed.
df_rows_per_user = rows_per_user.reset_index(name='Purchases')

# Display the new DataFrame, users with the most purchases first
df_rows_per_user.sort_values(by='Purchases', ascending=False)

Unnamed: 0,CustomerID,Purchases
3375,3375,5095
1604,1604,3648
268,268,2388
1423,1423,2185
1858,1858,1892
...,...,...
3306,3306,1
3328,3328,1
3331,3331,1
1373,1373,1


In [87]:
# Find a threshold to cut: for each candidate, count the users whose number of
# purchases lies strictly below it.
for purchase_threshold in (13, 15, 20, 50):
    users_below = len(df_rows_per_user[df_rows_per_user['Purchases'] < purchase_threshold])
    print("Users with less than {} Purchases \t:".format(purchase_threshold), users_below)

Users with less than 13 Purchases 	: 729
Users with less than 15 Purchases 	: 856
Users with less than 20 Purchases 	: 1151
Users with less than 50 Purchases 	: 2206


In [88]:
# Minimum number of purchases a user needs in order to be kept.
# NOTE: the variable names below carry a "_20" suffix, but the cut-off that is
# actually applied is 15.
min_purchases = 15

# Create a list of users with fewer than `min_purchases` purchases
customer_less_20 = df_rows_per_user[df_rows_per_user['Purchases'] < min_purchases]['CustomerID'].tolist()

# Filter out these users from the initial DataFrame df_online_retail
df_online_retail_20 = df_online_retail[~df_online_retail['CustomerID'].isin(customer_less_20)]

# Show shape of new DataFrame
df_online_retail_20.shape

(265836, 10)

In [89]:
# How many rows were removed by the user filter?
df_online_retail.shape[0] - df_online_retail_20.shape[0]

6568

## 1.3. Data Split

In [90]:
from recommenders.datasets.sparse import AffinityMatrix

In [91]:
# Standard column names used across the analysis
header = {
    "col_user": "CustomerID",
    "col_item": "StockCode",
    "col_rating": "purchased",
}

# Instantiate the sparse matrix generation, reusing the column mapping above
am = AffinityMatrix(df=df_online_retail_20, **header)

# Obtain the affinity matrix (the other two return values are not used here)
X, _, _ = am.gen_affinity_matrix()

# Check that the generated matrix has the correct dimensions:
# one row per distinct user and one column per distinct item.
n_users = df_online_retail_20[header["col_user"]].unique().shape[0]
n_items = df_online_retail_20[header["col_item"]].unique().shape[0]
assert X.shape[0] == n_users, "affinity matrix rows do not match the number of users"
assert X.shape[1] == n_items, "affinity matrix columns do not match the number of items"

In [92]:
from recommenders.datasets.python_splitters import numpy_stratified_split
# Split the affinity matrix into a train and a test matrix using ratio=0.8
# and a fixed seed so the split is repeatable.
Xtr, Xtst = numpy_stratified_split(X, ratio=0.8, seed=1)

In [93]:
# Print the shapes of both splits (identical in the recorded output)
print('train matrix size', Xtr.shape)
print('test matrix size', Xtst.shape)

train matrix size (2791, 3523)
test matrix size (2791, 3523)


In [94]:
# Number of non-zero entries in each split
count_nonzero_train = np.count_nonzero(Xtr)
count_nonzero_test = np.count_nonzero(Xtst)
print('Elements in train: \t', count_nonzero_train)
print('Elements in test:\t', count_nonzero_test)

Elements in train: 	 151016
Elements in test:	 35452


In [95]:
# Binarize the training matrix: every value above 0 becomes 1,
# all other values are left unchanged.
Xtr = np.where(Xtr > 0, 1, Xtr)

In [96]:
# Binarize the test matrix: every value above 0 becomes 1,
# all other values are left unchanged.
Xtst = np.where(Xtst > 0, 1, Xtst)

## 1.4. Model

In [97]:
import logging
import tensorflow as tf
# Restrict TensorFlow's logger to messages of level ERROR and above
tf.get_logger().setLevel(logging.ERROR)

#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
    map_at_k,
    auc
)


In [98]:
# Remove any `model` left over from a previous run before a new one is created.
# On a fresh kernel the name does not exist yet; that NameError is the only
# failure that is expected here, so it is the only one that is ignored.
try:
    del model
except NameError:
    pass

In [99]:
# First we initialize the model class.
# The possible ratings are the distinct values of the training matrix without 0;
# the number of visible units equals the number of columns of Xtr.
rbm_settings = dict(
    possible_ratings=np.setdiff1d(np.unique(Xtr), np.array([0])),
    visible_units=Xtr.shape[1],
    hidden_units=100,
    training_epoch=30,
    minibatch_size=10,
    keep_prob=0.8,
    with_metrics=True,
    seed=1,
    learning_rate=0.0001,
)
model = RBM(**rbm_settings)

In [100]:
# Fit the model on the training matrix and measure how long it takes
with Timer() as train_time:
    model.fit(Xtr)

training_seconds = train_time.interval
print("Took {:.2f} seconds for training.".format(training_seconds))

Took 29.60 seconds for training.


In [101]:
# Number of top-scored items to recommend
K = 20

# Model prediction on the test set Xtst, timed
with Timer() as prediction_time:
    top_k = model.recommend_k_items(Xtst, top_k=K)

prediction_seconds = prediction_time.interval
print("Took {:.2f} seconds for prediction.".format(prediction_seconds))

Took 0.58 seconds for prediction.


In [102]:
# Map the top-k recommendation matrix and the test matrix back to DataFrames
# (user, item and prediction / rating columns)
top_k_df = am.map_back_sparse(top_k, kind='prediction')
test_df = am.map_back_sparse(Xtst, kind='ratings')

In [103]:
# Drop Customer ID because non existent
#top_k_df = top_k_df.drop(top_k_df[top_k_df['Customer ID'] == 0].index)

In [104]:
# Collection of evaluation metrics for later use
def ranking_metrics(
    data_size,
    data_true,
    data_pred,
    K
):
    """Compute Precision@K, Recall@K and MAP@K and collect them in a one-row DataFrame.

    Args:
        data_size: Label stored in the "Dataset" column of the result.
        data_true: Ground-truth DataFrame; read through the columns
            "CustomerID", "StockCode" and "purchased".
        data_pred: Prediction DataFrame; read through the columns
            "CustomerID", "StockCode" and "prediction".
        K: Cut-off passed to every metric as ``k``.

    Returns:
        A pandas DataFrame with a single row (index 0) and the columns
        "Dataset", "K", "Precision@k", "Recall@k" and "MAP".
    """
    # Keyword arguments shared by all three metric calls
    shared_args = dict(
        col_user="CustomerID",
        col_item="StockCode",
        col_rating="purchased",
        col_prediction="prediction",
        relevancy_method="top_k",
        k=K,
    )

    eval_precision = precision_at_k(data_true, data_pred, **shared_args)
    eval_recall = recall_at_k(data_true, data_pred, **shared_args)
    eval_map = map_at_k(data_true, data_pred, **shared_args)

    summary = {
        "Dataset": data_size,
        "K": K,
        "Precision@k": eval_precision,
        "Recall@k": eval_recall,
        "MAP": eval_map,
    }
    df_result = pd.DataFrame(summary, index=[0])

    return df_result

In [105]:
# Evaluate the top-K recommendations against the test interactions.
# NOTE(review): the variable is called eval_at_10, but the recorded result row
# reports K = 20 — consider renaming it once no other cell depends on the name.
eval_at_10= ranking_metrics(
    data_size="Online Retail",
    data_true=test_df,
    data_pred=top_k_df,
    K=K
)

eval_at_10

Unnamed: 0,Dataset,K,Precision@k,Recall@k,MAP
0,Online Retail,20,0.010695,0.016223,0.00526


In [106]:
# Inspect the recommendation table (one row per recommended user/item pair)
top_k_df

Unnamed: 0,CustomerID,StockCode,prediction
0,1,1574,1.0
1,1,1244,1.0
2,1,3029,1.0
3,1,2129,1.0
4,1,3034,1.0
...,...,...,...
55815,3646,2532,1.0
55816,3646,1085,1.0
55817,3646,1917,1.0
55818,3646,2004,1.0


In [118]:
# Recommendations for the customer with CustomerID 1, ordered by StockCode
top_k_df[top_k_df['CustomerID']==1].sort_values('StockCode')

Unnamed: 0,CustomerID,StockCode,prediction
8,1,273,1.0
9,1,314,1.0
7,1,318,1.0
16,1,423,1.0
18,1,749,1.0
15,1,908,1.0
19,1,915,1.0
1,1,1244,1.0
5,1,1245,1.0
11,1,1352,1.0


In [119]:
# Test-set interactions of the same customer (CustomerID 1), ordered by StockCode,
# for a side-by-side comparison with the recommendations
test_df[test_df['CustomerID']==1].sort_values('StockCode')

Unnamed: 0,CustomerID,StockCode,purchased
8,1,273,1
9,1,314,1
7,1,318,1
15,1,908,1
1,1,1244,1
5,1,1245,1
11,1,1352,1
0,1,1574,1
12,1,1649,1
10,1,2006,1
