In [315]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [316]:
# Reading Data
df_online_retail = pd.read_csv('../00_Data/data.csv', encoding= 'unicode_escape')
#df_online_retail_xlsx = pd.read_excel('../00_Data/online_retail.xlsx', sheet_name=0)
#df_online_retail_II = pd.read_excel('../00_Data/online_retail_II.xlsx', sheet_name=0)
#df_rec_sys_order = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=0)
#df_rec_sys_customer = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=1)
#df_rec_sys_product = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=2)

# 1. Online Retail Dataset

## 1.1 Data Split

In [317]:
# First check Data Types 
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [318]:
# Since we are going to encode the labels for User and Items, we need to make sure that we convert those to strings 
df_online_retail['CustomerID'] = df_online_retail['CustomerID'].astype(str)

# We can also convert Invoice Date into Date time
df_online_retail['InvoiceDate'] = pd.to_datetime(df_online_retail['InvoiceDate'])
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   541909 non-null  object        
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 33.1+ MB


In [319]:
# Last but not least, we will need a column, which indicates that an item was bought by a user. We are assuming, this column is binary - 1 means bought, 0 means no interaction
df_online_retail['purchased'] = 5
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,5
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,5
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,5
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,5
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,5


In [320]:
# User & Item  Label Encoder 

# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Set up Objects for User and Item Column
le1 = LabelEncoder()
le2 = LabelEncoder()

# Fit the encoders to the columns in question
le1.fit(df_online_retail['StockCode'])
le2.fit(df_online_retail['CustomerID'])

# Transform dataframe with encoded labels
df_online_retail['StockCode'] = le1.transform(df_online_retail['StockCode'])
df_online_retail['CustomerID'] = le2.transform(df_online_retail['CustomerID'])

# Check Change 
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,3536,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,4048,United Kingdom,5
1,536365,2794,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,5
2,536365,3044,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,4048,United Kingdom,5
3,536365,2985,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,5
4,536365,2984,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,5


In [321]:
train, test = train_test_split(df_online_retail, test_size=0.2, random_state=1)

In [322]:
# Print the number of users and items in the training and test sets
print(f'Before Split: {df_online_retail.CustomerID.nunique()} users, {df_online_retail.StockCode.nunique()} items')
print(f'Train: {train.CustomerID.nunique()} users, {train.StockCode.nunique()} items')
print(f'Test: {test.CustomerID.nunique()} users, {test.StockCode.nunique()} items')

Before Split: 4373 users, 4070 items
Train: 4358 users, 4025 items
Test: 4112 users, 3627 items


For an affinity matrix, we need the same number of unique users and items in both datasets. This will be done in the next step

In [323]:
# find the set of unique items and users in both the train and test sets
train_items = set(train['StockCode'].unique())
test_items = set(test['StockCode'].unique())
train_users = set(train['CustomerID'].unique())
test_users = set(test['CustomerID'].unique())

# find the intersection of the sets from step 1
common_items = train_items.intersection(test_items)
common_users = train_users.intersection(test_users)

# filter the train and test sets to include only the rows with item user  IDs that are in the intersection set
train = train[train['StockCode'].isin(common_items)]
train = train[train['CustomerID'].isin(common_users)]
test = test[test['StockCode'].isin(common_items)]
test = test[test['CustomerID'].isin(common_users)]

In [324]:
# Check again, if the number of unique items and users match
print(f'Train: {train.CustomerID.nunique()} users, {train.StockCode.nunique()} items')
print(f'Test: {test.CustomerID.nunique()} users, {test.StockCode.nunique()} items')

Train: 4097 users, 3582 items
Test: 4097 users, 3582 items


In [325]:
print(train.shape)
print(test.shape)

(430936, 9)
(108316, 9)


## 1.2. Create User-Item Matrix

In [326]:
# create a pivot table from the dataframe
train_matrix = pd.pivot_table(train, values='purchased', index='CustomerID', columns='StockCode')

# replace non-zero values with 1 and missing values with 0
train_matrix[train_matrix > 0] = 1
#train_matrix = train_matrix.fillna(0)

# Show Matrix
train_matrix.head()

StockCode,0,1,2,6,7,8,10,11,12,13,...,4058,4059,4060,4061,4062,4063,4064,4066,4067,4068
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,1.0,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,,,,...,,,,,1.0,,,,,
5,,,,,,,,,,,...,,,1.0,,1.0,,,,,


In [327]:
# create a pivot table from the dataframe
test_matrix = pd.pivot_table(test, values='purchased', index='CustomerID', columns='StockCode', aggfunc=np.sum)

# replace non-zero values with 1 and missing values with 0
test_matrix[test_matrix > 0] = 1
#test_matrix = test_matrix.fillna(0)

# Show Matrix
test_matrix.head()

StockCode,0,1,2,6,7,8,10,11,12,13,...,4058,4059,4060,4061,4062,4063,4064,4066,4067,4068
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,1.0,,1.0,,,,,


In [328]:
# Print Shape of Train & Test Matrix 
print(train_matrix.shape)
print(test_matrix.shape)

(4097, 3582)
(4097, 3582)


In [333]:
print(train_matrix.isna().sum().sum())
print(test_matrix.isna().sum().sum())

14447913
14601422


In [334]:
# Check any rows that do not only contain 0 as a sanity check
train_matrix.loc[(train_matrix!=0).any(axis=1)]

StockCode,0,1,2,6,7,8,10,11,12,13,...,4058,4059,4060,4061,4062,4063,4064,4066,4067,4068
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,1.0,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,,,,...,,,,,1.0,,,,,
5,,,,,,,,,,,...,,,1.0,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4368,,,,,,,,,,,...,,,,,,,,,,
4369,,,,,,,,,,,...,,,,,,,,,,
4370,,,,,,,,,,,...,,,1.0,,,,,,,
4371,,,,,,,,,,,...,,,,,,,,,,


## 1.3. Model

In [335]:
import logging
import tensorflow as tf


#RBM 
from recommenders.models.rbm.rbm import RBM
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph

# Evaluation
from recommenders.evaluation.python_evaluation import (
    precision_at_k,
    recall_at_k,
    map_at_k,
    auc
)


In [336]:
try:
    del(model)
except:
    pass

In [337]:
#First we initialize the model class
model = RBM(
    possible_ratings=np.setdiff1d(np.unique(train_matrix), np.array([0])),
    visible_units=train_matrix.shape[1],
    hidden_units=5000,
    training_epoch=30,
    minibatch_size=600,
    keep_prob=0.7,
    with_metrics=True,
)

In [338]:
#Model Fit
with Timer() as train_time:
    model.fit(train_matrix)

print("Took {:.2f} seconds for training.".format(train_time.interval))

# Plot the train RMSE as a function of the epochs
#line_graph(values=model.rmse_train, labels='train', x_name='epoch', y_name='rmse_train')

Took 249.29 seconds for training.


In [339]:
predicted = model.predict(test_matrix)

In [340]:
predicted

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)

In [341]:
np.unique(predicted)

array([1.], dtype=float32)

In [342]:
test_pred = pd.DataFrame(data=predicted, index=test_matrix.index, columns=test_matrix.columns)

In [343]:
test_pred_dfx  = pd.melt(test_pred.reset_index(), id_vars=['CustomerID'], value_vars=test_pred.columns, var_name='StockCode', value_name='predictions')
test_true =  pd.melt(test_matrix.reset_index(), id_vars=['CustomerID'], value_vars=test_matrix.columns, var_name='StockCode', value_name='purchased')

In [347]:
test_true

Unnamed: 0,CustomerID,StockCode,purchased
0,1,0,
1,2,0,
2,3,0,
3,4,0,
4,5,0,
...,...,...,...
14675449,4368,4068,
14675450,4369,4068,
14675451,4370,4068,
14675452,4371,4068,


In [349]:
test_pred_dfx

Unnamed: 0,CustomerID,StockCode,predictions
0,1,0,1.0
1,2,0,1.0
2,3,0,1.0
3,4,0,1.0
4,5,0,1.0
...,...,...,...
14675449,4368,4068,1.0
14675450,4369,4068,1.0
14675451,4370,4068,1.0
14675452,4371,4068,1.0


In [344]:
from sklearn.metrics import roc_auc_score, recall_score, classification_report, confusion_matrix

In [345]:
roc_auc = roc_auc_score(test_true.purchased, test_pred_dfx.predictions)
print(roc_auc)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
recall_score_x = recall_score(test_true.purchased, test_pred_dfx.predictions)
print(recall_score_x)

1.0


In [346]:
print(classification_report(test_true.purchased, test_pred_dfx.predictions))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
print(confusion_matrix(test_true.purchased, test_pred_dfx.predictions))

[[       0 14601422]
 [       0    74032]]
