In [71]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Model
from matplotlib import pyplot as plt

In [72]:
# Reading Data
df_online_retail = pd.read_csv('../00_Data/data.csv', encoding= 'unicode_escape')
df_online_retail_xlsx = pd.read_excel('../00_Data/online_retail.xlsx', sheet_name=0)
df_online_retail_II = pd.read_excel('../00_Data/online_retail_II.xlsx', sheet_name=0)
df_rec_sys_order = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=0)
df_rec_sys_customer = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=1)
df_rec_sys_product = pd.read_excel('../00_Data/Rec_sys_data.xlsx', sheet_name=2)

# 1. Online Retail Dataset

## 1.1 Data Split

In [73]:
# First check Data Types 
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [74]:
# Since we will use a chronological split for the train/test split, we need to convert Invoice Date to Datetime
df_online_retail['InvoiceDate'] = pd.to_datetime(df_online_retail['InvoiceDate'])
df_online_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [75]:
# Check if the date column is in chronological order
is_chronological = df_online_retail['InvoiceDate'].is_monotonic_increasing

if is_chronological:
    print('The date column is in chronological order')
else:
    print('The date column is not in chronological order')

The date column is in chronological order


In [76]:
# Last but not least, we will need a column, which indicates that an item was bought by a user. We are assuming, this column is binary - 1 means bought, 0 means no interaction
df_online_retail['purchased'] = 1
df_online_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1


------

------

In [77]:
# Load the Chronological Splitter
from recommenders.datasets.python_splitters import python_chrono_split

In [78]:
train, test = python_chrono_split(df_online_retail, ratio=0.8, filter_by='user', min_rating=10, col_user='CustomerID', col_item='StockCode', col_timestamp='InvoiceDate')

In [79]:
# Print the number of users and items in the training and test sets
print(f'Before Split: {df_online_retail.CustomerID.nunique()} users, {df_online_retail.StockCode.nunique()} items')
print(f'Train: {train.CustomerID.nunique()} users, {train.StockCode.nunique()} items')
print(f'Test: {test.CustomerID.nunique()} users, {test.StockCode.nunique()} items')

Before Split: 4372 users, 4070 items
Train: 3781 users, 3659 items
Test: 3781 users, 3022 items


For an affinity matrix, we need the same number of unique users and items in both datasets. This will be done in the next step

In [80]:
# find the set of unique items in both the train and test sets
train_items = set(train['StockCode'].unique())
test_items = set(test['StockCode'].unique())

# find the intersection of the sets from step 1
common_items = train_items.intersection(test_items)

# filter the train and test sets to include only the rows with item IDs that are in the intersection set
train = train[train['StockCode'].isin(common_items)]
test = test[test['StockCode'].isin(common_items)]

In [81]:
# Check again, if the number of unique items and users match
print(f'Train: {train.CustomerID.nunique()} users, {train.StockCode.nunique()} items')
print(f'Test: {test.CustomerID.nunique()} users, {test.StockCode.nunique()} items')

Train: 3781 users, 3001 items
Test: 3781 users, 3001 items


In [82]:
print(train.shape)
print(test.shape)

(318382, 9)
(80732, 9)


## 1.2. Create User-Item Matrix

In [83]:
# create a pivot table from the dataframe
train_matrix = pd.pivot_table(train, values='purchased', index='CustomerID', columns='StockCode')

# replace non-zero values with 1 and missing values with 0
train_matrix[train_matrix > 0] = 1
train_matrix = train_matrix.fillna(0)

# Show Matrix
train_matrix.head()

StockCode,10002,10080,10120,10124G,10125,10133,10135,11001,15030,15034,...,90214R,90214S,90214Y,BANK CHARGES,C2,CRUK,D,DOT,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12349.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12352.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [84]:
# create a pivot table from the dataframe
test_matrix = pd.pivot_table(test, values='purchased', index='CustomerID', columns='StockCode', aggfunc=np.sum)

# replace non-zero values with 1 and missing values with 0
test_matrix[test_matrix > 0] = 1
test_matrix = test_matrix.fillna(0)

# Show Matrix
test_matrix.head()

StockCode,10002,10080,10120,10124G,10125,10133,10135,11001,15030,15034,...,90214R,90214S,90214Y,BANK CHARGES,C2,CRUK,D,DOT,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12349.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12352.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [89]:
print(train_matrix.shape)
print(test_matrix.shape)

(3781, 3001)
(3781, 3001)


## 1.3. Model

In [87]:
# Convert the user and item columns to integers using LabelEncoder
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

train_matrix.index  = user_encoder.fit_transform(train_matrix.index)
train_matrix.columns = item_encoder.fit_transform(train_matrix.columns)

In [88]:
train_matrix = train_matrix.values
test_matrix = test_matrix.values

In [95]:
item_encoder.classes_.shape[0]

3001

In [92]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Model

class RBM(Model):
    def __init__(self, hidden_dim, learning_rate=0.001, momentum=0.9, decay=0.01, iterations=1):
        super(RBM, self).__init__()
        self.hidden_dim = hidden_dim
        self.visible_layer = Dense(units=item_encoder.classes_.shape[0], activation='sigmoid')
        self.hidden_layer = Dense(units=hidden_dim, activation='sigmoid')
        self.b_visible = tf.Variable(tf.zeros(shape=[item_encoder.classes_.shape[0]]), dtype=tf.float32)
        self.b_hidden = tf.Variable(tf.zeros(shape=[hidden_dim]), dtype=tf.float32)
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.decay = decay
        self.iterations = iterations

    def call(self, inputs):
        hidden = self.sample_hidden(inputs)
        visible = self.sample_visible(hidden)
        return visible

    def sample_hidden(self, visible):
        p_hidden = tf.nn.sigmoid(tf.matmul(visible, self.hidden_layer.weights[0]) + self.b_hidden)
        h = tf.nn.relu(tf.sign(p_hidden - tf.random.uniform(tf.shape(p_hidden))))
        for i in range(self.iterations - 1):
            p_visible = tf.nn.sigmoid(tf.matmul(h, self.hidden_layer.weights[0], transpose_b=True) + self.b_visible)
            p_hidden = tf.nn.sigmoid(tf.matmul(p_visible, self.hidden_layer.weights[0]) + self.b_hidden)
            h = tf.nn.relu(tf.sign(p_hidden - tf.random.uniform(tf.shape(p_hidden))))
        return h

    def sample_visible(self, hidden):
        p_visible = tf.nn.sigmoid(tf.matmul(hidden, self.hidden_layer.weights[0], transpose_b=True) + self.b_visible)
        v = tf.nn.relu(tf.sign(p_visible - tf.random.uniform(tf.shape(p_visible))))
        for i in range(self.iterations - 1):
            p_hidden = tf.nn.sigmoid(tf.matmul(v, self.hidden_layer.weights[0]) + self.b_hidden)
            p_visible = tf.nn.sigmoid(tf.matmul(p_hidden, self.hidden_layer.weights[0], transpose_b=True) + self.b_visible)
            v = tf.nn.relu(tf.sign(p_visible - tf.random.uniform(tf.shape(p_visible))))
        return v

    def train_step(self, x):
        with tf.GradientTape() as tape:
            h = self.sample_hidden(x)
            v = self.sample_visible(h)
            h_ = self.sample_hidden(v)
            loss = tf.reduce_mean(tf.square(x - v))
            reg = self.decay * (tf.reduce_sum(tf.square(self.hidden_layer.weights[0])) +
                                tf.reduce_sum(tf.square(self.visible_layer.weights[0])) +
                                tf.reduce_sum(tf.square(self.b_hidden)) +
                                tf.reduce_sum(tf.square(self.b_visible)))
            loss = loss + reg
        grads = tape.gradient(loss, self.trainable_weights)
        v_old = self.visible_layer(x)
        for i in range(self.iterations):
            if i == 0:
                v_new = self.visible_layer(self.sample_hidden(x))
            else:
                v_new = self.visible_layer(self.sample_hidden(v_new))
        grad_weights = tf.matmul(tf.transpose(v_old), self.sample_hidden(x)) - tf.matmul(tf.transpose(v_new), self.sample_hidden(v_new))
        grad_b_visible = tf.reduce_sum(v_old - v_new, axis=0)
        grad_b_hidden = tf.reduce_sum(self.sample_hidden(x) - self.sample_hidden(v_new), axis=0)
        grad_weights = grad_weights / tf.cast(tf.shape(x)[0], tf.float32)
        grad_b_visible = grad_b_visible / tf.cast(tf.shape(x)[0], tf.float32)
        grad_b_hidden = grad_b_hidden / tf.cast(tf.shape(x)[0], tf.float32)
        momentum_update = [self.momentum * w for w in self.velocity] + [(1 - self.momentum) * g for g in grads]
        self.velocity = momentum_update
        weight_update = [w - self.learning_rate * g for w, g in zip(self.trainable_weights, grad_weights)]
        b_visible_update = self.b_visible - self.learning_rate * grad_b_visible
        b_hidden_update = self.b_hidden - self.learning_rate * grad_b_hidden
        self.visible_layer.weights[0].assign(weight_update[0])
        self.hidden_layer.weights[0].assign(weight_update[1])
        self.b_visible.assign(b_visible_update)
        self.b_hidden.assign(b_hidden_update)


In [94]:
# Set the hyperparameters
learning_rate = 0.001
batch_size = 32
epochs = 10
hidden_dim = 128

# Define the model and compile it
rbm = RBM(hidden_dim)
rbm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))

# Train the model
for epoch in range(epochs):
    for i in range(0, train_matrix.shape[0], batch_size):
        batch = train_matrix[i:i+batch_size]
        rbm.train_on_batch(batch, batch)
        
    # Evaluate the model after each epoch
    y_true = test_matrix[test_matrix.nonzero()]
    y_pred = rbm(test_matrix)[test_matrix.nonzero()]
    auc = roc_auc_score(y_true, y_pred)
    print


StagingError: in user code:

    /Users/arasdirekoglu/opt/anaconda3/envs/recom/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /Users/arasdirekoglu/opt/anaconda3/envs/recom/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/arasdirekoglu/opt/anaconda3/envs/recom/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/arasdirekoglu/opt/anaconda3/envs/recom/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/arasdirekoglu/opt/anaconda3/envs/recom/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/arasdirekoglu/opt/anaconda3/envs/recom/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /var/folders/bs/ht_t1wp94tl3wt5xhchcxq2m0000gn/T/ipykernel_24647/607935738.py:49 train_step
        h = self.sample_hidden(x)
    /var/folders/bs/ht_t1wp94tl3wt5xhchcxq2m0000gn/T/ipykernel_24647/607935738.py:30 sample_hidden
        p_hidden = tf.nn.sigmoid(tf.matmul(visible, self.hidden_layer.weights[0]) + self.b_hidden)

    IndexError: list index out of range
