In [2]:
#%pip install -q tensorflow-recommenders
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Embedding, Dense, StringLookup
from tensorflow.keras import Sequential
import tensorflow_recommenders as tfrs
from tensorflow.keras.metrics import AUC
import math
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.src.layers import LSTM

In [3]:
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [4]:
# Load data
def map_event(event):
    if event == 'view':
        return 1
    elif event == 'addtocart':
        return 2
    elif event == 'transaction':
        return 3
    else:
        return 0

basepath = '../../datasets/preprocessed_datasets/retailrocket/'
events = pd.read_pickle(basepath +'events_10k.pkl')
events['event_mapped'] = events['event'].apply(map_event)
events = events.drop(columns=['timestamp', 'event', 'transactionid', 'datetime'])
item_properties = pd.read_pickle(basepath +'item_data_extracted.pkl')

item_properties = item_properties[item_properties['itemid'].isin(events.itemid.unique())]

display(events, item_properties)

Unnamed: 0,visitorid,itemid,event_mapped
2482751,1046539,373805,1
2204101,1383579,287405,1
1964828,474264,153625,1
2369299,1079433,287356,1
640471,865010,113440,1
...,...,...,...
560807,269455,459735,1
1158627,228194,160499,1
2074769,435897,346429,1
1911438,1099927,261011,1


Unnamed: 0,itemid,categoryid,available,properties,property_values
6,6,1091,1,"[112, 159, 19, 202, 227, 28, 283, 364, 521, 55...","[679677, 519769, 1297729 n72.000 309206, 60935..."
31,32,1173,0,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n973.200 424566, 679677, 519..."
40,42,84,1,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n68.400 424566, 679677, 5197..."
137,147,646,1,"[1092, 112, 159, 202, 283, 348, 364, 461, 491,...","[291010, 679677, 519769, 229273 388993 1246541..."
153,163,407,0,"[112, 159, 202, 227, 283, 364, 376, 397, 483, ...","[679677, 519769, 62992 n7440.000 925243, 92933..."
...,...,...,...,...,...
416890,466685,1400,1,"[102, 1028, 112, 159, 202, 227, 275, 28, 283, ...","[769062, 769062, 679677, 519769, 1109436 45934..."
416941,466740,967,0,"[1008, 1036, 112, 120, 140, 159, 202, 227, 283...","[124229 n336.000 1144008, 1154859, 679677, 115..."
416960,466760,1549,0,"[1036, 1066, 112, 159, 202, 210, 227, 230, 283...","[1318567, n720.000 424566, 679677, 519769, 123..."
417047,466861,1051,0,"[1036, 1066, 112, 159, 202, 227, 230, 283, 300...","[1318567, 732011 424566, 679677, 519769, 10769..."


In [5]:
events = pd.merge(events, item_properties[['itemid', 'properties', 'property_values']],
                                   left_on='itemid',
                                   right_on='itemid',
                                   how='left')
display(events)

Unnamed: 0,visitorid,itemid,event_mapped,properties,property_values
0,1046539,373805,1,"[1075, 112, 159, 202, 25, 283, 364, 6, 678, 69...","[[n120.000 1029109 n1200.000 1029109, n120.000..."
1,1383579,287405,1,"[112, 159, 202, 227, 28, 283, 293, 30, 348, 36...","[679677, 519769, 1288624 n4944.000 969301, 282..."
2,474264,153625,1,"[1092, 112, 159, 202, 283, 348, 349, 364, 461,...","[291010, 679677, 519769, 695463, 726714 422480..."
3,1079433,287356,1,"[1066, 112, 159, 202, 227, 230, 283, 307, 364,...","[732011 424566, 679677, 519769, 1213269, 12143..."
4,865010,113440,1,"[1000, 1036, 112, 159, 202, 227, 230, 283, 324...","[871215 1022520, 726612, 679677, 519769, 75240..."
...,...,...,...,...,...
9995,269455,459735,1,"[1000, 1036, 112, 159, 202, 227, 230, 283, 364...","[237874 1022520, 726612, 679677, 519769, 12751..."
9996,228194,160499,1,"[112, 159, 202, 227, 28, 283, 293, 30, 348, 36...","[679677, 519769, 592578, 188678 607315, 150169..."
9997,435897,346429,1,"[112, 159, 202, 283, 348, 364, 461, 506, 591, ...","[679677, 519769, 898578, 726714 150169 210500 ..."
9998,1099927,261011,1,"[1036, 112, 152, 159, 202, 227, 230, 283, 348,...","[1154859, 679677, 769062, 519769, 875827 76692..."


In [6]:


import math

# Assuming events is your DataFrame

def process_property_values(series):
    def process_item(item):
        if isinstance(item, float):  # Check if the item is a float
            return item  # Return the float as is
        elif isinstance(item, list):  # Check if the item is a list
            return [sub_item[0] if isinstance(sub_item, list) else sub_item for sub_item in item]
        else:
            return item

    return series.apply(lambda x: process_item(x))

# Process 'properties' and 'property_values'
events['properties'] = events['properties'].apply(lambda x: str(x))
events['property_values'] = process_property_values(events['property_values'])

In [7]:
events['properties'] = events['properties'].apply(lambda x: str(x))
events['property_values'] = events['property_values'].apply(lambda x: str(x))

def get_padded_sequence(values):
    
    string_lookup = StringLookup()
    string_lookup.adapt(values)

    tokenizer_properties = Tokenizer(num_words=string_lookup.vocabulary_size(), oov_token="<OOV>")
    tokenizer_properties.fit_on_texts(values)

    # Convert text to sequences
    sequences = tokenizer_properties.texts_to_sequences(values)

    # Pad sequences
    return pad_sequences(sequences, padding='post')

padded_sequence_properties = get_padded_sequence(events['properties'])
padded_sequence_property_values = get_padded_sequence(events['property_values'])

In [8]:
events['property_values']

0       [property\n1075    n120.000 1029109 n1200.000 ...
1       ['679677', '519769', '1288624 n4944.000 969301...
2       ['291010', '679677', '519769', '695463', '7267...
3       ['732011 424566', '679677', '519769', '1213269...
4       ['871215 1022520', '726612', '679677', '519769...
                              ...                        
9995    ['237874 1022520', '726612', '679677', '519769...
9996    ['679677', '519769', '592578', '188678 607315'...
9997    ['679677', '519769', '898578', '726714 150169 ...
9998    ['1154859', '679677', '769062', '519769', '875...
9999                                                  nan
Name: property_values, Length: 10000, dtype: object

In [9]:
padded_sequence_property_values

array([[  7, 347, 210, ...,   0,   0,   0],
       [ 19,  20,   1, ...,   0,   0,   0],
       [377,  19,  20, ...,   0,   0,   0],
       ...,
       [ 19,  20,   1, ...,   0,   0,   0],
       [106,  19,   5, ...,   0,   0,   0],
       [171,   0,   0, ...,   0,   0,   0]], dtype=int32)

In [10]:
# Create a tf.data.Dataset from the interaction data
#todo check how to use array of ids
interaction_dataset = tf.data.Dataset.from_tensor_slices({
    'visitorid': events['visitorid'].values,
    'itemid': events['itemid'].values,
    'event': events['event_mapped'].values,
    'properties': padded_sequence_properties,  # Convert to list
    'property_values': padded_sequence_property_values,
})
#variables 
dataset_len = events.shape[0]
test_len = math.ceil(dataset_len * 0.2)
train_len = dataset_len - test_len

metrics_batchsize = 16
train_batch_size = 128
test_batch_size = 64
random_seed = 27

In [11]:
interaction_dataset

<_TensorSliceDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None), 'properties': TensorSpec(shape=(54,), dtype=tf.int32, name=None), 'property_values': TensorSpec(shape=(767,), dtype=tf.int32, name=None)}>

In [12]:
#train test split
tf.random.set_seed(random_seed)
# shuffled = interaction_dataset.shuffle(len, seed=random_seed, reshuffle_each_iteration=False)
# train = shuffled.take(train_len)
# test = shuffled.skip(train_len).take(test_len)
shuffled = interaction_dataset.shuffle(dataset_len, seed=random_seed, reshuffle_each_iteration=False)

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)
display(train, test)

<_TakeDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None), 'properties': TensorSpec(shape=(54,), dtype=tf.int32, name=None), 'property_values': TensorSpec(shape=(767,), dtype=tf.int32, name=None)}>

<_TakeDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None), 'properties': TensorSpec(shape=(54,), dtype=tf.int32, name=None), 'property_values': TensorSpec(shape=(767,), dtype=tf.int32, name=None)}>

In [13]:
# Unique customer and product identifiers
unique_visitor_ids = np.array(events["visitorid"].unique())
unique_item_ids = np.array(events["itemid"].unique())


visitor_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
visitor_ids_vocabulary.adapt(unique_visitor_ids)

item_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
item_ids_vocabulary.adapt(unique_item_ids)


properties_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
properties_vocabulary.adapt(padded_sequence_properties)

property_values_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
property_values_vocabulary.adapt(padded_sequence_property_values)


# print(unique_product_ids.shape[0], unique_customer_ids.shape[0], unique_product_ids, unique_product_ids)

In [14]:
# Define a model using TensorFlow Recommenders
product_ids_dataset = tf.data.Dataset.from_tensor_slices(unique_item_ids)
class RetailModel(tfrs.Model):

    def __init__(self, unique_item_ids_vocab, unique_visitor_ids_vocab, unique_properties_vocab, unique_property_values_vocab, embedding_dimension=32):
        super().__init__()
        # Set up user and product representations
        self.user_embedding = tf.keras.Sequential([
            unique_visitor_ids_vocab,
            Embedding(unique_visitor_ids_vocab.vocabulary_size(), embedding_dimension)
        ])
        self.product_embedding = tf.keras.Sequential([
            unique_item_ids_vocab,
            Embedding(unique_item_ids_vocab.vocabulary_size(), embedding_dimension)
        ])

        max_features = 10000  # Adjust as needed
        max_len = 20          # Adjust as needed
        self.text_vectorization = tf.keras.layers.TextVectorization(
            max_tokens=max_features,
            output_mode='int',
            output_sequence_length=max_len
        )


        # Textual model
        self.textual_properties_model = tf.keras.Sequential([
            self.text_vectorization,
            Embedding(input_dim=unique_property_values_vocab.vocabulary_size(), output_dim=embedding_dimension),
            tf.keras.layers.Reshape((1, embedding_dimension)),  # Adding the 'timesteps' dimension
            LSTM(64),
            Dense(64, activation='relu')
        ])
        
        # Textual model
        self.textual_property_values_model = tf.keras.Sequential([
            self.text_vectorization,
            Embedding(input_dim=unique_property_values_vocab.vocabulary_size(), output_dim=embedding_dimension),
            tf.keras.layers.Reshape((1, embedding_dimension)),
            LSTM(64),
            Dense(64, activation='relu')
        ])

        self.reduced_product_embeddings = Dense(64, activation='relu')
        self.reduced_properties_embeddings = Dense(32, activation='relu')
        self.reduced_property_values_embeddings = Dense(32, activation='relu')
        
        
        # Set up a dense layer for the task.
        self.dense_layer = Dense(128, input_shape=(256,), activation="relu")

        # Set up retrieval task and metrics
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=product_ids_dataset.batch(metrics_batchsize).map(self.product_model))
        )
        self.auc_metric = AUC(name='auc')
        self.rmse_metric = tf.keras.metrics.RootMeanSquaredError(name='rmse')
        self.precision = tf.keras.metrics.Precision(name='precision')
        self.recall = tf.keras.metrics.Recall(name='recall')

    def product_model(self, product_ids):
        return self.product_embedding(product_ids)

    def dot_product_score(self, user, product):
        """
        Computes the dot product between user and product embeddings to get the interaction score.
        """
        return tf.reduce_sum(user * product, axis=1)

    def compute_loss(self, features, training=False):
        # print(features)
        print(list(features.keys()))
        user_embeddings = self.user_embedding(features["visitorid"])
        product_embeddings = self.product_embedding(features["itemid"])
        properties_embeddings = self.textual_properties_model(features["properties"])
        property_values_embeddings = self.textual_property_values_model(features["property_values"])

        reduced_product_embeddings = self.reduced_product_embeddings(product_embeddings)
        reduced_properties_embeddings = self.reduced_properties_embeddings(properties_embeddings)
        reduced_property_values_embeddings = self.reduced_property_values_embeddings(property_values_embeddings)
        
        
        # 
        # # Now concatenate
        product_combined = tf.concat([reduced_product_embeddings, reduced_properties_embeddings, reduced_property_values_embeddings], axis=1)  # shape will be [None, 128]
        


        user_output = self.dense_layer(user_embeddings)
        product_output = self.dense_layer(product_combined)
        # This is a hypothetical function that returns the logits or scores
        # This needs to be adapted based on your actual model
        positive_logits = self.dot_product_score(user_output, product_output)

        # Update RMSE
        self.rmse_metric.update_state(y_true=features["event"], y_pred=positive_logits)


        return self.task(user_output, product_output)

    def evaluate(self, validation_dataset, *args, **kwargs):
        # Call the base class's evaluate method
        
    
        # Compute additional metrics
        for features in validation_dataset:
            user_embeddings = self.user_embedding(features["visitorid"])
            product_embeddings = self.product_embedding(features["itemid"])
            
            user_output = self.dense_layer(user_embeddings)
            product_output = self.dense_layer(product_embeddings)
            
            positive_logits = self.dot_product_score(user_output, product_output)
            # Assuming you have a binary "label" in your dataset indicating 1 for positive interaction and 0 for negative
            self.precision.update_state(y_true=features["event"], y_pred=positive_logits)
            self.recall.update_state(y_true=features["event"], y_pred=positive_logits)
    
        precision_result = self.precision.result().numpy()
        recall_result = self.recall.result().numpy()
    
        # Calculate F1 score
        if (precision_result + recall_result) != 0:
            f1_score = 2 * (precision_result * recall_result) / (precision_result + recall_result)
        else:
            f1_score = 0.0
    
        # Reset the metrics for the next evaluation
        # self.precision.reset_states()
        # self.recall.reset_states()
        base_results = super(RetailModel, self).evaluate(validation_dataset, *args, **kwargs)
    
        return base_results, [precision_result, recall_result, f1_score]  # or append additional results as needed



In [15]:
model = RetailModel(item_ids_vocabulary, visitor_ids_vocabulary, properties_vocabulary, property_values_vocabulary, embedding_dimension=128)
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001), metrics=[AUC()]) # using legacy instead of tf.keras.optimizers.Adagrad, because newer version is slow on m1/m2 macs

In [16]:
# Shuffle, batch, and cache the data.
cached_train = train.shuffle(dataset_len).batch(train_batch_size).cache()
cached_test = test.batch(test_batch_size).cache()
# Train the model
# model.fit(cached_train, validation_data=cached_test, validation_freq=5, epochs=3) 
model.fit(cached_train, validation_freq=5, epochs=3)

Epoch 1/3
['visitorid', 'itemid', 'event', 'properties', 'property_values']


ValueError: in user code:

    File "/Users/thomashuber/miniconda3/envs/m1-master-thesis/lib/python3.9/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/Users/thomashuber/miniconda3/envs/m1-master-thesis/lib/python3.9/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/thomashuber/miniconda3/envs/m1-master-thesis/lib/python3.9/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/Users/thomashuber/miniconda3/envs/m1-master-thesis/lib/python3.9/site-packages/tensorflow_recommenders/models/base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "/var/folders/jp/3q1jwf6d5n11rx71jwtqcgv80000gn/T/ipykernel_69802/206799084.py", line 75, in compute_loss
        properties_embeddings = self.textual_properties_model(features["properties"])
    File "/Users/thomashuber/miniconda3/envs/m1-master-thesis/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/thomashuber/miniconda3/envs/m1-master-thesis/lib/python3.9/site-packages/keras/src/layers/preprocessing/text_vectorization.py", line 588, in _preprocess
        raise ValueError(

    ValueError: Exception encountered when calling layer 'text_vectorization' (type TextVectorization).
    
    When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 54) with rank=2
    
    Call arguments received by layer 'text_vectorization' (type TextVectorization):
      • inputs=tf.Tensor(shape=(None, 54), dtype=int32)


In [None]:
# Evaluate the model
k = 5
result_evaluate_train = model.evaluate(cached_train)
result_evaluate_train

In [None]:
result_evaluat_test = model.evaluate(cached_test)
result_evaluat_test

In [None]:
print("Name\t\tValue")
print("-" * 30)
for metric in model.metrics:
    print(f"{metric.name}\t\t{metric.result().numpy()}")
print(f"AUC: {model.auc_metric.result().numpy()}")

In [None]:
sample = next(iter(cached_test.take(1)))

# Extract customer_id from the sample
visitor_id = sample['visitorid'].numpy()
visitor_id

In [None]:
def get_random_customer_from_test_data():
    return next(iter(test.take(1)))['visitorid'].numpy()

def display_item_ids(item_ids):
    display(item_properties[item_properties.itemid.isin(item_ids) ])

def display_products_by_visitor_id(visitor_id):
    display_item_ids(events[events.visitorid == visitor_id].itemid.tolist())
    

index = tfrs.layers.factorized_top_k.BruteForce(model.user_embedding)
index.index_from_dataset(
    product_ids_dataset.batch(100).map(lambda id: (id, model.product_model(id))))

def predict_user(visitor_id):
    print('predicting user: ', visitor_id)
    print('user already bought following products: ')
    display_products_by_visitor_id(visitor_id)
    
    score, predicted_product_ids = index(np.array([visitor_id]))
    
    print('predicted products: ')
    display_item_ids(predicted_product_ids[0].numpy())
    print('scores: ', score[0].numpy())


user_id = get_random_customer_from_test_data()

predict_user(user_id)

In [None]:
user_id = next(iter(train.take(1)))['visitorid'].numpy()
predict_user(user_id)

In [None]:
model.save('../../models/retailrocket/rr_tensorflow_reco_3_epochs_v1.h5')

In [None]:
unique_recommended_items