In [1]:
#%pip install -q tensorflow-recommenders
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Embedding, Dense, StringLookup
from tensorflow.keras import Sequential
import tensorflow_recommenders as tfrs
from tensorflow.keras.metrics import AUC
import math
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.src.layers import LSTM

In [2]:
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [3]:
# Load data
def map_event(event):
    if event == 'view':
        return 1
    elif event == 'addtocart':
        return 2
    elif event == 'transaction':
        return 3
    else:
        return 0

basepath = '../../datasets/preprocessed_datasets/retailrocket/'
events = pd.read_pickle(basepath +'events_10k.pkl')
events['event_mapped'] = events['event'].apply(map_event)
events = events.drop(columns=['timestamp', 'event', 'transactionid', 'datetime'])
item_properties = pd.read_pickle(basepath +'item_data_extracted.pkl')

item_properties = item_properties[item_properties['itemid'].isin(events.itemid.unique())]

display(events, item_properties)

Unnamed: 0,visitorid,itemid,event_mapped
2482751,1046539,373805,1
2204101,1383579,287405,1
1964828,474264,153625,1
2369299,1079433,287356,1
640471,865010,113440,1
...,...,...,...
560807,269455,459735,1
1158627,228194,160499,1
2074769,435897,346429,1
1911438,1099927,261011,1


Unnamed: 0,itemid,categoryid,available,properties,property_values
6,6,1091,1,"[112, 159, 19, 202, 227, 28, 283, 364, 521, 55...","[679677, 519769, 1297729 n72.000 309206, 60935..."
31,32,1173,0,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n973.200 424566, 679677, 519..."
40,42,84,1,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n68.400 424566, 679677, 5197..."
137,147,646,1,"[1092, 112, 159, 202, 283, 348, 364, 461, 491,...","[291010, 679677, 519769, 229273 388993 1246541..."
153,163,407,0,"[112, 159, 202, 227, 283, 364, 376, 397, 483, ...","[679677, 519769, 62992 n7440.000 925243, 92933..."
...,...,...,...,...,...
416890,466685,1400,1,"[102, 1028, 112, 159, 202, 227, 275, 28, 283, ...","[769062, 769062, 679677, 519769, 1109436 45934..."
416941,466740,967,0,"[1008, 1036, 112, 120, 140, 159, 202, 227, 283...","[124229 n336.000 1144008, 1154859, 679677, 115..."
416960,466760,1549,0,"[1036, 1066, 112, 159, 202, 210, 227, 230, 283...","[1318567, n720.000 424566, 679677, 519769, 123..."
417047,466861,1051,0,"[1036, 1066, 112, 159, 202, 227, 230, 283, 300...","[1318567, 732011 424566, 679677, 519769, 10769..."


In [4]:
# item_properties['properties'] = item_properties['properties'].apply(lambda x: str(x))
# item_properties['property_values'] = item_properties['property_values'].apply(lambda x: str(x))


In [5]:
# Create a tf.data.Dataset from the interaction data
#todo check how to use array of ids
interaction_dataset = tf.data.Dataset.from_tensor_slices({
    'visitorid': events['visitorid'].values,
    'itemid': events['itemid'].values,
    'event': events['event_mapped'].values,
    # 'properties': item_properties['properties'].values,
    # 'property_values': item_properties['property_values'].values,
})
#variables 
dataset_len = events.shape[0]
test_len = math.ceil(dataset_len * 0.2)
train_len = dataset_len - test_len

metrics_batchsize = 16
train_batch_size = 128
test_batch_size = 64
random_seed = 27

In [6]:
interaction_dataset

<_TensorSliceDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [7]:
#train test split
tf.random.set_seed(random_seed)
# shuffled = interaction_dataset.shuffle(len, seed=random_seed, reshuffle_each_iteration=False)
# train = shuffled.take(train_len)
# test = shuffled.skip(train_len).take(test_len)
shuffled = interaction_dataset.shuffle(dataset_len, seed=random_seed, reshuffle_each_iteration=False)

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)
display(train, test)

<_TakeDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

<_TakeDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [8]:
# Unique customer and product identifiers
unique_visitor_ids = np.array(events["visitorid"].unique())
unique_item_ids = np.array(events["itemid"].unique())


visitor_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
visitor_ids_vocabulary.adapt(unique_visitor_ids)

item_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
item_ids_vocabulary.adapt(unique_item_ids)
# print(unique_product_ids.shape[0], unique_customer_ids.shape[0], unique_product_ids, unique_product_ids)

In [9]:
# Define a model using TensorFlow Recommenders
product_ids_dataset = tf.data.Dataset.from_tensor_slices(unique_item_ids)
class RetailModel(tfrs.Model):

    def __init__(self, unique_item_ids_vocab, unique_visitor_ids_vocab, embedding_dimension=32):
        super().__init__()
        # Set up user and product representations
        self.user_embedding = tf.keras.Sequential([
            unique_visitor_ids_vocab,
            Embedding(unique_visitor_ids_vocab.vocabulary_size(), embedding_dimension)
        ])
        self.product_embedding = tf.keras.Sequential([
            unique_item_ids_vocab,
            Embedding(unique_item_ids_vocab.vocabulary_size(), embedding_dimension)
        ])

        # self.property_model = tf.keras.Sequential([
        #     Embedding(input_dim=unique_product_ids_vocab.vocabulary_size(), output_dim=embedding_dimension),
        #     LSTM(64),
        #     Dense(64, activation='relu')
        # ])


        # # Textual description model
        # self.textual_description_model = tf.keras.Sequential([
        #     Embedding(input_dim=unique_item_ids_vocab.vocabulary_size(), output_dim=embedding_dimension),
        #     LSTM(64),
        #     Dense(64, activation='relu')
        # ])
        # 
        # # New embeddings for color and description
        # self.reduced_product_embeddings = Dense(64, activation='relu')
        # self.reduced_description_embeddings = Dense(32, activation='relu')
        # self.reduced_color_embeddings = Dense(32, activation='relu')
        
        # Set up a dense layer for the task.
        self.dense_layer = Dense(128, input_shape=(256,), activation="relu")

        # Set up retrieval task and metrics
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=product_ids_dataset.batch(metrics_batchsize).map(self.product_model))
        )
        self.auc_metric = AUC(name='auc')
        self.rmse_metric = tf.keras.metrics.RootMeanSquaredError(name='rmse')
        self.precision = tf.keras.metrics.Precision(name='precision')
        self.recall = tf.keras.metrics.Recall(name='recall')

    def product_model(self, product_ids):
        return self.product_embedding(product_ids)

    def dot_product_score(self, user, product):
        """
        Computes the dot product between user and product embeddings to get the interaction score.
        """
        return tf.reduce_sum(user * product, axis=1)

    def compute_loss(self, features, training=False):
        # print(features)
        print(list(features.keys()))
        user_embeddings = self.user_embedding(features["visitorid"])
        product_embeddings = self.product_embedding(features["itemid"])

        # properterties_embeddings = self.color_embedding(features["properties"])
        # property_value_embeddings = self.textual_description_model(features["property_values"])
        # 
        # reduced_product_embeddings = self.reduced_product_embeddings(product_embeddings)
        # reduced_property_value_embeddings = self.reduced_description_embeddings(property_value_embeddings)
        # reduced_property_embeddings = self.reduced_color_embeddings(properterties_embeddings)
        # 
        # 
        # # Now concatenate
        # user_combined = tf.concat([user_embeddings, ], axis=1)  # shape will be [None, 128]
        # product_combined = tf.concat([reduced_product_embeddings, reduced_property_embeddings, reduced_property_value_embeddings], axis=1)  # shape will be [None, 128]
        


        user_output = self.dense_layer(user_embeddings)
        product_output = self.dense_layer(product_embeddings)
        # This is a hypothetical function that returns the logits or scores
        # This needs to be adapted based on your actual model
        positive_logits = self.dot_product_score(user_output, product_output)

        # Update RMSE
        self.rmse_metric.update_state(y_true=features["event"], y_pred=positive_logits)

 
        return self.task(user_output, product_output)

    def evaluate(self, validation_dataset, *args, **kwargs):
        # Call the base class's evaluate method
        
    
        # Compute additional metrics
        for features in validation_dataset:
            user_embeddings = self.user_embedding(features["visitorid"])
            product_embeddings = self.product_embedding(features["itemid"])
            
            user_output = self.dense_layer(user_embeddings)
            product_output = self.dense_layer(product_embeddings)
            
            positive_logits = self.dot_product_score(user_output, product_output)
            # Assuming you have a binary "label" in your dataset indicating 1 for positive interaction and 0 for negative
            self.precision.update_state(y_true=features["event"], y_pred=positive_logits)
            self.recall.update_state(y_true=features["event"], y_pred=positive_logits)
    
        precision_result = self.precision.result().numpy()
        recall_result = self.recall.result().numpy()
    
        # Calculate F1 score
        if (precision_result + recall_result) != 0:
            f1_score = 2 * (precision_result * recall_result) / (precision_result + recall_result)
        else:
            f1_score = 0.0
    
        # Reset the metrics for the next evaluation
        # self.precision.reset_states()
        # self.recall.reset_states()
        base_results = super(RetailModel, self).evaluate(validation_dataset, *args, **kwargs)
    
        return base_results, [precision_result, recall_result, f1_score]  # or append additional results as needed



In [10]:
model = RetailModel(item_ids_vocabulary, visitor_ids_vocabulary, embedding_dimension=128)
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001)) # using legacy instead of tf.keras.optimizers.Adagrad, because newer version is slow on m1/m2 macs

In [11]:
# Shuffle, batch, and cache the data.
cached_train = train.shuffle(dataset_len).batch(train_batch_size).cache()
cached_test = test.batch(test_batch_size).cache()
# Train the model
# model.fit(cached_train, validation_data=cached_test, validation_freq=5, epochs=3) 
model.fit(cached_train, validation_freq=5, epochs=3)

Epoch 1/3
['visitorid', 'itemid', 'event']
['visitorid', 'itemid', 'event']
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2b7a79d90>

In [12]:
# Evaluate the model
k = 5
result_evaluate_train = model.evaluate(cached_train)
result_evaluate_train

['visitorid', 'itemid', 'event']


([0.0,
  4.643726348876953,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  176.77102661132812,
  0,
  176.77102661132812],
 [1.0, 1.0, 1.0])

In [13]:
result_evaluat_test = model.evaluate(cached_test)
result_evaluat_test



([0.0,
  2.4143753051757812,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  44.878028869628906,
  0,
  44.878028869628906],
 [1.0, 1.0, 1.0])

In [14]:
print("Name\t\tValue")
print("-" * 30)
for metric in model.metrics:
    print(f"{metric.name}\t\t{metric.result().numpy()}")
print(f"AUC: {model.auc_metric.result().numpy()}")

Name		Value
------------------------------
auc		0.0
rmse		2.4143753051757812
precision		0.0
recall		0.0
factorized_top_k/top_1_categorical_accuracy		1.0
factorized_top_k/top_5_categorical_accuracy		1.0
factorized_top_k/top_10_categorical_accuracy		1.0
factorized_top_k/top_50_categorical_accuracy		1.0
factorized_top_k/top_100_categorical_accuracy		1.0
AUC: 0.0


In [15]:
sample = next(iter(cached_test.take(1)))

# Extract customer_id from the sample
visitor_id = sample['visitorid'].numpy()
visitor_id

array([1164119, 1041770,  918612, 1015928, 1329704, 1397149, 1159302,
       1290428,  194207, 1003965,  719678, 1046392, 1359141,  788230,
        355415,  876730,  404709,  269418,  809739,   18483,  943323,
        990272,  655614,  690498,  883080,  872837, 1282452, 1269394,
        558272,  684514, 1195090,  745113, 1243693,  672785,  336581,
        789542, 1135668,  441858, 1265850, 1280629,  331321,  905124,
        315411,  245266, 1165288,  672257, 1014116,  382875,  767656,
        946321, 1099605,  363136, 1146770, 1079746,  349053, 1095684,
        976833,  748085, 1214199, 1115740, 1072506,  253881,  279762,
       1295583])

In [16]:
def get_random_customer_from_test_data():
    return next(iter(test.take(1)))['visitorid'].numpy()

def display_item_ids(item_ids):
    display(item_properties[item_properties.itemid.isin(item_ids) ])

def display_products_by_visitor_id(visitor_id):
    display_item_ids(events[events.visitorid == visitor_id].itemid.tolist())
    

index = tfrs.layers.factorized_top_k.BruteForce(model.user_embedding, k=5)
index.index_from_dataset(
    product_ids_dataset.batch(100).map(lambda id: (id, model.product_model(id))))

def predict_user(visitor_id):
    print('predicting user: ', visitor_id)
    print('user already bought following products: ')
    display_products_by_visitor_id(visitor_id)
    
    score, predicted_product_ids = index(np.array([visitor_id]))
    
    print('predicted products: ')
    display_item_ids(predicted_product_ids[0].numpy())
    print('scores: ', score[0].numpy())


user_id = get_random_customer_from_test_data()

predict_user(user_id)

predicting user:  1164119
user already bought following products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
260067,291050,1051,0,"[1036, 1066, 112, 159, 202, 227, 230, 283, 300...","[1318567, 269011 n2400.000 424566, 679677, 519..."


predicted products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
15494,17352,1344,1,"[104, 112, 113, 123, 15, 159, 188, 202, 227, 2...","[769062, 679677, 1312959, 769062, 924572, 5197..."
39920,44759,804,1,"[1036, 112, 159, 202, 227, 283, 364, 6, 678, 6...","[59972, 679677, 519769, 323086 1100891 n12.000..."
106951,119636,819,0,"[112, 159, 202, 227, 28, 283, 293, 30, 348, 36...","[679677, 519769, n162096.000, 1322342 23569, 1..."
233065,260861,196,0,"[1036, 1050, 112, 159, 161, 202, 227, 253, 28,...","[[1318567 1133979, 1318567 1133979, 1318567 11..."
276562,309475,936,0,"[112, 159, 19, 202, 227, 28, 283, 364, 521, 55...","[679677, 519769, 1297729 n216.000 309206, 5884..."


scores:  [0.04249578 0.04230487 0.0409303  0.04006204 0.04005549]


In [17]:
user_id = next(iter(train.take(1)))['visitorid'].numpy()
predict_user(user_id)

predicting user:  1055787
user already bought following products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values


predicted products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
141575,158381,779,0,"[1064, 112, 159, 202, 220, 227, 283, 364, 6, 6...",[n24.000 628176 1235560 1284577 386093 1284577...
344549,385611,1542,1,"[1036, 112, 152, 159, 202, 227, 230, 283, 348,...","[726612, 679677, 769062, 519769, 852251, 63383..."
366322,409991,487,0,"[112, 159, 19, 202, 283, 348, 364, 420, 6, 678...","[679677, 519769, 1297729 n72.000 1749 1178208 ..."


scores:  [0.09754381 0.06199662 0.06181492 0.05754564 0.05591531]


In [18]:
#model.save('../../models/retailrocket/rr_tensorflow_reco_3_epochs_v1.h5')

In [20]:
def calculate_coverage_and_scores(user_ids, total_items_count, example_user_id):
    all_scores = []
    all_predicted_product_ids = []  # Use a set to track unique recommended items

    for user_id in user_ids:
        # Call the prediction function for each user
        score, predicted_product_ids = index(np.array([user_id]))
        if user_id == example_user_id:
            print('predicting user: ', user_id)
            display_products_by_visitor_id(user_id)
            print('predicted products: ')
            display_item_ids(predicted_product_ids[0].numpy())
            print(score[0].numpy(), predicted_product_ids[0].numpy())


    # Append the scores to the all_scores list
        all_scores.extend(score[0].numpy())

        # Update the set of all unique predicted product IDs
        all_predicted_product_ids.extend(predicted_product_ids[0].numpy())

    # Calculate min, max, and average scores
    min_score = np.min(all_scores)
    max_score = np.max(all_scores)
    avg_score = np.mean(all_scores)

    # Calculate item coverage and catalog coverage
    num_unique_items = len(np.unique(all_predicted_product_ids))
    catalog_coverage = num_unique_items / total_items_count

    # Print the results
    print(f"Minimum Score: {min_score}")
    print(f"Maximum Score: {max_score}")
    print(f"Average Score: {avg_score}")
    print(f"Catalog Coverage: {catalog_coverage:.2%} (Unique recommended items {num_unique_items} / Total catalog items {total_items_count})")

    return min_score, max_score, avg_score, catalog_coverage, all_predicted_product_ids

# Example usage
total_items_count = len(events.itemid.unique())

user_ids = events.visitorid.unique()
test_user_id = 325780
min_score, max_score, avg_score, catalog_coveragem, all_predicted_p_ids = calculate_coverage_and_scores(user_ids, total_items_count, test_user_id)


predicting user:  325780


Unnamed: 0,itemid,categoryid,available,properties,property_values
385170,431099,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 602490 857333, 1263557 150169..."


predicted products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
21210,23762,84,0,"[1036, 112, 159, 202, 227, 230, 283, 322, 327,...","[1318567, 679677, 519769, 223760, 1285402 1042..."
77627,86816,342,0,"[101, 1036, 1054, 1066, 107, 112, 159, 202, 20...","[769062, 285933, n156.000 639502 n168.000, n18..."
117145,131028,333,1,"[1036, 112, 159, 202, 227, 28, 283, 328, 364, ...","[1154859, 679677, 519769, 406556, 1037547 1501..."
292311,327104,1529,1,"[1036, 112, 159, 202, 227, 230, 283, 327, 348,...","[1154859, 679677, 519769, 743941, 1290698, 129..."


[0.05824801 0.04726531 0.04302901 0.04188947 0.0402928 ] [ 86816 327104 331276 131028  23762]
Minimum Score: 0.0319015234708786
Maximum Score: 0.29660487174987793
Average Score: 0.0743759423494339
Catalog Coverage: 82.34% (Unique recommended items 7036 / Total catalog items 8545)


In [23]:
item_properties_all = pd.read_pickle(basepath +'item_data_extracted.pkl')

item_properties_all[item_properties_all.itemid == 331276]

Unnamed: 0,itemid,categoryid,available,properties,property_values
