In [1]:
#%pip install -q tensorflow-recommenders
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Embedding, Dense, StringLookup, Dropout
from tensorflow.keras import Sequential
import tensorflow_recommenders as tfrs
from tensorflow.keras.metrics import AUC
import math
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.src.layers import LSTM

In [2]:
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [3]:
# Load data
def map_event(event):
    if event == 'view':
        return 1
    elif event == 'addtocart':
        return 2
    elif event == 'transaction':
        return 3
    else:
        return 0

basepath = '../../datasets/preprocessed_datasets/retailrocket/'
events = pd.read_pickle(basepath +'events_10k.pkl')
events['event_mapped'] = events['event'].apply(map_event)
events = events.drop(columns=['timestamp', 'event', 'transactionid', 'datetime'])
item_properties = pd.read_pickle(basepath +'item_data_extracted.pkl')

item_properties = item_properties[item_properties['itemid'].isin(events.itemid.unique())]

display(events, item_properties)

Unnamed: 0,visitorid,itemid,event_mapped
2482751,1046539,373805,1
2204101,1383579,287405,1
1964828,474264,153625,1
2369299,1079433,287356,1
640471,865010,113440,1
...,...,...,...
560807,269455,459735,1
1158627,228194,160499,1
2074769,435897,346429,1
1911438,1099927,261011,1


Unnamed: 0,itemid,categoryid,available,properties,property_values
6,6,1091,1,"[112, 159, 19, 202, 227, 28, 283, 364, 521, 55...","[679677, 519769, 1297729 n72.000 309206, 60935..."
31,32,1173,0,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n973.200 424566, 679677, 519..."
40,42,84,1,"[1036, 1052, 1066, 112, 159, 202, 227, 230, 28...","[726612, 1116693, n68.400 424566, 679677, 5197..."
137,147,646,1,"[1092, 112, 159, 202, 283, 348, 364, 461, 491,...","[291010, 679677, 519769, 229273 388993 1246541..."
153,163,407,0,"[112, 159, 202, 227, 283, 364, 376, 397, 483, ...","[679677, 519769, 62992 n7440.000 925243, 92933..."
...,...,...,...,...,...
416890,466685,1400,1,"[102, 1028, 112, 159, 202, 227, 275, 28, 283, ...","[769062, 769062, 679677, 519769, 1109436 45934..."
416941,466740,967,0,"[1008, 1036, 112, 120, 140, 159, 202, 227, 283...","[124229 n336.000 1144008, 1154859, 679677, 115..."
416960,466760,1549,0,"[1036, 1066, 112, 159, 202, 210, 227, 230, 283...","[1318567, n720.000 424566, 679677, 519769, 123..."
417047,466861,1051,0,"[1036, 1066, 112, 159, 202, 227, 230, 283, 300...","[1318567, 732011 424566, 679677, 519769, 10769..."


In [4]:
# item_properties['properties'] = item_properties['properties'].apply(lambda x: str(x))
# item_properties['property_values'] = item_properties['property_values'].apply(lambda x: str(x))


In [5]:
# Create a tf.data.Dataset from the interaction data
#todo check how to use array of ids
interaction_dataset = tf.data.Dataset.from_tensor_slices({
    'visitorid': events['visitorid'].values,
    'itemid': events['itemid'].values,
    'event': events['event_mapped'].values,
    # 'properties': item_properties['properties'].values,
    # 'property_values': item_properties['property_values'].values,
})
#variables 
dataset_len = events.shape[0]
test_len = math.ceil(dataset_len * 0.2)
train_len = dataset_len - test_len

metrics_batchsize = 1024
dimension = 128
train_batch_size = dimension
test_batch_size = int(dimension/2)
random_seed = 27

In [6]:
interaction_dataset

<_TensorSliceDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [7]:
#train test split
tf.random.set_seed(random_seed)
# shuffled = interaction_dataset.shuffle(len, seed=random_seed, reshuffle_each_iteration=False)
# train = shuffled.take(train_len)
# test = shuffled.skip(train_len).take(test_len)
shuffled = interaction_dataset.shuffle(dataset_len, seed=random_seed, reshuffle_each_iteration=False)

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)
display(train, test)

<_TakeDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

<_TakeDataset element_spec={'visitorid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'itemid': TensorSpec(shape=(), dtype=tf.int64, name=None), 'event': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [8]:
# Unique customer and product identifiers
unique_visitor_ids = np.array(events["visitorid"].unique())
unique_item_ids = np.array(events["itemid"].unique())


visitor_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
visitor_ids_vocabulary.adapt(unique_visitor_ids)

item_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
item_ids_vocabulary.adapt(unique_item_ids)
# print(unique_product_ids.shape[0], unique_customer_ids.shape[0], unique_product_ids, unique_product_ids)

In [9]:
# Define a model using TensorFlow Recommenders
product_ids_dataset = tf.data.Dataset.from_tensor_slices(unique_item_ids)
class RetailModel(tfrs.Model):

    def __init__(self, unique_visitor_ids_vocab, unique_item_ids_vocab, embedding_dimension=32, metrics_batchsize=1024,
                 retrieval_loss_weight=1.0, ranking_loss_weight=1.0, dropout_rate=0.2):
        super().__init__()
        self.retrieval_loss_weight = retrieval_loss_weight
        self.ranking_loss_weight = ranking_loss_weight
    
        # Set up user and product representations
        self.user_embedding = tf.keras.Sequential([
            unique_visitor_ids_vocab,
            Embedding(unique_visitor_ids_vocab.vocabulary_size(), embedding_dimension)
        ])
        self.product_embedding = tf.keras.Sequential([
            unique_item_ids_vocab,
            Embedding(unique_item_ids_vocab.vocabulary_size(), embedding_dimension)
        ])
        
        # Set up a dense layer for the task.
        self.dense_layer = Dense(embedding_dimension, activation="relu")
        self.dense_dropout = Dropout(dropout_rate)
        # Setup retrieval task and metrics
        
        self.retrieval_task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=product_ids_dataset.batch(metrics_batchsize).map(self.product_model))
        )

        # Setup ranking task with pairwise CategoricalCrossentropy loss
        self.ranking_task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.CategoricalCrossentropy(),
            metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.AUC()]
        )


    def product_model(self, product_ids):
        return self.product_embedding(product_ids)

    def dot_product_score(self, user, product):
        """
        Computes the dot product between user and product embeddings to get the interaction score.
        """
        return tf.reduce_sum(user * product, axis=1)

    def compute_loss(self, features, training=False):
        user_embeddings = self.user_embedding(features["visitorid"])
        product_embeddings = self.product_embedding(features["itemid"])

        # Apply dense layers
        user_output = self.dense_layer(user_embeddings)
        product_output = self.dense_layer(product_embeddings)

        # Compute scores
        positive_logits = self.dot_product_score(user_output, product_output)
        # Compute losses for both retrieval and ranking
        retrieval_loss = self.retrieval_task(user_output, product_output, compute_metrics=not training)
        ranking_loss = self.ranking_task(labels=features["event"], predictions=positive_logits, sample_weight=features.get("weight"))

        return (self.retrieval_loss_weight * retrieval_loss +
                self.ranking_loss_weight * ranking_loss)

    def evaluate(self, validation_dataset, *args, **kwargs):
        return super().evaluate(validation_dataset, *args, **kwargs)



In [10]:
model = RetailModel(item_ids_vocabulary, visitor_ids_vocabulary, embedding_dimension=dimension, retrieval_loss_weight=1.0, ranking_loss_weight=0.8, metrics_batchsize=metrics_batchsize)
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001), loss='categorical_crossentropy') # using legacy instead of tf.keras.optimizers.Adagrad, because newer version is slow on m1/m2 macs

In [11]:
# Shuffle, batch, and cache the data.
cached_train = train.shuffle(dataset_len).batch(train_batch_size).cache()
cached_test = test.batch(test_batch_size).cache()
# Train the model
# model.fit(cached_train, validation_data=cached_test, validation_freq=5, epochs=3) 
model.fit(cached_train, validation_freq=5, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x31d4cf520>

In [12]:
# Evaluate the model
k = 5
result_evaluate_train = model.evaluate(cached_train)
result_evaluate_train



[0.999750018119812,
 0.999750018119812,
 0.999750018119812,
 0.999750018119812,
 0.999750018119812,
 0.9607082605361938,
 0.0476190485060215,
 0.0,
 489.0877685546875,
 0,
 489.0877685546875]

In [13]:
result_evaluat_test = model.evaluate(cached_test)
result_evaluat_test



[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9358300566673279,
 0.125,
 0.0,
 82.06863403320312,
 0,
 82.06863403320312]

In [14]:
print("Name\t\tValue")
print("-" * 30)
for metric in model.metrics:
    print(f"{metric.name}\t\t{metric.result().numpy()}")
#print(f"AUC: {model.auc_metric.result().numpy()}")

Name		Value
------------------------------
factorized_top_k/top_1_categorical_accuracy		1.0
factorized_top_k/top_5_categorical_accuracy		1.0
factorized_top_k/top_10_categorical_accuracy		1.0
factorized_top_k/top_50_categorical_accuracy		1.0
factorized_top_k/top_100_categorical_accuracy		1.0
root_mean_squared_error		0.9358300566673279
categorical_accuracy		0.125
auc		0.0


In [15]:
sample = next(iter(cached_test.take(1)))

# Extract customer_id from the sample
visitor_id = sample['visitorid'].numpy()
visitor_id

array([1164119, 1041770,  918612, 1015928, 1329704, 1397149, 1159302,
       1290428,  194207, 1003965,  719678, 1046392, 1359141,  788230,
        355415,  876730,  404709,  269418,  809739,   18483,  943323,
        990272,  655614,  690498,  883080,  872837, 1282452, 1269394,
        558272,  684514, 1195090,  745113, 1243693,  672785,  336581,
        789542, 1135668,  441858, 1265850, 1280629,  331321,  905124,
        315411,  245266, 1165288,  672257, 1014116,  382875,  767656,
        946321, 1099605,  363136, 1146770, 1079746,  349053, 1095684,
        976833,  748085, 1214199, 1115740, 1072506,  253881,  279762,
       1295583])

In [16]:
def get_random_customer_from_test_data():
    return next(iter(test.take(1)))['visitorid'].numpy()

def display_item_ids(item_ids):
    display(item_properties[item_properties.itemid.isin(item_ids) ])

def display_products_by_visitor_id(visitor_id):
    display_item_ids(events[events.visitorid == visitor_id].itemid.tolist())
    

index = tfrs.layers.factorized_top_k.BruteForce(model.user_embedding, k=5)
index.index_from_dataset(
    product_ids_dataset.batch(100).map(lambda id: (id, model.product_model(id))))

def predict_user(visitor_id):
    print('predicting user: ', visitor_id)
    print('user already bought following products: ')
    display_products_by_visitor_id(visitor_id)
    
    score, predicted_product_ids = index(np.array([visitor_id]))
    
    print('predicted products: ')
    display_item_ids(predicted_product_ids[0].numpy())
    print('scores: ', score[0].numpy())


user_id = get_random_customer_from_test_data()

predict_user(user_id)

predicting user:  1164119
user already bought following products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
260067,291050,1051,0,"[1036, 1066, 112, 159, 202, 227, 230, 283, 300...","[1318567, 269011 n2400.000 424566, 679677, 519..."


predicted products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
21469,24059,1148,1,"[112, 131, 159, 165, 262, 28, 283, 287, 364, 5...","[679677, 929484, 519769, 769062, 769062, 15016..."
167789,187751,619,0,"[112, 114, 159, 202, 227, 246, 283, 313, 364, ...","[679677, 769062, 519769, 1211212 n612.000, 115..."
301012,336842,1247,1,"[112, 159, 202, 227, 28, 283, 325, 364, 521, 5...","[679677, 519769, 1074346 312202, 984245 132234..."
315618,353237,1120,1,"[0, 1036, 112, 159, 202, 208, 225, 227, 283, 3...","[[98606 632686 858644 145994 1113685, 98606 63..."
319887,358029,1387,1,"[1081, 112, 159, 202, 227, 283, 364, 470, 6, 6...","[769062, 679677, 519769, 860528 741065, 113991..."


scores:  [0.04728506 0.03173855 0.0271187  0.02477267 0.02430293]


In [17]:
user_id = next(iter(train.take(6)))['visitorid'].numpy()
predict_user(user_id)

predicting user:  1055787
user already bought following products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values


predicted products: 


Unnamed: 0,itemid,categoryid,available,properties,property_values
21469,24059,1148,1,"[112, 131, 159, 165, 262, 28, 283, 287, 364, 5...","[679677, 929484, 519769, 769062, 769062, 15016..."
167789,187751,619,0,"[112, 114, 159, 202, 227, 246, 283, 313, 364, ...","[679677, 769062, 519769, 1211212 n612.000, 115..."
301012,336842,1247,1,"[112, 159, 202, 227, 28, 283, 325, 364, 521, 5...","[679677, 519769, 1074346 312202, 984245 132234..."
315618,353237,1120,1,"[0, 1036, 112, 159, 202, 208, 225, 227, 283, 3...","[[98606 632686 858644 145994 1113685, 98606 63..."
319887,358029,1387,1,"[1081, 112, 159, 202, 227, 283, 364, 470, 6, 6...","[769062, 679677, 519769, 860528 741065, 113991..."


scores:  [0.04728506 0.03173855 0.0271187  0.02477267 0.02430293]


In [18]:
#model.save('../../models/retailrocket/rr_tensorflow_reco_3_epochs_v1.h5')

In [19]:
def calculate_coverage_and_scores(user_ids, total_items_count, example_user_id):
    all_scores = []
    all_predicted_product_ids = []  # Use a set to track unique recommended items

    for user_id in user_ids:
        # Call the prediction function for each user
        score, predicted_product_ids = index(np.array([user_id]))
        if user_id == example_user_id:
            display('predicting user: ', user_id)
            display('predicted products: ')
            display(score[0].numpy(), predicted_product_ids[0].numpy())
            display_products_by_visitor_id(user_id)
            display_item_ids(predicted_product_ids[0].numpy())
        # print(predicted_product_ids[0])
        
    # Append the scores to the all_scores list
        all_scores.extend(score[0].numpy())

        # Update the set of all unique predicted product IDs
        all_predicted_product_ids.extend(predicted_product_ids[0].numpy())

    # Calculate min, max, and average scores
    min_score = np.min(all_scores)
    max_score = np.max(all_scores)
    avg_score = np.mean(all_scores)

    # Calculate item coverage and catalog coverage
    num_unique_items = len(np.unique(all_predicted_product_ids))
    catalog_coverage = num_unique_items / total_items_count

    # Print the results
    print(f"Minimum Score: {min_score}")
    print(f"Maximum Score: {max_score}")
    print(f"Average Score: {avg_score}")
    print(f"Catalog Coverage: {catalog_coverage:.2%} (Unique recommended items {num_unique_items} / Total catalog items {total_items_count})")

    return min_score, max_score, avg_score, catalog_coverage, all_predicted_product_ids, num_unique_items, total_items_count

# Example usage
total_items_count = len(events.itemid.unique())

user_ids = events.visitorid.unique()
test_user_id = 325780
min_score, max_score, avg_score, catalog_coverage, all_predicted_p_ids, num_unique_items, total_items_count = calculate_coverage_and_scores(user_ids, total_items_count, test_user_id)


'predicting user: '

325780

'predicted products: '

array([0.04728506, 0.03173855, 0.0271187 , 0.02477267, 0.02430293],
      dtype=float32)

array([187751, 353237, 336842,  24059, 358029])

Unnamed: 0,itemid,categoryid,available,properties,property_values
385170,431099,746,1,"[112, 159, 202, 227, 28, 283, 319, 364, 454, 4...","[679677, 519769, 602490 857333, 1263557 150169..."


Unnamed: 0,itemid,categoryid,available,properties,property_values
21469,24059,1148,1,"[112, 131, 159, 165, 262, 28, 283, 287, 364, 5...","[679677, 929484, 519769, 769062, 769062, 15016..."
167789,187751,619,0,"[112, 114, 159, 202, 227, 246, 283, 313, 364, ...","[679677, 769062, 519769, 1211212 n612.000, 115..."
301012,336842,1247,1,"[112, 159, 202, 227, 28, 283, 325, 364, 521, 5...","[679677, 519769, 1074346 312202, 984245 132234..."
315618,353237,1120,1,"[0, 1036, 112, 159, 202, 208, 225, 227, 283, 3...","[[98606 632686 858644 145994 1113685, 98606 63..."
319887,358029,1387,1,"[1081, 112, 159, 202, 227, 283, 364, 470, 6, 6...","[769062, 679677, 519769, 860528 741065, 113991..."


Minimum Score: 0.0010863523930311203
Maximum Score: 0.05253128707408905
Average Score: 0.030995981767773628
Catalog Coverage: 0.66% (Unique recommended items 56 / Total catalog items 8545)


In [20]:
display(f"min score: {min_score}, max score: {max_score}, avg_score: {avg_score}, catalog_coverage: {catalog_coverage}, min score: {min_score},")


'min score: 0.0010863523930311203, max score: 0.05253128707408905, avg_score: 0.030995981767773628, catalog_coverage: 0.006553540081919251, min score: 0.0010863523930311203,'

In [21]:
#10 epochs: 'min score: 0.01474432647228241, max score: 0.2987794280052185, avg_score: 0.10522415488958359, catalog_coverage: 0.0053832650672908135, min score: 0.01474432647228241,'

In [22]:
display("predicted items", len(list(set(all_predicted_p_ids))), "unique items", num_unique_items, "number of items", total_items_count)

'predicted items'

56

'unique items'

56

'number of items'

8545

In [23]:
item_properties_all = pd.read_pickle(basepath +'item_data_extracted.pkl')

item_properties_all[item_properties_all.itemid == 331276]

Unnamed: 0,itemid,categoryid,available,properties,property_values
