In [1]:
#%pip install -q tensorflow-recommenders
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras import Sequential
import tensorflow_recommenders as tfrs
from tensorflow.keras.metrics import AUC
import math
import numpy as np



In [2]:
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [3]:
# Load data
basepath = '../../datasets/preprocessed_datasets/gabor/'
pandas_interaction_data = pd.read_pickle(basepath + 'user_item_interactions_only_main_products_10k.pkl')
pandas_interaction_data = pandas_interaction_data.sample(1000)
pandas_interaction_data.amount = pandas_interaction_data.amount.apply(lambda x: 1 if x > 0 else 0) 
product_data = pd.read_pickle(basepath + 'no_product_variants_with_duplicate_ids.pkl')
pandas_interaction_data.main_product_id = pandas_interaction_data.main_product_id.astype(int)
product_data.main_product_id = product_data.main_product_id.astype(int)

display(pandas_interaction_data, product_data)

Unnamed: 0,customer_id,main_product_id,amount
467965,9585643,3473232,1
466622,9567853,3479112,1
381941,8296404,5568919,1
139582,4484976,3908188,1
46027,2081509,3482926,1
...,...,...,...
425331,9048921,4914689,1
408012,8760152,549920,1
153414,4718281,4031837,1
486318,9791717,3696391,1


Unnamed: 0,main_product_id,productNumber,productName,productColorName,articleNumber,brand__id,mainCategory__id,gender,originCountry,line,...,heelHeightGroup,sizeEu,sizeUk,sizeIndex,shaftLength__value,sole,isTransferee,isSuccessor,duplicate_product__ids,product_sizes
0,557559,4058394021466,sportliche Ballerinas Glattleder schwarz,schwarz,02.643.57,6590678,315571,w,PT,F-S|H-W,...,bis 3 cm,40.0,6.5,11.0,0.0,Gummi,0.0,0.0,"[557559, 549733, 549734, 557553, 589488, 58948...","[35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39.0, 40...."
1,551622,4054452768212,elegante Pumps Glattleder schwarz,schwarz,05.160.37,6590677,315573,w,PT,F-S|H-W,...,3 cm - 5 cm,38.0,5.0,8.0,0.0,EVA,0.0,0.0,"[551622, 551505, 551510, 551509, 552973, 55162...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
2,547193,4054452768427,Slipper Glattleder schwarz,schwarz,04.443.27,6590677,315576,w,SK,F-S|H-W,...,3 cm - 5 cm,42.0,8.0,14.0,0.0,PU-TPU,0.0,0.0,"[547193, 547191, 547196, 547187, 547189, 54719...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
3,549776,4059701687894,Sneaker low Rauleder blau,blau,06.968.46,4997827,315567,w,VN,F-S|H-W,...,3 cm - 5 cm,39.0,6.0,10.0,0.0,Gummi-EVA,0.0,0.0,"[549776, 550599, 550596, 550593, 550601, 54978...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
4,550726,4054452851594,elegante Ballerinas Materialmix Lederimitat sc...,schwarz,06.102.67,6590678,315574,w,SK,F-S|H-W,...,bis 3 cm,37.5,4.5,7.0,0.0,EVA,0.0,0.0,"[550726, 550736, 550735, 550732, 550728, 55072...","[35.0, 37.0, 37.5, 38.0, 38.5, 39.0, 40.0, 40...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3292,10603553,4066558951733,Sandale met plateauzool Suède blauw,blauw,24.764.36,6590677,315568,w,SK,F-S,...,5 cm - 8 cm,35.0,2.5,3.0,0.0,PU-TPU,0.0,0.0,"[10603553, 10544662]","[35.0, 37.0]"
3293,10442948,4065171827272,Mokassin Materialmix Leder pink,pink,26.090.21,6590678,315590,w,PT,F-S,...,bis 3 cm,40.5,7.0,12.0,0.0,Gummi,0.0,0.0,"[10442948, 10442949]","[40.5, 41.0]"
3294,10782704,4251234499207,Shopper ANDIE blau,blau,921453,363013,363017,w,DE,F-S,...,,,,,,,,,[10782704],[]
3295,10679703,4066558303617,Elegante pumps Glad leer wit,wit,21.450.60,6590677,315573,w,PT,F-S,...,5 cm - 8 cm,35.0,2.5,3.0,0.0,TPU,0.0,0.0,[10679703],[35.0]


In [4]:
# Create a tf.data.Dataset from the interaction data
interaction_dataset = tf.data.Dataset.from_tensor_slices({
    'customer_id': pandas_interaction_data['customer_id'].values,
    'main_product_id': pandas_interaction_data['main_product_id'].values,
    'amount': pandas_interaction_data['amount'].values
})
#variables 
dataset_len = pandas_interaction_data.shape[0]
test_len = math.ceil(dataset_len * 0.2)
train_len = dataset_len - test_len

metrics_batchsize = 16
train_batch_size = 128
test_batch_size = 64
random_seed = 27

In [5]:
#train test split
tf.random.set_seed(random_seed)
# shuffled = interaction_dataset.shuffle(len, seed=random_seed, reshuffle_each_iteration=False)
# train = shuffled.take(train_len)
# test = shuffled.skip(train_len).take(test_len)
shuffled = interaction_dataset.shuffle(dataset_len, seed=random_seed, reshuffle_each_iteration=False)

train = shuffled.take(train_len)
test = shuffled.skip(train_len).take(test_len)
display(train, test)

<_TakeDataset element_spec={'customer_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'main_product_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'amount': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

<_TakeDataset element_spec={'customer_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'main_product_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'amount': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [6]:
# Unique customer and product identifiers
unique_customer_ids = np.array(pandas_interaction_data["customer_id"].unique())
unique_product_ids = np.array(pandas_interaction_data["main_product_id"].unique())

user_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
user_ids_vocabulary.adapt(unique_customer_ids)

product_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
product_ids_vocabulary.adapt(unique_product_ids)
# print(unique_product_ids.shape[0], unique_customer_ids.shape[0], unique_product_ids, unique_product_ids)

In [7]:
# Define a model using TensorFlow Recommenders
product_ids_dataset = tf.data.Dataset.from_tensor_slices(unique_product_ids)
class RetailModel(tfrs.Model):

    def __init__(self, unique_product_ids_vocab, unique_customer_ids_vocab, embedding_dimension=32):
        super().__init__()
        # Set up user and product representations
        self.user_embedding = tf.keras.Sequential([
            unique_customer_ids_vocab,
            Embedding(unique_customer_ids_vocab.vocabulary_size(), embedding_dimension)
        ])
        self.product_embedding = tf.keras.Sequential([
            unique_product_ids_vocab,
            Embedding(unique_product_ids_vocab.vocabulary_size(), embedding_dimension)
        ])
        # Set up a dense layer for the task.
        self.dense_layer = Dense(128, activation="relu")
        


        # Set up retrieval task and metrics
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=product_ids_dataset.batch(metrics_batchsize).map(self.product_model))
        )
        self.auc_metric = AUC(name='auc')
        self.rmse_metric = tf.keras.metrics.RootMeanSquaredError(name='rmse')
        self.precision = tf.keras.metrics.Precision(name='precision')
        self.recall = tf.keras.metrics.Recall(name='recall')

    def product_model(self, product_ids):
        return self.product_embedding(product_ids)

    def dot_product_score(self, user, product):
        """
        Computes the dot product between user and product embeddings to get the interaction score.
        """
        return tf.reduce_sum(user * product, axis=1)

    def compute_loss(self, features, training=False):
        # print(features)
        user_embeddings = self.user_embedding(features["customer_id"])
        product_embeddings = self.product_embedding(features["main_product_id"])
        
        user_output = self.dense_layer(user_embeddings)
        product_output = self.dense_layer(product_embeddings)
        # This is a hypothetical function that returns the logits or scores
        # This needs to be adapted based on your actual model
        positive_logits = self.dot_product_score(user_output, product_output)

        # Update RMSE
        self.rmse_metric.update_state(y_true=features["amount"], y_pred=positive_logits)


        return self.task(user_output, product_output)

    def evaluate(self, validation_dataset, *args, **kwargs):
        # Call the base class's evaluate method
        
    
        # Compute additional metrics
        for features in validation_dataset:
            user_embeddings = self.user_embedding(features["customer_id"])
            product_embeddings = self.product_embedding(features["main_product_id"])
            
            user_output = self.dense_layer(user_embeddings)
            product_output = self.dense_layer(product_embeddings)
            
            positive_logits = self.dot_product_score(user_output, product_output)
            # Assuming you have a binary "label" in your dataset indicating 1 for positive interaction and 0 for negative
            self.precision.update_state(y_true=features["amount"], y_pred=positive_logits)
            self.recall.update_state(y_true=features["amount"], y_pred=positive_logits)
    
        precision_result = self.precision.result().numpy()
        recall_result = self.recall.result().numpy()
    
        # Calculate F1 score
        if (precision_result + recall_result) != 0:
            f1_score = 2 * (precision_result * recall_result) / (precision_result + recall_result)
        else:
            f1_score = 0.0
    
        # Reset the metrics for the next evaluation
        # self.precision.reset_states()
        # self.recall.reset_states()
        base_results = super(RetailModel, self).evaluate(validation_dataset, *args, **kwargs)
    
        return base_results, [precision_result, recall_result, f1_score]  # or append additional results as needed



In [8]:
model = RetailModel(product_ids_vocabulary, user_ids_vocabulary, embedding_dimension=128)
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001)) # using legacy instead of tf.keras.optimizers.Adagrad, because newer version is slow on m1/m2 macs

In [9]:
# Shuffle, batch, and cache the data.
cached_train = train.shuffle(dataset_len).batch(train_batch_size).cache()
cached_test = test.batch(test_batch_size).cache()
# Train the model
# model.fit(cached_train, validation_data=cached_test, validation_freq=5, epochs=3)
model.fit(cached_train, validation_freq=5, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x28bb56070>

In [10]:
# Evaluate the model
k = 5
result_evaluate_train = model.evaluate(cached_train)
result_evaluate_train



([0.0,
  0.9295324683189392,
  0.0,
  0.0,
  0.9574999809265137,
  0.9800000190734863,
  0.9862499833106995,
  0.9925000071525574,
  0.9962499737739563,
  109.53569793701172,
  0,
  109.53569793701172],
 [0.0, 0.0, 0.0])

In [11]:
result_evaluat_test = model.evaluate(cached_test)
result_evaluat_test



([0.0,
  0.9833747744560242,
  0.0,
  0.0,
  0.2150000035762787,
  0.3149999976158142,
  0.3799999952316284,
  0.5899999737739563,
  0.6899999976158142,
  16.67401123046875,
  0,
  16.67401123046875],
 [0.0, 0.0, 0.0])

In [12]:
print("Name\t\tValue")
print("-" * 30)
for metric in model.metrics:
    print(f"{metric.name}\t\t{metric.result().numpy()}")
print(f"AUC: {model.auc_metric.result().numpy()}")

Name		Value
------------------------------
auc		0.0
rmse		0.9833747744560242
precision		0.0
recall		0.0
factorized_top_k/top_1_categorical_accuracy		0.2150000035762787
factorized_top_k/top_5_categorical_accuracy		0.3149999976158142
factorized_top_k/top_10_categorical_accuracy		0.3799999952316284
factorized_top_k/top_50_categorical_accuracy		0.5899999737739563
factorized_top_k/top_100_categorical_accuracy		0.6899999976158142
AUC: 0.0


In [13]:
sample = next(iter(cached_test.take(1)))

# Extract customer_id from the sample
customer_id = sample['customer_id'].numpy()
customer_id

array([11037529,  5656565,  2020852, 10211791,  2109056,  8649550,
        5511081,  7980892,  5873556,  8200260,  7955699,  7428187,
        5098410, 10153936, 10245006,  2109312, 10930165,  2080340,
        9476017,  5634218,  4465572, 10720396,  4044194,  7067472,
        4662210,  4013121,  9930684,  6055355,  2134242,  5872935,
        7007733,  9865747, 10074706,  2148626, 10245931,  9608719,
        6913909,  4535739,  8600081,  4165511,  8958747,  2039235,
        9557554,  6084211, 10928749,  4420120, 10082472,  9537829,
       10960487,  3382793, 10655750, 10720942,  7338314,  2208319,
        4014312,  9478149,  5851011,  6670674,  3897951,  2029314,
        2149648,  2132585,  4384771,  2184398])

In [14]:
def get_random_customer_from_test_data():
    return next(iter(test.take(1)))['customer_id'].numpy()

def display_item_ids(item_ids):
    display(product_data[product_data.main_product_id.isin(item_ids) ])

def display_products_by_customer_id(customer_id):
    display_item_ids(pandas_interaction_data[pandas_interaction_data.customer_id ==  customer_id].main_product_id.tolist())
    

index = tfrs.layers.factorized_top_k.BruteForce(model.user_embedding)
index.index_from_dataset(
    product_ids_dataset.batch(100).map(lambda id: (id, model.product_model(id))))

def predict_user(user_id):
    print('predicting user: ', user_id)
    print('user already bought following products: ')
    display_products_by_customer_id(user_id)
    
    score, predicted_product_ids = index(np.array([user_id]))
    
    print('predicted products: ')
    display_item_ids(predicted_product_ids[0].numpy())
    print('scores: ', score[0].numpy())


user_id = get_random_customer_from_test_data()

predict_user(user_id)

predicting user:  11037529
user already bought following products: 


Unnamed: 0,main_product_id,productNumber,productName,productColorName,articleNumber,brand__id,mainCategory__id,gender,originCountry,line,...,heelHeightGroup,sizeEu,sizeUk,sizeIndex,shaftLength__value,sole,isTransferee,isSuccessor,duplicate_product__ids,product_sizes
3,549776,4059701687894,Sneaker low Rauleder blau,blau,06.968.46,4997827,315567,w,VN,F-S|H-W,...,3 cm - 5 cm,39.0,6.0,10.0,0.0,Gummi-EVA,0.0,0.0,"[549776, 550599, 550596, 550593, 550601, 54978...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."


predicted products: 


Unnamed: 0,main_product_id,productNumber,productName,productColorName,articleNumber,brand__id,mainCategory__id,gender,originCountry,line,...,heelHeightGroup,sizeEu,sizeUk,sizeIndex,shaftLength__value,sole,isTransferee,isSuccessor,duplicate_product__ids,product_sizes
19,549066,4054452850924,Keilpumps Rauleder schwarz,schwarz,02.690.47,6590678,315588,w,SK,F-S|H-W,...,3 cm - 5 cm,39.0,6.0,10.0,0.0,TPU,0.0,0.0,"[549066, 549072, 549070, 549063, 549069, 54907...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
276,3847929,4060666923019,Hochfrontpump Glattleder schwarz,schwarz,52.165.57,6590678,315593,w,SK,H-W,...,5 cm - 8 cm,44.0,9.5,17.0,0.0,TPU,0.0,1.0,"[3847929, 3847921, 3847915, 3847922, 3847927, ...","[35.0, 37.0, 37.5, 38.0, 38.5, 39.0, 40.0, 40...."
447,4078046,4062862069522,Sneaker low Materialmix Leder weiß,weiß,56.918.40,4997827,315567,w,VN,H-W,...,5 cm - 8 cm,42.0,8.0,14.0,0.0,Gummi-EVA,0.0,0.0,"[4078046, 4078047, 4078048, 4078041, 4078045, ...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
598,4774679,4062862829751,Keilsandalette Rauleder gelb,gelb,62.750.22,6590678,315564,w,PT,F-S,...,3 cm - 5 cm,35.0,2.5,3.0,0.0,TR,0.0,1.0,"[4774679, 4773177, 4774684, 4773750, 4773755, ...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
622,4771321,4062862905509,Plateau Pumps Effektleder silber,silber,61.260.61,6590677,315569,w,PT,F-S,...,5 cm - 8 cm,35.5,3.0,4.0,0.0,EVA,0.0,1.0,"[4771321, 4771326, 4771325, 4771327, 4771322, ...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
1007,5385925,4062862552475,Pantolette Lederimitat blau,blau,64.663.46,6590677,315572,w,SK,F-S,...,5 cm - 8 cm,37.0,4.0,6.0,0.0,PU-TPU,0.0,0.0,"[5385925, 5385928, 5385929, 5385935, 5385930, ...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
1015,5375134,4062862892588,Pantolette Effektleder braun,braun,63.705.34,6590677,315572,w,SK,F-S,...,bis 3 cm,39.0,6.0,7.0,0.0,Gummi,0.0,0.0,"[5375134, 5375135, 5375132, 5507286, 5507291, ...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
1149,3851138,4060666988049,Eleganter Pumps Effektleder silber,silber,05.482.61,6590677,315573,w,SK,F-S|H-W,...,5 cm - 8 cm,44.0,9.5,17.0,0.0,TPU,0.0,1.0,"[3851138, 3851125, 3851132, 3851134, 3851133, ...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
1314,6164348,4064032658598,Sneaker low Materialmix Leder,,76.433.44,6590678,315567,w,SK,H-W,...,3 cm - 5 cm,40.5,7.0,12.0,0.0,TR,0.0,0.0,"[6164348, 5077410, 6555142, 6446945, 6163947, ...","[35.0, 35.5, 36.0, 37.0, 37.5, 38.0, 38.5, 39...."
2403,8965025,4065171356260,Eleganter Ballerina Rauleder blau,blau,02.690.46,6590678,315588,w,SK,F-S|H-W,...,3 cm - 5 cm,43.0,9.0,16.0,0.0,TPU,0.0,0.0,"[8965025, 8965026, 8965017, 8965023, 8965021, ...","[36.0, 37.0, 37.5, 38.0, 38.5, 39.0, 40.0, 40...."


scores:  [0.02941321 0.02907445 0.02816804 0.02668403 0.02558309 0.02325297
 0.02261669 0.02081436 0.02071881 0.02041219]
