In [1]:
import pandas as pd
from keras.layers import Input, Embedding, LSTM, Dense, Dot, Softmax, Concatenate, BatchNormalization, Attention
from keras.models import Model
import tensorflow as tf
import numpy as np
from keras.utils import pad_sequences
from datetime import datetime
import mlflow
from sklearn.model_selection import train_test_split


In [2]:
# Toggle MLflow experiment tracking. When False, no tracking/registry calls are made.
activate_mlflow = False

if activate_mlflow:
    registry_uri = "sqlite:///mlflow.db"
    tracking_uri = "http://127.0.0.1:5000"
    exp_name = "RNN Attention"

    mlflow.tracking.set_registry_uri(registry_uri)
    mlflow.tracking.set_tracking_uri(tracking_uri)

    # Look the experiment up first and create it only when it is missing.
    # A bare `except:` around create_experiment would also hide unrelated
    # failures (e.g. an unreachable tracking server) behind a confusing
    # AttributeError on None.
    existing_experiment = mlflow.get_experiment_by_name(name=exp_name)
    if existing_experiment is None:
        exp_id = mlflow.create_experiment(name=exp_name)
    else:
        exp_id = existing_experiment.experiment_id

    run_name = 'RNN_V1'
    mlflow.start_run(run_name=run_name, experiment_id=exp_id)


In [3]:
# Directory holding the preprocessed datasets, relative to this notebook.
basepath = '../../datasets/preprocessed_datasets/gabor/'
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
orders_pickle_path = basepath + 'orders_and_products_by_customer.pkl'
product_data = pd.read_pickle(orders_pickle_path)

In [4]:
#product_data = product_data.sample(n=1000)
# Convert every order timestamp into a date ordinal (integer day number).
def _timestamps_to_ordinals(orders):
    """Map a list of per-order timestamp lists to same-shaped lists of date ordinals.

    datetime.fromtimestamp interprets each timestamp in the machine's local
    timezone, so the resulting day can differ between machines.
    """
    converted_orders = []
    for order_timestamps in orders:
        converted_orders.append(
            [datetime.fromtimestamp(timestamp).date().toordinal() for timestamp in order_timestamps]
        )
    return converted_orders


# NOTE: overwrites the column in place, so this cell must run exactly once per fresh load.
product_data['order_dates'] = product_data['order_dates'].apply(_timestamps_to_ordinals)
product_data

Unnamed: 0,customer_id,order_ids,product_ids,product_names,amounts,total_prices,order_dates
0,1940761.0,"[5624878, 7011005, 7673667, 8888291, 11032470]","[[4839681, 4839682], [6596395], [6523930], [86...",[[PG1018 Slipper Materialmix Leder/Textil grün...,"[[1, 1], [1], [1], [1, 1], [1, 1]]","[[62.65, 62.65], [51.77], [46.77], [96.69, 91....","[[737895, 737895], [738067], [738140], [738319..."
1,1940767.0,[5108369],"[[3849255, 4822681]]","[[Spangenpumps Textil silber, Plateau Pumps Ra...","[[1, 1]]","[[99.95, 99.95]]","[[737830, 737830]]"
2,1940775.0,"[4962102, 6875196]","[[3907778], [6556855, 6556856]]","[[Chelsea Boot Rauleder schwarz], [Slipper Rau...","[[1], [1, 1]]","[[69.0], [99.95, 99.95]]","[[737809], [738053, 738053]]"
3,1940782.0,[7417330],"[[6407177, 6435918]]","[[Elegante Stiefelette Glattleder schwarz, Win...","[[1, 1]]","[[125.0, 99.95]]","[[738107, 738107]]"
4,1940806.0,[10781348],[[8642140]],[[Chelsea Boot Rauleder schwarz]],[[1]],[[69.0]],[[738565]]
...,...,...,...,...,...,...,...
357774,11128024.0,[11128020],"[[589817, 10560958]]","[[Eleganter Pumps Glattleder blau, Eleganter P...","[[1, 1]]","[[89.95, 99.95]]","[[738602, 738602]]"
357775,11128039.0,[11128051],[[10451840]],[[Sneaker low Rauleder pink]],[[1]],[[120.0]],[[738602]]
357776,11128082.0,[11128080],[[10559336]],[[Eleganter Pumps Effektleder beige]],[[1]],[[110.0]],[[738602]]
357777,11128095.0,[11128092],[[10504300]],[[Sneaker low Materialmix Leder pink]],[[1]],[[115.0]],[[738602]]


In [5]:
def bringToSameSizeOneDim(column):
    """Right-pad every list in ``column`` with zeros to the length of the longest list.

    Args:
        column: pandas Series whose elements are flat lists.

    Returns:
        A new Series of new lists, all of equal length; the input lists are
        not modified.

    Raises:
        ValueError: if ``column`` is empty (there is no longest list).
    """
    longest = max(len(entry) for entry in column)

    def pad_with_zeros(entry):
        missing = longest - len(entry)
        return entry + [0] * missing

    return column.apply(pad_with_zeros)


def bringToSameSizeTwoDimAndFlatten(column):
    """Zero-pad a Series of nested lists to one common 2-D shape and flatten each row.

    Every inner list is right-padded with zeros to the longest inner list found
    anywhere in ``column``; every outer list is then padded with all-zero inner
    lists up to the longest outer list. Each padded row is flattened row-major,
    so all returned lists have length ``max_outer * max_inner``.

    The outer dimension is padded to the longest *outer* list. Padding it to the
    longest inner list instead leaves rows with more sublists than that unpadded,
    which produces ragged output.

    Args:
        column: pandas Series whose elements are lists of lists.

    Returns:
        A new Series of flat lists of identical length. Input lists are not
        modified. An empty ``column`` yields an empty Series, and rows with no
        sublists become all-zero rows.
    """
    # default=0 keeps empty columns / empty rows from raising in max().
    max_inner = max((len(sublist) for row in column for sublist in row), default=0)
    max_outer = max((len(row) for row in column), default=0)

    def pad_and_flatten(row):
        flat = []
        for sublist in row:
            flat.extend(sublist)
            flat.extend([0] * (max_inner - len(sublist)))
        # Append the missing all-zero sublists in already-flattened form.
        flat.extend([0] * (max_inner * (max_outer - len(row))))
        return flat

    return column.apply(pad_and_flatten)

# Pad the list-valued columns so every customer row has the same length.
# NOTE: the columns are overwritten in place; the 2-D helper expects nested lists,
# so this cell should run once per fresh load of product_data.
product_data['order_ids'] = bringToSameSizeOneDim(product_data['order_ids'])
for nested_column in ('product_ids', 'amounts', 'total_prices', 'order_dates'):
    product_data[nested_column] = bringToSameSizeTwoDimAndFlatten(product_data[nested_column])


In [6]:


def convertToSequence(column):
    """Turn a Series of lists into one zero-padded array via ``pad_sequences``.

    Args:
        column: pandas Series whose elements are flat lists.

    Returns:
        Tuple ``(padded, longest)``: the array returned by ``pad_sequences``
        (post-padded with 0 up to the longest list) and that longest length.
    """
    rows = column.tolist()
    longest = max(map(len, rows))
    padded = pad_sequences(rows, maxlen=longest, padding='post', value=0)
    return padded, longest

# Model inputs: padded product ids and padded order-date ordinals.
products_inputs, max_length_products = convertToSequence(product_data.product_ids)
dates_inputs, max_length_dates = convertToSequence(product_data.order_dates)

# Regression target: ordered amounts, with a trailing axis added.
target_data, _ = convertToSequence(product_data.amounts)
target_data = target_data[..., np.newaxis]
num_targets = target_data.shape[1]


#X_train, X_test, y_train, y_test = train_test_split(
#    [products_inputs, dates_inputs], num_targets, test_size=0.33, random_state=42)

#display(X_train, X_test, y_train, y_test)

In [7]:
# Sanity check: padded sequence length, matching array width, and target shape.
display(max_length_dates, dates_inputs.shape[1], target_data.shape)
# Peek at the target and at both input arrays.
display(target_data, dates_inputs, products_inputs)
# 1602099712  (NOTE(review): unexplained number -- confirm its meaning or remove)

525

525

(357779, 525, 1)

array([[[1],
        [1],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [1],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       ...,

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]], dtype=int32)

array([[737895, 737895,      0, ...,      0,      0,      0],
       [737830, 737830,      0, ...,      0,      0,      0],
       [737809,      0,      0, ...,      0,      0,      0],
       ...,
       [738602,      0,      0, ...,      0,      0,      0],
       [738602,      0,      0, ...,      0,      0,      0],
       [738602,      0,      0, ...,      0,      0,      0]], dtype=int32)

array([[ 4839681,  4839682,        0, ...,        0,        0,        0],
       [ 3849255,  4822681,        0, ...,        0,        0,        0],
       [ 3907778,        0,        0, ...,        0,        0,        0],
       ...,
       [10559336,        0,        0, ...,        0,        0,        0],
       [10504300,        0,        0, ...,        0,        0,        0],
       [10366497,        0,        0, ...,        0,        0,        0]],
      dtype=int32)

In [8]:
# Vocabulary statistics, taken from the padded integer arrays that feed the model.
# np.unique on an integer array counts distinct values (the padding value 0 included).
# Applied to a Series of lists it would count distinct per-customer lists instead.
num_dates = len(np.unique(dates_inputs))
num_products = len(np.unique(products_inputs))

# Largest index appearing in either input, and the table size needed to cover it.
max_index = int(max(dates_inputs.max(), products_inputs.max()))
vocab_size = max_index + 1

# Embedding input_dim must be (largest index + 1). A single array-wide max gives
# the same number as a per-row unique/max pass, without one np.unique call per row.
max_date = int(dates_inputs.max()) + 1
max_product_ids = int(products_inputs.max()) + 1

In [9]:
# --- Model: date + product embeddings -> 2 stacked LSTMs -> self-attention -> per-step regression ---
date_embedding_dim = 16
product_embedding_dim = 16
date_inputs = Input(shape=(max_length_dates,))
product_inputs = Input(shape=(max_length_products,))

# True where a real product id is present, False on zero padding.
encoding_padding_mask = tf.math.logical_not(tf.math.equal(product_inputs, 0))

# NOTE(review): the embeddings are indexed by raw date ordinal / raw product id, so the
# tables are sized by the largest id rather than by the number of distinct values.
date_emb = Embedding(max_date, date_embedding_dim, input_length=max_length_dates)(date_inputs)
product_emb = Embedding(max_product_ids, product_embedding_dim, input_length=max_length_products)(product_inputs)

concat_embedding_input = Concatenate(
    axis=-1, name='concat_embedding_input')([date_emb, product_emb])

batchnorm_inputs = BatchNormalization(
    name='batchnorm_inputs')(concat_embedding_input)

# First LSTM layer consumes the normalized embeddings, so batchnorm_inputs is
# actually part of the graph.
lstm = LSTM(64, return_sequences=True)(batchnorm_inputs)
lstm = BatchNormalization(name='batchnorm_lstm_1')(lstm)

# Second LSTM layer is stacked on the first. Feeding it from the embeddings again
# would leave the first LSTM and its BatchNormalization disconnected from the model.
lstm = LSTM(64, return_sequences=True)(lstm)
lstm = BatchNormalization(name='batchnorm_lstm')(lstm)

# Self-attention over the LSTM outputs; padded positions are masked for query and value.
att = Attention(use_scale=False,
                name='attention')(inputs=[lstm, lstm],
                                  mask=[encoding_padding_mask,
                                        encoding_padding_mask])

# One regression value per time step, matching target_data's trailing axis of size 1.
# A Dense(num_targets) head would emit (batch, steps, steps), which the MSE loss
# silently broadcasts against the target.
output = Dense(1)(att)
model = Model(inputs=[date_inputs, product_inputs], outputs=output)
# 'accuracy' is not meaningful for regression; it is kept only because the
# evaluation cell unpacks four values (loss, mse, mae, accuracy).
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse', 'mae', 'accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 525)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 525)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 525, 16)      11817648    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 525, 16)      177656384   ['input_2[0][0]']                
                                                                                              

In [10]:
# Training hyper-parameters (also logged to MLflow when tracking is enabled).
epochs = 15
batch_size = 32

# Fit on the full padded dataset; adjust epochs and batch size as needed.
history = model.fit(
    x=[dates_inputs, products_inputs],
    y=target_data,
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/15


2023-06-04 18:31:10.402375: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-06-04 18:31:11.263961: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


   60/11181 [..............................] - ETA: 1:41:34 - loss: 0.1720 - mse: 0.1652 - mae: 0.2431 - accuracy: 0.0062

KeyboardInterrupt: 

In [None]:
# Record the training hyper-parameters on the active MLflow run.
if activate_mlflow:
    #mlflow.keras.log_model(model, "rnn_model")
    for param_name, param_value in (('epochs', epochs), ('batch_size', batch_size)):
        mlflow.log_param(param_name, param_value)

In [None]:
# Show the per-epoch metrics recorded by model.fit (requires the training cell to have completed).
display(history.history)

In [None]:
# Evaluate the model
loss, mse, mae, acc = model.evaluate([dates_inputs, products_inputs], target_data)
print("Test loss: {:.4f}, Test MSE: {:.4f}".format(loss, mse))
print("Test MAE: {:.4f}, Test Accuracy: {:.4f}".format(mae, acc))
if activate_mlflow:
    mlflow.log_metric('test loss', loss)
    mlflow.log_metric('test mse', mse)
    mlflow.log_metric('test mae', mae)
    mlflow.log_metric('test accuracy', acc)

In [None]:
# Close the MLflow run opened in the setup cell (only when tracking is enabled).
if activate_mlflow:
    mlflow.end_run()