In [211]:
import pandas as pd
from keras.layers import Input, Embedding, LSTM, Dense, Dot, Softmax, Concatenate, BatchNormalization, Attention
from keras.models import Model
import tensorflow as tf
import numpy as np
from keras.utils import pad_sequences
from datetime import datetime
import mlflow

In [212]:
activate_mlflow = True

if activate_mlflow:
    registry_uri = "sqlite:///mlflow.db"
    tracking_uri = "http://127.0.0.1:5000"
    exp_name = "RNN Attention"

    mlflow.tracking.set_registry_uri(registry_uri)
    mlflow.tracking.set_tracking_uri(tracking_uri)

    try:
        exp_id = mlflow.create_experiment(name=exp_name)
    except:
        exp_id = mlflow.get_experiment_by_name(name=exp_name).experiment_id

    run_name = 'RNN_V1'
    mlflow.start_run(run_name=run_name, experiment_id=exp_id)


In [213]:
basepath = '../../datasets/preprocessed_datasets/gabor/'
product_data = pd.read_pickle(basepath +'orders_and_products_by_customer.pkl')

In [214]:
product_data = product_data.sample(n=1000)
# convert dates to ordinal
product_data['order_dates'] = product_data['order_dates'].apply(lambda x: [ [datetime.fromtimestamp(date).date().toordinal() for date in dateList] for dateList in x] )
product_data

Unnamed: 0,customer_id,order_ids,product_ids,product_names,amounts,total_prices,order_dates
109726,5793848.0,[5793845],"[[5015049, 5015266]]",[[PG8001 Sneaker low Materialmix Leder/Textil ...,"[[1, 1]]","[[140.0, 140.0]]","[[737914, 737914]]"
297606,9954965.0,[9954962],[[8800346]],[[Eleganter Pumps Glattleder rosa]],[[1]],[[110.0]],[[738457]]
283310,9719227.0,[9719224],[[8666136]],[[Chelsea Boot Rauleder grün]],[[1]],[[120.0]],[[738431]]
323408,10416448.0,[10416444],"[[8664970, 8502656]]","[[Chelsea Boot Glattleder schwarz, Chelsea Boo...","[[1, 1]]","[[125.0, 130.0]]","[[738519, 738519]]"
222132,8520899.0,[8520895],[[3476934]],[[Keilpumps Rauleder rosa]],[[1]],[[99.95]],[[738275]]
...,...,...,...,...,...,...,...
212243,8257024.0,[8257021],[[7800287]],[[Slipper Rauleder blau]],[[1]],[[120.0]],[[738246]]
124823,6190643.0,[6190641],[[4773137]],[[Pantolette Glattleder weiß]],[[1]],[[89.95]],[[737957]]
185604,7612570.0,[7612567],[[6164342]],[[Sneaker low Materialmix Leder beige]],[[1]],[[89.0]],[[738128]]
113270,5873875.0,[5873872],[[3763951]],[[Elegante Stiefelette Rauleder braun]],[[1]],[[125.0]],[[737923]]


In [215]:
def bringToSameSizeOneDim(column):
    max_length = max(column.apply(len))
    return column.apply(lambda x: x + [0] * (max_length - len(x)))


def bringToSameSizeTwoDimAndFlatten(column):
    max_length = max(column.apply(lambda x: max(len(sublist) for sublist in x)))
    new_column = column.apply(lambda x: [sublist + [0] * (max_length - len(sublist)) for sublist in x])
    new_column = new_column.apply(lambda x: x + [[0] * max_length] * (max_length - len(x)))
    return new_column.apply(lambda x: [item for sublist in x for item in sublist])

product_data.order_ids = bringToSameSizeOneDim(product_data.order_ids)
product_data.product_ids = bringToSameSizeTwoDimAndFlatten(product_data.product_ids)
product_data.amounts = bringToSameSizeTwoDimAndFlatten(product_data.amounts)
product_data.total_prices = bringToSameSizeTwoDimAndFlatten(product_data.total_prices)
product_data.order_dates = bringToSameSizeTwoDimAndFlatten(product_data.order_dates)


In [216]:
def convertToSequence(column):
    list = column.tolist()
    max_length = max(len(x) for x in list)
    return pad_sequences(list, maxlen=max_length, padding='post', value=0), max_length

products_inputs, max_length_products = convertToSequence(product_data.product_ids)
dates_inputs, max_length_dates = convertToSequence(product_data.order_dates)
target_data, _ = convertToSequence(product_data.amounts)
target_data = np.expand_dims(target_data, axis=-1)
num_targets = target_data.shape[1]

In [217]:
display(max_length_dates, dates_inputs.shape[1], target_data.shape)
display(target_data)
display(dates_inputs)
display(products_inputs)
# 1602099712

80

80

(1000, 80, 1)

array([[[1],
        [1],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       ...,

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]],

       [[1],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]], dtype=int32)

array([[737914, 737914,      0, ...,      0,      0,      0],
       [738457,      0,      0, ...,      0,      0,      0],
       [738431,      0,      0, ...,      0,      0,      0],
       ...,
       [738128,      0,      0, ...,      0,      0,      0],
       [737923,      0,      0, ...,      0,      0,      0],
       [738017,      0,      0, ...,      0,      0,      0]], dtype=int32)

array([[5015049, 5015266,       0, ...,       0,       0,       0],
       [8800346,       0,       0, ...,       0,       0,       0],
       [8666136,       0,       0, ...,       0,       0,       0],
       ...,
       [6164342,       0,       0, ...,       0,       0,       0],
       [3763951,       0,       0, ...,       0,       0,       0],
       [4774231,       0,       0, ...,       0,       0,       0]],
      dtype=int32)

In [218]:
num_dates = len(np.unique(product_data.order_dates))
num_products = len(np.unique(product_data.product_ids))
max_index = max(np.max(dates_inputs), np.max(products_inputs))
vocab_size = max_index + 1
max_date = np.max(np.unique(product_data.order_dates.apply(np.unique).apply(max))) + 1
max_product_ids = np.max(np.unique(product_data.product_ids.apply(np.unique).apply(max))) + 1

In [219]:
date_embedding_dim = 16
product_embedding_dim = 16
date_inputs = Input(shape=(max_length_dates,))
product_inputs = Input(shape=(max_length_products,))

encoding_padding_mask = tf.math.logical_not(tf.math.equal(product_inputs, 0))

date_emb = Embedding(max_date, date_embedding_dim, input_length=max_length_dates)(date_inputs)
product_emb = Embedding(max_product_ids, product_embedding_dim, input_length=max_length_products)(product_inputs)


concat_embedding_input = Concatenate(
    axis=-1, name='concat_embedding_input')([date_emb, product_emb])

batchnorm_inputs = BatchNormalization(
    name='batchnorm_inputs')(concat_embedding_input)
# LSTM layer
lstm = LSTM(64, return_sequences=True)(concat_embedding_input)

lstm = BatchNormalization(name='batchnorm_lstm')(lstm)

att = Attention(use_scale=False,
                name='attention')(inputs=[lstm, lstm],
                                  mask=[encoding_padding_mask,
                                        encoding_padding_mask])

output = Dense(num_targets)(att)
model = Model(inputs=[date_inputs, product_inputs], outputs=output)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse', 'mae', 'accuracy'])
model.summary()

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_27 (InputLayer)          [(None, 80)]         0           []                               
                                                                                                  
 input_28 (InputLayer)          [(None, 80)]         0           []                               
                                                                                                  
 embedding_25 (Embedding)       (None, 80, 16)       11817648    ['input_27[0][0]']               
                                                                                                  
 embedding_26 (Embedding)       (None, 80, 16)       172770592   ['input_28[0][0]']               
                                                                                           

In [220]:
epochs = 15
batch_size = 32
history = model.fit([dates_inputs, products_inputs], target_data, epochs=epochs, batch_size=batch_size)  # Adjust epochs and batch size as needed

Epoch 1/15


2023-06-04 15:00:04.334725: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [230]:
if activate_mlflow:
    #mlflow.keras.log_model(model, "rnn_model")
    mlflow.log_param('epochs', epochs)
    mlflow.log_param('batch_size', batch_size)

In [232]:
display(history.history)

{'loss': [0.3408089876174927,
  0.01622704043984413,
  0.010080899111926556,
  0.008567382581532001,
  0.006279885768890381,
  0.0065847402438521385,
  0.005807570647448301,
  0.005194287747144699,
  0.0051511796191334724,
  0.004938358906656504,
  0.004019284155219793,
  0.004318671766668558,
  0.003855029819533229,
  0.003573944792151451,
  0.0038797897286713123],
 'mse': [0.3358329236507416,
  0.01773727685213089,
  0.011324574239552021,
  0.009058338589966297,
  0.00782093871384859,
  0.006869747769087553,
  0.0063854847103357315,
  0.0060285963118076324,
  0.005620488431304693,
  0.005247111897915602,
  0.005105760879814625,
  0.004742765333503485,
  0.004413100890815258,
  0.004186039324849844,
  0.004038817249238491],
 'mae': [0.415631502866745,
  0.09333819895982742,
  0.06854512542486191,
  0.058961525559425354,
  0.05152508243918419,
  0.046077050268650055,
  0.04318191856145859,
  0.041689421981573105,
  0.03913922980427742,
  0.03629768639802933,
  0.03171909600496292,
  0.

In [233]:
# Evaluate the model
loss, mse, mae, acc = model.evaluate([dates_inputs, products_inputs], target_data)
print("Test loss: {:.4f}, Test MSE: {:.4f}".format(loss, mse))
print("Test MAE: {:.4f}, Test Accuracy: {:.4f}".format(mae, acc))
if activate_mlflow:
    mlflow.log_metric('test loss', loss)
    mlflow.log_metric('test mse', mse)
    mlflow.log_metric('test mae', mae)
    mlflow.log_metric('test accuracy', acc)



2023-06-04 15:07:16.930162: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Test loss: 0.4565, Test MSE: 0.4629
Test MAE: 0.6739, Test Accuracy: 0.0000


In [234]:
if activate_mlflow:
    mlflow.end_run()