In [1]:
import pandas as pd
from keras.layers import Input, Embedding, LSTM, Dense, Dot, Softmax, Concatenate, BatchNormalization, Attention
from keras.models import Model
import tensorflow as tf
import numpy as np
from keras.utils import pad_sequences
from datetime import datetime
import mlflow
from sklearn.model_selection import train_test_split


In [2]:
activate_mlflow = False

if activate_mlflow:
    registry_uri = "sqlite:///mlflow.db"
    tracking_uri = "http://127.0.0.1:5000"
    exp_name = "RNN Attention"

    mlflow.tracking.set_registry_uri(registry_uri)
    mlflow.tracking.set_tracking_uri(tracking_uri)

    try:
        exp_id = mlflow.create_experiment(name=exp_name)
    except:
        exp_id = mlflow.get_experiment_by_name(name=exp_name).experiment_id

    run_name = 'RNN_V1'
    mlflow.start_run(run_name=run_name, experiment_id=exp_id)


In [3]:
basepath = '../../datasets/preprocessed_datasets/gabor/'
product_data = pd.read_pickle(basepath +'orders_and_products_by_customer.pkl')

In [4]:
product_data = product_data.sample(n=1000)
# convert dates to ordinal
product_data['order_dates'] = product_data['order_dates'].apply(lambda x: [ [datetime.fromtimestamp(date).date().toordinal() for date in dateList] for dateList in x] )
product_data

Unnamed: 0,customer_id,order_ids,product_ids,product_names,amounts,total_prices,order_dates
254174,9240964.0,[9240961],[[7824477]],[[Plateau Sandale Rauleder braun]],[[1]],[[69.0]],[[738373]]
137947,6501265.0,[6501262],[[4773594]],[[Spangenpumps Effektleder weiß]],[[1]],[[69.0]],[[737999]]
280308,9681510.0,[9681508],[[8664691]],[[Sneaker low Lackleder schwarz]],[[1]],[[99.95]],[[738427]]
158882,6972675.0,[6972670],"[[6109954, 6685007, 6115218]]","[[Chelsea Boot Glattleder schwarz, Chelsea Boo...","[[1, 1, 1]]","[[130.0, 130.0, 120.0]]","[[738062, 738062, 738062]]"
338718,10746149.0,[10746147],[[8806701]],[[Sneaker low Lackleder schwarz]],[[1]],[[130.0]],[[738561]]
...,...,...,...,...,...,...,...
178930,7429268.0,[7429264],"[[6249124, 6249140]]","[[PG1027 Boot Rauleder braun, PG1027 Boot Raul...","[[1, 1]]","[[130.0, 130.0]]","[[738108, 738108]]"
12743,2089412.0,[8448951],"[[7870326, 7791107, 7700109, 7597295]]","[[Sportliche Stiefelette Glattleder blau, Spor...","[[1, 1, 1, 1]]","[[99.95, 89.95, 99.95, 99.95]]","[[738269, 738269, 738269, 738269]]"
42131,4217077.0,[4217073],"[[4151383, 4125263]]","[[Chelsea Boot Glattleder rot, Chelsea Boot Ra...","[[1, 1]]","[[116.97, 97.43]]","[[737698, 737698]]"
189967,7726388.0,[7726385],[[6114778]],[[Sneaker low Materialmix Leder braun]],[[1]],[[79.0]],[[738157]]


In [5]:
def bringToSameSizeOneDim(column):
    max_length = max(column.apply(len))
    return column.apply(lambda x: x + [0] * (max_length - len(x)))


def bringToSameSizeTwoDimAndFlatten(column):
    max_length = max(column.apply(lambda x: max(len(sublist) for sublist in x)))
    new_column = column.apply(lambda x: [sublist + [0] * (max_length - len(sublist)) for sublist in x])
    new_column = new_column.apply(lambda x: x + [[0] * max_length] * (max_length - len(x)))
    return new_column.apply(lambda x: [item for sublist in x for item in sublist])

# save for later use
max_bought_products_per_order = max(product_data.product_ids.apply(lambda x: max(len(sublist) for sublist in x)))

product_data.order_ids = bringToSameSizeOneDim(product_data.order_ids)
product_data.product_ids = bringToSameSizeTwoDimAndFlatten(product_data.product_ids)
product_data.amounts = bringToSameSizeTwoDimAndFlatten(product_data.amounts)
product_data.total_prices = bringToSameSizeTwoDimAndFlatten(product_data.total_prices)
product_data.order_dates = bringToSameSizeTwoDimAndFlatten(product_data.order_dates)


254174    [738373, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
137947    [737999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
280308    [738427, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
158882    [738062, 738062, 738062, 0, 0, 0, 0, 0, 0, 0, ...
338718    [738561, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
                                ...                        
178930    [738108, 738108, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
12743     [738269, 738269, 738269, 738269, 0, 0, 0, 0, 0...
42131     [737698, 737698, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
189967    [738157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
24823     [738349, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
Name: order_dates, Length: 1000, dtype: object

In [6]:


def convertToSequence(column):
    list = column.tolist()
    max_length = max(len(x) for x in list)
    return pad_sequences(list, maxlen=max_length, padding='post', value=0), max_length

products_inputs, max_length_products = convertToSequence(product_data.product_ids)
orders_inputs, max_length_order = convertToSequence(product_data.order_ids)
dates_inputs, max_length_dates = convertToSequence(product_data.order_dates)
target_data, _ = convertToSequence(product_data.amounts)
target_data = np.expand_dims(target_data, axis=-1)
num_targets = target_data.shape[1]




#display(X_train, X_test, y_train, y_test)

In [7]:
#bring into same shape
old_orders = np.copy(orders_inputs)
shape_diff = products_inputs.shape[1] - orders_inputs.shape[1]
orders_inputs = np.pad(orders_inputs, ((0, 0), (0, shape_diff)), mode='constant', constant_values=0)
max_length_order = orders_inputs.shape[1]
# products_inputs[-5], resized_orders_array[-5], old_orders[-5], max_length_products

In [8]:
display(max_length_dates, dates_inputs.shape[1], target_data.shape)
# display(target_data)
# display(dates_inputs)
# display(orders_inputs)
display(orders_inputs[-2])
# display(products_inputs)
display(products_inputs[-2])
display(orders_inputs.shape)
# 1602099712

110

110

(1000, 110, 1)

array([7726385,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,     

array([6114778,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,     

(1000, 110)

In [9]:
num_dates = len(np.unique(product_data.order_dates))
num_orders = len(np.unique(product_data.order_ids))
num_products = len(np.unique(product_data.product_ids))
max_index = max(np.max(dates_inputs), np.max(products_inputs))
vocab_size = max_index + 1
max_date = np.max(np.unique(product_data.order_dates.apply(np.unique).apply(max))) + 1
max_product_ids = np.max(np.unique(product_data.product_ids.apply(np.unique).apply(max))) + 1
max_order_ids = np.max(np.unique(product_data.order_ids.apply(np.unique).apply(max))) + 1

In [10]:
max_order_ids, max_length_order, max_product_ids, max_length_products, max_length_dates

(11126861, 110, 10787218, 110, 110)

In [11]:
date_embedding_dim = 16
product_embedding_dim = 16
order_embedding_dim = 16
date_inputs = Input(shape=(max_length_dates,))
product_inputs = Input(shape=(max_length_products,))
order_inputs = Input(shape=(max_length_order,))

encoding_padding_mask = tf.math.logical_not(tf.math.equal(product_inputs, 0))

date_emb = Embedding(max_date, date_embedding_dim, input_length=max_length_dates)(date_inputs)
product_emb = Embedding(max_product_ids, product_embedding_dim, input_length=max_length_products)(product_inputs)
order_emb = Embedding(max_order_ids, order_embedding_dim, input_length=max_length_order)(order_inputs)


concat_embedding_input = Concatenate(
    axis=-1, name='concat_embedding_input')([date_emb, product_emb, order_emb])

batchnorm_inputs = BatchNormalization(
    name='batchnorm_inputs')(concat_embedding_input)
# LSTM layer
lstm = LSTM(64, return_sequences=True)(concat_embedding_input)

lstm = BatchNormalization(name='batchnorm_lstm')(lstm)
# LSTM layer
lstm = LSTM(64, return_sequences=True)(concat_embedding_input)

lstm = BatchNormalization(name='batchnorm_lstm')(lstm)
lstm = LSTM(64, return_sequences=True)(concat_embedding_input)

lstm = BatchNormalization(name='batchnorm_lstm')(lstm)
lstm = LSTM(64, return_sequences=True)(concat_embedding_input)

lstm = BatchNormalization(name='batchnorm_lstm')(lstm)

att = Attention(use_scale=False,
                name='attention')(inputs=[lstm, lstm],
                                  mask=[encoding_padding_mask,
                                        encoding_padding_mask])

output = Dense(num_targets)(att)
model = Model(inputs=[date_inputs, product_inputs, order_inputs], outputs=output)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse', 'mae', 'accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 110)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 110)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 110)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 110, 16)      11817648    ['input_1[0][0]']                
                                                                                              

In [12]:
# X_train, X_test, y_train, y_test = train_test_split(
#     [dates_inputs, products_inputs, orders_inputs], target_data, test_size=0.33, random_state=42)

In [13]:
epochs = 3
batch_size = 32
history = model.fit([dates_inputs, products_inputs, orders_inputs], target_data, epochs=epochs, batch_size=batch_size)  # Adjust epochs and batch size as needed

Epoch 1/3


2023-06-22 14:20:22.872829: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-06-22 14:20:24.274574: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/3
Epoch 3/3


In [14]:
if activate_mlflow:
    #mlflow.keras.log_model(model, "rnn_model")
    mlflow.log_param('epochs', epochs)
    mlflow.log_param('batch_size', batch_size)

In [15]:
display(history.history)

{'loss': [0.28608864545822144, 0.01405801810324192, 0.007083631586283445],
 'mse': [0.2874542772769928, 0.014371691271662712, 0.007162719964981079],
 'mae': [0.37561681866645813, 0.09326629340648651, 0.06502830237150192],
 'accuracy': [0.0005527915782295167, 0.0, 0.009397457353770733]}

In [16]:
# Evaluate the model
loss, mse, mae, acc = model.evaluate([dates_inputs, products_inputs, orders_inputs], target_data)
print("Test loss: {:.4f}, Test MSE: {:.4f}".format(loss, mse))
print("Test MAE: {:.4f}, Test Accuracy: {:.4f}".format(mae, acc))
if activate_mlflow:
    mlflow.log_metric('test loss', loss)
    mlflow.log_metric('test mse', mse)
    mlflow.log_metric('test mae', mae)
    mlflow.log_metric('test accuracy', acc)



2023-06-22 14:22:37.267920: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Test loss: 0.9019, Test MSE: 0.9032
Test MAE: 0.9499, Test Accuracy: 0.0000


In [17]:
if activate_mlflow:
    mlflow.end_run()