In [243]:
import pandas as pd
from keras.layers import Input, Embedding, LSTM, Dense, Dot, Softmax, Concatenate, BatchNormalization, Attention
from keras.models import Model
import tensorflow as tf
import numpy as np
from keras.utils import pad_sequences
from datetime import datetime
import mlflow
from sklearn.model_selection import train_test_split


In [244]:
activate_mlflow = False

if activate_mlflow:
    registry_uri = "sqlite:///mlflow.db"
    tracking_uri = "http://127.0.0.1:5000"
    exp_name = "RNN Attention"

    mlflow.tracking.set_registry_uri(registry_uri)
    mlflow.tracking.set_tracking_uri(tracking_uri)

    try:
        exp_id = mlflow.create_experiment(name=exp_name)
    except:
        exp_id = mlflow.get_experiment_by_name(name=exp_name).experiment_id

    run_name = 'RNN_V1'
    mlflow.start_run(run_name=run_name, experiment_id=exp_id)


In [245]:
basepath = '../../datasets/preprocessed_datasets/gabor/'
product_data = pd.read_pickle(basepath +'orders_and_products_by_customer.pkl')

In [246]:
product_data = product_data.sample(n=1000)
# convert dates to ordinal
product_data['order_dates'] = product_data['order_dates'].apply(lambda x: [ [datetime.fromtimestamp(date).date().toordinal() for date in dateList] for dateList in x] )
product_data

Unnamed: 0,customer_id,order_ids,product_ids,product_names,amounts,total_prices,order_dates
160987,7013599.0,[7013596],[[6249186]],[[PG1027 Boot Rauleder grau]],[[1]],[[130.0]],[[738067]]
231686,8763751.0,[8763748],"[[7593437, 7791782]]","[[Pantolette Lackleder schwarz, Pantolette Gla...","[[1, 1]]","[[99.95, 79.95]]","[[738300, 738300]]"
208594,8165582.0,[8165577],[[7801012]],[[Slipper Rauleder grau]],[[1]],[[99.95]],[[738236]]
203547,8046416.0,[8046413],[[7693709]],[[Sneaker high Rauleder blau]],[[1]],[[120.0]],[[738218]]
29274,3699204.0,[3699199],[[3591638]],[[Sportliche Ballerina Rauleder beige]],[[1]],[[97.43]],[[737633]]
...,...,...,...,...,...,...,...
94804,5463976.0,"[5463972, 10675661, 10678255, 10784343]","[[5338593, 5385582], [10434848], [10360014, 10...","[[Plateau Sandale Rauleder grün, Riemchensanda...","[[1, 1], [1], [1, 1], [1, 1, 1]]","[[99.95, 89.95], [150.0], [99.95, 99.95], [120...","[[737875, 737875], [738553], [738553, 738553],..."
149446,6757441.0,[6757438],[[6555668]],[[Chelsea Boot Glattleder braun]],[[1]],[[125.0]],[[738037]]
231076,8747984.0,[8747982],[[7792763]],[[Sneaker low Glattleder weiß]],[[1]],[[115.0]],[[738298]]
144901,6655589.0,[6655583],"[[6114057, 3694575, 6112360]]","[[Schnürstiefelette Glattleder schwarz, Schnür...","[[1, 1, 1]]","[[125.0, 140.0, 99.95]]","[[738023, 738023, 738023]]"


In [247]:
def bringToSameSizeOneDim(column):
    max_length = max(column.apply(len))
    return column.apply(lambda x: x + [0] * (max_length - len(x)))


def bringToSameSizeTwoDimAndFlatten(column):
    max_length = max(column.apply(lambda x: max(len(sublist) for sublist in x)))
    new_column = column.apply(lambda x: [sublist + [0] * (max_length - len(sublist)) for sublist in x])
    new_column = new_column.apply(lambda x: x + [[0] * max_length] * (max_length - len(x)))
    return new_column.apply(lambda x: [item for sublist in x for item in sublist])

# save for later use
max_bought_products_per_order = max(product_data.product_ids.apply(lambda x: max(len(sublist) for sublist in x)))

product_data.order_ids = bringToSameSizeOneDim(product_data.order_ids)
product_data.product_ids = bringToSameSizeTwoDimAndFlatten(product_data.product_ids)
product_data.amounts = bringToSameSizeTwoDimAndFlatten(product_data.amounts)
product_data.total_prices = bringToSameSizeTwoDimAndFlatten(product_data.total_prices)
product_data.order_dates = bringToSameSizeTwoDimAndFlatten(product_data.order_dates)


In [248]:


def convertToSequence(column):
    list = column.tolist()
    max_length = max(len(x) for x in list)
    return pad_sequences(list, maxlen=max_length, padding='post', value=0), max_length

products_inputs, max_length_products = convertToSequence(product_data.product_ids)
orders_inputs, max_length_order = convertToSequence(product_data.order_ids)
dates_inputs, max_length_dates = convertToSequence(product_data.order_dates)
target_data, _ = convertToSequence(product_data.amounts)
target_data = np.expand_dims(target_data, axis=-1)
num_targets = target_data.shape[1]




#display(X_train, X_test, y_train, y_test)

In [249]:
#bring into same shape
old_orders = np.copy(orders_inputs)
shape_diff = products_inputs.shape[1] - orders_inputs.shape[1]
orders_inputs = np.pad(orders_inputs, ((0, 0), (0, shape_diff)), mode='constant', constant_values=0)
max_length_order = orders_inputs.shape[1]
# products_inputs[-5], resized_orders_array[-5], old_orders[-5], max_length_products

In [250]:
display(max_length_dates, dates_inputs.shape[1], target_data.shape)
# display(target_data)
# display(dates_inputs)
# display(orders_inputs)
display(orders_inputs[-2])
# display(products_inputs)
display(products_inputs[-2])
display(orders_inputs.shape)
# 1602099712

110

110

(1000, 110, 1)

array([6655583,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,     

array([6114057, 3694575, 6112360,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,     

(1000, 110)

In [251]:
num_dates = len(np.unique(product_data.order_dates))
num_orders = len(np.unique(product_data.order_ids))
num_products = len(np.unique(product_data.product_ids))
max_index = max(np.max(dates_inputs), np.max(products_inputs))
vocab_size = max_index + 1
max_date = np.max(np.unique(product_data.order_dates.apply(np.unique).apply(max))) + 1
max_product_ids = np.max(np.unique(product_data.product_ids.apply(np.unique).apply(max))) + 1
max_order_ids = np.max(np.unique(product_data.order_ids.apply(np.unique).apply(max))) + 1

In [252]:
max_order_ids, max_length_order, max_product_ids, max_length_products, max_length_dates

(11126064, 110, 10777081, 110, 110)

In [253]:
date_embedding_dim = 16
product_embedding_dim = 16
order_embedding_dim = 16
date_inputs = Input(shape=(max_length_dates,))
product_inputs = Input(shape=(max_length_products,))
order_inputs = Input(shape=(max_length_order,))

encoding_padding_mask = tf.math.logical_not(tf.math.equal(product_inputs, 0))

date_emb = Embedding(max_date, date_embedding_dim, input_length=max_length_dates)(date_inputs)
product_emb = Embedding(max_product_ids, product_embedding_dim, input_length=max_length_products)(product_inputs)
order_emb = Embedding(max_order_ids, order_embedding_dim, input_length=max_length_order)(order_inputs)


concat_embedding_input = Concatenate(
    axis=-1, name='concat_embedding_input')([date_emb, product_emb, order_emb])

batchnorm_inputs = BatchNormalization(
    name='batchnorm_inputs')(concat_embedding_input)
# LSTM layer
lstm = LSTM(64, return_sequences=True)(concat_embedding_input)

lstm = BatchNormalization(name='batchnorm_lstm')(lstm)
# LSTM layer
lstm = LSTM(64, return_sequences=True)(concat_embedding_input)

lstm = BatchNormalization(name='batchnorm_lstm')(lstm)

att = Attention(use_scale=False,
                name='attention')(inputs=[lstm, lstm],
                                  mask=[encoding_padding_mask,
                                        encoding_padding_mask])

output = Dense(num_targets)(att)
model = Model(inputs=[date_inputs, product_inputs, order_inputs], outputs=output)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse', 'mae', 'accuracy'])
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_43 (InputLayer)          [(None, 110)]        0           []                               
                                                                                                  
 input_44 (InputLayer)          [(None, 110)]        0           []                               
                                                                                                  
 input_45 (InputLayer)          [(None, 110)]        0           []                               
                                                                                                  
 embedding_42 (Embedding)       (None, 110, 16)      11817648    ['input_43[0][0]']               
                                                                                           

In [254]:
# X_train, X_test, y_train, y_test = train_test_split(
#     [dates_inputs, products_inputs, orders_inputs], target_data, test_size=0.33, random_state=42)

In [255]:
epochs = 3
batch_size = 32
history = model.fit([dates_inputs, products_inputs, orders_inputs], target_data, epochs=epochs, batch_size=batch_size)  # Adjust epochs and batch size as needed

Epoch 1/3


2023-06-10 10:26:14.299936: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/3
Epoch 3/3


In [256]:
if activate_mlflow:
    #mlflow.keras.log_model(model, "rnn_model")
    mlflow.log_param('epochs', epochs)
    mlflow.log_param('batch_size', batch_size)

In [257]:
display(history.history)

{'loss': [0.31592515110969543, 0.01564685069024563, 0.009094079956412315],
 'mse': [0.2943454086780548, 0.01583845540881157, 0.008871468715369701],
 'mae': [0.384521484375, 0.09172148257493973, 0.06417300552129745],
 'accuracy': [0.0, 0.0428336076438427, 0.0005491488263942301]}

In [258]:
# Evaluate the model
loss, mse, mae, acc = model.evaluate([dates_inputs, products_inputs, orders_inputs], target_data)
print("Test loss: {:.4f}, Test MSE: {:.4f}".format(loss, mse))
print("Test MAE: {:.4f}, Test Accuracy: {:.4f}".format(mae, acc))
if activate_mlflow:
    mlflow.log_metric('test loss', loss)
    mlflow.log_metric('test mse', mse)
    mlflow.log_metric('test mae', mae)
    mlflow.log_metric('test accuracy', acc)



2023-06-10 10:27:34.069497: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Test loss: 0.9066, Test MSE: 0.9056
Test MAE: 0.9497, Test Accuracy: 0.0000


In [259]:
if activate_mlflow:
    mlflow.end_run()