In [43]:
import numpy as np
import pandas as pd
import tensorflow as tf

from fastDamerauLevenshtein import damerauLevenshtein
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from tensorflow.keras import layers
from tqdm import trange

## 1. Read + prepare data
Data is expected to be transformed into a single-step version using the `transform_data_to_single_step_workflow.py` script.

In [3]:
df = pd.read_csv('data/single_step_df_ints_2022-10-11_encoded.csv', header=0)

Float columns: Fill NAs with -99.

In [5]:
float_cols = df.select_dtypes(include=['float64']).columns
df.loc[:, float_cols] = df.loc[:, float_cols].fillna(-99)

Object columns: Fill NAs with empty string.

In [6]:
cols = df.select_dtypes(include=['object'])
for col in cols.columns.values:
    df[col] = df[col].fillna('')

Show value counts for target to check if all classes have more than one instance (required for stratisfied shuffle split).

In [44]:
df['target'].value_counts()
df[:1437]['target'].value_counts()

p    188
k    162
g    160
f    157
c    148
s    134
b     78
w     74
o     73
a     72
r     47
d     27
h     27
t     22
e     22
z     21
m     13
n      9
x      2
i      1
Name: target, dtype: int64

Remove row with class that only occurs once.

In [45]:
index_to_drop = df['target'].loc[df['target'] == 'i'].index[0]
index_to_drop

727

In [46]:
nr_of_sequences = len(df[df['input'] == '<start>'])

In [47]:
df_new = df.drop(index=[index_to_drop], axis=0)
df_new.reset_index(inplace=True, drop=True)

Use stratisfied shuffle split to generate training and test set. The dataframe is cut at index 1436 here to only use the table setting data for training the model.

In [48]:
split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

for train_index, test_index in split.split(df_new[:1436], df_new[:1436]['target']):
    strat_train = df_new.loc[train_index]
    strat_test_val = df_new.loc[test_index]

Split test data into test and validation set.

In [49]:
split_test_val = len(strat_test_val)//2

strat_test = strat_test_val[:split_test_val]
strat_val = strat_test_val[split_test_val:]

Generate list of labels to pass to MultiLabelBinarizer so there's the same number of classes for all datasets.

In [50]:
labels = df_new['target'].unique()

In [51]:
len(labels)

35

## 2. Create train, test, validation data sets

In [53]:
def create_dataset(dataframe, labels, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels_ds = df.pop('target')
    encoder = MultiLabelBinarizer(classes=labels)
    encoded_labels = encoder.fit_transform(labels_ds)
    
    df = {key: value[:, tf.newaxis] for key, value in df.items()}
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), encoded_labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(batch_size)
    
    return dataset

In [54]:
batch_size = 128

In [18]:
train_ds = create_dataset(strat_train, labels, batch_size=batch_size)
val_ds = create_dataset(strat_val, labels, shuffle=False, batch_size=batch_size)
test_ds = create_dataset(strat_test, labels, shuffle=False, batch_size=batch_size)

  df = {key: value[:, tf.newaxis] for key, value in df.items()}
2023-08-08 11:21:49.904040: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-08 11:21:50.019249: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-08 11:21:50.019423: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-08 11:21:50.021962: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enabl

## 3. Define functions to create layers and input data

In [19]:
def get_normalization_layer(name, dataset):
  # create a normalization layer for the feature
    normalizer = layers.Normalization(axis=None)

  # prepare a dataset that only yields the feature
    feature_ds = dataset.map(lambda x, y: x[name])

  # learn the statistics of the data
    normalizer.adapt(feature_ds)
    
    return normalizer

In [20]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = layers.StringLookup(max_tokens=max_tokens)
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)
        
    # prepare tf.data.Dataset that only yields the feature    
    feature_ds = dataset.map(lambda x, y: x[name])
    
    # learn set of possible values and assign fixed int index
    index.adapt(feature_ds)
    
    # encode int indices
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    
    # apply multi-hot encoding to indices
    # lambda function captures the layer to include them in Keras functional models later
    return lambda feature: encoder(index(feature))

In [21]:
def create_input_data(dataframe):
    all_inputs = []
    encoded_features = []
    
    for header in dataframe.columns:
        # numerical features
        if 'coord' in header or 'already' in header:
            numeric_col = tf.keras.Input(shape=(1,), name=header)
            normalization_layer = get_normalization_layer(header, train_ds)
            encoded_numeric_col = normalization_layer(numeric_col)
            all_inputs.append(numeric_col)
            encoded_features.append(encoded_numeric_col)
        
        # categorical features
        elif 'containment' in header or 'food' in header or 'mid' in header or \
        'strong' in header:
            categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
            encoding_layer = get_category_encoding_layer(name=header,
                                                        dataset=train_ds,
                                                        dtype='int64')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
        elif header == 'input':
            categorical_col = tf.keras.Input(shape=(1,), name='input', dtype='string')
            encoding_layer = get_category_encoding_layer(name='input',
                                                        dataset=train_ds,
                                                        dtype='string')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
    return all_inputs, encoded_features

## 4. Create and train model

In [22]:
all_inputs, encoded_features = create_input_data(df)

In [23]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

In [55]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(512, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(256, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(len(labels))(x)

model = tf.keras.Model(all_inputs, output)

In [26]:
model.compile(optimizer='adam',
             loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
             metrics=["accuracy"])

In [29]:
model.fit(train_ds, epochs=300, validation_data=val_ds, callbacks=[callback])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300


Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300


<keras.callbacks.History at 0x7f73553cbbe0>

In [30]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.4976744055747986


Optional: Save model for later reuse.

In [41]:
# model.save('models/next_obj_classifier_tablesetting')

2022-10-11 10:48:07.300508: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: models/next_obj_classifier_tablesetting_2022-10-11/assets


INFO:tensorflow:Assets written to: models/next_obj_classifier_tablesetting_2022-10-11/assets


In [42]:
# reloaded_model = tf.keras.models.load_model('models/next_obj_classifier_tablesetting')

### Test model prediction for one sample

In [31]:
sample = df.loc[0].drop('target').to_dict()

In [32]:
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}

In [33]:
prediction = model.predict(input_dict)
prediction = tf.nn.sigmoid(prediction[0])

Get label for prediction.

In [34]:
pred_label = labels[np.argmax(prediction)]
pred_label

'p'

## 5. Run prediction for each sequence using prequential approach

In [35]:
def get_prequential_error(dataframe, model, labels, nr_of_sequences):
    errors = [[] for seq in range(0, nr_of_sequences + 1)]
    predictions = [[] for seq in range(0, nr_of_sequences + 1)]
    start_token_count = 0
    sequence_nr = 0
    
    for row in trange(dataframe.index[0], dataframe.index[-1]): 
        observed_target = dataframe.loc[row, 'target']
        sample = dataframe.loc[row].drop('target').to_dict()
        input_dict = {name: tf.convert_to_tensor([value]) for name, value in 
                          sample.items()}
        predicted_target = model.predict(input_dict)
        predicted_target = tf.nn.sigmoid(predicted_target[0])
            
        pred_label = labels[np.argmax(predicted_target)]
        error = 1 - damerauLevenshtein(pred_label, observed_target)
        errors[sequence_nr].append(error)
        predictions[sequence_nr].append(pred_label)
        
        if row != 0 and dataframe.loc[row, 'input'] == '<start>':
            start_token_count += 1
        
        if start_token_count > 0:
            sequence_nr += 1
            start_token_count = 0
            
    return errors, predictions

Define number of sequences for which prediction is run (cooking data).

In [36]:
nr_of_sequences_preds = len(df[2075:][df[2075:]['input'] == '<start>'])
nr_of_sequences_preds

16

Run prediction, sum up errors and get median.

In [39]:
errors, predictions = get_prequential_error(df[2075:], model, labels, nr_of_sequences_preds)

100%|██████████████████████████████████████████████████████████████| 127/127 [01:24<00:00,  1.50it/s]


In [40]:
summed_error = [sum(error) for error in errors]

In [41]:
np.median(summed_error)

8.0