In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

from fastDamerauLevenshtein import damerauLevenshtein
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from tensorflow.keras import layers
from tqdm import trange

In [17]:
df = pd.read_csv('data/single_step_df_ints_2022-10-11_encoded.csv', header=0)

In [18]:
df

Unnamed: 0,input,target,start_coords.x,start_coords.y,start_coords.z,4.already_seen,bowl_1.already_seen,k.already_seen,dish_3.already_seen,c.already_seen,...,dish_3.food_k,dish_3.strong_k,dish_3.mid_k,coordinates_9.x,coordinates_9.y,coordinates_9.z,9.containment,9.food_k,9.strong_k,9.mid_k
0,<start>,p,-0.451354,-0.413918,0.156247,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,p,o,0.513000,-0.531000,0.740000,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,o,c,0.513000,-0.531000,0.740000,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,c,g,0.513000,-0.531000,0.740000,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
4,g,k,0.513000,-0.531000,0.740000,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2198,2,3,2.500000,4.500000,2.000000,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2199,3,4,2.500000,4.500000,2.000000,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2200,4,5,2.500000,4.500000,2.000000,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2201,5,6,2.500000,4.500000,2.000000,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [19]:
float_cols = df.select_dtypes(include=['float64']).columns
df.loc[:, float_cols] = df.loc[:, float_cols].fillna(-99)

In [20]:
cols = df.select_dtypes(include=['object'])
for col in cols.columns.values:
    df[col] = df[col].fillna('')

In [21]:
df['target'].value_counts()
df[:1437]['target'].value_counts()
#df[:1437]['target'].loc[df[:1437]['target'] == 'i'].index[0]
#df_new[1436:]['target'].value_counts()

p    188
k    162
g    160
f    157
c    148
s    134
b     78
w     74
o     73
a     72
r     47
d     27
h     27
t     22
e     22
z     21
m     13
n      9
x      2
i      1
Name: target, dtype: int64

In [22]:
#index_to_drop = df[1436:]['target'].loc[df[1436:]['target'] == 'y'].index[0]
#index_to_drop2 = df[1436:]['target'].loc[df[1436:]['target'] == 'h'].index[0]
index_to_drop = df['target'].loc[df['target'] == 'i'].index[0]
index_to_drop

727

In [12]:
index_to_drop2 = df_new[:1437]['target'].loc[df_new[:1437]['target'] == 'i'].index[0]
index_to_drop2

727

In [23]:
nr_of_sequences = len(df[df['input'] == '<start>'])

In [24]:
# remove row with target that only occurs once

df_new = df.drop(index=[index_to_drop], axis=0)
#df_new = df.drop(index=[index_to_drop2], axis=0)
df_new.reset_index(inplace=True, drop=True)

In [25]:
# generate stratified split

split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

for train_index, test_index in split.split(df_new[:1436], df_new[:1436]['target']):
#for train_index, test_index in split.split(df_new, df_new['target']):
#for train_index, test_index in split.split(df_new[1436:], df_new[1436:]['target']):
    strat_train = df_new.loc[train_index]
    strat_test_val = df_new.loc[test_index]

In [26]:
# split test_val into test and val

split_test_val = len(strat_test_val)//2

strat_test = strat_test_val[:split_test_val]
strat_val = strat_test_val[split_test_val:]

In [27]:
# generate list of labels to pass to MultiLabelBinarizer so there's the same number of
# classes for all datasets

labels = df_new['target'].unique()

In [28]:
len(labels)

35

In [29]:
def create_dataset(dataframe, labels, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels_ds = df.pop('target')
    encoder = MultiLabelBinarizer(classes=labels)
    encoded_labels = encoder.fit_transform(labels_ds)
    
    df = {key: value[:, tf.newaxis] for key, value in df.items()}
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), encoded_labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(batch_size)
    
    return dataset

In [30]:
batch_size = 128

In [31]:
train_ds = create_dataset(strat_train, labels, batch_size=batch_size)
val_ds = create_dataset(strat_val, labels, shuffle=False, batch_size=batch_size)
test_ds = create_dataset(strat_test, labels, shuffle=False, batch_size=batch_size)

  df = {key: value[:, tf.newaxis] for key, value in df.items()}


In [32]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
    normalizer = layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
    
    return normalizer

In [33]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = layers.StringLookup(max_tokens=max_tokens)
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)
        
    # prepare tf.data.Dataset that only yields the feature    
    feature_ds = dataset.map(lambda x, y: x[name])
    
    # learn set of possible values and assign fixed int index
    index.adapt(feature_ds)
    
    # encode int indices
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    
    # apply multi-hot encoding to indices
    # lambda function captures the layer to include them in Keras functional models later
    return lambda feature: encoder(index(feature))

In [34]:
def create_input_data(dataframe):
    all_inputs = []
    encoded_features = []
    
    for header in dataframe.columns:
        # numerical features
        if 'coord' in header or 'already' in header:
            numeric_col = tf.keras.Input(shape=(1,), name=header)
            normalization_layer = get_normalization_layer(header, train_ds)
            encoded_numeric_col = normalization_layer(numeric_col)
            all_inputs.append(numeric_col)
            encoded_features.append(encoded_numeric_col)
        
        # categorical features
        elif 'containment' in header or 'food' in header or 'mid' in header or \
        'strong' in header:
            categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
            encoding_layer = get_category_encoding_layer(name=header,
                                                        dataset=train_ds,
                                                        dtype='int64')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
        elif header == 'input':
            categorical_col = tf.keras.Input(shape=(1,), name='input', dtype='string')
            encoding_layer = get_category_encoding_layer(name='input',
                                                        dataset=train_ds,
                                                        dtype='string')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
    return all_inputs, encoded_features

In [35]:
all_inputs, encoded_features = create_input_data(df)

In [36]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

In [37]:
# use model from tutorial with dense layers

all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(512, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(256, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(len(labels))(x)

model = tf.keras.Model(all_inputs, output)

In [38]:
model.compile(optimizer='adam',
             loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
             metrics=["accuracy"])

In [81]:
#tf.keras.utils.plot_model(model, show_shapes=True, rankdir='LR')

In [39]:
model.fit(train_ds, epochs=100, validation_data=val_ds, callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


<keras.callbacks.History at 0x7f03ef8f9fc0>

In [40]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.5069767236709595


In [41]:
model.save('models/next_obj_classifier_tablesetting_2022-10-11')

2022-10-11 10:48:07.300508: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: models/next_obj_classifier_tablesetting_2022-10-11/assets


INFO:tensorflow:Assets written to: models/next_obj_classifier_tablesetting_2022-10-11/assets


In [42]:
reloaded_model = tf.keras.models.load_model('models/next_obj_classifier_tablesetting_2022-10-11/')

In [None]:
# test prediction for one sample

In [43]:
sample = df.loc[0].drop('target').to_dict()

In [44]:
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}

In [45]:
prediction = reloaded_model.predict(input_dict)
prediction = tf.nn.sigmoid(prediction[0])

In [46]:
# get label for prediction

pred_label = labels[np.argmax(prediction)]
pred_label

'p'

In [59]:
def get_prequential_error(dataframe, model, labels, nr_of_sequences):
    errors = [[] for seq in range(0, nr_of_sequences + 1)]
    predictions = [[] for seq in range(0, nr_of_sequences + 1)]
    start_token_count = 0
    sequence_nr = 0
    
    for row in trange(dataframe.index[0], dataframe.index[-1]): 
        observed_target = dataframe.loc[row, 'target']
        sample = dataframe.loc[row].drop('target').to_dict()
        input_dict = {name: tf.convert_to_tensor([value]) for name, value in 
                          sample.items()}
        predicted_target = model.predict(input_dict)
        predicted_target = tf.nn.sigmoid(predicted_target[0])
            
        pred_label = labels[np.argmax(predicted_target)]
        error = 1 - damerauLevenshtein(pred_label, observed_target)
        errors[sequence_nr].append(error)
        predictions[sequence_nr].append(pred_label)
        
        if row != 0 and dataframe.loc[row, 'input'] == '<start>':
            start_token_count += 1
        
        if start_token_count > 0:
            sequence_nr += 1
            start_token_count = 0
            
    return errors, predictions

In [57]:
nr_of_sequences_preds = len(df[2075:][df[2075:]['input'] == '<start>'])
nr_of_sequences_preds

16

In [48]:
df[2075:].index[-1]

2202

In [60]:
errors, predictions = get_prequential_error(df[2075:], reloaded_model, labels, nr_of_sequences_preds)

100%|██████████████████████████████████████████████████████████████| 127/127 [01:17<00:00,  1.65it/s]


In [67]:
summed_error = [sum(error) for error in errors]

In [68]:
np.median(summed_error)

8.0

In [70]:
with open('results/nn_spatialinfo_prequential_summed_fitted_on_ts_results_cleaning_2022-10-11.txt', 'w') as file:
    file.write(str(summed_error))

In [69]:
len(summed_error)

17