In [41]:
import numpy as np
import pandas as pd
import tensorflow as tf

from fastDamerauLevenshtein import damerauLevenshtein
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from tensorflow.keras import layers

In [2]:
df = pd.read_csv('data/single_step_df_ints_2022-01-11.csv')

In [3]:
df

Unnamed: 0,input,target,start_coords_x,start_coords_y,start_coords_z,coordinates_p.x,coordinates_p.y,coordinates_p.z,p.containment,p.food_k,...,h.food_k,h.strong_k,h.mid_k,coordinates_q.x,coordinates_q.y,coordinates_q.z,q.containment,q.food_k,q.strong_k,q.mid_k
0,<start>,p,-0.451354,-0.413918,0.156247,0.008034,0.957082,0.689054,0.0,0.0,...,,,,,,,,,,
1,o,c,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
2,c,g,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
3,g,k,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
4,k,r,0.513000,-0.531000,0.740000,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,<start>,h,1.000000,3.000000,2.000000,,,,,,...,0.0,0.0,0.0,,,,,,,
1268,h,s,0.000000,4.000000,1.000000,,,,,,...,0.0,0.0,0.0,,,,,,,
1269,s,d,0.000000,4.000000,1.000000,,,,,,...,,,,,,,,,,
1270,d,g,0.000000,2.000000,2.000000,,,,,,...,,,,,,,,,,


In [4]:
float_cols = df.select_dtypes(include=['float64']).columns
df.loc[:, float_cols] = df.loc[:, float_cols].fillna(-99)

In [5]:
len(df['target'].unique())

20

In [44]:
nr_of_sequences = len(df[df['input'] == '<start>'])

In [6]:
#train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))])

In [7]:
# generate stratified split for train - test/val

split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

for train_index, test_index in split.split(df, df['target']):
    strat_train = df.loc[train_index]
    strat_test_val = df.loc[test_index]

In [8]:
# split test_val into test and val

split_test_val = len(strat_test_val)//2

strat_test = strat_test_val[:split_test_val]
strat_val = strat_test_val[split_test_val:]

In [9]:
# generate list of labels to pass to MultiLabelBinarizer so there's the same number of
# classes for all datasets

labels = df['target'].unique()

In [10]:
def create_dataset(dataframe, labels, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels_ds = df.pop('target')
    encoder = MultiLabelBinarizer(classes=labels)
    encoded_labels = encoder.fit_transform(labels_ds)
    
    df = {key: value[:, tf.newaxis] for key, value in df.items()}
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), encoded_labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(batch_size)
    
    return dataset

In [11]:
batch_size = 256

In [12]:
train_ds = create_dataset(strat_train, labels, batch_size=batch_size)
val_ds = create_dataset(strat_val, labels, shuffle=False, batch_size=batch_size)
test_ds = create_dataset(strat_test, labels, shuffle=False, batch_size=batch_size)

  df = {key: value[:, tf.newaxis] for key, value in df.items()}
2022-01-18 11:46:07.469095: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-18 11:46:07.469644: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-18 11:46:07.469750: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-01-18 11:46:07.469835: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-01-18 11:46:07.469912: W 

In [13]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of targets:', label_batch)

Every feature: ['input', 'start_coords_x', 'start_coords_y', 'start_coords_z', 'coordinates_p.x', 'coordinates_p.y', 'coordinates_p.z', 'p.containment', 'p.food_k', 'p.strong_k', 'p.mid_k', 'coordinates_o.x', 'coordinates_o.y', 'coordinates_o.z', 'o.containment', 'o.food_k', 'o.strong_k', 'o.mid_k', 'coordinates_c.x', 'coordinates_c.y', 'coordinates_c.z', 'c.containment', 'c.food_k', 'c.strong_k', 'c.mid_k', 'coordinates_g.x', 'coordinates_g.y', 'coordinates_g.z', 'g.containment', 'g.food_k', 'g.strong_k', 'g.mid_k', 'coordinates_k.x', 'coordinates_k.y', 'coordinates_k.z', 'k.containment', 'k.food_k', 'k.strong_k', 'k.mid_k', 'coordinates_w.x', 'coordinates_w.y', 'coordinates_w.z', 'w.containment', 'w.food_k', 'w.strong_k', 'w.mid_k', 'coordinates_f.x', 'coordinates_f.y', 'coordinates_f.z', 'f.containment', 'f.food_k', 'f.strong_k', 'f.mid_k', 'coordinates_s.x', 'coordinates_s.y', 'coordinates_s.z', 's.containment', 's.food_k', 's.strong_k', 's.mid_k', 'coordinates_t.x', 'coordinates_t

In [14]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
    normalizer = layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
    
    return normalizer

In [15]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = layers.StringLookup(max_tokens=max_tokens)
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)
        
    # prepare tf.data.Dataset that only yields the feature    
    feature_ds = dataset.map(lambda x, y: x[name])
    
    # learn set of possible values and assign fixed int index
    index.adapt(feature_ds)
    
    # encode int indices
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    
    # apply multi-hot encoding to indices
    # lambda function captures the layer to include them in Keras functional models later
    return lambda feature: encoder(index(feature))

In [16]:
def create_input_data(dataframe):
    all_inputs = []
    encoded_features = []
    
    for header in dataframe.columns:
        # numerical features
        if 'coord' in header:
            numeric_col = tf.keras.Input(shape=(1,), name=header)
            normalization_layer = get_normalization_layer(header, train_ds)
            encoded_numeric_col = normalization_layer(numeric_col)
            all_inputs.append(numeric_col)
            encoded_features.append(encoded_numeric_col)
        
        # categorical features
        elif 'containment' in header or 'food' in header or 'mid' in header or \
        'strong' in header:
            categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
            encoding_layer = get_category_encoding_layer(name=header,
                                                        dataset=train_ds,
                                                        dtype='int64')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
        elif header == 'input':
            categorical_col = tf.keras.Input(shape=(1,), name='input', dtype='string')
            encoding_layer = get_category_encoding_layer(name='input',
                                                        dataset=train_ds,
                                                        dtype='string')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
    return all_inputs, encoded_features

In [17]:
all_inputs, encoded_features = create_input_data(df)

In [18]:
# use model from tutorial with dense layers

all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(512, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(256, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(20)(x)

model = tf.keras.Model(all_inputs, output)

In [19]:
model.compile(optimizer='adam',
             loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
             metrics=["accuracy"])

In [20]:
#tf.keras.utils.plot_model(model, show_shapes=True, rankdir='LR')

In [21]:
model.fit(train_ds, epochs=100, validation_data=val_ds)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f478d9daa90>

In [22]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.43455496430397034


In [23]:
model.save('next_obj_classifier')

2022-01-18 11:47:32.878805: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: next_obj_classifier/assets


INFO:tensorflow:Assets written to: next_obj_classifier/assets


In [24]:
reloaded_model = tf.keras.models.load_model('next_obj_classifier')

In [38]:
# test prediction for one sample

In [34]:
sample = df.loc[0].drop('target').to_dict()

In [35]:
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}

In [36]:
prediction = reloaded_model.predict(input_dict)
prediction = tf.nn.sigmoid(prediction[0])

In [37]:
# get label for prediction

pred_label = labels[np.argmax(prediction)]
pred_label

'p'

In [74]:
def get_prequential_error(dataframe, model, labels, nr_of_sequences):
    errors = [[] for seq in range(0, nr_of_sequences + 1)]
    start_token_count = 0
    sequence_nr = 0
    
    for row in range(0, len(dataframe)): 
        observed_target = dataframe.loc[row, 'target']
        sample = dataframe.loc[row].drop('target').to_dict()
        input_dict = {name: tf.convert_to_tensor([value]) for name, value in 
                          sample.items()}
        predicted_target = model.predict(input_dict)
        predicted_target = tf.nn.sigmoid(predicted_target[0])
            
        pred_label = labels[np.argmax(predicted_target)]
        error = 1 - damerauLevenshtein(pred_label, observed_target)
        errors[sequence_nr].append(error)
        
        if row != 0 and dataframe.loc[row, 'input'] == '<start>':
            start_token_count += 1
        
        if start_token_count > 0:
            sequence_nr += 1
            start_token_count = 0
            
    return errors

In [73]:
errors = get_prequential_error(df, model, labels, nr_of_sequences)

191 0 0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  1
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  2
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  3
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  4
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  5
Start token count:  0
Start token count:  0
Start to

Seq nr.:  43
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  44
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  45
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  46
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  47
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  48
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  49
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start t

Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  99
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  100
Start token count:  0
Start token count:  0
Seq nr.:  101
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  102
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  103
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  104
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  105
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  106
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  107
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count

Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  152
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  153
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  154
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  155
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  156
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Start token count:  0
Seq nr.:  157
Start token count:  0
Start token count:  0
Start token count:  0
Start token co

In [78]:
summed_error = [sum(error) for error in errors[:-1]]

In [80]:
np.median(summed_error)

4.0

In [82]:
#with open('results/nn_spatialinfo_prequential_summed.txt', 'w') as file:
#    file.write(str(summed_error))