In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from fastDamerauLevenshtein import damerauLevenshtein
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV
from tensorflow.keras import layers

2022-03-23 12:09:58.309241: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-23 12:09:58.309277: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
df = pd.read_csv('data/single_step_df_ints_2022-03-15.csv')

In [3]:
float_cols = df.select_dtypes(include=['float64']).columns
df.loc[:, float_cols] = df.loc[:, float_cols].fillna(-99)

In [4]:
df['target'].unique()
df['target'].value_counts()

df['target'].loc[df['target'] == 'l']

727    l
Name: target, dtype: object

In [5]:
nr_of_sequences = len(df[df['input'] == '<start>'])

In [6]:
# remove row with target that only occurs once

df_new = df.drop(index=727, axis=0)
df_new.reset_index(inplace=True, drop=True)

In [7]:
# generate stratified split for train - test/val

split = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

for train_index, test_index in split.split(df_new, df_new['target']):
    strat_train = df_new.loc[train_index]
    strat_test_val = df_new.loc[test_index]

In [8]:
# split test_val into test and val

split_test_val = len(strat_test_val)//2

strat_test = strat_test_val[:split_test_val]
strat_val = strat_test_val[split_test_val:]

In [9]:
# generate list of labels to pass to MultiLabelBinarizer so there's the same number of
# classes for all datasets

labels = df_new['target'].unique()

In [10]:
def create_dataset(dataframe, labels, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels_ds = df.pop('target')
    encoder = MultiLabelBinarizer(classes=labels)
    encoded_labels = encoder.fit_transform(labels_ds)
    
    df = {key: value[:, tf.newaxis] for key, value in df.items()}
    dataset = tf.data.Dataset.from_tensor_slices((dict(df), encoded_labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(batch_size)
    
    return dataset

In [11]:
batch_size = 256

In [12]:
train_ds = create_dataset(strat_train, labels, batch_size=batch_size)
val_ds = create_dataset(strat_val, labels, shuffle=False, batch_size=batch_size)
test_ds = create_dataset(strat_test, labels, shuffle=False, batch_size=batch_size)

  df = {key: value[:, tf.newaxis] for key, value in df.items()}
2022-03-23 12:10:04.476628: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-23 12:10:04.477211: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-23 12:10:04.477410: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-03-23 12:10:04.477582: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-03-23 12:10:04.477745: W 

In [13]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
    normalizer = layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
    
    return normalizer

In [14]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = layers.StringLookup(max_tokens=max_tokens)
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)
        
    # prepare tf.data.Dataset that only yields the feature    
    feature_ds = dataset.map(lambda x, y: x[name])
    
    # learn set of possible values and assign fixed int index
    index.adapt(feature_ds)
    
    # encode int indices
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    
    # apply multi-hot encoding to indices
    # lambda function captures the layer to include them in Keras functional models later
    return lambda feature: encoder(index(feature))

In [15]:
def create_input_data(dataframe):
    all_inputs = []
    encoded_features = []
    
    for header in dataframe.columns:
        # numerical features
        if 'coord' in header or 'already' in header:
            numeric_col = tf.keras.Input(shape=(1,), name=header)
            normalization_layer = get_normalization_layer(header, train_ds)
            encoded_numeric_col = normalization_layer(numeric_col)
            all_inputs.append(numeric_col)
            encoded_features.append(encoded_numeric_col)
        
        # categorical features
        elif 'containment' in header or 'food' in header or 'mid' in header or \
        'strong' in header:
            categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
            encoding_layer = get_category_encoding_layer(name=header,
                                                        dataset=train_ds,
                                                        dtype='int64')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
        elif header == 'input':
            categorical_col = tf.keras.Input(shape=(1,), name='input', dtype='string')
            encoding_layer = get_category_encoding_layer(name='input',
                                                        dataset=train_ds,
                                                        dtype='string')
            encoded_categorical_col = encoding_layer(categorical_col)
            all_inputs.append(categorical_col)
            encoded_features.append(encoded_categorical_col)
            
    return all_inputs, encoded_features

In [16]:
all_inputs, encoded_features = create_input_data(df)

In [17]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

In [23]:
def get_model_accuracy(train_ds, test_ds, val_ds, 
                       all_inputs=all_inputs, encoded_features=encoded_features,
                       callback=callback):
    accuracy_list = []
    
    dropout_rate = [0.2,0.3,0.4,0.5,0.6,0.7]
    neurons = [128,256,512,1024]
    
    for dropout in dropout_rate:
        for neuron_nr in neurons:
            all_features = tf.keras.layers.concatenate(encoded_features)
            x = tf.keras.layers.Dense(neuron_nr, activation='relu')(all_features)
            x = tf.keras.layers.Dropout(dropout)(x)
            x = tf.keras.layers.Dense(neuron_nr//2, activation='relu')(all_features)
            x = tf.keras.layers.Dropout(dropout)(x)
            output = tf.keras.layers.Dense(20)(x)
    
            model = tf.keras.Model(all_inputs, output)
    
            model.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])
    
            history = model.fit(train_ds, epochs=200, validation_data=val_ds, 
                                callbacks=[callback], verbose=0)
            accuracy = history.history['accuracy'][-1]
    
            loss, accuracy_test = model.evaluate(test_ds)
    
            accuracy_list.append([accuracy, accuracy_test, 'params (neurons, dropout): ', neuron_nr, dropout])
    
    return accuracy_list

In [25]:
accuracies = get_model_accuracy(train_ds, test_ds, val_ds, all_inputs, encoded_features, callback)



In [26]:
results = pd.DataFrame(columns=['accuracy', 'test_accuracy', 'neurons', 'dropout'])
results['accuracy'] = [lst[0] for lst in accuracies]
results['test_accuracy'] = [lst[1] for lst in accuracies]
results['neurons'] = [lst[3] for lst in accuracies]
results['dropout'] = [lst[4] for lst in accuracies]
results['diff'] = abs(results['accuracy'] - results['test_accuracy'])

In [39]:
highest_acc = results['test_accuracy'].idxmax()
lowest_diff = results['diff'].idxmin()

results.loc[highest_acc, :]

accuracy            0.744618
test_accuracy       0.552511
neurons          1024.000000
dropout             0.300000
diff                0.192107
Name: 7, dtype: float64

In [30]:
#results.to_csv('results/gridsearch_train_test_accuracies_2022-03-23.csv', header=True, index=False)

In [37]:
%matplotlib qt

x = [x for x in range(0, len(results))]
median_acc = [np.median(results['accuracy'])] * len(x)
median_test_acc = [np.median(results['test_accuracy'])] * len(x)
std = [results['accuracy'].std()] * len(x)
std_test = [results['test_accuracy'].std()] * len(x)
labels = ['train acc', 'test acc']

fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(12,6),
                              facecolor='white')

ax0.boxplot([results['accuracy'], results['test_accuracy']], patch_artist=True,
            labels=labels, showmeans=True,
            boxprops=dict(facecolor='aliceblue', color='black'),
            meanprops=dict(marker='D', markerfacecolor='darkred', markeredgecolor='black'),
            medianprops=dict(linestyle='-', color='darkgreen', linewidth=1.5),
            flierprops=dict(marker='o', markeredgecolor='firebrick', markersize=8,
                           markerfacecolor='orangered'))

#ax0.legend(fontsize=10, framealpha=0.8, loc='upper right')

#ax0 = sns.boxplot(data=results.iloc[:,:2])

ax1.scatter(x, results['accuracy'], s=18, c='navy')
ax1.plot(x, median_acc, c='dodgerblue', 
         label='accuracy median: ' + str(np.round(median_acc[0], 2)))
ax1.fill_between(x, median_acc, results['accuracy'], 
                 alpha=0.2, color='dodgerblue')

ax1.scatter(x, results['test_accuracy'], s=18, c='limegreen')
ax1.plot(x, median_test_acc, c='green', 
         label='test accuracy median: ' + str(np.round(median_test_acc[0], 2)))
ax1.fill_between(x, median_test_acc, results['test_accuracy'], 
                 alpha=0.2, color='limegreen')

plt.ylim(0.3, 0.8)
ax1.legend(fontsize=10, framealpha=0.8, loc='lower right', markerscale=2.5)

<matplotlib.legend.Legend at 0x7f497a44aa90>