In [1]:
import tensorflow
import tensorflow.keras as keras
import tensorflow.keras.utils
from tensorflow.keras.layers import LSTM, Input, Dense, TimeDistributed, Masking, Bidirectional, Concatenate
from tensorflow.keras.models import Model

In [2]:
tensorflow.__version__

'2.0.0'

# General

In [3]:
alphabet = list("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'.,;:!?-+*()[]&/ \"#") # %:;&#
alphabet.insert(0, chr(0)) # '\x00' character, i.e., ord(0) to label concatenate
alphabet.insert(1, '\t') # start of sequence
alphabet.insert(2, '\n') # end of sequence

# Model

In [4]:
timesteps = None
input_features = 3
encoder_space = 512
decoder_space = encoder_space * 2
decoder_timesteps = 300

In [5]:
# Encoder
encoder_input = Input(shape=(timesteps, input_features), name='encoder_input')
masking_layer =  Masking(mask_value=0., input_shape=(timesteps, input_features))(encoder_input)
encoder_output, forward_h, forward_c, backward_h, backward_c= Bidirectional(LSTM(encoder_space, return_state=True))(masking_layer)

# merge states
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

#Decoder
decoder_input = Input(shape=(timesteps, len(alphabet)), name='decoder_input')
decoder_outputs, _, _ = LSTM(decoder_space, return_sequences=True, return_state=True)(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(len(alphabet), activation='softmax')(decoder_outputs)

In [25]:
model = Model([encoder_input, decoder_input], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Preprocessing

In [7]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [8]:
def pad_to_length(sequences, max_len, two_dimensional=True):
    padded_sequence = []
    for seq in sequences:
        pad_len = max_len - len(seq)
        if two_dimensional:
            padded_seq = np.pad(seq, [(0, pad_len), (0, 0)], mode='constant', constant_values=0)
        else:
            padded_seq = np.pad(seq, (0, pad_len), mode='constant', constant_values=0)
            padded_seq = np.expand_dims(padded_seq, axis=1)
        padded_sequence.append(padded_seq)
    # check whether all lists have actually the same length
    assert len(list(filter(lambda x: x != max_len, [len(seq) for seq in padded_sequence]))) == 0
    return np.array(padded_sequence)

In [9]:
training_dataset = np.load('/home/martin/Downloads/deepwriting_dataset/deepwriting_training.npz', allow_pickle=True)

In [10]:
list(training_dataset.keys())

['subject_labels',
 'eoc_labels',
 'alphabet',
 'strokes',
 'eow_labels',
 'char_labels',
 'word_labels',
 'max',
 'min',
 'soc_labels',
 'mean',
 'texts',
 'std',
 'preprocessing',
 'sow_labels']

In [11]:
max_len_training_encoder = len(max(training_dataset['strokes'], key=len))

In [13]:
encoder_input_data = pad_to_length(training_dataset['strokes'], max_len_training_encoder)

In [14]:
print(encoder_input_data.shape)

(34577, 489, 3)


In [15]:
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(alphabet)

In [19]:
decoder_input_texts = ["\t" + text for text in training_dataset['texts']]
decoder_input_data = [label_encoder.transform([char for char in text]) for text in decoder_input_texts]

In [20]:
max_len_training_decoder = len(max(decoder_input_data, key=len))

In [21]:
decoder_input_data = pad_to_length(decoder_input_data, max_len_training_decoder, False)
decoder_input_data = to_categorical(decoder_input_data, num_classes=len(alphabet))
decoder_input_data.shape

(34577, 65, 84)

In [23]:
decoder_output_texts = [text + "\n" for text in training_dataset['texts']]
decoder_output_data = [label_encoder.transform([char for char in text]) for text in decoder_input_texts]
decoder_output_data = pad_to_length(decoder_output_data, max_len_training_decoder, False)

# Training

In [None]:
model.fit([encoder_input_data, decoder_input_data], decoder_output_data)

Train on 34577 samples
   64/34577 [..............................] - ETA: 3:56:03 - loss: 7.7500 - accuracy: 0.2474    