In [191]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Activation, Input, Dense, Lambda, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Sequence
from tensorflow.keras.optimizers import SGD, RMSprop
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import TerminateOnNaN

import sys
sys.path.append('..')
from text_recognizer.datasets.emnist_lines import EmnistLinesDataset

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Build the EMNIST-lines dataset wrapper and populate it. Per the cell output
# below, this run loaded the data from HDF5 rather than generating it.
# NOTE(review): max_overlap=0 presumably disables overlap between neighbouring
# characters in a line -- confirm in EmnistLinesDataset.
dataset = EmnistLinesDataset(max_overlap=0)
dataset.load_or_generate_data()

EmnistLinesDataset loading data from HDF5...


In [145]:
# First, let's make sure that we can learn on the output data itself

# Sanity-check model: one softmax Dense layer applied independently at each of
# the 32 time steps, mapping a 65-wide vector to a 65-wide distribution.
inputs = Input(shape=(32, 65))
outputs = TimeDistributed(
    Dense(units=65, activation='softmax')
)(inputs)
model = Model(inputs=inputs, outputs=outputs)
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# dataset.y_train is passed as both features and targets, so the network only
# has to reproduce its own input.
model.fit(
    x=dataset.y_train,
    y=dataset.y_train,
    batch_size=32,
    epochs=10,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        (None, 32, 65)            0         
_________________________________________________________________
time_distributed_37 (TimeDis (None, 32, 65)            4290      
Total params: 4,290
Trainable params: 4,290
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6b2fbb2f98>

In [146]:
# Now let's switch to a data generator

class TrivialDataset(Sequence):
    """Serve two parallel arrays as consecutive fixed-size batches.

    Args:
        x: sliceable collection of inputs; its length determines the batch count.
        y: sliceable collection of targets, sliced with the same bounds as `x`.
        batch_size: number of items per batch (the last batch may be shorter).
    """

    def __init__(self, x, y, batch_size):
        self.x, self.y = x, y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(batches)

    def __getitem__(self, idx):
        """Return the `(inputs, targets)` pair for batch number `idx`."""
        start = idx * self.batch_size
        window = slice(start, start + self.batch_size)
        return self.x[window], self.y[window]
    

# Single per-time-step softmax layer, this time trained from a Sequence that
# yields batches instead of from in-memory arrays.
inputs = Input(shape=(32, 65))
outputs = TimeDistributed(
    Dense(units=65, activation='softmax')
)(inputs)
model = Model(inputs=inputs, outputs=outputs)
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# dataset.y_train is used as both the features and the targets.
generator = TrivialDataset(x=dataset.y_train, y=dataset.y_train, batch_size=32)
model.fit_generator(generator, epochs=10)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 32, 65)            0         
_________________________________________________________________
time_distributed_38 (TimeDis (None, 32, 65)            4290      
Total params: 4,290
Trainable params: 4,290
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6b2e66ff98>

In [84]:
# Now let's name inputs

class TrivialDataset(Sequence):
    """Serve batches whose inputs are a dict keyed by model input name.

    Args:
        x: sliceable array of inputs; its length determines the batch count.
        y: sliceable collection of targets, sliced with the same bounds as `x`.
        batch_size: number of items per batch (the last batch may be shorter).
    """

    def __init__(self, x, y, batch_size):
        self.x, self.y = x, y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(batches)

    def __getitem__(self, idx):
        """Return `(inputs, targets)` for batch `idx`.

        `inputs` maps 'raw' to the slice of `x` and 'argmax' to the index of
        the largest entry along the last axis of that slice.
        """
        start = idx * self.batch_size
        window = slice(start, start + self.batch_size)
        features = self.x[window]
        named_inputs = dict(raw=features, argmax=np.argmax(features, axis=-1))
        return named_inputs, self.y[window]
    

# Two named inputs matching the keys the generator emits. Only 'raw' feeds the
# softmax layer; 'argmax' is declared on the model but not connected to the output.
input_raw = Input(shape=(32, 65), name='raw')
input_argmax = Input(shape=(32,), name='argmax')
outputs = TimeDistributed(
    Dense(units=65, activation='softmax')
)(input_raw)
model = Model(inputs=[input_raw, input_argmax], outputs=outputs)
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# dataset.y_train is used as both the features and the targets.
generator = TrivialDataset(x=dataset.y_train, y=dataset.y_train, batch_size=32)
model.fit_generator(generator, epochs=10)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
raw (InputLayer)             (None, 32, 65)            0         
_________________________________________________________________
time_distributed_36 (TimeDis (None, 32, 65)            4290      
Total params: 4,290
Trainable params: 4,290
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6b4d1f5828>

In [147]:
# Now let's name outputs and compute loss and acc directly in the network
# Note that it's important that every model output has a corresponding entry in the data.

class TrivialDataset(Sequence):
    """Serve batches whose inputs and targets are both dicts keyed by layer name.

    Args:
        x: sliceable array of inputs; its length determines the batch count.
        y: sliceable collection of targets, sliced with the same bounds as `x`.
        batch_size: number of items per batch (the last batch may be shorter).
    """

    def __init__(self, x, y, batch_size):
        self.x, self.y = x, y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(batches)

    def __getitem__(self, idx):
        """Return `(inputs, outputs)` dicts for batch `idx`.

        `inputs` has keys 'raw' and 'argmax'; `outputs` has one entry per
        named model output ('loss_output' and 'acc_output').
        """
        start = idx * self.batch_size
        window = slice(start, start + self.batch_size)
        features = self.x[window]
        targets = self.y[window]

        named_inputs = dict(raw=features, argmax=np.argmax(features, axis=-1))
        named_outputs = dict(
            loss_output=targets,
            acc_output=np.argmax(features, axis=-1),
        )
        return named_inputs, named_outputs

    
# Named inputs matching the generator's keys. Only 'raw' is wired into the
# layers below; 'argmax' is declared on the model but otherwise unused.
input_raw = Input(shape=(32, 65), name='raw')
input_argmax = Input(shape=(32,), name='argmax')

softmax_output = TimeDistributed(Dense(65, activation='softmax'), name='softmax_output')(input_raw)

# Loss and accuracy are computed inside the graph and averaged over the time
# axis, giving one value per sample.
# K.categorical_crossentropy is declared as (target, output): the target is the
# raw input (x[1]) and the output is the softmax prediction (x[0]), so the
# target must be passed first.
loss_output = Lambda(lambda x: K.mean(K.categorical_crossentropy(x[1], x[0]), axis=-1), name='loss_output')([softmax_output, input_raw])
acc_output = Lambda(lambda x: K.mean(K.equal(K.argmax(x[0], axis=-1), K.argmax(x[1], axis=-1)), axis=-1), name='acc_output')([softmax_output, input_raw])

model = Model(inputs=[input_raw, input_argmax], outputs=[loss_output, acc_output])
model.summary()
# Each Keras "loss" just passes the in-graph value through and ignores y_true.
# The zero weight keeps acc_output out of the optimised objective.
model.compile('rmsprop', 
              loss={
                  'loss_output': lambda y_true, y_pred: y_pred,
                  'acc_output': lambda y_true, y_pred: y_pred
              },
              loss_weights={
                  'loss_output': 1,
                  'acc_output': 0
              })
# dataset.y_train is used as both the features and the targets.
generator = TrivialDataset(dataset.y_train, dataset.y_train, 32)
model.fit_generator(generator, epochs=10)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
raw (InputLayer)                (None, 32, 65)       0                                            
__________________________________________________________________________________________________
softmax_output (TimeDistributed (None, 32, 65)       4290        raw[0][0]                        
__________________________________________________________________________________________________
loss_output (Lambda)            (None,)              0           softmax_output[0][0]             
                                                                 raw[0][0]                        
__________________________________________________________________________________________________
acc_output (Lambda)             (None,)              0           softmax_output[0][0]             
          

<tensorflow.python.keras.callbacks.History at 0x7f6b2f637748>

In [218]:
# Now we try CTC loss

class TrivialDataset(Sequence):
    """Batch provider for the CTC experiment; every batch is built from `y` alone.

    Each batch takes `batch_size` consecutive rows of `y`, wrapping around to
    the start of the array so that every batch is full, appends one all-zero
    class column, and packs the result into dicts keyed by model input/output
    name.

    Args:
        x: stored, and used only for its length in `__len__`.
        y: array of shape (samples, time_steps, classes).
            NOTE(review): presumably one-hot, with the last class marking
            padding -- confirm against the dataset.
        batch_size: number of rows in every batch.
    """

    def __init__(self, x, y, batch_size):
        self.x = x
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches needed to cover `x` once."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return `(inputs, outputs)` dicts for batch `idx`.

        `inputs` holds 'raw', 'y_true', 'input_length' and 'label_length';
        `outputs` holds one entry per named model output.
        """
        # NOTE: if not using np.take, some batches will be less than batch_size and stuff can get weird
        rows = range(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_y = np.take(self.y, rows, axis=0, mode='wrap')

        # Append one extra class column that is zero everywhere.
        batch_y = np.dstack((
            batch_y,
            np.zeros((batch_y.shape[0], batch_y.shape[1]))
        ))
        n_rows, n_steps = batch_y.shape[:2]

        # Label length is the first time step at which the second-to-last
        # class (the last class before the appended column) is set. A row with
        # no such step falls back to the full sequence length instead of
        # raising IndexError.
        is_marker = batch_y[:, :, -2] == 1
        label_length = np.where(is_marker.any(axis=1), is_marker.argmax(axis=1), n_steps)

        labels = np.argmax(batch_y, -1)

        batch_inputs = {
            'raw': batch_y,
            'y_true': labels,
            # One entry per row, equal to the number of time steps in 'raw'.
            'input_length': np.full((n_rows, 1), float(n_steps)),
            'label_length': label_length
        }
        batch_outputs = {
            'categorical_crossentropy_loss_output': batch_y,
            'ctc_loss_output': batch_y,
            'acc_output': np.argmax(batch_y, -1)
        }
        return batch_inputs, batch_outputs

    
from tensorflow.python.ops import ctc_ops as ctc

def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    """Runs CTC loss algorithm on each batch element.
    # Arguments
        y_true: tensor `(samples, max_string_length)`
            containing the truth labels.
        y_pred: tensor `(samples, time_steps, num_categories)`
            containing the prediction, or output of the softmax.
        input_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_pred`.
        label_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_true`.
    # Returns
        Tensor with shape (samples,1) containing the
            CTC loss of each element.
    """
    # Drop the trailing singleton axis so both length tensors are rank-1 int32.
    label_length = tf.cast(tf.squeeze(label_length, axis=-1), tf.int32)
    input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32)
    sparse_labels = tf.cast(K.ctc_label_dense_to_sparse(y_true, label_length), tf.int32)

    # ctc_loss wants time-major input and applies its own softmax, so the
    # softmax probabilities documented above are converted to log-probabilities
    # first. Epsilon guards against log(0).
    y_pred = K.log(tf.transpose(y_pred, perm=[1, 0, 2]) + K.epsilon())

    return tf.expand_dims(ctc.ctc_loss(inputs=y_pred,
                                       labels=sparse_labels,
                                       sequence_length=input_length), 1)
    
    
# Named inputs matching the keys the generator emits. 'raw' has 66 classes:
# one more than the 65 used in the earlier cells.
input_raw = Input(shape=(32, 66), name='raw')

y_true = Input(shape=(32,), name='y_true')
input_length = Input(shape=(1,), name='input_length')
label_length = Input(shape=(1,), name='label_length')

y_pred = TimeDistributed(Dense(66, activation='softmax'), name='y_pred')(input_raw)

# The CTC loss is evaluated on the network's prediction (y_pred), not on the
# raw input. Otherwise it would not depend on any trainable weight and could
# never be optimised.
ctc_loss_output = Lambda(lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name='ctc_loss_output')([y_true, y_pred, input_length, label_length])
# K.categorical_crossentropy is declared as (target, output): the target is the
# raw input (x[1]) and the output is the softmax prediction (x[0]).
categorical_crossentropy_loss_output = Lambda(lambda x: K.mean(K.categorical_crossentropy(x[1], x[0]), axis=-1), name='categorical_crossentropy_loss_output')([y_pred, input_raw])
acc_output = Lambda(lambda x: K.mean(K.equal(K.argmax(x[0], axis=-1), K.argmax(x[1], axis=-1)), axis=-1), name='acc_output')([y_pred, input_raw])

model = Model(inputs=[input_raw, y_true, input_length, label_length],
              outputs=[ctc_loss_output, categorical_crossentropy_loss_output, acc_output])
model.summary()

optimizer = SGD(lr=0.001, clipnorm=5)
# Each Keras "loss" passes the in-graph value through and ignores y_true.
# loss_weights selects which in-graph loss is optimised; here only the
# cross-entropy contributes.
model.compile(optimizer,
              loss={
                  'ctc_loss_output': lambda y_true, y_pred: y_pred,
                  'categorical_crossentropy_loss_output': lambda y_true, y_pred: y_pred,
                  'acc_output': lambda y_true, y_pred: y_pred
              },
              loss_weights={
                  'ctc_loss_output': 0,
                  'categorical_crossentropy_loss_output': 1,
                  'acc_output': 0
              })
generator = TrivialDataset(dataset.y_train, dataset.y_train, 16)
model.fit_generator(generator, epochs=10, callbacks=[TerminateOnNaN()])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
raw (InputLayer)                (None, 32, 66)       0                                            
__________________________________________________________________________________________________
y_true (InputLayer)             (None, 32)           0                                            
__________________________________________________________________________________________________
input_length (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
label_length (InputLayer)       (None, 1)            0                                            
__________________________________________________________________________________________________
y_pred (Ti

KeyboardInterrupt: 