In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ''

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Activation, Conv2D, Dropout, Flatten, SimpleRNN, MaxPooling2D, Input, Dense, Lambda, TimeDistributed, Reshape
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import Sequence
from tensorflow.keras.optimizers import SGD, RMSprop
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import TerminateOnNaN

import sys
sys.path.append('..')
from text_recognizer.datasets.emnist_lines import EmnistLinesDataset
from text_recognizer.models.line_rnn import LineLstm

%load_ext autoreload
%autoreload 2

In [2]:
# Build the EMNIST lines dataset wrapper with max_overlap=0 and populate it.
# Later cells read dataset.x_train and dataset.y_train from this object.
# NOTE(review): load-vs-generate behavior is inferred from the method name — confirm.
dataset = EmnistLinesDataset(max_overlap=0)
dataset.load_or_generate_data()

EmnistLinesDataset loading data from HDF5...


In [5]:
class DatasetSequence(Sequence):
    """
    Keras ``Sequence`` that serves fixed-size batches laid out for a CTC-loss model.

    Every item is an ``(inputs, targets)`` pair. ``inputs`` is a dict keyed by
    'image', 'y_true', 'input_length' and 'label_length'; ``targets`` is the
    unmodified one-hot slice of ``y`` for the same samples.

    Args:
        x: Samples, indexed along axis 0.
        y: One-hot labels indexed as ``y[sample, position, class]``. The first
            position whose *last* class is hot is treated as the end of the label.
        batch_size: Number of samples in every batch (the final batch included).
        output_sequence_length: Value reported as 'input_length' for every sample.
    """

    def __init__(self, x, y, batch_size, output_sequence_length):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.output_sequence_length = output_sequence_length

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch as one."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """
        Build batch number ``idx``.

        Returns:
            Tuple ``(batch_inputs, batch_y)`` where ``batch_inputs`` holds:
              * 'image': the selected rows of ``x``.
              * 'y_true': class index per position (argmax over the class axis).
              * 'input_length': ``(batch_size, 1)`` array filled with
                ``output_sequence_length``.
              * 'label_length': ``(batch_size,)`` array with the index of the first
                padding position, or the full label width when a label has no padding.
        """
        start = idx * self.batch_size
        indices = np.arange(start, start + self.batch_size)
        # mode='clip' maps out-of-range indices to the last sample, so the final
        # batch is topped up with repeats and always has exactly batch_size rows.
        batch_x = np.take(self.x, indices, axis=0, mode='clip')
        batch_y = np.take(self.y, indices, axis=0, mode='clip')

        # A label that fills every position contains no padding token. Looking up the
        # first padding index unconditionally would raise IndexError for such a
        # sample, so fall back to the full label width in that case.
        is_padding = batch_y[:, :, -1] == 1
        label_length = np.where(
            is_padding.any(axis=1),
            is_padding.argmax(axis=1),  # argmax on booleans yields the first True
            batch_y.shape[1]
        )

        batch_inputs = {
            'image': batch_x,
            'y_true': np.argmax(batch_y, -1),
            'input_length': np.ones((self.batch_size, 1)) * self.output_sequence_length,
            'label_length': label_length
        }
        return batch_inputs, batch_y

In [None]:
# Instantiate the project's LineLstm wrapper. The cells below read its
# input_shape, max_length and num_classes to size the hand-built Keras model.
model = LineLstm()

def lenet(image_height: int, image_width: int) -> Model:
    """
    Build a LeNet-style feature extractor for single-channel images.

    The stack is two 3x3 ReLU convolutions (32 then 64 filters), 2x2 max pooling,
    dropout of 0.25, a flatten, and a 128-unit ReLU dense layer. No classification
    head is attached; the output is the 128-dimensional feature vector.

    Args:
        image_height: Height of the input image in pixels.
        image_width: Width of the input image in pixels.

    Returns:
        The assembled (uncompiled) Sequential model.
    """
    single_channel_shape = (image_height, image_width, 1)
    layers = [
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=single_channel_shape),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='relu'),
    ]
    return Sequential(layers)

def create_sliding_window_rnn_model(input_shape, max_length, num_classes, window_width, window_stride):
    """
    Assemble a Keras model whose only output is the CTC loss of a sliding-window reader.

    The line image is cut into windows along its width, each window is passed through
    the same `lenet` feature extractor, a SimpleRNN runs over the per-window features,
    and a softmax scores `num_classes` classes for every window. `K.ctc_batch_cost` is
    evaluated inside the graph, which is why labels and lengths are model inputs.

    Args:
        input_shape: (image_height, image_width) of one line image.
        max_length: Width of the 'y_true' input.
        num_classes: Size of the softmax produced at every window.
        window_width: Width of each window handed to the convnet.
        window_stride: Horizontal step between consecutive windows.

    Returns:
        Model with inputs named 'image', 'y_true', 'input_length' and 'label_length'
        (in that order) and a single output layer named 'ctc_loss_output'.
        The model summary is printed as a side effect.
    """
    def slide_window(image, window_width=window_width, window_stride=window_stride):
        """Cut `image` into horizontal windows; the window axis ends up right after batch."""
        # One-pixel-tall, window_width-wide patches: every image row of every window.
        row_patches = tf.extract_image_patches(
            image,
            [1, 1, window_width, 1],
            [1, 1, window_stride, 1],
            [1, 1, 1, 1],
            'SAME'
        )
        # Swap the row and window axes, then append a channel axis of size 1.
        windows_first = tf.transpose(row_patches, (0, 2, 1, 3))
        return tf.expand_dims(windows_first, -1)

    def ctc_loss(tensors):
        """Unpack (labels, predictions, prediction lengths, label lengths) for the CTC cost."""
        labels, predictions, prediction_lengths, label_lengths = tensors
        return K.ctc_batch_cost(labels, predictions, prediction_lengths, label_lengths)

    image_height, image_width = input_shape

    # Graph inputs; the names are the keys the batch generator must use.
    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(max_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    # (image_height, image_width) -> (image_height, image_width, 1)
    image_with_channel = Reshape((image_height, image_width, 1))(image_input)
    # -> (num_windows, image_height, window_width, 1)
    window_stack = Lambda(slide_window)(image_with_channel)

    # Same convnet applied to each window -> (num_windows, 128)
    window_encoder = lenet(image_height, window_width)
    window_features = TimeDistributed(window_encoder)(window_stack)

    # SimpleRNN with return_sequences=True: one 128-d state per window, not a single vector.
    recurrent_states = SimpleRNN(128, return_sequences=True)(window_features)
    class_probabilities = TimeDistributed(Dense(num_classes, activation='softmax'))(recurrent_states)

    loss_tensor = Lambda(ctc_loss, name='ctc_loss_output')(
        [y_true, class_probabilities, input_length, label_length]
    )

    ctc_model = Model(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=loss_tensor
    )
    ctc_model.summary()
    return ctc_model


# Sliding-window geometry, defined once so the model and the batch generator agree.
WINDOW_WIDTH = 32 // 2
WINDOW_STRIDE = 32 // 2

# The model extracts windows with 'SAME' padding, which yields
# ceil(image_width / stride) windows. That count is the length of the softmax
# sequence, i.e. the 'input_length' CTC needs for every sample. Deriving it here
# (56 for an 896-pixel-wide image and stride 16) keeps it in sync if the image
# width or the stride changes.
image_width = model.input_shape[1]
num_windows = int(np.ceil(image_width / float(WINDOW_STRIDE)))

dataset_sequence = DatasetSequence(
    dataset.x_train,
    dataset.y_train,
    batch_size=16,
    output_sequence_length=num_windows
)


keras_model = create_sliding_window_rnn_model(
    model.input_shape,
    model.max_length,
    model.num_classes,
    WINDOW_WIDTH,
    WINDOW_STRIDE
)

# The network's single output already is the per-sample CTC loss, so the Keras
# "loss" simply forwards the prediction and ignores the target it is handed.
keras_model.compile('rmsprop', loss=lambda yt, yp: yp)
keras_model.fit_generator(dataset_sequence, epochs=1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image (InputLayer)              (None, 28, 896)      0                                            
__________________________________________________________________________________________________
reshape_5 (Reshape)             (None, 28, 896, 1)   0           image[0][0]                      
__________________________________________________________________________________________________
lambda_5 (Lambda)               (None, 56, 28, 16, 1 0           reshape_5[0][0]                  
__________________________________________________________________________________________________
time_distributed_10 (TimeDistri (None, 56, 128)      608768      lambda_5[0][0]                   
__________________________________________________________________________________________________
simple_rnn