In [1]:
import os
import sys

# Make the parent directory importable so the `modules` package used below resolves.
# Guarded so that re-running this cell does not keep prepending duplicate entries.
if os.path.abspath('..') not in sys.path:
    sys.path.insert(0, os.path.abspath('..'))

In [2]:
from modules.lipreading import WordReader
from modules.generators import BatchForCTC
import numpy as np
import h5py
from time import localtime as time
from time import strftime as timeformat
from glob import glob

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
# Single configuration dict handed to WordReader, BatchForCTC and train_with_generator below.
params = {
    'resume': False,
    'initial_epoch': 0,
    'frame_length': 75,
    'frame_width': 100,
    'frame_height': 50,
    # sorted() makes the file order deterministic; glob() returns matches in arbitrary order.
    'hdf5_data_list': sorted(glob(os.path.join(os.path.abspath('../datasets'), '*sentence*.hdf5'))),
    'generator_queue_size': 2,
    # Pass-through "loss": returns y_pred unchanged (the real loss is presumably
    # computed inside the model graph -- confirm against WordReader).
    'loss_func': {'ctc_loss': lambda y_true, y_pred: y_pred},
    'sample_size': 8,
    'batch_size': 2,
    'epochs': 3,
    'learning_rate': 1e-03,
    'learning_beta1': 0.9,
    'learning_beta2': 0.999,
    'learning_decay': 1e-08,
    'validation_split': 0.2,
    # Timestamped weights path so that each run gets its own file name.
    'model_file': os.path.abspath('../weights/lipnet_15_' + timeformat("%d-%m-%Y-%H-%M-%S", time()) + '.hdf5'),
    'log_dir': os.path.abspath('../logs')
}

In [11]:
# Sanity check: display which dataset files the glob pattern in `params` matched.
params['hdf5_data_list']

['/home/sziraqui/Documents/vsp-dev/datasets/grid_sentences_ctc_0-127.hdf5']

In [12]:
# Build the WordReader (from modules.lipreading) from the shared params dict.
wr = WordReader(params)

In [13]:
# Batch generator (from modules.generators) configured from the same params dict.
generator = BatchForCTC(params)

In [14]:
# Train using the generator; the recorded output of this cell is a keras History object.
wr.train_with_generator(params, generator)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3e90087048>

In [None]:
# Load a small evaluation slice: the first int(sample_size * validation_split) rows.
sample_size = int(params['sample_size']*params['validation_split'])
# Open explicitly read-only: older h5py versions default to append mode, which can
# silently create an empty file when the path is wrong instead of raising.
with h5py.File('../datasets/grid_words15_0-127.hdf5', 'r') as f:
    X = f["features"][:sample_size]
    Y = f["labels"][:sample_size]

In [None]:
# Evaluate the WordReader's underlying model on the slice loaded in the previous cell.
# NOTE(review): assumes wr.model accepts (X, Y) directly -- confirm against WordReader.
wr.model.evaluate(X,Y)

In [None]:
# Drop the evaluation arrays so the kernel no longer holds references to them.
del X, Y

In [None]:
from modules.metrics import CTC
from keras.layers.convolutional import Conv3D, ZeroPadding3D
from keras.layers.pooling import MaxPooling3D
from keras.layers.core import Dense, Activation, SpatialDropout3D, Flatten
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import GRU
from keras.layers.normalization import BatchNormalization
from keras.layers import Input
from keras.models import Model
import keras.backend as K

In [None]:
class LipNet(object):
    """Keras model: three 3D-conv stages, two bidirectional GRUs, a softmax and a CTC output.

    The graph is built once in ``build()`` (called from ``__init__``). Every
    intermediate tensor is kept as an attribute. ``self.model`` is the
    four-input training model whose output is ``self.loss_out``.

    Parameters
    ----------
    img_c, img_w, img_h : int
        Channels, width and height of each input frame (used for the input shape).
    frames_n : int
        Number of frames per input sequence.
    absolute_max_string_len : int
        Length of the ``label_input`` tensor.
    output_size : int
        Number of units in the final Dense layer (size of the softmax output).
    """

    def __init__(self, img_c=3, img_w=100, img_h=50, frames_n=75, absolute_max_string_len=32, output_size=28):
        self.img_c = img_c
        self.img_w = img_w
        self.img_h = img_h
        self.frames_n = frames_n
        self.absolute_max_string_len = absolute_max_string_len
        self.output_size = output_size
        self.build()

    def build(self):
        """Create all layers and assemble ``self.model``."""
        # Any previously compiled inference function refers to the old graph.
        self._test_function = None

        # Input layout follows the backend's configured image data format.
        if K.image_data_format() == 'channels_first':
            input_shape = (self.img_c, self.frames_n, self.img_h, self.img_w)
        else:
            input_shape = (self.frames_n, self.img_h, self.img_w, self.img_c)

        self.input_data = Input(name='input', shape=input_shape, dtype='float32')

        # Stage 1: pad -> conv (32 filters, spatial stride 2) -> batch norm -> relu -> dropout -> spatial max-pool
        self.zero1 = ZeroPadding3D(padding=(1, 2, 2), name='zero1')(self.input_data)
        self.conv1 = Conv3D(32, (3, 5, 5), strides=(1, 2, 2), kernel_initializer='he_normal', name='conv1')(self.zero1)
        self.batc1 = BatchNormalization(name='batc1')(self.conv1)
        self.actv1 = Activation('relu', name='actv1')(self.batc1)
        self.drop1 = SpatialDropout3D(0.5)(self.actv1)
        self.maxp1 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max1')(self.drop1)

        # Stage 2: same pattern with 64 filters and unit stride
        self.zero2 = ZeroPadding3D(padding=(1, 2, 2), name='zero2')(self.maxp1)
        self.conv2 = Conv3D(64, (3, 5, 5), strides=(1, 1, 1), kernel_initializer='he_normal', name='conv2')(self.zero2)
        self.batc2 = BatchNormalization(name='batc2')(self.conv2)
        self.actv2 = Activation('relu', name='actv2')(self.batc2)
        self.drop2 = SpatialDropout3D(0.5)(self.actv2)
        self.maxp2 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max2')(self.drop2)

        # Stage 3: 96 filters with a 3x3x3 kernel
        self.zero3 = ZeroPadding3D(padding=(1, 1, 1), name='zero3')(self.maxp2)
        self.conv3 = Conv3D(96, (3, 3, 3), strides=(1, 1, 1), kernel_initializer='he_normal', name='conv3')(self.zero3)
        self.batc3 = BatchNormalization(name='batc3')(self.conv3)
        self.actv3 = Activation('relu', name='actv3')(self.batc3)
        self.drop3 = SpatialDropout3D(0.5)(self.actv3)
        self.maxp3 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max3')(self.drop3)

        # Flatten each step along the first non-batch axis into a feature vector for the recurrent layers.
        self.resh1 = TimeDistributed(Flatten())(self.maxp3)

        self.gru_1 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru1'), merge_mode='concat')(self.resh1)
        self.gru_2 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru2'), merge_mode='concat')(self.gru_1)

        # transforms RNN output to character activations:
        self.dense1 = Dense(self.output_size, kernel_initializer='he_normal', name='dense1')(self.gru_2)

        self.y_pred = Activation('softmax', name='softmax')(self.dense1)

        # Extra inputs consumed only by the CTC output below.
        self.labels = Input(name='label_input', shape=[self.absolute_max_string_len], dtype='float32')
        self.input_length = Input(name='input_length', shape=[1], dtype='int64')
        self.label_length = Input(name='label_length', shape=[1], dtype='int64')

        # CTC comes from modules.metrics (not visible here); presumably it wraps the
        # CTC loss computation as a layer named 'ctc' -- confirm against that module.
        self.loss_out = CTC([self.y_pred, self.labels, self.input_length, self.label_length], name='ctc')

        self.model = Model(inputs=[self.input_data, self.labels, self.input_length, self.label_length], outputs=self.loss_out)

    def summary(self):
        """Print the summary of the inference sub-graph (frames in, softmax out)."""
        Model(inputs=self.input_data, outputs=self.y_pred).summary()

    def predict(self, input_batch):
        """Return the softmax output (``y_pred``) for ``input_batch``."""
        return self.test_function([input_batch, 0])[0]  # the first 0 indicates test

    @property
    def test_function(self):
        """Backend function mapping (input, learning phase) to (y_pred, learning phase).

        Built lazily and cached, so repeated ``predict()`` calls reuse one
        function instead of creating a new one on every access.
        """
        # captures output of softmax so we can decode the output during visualization
        cached = getattr(self, '_test_function', None)
        if cached is None:
            cached = K.function([self.input_data, K.learning_phase()], [self.y_pred, K.learning_phase()])
            self._test_function = cached
        return cached

In [None]:
# Instantiate with the constructor defaults; LipNet.__init__ builds the graph immediately.
lipnet = LipNet()

In [None]:
# The loss calculation occurs elsewhere (the model's 'ctc' output), so the loss
# registered here is a dummy lambda that simply returns y_pred unchanged.
lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')

In [None]:
# Separate BatchForCTC instance, used to feed the notebook-local LipNet model.
g = BatchForCTC(params)

In [None]:
# Short smoke-test run: one epoch of two steps, drawing batches from g.next_batch(2).
# `max_queue_size` is the Keras 2 name of the deprecated `max_q_size` keyword.
lipnet.model.fit_generator(generator=g.next_batch(2),
                        steps_per_epoch=2, epochs=1,
                        verbose=1,
                        max_queue_size=1,
                        workers=1)