## Train a Model

In [1]:
import os
import math
import numpy as np
import tensorflow as tf

In [2]:
# Report the TensorFlow build in use, then show the GPUs it can see
# (the device list is the cell's displayed value).
print(tf.__version__)
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_devices

2.1.0


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
from myutils import *

In [4]:
# Dataset manifests, given as paths relative to the working directory.
data_train_path = "train.csv"
data_test_path = "test.csv"

# Batching.
batch_size = 256

# Audio feature settings handed to AudioTextGenerator.
max_audio_duration = 20  # in seconds
feature_choice = "mfcc"
feature_height = 40

In [5]:
# Keyword arguments that are identical for the training and test generators.
generator_options = dict(
    max_audio_length=max_audio_duration,
    feature_height=feature_height,
    feature_choice=feature_choice,
    max_text_length=400,
)

train_gen = AudioTextGenerator(data_train_path, batch_size=batch_size, **generator_options)

# a smaller batch size for test to reduce time
test_gen = AudioTextGenerator(data_test_path, batch_size=5, **generator_options)

AudioTextGenerator: self.data.shape = (103997, 3)
AudioTextGenerator: self.data.shape = (2528, 3)


In [6]:
# NN parameters

# Convolution / pooling stages.
conv_filters = 16
kernel_size = (3, 3)
pool_size = 2

# Per-timestep dense layer and recurrent layers.
time_dense_size = 32
rnn_size = 512

# Sequence lengths and the resulting model input shape.
max_text_length = 400
max_audio_length = train_gen.max_audio_length  # taken from the generator, not from max_audio_duration
input_shape = (max_audio_length, feature_height, 1)

# Epoch index to resume from; when > 0 the model cell loads the weights
# saved for epoch (starting_epoch - 1).
starting_epoch = 5

In [7]:
# Build, compile and (optionally) restore the CTC training model, then set up
# the validation callback.

# Reset Keras global state so re-running this cell starts from a fresh graph.
tf.keras.backend.clear_session()

# --- Feature extractor ------------------------------------------------------
input_data = tf.keras.layers.Input(name="the_input", shape=input_shape, dtype="float32")

# Two conv + max-pool stages. The pool window is (1, pool_size), so only the
# second (feature) axis shrinks; the first axis keeps max_audio_length steps.
inner = tf.keras.layers.Conv2D(
    conv_filters, kernel_size, padding="same", activation="relu",
    kernel_initializer="he_normal", name="conv1")(input_data)
inner = tf.keras.layers.MaxPool2D(pool_size=(1, pool_size), name="max1")(inner)

inner = tf.keras.layers.Conv2D(
    conv_filters, kernel_size, padding="same", activation="relu",
    kernel_initializer="he_normal", name="conv2")(inner)
inner = tf.keras.layers.MaxPool2D(pool_size=(1, pool_size), name="max2")(inner)

# Flatten the (feature, channel) axes into one vector per step. The feature
# axis was pooled twice, hence feature_height // pool_size ** 2.
flattened_features = (feature_height // (pool_size ** 2)) * conv_filters
inner = tf.keras.layers.Reshape(
    target_shape=(max_audio_length, flattened_features), name="reshape")(inner)
inner = tf.keras.layers.Dense(time_dense_size, activation="relu", name="dense1")(inner)

# --- Recurrent stack --------------------------------------------------------
# NOTE(review): per the Keras docs, go_backwards=True returns the sequence in
# reversed order. The backward outputs below are merged with the forward ones
# without being re-reversed, so step t of one is combined with step T-1-t of
# the other. This is left unchanged because the checkpoint loaded further down
# was trained with this wiring; confirm whether it is intended before altering.
gru_1 = tf.keras.layers.GRU(
    rnn_size, return_sequences=True,
    kernel_initializer="he_normal", name="gru1")(inner)
gru_1b = tf.keras.layers.GRU(
    rnn_size, return_sequences=True, go_backwards=True,
    kernel_initializer="he_normal", name="gru1_b")(inner)
gru1_merged = tf.keras.layers.add([gru_1, gru_1b])

gru_2 = tf.keras.layers.GRU(
    rnn_size, return_sequences=True,
    kernel_initializer="he_normal", name="gru2")(gru1_merged)
gru_2b = tf.keras.layers.GRU(
    rnn_size, return_sequences=True, go_backwards=True,
    kernel_initializer="he_normal", name="gru2_b")(gru1_merged)
gru2_merged = tf.keras.layers.concatenate([gru_2, gru_2b])

# Per-step distribution over len(CHAR_LIST) + 1 classes.
inner = tf.keras.layers.Dense(
    len(CHAR_LIST) + 1, kernel_initializer="he_normal", name="dense2")(gru2_merged)
y_pred = tf.keras.layers.Activation('softmax', name='softmax')(inner)

# --- CTC loss wiring --------------------------------------------------------
# The loss is produced inside the graph by ctc_lambda_func (from myutils), so
# the labels and both length tensors are fed in as extra model inputs.
labels = tf.keras.layers.Input(name="the_labels", shape=[max_text_length], dtype="float32")
input_length = tf.keras.layers.Input(name="input_length", shape=[1], dtype="int64")
label_length = tf.keras.layers.Input(name='label_length', shape=[1], dtype='int64')
loss_out = tf.keras.layers.Lambda(ctc_lambda_func, output_shape=(1,), name="ctc")(
    [y_pred, labels, input_length, label_length])

model = tf.keras.Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

# Learning rates noted by the author: 0.001 "for epoch 0 ~ 5", 0.0001 "for later".
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001)

# The "ctc" output already is the loss value, so the Keras loss just passes it through.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=optimizer)

# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
model.summary()

# --- Checkpoint restore and validation callback ------------------------------
# Single definition of the checkpoint directory, shared by loading and saving.
ckpt_dir = os.path.join("ckpt", "04_06")

if starting_epoch > 0:
    # Resume from the weights written for the previous epoch.
    weight_path = os.path.join(ckpt_dir, 'weights%02d.h5' % (starting_epoch - 1))
    model.load_weights(weight_path)

# Backend function mapping the audio input straight to the softmax output,
# bypassing the CTC loss inputs; handed to the callback as test_func.
test_func = tf.keras.backend.function([input_data], [y_pred])
myCallback = ValCallback(ckpt_path=ckpt_dir,
                         test_func=test_func,
                         next_val=test_gen.next_batch(),
                         num_display=5)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          [(None, 625, 40, 1)] 0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 625, 40, 16)  160         the_input[0][0]                  
__________________________________________________________________________________________________
max1 (MaxPooling2D)             (None, 625, 20, 16)  0           conv1[0][0]                      
__________________________________________________________________________________________________
conv2 (Conv2D)                  (None, 625, 20, 16)  2320        max1[0][0]                       
______________________________________________________________________________________________

In [None]:
# Batches drawn per training epoch: one fewer than the number of full batches.
# (For a quick smoke run, the author used steps_per_epoch=2 instead.)
train_steps = len(train_gen.data) // batch_size - 1

# Training resumes at epoch index starting_epoch and runs up to epoch 20.
model.fit(
    x=train_gen.next_batch(),
    steps_per_epoch=train_steps,
    epochs=20,
    validation_data=test_gen.next_batch(),
    validation_steps=5,
    callbacks=[myCallback],
    verbose=1,
    initial_epoch=starting_epoch,
)

  {'ctc': '...'}
    to  
  ['...']
  {'ctc': '...'}
    to  
  ['...']
Train for 405 steps, validate for 5 steps
Epoch 6/20
  3/405 [..............................] - ETA: 1:58:19 - loss: 244.2607