<a href="https://colab.research.google.com/github/tmontaj/scripter/blob/main/Notebooks/wave2letter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This module implements wav2letter paper 

Thanks to Chadrick for his [blog](https://chadrick-kwag.net/tf-keras-rnn-ctc-example/) It helped a lot clearong the meaning of the TF documentation 

In [15]:
import tensorflow as tf
import numpy as np
!pip install wandb
import wandb
from wandb.keras import WandbCallback
import tensorflow.keras.callbacks




In [None]:
class FirstBlock(tf.keras.layers.Layer):
  '''
  First layer of wav2letter for melspectrogem (not raw audio)
  '''
  def __init__(self, filters=250, kernel_size=48, strides=2, **kwargs):
    '''
    First layer of wav2letter for melspectrogem (not raw audio)
    Arguments:
    filters -- number of filters in first conv layer(Default: 250)
    kernel_size -- kernal size in first conv layer(Default: 48)
    strides -- strides in first conv layer(Default: 2)

    **For more details see tf.keras.layers.Conv1D Docs**
    '''
    super().__init__(**kwargs)
    
    self.conv = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size,
                                       strides=strides, padding='same',
                                       name="first")
    
    self.batch_norm = tf.keras.layers.BatchNormalization()
    self.relu = tf.keras.layers.ReLU()

  def call(self, input_):
    '''
    First layer of wav2letter architecture
    Arguments:
    input_ -- input tensor

    Returns:
    out -- output tensor
    '''
    conv = self.conv(input_)
    batch_norm = self.batch_norm(conv)
    relu = self.relu(batch_norm)
    return relu

In [None]:
class MidBlock(tf.keras.layers.Layer):
  '''
  Mid layers of wav2letter for melspectrogem (not raw audio)
  '''
  def __init__(self, name, filters=250, kernel_size=7, **kwargs):
    '''
    Mid layers of wav2letter
    Arguments:
    filters -- number of filters in mid conv layer(Default: 250)
    kernel_size -- kernal size in mid conv layer(Default: 48)
    name -- layer name

    **For more details see tf.keras.layers.Conv1D Docs**
    '''
    super().__init__(**kwargs)
    
    self.conv = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size,
                                       padding='same', name=name)
    
    self.batch_norm = tf.keras.layers.BatchNormalization()
    self.relu = tf.keras.layers.ReLU()

  def call(self, input_):
    '''
    Mid layer of wav2letter architecture
    Arguments:
    input_ -- input tensor

    Returns:
    out -- output tensor
    '''
    conv = self.conv(input_)
    batch_norm = self.batch_norm(conv)
    relu = self.relu(batch_norm)
    return relu

In [None]:
class LastBlock(tf.keras.layers.Layer):
  def __init__(self, output_size=40, **kwargs):
    '''
    Last layers of wav2letter
    Arguments:
    output_size -- number or char in language (Default: 40)
    '''
    super().__init__(**kwargs)
    self.conv1 = tf.keras.layers.Conv1D(filters=2000, kernel_size=32,
                                        padding='same', name="last_mid")
    
    self.conv2 = tf.keras.layers.Conv1D(filters=2000, kernel_size=1,
                                        padding='same', name="last1")
    
    self.conv3 = tf.keras.layers.Conv1D(filters=output_size, kernel_size=1,
                                        padding='same', name="last2")
    
    self.batch_norm = tf.keras.layers.BatchNormalization()
    self.batch_norm2 = tf.keras.layers.BatchNormalization()
    self.batch_norm3 = tf.keras.layers.BatchNormalization()
    
    self.relu = tf.keras.layers.ReLU()

  def call(self, input_):
    '''
    Last layer of wav2letter architecture
    Arguments:
    input_ -- input tensor

    Returns:
    out -- output tensor
    '''
    conv1 = self.conv1(input_)
    batch_norm1 = self.batch_norm(conv1)
    relu1 = self.relu(batch_norm1)
    
    conv2 = self.conv2(relu1)
    batch_norm2 = self.batch_norm2(conv2)
    relu2 = self.relu(batch_norm2)

    conv3 = self.conv3(relu2)
    batch_norm3 = self.batch_norm3(conv3)
    relu3 = self.relu(batch_norm3)

    return relu3

In [None]:
class Wav2Let(tf.keras.Model):
  '''wav2letter model'''
  def __init__(self, filters=250, kernel_size=48, strides=2, depth=7,
               mid_filters=250, mid_kernel_size=7, output_size=40, **kwargs):
    '''
    wav2letter model
    Arguments:
    filters -- number of filters in first conv layer(Default: 250)
    kernel_size -- kernal size in first conv layer(Default: 48)
    strides -- strides in first conv layer(Default: 2)
    mid_filters -- number of filters in mid conv layer(Default: 250)
    mid_kernel_size -- kernal size in mid conv layer(Default: 48)
    name -- layer name
    depth -- number od mid layers to use (Default: 7)
    output_size -- number or char in language (Default: 40)

    **For more details see tf.keras.layers.Conv1D Docs**
    '''
    super().__init__(**kwargs)
    self.first_block = FirstBlock()
    
    self.mid_block = []
    for i in range(depth):
      self.mid_block.append(
          MidBlock(name = "mid%d"%(i))
      )

    self.last_block = LastBlock()
        
  def call(self, input_):
    '''
    wav2letter architecture
    Arguments:
    input_ -- input tensor

    Returns:
    out -- output tensor
    '''
    block_out = self.first_block(input_)

    for layer in self.mid_block:
      block_out = layer(block_out)

    last_block = self.last_block(block_out)

    return last_block 


In [None]:
x = np.ones((1,700,200))
x

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]])

In [None]:
model = Wav2Let()
y = model(x)
y

<tf.Tensor: shape=(1, 350, 40), dtype=float32, numpy=
array([[[0.00181469, 0.00483916, 0.        , ..., 0.        ,
         0.        , 0.00222994],
        [0.00402963, 0.00410732, 0.        , ..., 0.        ,
         0.        , 0.00143265],
        [0.        , 0.00272821, 0.0018889 , ..., 0.        ,
         0.        , 0.00227772],
        ...,
        [0.02604784, 0.0221705 , 0.0040882 , ..., 0.00209263,
         0.00905962, 0.        ],
        [0.02925947, 0.0189012 , 0.00074666, ..., 0.00673655,
         0.00759327, 0.        ],
        [0.02432358, 0.01630323, 0.        , ..., 0.01044387,
         0.00522837, 0.        ]]], dtype=float32)>

In [None]:
# model.last_block.weights
def ctc_loss():
  def ctc_loss_(y_true, y_pred):

    label_length = y_true[0]
    true_labels  = y_true[0]

    batch = tf.shape(y_pred)[0] # shape=(batch, time, char)
    char = tf.shape(y_pred)[2] # shape=(batch, time, char)
    logit_length = tf.repeat([char], batch)

    return tf.nn.ctc_loss(labels=true_labels, logits=y_pred,label_length=label_length,
                          logit_length=logit_length)
  return ctc_loss_

In [None]:
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = Wav2Let()
  model.compile(loss=ctc_loss,
                optimizer=tf.keras.optimizers.Adam())

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [17]:
wandb.init(project='audio2text')
wandb.login()
wandb.init()

In [None]:
# early stopping
patience = 2


In [None]:
# learning rate 

def scheduler(epoch, lr):
  if epoch < 10:
    return lr
  else:
    return lr * tf.math.exp(-0.1)



In [None]:
# save at the end of the epoc
class ModelSave(tf.keras.callbacks.Callback):
  def __init__(self,**kwargs, path = ""):
    super().__init__(**kwargs)
    self.path = path
    
  def on_epoch_end(self, epoch, logs=None):
    self.model.save_weights(path+"/"+str(epoch)+".h5")
    



In [16]:

callbacks = [
    WandbCallback(),
    EarlyStopping(patience= patience ),
    LearningRateScheduler(schedule = scheduler),
    ModelSave(),  
     # pause and resume save
]

Error: ignored

In [None]:
model.fit(train_dataset, epochs=12, callbacks= callbacks)

NameError: ignored