In [82]:
import tensorflow as tf
import glob

## Input Layer

In [83]:
class TokenEmbedding(tf.keras.layers.Layer):
    def __init__(self, num_vocab=1000, maxlen=100, num_hidden=64):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            input_dim=num_vocab, output_dim=num_hidden)
        self.pos_embedding = tf.keras.layers.Embedding(
            input_dim=maxlen, output_dim=num_hidden)
        
    def call(self, x):
        x = self.embedding(x)
        pos = self.pos_embedding(
            tf.range(start=0, limit=tf.shape(x)[-1], delta=1))
        return x + pos

class SpeechFeatureEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen=100, num_hidden=64):
        super().__init__()
        self.conv1d = tf.keras.layers.Conv1D(
            filters=num_hidden, kernel_size=11, strides=2, padding="same",
            activation="relu")
        self.pos_embedding = tf.keras.layers.Embedding(
            input_dim=maxlen, output_dim=num_hidden)

    def call(self, x):
        print("before:", x)
        x = self.pos_embedding(x)
        x = self.conv1d(x)
        x = self.conv1d(x)
        x = self.conv1d(x)
        print("after:", x)
        return x

## Encoder Layer

In [84]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(feed_forward_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim)])
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        # Attention
        x1 = self.attention(query=inputs, value=inputs)
        x1 = self.dropout(x1, training=training)
        x1 = self.layer_norm(inputs + x1)
        
        # Feed-forward network
        x2 = self.ffn(x1)
        x2 = self.dropout(x2, training=training)
        x2 = self.layer_norm(x1 + x2)  
        return x2

## Decoder Layer

In [85]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.self_dropout = tf.keras.layers.Dropout(0.5)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.fnn = tf.keras.Sequential([
            tf.keras.layers.Dense(feed_forward_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim)])

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        multiples = tf.concat(
            values=[
                tf.expand_dims(batch_size, -1), 
                tf.constant([1, 1], dtype=tf.int32)],
            axis=0)
        return tf.tile(mask, multiples)

    def call(self, encoder_out, target):
        batch_size = tf.shape(target)[0]
        seq_len = tf.shape(target)[1]
        causal_mask = self.causal_attention_mask(
            batch_size, seq_len, seq_len, tf.bool)
        
        # Target flow
        x1 = self.attention(
            query=target, value=target, attention_mask=causal_mask)
        x1 = self.dropout(x1)
        x1 = self.layer_norm(target + x1)

        # Input flow
        x2 = self.attention(query=x1, value=encoder_out)
        x2 = self.dropout(x2)
        x2 = self.layer_norm(x1 + x2)

        # Feed-forward network
        x3 = self.ffn(x2)
        x3 = self.layer_norm(x2 + x3)
        return x3

## Transformer

In [86]:
class Transformer(tf.keras.Model):
    def __init__(
        self, num_hidden=64, num_heads=2, num_feed_forward=128,
        source_maxlen=100, target_maxlen=100, num_layers_enc=4,
        num_layers_dec=1, num_classes=10):

        super().__init__()
        self.num_classes = num_classes
        self.target_maxlen = target_maxlen
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        self.encoder_input = SpeechFeatureEmbedding(
            num_hidden=num_hidden, maxlen=source_maxlen)
        
        self.encoder_layer = Encoder(
            embed_dim=num_hidden, num_heads=num_heads, 
            feed_forward_dim=num_feed_forward)
        
        self.decoder_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hidden=num_hidden)
        
        self.decoder_layer = Decoder(
            embed_dim=num_hidden, num_heads=num_heads, 
            feed_forward_dim=num_feed_forward)
        
        self.classifier = tf.keras.layers.Dense(num_classes)
   
    def call(self, inputs):
        x = self.encoder_input(inputs[0])
        for i in range(self.num_layers_enc):
            x = self.encoder_layer(x)
        y = self.decoder_input(inputs[1])
        for i in range(self.num_layers_dec):
            x = self.decoder_layer(x, y)
        x = self.classifier(x)
        return x

    @property
    def metrics(self):
        return [self.loss_metric]

    def train_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        decoder_input = target[:, :-1]
        decoder_target = target[:, 1:]
        with tf.GradientTape() as tape:
            preds = self([source, decoder_input])
            one_hot = tf.one_hot(decoder_target, depth=self.num_classes)
            mask = tf.math.logical_not(tf.math.equal(decoder_target, 0))
            loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
            self.loss_metric.update_state(loss)
            return {"loss": self.loss_metric.result()}

    def test_step(self, batch):
        source = batch["source"]
        target = batch["target"]
        decoder_input = target[:, :-1]
        decoder_target = target[:, 1:]
        preds = self([source, decoder_input])
        one_hot = tf.one_hot(decoder_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(decoder_target, 0))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def infer(self, source, target_start_token_idx):
        batch_size = tf.shape(source)[0]
        encoder = self.encoder(source)
        decoder_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        decoder_logits = []
        for i in range(self.target_maxlen - 1):
            decoder_out = self.decoder(encoder, decoder_input)
            logits = self.classifier(decoder_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = tf.expand_dims(logits[:, -1], axis=-1)
            decoder_logits.append(last_logit)
            decoder_input = tf.concat([decoder_input, last_logit], axis=-1)
        return decoder_input

In [87]:
def get_data(maxlen=50):
    data = []
    wavs = glob.glob("Datasets/LJSpeech-dataset/wavs/*.wav")
    with open("Datasets\LJSpeech-dataset\metadata.csv", encoding="utf-8") as f:
        id_to_text = {
            line.strip().split("|")[0]: line.strip().split("|")[2] for line in f}
    for w in wavs:
        id = w.split("\\")[-1].split(".")[0]
        if len(id_to_text[id]) < maxlen:
            data.append({"audio": w, "text": id_to_text[id]})
    return data

class Vectorizer:
    def __init__(self, maxlen=50):
        self.vocab = (
            ["-", "#", "<", ">"]
            + [chr(i + 96) for i in range(1, 27)]
            + [" ", ".", ",", "?"])
        self.maxlen = maxlen
        self.char_to_idx = {}
        for i, ch in enumerate(self.vocab):
            self.char_to_idx[ch] = i
        
    def __call__(self, text):
        text = text.lower()
        text = text[: self.maxlen - 2]
        text = "<" + text + ">"
        pad_len = self.maxlen - len(text)
        return [self.char_to_idx.get(ch, 1) for ch in text] + [0] * pad_len

    def get_vocab(self):
        return self.vocab

max_target_len = 200
data = get_data()
vectorizer = Vectorizer(max_target_len)
print("Vocab size:", len(vectorizer.get_vocab()))

Vocab size: 34


In [88]:
def create_text_ds(data):
    texts = [i["text"] for i in data]
    text_ds = [vectorizer(t) for t in texts]
    text_ds = tf.data.Dataset.from_tensor_slices(text_ds)
    return text_ds

def path_to_audio(path):
    audio = tf.io.read_file(path)
    audio = tf.audio.decode_wav(audio, 1)[0]
    audio = tf.squeeze(audio, axis=-1)
    stft = tf.signal.stft(audio, frame_length=200, frame_step=80, fft_length=256)
    x = tf.math.pow(tf.abs(stft), 0.5)
    mean = tf.math.reduce_mean(x, 1, keepdims=True)
    std = tf.math.reduce_std(x, 1, keepdims=True)
    x = (x - mean) / std
    audio_len = tf.shape(x)[0]
    pad_len = 2754
    paddings = tf.constant([[0, pad_len], [0, 0]])
    x = tf.pad(x, paddings, "CONSTANT")[:pad_len, :]
    return x

def create_audio_ds(data):
    flist = [i["audio"] for i in data]
    audio_ds = tf.data.Dataset.from_tensor_slices(flist)
    audio_ds = audio_ds.map(
        path_to_audio, num_parallel_calls=tf.data.AUTOTUNE)
    return audio_ds

def create_tf_dataset(data, batch_size=4):
    audio_ds = create_audio_ds(data)
    text_ds = create_text_ds(data)
    ds = tf.data.Dataset.zip((audio_ds, text_ds))
    ds = ds.map(lambda x, y: {"source": x, "target": y})
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

split = int(len(data) * 0.8)
train_data = data[:split]
val_data = data[split:]
ds = create_tf_dataset(train_data, batch_size=64)
val_ds = create_tf_dataset(val_data, batch_size=64)

In [89]:
class DisplayOutputs(tf.keras.callbacks.Callback):
    def __init__(self, batch, idx_to_token, target_start_token_idx=27,
        target_end_token_idx=28):
        self.batch = batch
        self.target_start_token_idx = target_start_token_idx
        self.target_end_token_idx = target_end_token_idx
        self.idx_to_char = idx_to_token
    
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 5 != 0:
            return
        source = self.batch["source"]
        target = self.batch["target"].numpy()
        batch_size = tf.shape(source)[0]
        preds = self.model.generate(source, self.target_start_token_idx).numpy()
        for i in range(batch_size):
            target_text = "".join([self.idx_to_char[j] for j in target[i, :]])
            prediction = ""
            for idx in preds[i, :]:
                prediction += self.idx_to_char[idx]
                if idx == self.target_end_token_idx:
                    break
            print(f"target:     {target_text.replace('-', '')}")
            print(f"prediction: {prediction}\n")

In [90]:
# class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

In [91]:
batch = next(iter(val_ds))

idx_to_char = vectorizer.get_vocab()
display_cb = DisplayOutputs(
    batch, idx_to_char, target_start_token_idx=2, target_end_token_idx=3)

model = Transformer(
    num_hidden=200,
    num_heads=2,
    num_feed_forward=400,
    target_maxlen=max_target_len,
    num_layers_enc=4,
    num_layers_dec=1,
    num_classes=34)

loss_fn = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.1)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss=loss_fn)

history = model.fit(ds, validation_data=val_ds, callbacks=[display_cb], epochs=1)

before: Tensor("IteratorGetNext:0", shape=(None, None, 129), dtype=float32)
after: Tensor("transformer_8/speech_feature_embedding_8/conv1d_8/Relu_2:0", shape=(None, None, 17, 200), dtype=float32)


ValueError: in user code:

    E:\Anaconda3\envs\tf-gpu\lib\site-packages\keras\engine\training.py:853 train_function  *
        return step_function(self, iterator)
    <ipython-input-86-d73e3bbee55a>:35 call  *
        x = self.decoder_layer(x, y)
    <ipython-input-25-c3c03673c8e4>:39 call  *
        x2 = self.attention(query=x1, value=encoder_out)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\keras\engine\base_layer.py:1037 __call__  **
        outputs = call_fn(inputs, *args, **kwargs)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\keras\layers\multi_head_attention.py:498 call
        key = self._key_dense(key)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\keras\engine\base_layer.py:1037 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\keras\layers\einsum_dense.py:197 call
        ret = tf.einsum(self.equation, inputs, self.kernel)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\special_math_ops.py:751 einsum
        return _einsum_v2(equation, *inputs, **kwargs)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\special_math_ops.py:1180 _einsum_v2
        return gen_linalg_ops.einsum(inputs, resolved_equation)
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\gen_linalg_ops.py:1090 einsum
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\framework\op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\framework\func_graph.py:599 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\framework\ops.py:3561 _create_op_internal
        ret = Operation(
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\framework\ops.py:2041 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    E:\Anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\framework\ops.py:1883 _create_c_op
        raise ValueError(str(e))

    ValueError: Shape must be rank 3 but is rank 4
    	 for 0th input and equation: abc,cde->abde for '{{node transformer_8/decoder_7/multi_head_attention_15/key/einsum_1/Einsum}} = Einsum[N=2, T=DT_FLOAT, equation="abc,cde->abde"](transformer_8/encoder_7/layer_normalization_14/batchnorm_7/add_1, transformer_8/decoder_7/multi_head_attention_15/key/einsum_1/Einsum/ReadVariableOp)' with input shapes: [?,?,17,200], [200,2,200].
