In [1]:
import os
import json
import argparse

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.utils import plot_model
from functools import partial
from collections import Counter
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold

import tensorflow_io as tfio

os.environ["CUDA_VISIBLE_DEVICES"] = '0'

d = pd.read_csv("Datasets\TIMIT-dataset\data.csv")
d

Unnamed: 0,wav_paths
0,Datasets\TIMIT-dataset\data\DR1\FAKS0\SA1.wav
1,Datasets\TIMIT-dataset\data\DR1\FAKS0\SA2.wav
2,Datasets\TIMIT-dataset\data\DR1\FAKS0\SI1573_1...
3,Datasets\TIMIT-dataset\data\DR1\FAKS0\SI1573_2...
4,Datasets\TIMIT-dataset\data\DR1\FAKS0\SI2203.wav
...,...
6984,Datasets\TIMIT-dataset\data\DR8\MTCS0\SX172.wav
6985,Datasets\TIMIT-dataset\data\DR8\MTCS0\SX262.wav
6986,Datasets\TIMIT-dataset\data\DR8\MTCS0\SX352.wav
6987,Datasets\TIMIT-dataset\data\DR8\MTCS0\SX442.wav


In [2]:
def ArgParser():
    parser = argparse.ArgumentParser()

    # RNN layer
    parser.add_argument("--units", dest="units", type=int, default=50)
    parser.add_argument("--n_layers", dest="n_layers", type=int, default=2)
    parser.add_argument("--dropout", dest="dropout", type=int, default=0.1)
    parser.add_argument("--bidirectional", dest="bidirectional", type=bool, default=True, choices=[True, False])

    # Segmentor
    parser.add_argument("--n_classes", dest="n_classes", type=int, default=61)
    parser.add_argument("--batch_size", dest="batch_size", type=int, default=8)
    parser.add_argument("--n_mels", dest="n_mels", type=int, default=32)
    parser.add_argument("--max_seg_size", dest="max_seg_size", type=int, default=100)
    parser.add_argument("--min_seg_size", dest="min_seg_size", type=int, default=0)
    parser.add_argument("--n_fft", dest="n_fft", type=int, default=2048)
    parser.add_argument("--window_size", dest="window_size", type=int, default=480)
    parser.add_argument("--hop_length", dest="hop_length", type=int, default=160) # 160 samples = 10ms
    parser.add_argument("--sample_rate", dest="sample_rate", type=int, default=16000)

    # Dataset
    parser.add_argument("--main_dir", dest="main_dir", type=str, default="Datasets/TIMIT-dataset/tfrec_data")
    parser.add_argument("--buffer_size", dest="buffer_size", type=int, default=512)
    parser.add_argument("--test_size", dest="test_size", type=float, default=0.2)
    parser.add_argument("--n_splits", dest="n_splits", type=int, default=5)
    parser.add_argument("--max_samples", dest="max_samples", type=int, default=67200)

    args = parser.parse_known_args()[0]
    seq_len = int(np.ceil(args.max_samples / args.hop_length))
    input_shape = [args.max_samples // args.hop_length, args.n_mels]
    parser.add_argument("--seq_len", type=int, default=seq_len)
    parser.add_argument("--input_shape", type=list, default=input_shape)

    return parser.parse_known_args()[0]

args = ArgParser()
args

Namespace(batch_size=8, bidirectional=True, buffer_size=512, dropout=0.1, hop_length=160, input_shape=[420, 32], main_dir='Datasets/TIMIT-dataset/tfrec_data', max_samples=67200, max_seg_size=100, min_seg_size=0, n_classes=61, n_fft=2048, n_layers=2, n_mels=32, n_splits=5, sample_rate=16000, seq_len=420, test_size=0.2, units=50, window_size=480)

In [3]:
class TFRWriter():
    def __init__(self, args):
        self.samples = d['wav_paths'].tolist()
        self.args = args
        self.dict_path = "Datasets\TIMIT-dataset\phoneme_dict.json"
        self.phoneme_dict = self.get_dict()

    def _bytes_feature(self, value):
        """Returns a bytes_list from a string / byte."""
        if isinstance(value, type(tf.constant(0))):
            value = value.numpy()
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(self, value):
        """Returns an int64_list from a bool / enum / int / uint."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def serialize_example(self, *args):
        feature = {
            'audio': self._bytes_feature(args[0]),
            'binary_labels': self._bytes_feature(args[1]),
            'framewise_labels': self._bytes_feature(args[2]),
            'attention_mask': self._bytes_feature(args[3]),
            'filename': self._bytes_feature(args[4])}

        example_proto = tf.train.Example(
            features=tf.train.Features(feature=feature))
        return example_proto.SerializeToString()

    def get_binary_labels(self, p_frames): # TODO
        p_frames = tf.convert_to_tensor(p_frames)
        labels = tf.tensor_scatter_nd_update(
            tensor=tf.zeros([p_frames[-1]+1], dtype=tf.int32), 
            indices=tf.expand_dims(p_frames, axis=1), 
            updates=tf.ones([p_frames.shape[0]], dtype=tf.int32))
        padding = tf.zeros([self.args.seq_len-len(labels)], dtype=tf.int32)
        return tf.concat([labels, padding], axis=0)

    def get_framewise_labels(self, p_frames, phonemes): # TODO
        labels = []
        for i in range(1, len(p_frames)):
            for j in range(p_frames[i-1], p_frames[i]):
                labels.append(phonemes[i-1])
        labels = tf.convert_to_tensor(labels)
        padding = tf.zeros([self.args.seq_len-len(labels)], dtype=tf.int32)
        return tf.concat([labels, padding], axis=0)

    def get_attention_mask(self, p_frames):
        mask = tf.convert_to_tensor(
            [True if i < p_frames[-1] else False for i in range(self.args.seq_len)])
        return mask

    def get_shards(self):
        speaker_id = [sample.split('\\')[4] for sample in self.samples]
        skf = StratifiedKFold(
            n_splits=self.args.n_splits, shuffle=True, random_state=42)
        return [
            list(map(lambda x: self.samples[x], j)) 
            for i, j in skf.split(self.samples, speaker_id)]

    def get_dict(self):
        phonemes = set()
        markers = ['h#', 'pau', 'epi']
        for sample in self.samples:
            base_path = os.path.splitext(sample)[0]
            with open(base_path + '.phn', "r") as f:
                for line in f.readlines():
                    phoneme = line.split()[-1]
                    if not phoneme in markers:
                        phonemes.add(phoneme)
        phonemes = markers + sorted(Counter(phonemes), key=Counter(phonemes).get, reverse=True)
        phonemes_dict = {v: i+1 for i, v in enumerate(phonemes)}
        with open(self.dict_path, "w") as f:
            json.dump(phonemes_dict, f, sort_keys=False, indent=4)
        return phonemes_dict 
    
    def get_shard_data(self, samples, shard):
        for sample in tqdm(
                samples, total=len(samples), desc=f"Writing shard {shard}"):
            base_path = os.path.splitext(sample)[0]
            p_frames, phonemes = [0], []
            with open(base_path + ".phn") as f:
                for line in f.readlines():
                    p_frame, phoneme = line.split()[1::]
                    p_frames.append(int(p_frame) // self.args.hop_length)
                    phonemes.append(str(phoneme))
            phonemes = list(map(self.phoneme_dict.get, phonemes))
            binary_labels = self.get_binary_labels(p_frames)
            framewise_labels = self.get_framewise_labels(p_frames, phonemes)
            waveform = tf.io.read_file(base_path + ".wav")
            spec_mask = self.get_attention_mask(p_frames)
            filename = str.encode("/".join(sample.split('\\')[-3::]))
            yield {
                "audio": waveform,
                "binary_labels": tf.io.serialize_tensor(binary_labels),
                "framewise_labels": tf.io.serialize_tensor(framewise_labels),
                "attention_mask": tf.io.serialize_tensor(spec_mask),
                "filename": filename}

    def write(self):
        for shard, samples in enumerate(self.get_shards()):
            with tf.io.TFRecordWriter(
                    f"Datasets/TIMIT-dataset/tfrec_data/train_{shard+1}.tfrec") as f:
                for sample in self.get_shard_data(samples, shard+1):
                    example = self.serialize_example(
                        sample['audio'], sample['binary_labels'], 
                        sample['framewise_labels'], sample['attention_mask'],
                        sample['filename'])
                    f.write(example)

# TFRWriter(args).write()

In [4]:
class TIMITDataset():
    def __init__(self, args):
        self.files = [os.path.join(args.main_dir, f) for f in os.listdir(args.main_dir)]
        self.args = args
        self.AUTOTUNE = tf.data.experimental.AUTOTUNE
        self.train_files, self.val_files = train_test_split(
            self.files, test_size=0.2, shuffle=True)

    def decode_audio(self, string):
        audio = tf.audio.decode_wav(string, desired_samples=self.args.max_samples)[0]
        return tf.squeeze(audio, axis=-1)

    def read_tfrecord(self, example):
        feature_description = {
            'audio': tf.io.FixedLenFeature([], tf.string),
            'binary_labels': tf.io.FixedLenFeature([], tf.string),
            'framewise_labels': tf.io.FixedLenFeature([], tf.string),
            'attention_mask': tf.io.FixedLenFeature([], tf.string)}
        
        example = tf.io.parse_single_example(example, feature_description)
        example['audio'] = self.decode_audio(example['audio'])
        example['binary_labels'] = tf.io.parse_tensor(
            example['binary_labels'], out_type=tf.int32)
        example['framewise_labels'] = tf.io.parse_tensor(
            example['framewise_labels'], out_type=tf.int32)
        example['attention_mask'] = tf.io.parse_tensor(
            example['attention_mask'], out_type=tf.bool)
        return example

    def load_dataset(self, files):
        ignore_order = tf.data.Options()
        ignore_order.experimental_deterministic = False
        dataset = tf.data.TFRecordDataset(files, num_parallel_reads=self.AUTOTUNE)
        dataset = dataset.with_options(ignore_order)
        dataset = dataset.map(self.read_tfrecord, num_parallel_calls=self.AUTOTUNE)
        return dataset
 
    def SpecAugment(self, sample, training):
        waveform = sample['audio'] / 32678
        if training == True:
            waveform = tfio.audio.fade(
                waveform, fade_in=1000, fade_out=2000, mode="logarithmic")
        spectrogram = tf.abs(tfio.audio.spectrogram(
            waveform, nfft=self.args.n_fft, window=self.args.window_size, 
            stride=self.args.hop_length))
        mel_spectrogram = tfio.audio.melscale(
            spectrogram, rate=self.args.sample_rate, mels=self.args.n_mels, 
            fmin=0, fmax=8000)
        mel_spectrogram = tfio.audio.dbscale(mel_spectrogram, top_db=80)
        if training == True:
            mel_spectrogram = tfio.audio.freq_mask(mel_spectrogram, param=7)
            mel_spectrogram = tfio.audio.time_mask(mel_spectrogram, param=10)
        inputs = {"spectrogram": mel_spectrogram, "mask": sample['attention_mask']}
        outputs = {"binary_labels": sample['binary_labels'], "framewise_labels": sample['framewise_labels']}
        return inputs, outputs

    def train(self):
        dataset = self.load_dataset(self.train_files)
        dataset = dataset.map(
            partial(self.SpecAugment, training=True), num_parallel_calls=self.AUTOTUNE)
        dataset = dataset.shuffle(self.args.buffer_size)
        dataset = dataset.batch(self.args.batch_size)
        dataset = dataset.prefetch(self.AUTOTUNE)
        return dataset

    def val(self):
        dataset = self.load_dataset(self.val_files)
        dataset = dataset.map(
            partial(self.SpecAugment, training=False), num_parallel_calls=self.AUTOTUNE)
        dataset = dataset.batch(self.args.batch_size)
        dataset = dataset.cache()
        dataset = dataset.prefetch(self.AUTOTUNE)
        return dataset

train = TIMITDataset(args).train()
val = TIMITDataset(args).val()

inputs, outputs = next(iter(train))
print("spectrogram shape:", inputs['spectrogram'].shape)
print("mask shape:", inputs['mask'].shape)
print("binary_labels shape:", outputs['binary_labels'].shape)
print("framewise_labels shape:", outputs['framewise_labels'].shape)

spectrogram shape: (8, 420, 32)
mask shape: (8, 420)
binary_labels shape: (8, 420)
framewise_labels shape: (8, 420)


In [9]:
class Segmentor(Model):
    def __init__(self, args):
        super(Segmentor, self).__init__(name="Segmentor")
        self.args = args
        self.rnn = self.rnn_block()
        self.scorer = self.scorer_block()
        self.classifier = self.classifier_block()
        self.bi_classifier = self.bi_classifier_block()

    def rnn_block(self):
        spectrogram = Input(
            shape=self.args.input_shape, dtype=tf.float32, name='spectrogram')
        mask = Input(
            shape=self.args.input_shape[0], dtype=tf.bool, name='mask') 
        inputs = [spectrogram, mask]

        x = Bidirectional(
                LSTM(self.args.units, return_sequences=True), name="layer_1")(
                    inputs=spectrogram, mask=mask)
        x = Bidirectional(
                LSTM(self.args.units, return_sequences=True), name="layer_2")(x)
        outputs = Bidirectional(
                LSTM(self.args.units, return_sequences=True), name="layer_3")(x)
        return Model(inputs=inputs, outputs=outputs, name="rnn_block")

    def scorer_block(self):
        return Sequential([
            PReLU(),
            Dense(100),
            PReLU(),
            Dense(1)], name="scorer")

    def classifier_block(self):
        return Sequential([
            PReLU(),
            Dense(self.args.n_classes * 2),
            PReLU(),
            Dense(self.args.n_classes)], name="classifier")

    def bi_classifier_block(self):
        return Sequential([
            PReLU(),
            Dense(self.args.n_classes * 2),
            PReLU(),
            Dense(2)], name="bi_classifier")

    def compute_phi(self, rnn_out):
        batch_size, (seq_len, feat_dim) = self.args.batch_size, rnn_out.shape[1:]

        rnn_cum = tf.math.cumsum(rnn_out, axis=1)
        output_shape = [batch_size, seq_len, seq_len, feat_dim]
        
        a = tf.repeat(rnn_cum, [1, seq_len, 1])
        b = tf.reshape(tf.repeat(rnn_cum, [1, 1, seq_len]), [batch_size, -1, feat_dim])
        c = tf.reshape(tf.math.subtract(a, b), output_shape)
        d = tf.reshape(tf.repeat(rnn_out, [1, 1, seq_len]), output_shape)
        e = tf.reshape(tf.repeat(rnn_out, [1, seq_len, 1]), output_shape)
        return tf.concat([c, d, e], axis=-1)

    def segment_search(self, scores, lengths):
        """
        Dynamic search algorithm
        """
        batch_size, seq_len = self.args.batch_size, scores.shape[1]

        best_scores = tf.zeros([batch_size, seq_len])
        segmentations =  [[tf.zeros([2,], dtype=tf.int32)] for _ in range(batch_size)]

        for i in range(1, seq_len):
            start_idx = max(0, i - self.args.max_seg_size)
            end_idx = i
            current_scores = tf.zeros([batch_size, end_idx - start_idx])

            for j in range(start_idx, end_idx):
                index = tf.constant([[k, (j - start_idx)] for k in range(batch_size)])
                update = (best_scores[:, j] + scores[:, j, i])
                tf.tensor_scatter_nd_update(current_scores, index, update)

            best_score, best_index = tf.math.top_k(current_scores, k=1)
            best_score = tf.squeeze(best_score, axis=1)
            best_index += start_idx
            best_indices = tf.constant([[m, i] for m in range(batch_size)])
            tf.tensor_scatter_nd_update(best_scores, best_indices, best_score)

            for n in range(batch_size):
                current_segment = tf.concat(
                    [best_index[n], tf.constant([i])], axis=-1)
                segmentations[n].append(current_segment)
        
        for i, segments in enumerate(segmentations):
            segments = tf.stack(segments, axis=1)
            print(segments)
        
        return batch_segments

    def compute_segmentation_score(self, scores, segments):
        out_scores = tf.zeros([scores.shape[0]])
        return out_scores

    def call(self, inputs):
        lengths = tf.math.reduce_sum(tf.cast(inputs[1], dtype=tf.int32), axis=1)    
        rnn_out = self.rnn(inputs)
        phi = self.compute_phi(rnn_out)
        scores = tf.squeeze(self.scorer(phi), axis=-1)
        segments = self.segment_search(scores, lengths)

        return {
            "classifier_out": self.classifier(rnn_out),
            "bi_classifier_out": self.bi_classifier(rnn_out),
            "segments": segments,
            "segmentation_scores": self.compute_segmentation_score(scores, segments)
        }

model = Segmentor(args)
model([Input([420, 32]), Input([420], dtype=tf.bool)])
model.summary()

Tensor("Segmentor/stack:0", shape=(2, 420), dtype=int32)
Tensor("Segmentor/stack_1:0", shape=(2, 420), dtype=int32)
Tensor("Segmentor/stack_2:0", shape=(2, 420), dtype=int32)
Tensor("Segmentor/stack_3:0", shape=(2, 420), dtype=int32)
Tensor("Segmentor/stack_4:0", shape=(2, 420), dtype=int32)
Tensor("Segmentor/stack_5:0", shape=(2, 420), dtype=int32)
Tensor("Segmentor/stack_6:0", shape=(2, 420), dtype=int32)
Tensor("Segmentor/stack_7:0", shape=(2, 420), dtype=int32)


NameError: in user code:

    <ipython-input-7-876ccf167df9>:102 call  *
        segments = self.segment_search(scores, lengths)
    <ipython-input-9-d38635c68e31>:91 segment_search  *
        return batch_segments

    NameError: name 'batch_segments' is not defined


In [None]:
preds = model.predict(val, steps=699, verbose=1)
preds

In [None]:
preds.keys()

In [None]:
preds['segments'].shape