In [None]:
import tensorflow as tf
import glob
import tensorflow_io as tfio

In [None]:
import glob
import os
import subprocess

# Download the UrbanSound8K dataset
if not os.path.exists("./data/urban_sound/"):
    os.makedirs("./data/urban_sound/")
    !kaggle datasets download chrisfilo/urbansound8k
    !mkdir -p data/urban_sound_raw
    !mv urbansound8k.zip data/urban_sound_raw/
    !unzip data/urban_sound_raw/urbansound8k.zip -d data/urban_sound_raw/
    !rm data/urban_sound_raw/

    cnt = 0
    sum = len(glob.glob("./data/urban_sound_raw/*/*"))
    for file in glob.glob("./data/urban_sound_raw/*/*"):
        fixed_file = "./data/urban_sound/" + file.split("/")[-1]
        os.system(f"ffmpeg -i {file} -acodec pcm_s16le -ar 16000 {fixed_file} > /dev/null 2>&1")
        cnt += 1
        if cnt % 100 == 0:
            print(f"Processed {cnt/sum} files")
    !rm -rf data/urban_sound_raw

In [None]:
# Sample-rate constant (44100). None of the visible cells read this global:
# the helpers defined below take their rates as explicit arguments.
# NOTE(review): presumably the native rate of the raw clips — confirm.
sampling_rate = 44100
def get_audio_from_wav_file(file):
    """Read a WAV file and return its samples without the channel axis.

    The file is decoded as a single channel (`desired_channels=1`) and the
    trailing channel axis is squeezed away. The sample rate reported by the
    decoder is discarded.

    Args:
        file: Path of the WAV file, as accepted by `tf.io.read_file`.

    Returns:
        The decoded audio tensor with its last axis removed.
    """
    wav_bytes = tf.io.read_file(file)
    samples, _ = tf.audio.decode_wav(wav_bytes, desired_channels=1)
    mono = tf.squeeze(samples, axis=-1)
    return mono

def adjust_frequncy(audio, new_sampling_rate=16000, old_sampling_rate=44100):
    """Resample `audio` from `old_sampling_rate` to `new_sampling_rate`.

    Args:
        audio: Audio tensor; it is cast to float32 before resampling.
        new_sampling_rate: Rate passed to the resampler as `rate_out`.
        old_sampling_rate: Rate passed to the resampler as `rate_in`.

    Returns:
        The output of `tfio.audio.resample` for the float32 input.
    """
    as_float = tf.cast(audio, tf.float32)
    return tfio.audio.resample(
        as_float, rate_in=old_sampling_rate, rate_out=new_sampling_rate
    )

In [None]:
def pad_and_align(audio, target_length=16000):
    """Crop or zero-pad a 1-D clip to exactly `target_length` samples.

    Clips longer than `target_length` are cut to a randomly positioned window
    of that length; shorter (or equal-length) clips get zeros appended at the
    end. The static shape is then pinned to `(target_length,)` on both paths.

    Args:
        audio: 1-D float32 tensor of samples (indexed along axis 0).
        target_length: Desired number of samples; must be a Python int because
            it is also used as the static shape.

    Returns:
        A tensor of shape `(target_length,)`.
    """
    current_length = tf.shape(audio)[0]
    if current_length > target_length:
        starting_idx = tf.random.uniform(
            shape=(),
            minval=0,
            # maxval is exclusive for integer dtypes; +1 keeps the last valid
            # window (the one ending at the final sample) reachable.
            maxval=current_length - target_length + 1,
            dtype=tf.int32
        )
        audio = audio[starting_idx: starting_idx + target_length]
    else:
        audio = tf.concat(
            [audio, tf.zeros([target_length - current_length], dtype=tf.float32)],
            axis=0
        )
    # Applied after both branches: inside a traced tf.data map neither branch
    # carries a static length on its own, and downstream ops need one.
    audio.set_shape((target_length,))
    return audio

In [None]:
def get_background_noise_dataset(datapath, size):
    """Build a shuffled dataset of fixed-length background-noise clips.

    The `*.wav` files directly under `datapath` are cycled through until
    `size` paths have been collected. Each file is then decoded as mono,
    resampled to 16000 Hz from the sample rate stored in its own header, and
    cropped or zero-padded to 16000 samples by `pad_and_align`.

    Args:
        datapath: Directory that contains the `.wav` files.
        size: Number of clips the dataset should yield; files repeat when
            fewer than `size` are available.

    Returns:
        A `tf.data.Dataset` of audio clips, shuffled with a buffer of 1024.

    Raises:
        FileNotFoundError: If `datapath` contains no `.wav` files.
    """
    noises = glob.glob(f"{datapath}/*.wav")
    if not noises:
        # Without this guard the modulo below fails with ZeroDivisionError.
        raise FileNotFoundError(f"No .wav files found in {datapath!r}")

    # If there are fewer files than requested, wrap around and reuse them.
    new_noises = [noises[i % len(noises)] for i in range(size)]
    files_ds = tf.data.Dataset.from_tensor_slices(new_noises)

    def load_and_resample(file):
        # Decode here rather than via get_audio_from_wav_file because the
        # file's real sample rate is needed: assuming 44100 would shrink and
        # pitch-shift clips that are already stored at 16000 Hz.
        audio, sample_rate = tf.audio.decode_wav(
            tf.io.read_file(file),
            desired_channels=1
        )
        audio = tf.squeeze(audio, axis=-1)
        return adjust_frequncy(audio, 16000, tf.cast(sample_rate, tf.int64))

    adjusted_audio_ds = files_ds.map(
        load_and_resample,
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    fixed_length_audio_ds = adjusted_audio_ds.map(
        lambda x: pad_and_align(x, 16000),
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    return fixed_length_audio_ds.shuffle(1024)


In [None]:
# Background-noise pipeline: 100 clips drawn from the converted
# UrbanSound8K directory.
noise_dataset = get_background_noise_dataset(
    datapath="data/urban_sound",
    size=100,
)

In [None]:
from dataset import get_datasets

In [None]:
# Fetch three datasets from the project's `dataset` module, requesting a
# batch size of 64.
# NOTE(review): presumably the train / validation / test splits, already
# batched — confirm in dataset.py.
train_ds, valid_ds, test_ds = get_datasets(batch_size=64)

In [4]:
# Show the dataset repr (including its element_spec) as the cell output.
train_ds

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 124, 39), dtype=tf.float32, name=None), TensorSpec(shape=(None, 31), dtype=tf.float32, name=None))>

In [3]:
# Inputs to the closed-form expression evaluated at the end of this cell.
# NOTE(review): the meaning of N, D and H is not stated in this notebook —
# presumably sequence length, model width and head count; confirm.
N, D, H = 41, 32, 4
layers = 2

# Same polynomial as 3*N*D*H + N**2*D*H + 4*N*D**2, with N*D factored out.
# NOTE(review): for the values above this evaluates to 2393088. The saved
# output 6801408 is what D = 64 produces, so the recorded output looks stale.
3 * layers * N * D * (3 * H + N * H + 4 * D)

6801408