# 1.方法总结

- Fast Fourier Transform(FFT)
- Audio stream data
- Audio classification
- `ffmpg`

# 2.Setup

In [1]:
import config
import os
import shutil
import numpy as np
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
from IPython.display import display, Audio
print(tf.__version__)

2.4.1


In [2]:
# Percentage of samples to use for validation
VALID_SPLIT = 0.1

# Seed to use when shuffling the dataset and the noise
SHUFFLE_SEED = 43

# The sampling rate to use.
# This is the one used in all of the audio samples.
# We will resample all of the noise to this sampling rate.
# This will also be the output size of the audio wave samples
# (since all samples are of 1 second long)
SAMPLING_RATE = 16000

# The factor to multiply the noise with according to:
#   noisy_sample = sample + noise * prop * scale
#      where prop = sample_amplitude / noise_amplitude
SCALE = 0.5

BATCH_SIZE = 128
EPOCHS = 100

# 3.数据加载、数据预处理

The dataset is composed of 7 folders, divided into 2 groups:

- Speech samples, with 5 folders for 5 different speakers. Each folder contains 1500 audio files, each 1 second long and sampled at 16000 Hz.
- Background noise samples, with 2 folders and a total of 6 files. These files are longer than 1 second (and originally not sampled at 16000 Hz, but we will resample them to 16000 Hz). We will use those 6 files to create 354 1-second-long noise samples to be used for training.

Let's sort these 2 categories into 2 folders:

- An audio folder which will contain all the per-speaker speech sample folders
- A noise folder which will contain all the noise samples

## 3.1 数据准备

### 3.1.1 数据分类

In [3]:
if os.path.exists(config.DATASET_AUDIO_PATH) is False:
    os.makedirs(config.DATASET_AUDIO_PATH)

if os.path.exists(config.DATASET_NOISE_PATH) is False:
    os.makedirs(config.DATASET_NOISE_PATH)

In [4]:
for folder in os.listdir(config.data_dir):
    if os.path.isdir(os.path.join(config.data_dir, folder)):
        if folder in [config.AUDIO_SUBFOLDER, config.NOISE_SUBFOLDER]:
            continue
        elif folder in ["other", "_background_noise_"]:
            shutil.move(
                os.path.join(config.data_dir, folder), 
                os.path.join(config.DATASET_NOISE_PATH, folder)
            )
        else:
            shutil.move(
                os.path.join(config.data_dir, folder), 
                os.path.join(config.DATASET_AUDIO_PATH, folder)
            )

### 3.1.2 噪音数据准备

In [5]:
noise_paths = []
for subdir in os.listdir(config.DATASET_NOISE_PATH):
    subdir_path = Path(config.DATASET_NOISE_PATH) / subdir
    if os.path.isdir(subdir_path):
        noise_paths += [
            os.path.join(subdir_path, filepath)
            for filepath in os.listdir(subdir_path)
            if filepath.endswith(".wav")
        ]

print(
    "Found {} files belonging to {} directories".format(
        len(noise_paths), len(os.listdir(config.DATASET_NOISE_PATH))
    )
)

Found 6 files belonging to 2 directories


In [6]:
command = (
    "for dir in `ls -1 " + config.DATASET_NOISE_PATH + "`; do "
    "for file in `ls -1 " + config.DATASET_NOISE_PATH + "/$dir/*.wav`; do "
    "sample_rate=`ffprobe -hide_banner -loglevel panic -show_streams "
    "$file | grep sample_rate | cut -f2 -d=`; "
    "if [ $sample_rate -ne 16000 ]; then "
    "ffmpeg -hide_banner -loglevel panic -y "
    "-i $file -ar 16000 temp.wav; "
    "mv temp.wav $file; "
    "fi; done; done"
)
os.system(command)

0

In [7]:
def load_noise_sample(path):
    """
     Split noise into chunks of 16000 each
    """
    sample, sampling_rate = tf.audio.decode_wav(
        tf.io.read_file(path), desired_channels=1
    )
    if sampling_rate == SAMPLING_RATE:
        # Number of slices of 16000 each that can be generated from the noise sample
        slices = int(sample.shape[0] / SAMPLING_RATE)
        sample = tf.split(sample[: slices * SAMPLING_RATE], slices)
        return sample
    else:
        print("Sampling rate for {} is incorrect. Ignoring it".format(path))
        return None

In [8]:
noises = []
for path in noise_paths:
    sample = load_noise_sample(path)
    if sample:
        noises.extend(sample)
noises = tf.stack(noises)
# print(f"{len(noise_paths)} noise files were split into {noises.shape[0]} noise samples where each is {noises.shape[1] // SAMPLING_RATE} sec. long")

Sampling rate for /Users/zfwang/project/machinelearning/computer_vision/speaker_recognition/data/16000_pcm_speeches/noise/other/pink_noise.wav is incorrect. Ignoring it
Sampling rate for /Users/zfwang/project/machinelearning/computer_vision/speaker_recognition/data/16000_pcm_speeches/noise/other/exercise_bike.wav is incorrect. Ignoring it
Sampling rate for /Users/zfwang/project/machinelearning/computer_vision/speaker_recognition/data/16000_pcm_speeches/noise/_background_noise_/dude_miaowing.wav is incorrect. Ignoring it
Sampling rate for /Users/zfwang/project/machinelearning/computer_vision/speaker_recognition/data/16000_pcm_speeches/noise/_background_noise_/doing_the_dishes.wav is incorrect. Ignoring it
Sampling rate for /Users/zfwang/project/machinelearning/computer_vision/speaker_recognition/data/16000_pcm_speeches/noise/_background_noise_/10convert.com_Audience-Claps_daSG5fwdA7o.wav is incorrect. Ignoring it
Sampling rate for /Users/zfwang/project/machinelearning/computer_vision/sp

## 3.14 数据集生成

In [28]:
def paths_and_labels_to_dataset(audio_paths, labels):
    """Constructs a dataset of audios and labels."""
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(lambda x: path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((audio_ds, label_ds))


def path_to_audio(path):
    """Reads and decodes an audio file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def add_noise(audio, noises=None, scale=0.5):
    if noises is not None:
        # Create a random tensor of the same size as audio ranging from
        # 0 to the number of noise stream samples that we have.
        tf_rnd = tf.random.uniform(
            (tf.shape(audio)[0],), 0, noises.shape[0], dtype=tf.int32
        )
        noise = tf.gather(noises, tf_rnd, axis=0)

        # Get the amplitude proportion between the audio and the noise
        prop = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(noise, axis=1)
        prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis=1)

        # Adding the rescaled noise to audio
        audio = audio + noise * prop * scale

    return audio


def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on the innermost dimension,
    # we need to squeeze the dimensions and then expand them again
    # after FFT
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    # Return the absolute value of the first half of the FFT
    # which represents the positive frequencies
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

In [31]:
# Get the list of audio file paths along with their corresponding labels

class_names = os.listdir(config.DATASET_AUDIO_PATH)
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []
for label, name in enumerate(class_names):
    print("Processing speaker {}".format(name,))
    dir_path = Path(config.DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

Our class names: ['Jens_Stoltenberg', 'Benjamin_Netanyau', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']
Processing speaker Jens_Stoltenberg
Processing speaker Benjamin_Netanyau
Processing speaker Julia_Gillard
Processing speaker Magaret_Tarcher
Processing speaker Nelson_Mandela
Found 7501 files belonging to 5 classes.


### Shuffle

In [32]:
# Shuffle
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

### 训练数据、验证数据分割

In [33]:
# Split into training and validation
num_val_samples = int(VALID_SPLIT * len(audio_paths))
print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
train_audio_paths = audio_paths[:-num_val_samples]
train_labels = labels[:-num_val_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[-num_val_samples:]
valid_labels = labels[-num_val_samples:]

# Create 2 datasets, one for training and the other for validation
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
    BATCH_SIZE
)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)

Using 6751 files for training.
Using 750 files for validation.


### 噪音数据加入训练集

In [34]:
# Add noise to the training set
train_ds = train_ds.map(
    lambda x, y: (add_noise(x, noises, scale=SCALE), y),
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)

ValueError: in user code:

    <ipython-input-34-11c36dd92a78>:3 None  *
        lambda x, y: (add_noise(x, noises, scale=SCALE), y),
    <ipython-input-22-cf597efba746>:26 add_noise  *
        prop = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(noise, axis=1)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:2746 reduce_max
        _ReductionDims(input_tensor, axis))
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:2757 reduce_max_with_dims
        gen_math_ops._max(input_tensor, dims, keepdims, name=name))
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/ops/gen_math_ops.py:5628 _max
        name=name)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:750 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py:592 _create_op_internal
        compute_device)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:3536 _create_op_internal
        op_def=op_def)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:2016 __init__
        control_input_ops, op_def)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1856 _create_c_op
        raise ValueError(str(e))

    ValueError: Invalid reduction dimension 1 for input with 1 dimensions. for '{{node Max_1}} = Max[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](GatherV2, Max_1/reduction_indices)' with input shapes: [?], [] and with computed input tensors: input[1] = <1>.


### 训练集、验证集快速傅里叶变换(FFT)

In [27]:
train_ds = train_ds.map(lambda x, y: (audio_to_fft(x), y), num_parallel_calls = tf.data.experimental.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)


valid_ds = valid_ds.map(lambda x, y: (audio_to_fft(x), y), num_parallel_calls = tf.data.experimental.AUTOTUNE)
valid_ds = valid_ds.prefetch(tf.data.experimental.AUTOTUNE)

# 4.模型构建

## 4.1 模型构建

In [35]:
def residual_block(x, filters, conv_num = 3, activation = "relu"):
    s = keras.layers.Conv1D(filters, 1, padding = "same")(x)
    for i in range(conv_num - 1):
        x = keras.layers.Conv1D(filters, 3, padding = "same")(x)
        x = keras.layers.Activation(activation)(x)
    x = keras.layers.Conv1D(filters, 3, padding = "same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation(activation)(x)
    return keras.layers.MaxPool1D(pool_size = 2, strides = 2)(x)


def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape = input_shape, name = "input")
    
    x = residual_block(inputs, 16, 2)
    x = residual_block(x, 32, 2)
    x = residual_block(x, 64, 3)
    x = residual_block(x, 128, 3)
    x = residual_block(x, 128, 3)
    
    x = keras.layers.AveragePooling1D(pool_size = 3, strides = 3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation = "relu")(x)
    x = keras.layers.Dense(128, activation = "relu")(x)
    
    outputs = keras.layers.Dense(num_classes, activation = "softmax", name = "output")(x)
    return keras.models.Model(inputs = inputs, outputs = outputs)


model = build_model((SAMPLING_RATE // 2, 1), len(class_names))
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 8000, 1)]    0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 8000, 16)     64          input[0][0]                      
__________________________________________________________________________________________________
activation (Activation)         (None, 8000, 16)     0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 8000, 16)     784         activation[0][0]                 
______________________________________________________________________________________________

## 4.2 模型编译、训练

In [37]:
model.compile(optimizer = "Adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])
callbacks = [
    keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True),
    keras.callbacks.ModelCheckpoint("model.h5", monitor = "val_accuracy", save_best_only = True)
]
history = model.fit(
    train_ds,
    epochs = EPOCHS,
    validation_data = valid_ds,
    callbacks = callbacks,
)

Epoch 1/100


ValueError: in user code:

    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:754 train_step
        y_pred = self(x, training=True)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /Users/zfwang/.pyenv/versions/3.7.10/envs/tf2/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:274 assert_input_compatibility
        ', found shape=' + display_shape(x.shape))

    ValueError: Input 0 is incompatible with layer model: expected shape=(None, 8000, 1), found shape=(None, 16000, 1)


## 4.3 模型评估

In [None]:
print(model.evaluate(valid_ds))

# 5.模型使用示例

In [None]:
SAMPLES_TO_DISPLAY = 10
test_ds = paths_and_labels_to_dataset(validation_audio_paths, valid_labels)
test_ds = test_ds.shuffle(buffer_size = BATCH_SIZE * 8, seed = SHUFFLE_SEED).batch(BATCH_SIZE)
test_ds = test_ds.map(lambda x, y: (add_noise(x, noises, scale = SCALE), y))

for audios, labels in test_ds.take(1):
    # singal FFT
    ffts = audio_to_fft(audios)
    # predict
    y_pred = model.predict(ffts)
    # take random samples
    rnd = np.random.randint(0, BATCH_SIZE, SAMPLES_TO_DISPLAY)
    audios = audios.numpy()[rnd, :, :]
    labels = labels.numpy()[rnd]
    y_pred = np.argmax(y_pred, axis = -1)[rnd]
    for index in range(SAMPLES_TO_DISPLAY):
        print(f"Speaker: {class_names[labels[index]]} - Predicted: {class_names[y_pred[index]]}")
        display(Audio(audios[index, :, :].squeeze(), rate = SAMPLING_RATE))