In [20]:
# Make the Google Drive contents available inside the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
cd 'drive/MyDrive/Uni/UniPD/HumanDataProject/Code'

[Errno 2] No such file or directory: 'drive/MyDrive/Uni/UniPD/HumanDataProject/Code'
/content/drive/MyDrive/Uni/UniPD/HumanDataProject/Code


In [22]:
import sys
import pandas as pd
import os
import numpy as np
from config import PREPROCESSING_PATH,DATASET_SPLIT_PATH
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from scipy.io import wavfile


from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [23]:
!pip install tensorflow-io



In [24]:
# Make the project modules importable regardless of the current working
# directory. The path is absolute because a relative entry stops resolving as
# soon as the working directory changes (an earlier cell `cd`s into this folder).
CODE_DIR = "/content/drive/MyDrive/Uni/UniPD/HumanDataProject/Code"
if CODE_DIR not in sys.path:  # keep the cell idempotent on re-run
    sys.path.append(CODE_DIR)

In [25]:
import preprocessing_tf

In [26]:
# Report whether TensorFlow can see a GPU and which TensorFlow version is in use.
# `tf.test.is_gpu_available()` is deprecated; listing the physical GPU devices
# is the supported way to check for an accelerator.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU Available:", bool(gpu_devices))
print("Version:", tf.__version__)

GPU Available: False
Version: 2.15.0


# Create train and validation dataset

Construct dataframes that include the file path and the corresponding spoken command (label) for each audio sample. The dataset comprises audio samples of 25 keywords: `backward`, `down`, `eight`, `five`, `follow`, `forward`, `four`, `go`, `learn`, `left`, `nine`, `no`, `off`, `on`, `one`, `right`, `seven`, `six`, `stop`, `three`, `two`, `up`, `visual`, `yes`, `zero`. Additionally, it contains 10 words (`bed`, `bird`, `cat`, `dog`, `happy`, `house`, `marvin`, `sheila`, `tree`, `wow`) that the model should not recognize as keywords.

To facilitate model training and evaluation, the labels are appropriately mapped: labels corresponding to the 25 keywords are retained in their original form, signifying that these are the commands the model is expected to recognize. Conversely, the labels for the 10 non-keyword words are mapped to a single class named "unknown". This approach consolidates these distinct non-keyword labels into a single category, simplifying the model's task by reducing the classification scope to the keywords and an "unknown" class for any non-keyword utterances.

In [27]:
# Build one dataframe per split from the files found under that split's folder.
train_dir = os.path.join(DATASET_SPLIT_PATH, "train")
validation_dir = os.path.join(DATASET_SPLIT_PATH, "validation")

train_df = preprocessing_tf.get_file_list(train_dir)
val_df = preprocessing_tf.get_file_list(validation_dir)

In [28]:
# Preview the first rows of the training dataframe (file path, original label
# and mapped label per audio file).
train_df.head()

Unnamed: 0,filepath,label,mapped_label
0,/content/drive/MyDrive/Uni/UniPD/HumanDataProj...,backward,backward
1,/content/drive/MyDrive/Uni/UniPD/HumanDataProj...,backward,backward
2,/content/drive/MyDrive/Uni/UniPD/HumanDataProj...,backward,backward
3,/content/drive/MyDrive/Uni/UniPD/HumanDataProj...,backward,backward
4,/content/drive/MyDrive/Uni/UniPD/HumanDataProj...,backward,backward


In [29]:
# Convert the training file paths and their mapped labels into tensors.
file_paths, labels = (
    tf.constant(train_df[column].values)
    for column in ('filepath', 'mapped_label')
)

In [30]:
# Create a StringLookup layer that maps label strings to integer class ids.
# With the default arguments the vocabulary reserves one extra slot for
# out-of-vocabulary labels, on top of the labels seen during `adapt`.
# Alternative without that extra slot (currently disabled):
#label_lookup = label_lookup = tf.keras.layers.StringLookup(num_oov_indices=0)
label_lookup = tf.keras.layers.StringLookup()
# Learn the vocabulary from the training labels only.
label_lookup.adapt(labels)
# Transform the string labels into integer class ids
numeric_labels = label_lookup(labels)

# Create a TensorFlow dataset of (file path, class id) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((file_paths, numeric_labels))

In [31]:
# Build the validation dataset of (file path, class id) pairs, encoding the
# labels with the lookup layer fitted on the training labels.
file_paths_val, labels_val = (
    tf.constant(val_df[column].values)
    for column in ('filepath', 'mapped_label')
)
numeric_labels_val = label_lookup(labels_val)
validation_dataset = tf.data.Dataset.from_tensor_slices(
    (file_paths_val, numeric_labels_val)
)

# Preprocessing the Datasets

The training and validation datasets undergo preprocessing through our established pipeline. For the baseline model, the preprocessing involves two steps: padding the data to ensure uniformity in size, which is essential for the model's input requirements, and converting the audio files into spectrograms.

In [32]:
# Preprocess every audio file with the pipeline's default settings.
# `preprocess_map_new` also accepts keyword options such as `resample=True`
# and `mfcc=True`; they are not used for the baseline.

# The file list appears to be ordered by label (see `train_df.head()`), and a
# shuffle buffer smaller than the dataset only mixes neighbouring elements.
# Shuffling the lightweight (file path, label) pairs with a buffer covering
# the whole dataset *before* decoding mixes the classes globally at almost no
# memory cost. `cache()` then stores that well-mixed order, and the second
# shuffle re-orders the cached spectrograms at every epoch.
train_spectrogram_ds = (
    train_dataset
    .shuffle(train_dataset.cardinality(), reshuffle_each_iteration=False)
    .map(lambda fp, lbl: preprocessing_tf.preprocess_map_new(fp, lbl),
         num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .shuffle(10000)
    .prefetch(tf.data.AUTOTUNE)
)

# The validation set is evaluated after every epoch: cache the decoded
# spectrograms so the audio files are read and transformed only once.
val_spectrogram_ds = (
    validation_dataset
    .map(lambda fp, lbl: preprocessing_tf.preprocess_map_new(fp, lbl),
         num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

Data shape: (None,)
Padding shape: (16000,)
Signal shape: (16000,)
Spectrogram shape: (124, 129, 1)
Final shape: (124, 129, 1)
Data shape: (None,)
Padding shape: (16000,)
Signal shape: (16000,)
Spectrogram shape: (124, 129, 1)
Final shape: (124, 129, 1)


In [33]:
# Group the spectrograms into mini-batches.
# NOTE: this cell rebinds the dataset names, so running it twice would batch
# the already batched datasets again; re-run the preprocessing cell first.
batch_size = 32
train_spectrogram_ds, val_spectrogram_ds = (
    split_ds.batch(batch_size)
    for split_ds in (train_spectrogram_ds, val_spectrogram_ds)
)

In [34]:
# Shape of a single spectrogram: the element spec of the batched dataset
# without its leading batch axis.
spectrogram_spec = train_spectrogram_ds.element_spec[0]
input_shape = spectrogram_spec.shape[1:]
print('Input shape:', input_shape)

# Number of output classes = size of the label lookup vocabulary.
num_labels = len(label_lookup.get_vocabulary())
print('Number of labels:', num_labels)

Input shape: (124, 129, 1)
Number of labels: 27


# Model

The baseline model adopted for our analysis originates from the [TensorFlow simple audio recognition tutorial](https://www.tensorflow.org/tutorials/audio/simple_audio), which was designed for a mini version of the speech dataset. This model serves primarily as a benchmark for comparison purposes. Our objective is to demonstrate that subsequent models, which incorporate more intricate architectures or employ additional preprocessing steps, surpass the performance of this baseline model.

In [None]:
# Normalization layer whose statistics are fitted on the training
# spectrograms; the labels are dropped before calling `adapt`.
norm_layer = layers.Normalization()
spectrograms_only = train_spectrogram_ds.map(map_func=lambda spec, label: spec)
norm_layer.adapt(data=spectrograms_only)

# Baseline CNN: downsample -> normalize -> two convolutions -> pooling ->
# dense classifier producing one raw score (logit) per label.
baseline_layers = [
    layers.Input(shape=input_shape),
    layers.Resizing(32, 32),            # downsample the input
    norm_layer,                         # normalize with the fitted statistics
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),           # no activation: outputs are logits
]
model = models.Sequential(baseline_layers)

model.summary()

In [None]:
# Configure training. Labels are integer class ids and the last Dense layer
# has no activation, hence sparse categorical cross-entropy on logits.
# The debugging flag `run_eagerly=True` was removed: it disables graph
# compilation of the train/test steps and makes training considerably slower.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for at most EPOCHS epochs; stop early once the monitored validation
# metric has not improved for two consecutive epochs.
EPOCHS = 8
early_stopping = tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)

history = model.fit(
    train_spectrogram_ds,
    validation_data=val_spectrogram_ds,
    epochs=EPOCHS,
    callbacks=[early_stopping],
)

# Evaluation
The model's performance on our test dataset is assessed using the top-1 error rate, which is the same metric employed by the dataset's creator.

In [None]:
# Build the test dataset of (file path, class id) pairs, using the lookup
# layer fitted on the training labels to encode the test labels.
test_dir = os.path.join(DATASET_SPLIT_PATH, "test")
test_df = preprocessing_tf.get_file_list(test_dir)

file_paths_test, labels_test = (
    tf.constant(test_df[column].values)
    for column in ('filepath', 'mapped_label')
)
numeric_labels_test = label_lookup(labels_test)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (file_paths_test, numeric_labels_test)
)


In [None]:
# Apply the same preprocessing as for training/validation and batch the result.
# Batching is required here: the model expects inputs with a leading batch
# axis, and the evaluation cell concatenates per-batch label vectors (scalar
# labels from an unbatched dataset cannot be concatenated).
test_spectrogram_ds = (
    test_dataset
    .map(lambda fp, lbl: preprocessing_tf.preprocess_map_new(fp, lbl),
         num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size)
)

In [None]:
# Predict a score vector per test sample; the arg-max is the predicted class id.
predictions = model.predict(test_spectrogram_ds)
predicted_classes = predictions.argmax(axis=1)

# Collect the ground-truth class ids in the same order as the predictions.
true_labels = np.concatenate(
    [batch_labels for _, batch_labels in test_spectrogram_ds],
    axis=0,
)

# Accuracy = fraction of samples whose predicted class matches the true class.
is_correct = predicted_classes == true_labels
accuracy = np.mean(is_correct)
print(f"Manual Test Accuracy: {accuracy}")

# Calculate the Top-1 Error Rate (complement of the accuracy)
top_1_error_rate = 1 - accuracy
print(f"Manual Top-1 Error Rate: {top_1_error_rate}")