# Name: Tejaswa Singh
# Email: tsingh2_be21@thapar.edu
# Roll No: 102103691
# Group: 4CO24

Question:

Consider the paper: <https://arxiv.org/abs/1804.03209>

  1. Read and summarise the paper in about 50 words.
  2. Download the dataset in the paper, statistically analyse and
     describe it, so that it may be useful for posterity. (Include code
     snippets in your .ipynb file to evidence your analysis.)
  3. Train a classifier so that you are able to distinguish the commands
     in the dataset.
  4. Report the performance results using standard benchmarks.
  5. Record about 30 samples of each command in your voice and create a
     new dataset (including a new user id for yourself).  You may use a
     timer on your computer to synchronise.
  6. Fine-tune your classifier to perform on your voice.
  7. Report the results.

Solution:

In [None]:
import os
import tarfile
import urllib.request

In [None]:
# Download the dataset archive, but only when it is not already on disk, so a
# "Restart & Run All" does not fetch the large file again.
data_url = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
filename = 'speech_commands_v0.02.tar.gz'
if not os.path.exists(filename):
    # Download under a temporary name and rename on success: an interrupted
    # transfer then never leaves a truncated archive at the final path.
    partial_filename = filename + '.part'
    urllib.request.urlretrieve(data_url, partial_filename)
    os.replace(partial_filename, filename)

('speech_commands_v0.02.tar.gz', <http.client.HTTPMessage at 0x7932e439c850>)

In [None]:
# Extract the dataset (skipped when the target directory already exists).
# The `with` block closes the archive even if extraction raises part-way.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on archives obtained from a trusted source.
if not os.path.exists('./speech_commands'):
    with tarfile.open(filename, 'r:gz') as tar:
        tar.extractall(path='./speech_commands')

In [None]:
# Root folder of the extracted dataset; print its entries as a sanity check
data_path = './speech_commands'
extracted_entries = os.listdir(data_path)
print(extracted_entries)

['right', 'eight', 'two', 'on', 'dog', 'bed', 'no', 'nine', 'cat', 'one', 'up', 'five', 'backward', 'left', 'learn', 'marvin', 'README.md', 'go', 'follow', 'tree', 'off', 'validation_list.txt', 'testing_list.txt', 'stop', 'zero', 'six', 'visual', '.DS_Store', 'down', 'forward', 'LICENSE', 'happy', 'house', 'three', '_background_noise_', 'sheila', 'wow', 'seven', 'four', 'yes', 'bird']


In [None]:
import librosa
import numpy as np
import glob

In [None]:
def load_audio_file(file_path, sr=16000, n_mels=40, n_mfcc=13, fmax=8000):
    """Load an audio clip and return its MFCC feature matrix.

    The clip is resampled to `sr` on load, turned into a Mel power
    spectrogram, converted to decibels and finally reduced to MFCCs.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.
    sr : int, default 16000
        Sample rate (Hz) the audio is resampled to when loaded.
    n_mels : int, default 40
        Number of Mel bands in the intermediate spectrogram.
    n_mfcc : int, default 13
        Number of MFCC coefficients to keep (should not exceed `n_mels`).
    fmax : float, default 8000
        Highest frequency (Hz) covered by the Mel filter bank. The default is
        half of the default sample rate; adjust it when changing `sr`.

    Returns
    -------
    The value returned by `librosa.feature.mfcc`: a 2-D array with the
    coefficients along axis 0 and the time frames along axis 1.
    """
    # Load (and resample) the waveform; librosa also returns the effective rate
    signal, sr = librosa.load(file_path, sr=sr)

    # Compute Mel-spectrogram (power)
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels, fmax=fmax)

    # MFCCs are computed from the log-power (dB) Mel-spectrogram
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel_spectrogram), sr=sr, n_mfcc=n_mfcc)

    return mfcc

In [None]:
def pad_features(mfcc, max_length=44):
    """Force the time axis (axis 1) of an MFCC matrix to `max_length` frames.

    Longer inputs are cut after `max_length` frames, shorter inputs are
    zero-padded on the right, and inputs that already have the requested
    length are returned unchanged.
    """
    n_frames = mfcc.shape[1]

    # Already the right size: hand back the input as-is
    if n_frames == max_length:
        return mfcc

    # Too long: keep only the leading frames
    if n_frames > max_length:
        return mfcc[:, :max_length]

    # Too short: append zero-valued frames at the end of the time axis
    missing_frames = max_length - n_frames
    return np.pad(mfcc, ((0, 0), (0, missing_frames)), mode='constant')

In [None]:
# Collect every .wav path below the dataset root (recursive search).
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the sample order -- and therefore any seeded split made from it -- reproducible.
audio_files = sorted(glob.glob(f'{data_path}/**/*.wav', recursive=True))

In [None]:
# Containers filled during MFCC extraction: one feature matrix and one label per clip
data, labels = [], []

# Folder names accepted as class labels, assembled from smaller word groups
_command_words = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
_digit_words = ['zero', 'one', 'two', 'three', 'four',
                'five', 'six', 'seven', 'eight', 'nine']
_other_words = ['bed', 'bird', 'cat', 'dog', 'happy',
                'house', 'marvin', 'sheila', 'tree', 'wow']
_extra_words = ['forward', 'backward', 'follow', 'learn', 'visual']

valid_labels = _command_words + _digit_words + _other_words + _extra_words

In [None]:
# Start from empty lists so that re-running this cell never appends duplicate
# samples (and still works after `data`/`labels` were turned into arrays).
data, labels = [], []
label_set = set(valid_labels)  # constant-time membership checks inside the loop

for file in audio_files:
    # The label is the name of the folder that directly contains the clip.
    # os.path copes with both '/' and '\\' separators, unlike split('/').
    label = os.path.basename(os.path.dirname(file))
    if label in label_set:
        mfcc = load_audio_file(file)
        padded_mfcc = pad_features(mfcc)  # Pad or truncate to a consistent length
        data.append(padded_mfcc)
        labels.append(label)

In [None]:
# Convert to numpy arrays (asarray is a no-op when the cell is re-run on arrays)
data = np.asarray(data)
labels = np.asarray(labels)

# Add a trailing channel axis so the data fits the model input
# (batch_size, height, width, channels). The ndim guard keeps the cell
# idempotent: re-running it does not stack a second channel axis.
if data.ndim == 3:
    data = np.expand_dims(data, axis=-1)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, models

In [None]:
# Shape of one sample: 13 x 44 feature map with a single channel
input_shape = (13, 44, 1)

# Spatial size the feature map is resized to before entering ResNet50
resnet_input_size = (32, 32)

# Symbolic model input followed by the resizing step
inputs = layers.Input(shape=input_shape)
x = layers.Resizing(*resnet_input_size)(inputs)

# ResNet50 backbone without pretrained weights and without its classification top
base_model = ResNet50(
    weights=None,
    include_top=False,
    input_shape=(*resnet_input_size, 1),
)

In [None]:
# Build the full model: ResNet50 backbone -> global pooling -> dense head.
# The backbone is created with weights=None and trained from scratch, so it
# must not be pinned to inference mode: with training=False its
# BatchNormalization layers would keep normalising with their never-updated
# initial moving statistics during fit(). Leaving `training` unset lets Keras
# pass the right mode (training in fit(), inference in evaluate()/predict()).
x = base_model(x)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.5)(x)  # regularise the dense head
# One softmax output per accepted label
outputs = layers.Dense(len(valid_labels), activation='softmax')(x)

model = models.Model(inputs, outputs)

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy
# (labels are integer class ids) and accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Map the string labels to integer class ids: learn the vocabulary first,
# then encode every label with it
label_encoder = LabelEncoder()
label_encoder.fit(labels)
encoded_labels = label_encoder.transform(labels)

In [None]:
# Split the dataset into training and validation sets (80% / 20%).
# stratify keeps each class's share the same in both subsets, which matters
# if the classes are not equally sized.
# NOTE(review): a random split can place clips of one speaker in both subsets;
# the extracted dataset ships validation_list.txt / testing_list.txt, which
# presumably define the official split -- confirm and consider using them.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)

In [None]:
# Fit for 10 epochs with mini-batches of 32 samples, evaluating on the
# validation split after every epoch; the returned History object is kept
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
1694/2646 ━━━━━━━━━━━━━━━━━━━━ 33:16 2s/step - accuracy: 0.0432 - loss: 3.6822

In [None]:
# Persist the trained model to an HDF5 file in the working directory
saved_model_path = 'speech_command_model.h5'
model.save(saved_model_path)

In [None]:
# google.colab only exists inside Google Colab; guard the import so the
# notebook still runs top-to-bottom in any other Jupyter environment.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Download the model to your local machine
    files.download('speech_command_model.h5')
else:
    print("Not running on Google Colab; the model stays at 'speech_command_model.h5'.")

In [None]:
from tensorflow.keras.models import load_model

# Restore the model from the HDF5 file written earlier
saved_model_path = 'speech_command_model.h5'
model = load_model(saved_model_path)