In [1]:
# Import Python packages
import os
import random

import librosa
import numpy as np
import python_speech_features

In [2]:
# Settings
# Share of the (possibly shortened) file list reserved for the test and validation splits
test_ratio = 0.1
validation_ratio = 0.1
# Despite the name this is used as a fraction, not a percentage:
# the file list is cut to int(len(files) * percent_of_files_to_keep), so 1 keeps every file
percent_of_files_to_keep = 1

In [3]:
# Resolve the dataset folder (named relative to the current working directory) to an absolute path
dataset_dir_name = 'speech_commands_v0.02'
dataset_path = os.path.abspath(dataset_dir_name)

In [4]:
# Create a sorted list of all of the targets in the dataset: one target per
# sub-directory of dataset_path, except for the background noise folder.
# The folder is excluded by filtering rather than list.remove(), so a dataset
# copy that does not contain '_background_noise_' does not raise ValueError.
targets = sorted(
    entry
    for entry in os.listdir(dataset_path)
    if entry != '_background_noise_' and os.path.isdir(os.path.join(dataset_path, entry))
)

In [5]:
# Create a list of files and a corresponding list of labels, then shuffle both together.
# Each label is the index of the file's target in `targets`, stored as a float
# (np.ones(...) * index), which is why later code converts it with int().
shuffle_seed = 42  # fixed seed so the train/validation/test split is identical on every run

files = []
labels = []

for index, target in enumerate(targets):
    # os.listdir returns entries in arbitrary order; sorting gives the seeded
    # shuffle below the same starting sequence on every run
    target_files = sorted(os.listdir(os.path.join(dataset_path, target)))
    files.extend(target_files)
    labels.extend(np.ones(len(target_files)) * index)

# Shuffle (file, label) pairs as a unit so every file keeps its own label.
# A dedicated Random instance is used so the global `random` state is left untouched.
files_and_labels = list(zip(files, labels))
random.Random(shuffle_seed).shuffle(files_and_labels)
files, labels = zip(*files_and_labels)

In [6]:
# Shorten dataset (if percent_of_files_to_keep != 1)
# files and labels are parallel sequences, so both are cut at the same index
# to keep them the same length.
# NOTE: this cell reassigns files/labels, so running it again without re-running
# the cell that builds them shortens the lists a second time.
files_to_keep = int(len(files) * percent_of_files_to_keep)
files = files[:files_to_keep]
labels = labels[:files_to_keep]

In [7]:
# Work out how many files go to the validation and test sets
# (int() truncates, so each size is a whole number of files)
total_file_count = len(files)
validation_set_size = int(total_file_count * validation_ratio)
test_set_size = int(total_file_count * test_ratio)

In [8]:
# Carve the file list into consecutive slices:
# validation first, then test, then train (everything that remains)
files_holdout_end = validation_set_size + test_set_size
files_validation = files[:validation_set_size]
files_test = files[validation_set_size:files_holdout_end]
files_train = files[files_holdout_end:]

In [9]:
# Carve the label list at the same boundaries used for the file list,
# so each split's labels line up with its files
labels_holdout_end = validation_set_size + test_set_size
labels_validation = labels[:validation_set_size]
labels_test = labels[validation_set_size:labels_holdout_end]
labels_train = labels[labels_holdout_end:]

In [10]:
# Function to compute MFCC features from a single audio file
def compute_mfcc(file_path):
    """Load one audio file and return its transposed MFCC matrix.

    The audio is read with librosa.load(sr=8000) and handed to
    python_speech_features' mfcc() using a 0.256 s window (2048 samples at
    8000 Hz, equal to nfft), a 0.050 s step, 16 cepstral coefficients,
    26 filters, no pre-emphasis, no liftering, appendEnergy disabled and a
    Hanning window.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The array returned by mfcc(), transposed. Assuming the library's
        documented (frames, numcep) layout, the result is (16, n_frames)
        -- TODO confirm against python_speech_features.
    """
    samples, sample_rate = librosa.load(file_path, sr=8000)

    # Fixed extraction settings, gathered in one place for readability
    mfcc_settings = dict(
        winlen=0.256,
        winstep=0.050,
        numcep=16,
        nfilt=26,
        nfft=2048,
        preemph=0.0,
        ceplifter=0,
        appendEnergy=False,
        winfunc=np.hanning,
    )
    untransposed = python_speech_features.base.mfcc(
        signal=samples, samplerate=sample_rate, **mfcc_settings
    )
    return untransposed.T

In [11]:
# Function to compute MFCC features for a list of audio files, dropping files with an unexpected feature shape
def compute_mfccs(files, labels):
    """Compute MFCC features for each file and keep only the expected-size ones.

    Each file's path is built from the module-level `dataset_path`, the
    target name looked up as targets[int(label)], and the file name.

    Args:
        files: Sequence of audio file names.
        labels: Sequence indexable by the same positions as `files`; each
            entry is converted with int() and used as an index into `targets`.

    Returns:
        A pair of lists (features, labels) holding, in input order, the
        feature matrix and label of every file whose matrix has a second
        dimension of exactly 16.
    """
    kept_features = []
    kept_labels = []

    for position, file_name in enumerate(files):
        label = labels[position]
        clip_path = os.path.join(dataset_path, targets[int(label)], file_name)

        features = compute_mfcc(clip_path)

        # Drop results whose second dimension is not 16. After the transpose in
        # compute_mfcc this is presumably the frame count, i.e. clips that are
        # not the full expected length get skipped -- TODO confirm.
        if features.shape[1] != 16:
            continue

        kept_features.append(features)
        kept_labels.append(label)

    return kept_features, kept_labels

In [12]:
# Run feature extraction once per split; each call returns the kept
# feature matrices and their matching labels
x_train, y_train = compute_mfccs(files=files_train, labels=labels_train)
x_validation, y_validation = compute_mfccs(files=files_validation, labels=labels_validation)
x_test, y_test = compute_mfccs(files=files_test, labels=labels_test)

In [13]:
# Write every feature and label collection into a single .npz archive,
# stored under the same name as the variable that holds it
arrays_to_save = {
    'x_train': x_train,
    'y_train': y_train,
    'x_validation': x_validation,
    'y_validation': y_validation,
    'x_test': x_test,
    'y_test': y_test,
}
np.savez('mfcc_features.npz', **arrays_to_save)