Author: Matthew Kinsley

Class: CIS663

Date: 2021-02-25


# Overview

This file will build on the code and teaching by Valerio Velardo in which he provide a bare bones and simple example code for MFCC feature extraction using librosa.

His github post can be found at.
https://github.com/musikalkemist/Deep-Learning-Audio-Application-From-Design-to-Deployment/tree/master/3-%20Implementing%20a%20Speech%20Recognition%20System%20in%20TensorFlow%202

We have adapted his code to work with our dataset.

# Global Imports
The following contains the global imports for the notebook.

In [16]:
import os
import json
import wave
import shutil
import librosa
import collections
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.model_selection import train_test_split

# Global Constants
The following code contains the constants that are used throughout the file.

In [2]:
DATASET_PATH = "../../data/wav48"
WORK_PATH = '../../data/working'    # Working folder to use for blocks.
JSON_PATH = "."
EXP_SUBJECTS = 10                   # The number of subjects to process so we don't 
                                    # test on all which would take a long time.

# Shared Methods
Teh follwing code block defines shared methods that can be used throughout this notebook.

In [12]:
# ------------------------------------------------------------------------------------------------
# Description:
#    This function finds the next file in the folder that hasn't been processed
# yet.
#
# Inputs:
#    data_path - The path where files are stored
#    idx - The index of the suspected next file
#
# Output:
#    is_good - True if a file was found, otherwise False
#    file_path - The full file name and path string.
#    idx - The suspected idx of the next file
# ------------------------------------------------------------------------------------------------
def get_next_file_name(data_path, idx):
    is_good = False
    file_path = data_path + '_' + str(idx).zfill(3) + '.wav'

    # Give it 3 attempts to find a good file because some file ids are missing
    x = 0
    while not Path(file_path).exists() and x < 3:
        idx = idx+1
        file_path = data_path + '_' + str(idx).zfill(3) + '.wav'
        x = x+1

    # If we found a file make sure is_good is true
    if Path(file_path).exists():
        is_good = True

    return is_good, file_path, idx+1

In [25]:
# ------------------------------------------------------------------------------------------------
# Description:
#    This function loads the original waveforms and then blocks them into the  
# specified number of blocks eccach of a specified length.
#
# Paramaters:
#    data_path - The path where files are stored
#    working_path - The path were the working files are created.
#    blocks - The number of files to create for each subject.
#    len - The number of seconds each files should be.
#    max_subj - Specifies the maximum number of subjects to process
#
# Outputs:
#    DataFrame containing the subject ID and file name for each file created.
# ------------------------------------------------------------------------------------------------
def prep_wave_files(data_path, working_path, len_sec, max_subj=-1):
    # Define named tuple
    wparams = collections.namedtuple('WParams', 'nchannels sampwidth, framerate, nframes, comptype, compname')

    # Delete and files in the working path that may be left over
    try:
        shutil.rmtree(working_path);
    except:
        print('No working path')
    os.mkdir(working_path)
    
    # Get a listing of all dir entries in the datapath    
    entries = os.scandir(data_path)

    # Each directory represents a subject.  We need to load those files an build
    # the necessary amount of 60 second files.  
    subj = 0
    for entry in entries:
        if subj < max_subj or max_subj==-1:
            subj = subj+1
            subject_id = entry.name
            path = entry.path

            out_path = working_path + '/' + subject_id
            os.mkdir(out_path)

            print('Processing Subject ', subject_id)

            j = 1
            bcnt = 0
            audiof = []
            done = False
            while not done:
                out_file_name = out_path + '/' + subject_id + '_' + str(bcnt).zfill(3) + '.wav'
                is_good, in_file_name, j = get_next_file_name(path + '/' + subject_id, j)

                # If good then read in the wave file
                if is_good:
                    # read in the file
                    wf = wave.open(in_file_name, 'rb')

                    # Downsample it
                    p = wf.getparams()
                    data = wf.readframes(wf.getnframes())

                    # Append it to the working stream
                    audiof.append([p, data])
                    sr = p.framerate
                    wf.close()

                    # Check to see if we have exceeded 60 seconds yet
                    length = 0
                    for i in range(0,len(audiof)):
                        length = length + len(audiof[i][1])

                    if length > len_sec*sr*2:
                        print('    writing block: ', bcnt)

                        # Calculate important parameters for writing the file
                        l_block = len(audiof)-1
                        l_block_overflow = (length-len_sec*sr*2)
                        l_block_len = len(audiof[l_block][1])-l_block_overflow

                        # Write the block
                        output = wave.open(out_file_name, 'wb')
                        output.setparams(audiof[0][0])
                        for i in range(0, len(audiof)-1):
                            output.writeframes(audiof[i][1])

                        output.writeframes(audiof[l_block][1][0:l_block_len])
                        output.close()

                        # Increment the block count
                        # Save the remainder
                        audiof = []
                        bcnt = bcnt + 1
                else:
                    done = True

            print('Subject ', subject_id, ' Complete')

In [4]:
# ------------------------------------------------------------------------------------------------
# Description:
#    This method orginally produced by Valerio Velardo uses librosa to extract MFCC from all the 
# audo files in a given folder and specified parameters.  All extracted features are exprted to a
# json file.  It has been modified to work with our dataset.
#
# Note: This method requires that all audio files for each individual are stored in the same folder
# It uses the folder name to determine that a file belongs to an individual.  This aligns with the
# structure of the dataset and can be run without chagnes.
#
# Parameters:
#    dataset_path - The path to the root folder to search for audio files.
#    jason_path - Full path to the json file to fill with the extacted features.
#    num_mfcc - The number of coefficients to extract
#    n_fft - Interval to use to apply the FFT.  Measured in $ of samples.
#    hop_length - Slidding window for FFT measured in # of samples.
#    max_subj - Specifies the maximum number of subjects to process
# ------------------------------------------------------------------------------------------------
def preprocess_dataset(dataset_path, json_path, min_samples=22050, num_mfcc=13, n_fft=2048, hop_length=512,
                       max_subj=-1):
    # dictionary where we'll store mapping, labels, MFCCs and filenames
    data = {
        "mapping": [],
        "labels": [],
        "MFCCs": [],
        "files": []
    }

    # loop through all sub-dirs
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):

        # make sure we don't process too many subjects
        if i < max_subj or max_subj == -1:
            # ensure we're at sub-folder level
            if dirpath is not dataset_path:

                # save label (i.e., sub-folder name) in the mapping
                label = dirpath.split("/")[-1]
                data["mapping"].append(label)
                print("\nProcessing: '{}'".format(label))

                # process all audio files in sub-dir and store MFCCs
                for f in filenames:
                    file_path = os.path.join(dirpath, f)

                    # Original code stopped processing when it hit a non audio file.  Adding the 
                    # exception handler allows the file to be logged and the processing to continue.
                    try:
                        # load audio file and slice it to ensure length consistency among different files
                        signal, sample_rate = librosa.load(file_path)

                        # drop audio files with less than pre-decided number of samples
                        if len(signal) >= min_samples:

                            # ensure consistency of the length of the signal
                            signal = signal[:min_samples]

                            # extract MFCCs
                            MFCCs = librosa.feature.mfcc(signal, sample_rate, n_mfcc=num_mfcc, n_fft=n_fft,
                                                        hop_length=hop_length)

                            # store data for analysed track
                            data["MFCCs"].append(MFCCs.T.tolist())
                            data["labels"].append(i-1)
                            data["files"].append(file_path)
                            print("{}: {}".format(file_path, i-1))
                    except:
                        print('Can not load: {}: {}".format(file_path, i-1)')
        
    # save data in json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [5]:
# ------------------------------------------------------------------------------------------------
# Description:
#    This method loads the input and target data from the specified json file.  
# Originally created by Valerio Velardo.
#
# Parameters:
#    data_path - Path to the json file containing the data
#
# Outputs: 
#    X - input data
#    y - target data
# ------------------------------------------------------------------------------------------------
def load_data(data_path):
    with open(data_path, "r") as fp:
        data = json.load(fp)

    X = np.array(data["MFCCs"])
    y = np.array(data["labels"])
    print("Training sets loaded!")
    return X, y


In [6]:
# ------------------------------------------------------------------------------------------------
# Description:
#    This creates the training and validation splits that will be used for this generic training
# pass.  Adapted from a method created by Valerio Velardo.
#
# Parameters:
#    data_path - Path to the json file containing data
#    validation_size - Describes what percentage to use for the validation set size.
#
# Outputs: 
#    x_train - The training inputs
#    y_train  - The training targets
#    x_val - The validation inputs
#    y_val - The validation targets
# ------------------------------------------------------------------------------------------------
def prepare_dataset(data_path, validation_size=0.2):
    # load dataset
    X, y = load_data(data_path)

    # create train, validation, test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size)

    # add an axis to nd array
    X_train = X_train[..., np.newaxis]
    X_test = X_test[..., np.newaxis]
    X_validation = X_validation[..., np.newaxis]

    return X_train, y_train, X_validation, y_validation, X_test, y_test

In [7]:
# ------------------------------------------------------------------------------------------------
# Description:
#    This method orginally produced by Valerio Velardo ubuilds a simple convolutional model in 
# keras that can be used to train for speaker identification from the dataset stored in a json 
# file.
#
# Parameters:
#    input_shape - Is a touple representing the shape of a training sample.
#    output_shape - Is the shape of the output layer
#    loss - A string representing which loss funciton to use
#    learning_rate - A fload specifying the learning rate.
#
# Outputs:
#    model - The tensorflow model
# ------------------------------------------------------------------------------------------------
def build_model(input_shape, output_shape, loss="sparse_categorical_crossentropy", learning_rate=0.0001):
    # build network architecture using convolutional layers
    model = tf.keras.models.Sequential()

    # 1st conv layer
    model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape,
                                     kernel_regularizer=tf.keras.regularizers.l2(0.001)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))

    # 2nd conv layer
    model.add(tf.keras.layers.Conv2D(16, (3, 3), activation='relu',
                                     kernel_regularizer=tf.keras.regularizers.l2(0.001)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))

    # 3rd conv layer
    model.add(tf.keras.layers.Conv2D(32, (2, 2), activation='relu',
                                     kernel_regularizer=tf.keras.regularizers.l2(0.001)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.MaxPooling2D((2, 2), strides=(2,2), padding='same'))

    # flatten output and feed into dense layer
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    tf.keras.layers.Dropout(0.3)

    # softmax output layer
    model.add(tf.keras.layers.Dense(output_shape, activation='softmax'))

    optimiser = tf.optimizers.Adam(learning_rate=learning_rate)

    # compile model
    model.compile(optimizer=optimiser,
                  loss=loss,
                  metrics=["accuracy"])

    # print model parameters on console
    model.summary()

    return model

In [8]:
# ------------------------------------------------------------------------------------------------
# Description:
#    This method orginally produced by Valerio Velardo trains a model and returns the training
# history.
#
# Parameters:
#    model - The model to train
#    epochs - The number of epochs
#    batch_size - The size of each batch
#    patience - The number of epochs before early stop is allowed
#    x_train - The trianing inputs
#    y_train - The training targets
#    x_validation - The validation inputs
#    y_validation - The validation targets
#
# Outputs:
#    history the training history
# ------------------------------------------------------------------------------------------------
def train(model, epochs, batch_size, patience, X_train, y_train, X_validation, y_validation):
    # Setup the earlystop callback
    earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor="accuracy", min_delta=0.001, patience=patience)

    # train model
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(X_validation, y_validation),
                        callbacks=[earlystop_callback])
    return history

# Extract Features
The following code extracts a series of inputs that can be used to build models.  To do this we will extract the files in a couple of different ways.  We will then mix them up all the way.  Once that is done we will test each of those inputs by running them through a simple model with minimumal training iterations to see which set of data creates the best generalized model.  

For this section the inputs will be as follows.
- Length of speach
    - Raw Files
    - 15 seconds
    - 30 seconds
    - 45 seconds
    - 60 seconds


- num_mfccs
    - 13
    - 20
    - 30


- n_fft
    - 1024
    - 2048
    - 4096


- hop_length
    - 256
    - 512
    - 1024
    
    
The first step is to extract all audo files into various jason files.

In [26]:
# List of constants to use to vary the extraction
audio_length = [0, 15, 30, 45, 30]
num_mfccs = [13, 20, 30]
n_fft = [1024, 2048, 4096]
hop_length = [256, 512, 1024]

# List of all create json files
json_files = []

# Extract the files to various json files
for al in audio_length:
    for nm in num_mfccs:
        for nf in n_fft:
            for hl in hop_length:
                json_file_name = 'data_{0}_{1}_{2}_{3}.json'.format(al, nm, nf, hl)
                json_files.append(json_file_name)
                
                dsPath = DATASET_PATH
                
                # Merge files to the specified length as needed
                if al != 0:
                    prep_wave_files(dsPath, WORK_PATH, al, max_subj=EXP_SUBJECTS)
                    dsPath = WORK_PATH
                    
                # Extract the features into the json file
                preprocess_dataset(dsPath, json_file_name, num_mfcc=nm, n_fft=nf, hop_length=hl, 
                                   max_subj=EXP_SUBJECTS)

for fName in jason_files:
    print(fName)

Processing Subject  p226
    writing block:  0
    writing block:  1
    writing block:  2
    writing block:  3
    writing block:  4
    writing block:  5
    writing block:  6
    writing block:  7
    writing block:  8
    writing block:  9
    writing block:  10
    writing block:  11
    writing block:  12
    writing block:  13
    writing block:  14
    writing block:  15
    writing block:  16
    writing block:  17
    writing block:  18
    writing block:  19
    writing block:  20
    writing block:  21
    writing block:  22
    writing block:  23
    writing block:  24
    writing block:  25
    writing block:  26
    writing block:  27
    writing block:  28
    writing block:  29
    writing block:  30
    writing block:  31
    writing block:  32
    writing block:  33
    writing block:  34
    writing block:  35
    writing block:  36
    writing block:  37
    writing block:  38
    writing block:  39
    writing block:  40
    writing block:  41
    writing block: 

    writing block:  19
    writing block:  20
    writing block:  21
    writing block:  22
    writing block:  23
    writing block:  24
    writing block:  25
    writing block:  26
    writing block:  27
    writing block:  28
    writing block:  29
    writing block:  30
    writing block:  31
    writing block:  32
    writing block:  33
    writing block:  34
    writing block:  35
    writing block:  36
    writing block:  37
    writing block:  38
    writing block:  39
    writing block:  40
    writing block:  41
    writing block:  42
    writing block:  43
    writing block:  44
    writing block:  45
    writing block:  46
    writing block:  47
    writing block:  48
    writing block:  49
    writing block:  50
    writing block:  51
    writing block:  52
    writing block:  53
    writing block:  54
    writing block:  55
    writing block:  56
    writing block:  57
    writing block:  58
    writing block:  59
    writing block:  60
    writing block:  61
    writing

    writing block:  63
    writing block:  64
    writing block:  65
    writing block:  66
    writing block:  67
    writing block:  68
    writing block:  69
    writing block:  70
    writing block:  71
    writing block:  72
    writing block:  73
    writing block:  74
    writing block:  75
    writing block:  76
    writing block:  77
    writing block:  78
    writing block:  79
    writing block:  80
    writing block:  81
    writing block:  82
    writing block:  83
    writing block:  84
    writing block:  85
    writing block:  86
    writing block:  87
    writing block:  88
    writing block:  89
    writing block:  90
    writing block:  91
    writing block:  92
    writing block:  93
    writing block:  94
    writing block:  95
    writing block:  96
    writing block:  97
    writing block:  98
    writing block:  99
Subject  p276  Complete
Processing Subject  p362
    writing block:  0
    writing block:  1
    writing block:  2
    writing block:  3
    writing 

NameError: name 'WORKING_PATH' is not defined