In [8]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
import pandas as pd
from datetime import datetime
from termcolor import colored

In [9]:
# Timer: wall-clock reference captured when this cell runs. The main block
# subtracts it from datetime.now() after the test-set evaluation, so the
# reported duration covers data loading, training and evaluation.
startTime = datetime.now()

# Path to created json file from mel preprocess and feature extraction script.
# Read by load_data(), which expects the keys "mfcc" and "labels".
DATA_PATH = "E:/Acoustic/mfcc_data.json"

# Path to save model. Passed to ModelCheckpoint, which (with save_best_only=True)
# overwrites this file each time val_loss improves.
MODEL_SAVE = 'E:/Acoustic/model_1.h5'

# Path to save training history and model accuracy performance at end of training.
# HISTORY_SAVE receives history.history as CSV; ACC_SAVE receives a JSON object
# holding the test accuracy and the elapsed time as a string.
HISTORY_SAVE = "E:/Acoustic/history_1.csv"
ACC_SAVE = "E:/Acoustic/models_acc_1.json"


def load_data(data_path, feature_key="mfcc"):
    """Loads training dataset from json file.

        :param data_path (str): Path to json file containing data
        :param feature_key (str): Name of the json entry holding the input
            features. Defaults to "mfcc"; pass e.g. "mfccs" when the file was
            produced with a different feature name, instead of editing this
            function.
        :return X (ndarray): Inputs
        :return y (ndarray): Targets
        :raises KeyError: If ``feature_key`` or "labels" is missing from the file
    """

    with open(data_path, "r", encoding="utf-8") as fp:
        data = json.load(fp)

    # Convert lists to numpy arrays.
    X = np.array(data[feature_key])
    y = np.array(data["labels"])
    return X, y


def prepare_datasets(test_size, validation_size):
    """Load the dataset at DATA_PATH and split it into train/validation/test.

        :param test_size: Forwarded as ``test_size`` to the first
            train_test_split call (applied to the whole dataset)
        :param validation_size: Forwarded as ``test_size`` to the second
            train_test_split call, i.e. it is taken from the data that remains
            after the test split, not from the whole dataset
        :return: (X_train, X_validation, X_test, y_train, y_validation, y_test);
            every X array has one extra trailing axis of length 1
    """

    def _with_channel_axis(features):
        # Append a trailing axis of length 1 so each sample becomes
        # (..., 1), the layout handed to the Conv2D input.
        return features[..., np.newaxis]

    inputs, targets = load_data(DATA_PATH)

    # First carve off the test set ...
    inputs_fit, inputs_test, targets_fit, targets_test = train_test_split(
        inputs, targets, test_size=test_size)

    # ... then split what is left into training and validation data.
    inputs_train, inputs_val, targets_train, targets_val = train_test_split(
        inputs_fit, targets_fit, test_size=validation_size)

    return (
        _with_channel_axis(inputs_train),
        _with_channel_axis(inputs_val),
        _with_channel_axis(inputs_test),
        targets_train,
        targets_val,
        targets_test,
    )


def build_model(input_shape, num_classes=2):
    """Build the (uncompiled) CNN classifier.

    Layout: two Conv2D -> MaxPool2D -> BatchNormalization stages, followed by
    Flatten -> Dense(32, relu) -> Dropout(0.3) -> Dense(num_classes, softmax).

        :param input_shape (tuple): Shape of a single input sample, given to
            the first convolutional layer only
        :param num_classes (int): Number of neurons in the softmax output
            layer, i.e. how many categories are predicted. Defaults to 2
        :return model: keras.Sequential model; the caller compiles it
    """
    model = keras.Sequential()

    # 1st convolutional layer: 8 kernels on a 5x5 grid.
    model.add(keras.layers.Conv2D(8, (5, 5), activation='relu', input_shape=input_shape))
    # 5x5 pooling window moved with stride 2.
    model.add(keras.layers.MaxPool2D((5, 5), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # 2nd convolutional layer: 32 kernels on a 5x5 grid.
    # No input_shape here: only the first layer of a Sequential model defines
    # the model input; this layer consumes the output of the previous stage.
    model.add(keras.layers.Conv2D(32, (5, 5), activation='relu'))
    model.add(keras.layers.MaxPool2D((5, 5), strides=(2, 2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    # Flatten the output and feed into a dense layer with 32 neurons.
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(32, activation='relu'))
    # Dropout reduces the chances of overfitting.
    model.add(keras.layers.Dropout(0.3))

    # Output layer that uses softmax activation, one neuron per category.
    model.add(keras.layers.Dense(num_classes, activation='softmax'))

    return model


def predict(model, X, y):
    """Print the expected and the predicted class index for a single sample.

        :param model: Object exposing ``predict(batch)``
        :param X (ndarray): One input sample, without the batch axis
        :param y: Expected label for ``X``; only echoed in the printed line
    """
    # model.predict works on batches, so wrap the sample in a batch of one.
    batch = np.expand_dims(X, axis=0)
    scores = model.predict(batch)

    # Index of the highest score in each row of the batch.
    predicted_index = np.argmax(scores, axis=1)

    report = "Expected index: {}, Predicted index: {}".format(y, predicted_index)
    print(report)


if __name__ == "__main__":
    # Create train, validation and test sets.
    X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25, 0.2)  # (test size, val size)

    # Early stopping: end training once val_loss has not improved for 10 epochs.
    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

    # Checkpoint: write the model to MODEL_SAVE whenever val_loss reaches a new minimum.
    checkpoint = keras.callbacks.ModelCheckpoint(MODEL_SAVE, monitor='val_loss',
                                                 mode='min', save_best_only=True, verbose=1)

    # Build the CNN network. The input shape is one sample without the batch axis.
    input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
    model = build_model(input_shape)

    # Compile the network. Sparse categorical cross-entropy takes integer labels.
    optimizer = keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer,
                  loss="sparse_categorical_crossentropy",
                  metrics=['accuracy'])

    model.summary()

    # Train the CNN. The epoch count is only an upper bound; early stopping ends the run.
    history = model.fit(X_train, y_train, validation_data=(X_validation, y_validation), batch_size=16, epochs=1000,
                        callbacks=[callback, checkpoint])

    # Save history.
    hist = pd.DataFrame(history.history)

    # Save to csv. The path is handed to pandas directly so that it opens the
    # file itself; writing through a text handle opened without newline=''
    # yields an empty row after every record on Windows.
    hist_csv = HISTORY_SAVE
    hist.to_csv(hist_csv)

    print(
        colored("CRNN model has been trained and its training history has been saved to {}.".format(hist_csv), "green"))

    # Evaluate the CNN on the test set.
    # NOTE: EarlyStopping is created without restore_best_weights, so `model`
    # holds the weights of the last epoch; the best-val_loss weights are the
    # ones stored in MODEL_SAVE.
    test_error, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
    print("Accuracy on test set is: {}".format(test_accuracy))

    # Timer output (elapsed since startTime, so it includes the evaluation above).
    time = datetime.now() - startTime
    print(time)

    # Make prediction on a sample. Index 100 is used when available; smaller
    # test sets fall back to their last sample instead of raising IndexError.
    sample_index = min(100, len(X_test) - 1)
    X = X_test[sample_index]
    y = y_test[sample_index]
    predict(model, X, y)

    # Save model accuracies on test set (for weight calculations later on).
    accuracy = {
        "model_acc": [],
        "total_train_time": [],
    }

    accuracy["model_acc"].append(test_accuracy)
    accuracy["total_train_time"].append(str(time))

    with open(ACC_SAVE, "w") as fp:
        json.dump(accuracy, fp, indent=4)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 40, 16, 8)         208       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 20, 8, 8)         0         
 )                                                               
                                                                 
 batch_normalization (BatchN  (None, 20, 8, 8)         32        
 ormalization)                                                   
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 4, 32)         6432      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 2, 32)         0         
 2D)                                                             
                                                        

Epoch 21/1000
Epoch 21: val_loss improved from 0.01517 to 0.01226, saving model to E:/Acoustic\model_1.h5
Epoch 22/1000
Epoch 22: val_loss improved from 0.01226 to 0.00990, saving model to E:/Acoustic\model_1.h5
Epoch 23/1000
Epoch 23: val_loss improved from 0.00990 to 0.00727, saving model to E:/Acoustic\model_1.h5
Epoch 24/1000
Epoch 24: val_loss improved from 0.00727 to 0.00595, saving model to E:/Acoustic\model_1.h5
Epoch 25/1000
Epoch 25: val_loss improved from 0.00595 to 0.00504, saving model to E:/Acoustic\model_1.h5
Epoch 26/1000
Epoch 26: val_loss improved from 0.00504 to 0.00436, saving model to E:/Acoustic\model_1.h5
Epoch 27/1000
Epoch 27: val_loss improved from 0.00436 to 0.00399, saving model to E:/Acoustic\model_1.h5
Epoch 28/1000
Epoch 28: val_loss improved from 0.00399 to 0.00365, saving model to E:/Acoustic\model_1.h5
Epoch 29/1000
Epoch 29: val_loss improved from 0.00365 to 0.00288, saving model to E:/Acoustic\model_1.h5
Epoch 30/1000
Epoch 30: val_loss improved from

Epoch 73/1000
Epoch 73: val_loss improved from 0.00018 to 0.00017, saving model to E:/Acoustic\model_1.h5
Epoch 74/1000
Epoch 74: val_loss improved from 0.00017 to 0.00017, saving model to E:/Acoustic\model_1.h5
Epoch 75/1000
Epoch 75: val_loss improved from 0.00017 to 0.00016, saving model to E:/Acoustic\model_1.h5
Epoch 76/1000
Epoch 76: val_loss improved from 0.00016 to 0.00016, saving model to E:/Acoustic\model_1.h5
Epoch 77/1000
Epoch 77: val_loss did not improve from 0.00016
Epoch 78/1000
Epoch 78: val_loss improved from 0.00016 to 0.00016, saving model to E:/Acoustic\model_1.h5
Epoch 79/1000
Epoch 79: val_loss improved from 0.00016 to 0.00015, saving model to E:/Acoustic\model_1.h5
Epoch 80/1000
Epoch 80: val_loss improved from 0.00015 to 0.00015, saving model to E:/Acoustic\model_1.h5
Epoch 81/1000
Epoch 81: val_loss improved from 0.00015 to 0.00014, saving model to E:/Acoustic\model_1.h5
Epoch 82/1000
Epoch 82: val_loss improved from 0.00014 to 0.00014, saving model to E:/Acou

Epoch 125: val_loss did not improve from 0.00004
Epoch 126/1000
Epoch 126: val_loss improved from 0.00004 to 0.00003, saving model to E:/Acoustic\model_1.h5
Epoch 127/1000
Epoch 127: val_loss did not improve from 0.00003
Epoch 128/1000
Epoch 128: val_loss did not improve from 0.00003
Epoch 129/1000
Epoch 129: val_loss did not improve from 0.00003
Epoch 130/1000
Epoch 130: val_loss improved from 0.00003 to 0.00003, saving model to E:/Acoustic\model_1.h5
Epoch 131/1000
Epoch 131: val_loss improved from 0.00003 to 0.00003, saving model to E:/Acoustic\model_1.h5
Epoch 132/1000
Epoch 132: val_loss improved from 0.00003 to 0.00003, saving model to E:/Acoustic\model_1.h5
Epoch 133/1000
Epoch 133: val_loss improved from 0.00003 to 0.00003, saving model to E:/Acoustic\model_1.h5
Epoch 134/1000
Epoch 134: val_loss improved from 0.00003 to 0.00003, saving model to E:/Acoustic\model_1.h5
Epoch 135/1000
Epoch 135: val_loss improved from 0.00003 to 0.00003, saving model to E:/Acoustic\model_1.h5
Epo

In [10]:
# Re-run the single-sample prediction, this time on test sample 45.
X, y = X_test[45], y_test[45]
predict(model, X, y)

# Save model accuracies on test set (for weight calculations later on).
# Each entry is a one-element list built from values left in the notebook
# namespace by the training cell.
accuracy = {
    "model_acc": [test_accuracy],
    "total_train_time": [str(time)],
}

with open(ACC_SAVE, "w") as fp:
    json.dump(accuracy, fp, indent=4)

Expected index: 0, Predicted index: [0]


In [11]:
y

0

In [1]:
import os
import json
import librosa
import tensorflow as tf
import numpy as np
from termcolor import colored

# Read and save parameters.
DATASET_PATH = "E:/Acoustic_test"  # Root folder walked by save_mfcc(); files in its sub-folders are classified.
SAVED_MODEL_PATH = "E:/Acoustic/model_1.h5"  # Path of trained model, loaded once by the factory function.
SAMPLE_RATE = 22050  # Sample rate in Hz; audio is loaded at this rate by preprocess().
DURATION = 1  # Length of audio files fed. Measured in seconds.
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION  # Number of samples per audio file, used to size the segments.

# Predictions (1 or 0): written by save_mfcc() and read back by performance_calcs().
JSON_PATH = "E:/Acoustic_test/predictions.json"
# Performance scores (accuracy, precision, recall, f1 score): written by performance_calcs().
JSON_PERFORMANCE = "E:/Acoustic_test/model_scores.json"

# Prediction of fed audio


class _Class_Predict_Service:
    """Singleton class for keyword spotting inference with trained models.
    :param model: Trained model
    """
    # Mapping so drone = 1.
    model = None
    _mapping = [
        1,
        0
    ]
    _instance = None

    # Predict hard values (1 or 0).
    def predict(self, file_path):
        """
        :param file_path (str): Path to audio file to predict
        :return predicted_keyword (str): Keyword predicted by the model
        """
        print("here")
        # Extract mels from testing audio.
        log_mel = self.preprocess(file_path)

        # We need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1).
        log_mel = log_mel[np.newaxis, ..., np.newaxis]

        # Get the predicted label.
        predictions = self.model.predict(log_mel)
        predicted_index = np.argmax(predictions)
        predicted_class = self._mapping[predicted_index]
        print(predicted_class)
        return predicted_class

    # Outputs certainty values for soft voting (1-0).

    def preprocess(self, file_path, n_mels=20, n_fft=2048, hop_length=512, num_segments=1):

        num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)

        # Load audio file.
        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

        # Process segments extracting mels and storing data.
        for s in range(num_segments):
            start_sample = num_samples_per_segment * s  # s=0 --> 0
            # s=0 --> num_samples_per_segment
            finish_sample = start_sample + num_samples_per_segment

            # Extract mel specs.
            mel = librosa.feature.melspectrogram(y=signal[start_sample:finish_sample], sr=sr, n_mels=n_mels, n_fft=n_fft,
                                                 hop_length=hop_length)
            log_mel = librosa.power_to_db(mel)

        return log_mel.T

def Keyword_Spotting_Service():
    """Factory function for the shared _Class_Predict_Service instance.

    On the first call the trained model is loaded from SAVED_MODEL_PATH and
    the single instance is created; later calls return that same instance.

    :return _Class_Predict_Service._instance (_Class_Predict_Service):
    """

    # Ensure an instance is created only the first time the factory function is called.
    if _Class_Predict_Service._instance is None:
        # Load the model BEFORE publishing the instance: if loading fails,
        # _instance stays None and the next call retries, instead of handing
        # out a cached instance whose model is still None.
        _Class_Predict_Service.model = tf.keras.models.load_model(
            SAVED_MODEL_PATH)
        _Class_Predict_Service._instance = _Class_Predict_Service()
    return _Class_Predict_Service._instance


# Saving results into a json file.
def save_mfcc(dataset_path, json_path):
    """Classify every file below ``dataset_path`` and save the predictions.

    Despite its name, this function stores model predictions, not features.
    Each sub-folder of ``dataset_path`` is treated as one class; files located
    directly in ``dataset_path`` are skipped.

    :param dataset_path (str): Root folder containing one sub-folder per class
    :param json_path (str): Path of the json file to write
    """
    # Dictionary to store data.
    data = {
        "mapping": [],  # Sub-folder names, in the order they were visited.
        "names": [],    # File name of every classified audio file.
        "results": [],  # Predicted value for the file at the same position in "names".
    }

    # Loop through all the classes. Folders and files are visited in the order
    # os.walk reports them.
    for dirpath, dirnames, filenames in os.walk(dataset_path):

        # Ensure that we're not at the root level. Strings are compared by
        # value; an identity check only works by accident.
        if dirpath == dataset_path:
            continue

        # Save the semantic label: the last path component. basename handles
        # the backslash-joined paths os.walk produces on Windows, which a
        # split on "/" does not.
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        print("\nProcessing {}".format(semantic_label))

        # Process files for a specific class.
        for f in filenames:
            file_path = os.path.join(dirpath, f)

            # The factory returns the same instance on every call, so the
            # model is only loaded for the first file.
            kss = Keyword_Spotting_Service()

            # Classify unseen audio.
            keyword = kss.predict(file_path)

            # Store the file name together with its prediction.
            data["names"].append(f)
            data["results"].append(keyword)
            print("{}".format(file_path))

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)


# Calculating performance scores (accuracy, precision, recall, f-score).
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def performance_calcs(performance_path, predictions_path=None):
    """Compute accuracy, precision, recall and F1 score from saved predictions.

    The predictions file must hold a "results" list of 1/0 values. The first
    half of that list is treated as the positive samples and the second half
    as the negative samples, so the order of the list matters.

    :param performance_path (str): Path of the json file to write the scores to
    :param predictions_path (str): Path of the predictions json file; defaults
        to JSON_PATH when omitted
    """
    if predictions_path is None:
        predictions_path = JSON_PATH

    with open(predictions_path, "r") as fp:
        data = json.load(fp)

    # Convert lists to numpy arrays.
    y = np.array(data["results"])

    # First half: expected positives. Second half: expected negatives.
    half = len(y) // 2
    positives = y[:half]
    negatives = y[half:]

    # Calculating TP, TN, FP, FN.
    TP = float(sum(positives))
    FN = len(positives) - TP
    FP = float(sum(negatives))
    # Use the real size of the second half so an odd number of results does
    # not under-count the true negatives.
    TN = len(negatives) - FP

    # Performance result calcs. A ratio with an empty denominator (e.g. no
    # positive predictions at all) is reported as 0.0 instead of raising
    # ZeroDivisionError.
    Accuracy = _safe_ratio(TP + TN, TP + TN + FN + FP)
    Precision = _safe_ratio(TP, TP + FP)
    Recall = _safe_ratio(TP, TP + FN)
    F1 = _safe_ratio(2 * Precision * Recall, Precision + Recall)

    # Dictionary to store model performance results (one-element lists).
    performance = {
        "TP": [TP],
        "FN": [FN],
        "TN": [TN],
        "FP": [FP],
        "Accuracy": [Accuracy],
        "Precision": [Precision],
        "Recall": [Recall],
        "F1 Score": [F1],
    }

    with open(performance_path, "w") as fp:
        json.dump(performance, fp, indent=4)


if __name__ == "__main__":
    # Step 1: classify the test audio and store the predictions.
    save_mfcc(DATASET_PATH, JSON_PATH)
    # Step 2: turn the stored predictions into performance scores.
    performance_calcs(JSON_PERFORMANCE)

    # Report where the scores ended up.
    done_message = "CRNN model performance scores have been saved to {}.".format(JSON_PERFORMANCE)
    print(colored(done_message, "green"))


C:\Users\Vidyuth\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\Vidyuth\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception