# Test Audio Sender

This file is used for sending audio data to the device. It measures the accuracy based on individual audio files.

**Note that the software on the MCU must match this!**

In [379]:
AUDIO_FOLDER_PATH = "dataset_mix"
POSITIVE_PATH = f"{AUDIO_FOLDER_PATH}/positive"
NEGATIVE_PATH = f"{AUDIO_FOLDER_PATH}/negative"

In [380]:
import numpy as np
import pandas as pd
import gc
import os
import librosa
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import serial
import struct

In [381]:
# Collecte positive audio files and their classes
positive_audio_files = []
positive_audio_file_classes = []
# Look each file in the audio folder
audio_class_folders = os.listdir(POSITIVE_PATH)
# Loop each class folder
for audio_class_folder in audio_class_folders:
    # Assemble full audio class folder path.
    audio_class_folder_path = os.path.join(POSITIVE_PATH, audio_class_folder)
    print("Processing class folder: ", audio_class_folder_path)
    # Get all files in the audio class folder
    audio_class_files = os.listdir(audio_class_folder_path)
    # Loop each audio file in the audio class folder
    for audio_class_file in audio_class_files:
        # Assemble full audio file path.
        audio_file_path = os.path.join(audio_class_folder_path, audio_class_file)
        # Append the audio file path to the positive_audio_files list
        positive_audio_files.append(audio_file_path)
        # Append the audio class to the audio_file_classes list
        positive_audio_file_classes.append(audio_class_folder)
print("positive_audio_files length: ", len(positive_audio_files))
print("positive_audio_file_classes length: ", len(positive_audio_file_classes))

Processing class folder:  dataset_mix/positive/Car
Processing class folder:  dataset_mix/positive/Comm
Processing class folder:  dataset_mix/positive/Motorcycle
positive_audio_files length:  18597
positive_audio_file_classes length:  18597


In [382]:
positive_audio_file_classes[0]

'Car'

In [383]:
# Collecte negative audio files and their classes
negative_audio_files = []
negative_audio_file_classes = []
# Look each file in the audio folder
audio_class_folders = os.listdir(NEGATIVE_PATH)
# Loop each class folder
for audio_class_folder in audio_class_folders:
    # Assemble full audio class folder path.
    audio_class_folder_path = os.path.join(NEGATIVE_PATH, audio_class_folder)
    print("Processing class folder: ", audio_class_folder_path)
    # Get all files in the audio class folder
    audio_class_files = os.listdir(audio_class_folder_path)
    # Loop each audio file in the audio class folder
    for audio_class_file in audio_class_files:
        # Assemble full audio file path.
        audio_file_path = os.path.join(audio_class_folder_path, audio_class_file)
        # Append the audio file path to the negative_audio_files list
        negative_audio_files.append(audio_file_path)
        # Append the audio class to the audio_file_classes list
        negative_audio_file_classes.append(audio_class_folder)
print("negative_audio_files length: ", len(negative_audio_files[0:10]))
print("negative_audio_file_classes length: ", len(negative_audio_file_classes[0:10]))

Processing class folder:  dataset_mix/negative/background
negative_audio_files length:  10
negative_audio_file_classes length:  10


In [384]:
# Shuffle the audio files and classes with the same seed.
seed = 42
np.random.seed(seed)
np.random.shuffle(positive_audio_files)

np.random.seed(seed)
np.random.shuffle(positive_audio_file_classes)

np.random.seed(seed)
np.random.shuffle(negative_audio_files)

In [385]:
# Hot end code the labels.
label_encoder = LabelEncoder()
positive_audio_file_classes_categorical = to_categorical(label_encoder.fit_transform(positive_audio_file_classes))
print("Example of audio_file_classes_categorial: ", positive_audio_file_classes_categorical[0:10])

Example of audio_file_classes_categorial:  [[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [386]:
# Connect to MCU via serial.
print("Configuring serial port...")
ser = serial.Serial(
    port='/dev/ttyACM0',  # Change this to your actual port, e.g., 'COM3' on Windows, '/dev/ttyS0' on Linux
    baudrate=921600,       # Set baud rate to 921600
    bytesize=serial.EIGHTBITS,
    parity=serial.PARITY_NONE,
    stopbits=serial.STOPBITS_ONE,
    timeout=0.1,           # Set timeout for reading
    write_timeout=None     # Wait indefinitely until all data is sent
)

Configuring serial port...


In [387]:
if ser.is_open:
    print(f"Serial port {ser.port} opened at {ser.baudrate} baud.")

Serial port /dev/ttyACM0 opened at 921600 baud.


In [388]:
def calc_accuracy(realResults, results):
    correct = 0
    total = min(len(realResults), len(results))  # Ensure we only compare up to the shorter length
    for real, predicted in zip(realResults[:total], results[:total]):
        if real == predicted:
            correct += 1
    return correct / total if total > 0 else 0.0

import numpy as np

def confusion_matrix_manual(y_true, y_pred):

    labels = sorted(set(y_true + y_pred))  # All unique labels
    matrix = {label: {l: 0 for l in labels} for label in labels}

    for real, pred in zip(y_true, y_pred):
        matrix[real][pred] += 1

    return matrix


def print_confusion_matrix(cm):
    labels = list(cm.keys())
    print("    " + " ".join(f"{l:>4}" for l in labels))
    for real in labels:
        row = " ".join(f"{cm[real][pred]:>4}" for pred in labels)
        print(f"{real:>4} {row}")

In [389]:
audioSent = 0
audioFilesProcessed = 0
results = []
realResults = []

In [390]:
positive_audio_file_classes_categorical[0]

array([0., 1., 0.])

In [391]:
#For each extra added, it had to be padded!


def streamAudioFile(file_path, realClass, i):
    global audioSent
    global results
    global realResults
    global audioBuffer

    # Append the real result to the results list.
    #if len(realResults) > 0 and len(realResults) == len(results) and realResults[-1] == 9:
    #    realResults[-1] = np.argmax(realClass)
    #else:
    realResults.append(np.argmax(realClass))

    # Check file format.
    if not file_path.endswith('.wav'):
        print("Error: Only .wav files are supported.")
        return

    # Load the audio file.
    audio_data, sr = librosa.load(file_path, sr=None)

    # Convert sample rate to 16kHz.
    if sr != 16000:
        audio_data = librosa.resample(y=audio_data, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Convert the audio data to a numpy array.
    audio_data = np.array(audio_data, dtype=np.float32)

    # Send the audio data to the MCU.
    ser.write(audio_data.tobytes())

    classifications = []

    while ser.in_waiting > 0:
        response = ser.readline().decode('utf-8').strip()
        if response.startswith("fin:"):
            matrix = confusion_matrix_manual(realResults, results)
            print_confusion_matrix(matrix)
        if response.startswith("e:"):
            print(response)
            return
        if response.startswith("c:"):
            print(response)
        if response.startswith("s:"):
            print(response)
        if response.startswith("v:"):
            print(response)
            response = int(response[2:])
            classifications.append(response)

    j = 0
    if len(results) > 0:
        while results[-j-1] == 9 and j < len(results):
            j += 1

    if j > 0:

        while j > 0 and len(classifications) > 0:
            results[-j] = classifications.pop(0)
            j -= 1

    while len(classifications) > 0:
        results.append(classifications.pop(0))
        if len(results) > len(realResults):
            realResults.append(realResults[-1])
    
    #if (len(results) > 0 and results[-1] == 9):
    #    results[-1] = response
    #else:
    #    results.append(response)
    #    if (len(results) > len(realResults)):
    #        realResults.append(9)
    #print(f"Accuracy: {calc_accuracy(realResults, results)}")

    # If no classification was received, append None to results.
    if i > 0 and len(results) < len(realResults):
        results.append(9)
    print(f"Accuracy: {calc_accuracy(realResults, results)}")

In [None]:

total_positive_audio_files = len(positive_audio_files)

# Loop each audio file.
negative_audio_file_pointer = 0
for i in range(total_positive_audio_files):
    # Get audio file path.
    audio_file = positive_audio_files[i]
    # Print audio file path and class.
    print("Processing audio file: ", audio_file)
    print("Processing audio file class: ", positive_audio_file_classes[i])
    # Stream positive audio file to MCU.
    streamAudioFile(audio_file, positive_audio_file_classes_categorical[i], i)
    # Print nagtive audio file path and class.
    print("Processing negative audio file: ", negative_audio_files[negative_audio_file_pointer])
    # Stream negative audio file to MCU.
    streamAudioFile(negative_audio_files[negative_audio_file_pointer], [0,0,0,1], i)
    # Increment the negative audio file pointer.
    negative_audio_file_pointer = (negative_audio_file_pointer + 1) % len(negative_audio_files)

Processing audio file:  dataset_mix/positive/Comm/Y99CehZOeqlQ_30.000_40.000.wav
Processing audio file class:  Comm
s:Heap: 1096 / 469792 bytes
s:Tensor arena size: 17056/37000 bytes
s:Input tensor shape: 1, 16, 8, 1
s:Output tensor shape: 1, 4
c: [-95,11,-60,-112] voted for: 1 max value: 11
c: [-196,81,-170,-226] voted for: 1 max value: 70
c: [-288,132,-270,-341] voted for: 1 max value: 51
c: [-402,220,-385,-456] voted for: 1 max value: 88
c: [-465,174,-422,-566] voted for: 2 max value: -37
c: [-565,241,-532,-679] voted for: 1 max value: 67
c: [-673,320,-645,-794] voted for: 1 max value: 79
c: [-759,367,-749,-907] voted for: 1 max value: 47
c: [-850,418,-852,-1020] voted for: 1 max value: 51
c: [-895,419,-949,-1135] voted for: 1 max value: 1
c: [-1001,322,-885,-1252] voted for: 2 max value: 64
c: [-926,219,-997,-1368] voted for: 0 max value: 75
c: [-848,112,-1109,-1483] voted for: 0 max value: 78
c: [-821,50,-1218,-1595] voted for: 0 max value: 27
c: [-762,-50,-1318,-1709] voted for: 

In [None]:
print(results)
print(list(map(int, realResults)))

#[1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 2, 3, 1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 9, 9, 2, 9, 3, 0, 3, 1, 3, 0, 3, 1, 3, 1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0]
#[1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 1, 3, 0, 3, 0, 3, 2, 3, 1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 2, 3, 0, 3, 1, 3, 0, 3, 1, 3, 1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 1]

[1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 2, 3, 0, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 2, 3, 1, 3, 1, 9]
[1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 1, 3, 0, 3, 0, 3, 2, 3, 1, 3, 1, 3, 0, 3, 0, 3, 0, 3, 0, 3, 2, 3, 0, 3, 1, 3]


In [None]:
# Close the serial port
ser.close()