In this notebook, we combine the model training of Exploration 2 with the real-time listening and processing of Exploration 3 to build a single unified workflow. This one self-contained notebook allows you to:
* specify the noises you want your model to recognize
* record training and testing data for those noises
* train a model on the data, and evaluate its performance
* use the model with a listener to recognize noises in real-time and act on them

As in Exploration 3, we will use sounddevice to listen and record audio, which it outputs as numpy arrays. We will use these arrays to construct training and testing datasets, which will pass mel spectrograms on to the network. Once trained, this network will be used to recognize noises, again by converting incoming audio into spectrograms and generating predictions.

In [1]:
import sounddevice as sd
import torch
import torchaudio.transforms as ta
import time
import numpy as np
import queue
from IPython.display import display, clear_output

# Listening functions

Functions to continuously listen for noises, and pass them to a processing function.

In [2]:
########### PARAMETERS ###########
# Module-level settings read by the listening functions in this notebook.

device = 2 # select the microphone. Use sd.query_devices() to see options
print(sd.query_devices()) # list the audio devices so a valid index can be chosen for `device`

BATCH_DURATION = 0.02      # listen for noises BATCH_DURATION (seconds) at a time
THRESHOLD_MULTIPLIER = 5   # detect a spike when a batch's peak is more than THRESHOLD_MULTIPLIER times the previous batch's peak
THRESHOLD_ABSOLUTE = 0.005 # ignore any spikes whose peak doesn't rise above this. Too many false positives without this
BATCHES_PER_NOISE = 3      # collect BATCHES_PER_NOISE batches of audio input per detected noise

# use the default sample rate reported for the selected input device
samplerate = sd.query_devices(device, 'input')['default_samplerate']
# optional for future: set the FFT window size based on the sample rate

blocksize = int(samplerate * BATCH_DURATION) # get the block (batch) size in frames

  0 Built-in Microphone, Core Audio (2 in, 0 out)
< 1 Built-in Output, Core Audio (0 in, 2 out)
> 2 SpeechMatic USB MultiAdapter, Core Audio (1 in, 2 out)


In [3]:
########### Functions for continuous listening and processing ###########

# bundling these is easier than declaring them 'global' in the below
class listen:
    """ Helper variables for processing continuous audio input.

    The class is used purely as a namespace: all state lives in class
    attributes that are created by reset(), so reset() must be called before
    any of them is read.
    """

    @staticmethod
    def reset():
        """ (Re)initialize every helper variable to its starting value.

        Declared as a staticmethod so that it can be called either on the
        class (listen.reset()) or on an instance.
        """
        # Peak amplitude of the previous batch.
        # NOTE(review): presumably starts at 1. so the first batch cannot
        # register as a spike -- confirm.
        listen.prev_max = 1.
        listen.batches_to_collect = 0   # batches still needed for the noise in progress
        listen.batches_collected = 0    # batches queued but not yet processed
        listen.current_noise = None     # most recent noise assembled from the queue
        listen.start = time.time()      # when listening (re)started

        listen.processing_start = 0 # for timing the total processing time
        listen.processing_end = 0

        listen.q_batches = queue.Queue() # a FIFO queue
        listen.all_audio = []  # could use this to collect all audio (uncomment line in callback)
        listen.all_noises = [] # could use this to collect all noises. Use the processing_function to append
    
def callback(indata, frames, time_pa, status):
    """ Detect if a noise has been made, and add audio to the queue.

    Passed as the `callback` of the sd.InputStream, which calls it once per
    block of audio.

    indata:  numpy array holding the block of input audio
    frames:  unused here
    time_pa: unused here
    status:  printed if truthy

    A noise starts when the block's peak amplitude exceeds both
    THRESHOLD_ABSOLUTE and THRESHOLD_MULTIPLIER times the previous block's
    peak. That block and the following BATCHES_PER_NOISE - 1 blocks are put
    on listen.q_batches.
    """
    if status:
        print('STATUS: ', str(status))

    # ndarray.any() checks the whole block in one vectorized call. The builtin
    # any() would iterate row by row in Python and raises ValueError when a
    # row holds more than one channel.
    if not indata.any():
        print('no input')
        return

    # Make sure to *copy* the input data before keeping a reference to it.
    indata_copy = indata.copy()
    new_max = np.absolute(indata_copy).max()
    # listen.all_audio.append(indata_copy)

    # Gather audio data if more is required for the noise in progress.
    if listen.batches_to_collect > 0:
        listen.q_batches.put_nowait(indata_copy)
        listen.batches_collected  += 1
        listen.batches_to_collect -= 1

    # Otherwise, see if a new noise has been detected
    elif ( new_max > THRESHOLD_ABSOLUTE and
           new_max > THRESHOLD_MULTIPLIER * listen.prev_max ):

        listen.processing_start = time.time()

        listen.q_batches.put_nowait(indata_copy)
        listen.batches_collected += 1
        listen.batches_to_collect = BATCHES_PER_NOISE - 1 # get more batches

    # Remember this block's peak for the next spike comparison.
    listen.prev_max = new_max

def time_elapsed(duration):
    """ Build a stop condition for the listener.

    The returned zero-argument function is True once more than `duration`
    seconds have passed since listen.start.
    """
    def _has_elapsed():
        seconds_listening = time.time() - listen.start
        return seconds_listening > duration

    return _has_elapsed

def print_processing_time():
    """ Record the current time as listen.processing_end and print how long
    it has been since listen.processing_start. """
    finished = time.time()
    listen.processing_end = finished
    elapsed = finished - listen.processing_start
    print('Processing took {:.4f} sec\n'.format(elapsed))
        
def listen_and_process(processing_function, stop_condition=time_elapsed(3),
                       device=device, print_after_processing=None):
    """ Listen continuously for noises until stop_condition() returns True (default: wait 3 sec).
    As each noise is heard, process it using processing_function.

    processing_function:    called with each detected noise, a flattened numpy
                            array made of BATCHES_PER_NOISE consecutive batches
    stop_condition:         zero-argument callable, checked on every pass of the loop
    device:                 input device passed to sd.InputStream
    print_after_processing: optional zero-argument callable, run after each
                            call to processing_function

    Returns None. To keep the noises, have processing_function append them
    (e.g. to listen.all_noises) and read them after this function returns.
    """

    listen.reset() # reinitialize helper variables

    idle_sleep = 0.001 # seconds to yield when no complete noise is waiting

    with sd.InputStream(device=device, channels=1, callback=callback,
                        blocksize=blocksize,
                        samplerate=samplerate):
        print('Listening...')
        while True:

            # data collects if it meets the threshold. Process when enough data is in queue:
            if listen.batches_collected >= BATCHES_PER_NOISE:
                data = [listen.q_batches.get_nowait()
                        for _ in range(BATCHES_PER_NOISE)]
                listen.batches_collected -= BATCHES_PER_NOISE

                # axis=None flattens the batches into a single 1-D array
                listen.current_noise = np.concatenate( data, axis=None )

                processing_function( listen.current_noise )

                # print something after processing, if desired
                if print_after_processing:
                    print_after_processing()
            else:
                # Nothing to process yet: sleep briefly instead of spinning,
                # so this loop does not hog the CPU while the audio callback
                # is filling the queue.
                time.sleep(idle_sleep)

            # listen until the condition is met
            if stop_condition():
                break
        print('Done.')

We now apply this to make a listener to record training and testing data for our model:

In [4]:
########## A listener to record training/testing data. ##########

# a helper function to get nonnegative integer input
def get_int_input():
    """ Prompt (via input()) until the user enters a non-negative integer.

    Prints a short message and asks again after any response that is not an
    integer or is negative. Returns the accepted value as an int.
    """
    while True:
        response = input() # response is a string
        try:
            val = int(response)
        except ValueError:
            # Only a failed conversion means "not an integer"; any other
            # exception is allowed to propagate.
            print('Please enter an integer.')
            continue

        if val >= 0:
            return val
        print('Integer must be non-negative.')

def record_model_data(device=device):
    """ Prompts the user to label and record noise samples. Returns a dictionary with labels as keys
    and lists of flattened numpy arrays (one array per noise sample) as values.

    Loops until the user enters a blank label. Entering a label that was
    already used adds the new samples to that label's existing list.

    device: input device passed on to listen_and_process
    """

    noise_data_dict = {}
    noise_count = 0

    def _gather_and_progress(rec):
        """ Store one recorded noise and print the running count. """
        nonlocal noise_count

        listen.all_noises.append( rec )
        noise_count += 1
        print(noise_count, end=', ')

    while True:
        print('Enter text label for next noise (leave blank to exit):')
        label = input()
        if not label:
            # a blank label is the only way out of the loop
            return noise_data_dict
        if label in noise_data_dict:
            print('You have already recorded {} samples of this noise. You may now record more.'.format(
                    len(noise_data_dict[label]))
                 )

        print('How many noise samples would you like to record?')
        num = get_int_input()
        if num == 0:
            continue

        clear_output() # clear jupyter output
        print('Please start recording.\n')
        print('"{}" noises recorded (out of {}): '.format(label, num))
        noise_count = 0 # restart the count for this label
        listen_and_process(processing_function=_gather_and_progress,
                           stop_condition=lambda: noise_count >= num,
                           device=device,
                           print_after_processing=None)
        print('')

        # save the list of recorded noises; copy so the stored list is
        # independent of listen.all_noises
        recorded = listen.all_noises.copy()
        if label in noise_data_dict:
            noise_data_dict[label] += recorded
        else:
            noise_data_dict[label] = recorded

In [5]:
# Run the interactive recording session. The returned dictionary is not
# assigned to a variable here; it only appears as this cell's output.
record_model_data()

Please start recording.

"p" noises recorded (out of 50): 
Listening...
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, Done.

Enter text label for next noise (leave blank to exit):



{'t': [array([0.00042725, 0.00033569, 0.00024414, ..., 0.03942871, 0.04013062,
         0.04064941], dtype=float32),
  array([-0.00021362, -0.00033569, -0.00036621, ..., -0.00057983,
         -0.00079346, -0.00097656], dtype=float32),
  array([-0.00021362, -0.00021362, -0.00039673, ..., -0.05438232,
         -0.05169678, -0.04858398], dtype=float32),
  array([-3.05175781e-05,  3.05175781e-05, -6.10351562e-05, ...,
          1.26953125e-02,  1.32141113e-02,  1.42211914e-02], dtype=float32),
  array([-0.00012207, -0.00012207, -0.00018311, ..., -0.0630188 ,
         -0.05944824, -0.05529785], dtype=float32)],
 'p': [array([-0.00015259, -0.00015259, -0.00018311, ...,  0.00820923,
          0.00799561,  0.00775146], dtype=float32),
  array([ 3.0517578e-05,  6.1035156e-05,  9.1552734e-05, ...,
         -6.7138672e-04, -7.0190430e-04, -6.1035156e-04], dtype=float32),
  array([-0.00021362, -0.00021362, -0.00012207, ...,  0.00518799,
          0.00521851,  0.00506592], dtype=float32),
  array([

# Audio processing: generating spectrograms

In [None]:
N_MELS = 28                # the number of mel filterbanks in the spectrogram