In [1]:
import pyaudio
import matplotlib.mlab as mlab
import numpy as np
import sys
import time

from queue import Queue
from threading import Thread

In [33]:
SAMPLE_DURATION_MS = 5000
N_SAMPLES = 300

KERNEL_SIZE = 15
STRIDE = 4
FRAME_RATE = 48000
NFFT = 512
TX = int(FRAME_RATE * 0.0195)
FX = int(NFFT / 2) + 1
TY = round((TX - KERNEL_SIZE + STRIDE) / STRIDE)

MULTRIGGER_MODE = False

if MULTRIGGER_MODE:
    N_CLASSES = len({Path(k).parent for k in glob.glob("{}/positives/*/*.wav".format(RAW_DATA_DIR))}) + 1
else:
    N_CLASSES = 2


CHUNK_DURATION = 0.5 # Each read length in seconds from mic.
FS = 48000 # sampling rate for mic
CHUNK_SAMPLES = int(FS * CHUNK_DURATION) # Each read length in number of samples.

# Each model input data duration in seconds, need to be an integer numbers of chunk_duration
FEED_DURATION = 5
FEED_SAMPLES = int(FS * FEED_DURATION)

In [34]:
import tensorflow as tf


def seq_model(input_shape, n_classes, kernel_size, stride):
    """
    Function creating the model's graph in Keras.

    :param input_shape: shape of the model's input data (using Keras conventions)
    :param n_classes: n_classes to predict for the last dense layer
    :param kernel_size: kernel size of the first conv layer
    :param stride : stride_size of the first conv layer
    :return: Keras model instance
    """

    X_input = tf.keras.layers.Input(shape=input_shape)

    # Step 1: CONV layer (≈4 lines)
    X = tf.keras.layers.Conv1D(256, kernel_size=kernel_size, strides=stride)(X_input)  # CONV1D
    X = tf.keras.layers.BatchNormalization()(X)  # Batch normalization
    X = tf.keras.layers.Activation('relu')(X)  # ReLu activation
    X = tf.keras.layers.Dropout(0.1)(X)  # dropout (use 0.8)

    # Step 2: First GRU Layer (≈4 lines)
    X = tf.keras.layers.GRU(units=256, return_sequences=True)(X)  # GRU (use 128 units and return the sequences)
    X = tf.keras.layers.Dropout(0.1)(X)  # dropout (use 0.8)
    X = tf.keras.layers.BatchNormalization()(X)  # Batch normalization

    # Step 3: Second GRU Layer (≈4 lines)
    X = tf.keras.layers.GRU(units=256, return_sequences=True)(X)  # GRU (use 128 units and return the sequences)
    X = tf.keras.layers.Dropout(0.1)(X)  # dropout (use 0.8)
    X = tf.keras.layers.BatchNormalization()(X)  # Batch normalization

    # Step 4: Time-distributed dense layer (≈1 line)
    X = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_classes, activation="softmax"))(X)  # time distributed  (sigmoid)

    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model


In [35]:
def load_model(weights_dir):
    """
    Load our seq_model with the latest checkpoint
    :param weights_dir: directory where we have our checkpoints from our training
    :return: our sequence model with weights
    """
    latest = tf.train.latest_checkpoint(str(weights_dir))
    print(latest)
    model = seq_model(input_shape=(TX, FX),
                      n_classes=N_CLASSES,
                      kernel_size=KERNEL_SIZE,
                      stride=STRIDE)

    model.load_weights(latest)
    
    return model

In [36]:
model = load_model("../logs/checkpoints/")

../logs/checkpoints/cp-0020.ckpt


W0806 11:29:57.787000 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer
W0806 11:29:57.788345 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer.iter
W0806 11:29:57.789480 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer.beta_1
W0806 11:29:57.790293 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer.beta_2
W0806 11:29:57.791121 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer.decay
W0806 11:29:57.792778 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer.learning_rate
W0806 11:29:57.793840 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).layer_with_weights-0.kernel
W0806 11:29:57.794796 140734794945984 util.py:244] Unresolved object in checkpoint: (root).optimizer's state 'm' for (root).layer_with_weights-0.bias
W0806 11:29:57.795711 140734794945984 util.py:244] Un

In [37]:
def get_audio_input_stream(callback):
    stream = pyaudio.PyAudio().open(
        format=pyaudio.paInt16,
        channels=1,
        rate=FS,
        input=True,
        frames_per_buffer=CHUNK_SAMPLES,
        input_device_index=0,
        stream_callback=callback)
    return stream

In [38]:
def get_spectrogram(data, fs=2):
    """
    Function to compute a spectrogram.
    :param data: one channel / dual channel audio data as numpy array
    :return: spectrogram, 2-D array, columns are the periodograms of successive segments.
    """

    noverlap = 256
    nchannels = data.ndim
    if nchannels == 1:
        pxx, _, _ = mlab.specgram(data, NFFT, fs, noverlap=noverlap)
    elif nchannels == 2:
        pxx, _, _ = mlab.specgram(data[:, 0], NFFT, fs, noverlap=noverlap)

    return np.swapaxes(pxx, 0, 1)

In [39]:
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 936, 257)]        0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 231, 256)          987136    
_________________________________________________________________
batch_normalization_9 (Batch (None, 231, 256)          1024      
_________________________________________________________________
activation_3 (Activation)    (None, 231, 256)          0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 231, 256)          0         
_________________________________________________________________
gru_6 (GRU)                  (None, 231, 256)          393984    
_________________________________________________________________
dropout_10 (Dropout)         (None, 231, 256)          0   

In [40]:
def detect_triggerword_spectrum(model, x):
    """
    Function to predict the location of the trigger word.

    :param model: neural network that is use for inference
    :param x: spectrum of shape (TX, FX)
    :return: predictions -- numpy ndarray to shape (number of output time steps per num_classes)
    """
    x = np.expand_dims(x, axis=0)
    predictions = model.predict(x)
    return predictions[0]

In [41]:
def has_new_triggerword(predictions, chunk_duration, feed_duration, threshold=0.3):
    """
    Function to detect new trigger word in the latest chunk of input audio.
    It is looking for the rising edge of the predictions data belongs to the
    last/latest chunk.

    :param predictions:  predicted labels from model
    :param chunk_duration: time in second of a chunk
    :param feed_duration: time in second of the input to model
    :param threshold: threshold for probability above a certain to be considered positive
    :return: True if new trigger word detected in the latest chunk
    """

    predictions = predictions[:,1] > threshold
    chunk_predictions_samples = int(len(predictions) * chunk_duration / feed_duration)
    chunk_predictions = predictions[-chunk_predictions_samples:]
    level = chunk_predictions[0]
    for pred in chunk_predictions:
        if pred > level:
            return True
        else:
            level = pred
    return False

In [48]:
# Queue to communiate between the audio callback and main thread
q = Queue()

run = True

silence_threshold = 100

# Run the demo for a timeout seconds
timeout = time.time() + 0.5*60  # 0.5 minutes from now

# Data buffer for the input wavform
data = np.zeros(FEED_SAMPLES, dtype='int16')

def callback(in_data, frame_count, time_info, status):
    global run, timeout, data, silence_threshold    
    if time.time() > timeout:
        run = False        
    data0 = np.frombuffer(in_data, dtype='int16')
    if np.abs(data0).mean() < silence_threshold:
        sys.stdout.write('-')
        return (in_data, pyaudio.paContinue)
    else:
        sys.stdout.write('.')
    data = np.append(data,data0)    
    if len(data) > FEED_SAMPLES:
        data = data[-FEED_SAMPLES:]
        # Process data async by sending a queue.
        q.put(data)
    return (in_data, pyaudio.paContinue)

stream = get_audio_input_stream(callback)
stream.start_stream()

try:
    while run:
        data = q.get()
        spectrum = get_spectrogram(data)
        preds = detect_triggerword_spectrum(model, spectrum)
        print(preds[-1,1])
        new_trigger = has_new_triggerword(preds, CHUNK_DURATION, FEED_DURATION)
        if new_trigger:
            sys.stdout.write('1')
except (KeyboardInterrupt, SystemExit):
    stream.stop_stream()
    stream.close()
    timeout = time.time()
    run = False

stream.stop_stream()
stream.close()

---.0.008725041
-.0.0022164478
.0.0037720846
.0.0068239453
.0.00038079498
--------.0.0060485685
.0.024215013
-----.1.0
1.0.020444132
----.0.0055989707
.0.0040698904
.0.0031244028
.0.0046375897
-.0.006597542
---.0.006719475
.0.005733959
.0.0077943127
----------

In [45]:
stream.stop_stream()
stream.close()