In [1]:
import time
from queue import Empty, Queue
from threading import Thread

import matplotlib.mlab as mlab
import numpy as np
import pyaudio
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import sys
import os

# Make the project root (the parent of the notebook's working directory)
# importable so that the `src.*` packages resolve. os.path.dirname is used
# instead of splitting on "/" so this also works with non-POSIX separators,
# and the membership test keeps re-running the cell from adding duplicates.
_project_root = os.path.dirname(os.getcwd())
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [3]:
from src.settings.general import FRAME_RATE

In [4]:
from src.settings.trigger import TX, FX, N_CLASSES, TRIGGER_KERNEL_SIZE, TRIGGER_STRIDE

In [5]:
def trigger_model(input_shape, n_classes, kernel_size, stride):
    """
    Build the trigger-word detection network with the Keras functional API.

    The graph is: Conv1D -> BatchNorm -> ReLU -> Dropout, followed by two
    GRU -> Dropout -> BatchNorm stages, and a time-distributed softmax layer
    that emits one class distribution per output time step. The model summary
    is printed as a side effect.

    :param input_shape: shape of the model's input data (using Keras conventions)
    :param n_classes: number of classes predicted by the last dense layer
    :param kernel_size: kernel size of the first conv layer
    :param stride: stride of the first conv layer
    :return: Keras model instance
    """

    X_input = tf.keras.layers.Input(shape=input_shape)

    # Step 1: convolutional front-end (512 filters), dropout rate 0.2.
    X = tf.keras.layers.Conv1D(512, kernel_size=kernel_size, strides=stride)(X_input)
    X = tf.keras.layers.BatchNormalization()(X)
    X = tf.keras.layers.Activation('relu')(X)
    X = tf.keras.layers.Dropout(0.2)(X)

    # Step 2: first GRU layer (256 units, returns the full sequence).
    X = tf.keras.layers.GRU(units=256, return_sequences=True)(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.BatchNormalization()(X)

    # Step 3: second GRU layer (256 units, returns the full sequence).
    X = tf.keras.layers.GRU(units=256, return_sequences=True)(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.BatchNormalization()(X)

    # Step 4: per-time-step softmax over the n_classes outputs.
    X = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_classes, activation="softmax"))(X)

    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()

    return model

In [6]:
# Locate the most recent training checkpoint. latest_checkpoint() returns None
# when the directory holds no checkpoint, so fail early with a clear message
# instead of passing None on to load_weights().
latest = tf.train.latest_checkpoint("../logs/trigger/checkpoints/")
if latest is None:
    raise FileNotFoundError("No checkpoint found in ../logs/trigger/checkpoints/")

# Rebuild the architecture with the project settings, then restore the weights.
model = trigger_model(input_shape=(TX, FX),
                      n_classes=N_CLASSES,
                      kernel_size=TRIGGER_KERNEL_SIZE,
                      stride=TRIGGER_STRIDE)

model.load_weights(latest)

W0814 11:30:58.582641 140734894839232 deprecation.py:506] From /Users/az01640/Projets/multrigger-word/.venv/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 860, 257)]        0         
_________________________________________________________________
conv1d (Conv1D)              (None, 212, 512)          1974272   
_________________________________________________________________
batch_normalization (BatchNo (None, 212, 512)          2048      
_________________________________________________________________
activation (Activation)      (None, 212, 512)          0         
_________________________________________________________________
dropout (Dropout)            (None, 212, 512)          0         
_________________________________________________________________
gru (GRU)                    (None, 212, 256)          590592    
_________________________________________________________________
dropout_1 (Dropout)          (None, 212, 256)          0     

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x14199ecf8>

In [7]:
def get_audio_input_stream(callback, frames_per_buffer=None, input_device_index=0):
    """
    Open a mono, 16-bit PyAudio input stream sampled at FRAME_RATE.

    :param callback: function passed to PyAudio as ``stream_callback``
    :param frames_per_buffer: number of frames per buffer; when None the
        module-level CHUNK_SAMPLES is used (looked up at call time, because it
        is defined in a later cell than this function)
    :param input_device_index: index of the input device to open (default 0,
        the value that was previously hard-coded)
    :return: the opened PyAudio stream
    """
    if frames_per_buffer is None:
        frames_per_buffer = CHUNK_SAMPLES
    # NOTE: the PyAudio instance is not kept, so the caller has no handle on
    # which to call terminate(); only the stream itself can be closed.
    stream = pyaudio.PyAudio().open(
        format=pyaudio.paInt16,
        channels=1,
        rate=FRAME_RATE,
        input=True,
        frames_per_buffer=frames_per_buffer,
        input_device_index=input_device_index,
        stream_callback=callback)
    return stream

In [8]:
from src.utils.audio import load_raw_audio

In [9]:
# Load the raw audio clips; the positive clips are keyed by their label.
positives, backgrounds = load_raw_audio()
positive_labels = sorted(positives.keys())

# Class index -> label name: positive labels are numbered from 1 in sorted
# order, and index 0 is reserved for the background class.
MAP_DICT = {index: label for index, label in enumerate(positive_labels, start=1)}
MAP_DICT[0] = "background"

In [17]:
def detect_triggerword_spectrum(model, x):
    """
    Run the model on a single spectrum and return its predictions.

    A leading batch axis of size one is added before calling
    ``model.predict``, and the single result is unwrapped again.

    :param model: neural network used for inference
    :param x: spectrum of shape (TX, FX)
    :return: predictions -- numpy ndarray of shape (number of output time steps, num_classes)
    """
    batch = np.asanyarray(x)[np.newaxis, ...]
    batch_predictions = model.predict(batch)
    return batch_predictions[0]

In [18]:
def has_new_triggerword(predictions, chunk_duration, feed_duration, threshold=0.3):
    """
    Detect a new trigger word in the latest chunk of input audio.

    Column 1 of ``predictions`` is thresholded, the trailing time steps that
    correspond to the latest chunk are selected, and the function reports
    whether that window contains a rising edge (a below-threshold step
    immediately followed by an above-threshold step).

    :param predictions: 2-D array of model outputs, indexed as [time step, class]
    :param chunk_duration: duration in seconds of the latest chunk
    :param feed_duration: duration in seconds of the whole input fed to the model
    :param threshold: probability above which a time step counts as positive
    :return: True if a rising edge is found in the latest chunk, False otherwise
        (including when there are no predictions or the chunk covers no step)
    """
    active = np.asarray(predictions)[:, 1] > threshold
    n_window = int(len(active) * chunk_duration / feed_duration)
    # Guard the degenerate window: slicing with [-0:] would select the whole
    # array rather than nothing, and an empty window has no edge to find.
    if n_window <= 0:
        return False
    window = active[-n_window:]
    # A rising edge is an inactive step directly followed by an active one.
    return bool(np.any(window[1:] & ~window[:-1]))

In [19]:
from src.utils.audio import get_spectrogram
from src.trigger.make_dataset import transform_labels

In [20]:
import seaborn as sns

In [30]:
# Hand-off queue: the audio callback puts buffers on it, the main thread reads them.
q = Queue()

# Main-loop flag; the audio callback clears it once `timeout` has passed.
run = True

# Mean absolute amplitude below which the callback treats a buffer as silence.
silence_threshold = 100

# Length of each read from the microphone, in seconds and in samples.
# CHUNK_SAMPLES is also the size of the rolling `data` buffer below.
CHUNK_DURATION = 5
CHUNK_SAMPLES = int(CHUNK_DURATION * FRAME_RATE)

# Passed to has_new_triggerword() as the duration of the model input, in
# seconds, together with its equivalent in samples.
FEED_DURATION = 0.5
FEED_SAMPLES = int(FEED_DURATION * FRAME_RATE)

# Deadline for the demo: half a minute from the moment this cell runs.
timeout = time.time() + 0.5 * 60

# Rolling buffer for the input waveform, initially all zeros.
data = np.zeros(CHUNK_SAMPLES, dtype=np.int16)

def callback(in_data, frame_count, time_info, status):
    """
    PyAudio stream callback: buffer non-silent audio and hand it to the main thread.

    Clears the module-level ``run`` flag once ``timeout`` has passed. Buffers
    whose mean absolute amplitude is below ``silence_threshold`` are skipped
    (a '-' is written to stdout); otherwise a '.' is written, the samples are
    appended to the rolling ``data`` buffer, the buffer is trimmed to its last
    CHUNK_SAMPLES samples and put on the queue ``q``.

    :param in_data: raw bytes of recorded audio, decoded here as int16 samples
    :param frame_count: unused
    :param time_info: unused
    :param status: unused
    :return: tuple ``(in_data, pyaudio.paContinue)``
    """
    # Only the names that are rebound need a global declaration.
    global run, data
    if time.time() > timeout:
        run = False
    samples = np.frombuffer(in_data, dtype='int16')
    # Widen before abs(): in int16, abs(-32768) wraps back to -32768, which
    # would drag the mean down and could classify clipped audio as silence.
    if np.abs(samples.astype(np.int32)).mean() < silence_threshold:
        sys.stdout.write('-')
        return (in_data, pyaudio.paContinue)
    sys.stdout.write('.')
    data = np.append(data, samples)
    if len(data) > CHUNK_SAMPLES:
        data = data[-CHUNK_SAMPLES:]
        # Process data async by sending it through the queue.
        q.put(data)
    return (in_data, pyaudio.paContinue)

# Start recording; `callback` pushes buffered audio onto `q`.
stream = get_audio_input_stream(callback)
stream.start_stream()

try:
    # Check the deadline here as well as the `run` flag, so the loop ends on
    # time even if the callback has not fired since the deadline passed.
    while run and time.time() <= timeout:
        try:
            # Bounded wait: a plain q.get() blocks forever when only silent
            # buffers arrive, because the callback never enqueues them.
            chunk = q.get(timeout=1.0)
        except Empty:
            continue
        # Use a local name rather than `data`, which is the callback's rolling
        # buffer; rebinding it here could replace it with an older buffer.
        print(timeout - time.time())
        spectrum = get_spectrogram(chunk)
        preds = detect_triggerword_spectrum(model, np.swapaxes(spectrum, 0, 1))
        new_trigger = has_new_triggerword(preds, CHUNK_DURATION, FEED_DURATION)
        if new_trigger:
            sys.stdout.write('1')
except (KeyboardInterrupt, SystemExit):
    # Interrupted by the user: expire the deadline so the callback stops too.
    timeout = time.time()
finally:
    # Single cleanup path for normal exit, interruption and errors.
    run = False
    stream.stop_stream()
    stream.close()

.27.761600017547607
.24.97511386871338
.22.745832204818726
.19.95950198173523
--.12.715651988983154
.9.928544998168945
.7.699174165725708
--.-0.00853109359741211
---------------

In [28]:
# Manual cleanup: stop and close the audio input stream.
# NOTE(review): this cell's execution count (28) is lower than that of the
# demo cell above (30), which already stops and closes the stream after its
# loop -- this looks like a leftover from interactive use; confirm it is
# still needed.
stream.stop_stream()
stream.close()