In [1]:
import pyaudio
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.ops import gen_audio_ops as audio_ops
from datetime import datetime

model = keras.models.load_model("fully_trained.model")

FORMAT = pyaudio.paFloat32
RATE = 16000
CHANNELS = 1
NOFFRAMES = 8000

audio = pyaudio.PyAudio()

info = audio.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
for i in range(0, numdevices):
    if (audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
        print("Input Device id ", i, " - ",
              audio.get_device_info_by_host_api_device_index(0, i).get('name'))


samples = np.zeros((8000))


def callback(input_data, frame_count, time_info, flags):
    global samples
    # print("Got audio " + str(frame_count))
    new_samples = np.frombuffer(input_data, np.float32)
    samples = np.concatenate((samples, new_samples))
    samples = samples[-16000:]

    if len(samples) == 16000:
        start = time.perf_counter()
        # normalise the samples
        normalised = samples - np.mean(samples)
        max = np.max(normalised)
        if max > 0:
            normalised = normalised / max

        # create the spectrogram
        spectrogram = audio_ops.audio_spectrogram(
            np.reshape(normalised, (16000, 1)),
            window_size=320,
            stride=160, 
            magnitude_squared=True)
        # reduce the number of frequency bins in our spectrogram to a more sensible level
        spectrogram = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, 6],
            strides=[1, 6],
            pooling_type='AVG',
            padding='SAME')
        # remove the first 1 index
        spectrogram = tf.squeeze(spectrogram, axis=0)
        spectrogram = np.log10(spectrogram + 1e-6)
        prediction = model.predict(np.reshape(spectrogram, (1, 99, 43, 1)))        
        
        index = 0
        max_value = 0
        for i in range(4):
            if prediction[0][i] > max_value:
                max_value = prediction[0][i]
                index = i
        
        dict_predict = {0 : 'backward', 1: 'right', 2:'down', 3:'invalid'}        
        if max_value > 0.95 and index != 3:
            print(
                f"{datetime.now().time()} - {dict_predict[index]}, and score: {max_value}")
        
        end = time.perf_counter()
        # print((end-start)*1000)

    return input_data, pyaudio.paContinue

Input Device id  0  -  Microsoft Sound Mapper - Input
Input Device id  1  -  Microphone Array (טכנולוגיית In


In [2]:
stream = audio.open(
    input_device_index=0,
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    stream_callback=callback,
    frames_per_buffer=NOFFRAMES)

stream.start_stream()
print("speak")

# wait for stream to finish (5)
while stream.is_active() and time.time:
    time.sleep(0.1)

speak
22:36:30.877372 - down, and score: 0.9674051403999329
22:36:31.887035 - down, and score: 0.9973248243331909
22:36:32.382826 - down, and score: 0.9923639297485352
22:36:39.382699 - backward, and score: 0.982703447341919
22:36:45.381325 - backward, and score: 0.9749098420143127
22:36:46.886326 - backward, and score: 0.9987333416938782
22:36:47.386016 - backward, and score: 0.9990378618240356
22:36:57.383696 - right, and score: 0.9990003705024719
22:37:01.399340 - right, and score: 0.9968649744987488
22:37:07.900947 - down, and score: 0.997103750705719
22:37:08.384736 - down, and score: 0.9983419179916382
22:37:08.901313 - down, and score: 0.9936980605125427
22:37:09.392738 - down, and score: 0.9899314641952515
22:37:11.887270 - right, and score: 0.9983471632003784
22:37:13.384424 - backward, and score: 0.9752418398857117
22:37:13.909998 - backward, and score: 0.9944887161254883
22:37:17.402691 - right, and score: 0.9790919423103333
22:37:19.411366 - down, and score: 0.9972470402717

KeyboardInterrupt: 

22:39:02.391551 - backward, and score: 0.9864878058433533
22:39:24.891513 - backward, and score: 0.9630892872810364
22:40:20.879542 - down, and score: 0.9740668535232544
22:40:24.380457 - backward, and score: 0.9653353691101074
22:40:44.903426 - right, and score: 0.9759765267372131
22:41:13.387851 - down, and score: 0.9795274138450623
22:41:34.413244 - backward, and score: 0.9782441854476929
22:41:35.903555 - down, and score: 0.9762193560600281
22:41:58.385180 - backward, and score: 0.9918127059936523
22:45:16.391935 - backward, and score: 0.9854515194892883
22:49:25.902941 - backward, and score: 0.9937897324562073
22:49:45.421312 - backward, and score: 0.9984399676322937
22:49:47.396686 - backward, and score: 0.9588635563850403
22:53:23.932221 - down, and score: 0.9914925694465637
22:55:09.414979 - down, and score: 0.9571579694747925
22:55:56.426337 - backward, and score: 0.9627906084060669
22:56:09.399993 - backward, and score: 0.9858469367027283
22:59:45.900233 - backward, and score

In [None]:
# stream.stop_stream()
# stream.close()
# p.terminate()
# print('done')
# plt.plot(decoded)
# plt.show()