In [1]:
import pyaudio
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.ops import gen_audio_ops as audio_ops
from datetime import datetime

model = keras.models.load_model("fully_trained.model")

FORMAT = pyaudio.paFloat32
RATE = 16000
CHANNELS = 1
NOFFRAMES = 8000

audio = pyaudio.PyAudio()

info = audio.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')
for i in range(0, numdevices):
    if (audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
        print("Input Device id ", i, " - ",
              audio.get_device_info_by_host_api_device_index(0, i).get('name'))


samples = np.zeros((8000))


def callback(input_data, frame_count, time_info, flags):
    global samples
    # print("Got audio " + str(frame_count))
    new_samples = np.frombuffer(input_data, np.float32)
    samples = np.concatenate((samples, new_samples))
    samples = samples[-16000:]

    if len(samples) == 16000:
        start = time.perf_counter()
        # normalise the samples
        normalised = samples - np.mean(samples)
        max = np.max(normalised)
        if max > 0:
            normalised = normalised / max

        # create the spectrogram
        spectrogram = audio_ops.audio_spectrogram(
            np.reshape(normalised, (16000, 1)),
            window_size=320,
            stride=160, 
            magnitude_squared=True)
        # reduce the number of frequency bins in our spectrogram to a more sensible level
        spectrogram = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, 6],
            strides=[1, 6],
            pooling_type='AVG',
            padding='SAME')
        # remove the first 1 index
        spectrogram = tf.squeeze(spectrogram, axis=0)
        spectrogram = np.log10(spectrogram + 1e-6)
        prediction = model.predict(np.reshape(spectrogram, (1, 99, 43, 1)))        
        
        index = 0
        max_value = 0
        for i in range(7):
            if prediction[0][i] > max_value:
                max_value = prediction[0][i]
                index = i
        
        dict_predict = {0 : 'forward', 1: 'backward', 2:'left', 3:'right', 4:'one', 5:'zero', 6:'invalid'}        
        if max_value > 0.95 and index != 6:
            print(
                f"{datetime.now().time()} - {dict_predict[index]}, and score: {max_value}")
        
        end = time.perf_counter()
        # print((end-start)*1000)

    return input_data, pyaudio.paContinue

Input Device id  0  -  Microsoft Sound Mapper - Input
Input Device id  1  -  Microphone Array (טכנולוגיית In


In [2]:
stream = audio.open(
    input_device_index=0,
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    stream_callback=callback,
    frames_per_buffer=NOFFRAMES)

stream.start_stream()
print("speak")

# wait for stream to finish (5)
while stream.is_active() and time.time:
    time.sleep(0.1)

speak
18:21:33.751207 - right, and score: 0.9937392473220825
18:21:34.241819 - right, and score: 0.9969931840896606
18:21:37.742705 - zero, and score: 0.9659212827682495
18:21:46.747116 - left, and score: 0.9734567999839783
18:21:51.741806 - backward, and score: 0.9999552965164185
18:22:08.781565 - zero, and score: 0.9893720746040344
18:22:09.346619 - zero, and score: 0.9988598823547363


KeyboardInterrupt: 

18:23:13.242907 - backward, and score: 0.9757394790649414
18:23:15.752092 - zero, and score: 0.9983800649642944
18:24:05.252164 - zero, and score: 0.9548250436782837
18:24:35.751509 - backward, and score: 0.999591052532196
18:24:42.744052 - right, and score: 0.9584351778030396
18:24:48.246540 - backward, and score: 0.9875519275665283
18:24:50.259382 - backward, and score: 0.9970458149909973
18:24:51.745676 - right, and score: 0.9999769926071167
18:24:57.252624 - backward, and score: 0.9999302625656128
18:25:02.744146 - one, and score: 0.999485969543457
18:25:03.249482 - zero, and score: 0.9833173751831055
18:25:03.781106 - zero, and score: 0.9986594915390015
18:25:32.741961 - backward, and score: 0.998521625995636
18:25:33.278212 - backward, and score: 0.9921344518661499
18:25:40.745534 - right, and score: 0.9997754693031311
18:25:49.741331 - zero, and score: 0.9603376388549805
18:26:00.258973 - backward, and score: 0.9545948505401611
18:26:02.245844 - left, and score: 0.95601075887680

In [None]:
# stream.stop_stream()
# stream.close()
# p.terminate()
# print('done')
# plt.plot(decoded)
# plt.show()