In [1]:
import pyaudio
import numpy as np
import matplotlib.pyplot as plt
from scipy import fft
from scipy import signal
import tensorflow as tf
from tensorflow.python.keras import layers
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Conv2D, Flatten, Dense, MaxPooling2D, Dropout
from tensorflow.python.keras import regularizers
import tensorflow_io as tfio

%matplotlib tk

CHUNK = 16000
FORMAT = pyaudio.paFloat32
CHANNELS = 2
RATE = 16000
INDEX = 1

p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    print(p.get_device_info_by_index(i))

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Headset (Obsidian)', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Headphones (Obsidian)', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChan

In [2]:
p = pyaudio.PyAudio()
FORMAT = pyaudio.paFloat32
CHANNELS = 2
RATE = 48000
CHUNK = 1 * RATE
INDEX = 1
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    input_device_index= INDEX,
    frames_per_buffer=CHUNK
)

fig, ax = plt.subplots()
x = np.arange(0, 2 * CHUNK, 2)
line, = ax.plot(x, np.random.rand(CHUNK))
ax.set_ylim(-0.05, 0.05)
ax.set_xlim(0, CHUNK)
while True:
    data = stream.read(CHUNK)
    wave = np.frombuffer(data, dtype=np.float32)
    wave = np.reshape(wave, (int(len(wave) / 2), 2)).T
    wave = wave[1,:]
    line.set_ydata(wave)
    fig.canvas.draw()
    fig.canvas.flush_events()

KeyboardInterrupt: 

In [2]:
is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

print("Opening model")
loaded_model = tf.keras.models.load_model('checkpoint.model')
print("Model Loaded")

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
2.10.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Opening model
Model Loaded


In [4]:
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv_layer1 (Conv2D)        (None, 78, 129, 4)        40        
                                                                 
 max_pooling1 (MaxPooling2D)  (None, 39, 64, 4)        0         
                                                                 
 conv_layer2 (Conv2D)        (None, 39, 64, 6)         222       
                                                                 
 max_pooling2 (MaxPooling2D)  (None, 19, 32, 6)        0         
                                                                 
 conv_layer3 (Conv2D)        (None, 19, 32, 4)         220       
                                                                 
 max_pooling3 (MaxPooling2D)  (None, 9, 16, 4)         0         
                                                                 
 flatten (Flatten)           (None, 576)               0

In [5]:
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    input_device_index= INDEX,
    
    frames_per_buffer=CHUNK
)

while True:
    data = stream.read(CHUNK)
    wave = np.frombuffer(data, dtype=np.float32)
    wave = np.reshape(wave, (int(len(wave) / 2), 2)).T
    wave = wave[1,:]
    mean = np.mean(wave)
    std = np.std(wave)
    wave = (wave - mean) / std
    spectrogram = tf.signal.stft(
        wave, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)
    spectrogram = np.expand_dims(spectrogram, axis=2)
    spectrogram = np.expand_dims(spectrogram, axis=0)
    result = loaded_model.predict(spectrogram)
    result = np.squeeze(result, axis=0)
    print(result)
    estimate = np.argmax(result, axis=0)
    if result[estimate] > 0.9:
        if estimate == 0:
            print("Noise")
        elif estimate == 1:
            print("Not target")
        elif estimate == 2:
            print("Sheila")
        else:
            print("Marvin")
    else:
        print("Nothing")

[0.00821732 0.44531557 0.13744988 0.4090173 ]
Nothing
[0.00607893 0.42552778 0.13850312 0.42989016]
Nothing
[0.00894144 0.4229139  0.14124767 0.42689696]
Nothing
[0.00555039 0.42607743 0.13784315 0.43052906]
Nothing
[0.00146997 0.38474882 0.02925497 0.5845263 ]
Nothing
[0.00971079 0.85882133 0.05734537 0.07412254]
Nothing
[5.4679532e-04 1.2557599e-01 4.1930112e-03 8.6968422e-01]
Nothing
[0.00305041 0.42297304 0.10979445 0.46418217]
Nothing
[0.00890016 0.4609491  0.13859177 0.39155897]
Nothing
[5.9843973e-05 5.5367183e-02 7.4283736e-05 9.4449872e-01]
Marvin
[0.01646283 0.8351952  0.0848319  0.06351005]
Nothing
[0.00286257 0.4231351  0.12818311 0.4458192 ]
Nothing
[0.01975333 0.48706865 0.24780539 0.24537264]
Nothing
[6.8600662e-04 7.2616786e-01 2.4465458e-02 2.4868073e-01]
Nothing
[0.00249892 0.32476416 0.08851971 0.58421725]
Nothing
[0.00137677 0.2880545  0.0300378  0.68053085]
Nothing
[0.03844836 0.59590095 0.11840009 0.24725053]
Nothing
[6.8575004e-04 1.2956339e-01 1.1017948e-02 8.58

KeyboardInterrupt: 

In [3]:
from scipy.io.wavfile import write
CHUNK = 16000
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000
INDEX = 1

p = pyaudio.PyAudio()
RECORD_SECONDS = 3
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    input_device_index= INDEX,
    
    frames_per_buffer=CHUNK
)
print("* recording")

frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)

print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()
# wave = np.reshape((wave.shape[0] * RECORD_SECONDS, ))
# print(wave.shape)

* recording
* done recording


In [4]:
fig, ax = plt.subplots()
wave = np.frombuffer(frames[0], dtype=np.float32)
wave = np.reshape(wave, (int(len(wave) / 2), 2)).T
wave = wave[1,:]
temp_frames = wave
mean = np.mean(temp_frames)
# std = np.std(temp_frames)
# temp_frames = (temp_frames - mean) / std
total = np.sum((temp_frames - mean)**2)
temp_frames
print(temp_frames.shape)
ax.plot(np.arange(temp_frames .shape[0]), temp_frames )
plt.show()

(8000,)


In [58]:
from IPython import display
display.display(display.Audio(temp_frames, rate=16000))

In [25]:
play=pyaudio.PyAudio()
stream_play=play.open(format=FORMAT,
                      channels=CHANNELS,
                      rate=RATE,
                      output=True)
for data in frames: 
    stream_play.write(data)
stream_play.stop_stream()
stream_play.close()
play.terminate()