# Audio Recognition Inference Using TensorFlow
### Speech Recognition for Controlling a Robot (THAI COMMAND EDITION)
#### By Arunwat Moonbung

In [1]:
import json
import math
import os
import pyaudio
#import alsaaudio
import time
import wave

import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelBinarizer

import tensorflow as tf
import tensorflow.keras as keras

In [2]:
class Keyword_realtime_demo:
    """Keyword-spotting demo: record a short clip, extract MFCC features and classify them.

    The Keras model is loaded from ``model_path`` and the class names are the keys of
    the JSON object stored in ``text_labels``. A prediction whose confidence is below
    ``conf`` is reported with the label "ไม่มั่นใจ" instead of the class name.
    """

    # Label reported when the best class is below the confidence threshold
    # (also returned by start() when there is nothing to classify).
    UNSURE_LABEL = "ไม่มั่นใจ"
    # Shape of the all-zero placeholder input returned when recording/conversion fails.
    FALLBACK_SHAPE = (1, 32, 40, 1)

    def __init__(self, model_path, text_labels, plot=False, conf=0.7):
        """Load the model and the label list.

        Args:
            model_path: path of the saved Keras model.
            text_labels: path of a JSON file whose keys are the class names.
            plot: when True, preprocess_data_test() plots the waveform of every segment.
            conf: minimum confidence for a prediction to be accepted.

        A missing model or label file is reported on stdout and leaves the
        corresponding attribute set to None.
        """
        if os.path.exists(model_path):
            self.model = keras.models.load_model(model_path)
        else:
            print("!: Unable to Load model, Model directory not found.")
            self.model = None
        if os.path.exists(text_labels):
            with open(text_labels, "r", encoding="utf8") as f:
                data = json.load(f)
            self.text_labels = list(data.keys())
        else:
            print("!: Unable to Load labels, Label file not found.")
            self.text_labels = None
        self.plot = plot
        self.conf = conf

    def start(self, DURATION):
        """Record DURATION seconds through ALSA and classify every one-second segment.

        Returns:
            ``[label, confidence]`` of the segment with the highest confidence, or
            ``[UNSURE_LABEL, 0.0]`` when preprocessing produced no usable segment.
        """
        MFCCs = self.record_audio_alsa(DURATION=DURATION)
        sT = time.time()
        if not MFCCs:
            # np.argmax raises on an empty sequence, so bail out explicitly.
            print("!: No valid audio segment to classify.")
            return [self.UNSURE_LABEL, 0.0]
        PREDICTED_LABELS = []
        PREDICTED_CONFS = []
        for MFCC in MFCCs:
            predicted_label, predicted_conf = self.predict(MFCC)
            PREDICTED_LABELS.append(predicted_label)
            PREDICTED_CONFS.append(predicted_conf)
        # Keep the segment the model was most confident about.
        FINAL_RESULTS_INDEX = int(np.argmax(PREDICTED_CONFS))
        FINAL_RESULTS = [PREDICTED_LABELS[FINAL_RESULTS_INDEX], PREDICTED_CONFS[FINAL_RESULTS_INDEX]]
        print(f"R: SUMMARIZE KEYWORDS DETECTED '{FINAL_RESULTS[0]}' with Confidence {FINAL_RESULTS[1]*100:.2f}%\n")
        eT = time.time()
        print(f"INFERENCE USING {eT - sT}")
        return FINAL_RESULTS

    def predict(self, MFCCs_INPUT):
        """Classify one model input and apply the confidence threshold.

        Args:
            MFCCs_INPUT: a batch containing one sample; the rest of this class passes
                arrays shaped (1, frames, n_mfcc, 1).

        Returns:
            ``(label, confidence)``; the label is UNSURE_LABEL when the confidence is
            below ``self.conf``.

        Raises:
            RuntimeError: if the model or the label list failed to load in __init__.
        """
        if self.model is None or self.text_labels is None:
            raise RuntimeError("Model or label list is not loaded; check model_path and text_labels.")
        predictions = self.model.predict(MFCCs_INPUT)
        # Only the first (and only) sample of the batch is inspected.
        class_probs = predictions[0]
        predicted_index = int(np.argmax(class_probs))
        predicted_conf = class_probs[predicted_index]
        predicted_label = self.text_labels[predicted_index]
        if predicted_conf < self.conf:
            print(f"!: ERROR TO DETECT KEYWORD | '{predicted_label}' WITH LOW CONFIDENCE {predicted_conf*100:.2f}%")
            predicted_label = self.UNSURE_LABEL
        else:
            print(f"#: KEYWORD DETECTED | '{predicted_label}' WITH CONFIDENCE: {predicted_conf*100:.2f}%")
        return predicted_label, predicted_conf

    def preprocess_data_test(self, file_path, DURATION=1, n_mfcc=40, n_fft=4096, hop_length=512, NUM_SAMPLES_TO_CONSIDER=16000):
        """Load an audio file and turn it into one scaled MFCC input per second of audio.

        The file is loaded at NUM_SAMPLES_TO_CONSIDER Hz, forced to exactly DURATION
        seconds (truncated, or left-padded with zeros) and split into DURATION segments.
        Each segment's MFCC matrix is standardized per coefficient and reshaped to
        (1, frames, n_mfcc, 1).

        Returns:
            List of arrays, one per segment whose frame count matches the expected value.
        """
        # One segment per second of audio.
        NUM_SAMPLES_PER_SEGMENT = NUM_SAMPLES_TO_CONSIDER
        # librosa centers frames by default, giving 1 + n_samples // hop_length frames.
        # (ceil(n_samples / hop_length) is one short whenever hop_length divides n_samples.)
        EXPECTED_MFCC = 1 + NUM_SAMPLES_PER_SEGMENT // hop_length
        signal, sr = librosa.load(file_path, sr=NUM_SAMPLES_TO_CONSIDER)
        signal = self.pad_audio_sec(signal, DURATION, sr)
        scaler = StandardScaler()
        MFCCs_scaled = []
        for s in range(DURATION):
            START_SAMPLE = NUM_SAMPLES_PER_SEGMENT * s
            END_SAMPLE = START_SAMPLE + NUM_SAMPLES_PER_SEGMENT
            segment = signal[START_SAMPLE:END_SAMPLE]
            # Transpose to (frames, n_mfcc) so the scaler standardizes each coefficient.
            MFCC = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc,
                                        hop_length=hop_length, n_fft=n_fft).T

            if self.plot:
                librosa.display.waveshow(segment, sr=sr)
                plt.title(f"Audio Signal of {file_path} {s+1} of {DURATION}")
                plt.xlabel("Time (sec)")
                plt.ylabel("Amplitude")
                plt.show()

            if len(MFCC) != EXPECTED_MFCC:
                # Skip segments with an unexpected number of frames.
                continue
            MFCC_scaled = scaler.fit_transform(MFCC)
            # Add the channel axis and the batch axis: (1, frames, n_mfcc, 1).
            MFCCs_scaled.append(MFCC_scaled[np.newaxis, ..., np.newaxis])
        return MFCCs_scaled

    def record_audio(self, DURATION, CHANNELS=1, FORMAT=pyaudio.paInt16, n_fft=4096, NUM_SAMPLES_TO_CONSIDER=16000):
        """Record DURATION seconds with PyAudio, save them to cacheSound.wav and preprocess.

        Audio is read in chunks of n_fft frames. Errors while opening or reading the
        device propagate to the caller (the stream and PyAudio instance are still
        released); errors while saving or preprocessing yield an all-zero placeholder.

        Returns:
            The list produced by preprocess_data_test(), or ``[zeros(FALLBACK_SHAPE)]``.
        """
        print("####################################################################")
        print(f"#: START RECORDING FOR {DURATION} SEC.. | PLEASE SPEAK KEYWORD NOW!!")
        print("####################################################################")
        self.pyA = pyaudio.PyAudio()
        try:
            # Query the sample width while the PyAudio instance is still alive.
            sample_width = self.pyA.get_sample_size(FORMAT)
            self.pyAstream = self.pyA.open(format=FORMAT, channels=CHANNELS,
                                           rate=NUM_SAMPLES_TO_CONSIDER, input=True, output=False,
                                           frames_per_buffer=n_fft)
            try:
                num_chunks = int(round(NUM_SAMPLES_TO_CONSIDER / n_fft * DURATION))
                frames = [self.pyAstream.read(n_fft) for _ in range(num_chunks)]
            finally:
                self.pyAstream.stop_stream()
                self.pyAstream.close()
        finally:
            self.pyA.terminate()
        print("#: STOP RECORDING, SAVING CACHE FILE..")

        # Save the recording to cacheSound.wav so it can be read back by librosa.
        try:
            with wave.open('cacheSound.wav', 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(sample_width)
                wf.setframerate(NUM_SAMPLES_TO_CONSIDER)
                wf.writeframes(b''.join(frames))
            return self.preprocess_data_test('cacheSound.wav', DURATION=DURATION)
        except Exception as e:
            print("!: An Error occuring while saving .wav file")
            print("!: This will occur non keywords detected.")
            print(e)
            return [np.zeros(self.FALLBACK_SHAPE)]

    def record_audio_alsa(self, DURATION=1, CHANNELS=1, FORMAT='int16', n_fft=4096, NUM_SAMPLES_TO_CONSIDER=16000, device='default'):
        """Record DURATION seconds through ALSA, convert raw PCM to WAV and preprocess.

        The capture is written to cacheSound.raw as signed 16-bit little-endian PCM and
        then wrapped into cacheSound.wav. FORMAT and n_fft are accepted for signature
        compatibility but are not used.

        Returns:
            The list produced by preprocess_data_test(), or ``[zeros(FALLBACK_SHAPE)]``
            when recording or conversion fails.
        """
        print("####################################################################")
        print(f"#: START RECORDING FOR {DURATION} SEC.. | PLEASE SPEAK KEYWORD NOW!!")
        print("####################################################################")
        # Number of 16-bit values to capture: frames * channels.
        target_values = NUM_SAMPLES_TO_CONSIDER * DURATION * CHANNELS
        try:
            # Imported here because the module-level import is disabled; an ImportError
            # is reported like any other recording failure.
            import alsaaudio
            recorder = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                                     channels=CHANNELS, rate=NUM_SAMPLES_TO_CONSIDER,
                                     format=alsaaudio.PCM_FORMAT_S16_LE, periodsize=2000, device=device)
            try:
                with open("cacheSound.raw", "wb") as rf:
                    captured = 0
                    while captured < target_values:
                        _length, data = recorder.read()
                        if data:
                            rf.write(data)
                            captured += len(data) // 2  # two bytes per int16 value
                            time.sleep(.001)
            finally:
                recorder.close()
        except Exception as e:
            print("!: An Error occuring while recording & saving .raw file")
            print(e)
            print("!: This will occur non keywords detected.")
            # Do not fall through: cacheSound.raw may be missing or left over from an earlier run.
            return [np.zeros(self.FALLBACK_SHAPE)]
        print("#: STOP RECORDING, SAVING AUDIO .RAW FILE..")
        print("#: CONVERTING AUDIO RAW FILE -> .WAV FILE..")

        try:
            with open("cacheSound.raw", "rb") as inp_f:
                data = inp_f.read()
            with wave.open("cacheSound.wav", "wb") as out_f:
                out_f.setnchannels(CHANNELS)
                out_f.setsampwidth(2)
                out_f.setframerate(NUM_SAMPLES_TO_CONSIDER)
                out_f.writeframesraw(data)
            print("#: WAV FILES HAS BEEN CONVERTED, SENDING TO AUDIO PREPROCESSING.")
            print("####################################################################")
            return self.preprocess_data_test('cacheSound.wav', DURATION=DURATION)
        except Exception as e:
            print("!: An Error occuring while converting raw file to .wav file")
            print("!: This will occur non keywords detected.")
            print(e)
            return [np.zeros(self.FALLBACK_SHAPE)]

    def pad_audio(self, signal, NUM_SAMPLES_TO_CONSIDER):
        """Return signal with exactly NUM_SAMPLES_TO_CONSIDER samples.

        Longer signals are truncated at the end; shorter ones are padded with zeros
        at the front.
        """
        missing = NUM_SAMPLES_TO_CONSIDER - len(signal)
        if missing <= 0:
            return signal[:NUM_SAMPLES_TO_CONSIDER]
        return np.pad(signal, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

    def pad_audio_sec(self, signal, DURATION, NUM_SAMPLES_TO_CONSIDER):
        """Return signal with exactly DURATION * NUM_SAMPLES_TO_CONSIDER samples.

        Same truncate / front-pad rule as pad_audio().
        """
        return self.pad_audio(signal, DURATION * NUM_SAMPLES_TO_CONSIDER)

In [None]:
# Build the demo object: trained model, Thai class map, plotting off, 85 % confidence threshold.
Keyword_test = Keyword_realtime_demo(
    model_path='models/model.h5',
    text_labels='Data_Thai/classmap.json',
    plot=False,
    conf=0.85,
)

In [None]:
# Live demo loop: record and classify one second of audio, wait three seconds, repeat.
# Interrupt the kernel (Ctrl+C / KeyboardInterrupt) to stop.
try:
    while True:
        print("")
        Keyword_test.start(DURATION=1)
        print("")
        # Pause between recordings
        time.sleep(3)
except KeyboardInterrupt:
    print("PROCESS END")

In [None]:
# python SpeechControlTH_RT_ZCU104.py -m models/model.h5 -l Data_Thai/classmap.json -c 0.7 -dl 3 -d 1

In [None]:
# TEST ONCE WITH TEST SET
# Predict every file found under Data_Thai/test (os.walk also descends into sub-folders).
for dirpath, _dirnames, filenames in os.walk(os.path.join("Data_Thai", "test")):
    for filename in filenames:
        print(f"PREDICTING FILE: {filename}")
        # Join with dirpath (not the fixed top folder) so files in nested folders resolve.
        MFCCs_TEST = Keyword_test.preprocess_data_test(os.path.join(dirpath, filename), DURATION=1)
        for MFCC_TEST in MFCCs_TEST:
            # predict() prints the outcome itself; result keeps the last (label, confidence).
            result = Keyword_test.predict(MFCC_TEST)
            #print(result)

### DEBUG - TEST PART

In [None]:
# DEBUGGING | SOUND RECORDING SYSTEM V1 (PYAUDIO)
# Record one second through PyAudio; this writes cacheSound.wav, which the cells below read.
Keyword_test.record_audio(DURATION=1)
print("FINISH")

In [None]:
# DEBUGGING | SOUND RECORDING SYSTEM V2 (PYALSAAUDIO)
# Keyword_test.record_audio_alsa(DURATION=1)
# print("FINISH")

In [None]:
# DEBUGGING | PREPROCESS AUDIO DATA INTO MFCCs READY SHAPE
# DEBUGGING | PREDICTION
# Preprocess the cached recording and run every resulting segment through the model.
MFCCs_DEBUG = Keyword_test.preprocess_data_test('cacheSound.wav', DURATION=1)
print(len(MFCCs_DEBUG))
# preprocess_data_test() can return an empty list, so only index it when non-empty.
if MFCCs_DEBUG:
    print(MFCCs_DEBUG[0].shape)
for MFCC_DEBUG in MFCCs_DEBUG:
    result = Keyword_test.predict(MFCC_DEBUG)
    print(result)

In [None]:
# Collect the files directly inside Data_Thai/quantizedata (top level only).
quantize_dir = os.path.join("Data_Thai", "quantizedata")
fileQuantizeWAV = []
for (dirpath, dirnames, filenames) in os.walk(quantize_dir):
    fileQuantizeWAV.extend(filenames)
    break  # first os.walk() entry only: do not descend into sub-folders

# Keep the first preprocessed segment of every file.
audio_dumps = []
for filename in fileQuantizeWAV:
    MFCCs_DEBUG = Keyword_test.preprocess_data_test(os.path.join(quantize_dir, filename), DURATION=1)
    if not MFCCs_DEBUG:
        # No segment had the expected frame count; indexing [0] would raise IndexError.
        print(f"!: SKIPPING {filename}, NO VALID SEGMENT")
        continue
    audio_dumps.append(MFCCs_DEBUG[0])

# Stack all samples into one array and store it as audio_array.npy.
numpy_dumps = np.array(audio_dumps)
np.save('audio_array', numpy_dumps)

# load_np = np.load('audio_array.npy')

In [None]:

#Keyword_test.record_audio(DURATION=1)
# Load the cached recording at 16 kHz.
signal, sr = librosa.load('cacheSound.wav', sr=16000)

# Force the clip to exactly one second (sr samples): truncate, or left-pad with zeros.
if len(signal) >= sr:
    signal = signal[:sr]
else:
    signal = np.pad(signal, pad_width=(sr - len(signal), 0), mode='constant', constant_values=(0, 0))

# Pass sr explicitly: librosa.feature.mfcc otherwise assumes 22050 Hz, which does not
# match the 16 kHz signal loaded above.
MFCCs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40,
                             hop_length=512,
                             n_fft=4096)
# Transpose to (frames, n_mfcc) so each coefficient is standardized over time.
MFCCs = MFCCs.T
scaler = StandardScaler()
MFCCs_scaled = scaler.fit_transform(MFCCs)
# Add the channel axis and the batch axis: (1, frames, n_mfcc, 1).
MFCCs_scaled = MFCCs_scaled.reshape(MFCCs_scaled.shape[0], MFCCs_scaled.shape[1], 1)
MFCCs_scaled = MFCCs_scaled[np.newaxis, ...]
librosa.display.waveshow(signal, sr=sr)
#librosa.display.specshow(MFCCs, sr=16000, hop_length=512)
plt.title("Sample of Cache Sound Signal")
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
#plt.colorbar()
plt.show()
#print(np.min(signal), np.max(signal))
#print(len(signal))
#print(MFCCs_scaled.shape)
#print(MFCCs_scaled)

In [None]:
# Load the cached recording at 16 kHz and plot its raw waveform.
signal2, sr2 = librosa.load('cacheSound.wav', sr=16000)
# Pass sr explicitly: librosa.feature.mfcc otherwise assumes 22050 Hz.
MFCCs = librosa.feature.mfcc(y=signal2, sr=sr2, n_mfcc=40,
                             hop_length=512,
                             n_fft=4096)
librosa.display.waveshow(signal2, sr=sr2)
plt.xlabel("TIME")
plt.ylabel("AMPLITUDE")
plt.show()

##### DEBUG ALSAAUDIO (RECORD) + (PLAYBACK)

In [None]:
import wave
import alsaaudio
import time
import numpy as np
from playsound import playsound

def record_audio_alsa(DURATION=1, CHANNELS=1, FORMAT='int16', n_fft=4096, NUM_SAMPLES_TO_CONSIDER=16000, device='default'):
    """Record DURATION seconds through ALSA and store them as cacheSound.wav.

    The capture is written to cacheSound.raw as signed 16-bit little-endian PCM and
    then wrapped into a WAV file. FORMAT and n_fft are accepted for signature
    compatibility but are not used.

    Returns:
        "RECORDING COMPLETED" on success, "RECORDING FAILED" when recording or
        conversion fails (the error is printed).
    """
    print("####################################################################")
    print(f"#: START RECORDING FOR {DURATION} SEC.. | PLEASE SPEAK KEYWORD NOW!!")
    print("####################################################################")
    # Number of 16-bit values to capture: frames * channels.
    target_values = NUM_SAMPLES_TO_CONSIDER * DURATION * CHANNELS
    try:
        recorder = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                                 channels=CHANNELS, rate=NUM_SAMPLES_TO_CONSIDER,
                                 format=alsaaudio.PCM_FORMAT_S16_LE, periodsize=2000, device=device)
        try:
            with open("cacheSound.raw", "wb") as rf:
                captured = 0
                while captured < target_values:
                    _length, data = recorder.read()
                    if data:
                        rf.write(data)
                        captured += len(data) // 2  # two bytes per int16 value
                        time.sleep(.001)
        finally:
            recorder.close()
    except Exception as e:
        print("!: An Error occuring while recording & saving .raw file")
        print(e)
        # Do not fall through: cacheSound.raw may be missing or left over from an earlier run.
        return "RECORDING FAILED"
    print("#: STOP RECORDING, SAVING AUDIO .RAW FILE..")
    print("#: CONVERTING AUDIO RAW FILE -> .WAV FILE..")

    try:
        with open("cacheSound.raw", "rb") as inp_f:
            data = inp_f.read()
        with wave.open("cacheSound.wav", "wb") as out_f:
            out_f.setnchannels(CHANNELS)
            out_f.setsampwidth(2)
            out_f.setframerate(NUM_SAMPLES_TO_CONSIDER)
            out_f.writeframesraw(data)
        print("#: WAV FILES HAS BEEN CONVERTED, SENDING TO AUDIO PREPROCESSING.")
        print("####################################################################")
        return "RECORDING COMPLETED"
    except Exception as e:
        print("!: An Error occuring while converting raw file to .wav file")
        print("!: This will occur non keywords detected.")
        print(e)
        return "RECORDING FAILED"

In [None]:
# Record five seconds with the standalone ALSA helper defined above.
record_audio_alsa(DURATION=5)
print("FINISH")


In [None]:
# Play back the cached recording to check it by ear.
playsound('cacheSound.wav')

In [3]:
#CUT LAST LAYER
#new_model = model.layers[-1].output
# Load the trained model and build a new model that stops at the second-to-last layer.
model = keras.models.load_model('models_v3/model-best.h5')
# Same inputs as the original; the output is taken from layers[-2], so the final layer is dropped.
model_new = keras.models.Model(inputs=model.inputs, outputs=model.layers[-2].output)
# NOTE(review): model_new is built from model's own layers, so it presumably already shares
# their weights; this call would also fail if the dropped layer owns weights — confirm it is needed.
model_new.set_weights(model.get_weights())
optimizer = keras.optimizers.Adam(learning_rate = 0.01)
# NOTE(review): if the dropped layer was the softmax activation, model_new outputs logits and
# this loss would presumably need from_logits=True — confirm before training with it.
loss_fn = keras.losses.SparseCategoricalCrossentropy()
acc_metric = keras.metrics.SparseCategoricalAccuracy()
model_new.compile(optimizer=optimizer, loss=loss_fn, metrics=[acc_metric])
model_new.summary()
# Save the truncated model in HDF5 format.
model_new.save('model_bestepoch_cut_nd.h5')

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32, 40, 1)]       0         
                                                                 
 conv2d (Conv2D)             (None, 32, 40, 32)        320       
                                                                 
 activation (Activation)     (None, 32, 40, 32)        0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 20, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 20, 64)        18496     
                                                                 
 activation_1 (Activation)   (None, 16, 20, 64)        0         
                                                             

In [None]:
# Disabled code: the ALSA playback helper below is wrapped in a string literal, so it is never defined or run.
'''
def play_audio_alsa(CHANNELS=1, NUM_SAMPLES_TO_CONSIDER=16000, PERIODSIZE=2000, device='default'):    
    with wave.open('cacheSound.wav', 'rb') as sf:
        total_sample = sf.getnframes()
        sample_rate = sf.getframerate()
        playback = alsaaudio.PCM(alsaaudio.PCM_PLAYBACK, alsaaudio.PCM_NORMAL, channels=CHANNELS, rate=NUM_SAMPLES_TO_CONSIDER, format=alsaaudio.PCM_FORMAT_S16_LE, periodsize=PERIODSIZE, device=device)
        data = sf.readframes(PERIODSIZE)
        for i in range(0, int(round(total_sample/PERIODSIZE))):
            if data and i < int(round(total_sample/PERIODSIZE - 1)):
                playback.write(data)
                data = sf.readframes(PERIODSIZE)
                time.sleep(0.001)
            else:
                print("OUT OF DATA")
                break
        sf.close()
'''