In [2]:
import librosa
import matplotlib.pyplot as plt
import scipy.io.wavfile
import numpy as np
import math
from scipy.fftpack import dct
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

In [3]:
#all the audio samples are single-channel(mono)
def load_audio(path):
    """Read a WAV file and keep at most its first 3.5 seconds.

    Parameters
    ----------
    path : str
        Location of the WAV file, read with ``scipy.io.wavfile.read``.

    Returns
    -------
    tuple
        ``(n_samples, clip)``. Note that the first element is the number of
        elements in the returned clip (``clip.size``), NOT the sampling rate
        of the file, even though callers store it in a variable named
        ``sampleRate``.
    """
    native_rate, samples = scipy.io.wavfile.read(path)
    # The file's real sampling rate is only used to find the 3.5 s cut-off.
    cutoff = int(3.5 * native_rate)
    clip = samples[:cutoff]
    return clip.size, clip

In [4]:
def plot_audio_init(audio):
    """Show the raw audio signal as a line plot (value against sample index)."""
    fig, ax = plt.subplots(figsize=(12,5))
    ax.plot(audio)
    ax.set_xlabel("Time")
    ax.set_ylabel("Amplitude")
    ax.set_title("Audio Signal")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [5]:
def vad(sampleRate,audio,path):
    """Crop ``audio`` to a 6000-sample window starting at the first detected speech segment.

    Parameters
    ----------
    sampleRate : int
        Upper bound for the end of the crop window. Callers in this notebook
        pass the number of samples in ``audio`` here (not a rate in Hz).
    audio : numpy.ndarray
        Samples to crop.
    path : str
        Audio file that is re-read with ``read_audio`` for speech detection.

    Returns
    -------
    tuple
        ``(audio.size, cropped_audio)`` on success, or ``(-1, audio)`` with the
        input unchanged when no speech is detected or when the 6000-sample
        window would extend past ``sampleRate``.
    """
    # Load the Silero model once and reuse it across calls instead of
    # reloading it for every file.
    model = getattr(vad, "_model", None)
    if model is None:
        model = load_silero_vad()
        vad._model = model

    wav = read_audio(path)
    speech_timestamps = get_speech_timestamps(wav,model)
    if not speech_timestamps:
        # No speech segment found.
        return -1,audio

    # NOTE(review): read_audio resamples to 16 kHz by default and the returned
    # timestamps are sample indices at that rate, while `audio` is indexed at
    # the file's native rate. If the recordings are not 16 kHz the crop below
    # is misaligned -- confirm against the dataset.
    start = speech_timestamps[0]['start']
    end = start + 6000
    if end > sampleRate:
        # Not enough samples after the start of speech for a full window.
        return -1,audio

    audio = audio[start:end]
    return audio.size,audio

In [6]:
#Time vs Amplitude Graph of wav file
def plot_audio_vad(audio):
    """Show the audio signal that remains after voice-activity cropping."""
    fig, ax = plt.subplots(figsize=(12,5))
    ax.plot(audio)
    ax.set_xlabel("Time")
    ax.set_ylabel("Amplitude")
    ax.set_title("Audio Signal after VAD")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [7]:
# # Containing and isolating only speech by removing silence or hums.
# discard_threshold=max(audio)*0.15
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
# ax1.plot(audio)
# boundLeft=0
# for i in range(0,sampleRate):
#     if(audio[i]>=discard_threshold):
#         if(i>1000):
#             boundLeft=i-1000
#         else:
#             boundLeft=i
#         break
# boundRight=0
# for i in range(sampleRate-1,-1,-1):
#     if(audio[i]>=discard_threshold):
#         if(i<15000):
#             boundRight=i+1000
#         else:
#             boundRight=i
#         break

# audio=audio[boundLeft:boundRight]
# sampleRate=audio.size
# plt.plot(audio)

In [8]:
# To generate MFCC, we follow the following steps:
# audioInput -> pre-emphasis -> framing -> windowing -> fourier transform -> Inverse Mel Scale Filter Bank -> Log() -> DCT ->
# Derivatives -> Feature Vector

In [9]:
# Pre-emphasis layer
# Amplifies higher frequencies in order to balance the spectrum (higher frequencies have lower energies)
def pre_emphasize(sampleRate,audio):
    """Apply the pre-emphasis filter ``y[i] = x[i] - 0.97 * x[i-1]``.

    Parameters
    ----------
    sampleRate : int
        Number of leading samples of ``audio`` to process (callers in this
        notebook pass the signal length here).
    audio : array-like
        Input samples.

    Returns
    -------
    numpy.ndarray
        float64 array of length ``sampleRate - 1``; the first input sample has
        no predecessor and therefore produces no output value. Empty when
        fewer than two samples are requested.
    """
    pre_emphasis = 0.97
    n_samples = int(sampleRate)
    if n_samples < 2:
        return np.empty(0, dtype=np.float64)

    # Vectorised form of the per-sample loop: building the result with
    # np.append one element at a time copies the array on every step (O(n^2)).
    samples = np.asarray(audio[:n_samples], dtype=np.float64)
    return samples[1:] - pre_emphasis * samples[:-1]

def plot_audio_pre_emphasis(audio_preemphasized):
    """Show the pre-emphasized signal as a line plot."""
    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(audio_preemphasized)
    ax.set_title('Pre-emphasized Signal')
    ax.set_xlabel('Time')
    ax.set_ylabel('Amplitude')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [10]:
# Framing Layer
# Since the audio wave is more than a second, framing is necessary in order to fully capture the features and allow for correct
# calculations to be performed. Thus, for ease of calculations, we slice the wave.
# The signal/wave is separated into sections or frames of 25-30 milliseconds.
# Since some parts of the signal always fall at the ends of a frame, and the Hamming window attenuates the frame edges, this may result in data loss.
# To tackle this, we shift the frames with a stride of ~10ms, so consecutive frames overlap. This ensures that every part of the signal gets to be near the center of some frame.

def frame_audio(sampleRate,audio_preemphasized):
    """Cut the pre-emphasized signal into overlapping fixed-size frames.

    Frames start at 0, 220, 440, ... for every start strictly below
    ``sampleRate - 650``; ``sampleRate`` therefore acts as the signal-length
    bound, and the number of frames depends on it.

    Returns
    -------
    tuple
        ``(frame_size, audio_frames)`` where ``audio_frames`` is a list of
        slices of ``audio_preemphasized``.
    """
    frame_size = 650    # ~30 millisecond frame
    shift_stride = 220  # ~10 millisecond of stride

    frame_starts = range(0, sampleRate - frame_size, shift_stride)
    audio_frames = [
        audio_preemphasized[begin:begin + frame_size] for begin in frame_starts
    ]
    return frame_size, audio_frames

def plot_audio_frame(audio_frames):
    """Show the third frame (index 2) of the framed signal."""
    fig, ax = plt.subplots(figsize=(12,4))
    ax.plot(audio_frames[2])
    ax.set_title('Framed Signal')
    ax.set_xlabel('Time')
    ax.set_ylabel('Amplitude')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [11]:
# Windowing Layer
# Since sudden increase/decrease of amplitude at the edges of the frames create noisy outcomes, we have to smoothen it.
# Thus, we apply hamming window

def window_audio(sampleRate,frame_size,audio_frames):
    """Multiply every frame by a Hamming window of length ``frame_size``.

    The window is ``w[i] = 0.54 - 0.46 * cos(2*pi*i / (frame_size - 1))``.

    Parameters
    ----------
    sampleRate : int
        Unused; kept so existing callers keep working.
    frame_size : int
        Number of samples taken from each frame.
    audio_frames : iterable of sequences
        Frames to window; each must hold at least ``frame_size`` samples.

    Returns
    -------
    list of list
        One windowed frame (list of ``frame_size`` values) per input frame.
    """
    # The window depends only on frame_size, so compute it once instead of
    # re-evaluating the cosine for every sample of every frame.
    if frame_size > 1:
        window = [
            0.54 - 0.46 * math.cos(2 * math.pi * i / (frame_size - 1))
            for i in range(frame_size)
        ]
    else:
        # A one-point window is defined as 1.0; this avoids dividing by zero.
        window = [1.0] * max(frame_size, 0)

    return [
        [frame[i] * window[i] for i in range(frame_size)]
        for frame in audio_frames
    ]

def plot_audio_hammed(hammed_audio):
    """Show the third windowed frame (index 2)."""
    fig, ax = plt.subplots(figsize=(12,4))
    ax.plot(hammed_audio[2])
    ax.set_title('Windowed Signal')
    ax.set_xlabel('Time')
    ax.set_ylabel('Amplitude')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [12]:
# FFT(Fast Fourier Transform) Layer
# Used to convert time-domain signal to frequency-domain to analyze frequency components of speech.
# Output of FFT gives complex frequency spectrum (both magnitude and phase)
# Since we only need magnitude, we evaluate the power spectrum from the output of FFT
# NFFT specifies the number of points for the FFT. For real input (np.fft.rfft) the output has NFFT/2 + 1 points

def pow_spec(hammed_audio):
    """Compute the power spectrum of every windowed frame.

    Each frame is transformed with a 2048-point real FFT along its last axis
    and converted to power as ``|X|^2 / NFFT``.

    Returns
    -------
    tuple
        ``(NFFT, power_spectrum)``.
    """
    NFFT = 2048
    spectrum = np.fft.rfft(hammed_audio, n=NFFT)
    magnitude = np.abs(spectrum)
    power_spectrum = (1 / NFFT) * np.square(magnitude)
    return NFFT, power_spectrum

def plot_power_spectrum(power_spectrum):
    """Show the power spectrum of the third frame (index 2).

    NOTE(review): the values are plotted against their array index, so the
    x axis is an FFT-bin index rather than Hz; the y-axis label mentions dB --
    confirm whether the plotted values are actually in dB.
    """
    fig, ax = plt.subplots(figsize=(12,4))
    ax.plot(power_spectrum[2])
    ax.set_title("Power Spectral Density")
    ax.set_xlabel("Frequency (Hz)")
    ax.set_ylabel("Power/Frequency (dB/Hz)")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [13]:
# Mel-filter banks
# The mel scale relates the human-perceived frequency (pitch) of a tone to its actual frequency. Since humans do not hear sound linearly,
# i.e., equal gaps in frequency do not amount to equal changes in perceived pitch, we use the mel scale.
# The mel scale is a logarithmic scale that imitates human hearing. Thus, it enables us to capture features as a human would hear them.

# Computing the Mel-Filter bank
# 1. Decide upper and lower frequencies in Hertz (SampleRate/2 and 300Hz respectively)
# 2. Convert them to mels.
# 3. Compute n_filters + 2 linearly-spaced mel points (42 for 40 filters), inclusive of the lower and upper mels.
# 4. Convert these points back to Hertz.
# 5. Round the frequencies to their nearest FFT Bins.
# 6. Create Filterbanks

def mels(sampleRate,NFFT):
    """Compute the FFT-bin positions of the mel filter edges and centres.

    ``n_filters + 2`` points are spaced evenly on the mel scale between
    300 and ``sampleRate / 2``, converted back to Hz, and mapped to bins with
    ``floor((NFFT + 1) * hz / sampleRate)``.

    Returns
    -------
    tuple
        ``(n_filters, freq_bin)`` with ``n_filters == 40`` and ``freq_bin`` a
        list of ``n_filters + 2`` integers.
    """
    n_filters = 40
    low_hz = 300
    high_hz = sampleRate / 2

    def hz_to_mel(freq):
        return 1125 * math.log(1 + freq / 700)

    def mel_to_hz(mel):
        return 700 * (math.exp(mel / 1125) - 1)

    mel_points = np.linspace(hz_to_mel(low_hz), hz_to_mel(high_hz), n_filters + 2)

    freq_bin = []
    for mel in mel_points:
        freq_bin.append(math.floor((NFFT + 1) * mel_to_hz(mel) / sampleRate))
    return n_filters, freq_bin


In [14]:
# computing the filterbanks

def mel_filterbanks(NFFT,n_filters,freq_bin,power_spectrum):
    """Apply triangular mel filters to a power spectrum and take the log.

    Filter ``m`` (0-based) is a triangle over FFT bins: it rises linearly from
    0 at ``freq_bin[m]`` to 1 at ``freq_bin[m+1]`` and falls back to 0 at
    ``freq_bin[m+2]``.

    Parameters
    ----------
    NFFT : int
        FFT size; the filters span ``NFFT // 2 + 1`` bins.
    n_filters : int
        Number of triangular filters.
    freq_bin : sequence of int
        ``n_filters + 2`` bin indices (edges and centres of the triangles).
    power_spectrum : array-like
        Power values whose last axis has ``NFFT // 2 + 1`` entries.

    Returns
    -------
    numpy.ndarray
        ``log(power_spectrum . filters^T + 1e-8)``, with exact zeros replaced
        by machine epsilon before the log.
    """
    n_bins = NFFT // 2 + 1
    bins = np.arange(n_bins)
    weights = np.zeros((n_filters, n_bins))

    for m in range(n_filters):
        left, center, right = freq_bin[m], freq_bin[m + 1], freq_bin[m + 2]

        # Rising edge. The previous implementation zeroed every bin below the
        # centre, which removed this half of each triangle.
        rising = (bins >= left) & (bins < center)
        if center > left:
            weights[m, rising] = (bins[rising] - left) / (center - left)

        # Peak and falling edge. When two edges share a bin the slope is
        # undefined, so only the peak is set (avoids a division by zero).
        falling = (bins >= center) & (bins <= right)
        if right > center:
            weights[m, falling] = (right - bins[falling]) / (right - center)
        else:
            weights[m, falling] = 1.0

    filter_banks = np.dot(power_spectrum, weights.T)
    # Guard the logarithm against exact zeros.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    filter_banks = np.log(filter_banks + 1e-8)

    return filter_banks

def plot_mel_spectogram(sampleRate,filter_banks):
    """Display the log mel filterbank energies as a spectrogram image.

    ``filter_banks`` is transposed before plotting, so its first axis runs
    along the horizontal (time) axis of the image.

    NOTE(review): ``librosa.display`` is reached through the plain
    ``import librosa``; some librosa versions may require an explicit
    ``import librosa.display`` -- confirm for the installed version.
    NOTE(review): ``sr`` should be the true sampling rate in Hz for the time
    axis to be meaningful -- confirm what callers pass as ``sampleRate``.
    """
    plt.figure(figsize=(12, 4))
    librosa.display.specshow(filter_banks.T, sr=sampleRate, x_axis='time', y_axis='mel',cmap='turbo')
    # Uses pyplot's current image (the one just drawn by specshow).
    plt.colorbar()
    plt.tight_layout()
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

In [15]:
# Generate MFCCs
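# We apply a DCT to the log filterbank energies, which yields one cepstral coefficient per mel filter (40 with the filterbank above).
# Only the low-order coefficients are needed for ASR purposes: we keep coefficients 2-13 (12 values, dropping the first). The rest are discarded.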

def gen_mfcc(filter_banks):
    """Turn log filterbank energies into MFCCs.

    A type-2 DCT is taken along axis 1 and columns 1..12 of the result are
    returned (the first coefficient is dropped).
    """
    cepstra = dct(filter_banks, type=2, axis=1)
    return cepstra[:, 1:13]  # Keep 2-13

def plot_mfcc(sampleRate,mfcc,save_path="C:/Users/svija/Downloads/testmfcc.png"):
    """Display the MFCC matrix as an image and optionally save it to disk.

    Parameters
    ----------
    sampleRate : int
        Forwarded to ``librosa.display.specshow`` as ``sr``.
    mfcc : numpy.ndarray
        MFCC matrix; it is transposed before plotting.
    save_path : str or None, optional
        Where to write the PNG. Defaults to the path that was previously
        hard-coded so existing calls behave the same; pass another path to
        save elsewhere, or ``None`` to skip saving.
    """
    plt.figure(figsize=(12, 4))
    # Colour range is fixed to [-100, 100] so images are comparable.
    librosa.display.specshow(mfcc.T, sr=sampleRate, x_axis='time', y_axis='mel',cmap='turbo',vmin=-100,vmax=100)
    plt.colorbar()
    # Replace the axis label that y_axis='mel' would otherwise produce.
    plt.ylabel("mfcc coeff")
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

In [16]:
def get_mfcc(path):
    """Run the full pipeline on one audio file and return its MFCCs.

    Steps: load -> VAD crop -> pre-emphasis -> framing -> windowing ->
    power spectrum -> mel filterbank -> DCT.

    Returns
    -------
    tuple
        ``(0, mfcc)`` on success, or ``(1, 1)`` when ``vad`` reports failure
        by returning ``-1``.
    """
    n_samples, audio = load_audio(path)
    n_samples, audio = vad(n_samples, audio, path)
    if n_samples == -1:
        # Failure flag; the second element is a placeholder.
        return 1, 1

    emphasized = pre_emphasize(n_samples, audio)
    frame_size, frames = frame_audio(n_samples, emphasized)
    windowed = window_audio(n_samples, frame_size, frames)
    NFFT, power_spectrum = pow_spec(windowed)
    n_filters, freq_bin = mels(n_samples, NFFT)
    log_energies = mel_filterbanks(NFFT, n_filters, freq_bin, power_spectrum)
    return 0, gen_mfcc(log_energies)

def get_mel_filterbanks(path):
    """Run the pipeline on one audio file up to the log mel filterbank energies.

    Steps: load -> VAD crop -> pre-emphasis -> framing -> windowing ->
    power spectrum -> mel filterbank.

    Returns
    -------
    numpy.ndarray
        Log mel filterbank energies as returned by ``mel_filterbanks``.

    Raises
    ------
    ValueError
        If ``vad`` reports failure (returns ``-1``). Previously the ``-1``
        sentinel was fed into the remaining stages as if it were a length,
        silently producing meaningless output.
    """
    sampleRate,audio=load_audio(path)
    sampleRate,audio=vad(sampleRate,audio,path)
    if sampleRate==-1:
        raise ValueError(f"no usable speech segment found in {path!r}")
    audio_preemphasized=pre_emphasize(sampleRate,audio)
    frame_size,audio_frames=frame_audio(sampleRate,audio_preemphasized)
    hammed_audio=window_audio(sampleRate,frame_size,audio_frames)
    NFFT,power_spectrum=pow_spec(hammed_audio)
    n_filters,freq_bin=mels(sampleRate,NFFT)
    filter_banks=mel_filterbanks(NFFT,n_filters,freq_bin,power_spectrum)
    return filter_banks

In [17]:
import os

In [18]:
def prepare_training_data(input_path,output_path):
    """Render one MFCC image per audio file found in ``input_path``.

    Parameters
    ----------
    input_path : str
        Directory containing the audio files.
    output_path : str
        Directory for the generated images; created if missing. Each image is
        named ``<audio file name>.png``.

    Files for which ``get_mfcc`` returns the failure flag are skipped.
    """
    os.makedirs(output_path, exist_ok=True)
    for file in os.listdir(input_path):
        # os.path.join works whether or not the directory strings end with a
        # separator; plain concatenation required a trailing slash.
        source = os.path.join(input_path, file)
        if not os.path.isfile(source):
            # Ignore sub-directories.
            continue

        flag,mfcc=get_mfcc(source)
        if flag==1:
            continue

        plt.figure(figsize=(12, 4))
        librosa.display.specshow(mfcc.T, x_axis='time', y_axis='mel',cmap='turbo',vmin=-100,vmax=100)
        plt.colorbar()
        plt.ylabel("mfcc coeff")
        plt.tight_layout()
        plt.savefig(os.path.join(output_path, file+".png"))
        # Close each figure so memory does not grow with the number of files.
        plt.close()

In [None]:
# Generate MFCC images for the recordings of the word "dog".
# NOTE(review): absolute, machine-specific paths -- they must be edited before
# running this notebook on another machine.
input_path="C:/Main/Dev/Github/Py-STT-Engine-venv/Py-STT-Engine/audioData/Animals/dog/"
output_path="C:/Main/Dev/Github/Py-STT-Engine-venv/Py-STT-Engine/mfccs_all/dog/"

# Writes "<file name>.png" into output_path for every file get_mfcc does not flag as failed.
prepare_training_data(input_path,output_path)

  list_backends = torchaudio.list_audio_backends()
  wav, sr = torchaudio.sox_effects.apply_effects_file(path, effects=effects)


In [20]:
# Generate MFCC images for the recordings of the word "cat".
# NOTE(review): absolute, machine-specific paths -- they must be edited before
# running this notebook on another machine.
input_path="C:/Main/Dev/Github/Py-STT-Engine-venv/Py-STT-Engine/audioData/Animals/cat/"
output_path="C:/Main/Dev/Github/Py-STT-Engine-venv/Py-STT-Engine/mfccs_all/cat/"

# Writes "<file name>.png" into output_path for every file get_mfcc does not flag as failed.
prepare_training_data(input_path,output_path)

  list_backends = torchaudio.list_audio_backends()
  wav, sr = torchaudio.sox_effects.apply_effects_file(path, effects=effects)


In [21]:
# Generate MFCC images for the recordings of the word "bird".
# NOTE(review): absolute, machine-specific paths -- they must be edited before
# running this notebook on another machine.
input_path="C:/Main/Dev/Github/Py-STT-Engine-venv/Py-STT-Engine/audioData/Animals/bird/"
output_path="C:/Main/Dev/Github/Py-STT-Engine-venv/Py-STT-Engine/mfccs_all/bird/"

# Writes "<file name>.png" into output_path for every file get_mfcc does not flag as failed.
prepare_training_data(input_path,output_path)

In [22]:
import cv2

In [None]:
# Load every MFCC image of the word "bird" into the list `images_bird`.
# NOTE(review): this is a relative path, while the images were written to an
# absolute directory -- confirm the notebook's working directory makes them match.
image_bird_path="mfccs_all/bird/"
images_bird=[]

# os.listdir returns entries in arbitrary order, so the list order is not guaranteed.
image_names=os.listdir(image_bird_path)
for im in image_names:
    # NOTE(review): cv2.imread returns None (no exception) for files it cannot
    # decode -- confirm downstream code tolerates None entries.
    image=cv2.imread(image_bird_path+im)
    images_bird.append(image)

In [None]:
# Load every MFCC image of the word "cat" into the list `images_cat`.
# NOTE(review): this is a relative path, while the images were written to an
# absolute directory -- confirm the notebook's working directory makes them match.
image_cat_path="mfccs_all/cat/"
images_cat=[]

# os.listdir returns entries in arbitrary order, so the list order is not guaranteed.
image_names=os.listdir(image_cat_path)
for im in image_names:
    # NOTE(review): cv2.imread returns None (no exception) for files it cannot
    # decode -- confirm downstream code tolerates None entries.
    image=cv2.imread(image_cat_path+im)
    images_cat.append(image)

In [None]:
# Load every MFCC image of the word "dog" into the list `images_dog`.
# NOTE(review): this is a relative path, while the images were written to an
# absolute directory -- confirm the notebook's working directory makes them match.
image_dog_path="mfccs_all/dog/"
images_dog=[]

# os.listdir returns entries in arbitrary order, so the list order is not guaranteed.
image_names=os.listdir(image_dog_path)
for im in image_names:
    # NOTE(review): cv2.imread returns None (no exception) for files it cannot
    # decode -- confirm downstream code tolerates None entries.
    image=cv2.imread(image_dog_path+im)
    images_dog.append(image)