## 1. MFCC

In [3]:
import warnings

import librosa
import numpy as np
import scipy.signal

from inaSpeechSegmenter.sidekit_mfcc import read_wav
from numba import njit


warnings.filterwarnings('ignore', message='divide by zero encountered in log', category=RuntimeWarning)

sig, read_framerate, sample_width = read_wav('VT 150hz baseline example.converted.wav')

### 1.1 Pre-emphasis

In [59]:
from numba import float32


@njit()
def pre_emphasis(input_sig: float32[:], pre: float32) -> float32[:]:
    """Pre-emphasis of an audio signal.
    :param input_sig: the input vector of signal to pre emphasize
    :param pre: value that defines the pre-emphasis filter.
    """
    output_sig = input_sig.copy()
    for i in range(len(input_sig) - 1):
        output_sig[i + 1] -= input_sig[i] * pre

    return output_sig

In [62]:
%time _ = pre_emphasis(sig, 0.97)

CPU times: user 1.69 ms, sys: 363 µs, total: 2.05 ms
Wall time: 2.82 ms


In [65]:
from inaSpeechSegmenter import sidekit_mfcc


_, loge, _, mspec = sidekit_mfcc.mfcc(sig.astype(np.float32), get_mspec=True)

mspec

array([[-27.783958, -28.875742, -27.59585 , ..., -22.183365, -21.926704,
        -21.66598 ],
       [-20.239338, -21.617071, -19.956205, ..., -15.973019, -15.646971,
        -15.546377],
       [-17.183352, -17.388645, -18.993906, ..., -14.162017, -14.242383,
        -14.167353],
       ...,
       [-21.263563, -21.953535, -22.782087, ..., -17.215765, -16.623173,
        -17.11898 ],
       [-34.86673 , -34.941814, -34.066074, ..., -27.91287 , -27.821241,
        -27.738558],
       [      -inf,       -inf,       -inf, ...,       -inf,       -inf,
              -inf]], dtype=float32)

In [66]:
%time _ = sidekit_mfcc.mfcc(sig.astype(np.float32), get_mspec=True)

CPU times: user 208 ms, sys: 296 ms, total: 504 ms
Wall time: 113 ms


In [52]:
import tensorflow as tf
import tensorflow_io as tfio

sr = 16000
window_length = 0.025 * sr
step = 0.01 * sr
prefac = 0.97

# spec = tf.math.abs(tf.signal.stft(pre_emphasis(sig, 0.97), frame_length=int(window_length), frame_step=int(step)))
spec = tf.math.abs(tf.signal.stft(sig, frame_length=int(window_length), frame_step=int(step)))
spec.numpy() ** 2

array([[2.5048321e-11, 6.1607830e-11, 4.2541376e-10, ..., 3.0197691e-09,
        1.7688846e-09, 8.7659896e-10],
       [5.5596672e-07, 1.0201875e-06, 2.9036614e-06, ..., 2.7829384e-07,
        1.2000672e-07, 7.1218672e-08],
       [7.8948251e-05, 5.5227672e-05, 4.6898822e-05, ..., 1.4559896e-07,
        7.6278617e-08, 4.9978253e-09],
       ...,
       [2.0598622e-05, 1.9185214e-05, 1.5901693e-05, ..., 4.4386192e-07,
        4.0143576e-07, 3.8744446e-07],
       [1.1237545e-11, 1.1085248e-11, 1.0639377e-11, ..., 3.3479929e-12,
        2.9027568e-12, 2.7506764e-12],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]], dtype=float32)

In [83]:
%time _ = tf.math.abs(tf.signal.stft(sig, frame_length=int(window_length), frame_step=int(step)))

CPU times: user 13.8 ms, sys: 6.85 ms, total: 20.6 ms
Wall time: 17.6 ms


In [23]:
from numba import jit
from inaSpeechSegmenter.sidekit_mfcc import framing


def power_spectrum(input_sig, fs=8000, win_time=0.025, shift=0.01, prefac=0.97):
    """
    Compute the power spectrum of the signal.
    """
    window_length = int(round(win_time * fs))
    overlap = window_length - int(shift * fs)
    framed = framing(input_sig, window_length, win_shift=window_length - overlap).copy()
    # Pre-emphasis filtering is applied after framing to be consistent with stream processing
    # framed = pre_emphasis(framed, prefac)
    l = framed.shape[0]
    n_fft = 2 ** int(np.ceil(np.log2(window_length)))
    # Windowing has been changed to hanning which is supposed to have less noisy sidelobes
    # ham = numpy.hamming(window_length)
    window = np.hanning(window_length)

    spec = np.ones((l, int(n_fft / 2) + 1), dtype=np.float32)
    log_energy = np.log((framed ** 2).sum(axis=1))
    dec = 500000
    start = 0
    stop = min(dec, l)
    while start < l:
        ahan = framed[start:stop, :] * window
        mag = np.abs(np.fft.rfft(ahan, n_fft, axis=-1))
        spec[start:stop, :] = mag
        start = stop
        stop = min(stop + dec, l)

    return spec ** 2, log_energy


In [29]:
# spec, loge = power_spectrum(pre_emphasis(sig, 0.97), fs=16000)
spec, loge = power_spectrum(pre_emphasis(sig, 0.97), fs=16000)
spec
# np.linalg.norm(spec) - np.linalg.norm(spec2.numpy() ** 2)

array([[5.77967680e-15, 8.01227167e-14, 8.17671018e-13, ...,
        1.12401928e-08, 6.41681419e-09, 3.07041814e-09],
       [8.15631007e-10, 1.69756842e-09, 7.47028572e-09, ...,
        1.07176049e-06, 4.69670454e-07, 2.78265759e-07],
       [1.17427064e-07, 1.01974564e-07, 1.12832915e-07, ...,
        5.63141839e-07, 2.89370234e-07, 1.70949672e-08],
       ...,
       [1.03543862e-09, 2.73064660e-09, 7.21970306e-09, ...,
        1.76567232e-06, 1.59864840e-06, 1.54331894e-06],
       [1.94653186e-14, 1.98259276e-14, 2.07852398e-14, ...,
        1.34547521e-11, 1.15527631e-11, 1.09030892e-11],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], dtype=float32)

In [71]:
np.abs(librosa.stft(sig, n_fft=512, hop_length=int(step), win_length=int(window_length), center=False).T)

array([[9.21060055e-05, 1.06944717e-04, 1.05446168e-04, ...,
        1.37174517e-04, 1.80683186e-04, 1.81996787e-04],
       [1.75047235e-03, 1.30912091e-03, 1.46864564e-03, ...,
        4.91781509e-04, 2.97301536e-04, 1.02613179e-04],
       [1.10664070e-02, 1.15234870e-02, 1.16806505e-02, ...,
        3.03659879e-04, 4.23202815e-04, 3.33288976e-04],
       ...,
       [1.62179377e-02, 1.48036405e-02, 1.19423326e-02, ...,
        1.27195485e-03, 1.51321420e-03, 1.69277168e-03],
       [6.20754436e-04, 5.95139980e-04, 5.46903640e-04, ...,
        1.39128053e-04, 1.13545604e-04, 1.05756699e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], dtype=float32)

In [79]:
import scipy

_, _, spec3 = scipy.signal.stft(sig, fs=sr, nfft=512, padded=False, nperseg=window_length, noverlap=step)
np.abs(spec3.T)

array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [3.5942190e-07, 4.0613733e-07, 2.9541232e-07, ..., 4.9127698e-07,
        6.5347223e-07, 6.4656780e-07],
       [2.6849399e-05, 1.8837529e-05, 1.7056307e-05, ..., 1.9736924e-06,
        1.0564380e-06, 1.1808006e-07],
       ...,
       [5.7505876e-08, 6.1045306e-08, 6.9954169e-08, ..., 4.2406750e-08,
        2.8164308e-08, 2.0811203e-08],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]], dtype=float32)

In [18]:
np.array_equal(pre_emphasis(sig, 0.97), (sig[1:] - 0.97 * sig[:-1]))



KeyboardInterrupt: 

In [15]:
(sig[1:] - 0.97 * sig[:-1])


array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [63]:


_, loge, _, mspec = sidekit_mfcc.mfcc(sig.astype(np.float32), get_mspec=True)
mspec


NameError: name 'sidekit_mfcc' is not defined