# preprocessing/dsp.py

import sys

import numpy as np
from numpy.lib.stride_tricks import as_strided

sys.path.append("..")
from utils.windows import WindowInitializer

#######################################################################
#                          Signal Resampling                          #
#######################################################################


def batch_resample(X, new_dim, mode="bilinear"):
    """
    Resample each image (or similar grid-based 2D signal) in a batch to
    `new_dim` using the specified resampling strategy.

    Parameters
    ----------
    X : numpy array of shape (n_ex, in_rows, in_cols, in_channels)
        An input image volume
    new_dim : 2-tuple of (out_rows, out_cols)
        The dimension to resample each image to
    mode : str
        The resampling strategy to employ. Valid entries are {'bilinear',
        'neighbor'}

    Returns
    -------
    resampled : numpy array of shape (n_ex, out_rows, out_cols, in_channels)
        The resampled image volume

    Raises
    ------
    NotImplementedError
        If `mode` is not one of the recognized resampling strategies.
    """
    if mode == "bilinear":
        interpolate = bilinear_interpolate
    elif mode == "neighbor":
        interpolate = nn_interpolate_2D
    else:
        raise NotImplementedError("Unrecognized resampling mode: {}".format(mode))

    out_rows, out_cols = new_dim
    n_ex, in_rows, in_cols, n_in = X.shape

    # Coordinates to resample. The grid ends at index `in - 2`, so the last
    # input row / column is never sampled directly (presumably to keep the
    # `+ 1` neighbor used by bilinear interpolation in bounds -- confirm).
    # The end point is floored at 0 so 1-pixel-wide inputs do not produce
    # negative coordinates.
    x = np.tile(np.linspace(0, max(in_cols - 2, 0), out_cols), out_rows)
    y = np.repeat(np.linspace(0, max(in_rows - 2, 0), out_rows), out_cols)

    if n_ex == 0:
        # nothing to stack; return an empty volume with the documented shape
        return np.zeros((0, out_rows, out_cols, n_in))

    # resample each image independently
    resampled = [
        interpolate(X[i], x, y).reshape(out_rows, out_cols, n_in)
        for i in range(n_ex)
    ]

    # stack along a NEW leading axis so the batch dimension is preserved
    # (np.dstack would instead concatenate the images along the channel axis)
    return np.stack(resampled, axis=0)


def nn_interpolate_2D(X, x, y):
    """
    Look up the pixel values nearest to the coordinates (x, y) in X, i.e.,
    nearest neighbor interpolation on a 2D integer grid.

    Each coordinate is rounded to the closest integer index and then clamped
    to the valid index range of the image, so out-of-range queries resolve to
    the nearest border pixel.

    Parameters
    ----------
    X : numpy array of shape (in_rows, in_cols, in_channels)
        An input image sampled along a grid of `in_rows` by `in_cols`.
    x : list of length k
        The x-coordinates (column positions) of the samples to generate
    y : list of length k
        The y-coordinates (row positions) of the samples to generate

    Returns
    -------
    samples : numpy array of shape (k, in_channels)
        The nearest-neighbor sample for each (x, y) coordinate
    """
    n_rows, n_cols = X.shape[0], X.shape[1]

    # round to the closest grid index, then clamp into the image bounds
    col_idx = np.clip(np.around(x), 0, n_cols - 1).astype(int)
    row_idx = np.clip(np.around(y), 0, n_rows - 1).astype(int)

    return X[row_idx, col_idx, :]


def nn_interpolate_1D(X, t):
    """
    Look up the signal values nearest to the positions `t` in X, i.e.,
    nearest neighbor interpolation along the first axis.

    Each position is rounded to the closest integer index and clamped to
    [0, in_length - 1] before indexing.

    Parameters
    ----------
    X : numpy array of shape (in_length, in_channels)
        An input signal sampled at the integer positions 0, ..., in_length - 1
    t : list of length k
        The positions of the samples to generate

    Returns
    -------
    samples : numpy array of shape (k, in_channels)
        The nearest-neighbor sample for each position in `t`
    """
    last = X.shape[0] - 1
    nearest = np.around(t)
    idx = np.clip(nearest, 0, last).astype(int)
    return X[idx, :]


def bilinear_interpolate(X, x, y):
    """
    Estimates of the pixel values at the coordinates (x, y) in X via bilinear
    interpolation. Assumes the current entries in X reflect equally-spaced
    samples from a 2D integer grid.

    Coordinates outside the image are clamped to the nearest border, so a
    query on (or beyond) the last row / column returns the border pixel value
    rather than a zero-weighted sum.

    Modified from https://bit.ly/2NMb1Dr

    Parameters
    ----------
    X : numpy array of shape (in_rows, in_cols, in_channels)
        An input image sampled along a grid of `in_rows` by `in_cols`.
    x : list of length k
        A list of x-coordinates for the samples we wish to generate
    y : list of length k
        A list of y-coordinates for the samples we wish to generate

    Returns
    -------
    samples : numpy array of shape (k, in_channels)
        The samples for each (x,y) coordinate computed via bilinear
        interpolation
    """
    n_rows, n_cols = X.shape[0], X.shape[1]

    # clamp the query points into the image before doing anything else
    x = np.clip(np.asarray(x, dtype=float), 0, n_cols - 1)
    y = np.clip(np.asarray(y, dtype=float), 0, n_rows - 1)

    # top-left neighbor of every query point
    x0 = np.floor(x).astype(int)
    y0 = np.floor(y).astype(int)

    # Fractional offsets from the top-left neighbor. These are taken BEFORE
    # the far neighbors are clipped: deriving the weights from clipped
    # indices makes all four weights vanish on the last row / column.
    fx = x - x0
    fy = y - y0

    # bottom-right neighbor, kept inside the image; whenever clipping kicks in
    # the matching fractional offset is 0, so the duplicate pixel gets no
    # weight
    x1 = np.minimum(x0 + 1, n_cols - 1)
    y1 = np.minimum(y0 + 1, n_rows - 1)

    # per-sample weights with a trailing axis to broadcast over channels
    wa = ((1 - fx) * (1 - fy))[..., np.newaxis]  # (y0, x0)
    wb = ((1 - fx) * fy)[..., np.newaxis]  # (y1, x0)
    wc = (fx * (1 - fy))[..., np.newaxis]  # (y0, x1)
    wd = (fx * fy)[..., np.newaxis]  # (y1, x1)

    return (
        X[y0, x0, :] * wa
        + X[y1, x0, :] * wb
        + X[y0, x1, :] * wc
        + X[y1, x1, :] * wd
    )


#######################################################################
#                        Fourier Decomposition                        #
#######################################################################


def DCT(frame, orthonormal=True):
    """
    A naive O(N^2) implementation of the 1D discrete cosine transform-II
    (DCT-II). For a signal x consisting of N samples, the k'th DCT coefficient,
    c[k], is

        c[k] = 2 * sum_{n=0}^{N-1} x[n] cos [pi * k * (2 * n + 1) / (2 * N)]

    where k ranges from 0, ..., N-1.

    The DCT is highly similar to the DFT  -- whereas in a DFT the basis
    functions are sinusoids, in a DCT they are restricted solely to cosines. A
    signal's DCT representation tends to have more of its energy concentrated
    in a smaller number of coefficients when compared to the DFT, and is thus
    commonly used for signal compression. **

    ** Smoother signals can be accurately approximated using fewer DFT / DCT
    coefficients, resulting in a higher compression ratio. The DCT naturally
    yields a continuous extension at the signal boundaries due to its use of
    even basis functions (cosine). This in turn produces a smoother extension
    in comparison to DFT or DST (discrete sine transform) approximations,
    resulting in a higher compression.

    Parameters
    ----------
    frame : numpy array of shape (N,)
        A signal frame consisting of N samples
    orthonormal : bool (default: True)
        Scale the coefficients so that the transform is orthonormal: c[0] is
        multiplied by sqrt(1 / (4 * N)) and every other coefficient by
        sqrt(1 / (2 * N)).

    Returns
    -------
    dct : numpy array of shape (N,)
        The discrete cosine transform of the samples in `frame`. The result is
        floating point even for integer-valued input (floating / complex
        input dtypes are preserved).
    """
    frame = np.asarray(frame)
    N = len(frame)

    # An integer accumulator would silently truncate every coefficient, so
    # only inexact (float / complex) input dtypes are carried over as-is.
    if np.issubdtype(frame.dtype, np.inexact):
        out_dtype = frame.dtype
    else:
        out_dtype = np.float64

    if N == 0:
        return np.zeros(0, dtype=out_dtype)

    # basis[k, n] = cos(pi * k * (2n + 1) / (2N))
    n = np.arange(N)
    k = n.reshape(-1, 1)
    basis = np.cos(np.pi * k * (2 * n + 1) / (2 * N))

    out = 2 * (basis @ frame)

    if orthonormal:
        scale = np.full(N, np.sqrt(1 / (2 * N)))
        scale[0] = np.sqrt(1 / (4 * N))
        out = out * scale

    return out.astype(out_dtype, copy=False)


def __DCT2(frame):
    """
    Matrix-product implementation of the unnormalized 1D DCT-II:

        c[k] = 2 * sum_{n=0}^{N-1} x[n] cos [pi * k * (2 * n + 1) / (2 * N)]

    Parameters
    ----------
    frame : numpy array of shape (N,)
        A signal frame consisting of N samples

    Returns
    -------
    dct : numpy array of shape (N,)
        The unnormalized DCT-II coefficients of `frame`
    """
    frame = np.asarray(frame, dtype=float)
    N = len(frame)  # window length

    n = np.arange(N, dtype=float)
    k = n.reshape(-1, 1)

    # The cosine argument splits as pi*k*n/N + pi*k/(2N). The phase term
    # depends on k alone and must be divided by (2 * N) as a whole; writing
    # it as `np.pi / 2 * N` multiplies by N instead.
    FC = np.cos(np.pi * k * (2 * n + 1) / (2 * N))
    return 2 * (FC @ frame)


def DFT(frame, positive_only=True):
    """
    A naive O(N^2) implementation of the 1D discrete Fourier transform (DFT).

    The Fourier transform decomposes a signal into a linear combination of
    sinusoids (ie., basis elements in the space of continuous periodic
    functions).  For a sequence `x` of N evenly spaced samples, the k'th
    DFT coefficient is given by:

        c[k] = sum_{n=0}^{N-1} x[n] * exp(-2 * pi * i * k * n / N)

    where i is the imaginary unit, k is an index ranging from 0, ..., N-1,
    and c[k] is the complex coefficient whose magnitude and angle give the
    amplitude and phase of the k'th sinusoid in the DFT spectrum. The
    frequency of the k'th sinusoid is (k * 2 * pi / N) radians per sample.

    When applied to a real-valued input, the negative frequency terms are the
    complex conjugates of the positive-frequency terms and the overall spectrum
    is symmetric (excluding the first index, which contains the zero-frequency
    / intercept term).

    Parameters
    ----------
    frame : numpy array of shape (N,)
        A signal frame consisting of N samples
    positive_only : bool (default: True)
        Whether to only return the coefficients for the positive frequency
        terms

    Returns
    -------
    spectrum : numpy array of shape (N,) or (N // 2 + 1,) if `positive_only`
        The coefficients of the frequency spectrum for `frame`, including
        imaginary components.
    """
    frame = np.asarray(frame)
    N = len(frame)  # window length

    if N == 0:
        return np.zeros(0, dtype=complex)

    # F[k, n] = exp(-2 * pi * i * k * n / N): basis vector k at timestep n
    idx = np.arange(N)
    F = np.exp((idx.reshape(-1, 1) * idx.reshape(1, -1)) * (-2j * np.pi / N))

    # A plain matrix-vector product is used on purpose: np.vdot conjugates its
    # first argument, which would flip the sign of the exponent and return the
    # complex conjugate of the spectrum defined above.
    spectrum = F @ frame
    return spectrum[: (N // 2) + 1] if positive_only else spectrum


def dft_bins(N, fs=44000, positive_only=True):
    """
    Calc the frequency bin centers for a DFT with N coefficients.

    The k'th DFT coefficient corresponds to a frequency of k * fs / N Hz, with
    the upper portion of the spectrum interpreted as negative frequencies.

    Parameters
    ----------
    N : int
        The number of frequency bins in the DFT
    fs : int (default: 44000)
        The sample rate/frequency of the signal (in Hz)
    positive_only : bool (default: True)
        Whether to only return the bins for the positive frequency
        terms

    Returns
    -------
    bins : numpy array of shape (N,) or (N // 2 + 1,) if `positive_only`
        The frequency bin centers associated with each coefficient in the
        DFT spectrum

    Raises
    ------
    ValueError
        If `N` is less than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer, but got {}".format(N))

    if positive_only:
        # Bins 0, ..., N // 2. For odd N the highest bin sits strictly below
        # the Nyquist frequency fs / 2, so the bins are spaced by fs / N
        # rather than stretched out to fs / 2.
        k = np.arange(N // 2 + 1)
    else:
        # non-negative bins first, then the negative bins in ascending order
        # (the same layout as the full DFT spectrum)
        n_nonneg = (N - 1) // 2 + 1
        k = np.r_[np.arange(n_nonneg), np.arange(-(N // 2), 0)]
    return k * fs / N


def magnitude_spectrum(frames):
    """
    Compute the magnitude spectrum (i.e., the absolute value of the DFT
    coefficients) of every frame in `frames`. Assumes each frame is
    real-valued only.

    Parameters
    ----------
    frames : numpy array of shape (M, N)
        A sequence of `M` frames each consisting of `N` samples

    Returns
    -------
    magnitude_spec : numpy array of shape (M, N // 2 + 1)
        The magnitude spectrum for each frame in `frames`. Only includes the
        coefficients for the positive spectrum frequencies.
    """
    magnitudes = []
    for frame in frames:
        # positive-frequency half of the spectrum only
        coefs = DFT(frame, positive_only=True)
        magnitudes.append(np.abs(coefs))
    return np.vstack(magnitudes)


def power_spectrum(frames, scale=False):
    """
    Compute the power spectrum for a signal represented as a collection of
    frames. Assumes each frame is real-valued only.

    The power spectrum is the square of the magnitude spectrum, optionally
    divided by the number of (positive-frequency) DFT bins. It measures how
    the energy of the signal is distributed over the frequency domain.

    Parameters
    ----------
    frames : numpy array of shape (M, N)
        A sequence of `M` frames each consisting of `N` samples
    scale : bool (default: False)
        Whether to scale by the number of DFT bins, N // 2 + 1

    Returns
    -------
    power_spec : numpy array of shape (M, N // 2 + 1)
        The power spectrum for each frame in `frames`. Only includes the
        coefficients for the positive spectrum frequencies.
    """
    if scale:
        n_bins = frames.shape[1] // 2 + 1
    else:
        n_bins = 1

    magnitudes = magnitude_spectrum(frames)
    return (1 / n_bins) * magnitudes ** 2


#######################################################################
#                       Preprocessing Utils                           #
#######################################################################


def to_frames(x, frame_width, stride, writeable=False):
    """
    Convert a 1D signal x into overlapping windows of width `frame_width` using
    a hop length of `stride`.

    NB 1: if (len(x) - frame_width) % stride != 0 then some number of the samples
    in x will be dropped. Specifically,
        n_dropped_samples = len(x) - frame_width - stride * (n_frames - 1)
    where
        n_frames = (len(x) - frame_width) // stride + 1

    NB 2: This method uses low-level stride manipulation to avoid creating an
    additional copy of `x`. The downside is that if `writeable`=True, modifying
    the `frame` output can result in unexpected behavior:

        >>> out = to_frames(np.arange(6), 5, 1, writeable=True)
        >>> out
        array([[0, 1, 2, 3, 4],
               [1, 2, 3, 4, 5]])
        >>> out[0, 1] = 99
        >>> out
        array([[ 0, 99,  2,  3,  4],
               [99,  2,  3,  4,  5]])

    Parameters
    ----------
    x : numpy array of shape (N,)
        A 1D signal consisting of N samples
    frame_width : int
        The width of a single frame window in samples
    stride : int
        The hop size / number of samples advanced between consecutive frames
    writeable : bool (default: False)
        If set to False, the returned array will be readonly. Otherwise it will
        be writable if `x` was. It is advisable to set this to False whenever
        possible to avoid unexpected behavior (see NB 2 above).

    Returns
    -------
    frame: numpy array of shape (n_frames, frame_width)
        The collection of overlapping frames stacked into a matrix. This is a
        view onto the memory of `x`, not a copy.
    """
    assert x.ndim == 1
    assert stride >= 1
    assert len(x) >= frame_width

    # Number of BYTES between consecutive samples of x. This is read from the
    # array's own strides rather than `x.itemsize` so that non-contiguous
    # inputs (e.g. the view `signal[::2]`) are framed correctly instead of
    # reading the samples that were skipped in the underlying buffer.
    step = x.strides[0]

    n_frames = (len(x) - frame_width) // stride + 1
    return as_strided(
        x,
        shape=(n_frames, frame_width),
        strides=(step * stride, step),
        writeable=writeable,
    )


def autocorrelate1D(x):
    """
    Autocorrelate a 1D signal `x` with itself.

        auto[k] = sum_n x[n + k] * x[n]

    NB. This is a naive O(N^2) implementation.  For a faster O(N log N)
    approach using the FFT, see:
    https://en.wikipedia.org/wiki/Autocorrelation#Efficient_computation

    Parameters
    ----------
    x : numpy array of shape (N,)
        A real-valued 1D signal consisting of N samples

    Returns
    -------
    auto : numpy array of shape (N,)
        The autocorrelation of `x` with itself at lags 0, ..., N - 1
    """
    x = np.asarray(x)
    if not np.iscomplexobj(x):
        # Do the arithmetic in float64 so that products of narrow integer
        # dtypes (e.g. int8 / int16 audio samples) cannot overflow before
        # they are accumulated.
        x = x.astype(float)

    N = len(x)
    auto = np.zeros(N)
    for k in range(N):
        # the lag-k term is the inner product of x with itself shifted by k
        auto[k] = np.dot(x[k:], x[: N - k])
    return auto


#######################################################################
#                               Filters                               #
#######################################################################


def preemphasis(x, alpha):
    """
    Apply a first-order preemphasis filter to `x`, boosting the amplitude of
    high frequency bands relative to the lower bands.

    Preemphasis filtering is (was?) a common transform in speech processing,
    where higher frequencies tend to be more useful during signal
    disambiguation.

        preemphasis( x[t] ) = x[t] - alpha * x[t-1]

    The first sample has no predecessor and is passed through unchanged.

    Parameters
    ----------
    x : numpy array of shape (N,)
        A 1D signal consisting of N samples
    alpha : float in [0, 1)
        The preemphasis coefficient. A value of 0 corresponds to no
        filtering

    Returns
    -------
    out : numpy array of shape (N,)
        The filtered signal
    """
    first = x[:1]
    current, previous = x[1:], x[:-1]
    filtered = current - alpha * previous
    return np.concatenate((first, filtered))


def cepstral_lifter(mfccs, D):
    """
    A simple sinusoidal filter ("lifter") applied to a matrix of cepstral
    coefficients.

    The n'th coefficient of every frame is rescaled according to:

        lifter( x[n] ) = x[n] * [1 + D / 2 * sin (pi * n / D)]

    Parameters
    ----------
    mfccs : numpy array of shape (G, C)
        Matrix of Mel cepstral coefficients. Rows correspond to frames, columns
        to cepstral coefficients
    D : int in [0, +infty]
        The filter coefficient. 0 corresponds to no filtering (the input is
        returned as-is)

    Returns
    -------
    out : numpy array of shape (G, C)
        The lifter'd MFCC coefficients
    """
    if D == 0:
        return mfccs

    # one gain per cepstral coefficient (column), shared by all frames
    coef_idx = np.arange(mfccs.shape[1])
    gain = 1 + (D / 2) * np.sin(np.pi * coef_idx / D)
    return mfccs * gain


def mel_spectrogram(
    x,
    window_duration=0.025,
    stride_duration=0.01,
    mean_normalize=True,
    window="hamming",
    n_filters=20,
    center=True,
    alpha=0.95,
    fs=44000,
):
    """
    Apply the Mel-filterbank to the power spectrum for a signal `x`.

    Specifically, the Mel spectrogram is the projection of the power spectrum
    of the framed and windowed signal onto the basis set provided by the Mel
    filterbank.

    Parameters
    ----------
    x : numpy array of shape (N,)
        A 1D signal consisting of N samples
    window_duration : float (default: 0.025)
        The duration of each frame / window (in seconds)
    stride_duration : float (default: 0.01)
        The duration of the hop between consecutive windows (in seconds)
    mean_normalize : bool (default : True)
        Whether to subtract the coefficient means from the final filter values
        to improve the signal-to-noise ratio
    window : {'hamming', 'hann', 'blackman_harris'} (default: 'hamming')
        The windowing function to apply to the signal before FFT
    n_filters : int (default: 20)
        The number of mel filters to include in the filterbank
    center : bool (default : True)
        Whether the kth frame of the signal should *begin* at index x[k *
        stride_len] (center = False) or be *centered* at x[k * stride_len]
        (center = True). Centering is implemented by reflect-padding the
        signal with half a frame on either side.
    alpha : float in [0, 1) (default: 0.95)
        The coefficient for the preemphasis filter. A value of 0 corresponds to
        no filtering
    fs : int (default : 44000)
        The sample rate/frequency for the signal

    Returns
    -------
    filter_energies : numpy array of shape (G, n_filters)
        The (possibly mean_normalized) power for each filter in the Mel
        filterbank (i.e., the Mel spectrogram). Rows correspond to frames,
        columns to filters. Exact zeros are replaced with machine epsilon.
    energy_per_frame : numpy array of shape (G,)
        The total energy in each frame of the signal. Exact zeros are replaced
        with machine epsilon.
    """
    eps = np.finfo(float).eps
    window_fn = WindowInitializer()(window)

    # frame geometry, in samples
    stride = round(stride_duration * fs)
    frame_width = round(window_duration * fs)
    N = frame_width

    # add a preemphasis filter to the raw signal
    x = preemphasis(x, alpha)

    # convert signal to overlapping frames and apply a window function
    x = np.pad(x, N // 2, "reflect") if center else x

    # NB: the fourth positional argument of `to_frames` is `writeable`, not the
    # sample rate. The frames are only ever read here, so the default
    # read-only view is requested.
    frames = to_frames(x, frame_width, stride)

    # the window broadcasts across the frame (row) axis
    frames = frames * window_fn(frame_width)

    # compute the power spectrum
    power_spec = power_spectrum(frames)
    energy_per_frame = np.sum(power_spec, axis=1)
    energy_per_frame[energy_per_frame == 0] = eps  # keeps later logs finite

    # compute the power at each filter in the Mel filterbank
    fbank = mel_filterbank(N, n_filters=n_filters, fs=fs)
    filter_energies = power_spec @ fbank.T
    if mean_normalize:
        filter_energies -= np.mean(filter_energies, axis=0)
    filter_energies[filter_energies == 0] = eps  # keeps later logs finite
    return filter_energies, energy_per_frame


#######################################################################
#                       Mel-Frequency Features                        #
#######################################################################


def mfcc(
    x,
    fs=44000,
    n_mfccs=13,
    alpha=0.95,
    center=True,
    n_filters=20,
    window="hann",
    normalize=True,
    lifter_coef=22,
    stride_duration=0.01,
    window_duration=0.025,
    replace_intercept=True,
):
    """
    Compute the Mel-frequency cepstral coefficients (MFCC) for a signal.

    Computing MFCC features proceeds in the following stages:

        1. Convert the signal into overlapping frames and apply a window fn
        2. Compute the power spectrum at each frame
        3. Apply the mel filterbank to the power spectra to get mel filterbank powers
        4. Take the logarithm of the mel filterbank powers at each frame
        5. Take the discrete cosine transform (DCT) of the log filterbank
           energies and retain only the first k coefficients to further reduce
           the dimensionality

    MFCCs were developed in the context of HMM-GMM automatic speech recognition
    (ASR) systems and can be used to provide a somewhat speaker/pitch
    invariant representation of phonemes.

    Parameters
    ----------
    x : numpy array of shape (N,)
        A 1D signal consisting of N samples
    fs : int (default : 44000)
        The sample rate/frequency for the signal
    n_mfccs : int (default : 13)
        The number of cepstral coefficients to return (including the intercept
        coefficient)
    alpha : float in [0, 1)
        The preemphasis coefficient. A value of 0 corresponds to no
        filtering
    center : bool (default : True)
        Whether the kth frame of the signal should *begin* at index x[k *
        stride_len] (center = False) or be *centered* at x[k * stride_len]
        (center = True)
    n_filters : int (default: 20)
        The number of filters to include in the Mel filterbank
    window : {'hamming', 'hann', 'blackman_harris'} (default : 'hann')
        The windowing function to apply to the signal before taking the DFT
    normalize : bool (default: True)
        Whether to mean-normalize the MFCC values
    lifter_coef : int in [0, +infty]
        The cepstral filter coefficient. 0 corresponds to no filtering, larger
        values correspond to greater amounts of smoothing
    stride_duration : float
        The duration of the hop between consecutive windows (in seconds)
    window_duration : float
        The duration of each frame / window (in seconds)
    replace_intercept : bool (default: True)
        Replace the first MFCC coefficient (the intercept term) with the
        log of the total frame energy instead.

    Returns
    -------
    mfccs : numpy array of shape (G, C)
        Matrix of Mel-frequency cepstral coefficients. Rows correspond to
        frames, columns to cepstral coefficients
    """
    # stages 1-3: frame + window the signal and project its power spectrum
    # onto the mel filterbank (no mean normalization at this point)
    mel_powers, total_energy = mel_spectrogram(
        x=x,
        fs=fs,
        alpha=alpha,
        center=center,
        window=window,
        n_filters=n_filters,
        mean_normalize=False,
        window_duration=window_duration,
        stride_duration=stride_duration,
    )

    # stage 4: filterbank powers in decibels
    log_mel = 10 * np.log10(mel_powers)

    # stage 5: DCT every frame of log-mel values and keep only the leading
    # `n_mfccs` coefficients
    cepstra = np.array([DCT(row) for row in log_mel])
    cepstra = cepstra[:, :n_mfccs]

    cepstra = cepstral_lifter(cepstra, D=lifter_coef)

    if normalize:
        # remove the per-coefficient mean taken over all frames
        cepstra -= np.mean(cepstra, axis=0)

    if replace_intercept:
        # swap the 0th coefficient for the (natural) log of the total frame
        # energy
        cepstra[:, 0] = np.log(total_energy)

    return cepstra


def mel2hz(mel, formula="htk"):
    """
    Convert the mel-scale representation of a signal into Hz

    Parameters
    ----------
    mel : numpy array of shape (N, ...)
        An array of mel frequencies to convert
    formula : {"htk", "slaney"}
        The Mel formula to use. "htk" uses the formula used by the Hidden
        Markov Model Toolkit, and described in O'Shaughnessy (1987). "slaney"
        uses the formula used in the MATLAB auditory toolbox (Slaney, 1998).
        Only "htk" is currently implemented.

    Returns
    -------
    hz : numpy array of shape (N, ...)
        The frequencies of the items in `mel`, in Hz
    """
    assert formula in [
        "htk",
        "slaney",
    ], "formula must be either 'htk' or 'slaney' but got '{}'".format(formula)

    if formula != "htk":
        raise NotImplementedError("slaney")

    # inverse of the HTK mapping mel = 2595 * log10(1 + hz / 700)
    return 700 * (10 ** (mel / 2595) - 1)


def hz2mel(hz, formula="htk"):
    """
    Convert the frequency representation of a signal in Hz into the mel scale.

    Parameters
    ----------
    hz : numpy array of shape (N, ...)
        An array of frequencies, in Hz, to convert
    formula : {"htk", "slaney"}
        The Mel formula to use. "htk" uses the formula used by the Hidden
        Markov Model Toolkit, and described in O'Shaughnessy (1987). "slaney"
        uses the formula used in the MATLAB auditory toolbox (Slaney, 1998).
        Only "htk" is currently implemented.

    Returns
    -------
    mel : numpy array of shape (N, ...)
        The mel-scale values of the items in `hz`
    """
    assert formula in [
        "htk",
        "slaney",
    ], "formula must be either 'htk' or 'slaney' but got '{}'".format(formula)

    if formula != "htk":
        raise NotImplementedError("slaney")

    # HTK mapping
    return 2595 * np.log10(1 + hz / 700)


def mel_filterbank(
    N, n_filters=20, fs=44000, min_freq=0, max_freq=None, normalize=True
):
    """
    Compute the filters in a Mel filterbank and return the corresponding
    transformation matrix

    The Mel scale is a perceptual scale designed to simulate the way the human
    ear works. Pitches judged by listeners to be equal in perceptual /
    psychological distance have equal distance on the Mel scale.  Practically,
    this corresponds to a scale with higher resolution at low frequencies and
    lower resolution at higher (> 500 Hz) frequencies.

    Each filter in the Mel filterbank is triangular: it rises linearly from
    the center frequency of the previous filter up to its own center and then
    decays linearly until it reaches the center frequency of the next filter.

    NB. This implementation is based on code in the (superb) LibRosa package:
    https://librosa.github.io

    Parameters
    ----------
    N : int
        The number of DFT bins
    n_filters : int (default: 20)
        The number of mel filters to include in the filterbank
    fs : int (default : 44000)
        The sample rate/frequency for the signal
    min_freq : int (default: 0)
        Minimum filter frequency (in Hz)
    max_freq : int (default: None)
        Maximum filter frequency (in Hz). If None, `fs / 2` is used.
    normalize : bool (default: True)
        If True, scale each filter by 2 / (width of its support in Hz).

    Returns
    -------
    fbank : numpy array of shape (n_filters, N // 2 + 1)
        The mel-filterbank transformation matrix. Rows correspond to filters,
        columns to DFT bins.
    """
    if max_freq is None:
        max_freq = fs / 2
    lo_mel = hz2mel(min_freq)
    hi_mel = hz2mel(max_freq)

    # n_filters + 2 band edges, uniformly spaced on the mel scale and
    # expressed in Hz; filter i spans edges i, i + 1 (its peak) and i + 2
    edges_hz = mel2hz(np.linspace(lo_mel, hi_mel, n_filters + 2))
    edge_gaps = np.diff(edges_hz)

    # the centers of the frequency bins for the DFT
    bin_hz = dft_bins(N, fs)

    # offsets[i, j] = edges_hz[i] - bin_hz[j]
    offsets = edges_hz.reshape(-1, 1) - bin_hz.reshape(1, -1)

    # rising and falling slopes of every triangle, for all filters at once
    rising = -offsets[:-2] / edge_gaps[:-1].reshape(-1, 1)
    falling = offsets[2:] / edge_gaps[1:].reshape(-1, 1)

    # a triangle is the lower of its two slopes, floored at zero
    fbank = np.maximum(0, np.minimum(rising, falling))

    if normalize:
        support = edges_hz[2 : n_filters + 2] - edges_hz[:n_filters]
        fbank = fbank * (2.0 / support)[:, np.newaxis]

    return fbank