In [120]:
import librosa
import numpy as np
import scipy
import scipy.fftpack
import scipy.signal

In [160]:
# STFT framing: FFT size and the hop between consecutive frames.
n_fft, hop_length = 1024, 768
# Mel filter bank size and the number of cepstral coefficients to keep.
n_mels, n_mfcc = 128, 18
# Forwarded as `htk=` to the mel filter bank and MFCC calls.
htk = False

In [192]:
# Manual MFCC pipeline, one stage at a time, so each intermediate can be
# inspected and compared with librosa.feature.mfcc.
y, sr = librosa.load("snare.wav", sr=44100, duration=0.04)

# Mel filter bank. `sr` is passed by keyword: recent librosa releases no
# longer accept it positionally.
mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, htk=htk)

# Power spectrogram: squared magnitude of the complex STFT.
stft = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)) ** 2

# Project onto the mel filters, then convert power to decibels.
melspectrogram = np.dot(mel_basis, stft)
S = librosa.power_to_db(melspectrogram)

# Orthonormal DCT-II along the mel axis; keep the first n_mfcc coefficients.
dct = scipy.fftpack.dct(S, axis=0, type=2, norm='ortho')[:n_mfcc]

# Show coefficient row 6 for comparison with the same row of librosa's MFCC output.
dct[6]

array([ 6.0769954, 20.773346 , 16.251583 ], dtype=float32)

In [187]:
# Reference MFCCs straight from librosa, driven by the notebook's config
# constants so the settings cannot drift from the manual computation.
y, sr = librosa.load("snare.wav", sr=44100, duration=0.04)

# `y` is passed by keyword: recent librosa releases no longer accept it
# positionally.
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft,
                            hop_length=hop_length, n_mels=n_mels, htk=htk)
mfcc

array([[-2.86003326e+02, -3.34808105e+02, -3.69137604e+02],
       [ 6.87944641e+01,  2.83664112e+01,  3.45792770e+01],
       [-1.12308792e+02, -7.76358643e+01, -5.18394089e+01],
       [ 5.42328796e+01,  8.86547318e+01,  8.60850906e+01],
       [-1.98972530e+01, -2.22195187e+01, -2.10354557e+01],
       [-2.00221014e+00,  2.21133156e+01,  2.21021461e+01],
       [ 6.07699537e+00,  2.07733459e+01,  1.62515831e+01],
       [ 1.65625725e+01,  1.50623417e+01, -6.57583177e-01],
       [ 6.83916092e-01,  1.03626194e+01,  7.85667956e-01],
       [ 1.63335991e+01,  2.43107834e+01,  1.58602848e+01],
       [ 4.17882442e-01,  2.75050879e+00,  5.97928524e+00],
       [ 7.03080416e-01,  5.55301905e+00,  2.33952093e+00],
       [-3.14959598e+00,  1.25797768e+01,  1.42104876e+00],
       [ 1.08754606e+01,  1.21074438e+01,  1.05043144e+01],
       [ 1.48730907e+01,  6.08099842e+00,  5.88224792e+00],
       [-6.30869818e+00, -3.08710098e+00, -3.26855993e+00],
       [ 4.90730190e+00,  1.30602665e+01

In [123]:
# Number of samples to pad on each side: floor(n_fft / 2)
# Reflect the signal about its first and last values
# Number of FFT frames: 1 + int((len(y) - frame_length) / hop_length), with len(y) taken after padding
# Perform the STFT



In [None]:
def stft(y, n_fft=2048, hop_length=None, win_length=None, window='hann',
         center=True, dtype=np.complex64, pad_mode='reflect'):
    """Short-time Fourier transform of `y`, computed in column blocks.

    Parameters
    ----------
    y : array
        Time series to transform; checked with `librosa.util.valid_audio`.
    n_fft : int
        FFT size, which is also the length of each frame.
    hop_length : int or None
        Hop between consecutive frames. Defaults to `win_length // 4`.
    win_length : int or None
        Window length. Defaults to `n_fft`; a shorter window is
        centre-padded out to `n_fft`.
    window : str
        Window specification forwarded to `scipy.signal.get_window`.
    center : bool
        If true, `y` is padded by `n_fft // 2` on both sides using
        `pad_mode` before framing.
    dtype : numpy dtype
        dtype of the returned matrix.
    pad_mode : str
        `np.pad` mode used when `center` is true.

    Returns
    -------
    stft_matrix : array, shape (1 + n_fft // 2, n_frames)
        Fortran-ordered matrix holding the first `1 + n_fft // 2` FFT bins
        of every windowed frame.
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = scipy.signal.get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size (`size` by keyword so the call also
    # works where pad_center takes it keyword-only)
    fft_window = librosa.util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast across frame columns
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    librosa.util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Slice the time series into frames, one frame per column
    y_frames = librosa.util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # How many columns fit within MAX_MEM_BLOCK? Always process at least one:
    # a zero step would make range() below raise ValueError when a single
    # column is larger than the block budget.
    n_columns = int(librosa.util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                                  stft_matrix.itemsize))
    n_columns = max(n_columns, 1)

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        # Full FFT of the windowed frames, truncated to the retained bins
        stft_matrix[:, bl_s:bl_t] = scipy.fftpack.fft(
            fft_window * y_frames[:, bl_s:bl_t],
            axis=0)[:stft_matrix.shape[0]]

    return stft_matrix

In [None]:
def mfcc(y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', **kwargs):
    """Return the first `n_mfcc` DCT coefficients of a spectrogram.

    Parameters
    ----------
    y : array or None
        Audio time series; only used when `S` is None.
    sr : number
        Sampling rate forwarded to `librosa.feature.melspectrogram`; only
        used when `S` is None.
    S : array or None
        Spectrogram to transform. When given it is used as-is and `y`, `sr`
        and `kwargs` are ignored. When None it is computed as
        `librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, **kwargs))`.
    n_mfcc : int
        Number of leading coefficients (rows) to keep.
    dct_type : int
        `type` argument for `scipy.fftpack.dct`.
    norm : str or None
        `norm` argument for `scipy.fftpack.dct`.
    **kwargs
        Extra keyword arguments forwarded to `librosa.feature.melspectrogram`.

    Returns
    -------
    array
        The first `n_mfcc` rows of the DCT of `S` taken along axis 0.
    """
    if S is None:
        # Qualified through `librosa`: the bare names are not importable in
        # this notebook, and `melspectrogram` is rebound to an array by an
        # earlier cell.
        mel_power = librosa.feature.melspectrogram(y=y, sr=sr, **kwargs)
        S = librosa.power_to_db(mel_power)

    coefficients = scipy.fftpack.dct(S, axis=0, type=dct_type, norm=norm)
    return coefficients[:n_mfcc]