# MFCC preprocessing
Prototype and playground for MFCC preprocessing.

In [183]:
import numpy as np
import pickle

Initialize toy data in which the items have different numbers of frames, while every frame has the same number of cepstral coefficients.
- Layout: (data, frame, cepstral)

In [None]:
# Toy batch of four items with 5, 4, 6 and 2 frames of 3 coefficients each.
# The items differ in frame count, so the batch is ragged: NumPy >= 1.24
# refuses to build it implicitly, hence the explicit dtype=object, which
# yields a 1-D array holding one 2-D (frame, cepstral) array per item.
mfcc = np.array([
    np.array([[1,2,1],[2,2,2],[3,3,3],[3,3,3],[3,3,3]]),
    np.array([[2,2,2],[3,3,3],[4,4,4],[4,4,4]]),
    np.array([[3,3,3],[4,4,4],[5,5,5],[5,5,5],[5,5,5],[5,5,5]]),
    np.array([[5,5,5],[6,6,6]])
], dtype=object)

## Normalizing

In [115]:
# Mean of the per-item coefficient means: every item counts equally,
# regardless of how many frames it contains.
mfcc_mean = np.stack([frames.mean(axis=0) for frames in mfcc]).mean(axis=0)
mfcc_mean

array([3.9125, 3.9625, 3.9125])

In [130]:
# Mean of the per-item coefficient standard deviations (not the standard
# deviation of the pooled frames).
mfcc_std = np.stack([frames.std(axis=0) for frames in mfcc]).mean(axis=0)
mfcc_std

array([0.7232297 , 0.64570419, 0.7232297 ])

In [131]:
# Standardize every item with the shared per-coefficient statistics;
# broadcasting applies them to each frame of the item.
mfcc_sub = [np.divide(frames - mfcc_mean, mfcc_std) for frames in mfcc]
mfcc_sub

[array([[-4.02707464, -3.03931743, -4.02707464],
        [-2.64438807, -3.03931743, -2.64438807],
        [-1.2617015 , -1.49062065, -1.2617015 ],
        [-1.2617015 , -1.49062065, -1.2617015 ],
        [-1.2617015 , -1.49062065, -1.2617015 ]]),
 array([[-2.64438807, -3.03931743, -2.64438807],
        [-1.2617015 , -1.49062065, -1.2617015 ],
        [ 0.12098508,  0.05807613,  0.12098508],
        [ 0.12098508,  0.05807613,  0.12098508]]),
 array([[-1.2617015 , -1.49062065, -1.2617015 ],
        [ 0.12098508,  0.05807613,  0.12098508],
        [ 1.50367165,  1.60677291,  1.50367165],
        [ 1.50367165,  1.60677291,  1.50367165],
        [ 1.50367165,  1.60677291,  1.50367165],
        [ 1.50367165,  1.60677291,  1.50367165]]),
 array([[1.50367165, 1.60677291, 1.50367165],
        [2.88635822, 3.15546969, 2.88635822]])]

In [132]:
def normalize_mfcc_by_mean_cepstral(mfcc):
    ceps_mean = np.mean([np.mean(item, axis=0) for item in mfcc], axis=0)
    ceps_std = np.mean([np.std(item, axis=0) for item in mfcc], axis=0)
    return np.array([(item-ceps_mean)/ceps_std for item in mfcc])

In [134]:
# Standardize the toy batch; the result is reused by the padding cells below.
norm_mfcc = normalize_mfcc_by_mean_cepstral(mfcc)

## Padding

In [180]:
def find_max_length(x):
    """Return the largest first-axis size among the arrays in ``x``.

    Each element must expose ``.shape``; an empty ``x`` raises ``ValueError``.
    """
    frame_counts = [arr.shape[0] for arr in x]
    longest = max(frame_counts)
    return longest

# Longest frame count in the normalized toy batch
find_max_length(norm_mfcc)

6

In [229]:
def pad_data(vector, pad_width, iaxis, kwargs):
    """Padding callback for ``np.pad``: fill the padded ends with a constant.

    ``np.pad`` calls this once per 1-D slice along each axis and expects the
    slice to be modified in place; the return value is ignored.

    Parameters
    ----------
    vector : numpy.ndarray
        1-D slice that already includes the padded positions.
    pad_width : pair of int
        Number of padded entries at the start and at the end of ``vector``.
    iaxis : int
        Axis currently being processed (unused).
    kwargs : dict
        Extra keyword arguments given to ``np.pad``; ``'padder'`` sets the
        fill value and defaults to 0.
    """
    pad_value = kwargs.get('padder', 0)
    leading, trailing = pad_width
    vector[:leading] = pad_value
    # Guard the trailing side: vector[-0:] would select the whole vector.
    if trailing:
        vector[-trailing:] = pad_value


# Other cells in this notebook refer to the callback as `pad_with`; bind that
# name as well so they do not fail with a NameError on a fresh kernel.
pad_with = pad_data

# Left-pad every normalized item with zero frames up to the longest item.
# Uses the `pad_data` callback defined in this cell (the previous name,
# `pad_with`, is not defined anywhere above and raised a NameError).
max_length = find_max_length(norm_mfcc)
[np.pad(item, [(max(0, max_length-item.shape[0]),0),(0,0)], pad_data) for item in norm_mfcc]

[array([[ 0.        ,  0.        ,  0.        ],
        [-4.02707464, -3.03931743, -4.02707464],
        [-2.64438807, -3.03931743, -2.64438807],
        [-1.2617015 , -1.49062065, -1.2617015 ],
        [-1.2617015 , -1.49062065, -1.2617015 ],
        [-1.2617015 , -1.49062065, -1.2617015 ]]),
 array([[ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ],
        [-2.64438807, -3.03931743, -2.64438807],
        [-1.2617015 , -1.49062065, -1.2617015 ],
        [ 0.12098508,  0.05807613,  0.12098508],
        [ 0.12098508,  0.05807613,  0.12098508]]),
 array([[-1.2617015 , -1.49062065, -1.2617015 ],
        [ 0.12098508,  0.05807613,  0.12098508],
        [ 1.50367165,  1.60677291,  1.50367165],
        [ 1.50367165,  1.60677291,  1.50367165],
        [ 1.50367165,  1.60677291,  1.50367165],
        [ 1.50367165,  1.60677291,  1.50367165]]),
 array([[0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        ,

## Preprocess from audio to mfcc
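- librosa returns each clip as (feature, timeframe); after the axis swap and padding below, the final array is (data, timeframe, feature)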

In [227]:
import os
import math
from functools import partial
import librosa
from librosa import feature

# np_load_old = partial(np.load)
# np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

In [191]:
# 'ns_audio_data' appears to be an object array of variable-length waveforms
# (its shape is 1-D and each element is sliced individually later on).
# NumPy only loads object arrays when allow_pickle=True.
# SECURITY: unpickling can execute arbitrary code - only load trusted files.
# The context manager closes the underlying .npz archive once the data is read.
with np.load(os.path.join('../data/d_dataset_t3', 'dataset.npz'), allow_pickle=True) as dataset:
    audio_data = dataset['ns_audio_data'][:20]  # keep the first 20 clips for prototyping

In [194]:
# Sanity check: one entry per loaded clip
audio_data.shape

(20,)

In [192]:
def split_audio(audio_data):
    """Split every clip into a first and a second half.

    For a clip of length ``n`` the halves are ``clip[:ceil(n / 2)]`` and
    ``clip[floor(n / 2):]``. For odd ``n`` both halves therefore have the
    same length and share the middle sample.

    Parameters
    ----------
    audio_data : iterable of sequences
        Clips supporting ``len()`` and slicing.

    Returns
    -------
    numpy.ndarray
        Halves in order (clip 0 first, clip 0 second, clip 1 first, ...).
        A dense array when all halves share one shape; otherwise a 1-D
        ``dtype=object`` array with one entry per half.
    """
    halves = []
    for clip in audio_data:
        n_samples = len(clip)
        halves.append(clip[:math.ceil(0.5 * n_samples)])
        halves.append(clip[math.floor(0.5 * n_samples):])

    # Equal shapes (or no clips at all): a regular dense array is well defined.
    if len({np.shape(half) for half in halves}) <= 1:
        return np.array(halves)

    # Ragged halves: NumPy >= 1.24 raises on an implicit ragged np.array(...),
    # so fill a 1-D object array element by element.
    ragged = np.empty(len(halves), dtype=object)
    for idx, half in enumerate(halves):
        ragged[idx] = half
    return ragged

In [197]:
# Two halves per clip, so the number of items doubles
split_audio_data = split_audio(audio_data)

In [198]:
# Sanity check: expect twice as many entries as clips
split_audio_data.shape

(40,)

In [221]:
# Feature-extraction settings for this prototype.
SAMPLE_RATE = 16000  # assumes the clips are sampled at 16 kHz - TODO confirm against the dataset
N_MFCC = 13          # base coefficients; deltas and delta-deltas triple the feature count

outputs = []
for data in split_audio_data:
    # Pass the waveform as `y=`: librosa >= 0.10 rejects it as a positional argument.
    # A dedicated name avoids overwriting the toy `mfcc` batch defined earlier.
    clip_mfcc = feature.mfcc(y=data, sr=SAMPLE_RATE, n_mfcc=N_MFCC)
    # Stack static, first-order and second-order delta features along the feature axis.
    stacked = np.concatenate(
        (clip_mfcc, feature.delta(clip_mfcc), feature.delta(clip_mfcc, order=2)),
        axis=0,
    )
    # Swap to (timeframe, feature) so axis 0 is the frame axis used by the padding helpers.
    outputs.append(np.swapaxes(stacked, 0, 1))

In [222]:
# The clips can differ in frame count, and NumPy >= 1.24 raises on an implicit
# ragged np.array(...). Store one (timeframe, feature) matrix per slot of a
# 1-D object array instead; the later cells only iterate over the items.
feature_matrices = list(outputs)
outputs = np.empty(len(feature_matrices), dtype=object)
for idx, matrix in enumerate(feature_matrices):
    outputs[idx] = matrix

In [225]:
# NOTE: this cell overwrites `outputs`. Re-running it without re-running the
# feature-extraction cells would normalize data that is already normalized
# and padded.
outputs = normalize_mfcc_by_mean_cepstral(outputs)
max_length = find_max_length(outputs)
# Right-pad every clip with zero frames up to the longest clip, using the
# `pad_data` callback from the Padding section. Padding after normalization
# keeps the zeros out of the mean/std statistics.
outputs = np.array([np.pad(item, [(0,max(0, max_length-item.shape[0])),(0,0)], pad_data) for item in outputs])

In [226]:
# Final dense shape after padding: (data, timeframe, feature)
outputs.shape

(40, 23, 39)