In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = [15,10]
import librosa
import librosa.display
import IPython.display as ipd
from pydub import AudioSegment
from pydub.utils import mediainfo
import glob
import scipy.io as spio
import scipy.io.wavfile as sciwav
import h5py
import math
from scipy.fftpack import dct
import python_speech_features
import os
import sys
scriptpath = "../"
# Add the directory containing your module to the Python path (wants absolute paths)
sys.path.append(os.path.abspath(scriptpath))
from Libs.lcj_io import getDirsInFolder,getFilesInFloder
import glob
from tqdm import tnrange, tqdm_notebook
from tqdm import tqdm
from time import sleep
# from sklearn.utils import shuffle
from Libs.utils import get_recursive_files

In [2]:
# !pip install surfboard

In [3]:
keywrod_train_root_dir = "../Speech_DataSets/whole_keyword_clean_second_run_1429/"
timit_train_root_dir = "../../Speech_DataSets/TIMIT/TRAIN/"
timit_train_dr1_root_dir = "../../Speech_DataSets/TIMIT/TRAIN/DR1/"
timit_train_dr1_sub_1_dir = "../../Speech_DataSets/TIMIT/TRAIN/DR1/MWAR0/"

In [4]:
KEYWORD_CLS = 1
FILLER_CLS = 2

In [5]:
def hz2mel_nature(freq):
    return 1127. * np.log(1. + freq / 700.)

def mel2hz_nature(mel):
    return 700. * (np.exp(mel / 1127.) - 1.)

def hz2mel(hz):
    return 2595 * np.log10(1+hz/700.)

def mel2hz(mel):
    return 700*(10**(mel/2595.0)-1)

def round_half_up(number):
    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))

In [6]:
def get_default_filterbanks(nfilt=10,nfft=1024,samplerate=16000,lowfreq=0,highfreq=8000):
    highfreq= highfreq or samplerate/2
    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
    # compute points evenly spaced in mels
    lowmel = hz2mel_nature(lowfreq)
    highmel = hz2mel_nature(highfreq)
    melpoints = np.linspace(lowmel,highmel,nfilt+2)
    # our points are in Hz, but we use fft bins, so we have to convert
    #  from Hz to fft bin number
    mid_freqs = mel2hz_nature(melpoints)
    
    bins = np.floor((nfft+1)*mid_freqs/samplerate)
    fbank = np.zeros([nfilt,nfft//2+1])
    for j in range(0,nfilt):
        for i in range(int(bins[j]), int(bins[j+1])):
            fbank[j,i] = (i - bins[j]) / (bins[j+1]-bins[j])
        for i in range(int(bins[j+1]), int(bins[j+2])):
            fbank[j,i] = (bins[j+2]-i) / (bins[j+2]-bins[j+1])
    return fbank

def get_filterbank_from_midfreqs(midFreqs,samplerate, n_filt, n_fft):
#     mid_freqs = midFreqs#[229.8,304.1,402.4,532.4,704.4,931.9,1233.1,1631.5,4000.,5500.]
    target_mid_freqs = np.empty(n_filt+2,dtype=np.float)
    idx = 0
    for freq in midFreqs:
        target_mid_freqs[idx] = freq
        idx += 1
#     print(target_mid_freqs)
    bins = np.floor((n_fft+1)*target_mid_freqs/samplerate)
#     print(len(bins))
    fbank = np.zeros([n_filt,n_fft//2+1])
    for j in range(0,n_filt):
        for i in range(int(bins[j]), int(bins[j+1])):
            fbank[j,i] = (i - bins[j]) / (bins[j+1]-bins[j])
        for i in range(int(bins[j+1]), int(bins[j+2])):
            fbank[j,i] = (bins[j+2]-i) / (bins[j+2]-bins[j+1])
    return fbank

def logfbank(signal,samplerate=16000,nfilt=10,nfft=1024,lowfreq=0,highfreq=8000):
    feat,energy = fbank(signal,samplerate,lowfreq,highfreq)
    return np.log(feat)

In [7]:
def magspec(frames, NFFT):
    if np.shape(frames)[1] > NFFT:
        logging.warn(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            np.shape(frames)[1], NFFT)
    complex_spec = np.fft.fft(frames, NFFT)
#     print(complex_spec.shape)
    return np.absolute(complex_spec)

def powspec(frames, NFFT):
    theFrames = magspec(frames,NFFT)
    energy = np.sum(theFrames,1)
    return np.square(theFrames), energy
#     return 1.0 / NFFT * numpy.square(theFrames)

def logpowspec(frames, NFFT=1024, norm=0):
    ps = powspec(frames, NFFT);
    ps[ps <= 1e-30] = 1e-30
    lps = np.log(ps)
    if norm:
        return lps - np.max(lps)
    else:
        return lps

In [8]:
def lifter(cepstra, L=22):
    """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
    magnitude of the high frequency DCT coeffs.
    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
    """
    if L > 0:
        nframes,ncoeff = np.shape(cepstra)
        n = np.arange(ncoeff)
        lift = 1 + (L/2.)*np.sin(np.pi*n/L)
        return lift*cepstra
    else:
        # values of L <= 0, do nothing
        return cepstra

def delta(feat, N):
    """Compute delta features from a feature vector sequence.
    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
    :param N: For each frame, calculate delta features based on preceding and following N frames
    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row del sig
    del samp_freq
    del sig_len
    del frame_len
    del over_lap
    del step_len
    del framed_sig
    del frame_lps, _energy
    del half_frame_lps
    del wav_feat
    del norm_wav_mfcc
    print(flatten_mfcc.shape)
    return flatten_mfccholds 1 delta feature vector.
    """
    if N < 1:
        raise ValueError('N must be an integer >= 1')
    NUMFRAMES = len(feat)
    denominator = 2 * sum([i**2 for i in range(1, N+1)])
    delta_feat = np.empty_like(feat)
    padded = np.pad(feat, ((N, N), (0, 0)), mode='edge')   # padded version of feat
    for t in range(NUMFRAMES):
        # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
        delta_feat[t] = np.dot(np.arange(-N, N+1), padded[t : t+2*N+1]) / denominator  
    return delta_feat

In [9]:
def norm_signal(frames):
    return frames/np.max(frames)

def FormatWavSig_MS(norm_sig, sr):
    strt_samp = 0
    end_samp = len(norm_sig)
    end_ms = len(norm_sig)/sr
    xrange = np.linspace(0, end_ms, end_samp-strt_samp)
    return strt_samp, end_samp, end_ms, xrange

In [10]:
def calculate_padlen(numframes, frame_step, frame_len):
    return int((numframes - 1) * frame_step + frame_len)

def calculate_frame_rate(sig_len, step_len):
    return int( sig_len // step_len)

def calculate_total_frames_of_signal(slen, frame_len, frame_step):
    if slen <= frame_len:
        numframes = 1
        return numframes
    else:
        numframes = 1 + int(math.floor((1.0 * slen - frame_len) / frame_step))
        return numframes

In [11]:
def framesig(sig, frame_len, frame_step):
    slen = len(sig)
#     frame_len = int(round_half_up(frame_len))
#     frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))

    padlen = int((numframes - 1) * frame_step + frame_len)
    zeros = np.zeros((padlen - slen,))
    padsignal = np.concatenate((sig, zeros))
    indices = np.tile(np.arange(0, frame_len), (numframes, 1)) + np.tile(
            np.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
    indices = np.array(indices, dtype=np.int32)
    frames = padsignal[indices]
    return  frames

In [12]:
def get_mfsc(frames, nfft=1024):
    ret_lps, theEnergy = powspec(frames,nfft)
    return ret_lps, theEnergy

In [13]:
"""
One may apply sinusoidal liftering1 to the MFCCs to de-emphasize higher MFCCs 
which has been claimed to improve speech recognition in noisy signals.
"""
def get_mfcc(spectrum,dct_type=2,num_ceps=13):
    ret_mfcc = dct(spectrum, type=dct_type, axis=1, norm="ortho")[:, 1 : (num_ceps + 1)]
    return ret_mfcc

def get_min_max_norm_mfcc(spectrum,dct_type=2,num_ceps=13):
    _mfcc = dct(spectrum, type=dct_type, axis=1, norm=None)[:, 1 : (num_ceps + 1)]
    min_ele = np.amin(_mfcc)
    max_ele = np.amax(_mfcc)
    norm_mfcc = (_mfcc-min_ele)/(max_ele-min_ele)
    return norm_mfcc

def get_max_norm_mfcc(spectrum,dct_type=2,num_ceps=13):
    _mfcc = dct(spectrum, type=dct_type, axis=1, norm=None)[:, 1 : (num_ceps + 1)]
    max_ele = np.amax(_mfcc)
    norm_mfcc = _mfcc/max_ele
    return norm_mfcc

def get_lifted_mfcc(mfccs,num_lift=22):
    (nframes, ncoeff) = ret_mfcc.shape
    n = np.arange(ncoeff)
    lift = 1 + (num_lift / 2) * np.sin(np.pi * n / num_lift)
    lifted_mfcc = ret_mfcc * lift
    return lifted_mfcc

def get_mean_mfcc(mfccs):
    mean_mfccs = mfccs - (np.mean(mfccs, axis=0) + 1e-8)
    return mean_mfccs

In [14]:
def get_libosa_mfcc(frames, n_points, dct):
    ret_mfcc = librosa.feature.mfcc(y=frames, sr=16000, dct_type=dct, n_mfcc=n_points)
    return ret_mfcc

In [15]:
def framingRawAudio(wavsig, seg_len, seg_step_len):
    """This function is used to frame the audio into 2D array, 
    and each row is one second long under 16K sample rate
    """
    audio_frame_num = len(wavsig)
    ret_array = framesig(wavsig, seg_len, seg_step_len)
    return ret_array

In [16]:
def framing_Frames(frames, frame_length, hop_length):
    overlap_len = frame_length - hop_length
#     numFrames = calculate_total_frames_of_signal(framesLen, frame_length, hop_length)
#     numCounter = 0
    ff = framed_sig(frames, frame_length, hop_length)
    return ff
#     for audio_row in ff:
        #perform mfcc calculation and add to list

In [17]:
def calculate_nfft(samplerate, winlen):
    """Calculates the FFT size as a power of two greater than or equal to
    the number of samples in a single window length.
    
    Having an FFT less than the window length loses precision by dropping
    many of the samples; a longer FFT than the window allows zero-padding
    of the FFT buffer which is neutral in terms of frequency domain conversion.
    :param samplerate: The sample rate of the signal we are working with, in Hz.
    :param winlen: The length of the analysis window in seconds.
    """
    window_length_samples = winlen * samplerate
    nfft = 1
    while nfft < window_length_samples:
        nfft *= 2
    return nfft

In [18]:
calculate_nfft(16000,0.025)

512

In [19]:
def save_train_data(save_path=None,index_array=None,mfcc_matrix=None,label_matrix=None,fn_matrix=None):
    with h5py.File(save_path,"w") as f:
        f.create_group('data1')
        f.create_group('data2')
        f.create_group('data3')
        f.create_group('data4')
        
        f.create_dataset("x_data",data=mfcc_matrix)
        f.create_dataset("y_data",data=label_matrix)
        f.create_dataset("idx_data", data= index_array)
        f.create_dataset("file_list", data=fn_matrix)
        f.flush()
        return True
    return False

def save_train_data_in_npy(index_matrix=None,mfcc_matrix=None,label_matrix=None,fn_matrix=None):
    np.save("../train_data/train_data.npy",mfcc_matrix)
    np.save("../train_data/train_lable.npy",label_matrix)
    np.save("../train_data/train_index.npy",index_matrix)
    np.save("../train_data/train_fn.npy",fn_matrix)
    return True

In [20]:
def get_wav_acoustic_features(wav_sig=None,sample_rate=None,fb=None,mfcc_type=1,sample_len=513,label=KEYWORD_CLS):
#     samp_freq, sig = sciwav.read(wav_file)
    sig = wav_sig
    samp_freq = sample_rate
    sig_len = len(sig)
    frame_len = 400#int(samp_freq * 0.025) #25ms, 400
    over_lap = 0#int(samp_freq * 0.0) #10ms, 160
    step_len = 400#int(samp_freq * 0.025) #15ms, 240
    framed_sig = framesig(sig,frame_len,step_len)
    #the following is the logic of streaming logic
    
    frame_lps,_energy = get_mfsc(framed_sig) # get one second power-spectrum
    half_frame_lps = np.split(frame_lps.T,[0,sample_len],axis=0)[1]
    wav_feat = np.dot(half_frame_lps.T,fb.T)
    wav_feat = np.log(wav_feat+np.finfo('float').eps)
    norm_wav_mfcc = get_max_norm_mfcc(wav_feat)
    flatten_mfcc = norm_wav_mfcc.flatten()#get_mfcc(wav_feat).flatten()
    del sig
    del samp_freq
    del sig_len
    del frame_len
    del over_lap
    del step_len
    del framed_sig
    del frame_lps, _energy
    del half_frame_lps
    del wav_feat
    del norm_wav_mfcc
    return flatten_mfcc

In [None]:
def test_streaming(wavfile=None):
    pass

In [21]:
# test_speech_path = '../../Speech_DataSets/TIMIT/TRAIN/DR1/MWAR0/SX55.wav'

In [None]:
def Create_Train_Data(train_data_save_path,num_filt=40):
    #     process_timit_config_list = []
    fb_40 = get_default_filterbanks(nfilt=num_filt)
    speech_files = getFilesInFloder(keywrod_train_root_dir)
    data_list = []
    label_list = []
    fnlist = []
    for speech_f in tqdm(speech_files):
        if speech_f.endswith("wav"):
#             fn = f.split(".")[0]
            full_path = keywrod_train_root_dir+speech_f
            w_freq, w_sig = sciwav.read(full_path)
            temp_mfcc = get_wav_acoustic_features(wav_sig=w_sig, sample_rate=w_freq,fb=fb_40)
            data_list.append(temp_mfcc)
            label_list.append([1,0])
            fnlist.append(full_path.encode())
            sleep(0.01)
            
    first_lvl_dirs = getDirsInFolder(timit_train_root_dir)
    counter = 0
    for lvl1d in tqdm(first_lvl_dirs):
        second_lvl_dirs = getDirsInFolder(timit_train_root_dir+lvl1d)
        for d in second_lvl_dirs:
            current_folder = os.path.join(timit_train_root_dir,lvl1d,d)+"/"
            files_list = getFilesInFloder(current_folder)
            for f in files_list:
                if f.endswith("wav"):
                    counter += 1
                    full_path = current_folder+f
                    w_freq, w_sig = sciwav.read(full_path)
                    temp_mfcc = get_wav_acoustic_features(wav_sig=w_sig[0:16000], sample_rate=w_freq,fb=fb_40)
                    data_list.append(temp_mfcc)
                    label_list.append([0,1])
                    fnlist.append(full_path.encode())
                    sleep(0.01)
    
    data_array = np.array(data_list)
    lbl_array = np.array(label_list).T
    index_array = np.arange(len(data_list))
    
    file_array = np.array(fnlist)
    print("total noise is: ", counter)
    print(file_array.T.shape)
    shuffled_idx_ary = shuffle(index_array)
#     print(shuffled_idx_ary)
    print("data shape:",data_array.shape," lable shape:",lbl_array.T.shape)
    print(data_list[0])
#     training_mat_dict = {"x_data":data_list}
#     training_lbl_mat_dict = {"y_data":label_list}
#     spio.savemat("../train_data/train_data_mat.mat",training_mat_dict,oned_as='column')
#     spio.savemat("../train_data/train_lable_mat.mat",training_lbl_mat_dict,oned_as='row')
#     save_train_data(train_data_save_path,shuffled_idx_ary,data_array,lbl_array.T,file_array.T)
    save_train_data_in_npy(shuffled_idx_ary,data_array,lbl_array.T,file_array.T)   

In [22]:
# train_data_save_path = "../train_data/test_train_data.hdf5"
# # cent_bands = [0.0,195.0985245,251.8401972,312.1788118,376.342384,444.5733837,517.1296516,1550.447293,2554.078667,3461.030019,4620.759758,8000.0]
# # _fb = get_filterbank_from_midfreqs(cent_bands,16000,40,1024)
# _fb_40 = get_default_filterbanks(nfilt=40)
# # _fb_26 = get_default_filterbanks(nfilt=26)
# _freq, _sig = sciwav.read(test_speech_path)
# loop_count = (len(_sig)//16000)
# one_second_points = 16000
# train_data_list = []
# for i in range(loop_count):
#     tmp_sig = _sig[i*one_second_points:(i+1)*one_second_points]
#     tmp_mfcc = get_wav_acoustic_features(wav_sig=tmp_sig, sample_rate=_freq,fb=_fb_40)
#     train_data_list.append(tmp_mfcc)
#     del tmp_sig
#     del tmp_mfcc
    
# data_array = np.array([train_data_list])
# save_train_data(train_data_save_path,data_array[0])
# print(data_array[0].shape)

In [23]:
# def test_read_h5(file_path):
#     data_set = None
#     with h5py.File(file_path,"r") as r:
#         data_set = np.array(r["x_data"])
#     print(data_set.shape)

In [24]:
# test_h5_file = "../train_data/test_train_data.hdf5"
# test_read_h5(test_h5_file)

In [177]:
def GenData_Main(train_data_save_path,num_filt=40):
#     process_timit_config_list = []
    fb_40 = get_default_filterbanks(nfilt=num_filt)
    speech_files = getFilesInFloder(keywrod_train_root_dir)
    data_list = []
    label_list = []
    fnlist = []
    for speech_f in tqdm(speech_files):
        if speech_f.endswith("wav"):
#             fn = f.split(".")[0]
            full_path = keywrod_train_root_dir+speech_f
            w_freq, w_sig = sciwav.read(full_path)
            temp_mfcc = get_wav_acoustic_features(wav_sig=w_sig, sample_rate=w_freq,fb=fb_40)
            data_list.append(temp_mfcc)
            label_list.append([1,0])
            fnlist.append(full_path.encode())
            sleep(0.01)
            
    first_lvl_dirs = getDirsInFolder(timit_train_root_dir)
    counter = 0
    for lvl1d in tqdm(first_lvl_dirs):
        second_lvl_dirs = getDirsInFolder(timit_train_root_dir+lvl1d)
        for d in second_lvl_dirs:
            current_folder = os.path.join(timit_train_root_dir,lvl1d,d)+"/"
            files_list = getFilesInFloder(current_folder)
            for f in files_list:
                if f.endswith("wav"):
                    counter += 1
                    full_path = current_folder+f
                    w_freq, w_sig = sciwav.read(full_path)
                    temp_mfcc = get_wav_acoustic_features(wav_sig=w_sig[0:16000], sample_rate=w_freq,fb=fb_40)
                    data_list.append(temp_mfcc)
                    label_list.append([0,1])
                    fnlist.append(full_path.encode())
                    sleep(0.01)
    
    data_array = np.array(data_list)
    lbl_array = np.array(label_list).T
    index_array = np.arange(len(data_list))
    
    file_array = np.array(fnlist)
    print("total noise is: ", counter)
    print(file_array.T.shape)
    shuffled_idx_ary = shuffle(index_array)
#     print(shuffled_idx_ary)
    print("data shape:",data_array.shape," lable shape:",lbl_array.T.shape)
    print(data_list[0])
#     training_mat_dict = {"x_data":data_list}
#     training_lbl_mat_dict = {"y_data":label_list}
#     spio.savemat("../train_data/train_data_mat.mat",training_mat_dict,oned_as='column')
#     spio.savemat("../train_data/train_lable_mat.mat",training_lbl_mat_dict,oned_as='row')
#     save_train_data(train_data_save_path,shuffled_idx_ary,data_array,lbl_array.T,file_array.T)
    save_train_data_in_npy(shuffled_idx_ary,data_array,lbl_array.T,file_array.T)   



In [178]:
train_data_file = "../train_data/train_data.hdf5"
GenData_Main(train_data_file)

100%|██████████| 1504/1504 [00:16<00:00, 89.77it/s]
100%|██████████| 8/8 [00:51<00:00,  6.41s/it]

total noise is:  4620
(6124,)
data shape: (6124,)  lable shape: (6124, 2)
[ 7.32890666e-01  1.84772549e-01  2.11117555e-01 -4.69129782e-01
 -1.83329295e-01 -5.76900931e-02 -2.85447163e-01 -9.87113826e-02
  5.91450781e-02 -1.15874809e-01  1.17622075e-01  6.76127722e-02
 -5.12801990e-03  6.47753229e-01  1.46728434e-01  1.76967724e-01
 -2.50384840e-01 -2.03132516e-01 -4.86591900e-02 -2.66642415e-01
 -1.28154764e-01  1.95242102e-02 -7.38654964e-02  5.74522164e-02
  7.15151044e-02 -1.26471324e-02  5.79844900e-01 -3.26022553e-02
  9.71338529e-02 -2.30312938e-01 -2.62235148e-01 -6.05024611e-02
 -2.30293273e-01 -8.81531369e-02  3.87289425e-02 -4.47206290e-02
  6.20161032e-02  8.40207453e-02  1.29488406e-02  6.44357554e-01
 -2.62953922e-01 -1.93010359e-02 -3.30094034e-01 -3.22518781e-01
 -4.40641364e-02 -1.76159150e-01 -9.71743978e-02 -4.76526280e-02
 -5.76124193e-02  7.02533517e-02 -4.76765407e-02 -8.63574670e-02
  6.58477445e-01 -5.35569974e-01 -2.18176595e-01 -3.33423695e-01
 -7.75201672e-02




In [21]:
# def get_wav_acoustic_features(wav_file):
#     samp_freq, sig = sciwav.read(wav_file)
#     sig_len = len(sig)
#     frame_len = int(samp_freq * 0.025) #25ms, 400
#     over_lap = int(samp_freq * 0.0) #10ms, 160
#     step_len = int(samp_freq * 0.025) #15ms, 240
#     print(frame_len, over_lap, step_len)
#     frame_rate = calculate_frame_rate(samp_freq, step_len)


    # sig = 1. * sig[1:16000]
    # data = librosa.feature.melspectrogram(y=sig, sr=16000, n_mels=40, fmax=8000)
    # data = librosa.power_to_db(data)
    # mfccs = librosa.feature.mfcc(S=data,n_mfcc=39)
    # plt.figure(figsize=(20, 4))
    # librosa.display.specshow(data, x_axis='time')
    # plt.colorbar()
    # plt.title('MFCC')
    # plt.tight_layout()
    # plt.show()


    # framing process
    # total_frames = calculate_total_frames_of_signal(sig_len, frame_len, step_len)
    # pad_len = calculate_padlen(total_frames, step_len, frame_len)
    # print(sig_len, pad_len)
    # framed_sig_1_sec = framesig(sig[0:16000],frame_len,step_len)
    # print(framed_sig_1_sec[1,0:30])
#     framed_sig = framesig(sig[0:16000],frame_len,step_len)
#     print(framed_sig.shape)
    # print(framed_sig[1,0:30])
    # print(sig_row_num,16000/400)
#     mfcc_list = []
    # db_list = []
#     stop_flag = 0
#     for row in framed_sig:
    #     if stop_flag > 2:
    #         break
#         data = librosa.feature.melspectrogram(y=row, sr=16000, n_mels=40, fmax=8000)
#         data = librosa.power_to_db(data)
        #, n_mfcc=10, dct_type=1
#         mfccs = librosa.feature.mfcc(S=data, n_mfcc=20, dct_type=1)
    #     if stop_flag < 2:
    #         print("mfccs shape is",mfccs)
    #         plt.figure(figsize=(10, 4))
    #         librosa.display.specshow(mfccs, x_axis='time')
    #         plt.colorbar()
    #         plt.title('MFCC')
    #         plt.tight_layout()
    #         plt.show()
#         stop_flag += 1
    #     db_list.append()
    #     tmp_mfcc1 = get_mfcc(frames=row, n_points = , dct = 1)
#         mfcc_list.append(mfccs[:,0])
#         mfcc_list.append(mfccs[:,1])

#     mfcc_ary = np.array(mfcc_list)
#     reshape_colnum = mfcc_ary.shape[0] * mfcc_ary.shape[1]
#     print(mfcc_ary.shape[0], mfcc_ary.shape[1])
#     mfcc_ary = mfcc_ary.reshape(1,reshape_colnum)
#     print(mfcc_ary)

    # print(mfcc_ary)

    #, n_mfcc=10, dct_type=1

In [None]:
# norm_sig = norm_signal(sig)
# _strt_samp, _end_samp, _end_ms, _xrange = FormatWavSig_MS(norm_sig,samp_freq)
# X = librosa.stft(norm_sig[_strt_samp:_end_samp], win_length=frame_len)

In [10]:
# Path to arctic_a0005: 'will we ever forget it'
# speech_path = '../../Speech_DataSets/TIMIT/TRAIN/DR1/MWAR0/SX55.wav'
# sound = AudioSegment.from_wav(speech_path)  # Read audio file
# sound_samples = sound.get_array_of_samples()  # Extract signal samples
# samp_freq = sound.frame_rate  # Sampling frequency

# # Normilze to max amplitude of 1
# speech_samples_norm = np.array(sound_samples)/np.max(np.array(sound_samples))

# strt_samp = 0
# end_samp = len(speech_samples_norm)
# end_ms = len(speech_samples_norm)/samp_freq

# xrange = np.linspace(0, end_ms, end_samp-strt_samp)

# # Plot speech and the corresponding spectrogram
# fg1 = plt.figure(figsize=(18, 8))
# plt.plot(xrange, speech_samples_norm)
# plt.xlabel('Time in seconds')
# plt.ylabel('Amplitude')
# plt.axis('tight')

# # fg1.savefig('speech.jpg')

# winlen = int(samp_freq*.025)  # Window size of 30 ms
# X = librosa.stft(
#     np.array(speech_samples_norm[strt_samp:end_samp]), win_length=winlen)
# Xdb = librosa.amplitude_to_db(abs(X))
# fg2 = plt.figure(figsize=(18, 8))
# librosa.display.specshow(Xdb, sr=samp_freq, x_axis='time',
#                          y_axis='hz', hop_length=winlen/4)

# fg2.savefig('specgram.jpg')

In [56]:
np.finfo('float').eps

2.220446049250313e-16