In [1]:
import numpy
# import librosa
import scipy.io.wavfile as wavio
import scipy.io as spio
from scipy.fftpack import rfft, rfftfreq
from scipy.stats.mstats import gmean
import glob
import os
import os.path as path
import json
from os import listdir
from os.path import isfile, isdir,join
import decimal
import os.path as path
import librosa
import matplotlib.pyplot as plt
import math
from datetime import date
from datetime import datetime
import time
import tqdm
import sys
from time import sleep
import logging

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
# Define constants
# Format template for debug messages; the three placeholders are filled, in
# order, with the function name, the inspected object and its value.
DEBUG_SINGLE_VARIABLE = "In Fun:{}, Detect_Object:{}, Value={}"

In [3]:
def getFilesInFloder(folderPath):
    """Return the names (not full paths) of the regular files directly inside folderPath.

    Sub-directories are skipped; the order is whatever ``listdir`` yields.
    """
    file_names = []
    for entry in listdir(folderPath):
        if isfile(join(folderPath, entry)):
            file_names.append(entry)
    return file_names

def getDirsInFolder(baseDirPath):
    """Return the names (not full paths) of the sub-directories directly inside baseDirPath.

    Regular files are skipped; the order is whatever ``listdir`` yields.
    """
    dir_names = []
    for entry in listdir(baseDirPath):
        if isdir(join(baseDirPath, entry)):
            dir_names.append(entry)
    return dir_names

def safe_wav_read(wav_file):
    """Read a wav file and return ``(sample_rate, mono_signal)``.

    Multi-channel files are reduced to their first channel (a notice is
    printed). scipy returns multi-channel data with shape
    ``(n_samples, n_channels)``, so the first channel is ``sig[:, 0]``;
    ``sig[0]`` would only be the first sample of every channel.

    Parameters
    ----------
    wav_file : str
        Path of the wav file to read.

    Returns
    -------
    tuple or None
        ``(sample_rate, signal)`` with a 1-D ``signal``, or ``None`` when the
        file cannot be read (an error message is printed in that case).
    """
    try:
        sr, sig = wavio.read(wav_file)
    except Exception:
        # Best effort: report the problem and let the caller decide what to do.
        print("Error occured in read and convert wav to ndarray in file {}".format(wav_file))
        return None
    if sig.ndim > 1:
        sig = sig[:, 0]
        print("\n{} is channel 2".format(wav_file))
    return sr, sig

In [4]:
def round_half_up(number):
    """Round ``number`` to the nearest integer using decimal's ROUND_HALF_UP mode.

    Unlike the built-in ``round`` (banker's rounding), exact .5 ties are not
    rounded to the nearest even integer. Returns a Python ``int``.
    """
    unit = decimal.Decimal('1')
    rounded = decimal.Decimal(number).quantize(unit, rounding=decimal.ROUND_HALF_UP)
    return int(rounded)

In [5]:
"""
frame_len and frame_step are given in samples, not seconds: framesig rounds them to integers and compares them with len(sig).
"""
def _rolling_window(a, window, step=1):
    """Return overlapping length-``window`` views along the last axis of ``a``, one every ``step`` samples."""
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]


def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=False):
    """Split a 1-D signal into overlapping, windowed frames.

    The signal is zero-padded at the end so that the last frame is complete.

    Parameters
    ----------
    sig : array-like
        The 1-D signal to frame.
    frame_len : number
        Frame length in samples (rounded half-up to an integer).
    frame_step : number
        Distance in samples between the starts of consecutive frames
        (rounded half-up to an integer).
    winfunc : callable
        Called with the frame length; must return the analysis window.
        The default is a rectangular window (all ones).
    stride_trick : bool
        Build the frames from a strided view instead of an index matrix.
        Both paths return the same values.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(numframes, frame_len)``.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))

    # Pad with zeros so that the final frame has frame_len samples.
    padlen = int((numframes - 1) * frame_step + frame_len)
    padsignal = numpy.concatenate((sig, numpy.zeros((padlen - slen,))))

    if stride_trick:
        # 1-D window; broadcasting applies it to every frame below.
        win = winfunc(frame_len)
        frames = _rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        # Row r of `indices` holds the sample positions of frame r.
        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = numpy.array(indices, dtype=numpy.int32)
        frames = padsignal[indices]
        win = numpy.tile(winfunc(frame_len), (numframes, 1))

    # The window is applied on both paths (multiplying also copies the strided view).
    return frames * win

In [6]:
def preemphasis(signal, coeff=0.95):
    """Apply a first-order pre-emphasis filter: ``y[0] = x[0]``, ``y[n] = x[n] - coeff * x[n-1]``.

    ``coeff`` of 0 leaves the signal unchanged.
    """
    first_sample = signal[0]
    filtered_tail = signal[1:] - coeff * signal[:-1]
    return numpy.append(first_sample, filtered_tail)

def hz2mel_nature(freq):
    """Convert Hz to mel with the natural-log formula ``1127 * ln(1 + freq / 700)``."""
    ratio = 1. + freq / 700.
    return 1127. * numpy.log(ratio)

def mel2hz_nature(mel):
    """Convert mel to Hz with ``700 * (exp(mel / 1127) - 1)``, the inverse of ``hz2mel_nature``."""
    growth = numpy.exp(mel / 1127.)
    return 700. * (growth - 1.)

def hz2mel(hz):
    """Convert Hz to mel with the base-10 formula ``2595 * log10(1 + hz / 700)``."""
    ratio = 1+hz/700.
    return 2595 * numpy.log10(ratio)

def mel2hz(mel):
    """Convert mel to Hz with ``700 * (10 ** (mel / 2595) - 1)``, the inverse of ``hz2mel``."""
    exponent = mel/2595.0
    return 700*(10**exponent-1)

In [7]:
def print_fb_info(theMidFreqs, theBins):
    """Print the filterbank frequencies and then the FFT bins, one value per line, each list under its heading."""
    sections = (
        ("Middle Frequencies are:", theMidFreqs),
        ("Bins are:", theBins),
    )
    for heading, values in sections:
        print(heading)
        for value in values:
            print(value)

In [8]:
def get_filterbanks(nfilt=10,nfft=1024,samplerate=16000,lowfreq=0,highfreq=8000):
    """Build a triangular filterbank whose edges are evenly spaced on the (natural-log) mel scale.

    Parameters
    ----------
    nfilt : int
        Number of filters (rows of the result).
    nfft : int
        FFT size; the result has ``nfft // 2 + 1`` columns.
    samplerate : number
        Sample rate in Hz used to map frequencies to FFT bins.
    lowfreq, highfreq : number
        Lowest and highest band edge in Hz. A falsy ``highfreq`` means
        ``samplerate / 2``; ``highfreq`` may not exceed ``samplerate / 2``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nfilt, nfft // 2 + 1)``. The edge frequencies and
        bins are also printed via ``print_fb_info``.
    """
    if not highfreq:
        highfreq = samplerate/2
    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"

    # nfilt triangles need nfilt + 2 edge points, evenly spaced in mel.
    mel_low = hz2mel_nature(lowfreq)
    mel_high = hz2mel_nature(highfreq)
    mel_edges = numpy.linspace(mel_low, mel_high, nfilt+2)

    # Edges are converted back to Hz and then to FFT bin numbers.
    mid_freqs = mel2hz_nature(mel_edges)
    bins = numpy.floor((nfft+1)*mid_freqs/samplerate)

    fbank = numpy.zeros([nfilt,nfft//2+1])
    for row in range(nfilt):
        left, peak, right = bins[row], bins[row+1], bins[row+2]
        # rising slope from the left edge up to the peak
        for col in range(int(left), int(peak)):
            fbank[row, col] = (col - left) / (peak - left)
        # falling slope from the peak down to the right edge
        for col in range(int(peak), int(right)):
            fbank[row, col] = (right - col) / (right - peak)
    print_fb_info(mid_freqs,bins)
    return fbank

In [9]:
# def get_filterbanks_from_40(nfilt=40, nfft=1024, bin_list=[], samplerate=16000, lowfreq=70, highfreq=8000):
#     highfreq= highfreq or samplerate/2
#     assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
#     # compute points evenly spaced in mels
#     lowmel = hz2mel_nature(lowfreq)
#     highmel = hz2mel_nature(highfreq)
#     melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
#     # our points are in Hz, but we use fft bins, so we have to convert
#     #  from Hz to fft bin number
#     mid_freqs = mel2hz_nature(melpoints)
#     print("mid_freqs is {}".format(mid_freqs))
# #*********************************************
#     c = 0
#     for freq in mid_freqs:
#         c += 1 
#     target_mid_freqs = numpy.empty(12,dtype=numpy.float)
#     idx = 0
#     for i in (2,3,5,6,8,9,10,12,22,32):
#         target_mid_freqs[idx] = mid_freqs[i]
#         idx += 1
#     nfilt = 10
# #*********************************************
#     bins = numpy.floor((nfft+1)*target_mid_freqs/samplerate)
#     fbank = numpy.zeros([nfilt,nfft//2+1])
#     for j in range(0,nfilt):
#         for i in range(int(bins[j]), int(bins[j+1])):
#             fbank[j,i] = (i - bins[j]) / (bins[j+1]-bins[j])
#         for i in range(int(bins[j+1]), int(bins[j+2])):
#             fbank[j,i] = (bins[j+2]-i) / (bins[j+2]-bins[j+1])
#     return fbank

def get_filterbank_from_midfreqs(midFreqs,samplerate, n_filt, n_fft):
    """Build a triangular filterbank from an explicit list of edge frequencies.

    Parameters
    ----------
    midFreqs : array-like
        Edge frequencies in Hz, in ascending order. Filter ``j`` rises from
        ``midFreqs[j]`` to its peak at ``midFreqs[j + 1]`` and falls back to
        zero at ``midFreqs[j + 2]``, so at least ``n_filt + 2`` values are
        required. Plain lists and tuples are accepted as well as arrays.
    samplerate : number
        Sample rate in Hz used to map frequencies to FFT bins.
    n_filt : int
        Number of filters (rows of the result).
    n_fft : int
        FFT size; the result has ``n_fft // 2 + 1`` columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_filt, n_fft // 2 + 1)``. The frequencies and bins
        are also printed via ``print_fb_info``.
    """
    # A plain list would be repeated (not scaled) by `(n_fft+1) * midFreqs`,
    # so coerce to a float array first.
    midFreqs = numpy.asarray(midFreqs, dtype=float)
    bins = numpy.floor((n_fft+1)*midFreqs/samplerate)
    fbank = numpy.zeros([n_filt,n_fft//2+1])
    for j in range(0,n_filt):
        # rising slope from the left edge up to the peak
        for i in range(int(bins[j]), int(bins[j+1])):
            fbank[j,i] = (i - bins[j]) / (bins[j+1]-bins[j])
        # falling slope from the peak down to the right edge
        for i in range(int(bins[j+1]), int(bins[j+2])):
            fbank[j,i] = (bins[j+2]-i) / (bins[j+2]-bins[j+1])
    print_fb_info(midFreqs,bins)
    return fbank

In [10]:
def draw_FBank(filter_bank):
    """Plot every filter of ``filter_bank`` (one row per filter) as a curve on a new 20x10 figure."""
    plt.figure(figsize=(20,10))
    # Transposed so that each filter (row) becomes one plotted line.
    curves = numpy.transpose(filter_bank)
    plt.plot(curves)

In [11]:
def magspec(frames, NFFT):
    """Return the magnitude spectrum of each frame.

    Parameters
    ----------
    frames : array-like
        Frames along the last axis: a single 1-D frame or a 2-D array with
        one frame per row.
    NFFT : int
        FFT length. Shorter frames are zero-padded; longer frames are
        truncated and a warning is logged.

    Returns
    -------
    numpy.ndarray
        Absolute value of the full ``NFFT``-point FFT of every frame (same
        leading shape as ``frames``, last axis of length ``NFFT``).
    """
    # Last axis rather than axis 1, so a single 1-D frame works too.
    frame_len = numpy.shape(frames)[-1]
    if frame_len > NFFT:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            frame_len, NFFT)
    complex_spec = numpy.fft.fft(frames, NFFT)
    return numpy.absolute(complex_spec)

def powspec(frames, NFFT):
    """Return the element-wise square of ``frames``.

    ``NFFT`` is accepted but not used: the ``1 / NFFT`` scaled variant is
    disabled (kept below for reference).
    """
    #     return 1.0 / NFFT * numpy.square(frames)
    squared = numpy.square(frames)
    return squared


In [12]:
# TrainConfig = [
#     {
#         "midfreqs"=[0.0,376.3,444.5,594.2,676.3,763.5,856.3,955.018,1171.497,1290.134,1416.292,8000.0],
#         "SpeechDir"="",
#         "NoiseDir"="",
        
#     }
# ]

In [16]:
"""
202.0711212749858
250.94952194706576
302.47638125606954
356.79520498481855
414.0572747255863
474.422069209549
538.057708465823
605.1414220470965
675.8600426259053
750.4105263362397
1632.067444896822
3467.0109274574374
"""
# __midfreqs = [0.0, 195.0985245, 251.8401972
# ,312.1788118
# ,376.342384
# ,444.5733837
# ,517.1296516
# ,594.2853728
# ,676.332114
# ,1550.447293
# ,2554.078667
# ,8000.]
# __midfreqs = [0.0,444.5,594.2,676.3,763.5,856.3,955.018,1171.497,1290.134,1416.292,1550.44,8000.0]
# Edge frequencies in Hz for the custom filterbank: 12 edges define the 10
# triangular filters requested below (each filter spans three consecutive edges).
__midfreqs = [0.0,195.0985245,251.8401972,312.1788118,376.342384,444.5733837,517.1296516,1550.447293,2554.078667,3461.030019,4620.759758,8000.0]
__midfreqs_array = numpy.array(__midfreqs)
# Module-level filterbank used by getDataList: 16 kHz sample rate, 10 filters, 1024-point FFT.
aFBank = get_filterbank_from_midfreqs(__midfreqs_array,16000,10,1024)
print(aFBank.shape)
# Alternative filterbank constructors tried earlier:
#get_filterbanks()#get_filterbanks_from_40()#get_filterbanks()
# print(aFBank)
# Framing configuration read by getDataList.
frame_ms = 25              # frame length in milliseconds
overlap_factor = 0.75      # fraction of a segment shared with the next segment
frames_per_segment = 4     # frames concatenated into one feature vector
num_fft = 1024             # FFT size passed to magspec/powspec
sample_len = 513           # number of FFT bins kept (num_fft // 2 + 1, the column count of aFBank)
def _frame_log_energies(frame):
    """Return the natural-log filterbank energies of one frame as a flat vector."""
    frame = frame.reshape(1, len(frame))
    spectrum = powspec(magspec(frame, num_fft), num_fft)
    # Keep only the first sample_len FFT bins so the rows line up with aFBank's columns.
    half_spectrum = spectrum.T[:sample_len]
    energies = numpy.matmul(aFBank, half_spectrum)
    # Replace exact zeros so that the logarithm stays finite.
    energies = numpy.where(energies == 0, numpy.finfo(float).eps, energies)
    return numpy.log(energies).ravel()


def getDataList(wavfile=None,default_label=1):
    """Extract min-max normalised log filterbank features from a wav file.

    The signal is cut into overlapping segments of ``frames_per_segment``
    consecutive frames of ``frame_ms`` milliseconds each. For every frame the
    log filterbank energies are computed with the module-level ``aFBank``;
    the per-frame vectors of a segment are concatenated (frame by frame) and
    min-max normalised into one feature vector.

    Frame boundaries are derived from the file's sample rate, so files that
    are not sampled at 16 kHz are framed correctly as well.

    Parameters
    ----------
    wavfile : str
        Path of the wav file to process.
    default_label : int
        Label attached to every feature vector of this file.

    Returns
    -------
    (list, list)
        The feature vectors (1-D arrays of length
        ``frames_per_segment * number_of_filters``) and a list with one
        label per vector.

    Raises
    ------
    ValueError
        If the wav file could not be read.
    """
    wav = safe_wav_read(wavfile)
    if wav is None:
        raise ValueError("could not read wav file {}".format(wavfile))
    fs, sig = wav

    tmp_data_list = []
    tmp_lbl_list = []

    frame_size = int(fs * frame_ms // 1000)
    segment_size = int(frame_size * frames_per_segment)
    segment_overlap = int(segment_size * overlap_factor)
    segment_step = int(segment_size - segment_overlap)
    # Last usable segment end: largest multiple of segment_step within the signal.
    data_length = int(segment_step*(numpy.floor(len(sig)/segment_step)))+1

    for seg_end in range(segment_size, data_length, segment_step):
        seg_start = seg_end - segment_size
        frame_feats = [
            _frame_log_energies(sig[seg_start + k * frame_size:seg_start + (k + 1) * frame_size])
            for k in range(frames_per_segment)
        ]
        x = numpy.concatenate(frame_feats)

        # Min-max normalise; the small constant guards against a zero range.
        min_ele = numpy.amin(x)
        max_ele = numpy.amax(x)
        x_normalize = (x - min_ele)/(max_ele - min_ele + 0.0001)

        tmp_data_list.append(x_normalize)
        tmp_lbl_list.append(default_label)
    return tmp_data_list, tmp_lbl_list

Middle Frequencies are:
0.0
195.0985245
251.8401972
312.1788118
376.342384
444.5733837
517.1296516
1550.447293
2554.078667
3461.030019
4620.759758
8000.0
Bins are:
0.0
12.0
16.0
19.0
24.0
28.0
33.0
99.0
163.0
221.0
296.0
512.0
(10, 513)


In [15]:
def saveTrainingData_V2(
        speech_dir="../../../../../MyWork/speechData/2020_Acoutic_Resources/second_silence_removal/MIR-1K_Remove_Silence_2/",
        music_dir="../../../../../MyWork/speechData/2020_Acoutic_Resources/iPhone11_music/",
        data_out_path="../../trainingData/training_data_new_bands_20200515_ln.mat",
        label_out_path="../../trainingData/training_label_new_bands_20200515_ln.mat"):
    """Extract features from all music and speech files and save them as .mat files.

    Music files are labelled 0 and speech files get ``getDataList``'s
    default label. The music features come first in the saved arrays,
    followed by the speech features.

    Parameters
    ----------
    speech_dir : str
        Folder containing the speech wav files.
    music_dir : str
        Folder containing the music wav files.
    data_out_path : str
        Destination .mat file for the feature matrix (key ``"x_data"``).
    label_out_path : str
        Destination .mat file for the labels (key ``"y_data"``).
    """
    speech_files = getFilesInFloder(speech_dir)
    music_files = getFilesInFloder(music_dir)
    print(music_files)
    datalist = []
    labellist = []
    # NOTE: a leftover debugging `return` used to abort here after the first
    # music file, so nothing was ever collected or saved.
    for f2 in tqdm.tqdm(music_files):
        tmpdatalist2, tmplabellist2 = getDataList(os.path.join(music_dir, f2), default_label=0)
        datalist.extend(tmpdatalist2)
        labellist.extend(tmplabellist2)

    for f in tqdm.tqdm(speech_files):
        tmpdatalist, tmplabellist = getDataList(os.path.join(speech_dir, f))
        datalist.extend(tmpdatalist)
        labellist.extend(tmplabellist)

    training_mat_dict = {"x_data":datalist}
    training_lbl_mat_dict = {"y_data":labellist}

    spio.savemat(data_out_path,training_mat_dict,oned_as='column')
    spio.savemat(label_out_path,training_lbl_mat_dict,oned_as='row')
    print("training mat files saved. |^_^| ")

In [21]:
saveTrainingData_V2()

  if sys.path[0] == '':
  1%|          | 1/94 [00:00<00:09,  9.92it/s]

['Audio_KzWAV_44.wav', 'Audio_KzWAV_38.wav', 'Audio_KzWAV_75.wav', 'Audio_KzWAV_7.wav', 'Audio_KzWAV_53.wav', 'Audio_KzWAV_48.wav', 'Audio_KzWAV_56.wav', 'Audio_KzWAV_69.wav', 'Audio_KzWAV_29.wav', 'Audio_KzWAV_13.wav', 'Audio_KzWAV_14.wav', 'Audio_KzWAV_85.wav', 'Audio_KzWAV_67.wav', 'Audio_KzWAV_62.wav', 'Audio_KzWAV_70.wav', 'Audio_KzWAV_15.wav', 'Audio_KzWAV_60.wav', 'Audio_KzWAV_61.wav', 'Audio_KzWAV_26.wav', 'Audio_KzWAV_58.wav', 'Audio_KzWAV_71.wav', 'Audio_KzWAV_17.wav', 'Audio_KzWAV_73.wav', 'Audio_KzWAV_4.wav', 'Audio_KzWAV_57.wav', 'Audio_KzWAV_5.wav', 'Audio_KzWAV_89.wav', 'Audio_KzWAV_82.wav', 'Audio_KzWAV_42.wav', 'Audio_KzWAV_10.wav', 'Audio_KzWAV_74.wav', 'Audio_KzWAV_36.wav', 'Audio_KzWAV_77.wav', 'Audio_KzWAV_63.wav', 'Audio_KzWAV_33.wav', 'Audio_KzWAV_91.wav', 'Audio_KzWAV_23.wav', 'Audio_KzWAV_20.wav', 'Audio_KzWAV_35.wav', 'Audio_KzWAV_9.wav', 'Audio_KzWAV_59.wav', 'Audio_KzWAV_52.wav', 'Audio_KzWAV_80.wav', 'Audio_KzWAV_12.wav', 'Audio_KzWAV_55.wav', 'Audio_KzWAV_

100%|██████████| 94/94 [00:06<00:00, 13.92it/s]
100%|██████████| 110/110 [00:27<00:00,  3.94it/s]


training mat files saved. |^_^| 
