In [1]:
import numpy as np
import os
import librosa
import sed_eval
import scipy

In [2]:
def PNCC(y,sr,parameters):
    '''
    Compute PCEN-scaled cepstral coefficients of an audio signal.

    A magnitude (power=1) Mel spectrogram is computed, per-channel energy
    normalization (PCEN) replaces the usual log-amplitude (dB) scaling, and a
    type-2 orthonormal DCT is taken along the Mel axis.

    Parameters
    ----------
    y : audio time series passed to librosa.feature.melspectrogram
    sr : sampling rate of `y`
    parameters : dict providing the keys 'win_size', 'hop_size',
        'num_mel_filters', 'n_dct', 'min_freq' and 'max_freq'

    Returns
    -------
    The first `n_dct` rows of the DCT of the PCEN-scaled Mel spectrogram.
    '''
    # Imported here because a bare `import scipy` does not load the fftpack
    # submodule on every SciPy version.
    from scipy.fftpack import dct

    win_size = parameters['win_size']
    hop_size = parameters['hop_size']
    n_mels = parameters['num_mel_filters']
    n_dct = parameters['n_dct']
    fmin = parameters['min_freq']
    fmax = parameters['max_freq']
    mel_spectrogram = librosa.feature.melspectrogram(y=y,
                                                    sr=sr,
                                                    n_fft=win_size,
                                                    hop_length=hop_size,
                                                    power=1,
                                                    n_mels=n_mels,
                                                    fmin=fmin,
                                                    fmax=fmax)

    # The 2**31 factor follows the librosa PCEN guidance for floating point
    # audio in [-1, 1]. sr/hop_length are forwarded so that the PCEN smoothing
    # uses the same frame rate as the spectrogram instead of librosa defaults.
    S_PNCC = librosa.pcen(mel_spectrogram * (2**31), sr=sr, hop_length=hop_size)

    # The original referenced an undefined name (S_PCNN) here, which raised
    # NameError on every call.
    return dct(S_PNCC, axis=0, type=2, norm='ortho')[:n_dct]
#================================================================================================
def FeatureExtraction(FilePath,Parameters):
    '''
    Load an audio file as mono and compute its MFCC feature matrix.

    Parameters
    ----------
    FilePath : path of the audio file handed to librosa.load
    Parameters : dict providing the keys 'sampling_rate' and 'hop_size'

    Returns
    -------
    (mfccs, audio_data, audio_sr) : the 20-coefficient MFCC matrix, the
        loaded samples and the sampling rate reported by librosa.load.
    '''
    sr = Parameters['sampling_rate']
    hop_size = Parameters['hop_size']

    # `sr` must be given by keyword: recent librosa releases (>= 0.10) reject
    # it as a positional argument.
    audio_data, audio_sr = librosa.load(FilePath, sr=sr, mono=True)

    # Other features that were considered here but are currently disabled:
    # short-time energy, zero-crossing rate, spectral centroid, spectral
    # entropy, and PCEN-scaled cepstra (see PNCC) concatenated with the MFCCs.

    # hop_length is taken from Parameters so the number of feature frames
    # follows the same hop as the label vector built from 'hop_size'.
    # NOTE(review): the FFT size is left at librosa's default rather than
    # Parameters['win_size'], as in the original - confirm this is intended.
    mfccs = librosa.feature.mfcc(y=audio_data,
                                 sr=audio_sr,
                                 n_mfcc=20,
                                 dct_type=2,
                                 norm='ortho',
                                 hop_length=hop_size)
    return mfccs, audio_data, audio_sr
#================================================================================================
def CreateLabelVector(Data,EventList,Parameters,LabelIndex=1):
    '''
    Build a per-frame integer label vector for one audio clip.

    The vector has ceil(len(Data) / hop_size) entries, all zero except for
    the frames covered by an event, which are set to `LabelIndex`. An event
    covers the frames from ceil(onset * sr / hop) up to, but excluding,
    floor(offset * sr / hop), with onset/offset read from each event's
    'onset' and 'offset' entries.

    Parameters : dict providing the keys 'hop_size' and 'sampling_rate'
    '''
    hop = Parameters['hop_size']
    sample_rate = Parameters['sampling_rate']

    n_frames = int(np.ceil(len(Data)/hop))
    frame_labels = np.zeros(n_frames)

    for annotation in EventList:
        # Round inwards so only frames lying fully inside the event are marked.
        first_frame = np.ceil(annotation['onset'] * sample_rate/hop).astype(int)
        stop_frame = np.floor(annotation['offset'] * sample_rate/hop).astype(int)
        frame_labels[slice(first_frame, stop_frame)] = LabelIndex

    return frame_labels.astype(int)

def ComputeStateTransition(LabelVector):
    '''
    Estimate frame-to-frame transition probabilities between the two states
    "event" (non-zero label) and "none" (zero label).

    Every pair of consecutive frames is counted as one transition, grouped by
    the state of its first frame.

    Parameters
    ----------
    LabelVector : 1-D sequence of per-frame labels; any non-zero value is
        treated as an event frame.

    Returns
    -------
    (p_ee, p_nn, p_en, p_ne) as floats:
        p_ee : event -> event, p_en : event -> none
               (fractions of the transitions that start in an event frame)
        p_nn : none -> none,  p_ne : none -> event
               (fractions of the transitions that start in a none frame)
    When no transition starts in a given state (for example the vector holds
    no event at all, or fewer than two frames), both probabilities for that
    state are returned as 0.0 instead of raising.
    '''
    is_event = np.asarray(LabelVector) != 0
    current = is_event[:-1]
    following = is_event[1:]

    ee_count = int(np.count_nonzero(current & following))
    en_count = int(np.count_nonzero(current & ~following))
    ne_count = int(np.count_nonzero(~current & following))
    nn_count = int(np.count_nonzero(~current & ~following))

    from_event = ee_count + en_count  # transitions whose first frame is an event
    from_none = nn_count + ne_count   # transitions whose first frame is none

    def _ratio(count, total):
        # Guard the empty-denominator case that used to crash the function.
        return count / total if total else 0.0

    return (_ratio(ee_count, from_event),
            _ratio(nn_count, from_none),
            _ratio(en_count, from_event),
            _ratio(ne_count, from_none))
#=================================================================================================================
def CreateDataset(Files_Dir,Parameters,Property,EventLabel='speech'):
    '''
    Build a labelled feature dataset from every annotated clip under a folder.

    The folder tree is searched for '.txt' annotation files; for each one the
    '.wav' file with the same base name in the same folder is loaded, its
    features are extracted and a per-frame label vector is created from the
    events whose label equals `EventLabel`.

    Parameters
    ----------
    Files_Dir : root folder that is walked recursively
    Parameters : dict forwarded to FeatureExtraction and CreateLabelVector
    Property : prefix of the exported file names (e.g. a train/test tag)
    EventLabel : annotation label of the class to learn

    Returns
    -------
    (dataset, trans_matrix)
        dataset : object array of shape (n_files, 2); column 0 holds the
            feature matrix, column 1 the label vector of each file.
        trans_matrix : 2x2 array [[SS, 1 - SS], [1 - NN, NN]] where SS / NN
            are the event->event / none->none probabilities averaged over
            the files that contain the event. All NaN if no file does.

    Side effects
    ------------
    Prints every annotation path and the transition matrix, and saves both
    results as '<Property>_TransitionMatrix.npy' and '<Property>_Dataset.npy'
    in '<cwd>/JPNotebookExported' (created when missing). The dataset is an
    object array, so it has to be read back with allow_pickle=True.
    '''
    samples = []
    SS = []  # per-file event -> event probability
    NN = []  # per-file none -> none probability

    for root, _dirs, files in os.walk(Files_Dir):
        # Annotation-driven: each .txt decides which .wav is processed.
        # Sorting makes the dataset order reproducible between runs.
        for file in sorted(files):
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            print(file_path)

            # Load the annotations and keep only the requested event class
            annotated_event = sed_eval.io.load_event_list(file_path)
            target_event = sed_eval.util.event_list.filter_event_list(
                annotated_event,
                scene_label=None,
                event_label=EventLabel,
                filename=None)

            # Load the matching audio for feature extraction
            audio_file_path = os.path.join(root, os.path.splitext(file)[0] + '.wav')
            feature_vector, audio_data, _audio_sr = FeatureExtraction(audio_file_path, Parameters)

            label_vector = CreateLabelVector(Data=audio_data,
                                             EventList=target_event,
                                             Parameters=Parameters,
                                             LabelIndex=1)

            # Only files that actually contain the event contribute to the
            # transition statistics.
            if sum(label_vector) != 0:
                p_ss, p_nn, _p_sn, _p_ns = ComputeStateTransition(label_vector)
                SS.append(p_ss)
                NN.append(p_nn)

            samples.append((feature_vector, label_vector))

    # Feature matrices and label vectors have different shapes, so they are
    # stored element by element in an explicit object array; letting NumPy
    # infer a ragged array is an error on recent NumPy versions.
    dataset = np.empty((len(samples), 2), dtype=object)
    for row, (feature_vector, label_vector) in enumerate(samples):
        dataset[row, 0] = feature_vector
        dataset[row, 1] = label_vector

    mean_ss = np.mean(SS) if SS else float('nan')
    mean_nn = np.mean(NN) if NN else float('nan')
    trans_matrix = np.array([[mean_ss, 1 - mean_ss], [1 - mean_nn, mean_nn]])

    export_dir = os.path.join(os.getcwd(), 'JPNotebookExported')
    os.makedirs(export_dir, exist_ok=True)
    np.save(os.path.join(export_dir, Property + '_TransitionMatrix.npy'), trans_matrix)
    np.save(os.path.join(export_dir, Property + '_Dataset.npy'), dataset)
    print(trans_matrix)

    return dataset, trans_matrix

In [3]:
# Analysis settings shared by the train / validate / test runs below.
Params = dict(
    sampling_rate=22050,
    win_size=1024,
    hop_size=512,
    min_freq=80,
    max_freq=8000,
    num_mel_filters=128,
    n_dct=20,
)

# Training split: features, labels and transition matrix, exported with the
# 'Train_MFCC' prefix.
Train_path = os.getcwd() + '/1_Dataset_Generate/audio/soundbanks/train/generated/'
TrainData, TrainTransMatrix = CreateDataset(
    Files_Dir=Train_path,
    Parameters=Params,
    Property='Train_MFCC',
)

# Validation split, exported with the 'Validate_MFCC' prefix.
Validate_path = os.getcwd() + '/1_Dataset_Generate/audio/soundbanks/Validate/generated/'
ValidateData, ValidateTransMatrix = CreateDataset(
    Files_Dir=Validate_path,
    Parameters=Params,
    Property='Validate_MFCC',
)

# Test split, exported with the 'Test_MFCC' prefix.
Test_path = os.getcwd() + '/1_Dataset_Generate/audio/soundbanks/Test/generated/'
TestData, TestTransMatrix = CreateDataset(
    Files_Dir=Test_path,
    Parameters=Params,
    Property='Test_MFCC',
)

/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/uniform/soundscape_train_uniform76.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/uniform/soundscape_train_uniform62.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/uniform/soundscape_train_uniform89.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/uniform/soundscape_train_uniform88.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/uniform/soundscape_train_uniform63.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/uniform/soundscape_train_uniform77.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train

/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/bimodal/soundscape_train_bimodal25.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/bimodal/soundscape_train_bimodal31.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/bimodal/soundscape_train_bimodal30.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/bimodal/soundscape_train_bimodal24.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/bimodal/soundscape_train_bimodal18.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train/generated/bimodal/soundscape_train_bimodal32.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/train

/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/Validate/generated/bimodal/soundscape_validate_bimodal6.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/Validate/generated/bimodal/soundscape_validate_bimodal5.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/Validate/generated/bimodal/soundscape_validate_bimodal4.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/Validate/generated/bimodal/soundscape_validate_bimodal0.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/Validate/generated/bimodal/soundscape_validate_bimodal1.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_Generate/audio/soundbanks/Validate/generated/bimodal/soundscape_validate_bimodal3.txt
/Users/anderson675/Desktop/RandomForestVAD/jupyternotebook/1_Dataset_G