In [6]:
#Process the speech audio: slice it into 4-second files.

In [7]:
import os
import shutil

import librosa
import numpy as np
import pandas as pd
import soundfile as sf

In [8]:
#Concatenate all audio files in a folder, and slice them into pieces of desired length
def slice_audio(input_dir,output_dir,expected_len,sr = 22050):

    '''
    Concatenate every .mp3 file in a folder and export the result as fixed-length .wav slices.

    Files are read in sorted name order and appended to one running buffer. Whenever the
    buffer holds at least one full slice, slices are written to output_dir as
    "speech<index>-2.wav" (index starts at 0) until less than one slice remains.
    Samples left in the buffer after the last file (less than one slice) are not written.

    parameters:
        input_dir: the path of the folder where you saved all the audio file to process
        output_dir: the path of the folder where you want to export the processed audio files
                    (created if it does not exist yet)
        expected_len: desired length in second of sliced audio
        sr: sampling rate used both to load the input and to write the output
            (default = 22050, which is also the default of librosa.load)

    raises:
        ValueError: if expected_len * sr amounts to less than one sample
    '''
    # Number of samples per exported slice; int() so a float expected_len still gives a valid slice bound.
    slice_len = int(round(expected_len * sr))
    if slice_len <= 0:
        raise ValueError("expected_len * sr must be at least one sample, got {}".format(slice_len))

    os.makedirs(output_dir, exist_ok=True)

    # Running buffer of samples that have been loaded but not exported yet.
    audio_data = np.zeros(0, dtype=np.float32)
    file_idx = 0

    # sorted() makes the concatenation order (and therefore the output) reproducible;
    # os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir(input_dir)):
        if not file.endswith('.mp3'):
            continue

        print("processing {}".format(file))
        file_path = os.path.join(input_dir, file)
        # Load at the requested rate so the samples match the rate the slices are written with.
        y_data, _ = librosa.load(file_path, sr=sr)
        audio_data = np.concatenate((audio_data, np.asarray(y_data, dtype=np.float32)))

        # Emit every complete slice currently in the buffer, not just the first one,
        # and keep the remainder intact (no samples dropped at either end).
        while len(audio_data) >= slice_len:
            out_path = os.path.join(output_dir, "speech" + str(file_idx) + '-2.wav')
            sf.write(out_path, audio_data[:slice_len], sr)
            file_idx += 1
            audio_data = audio_data[slice_len:]

#============================================================================
#Slice the raw speech recordings into 4-second clips.
#Both folder paths are built from the current working directory and end with '/'.
speech_folder = os.getcwd() + '/0_RawAudio_Speech/'   #folder passed to slice_audio as input_dir
output_folder = os.getcwd() + '/Exported/'            #folder passed to slice_audio as output_dir
sr = 22050
#Pass the sr variable instead of repeating the literal, so the rate is defined in one place.
slice_audio(input_dir = speech_folder,
            output_dir = output_folder,
            expected_len = 4,
            sr = sr)

In [None]:
#Replace every children_playing clip (classID 2) of the UrbanSound8K dataset with a speech slice.
#The metadata is rewritten accordingly: each children_playing row becomes a 'speech' row that
#points at the speech file copied into the same fold folder.
metadatafile = os.getcwd() + '/1_Dataset_Generate/metadata/UrbanSound8K.csv'
metadata = pd.read_csv(metadatafile)
#.copy() so the edits below go to an independent frame and not to a slice of `metadata`
children_playing_metadata = metadata[metadata.classID == 2].copy()
speechfolder = os.getcwd() + '/Exported/'
speech_file_idx = 0
for i,r in children_playing_metadata.iterrows():
    #Look the columns up by name; positional indexing (r[5], r[0]) on a labelled row is deprecated in pandas
    audiofolder = os.getcwd() + '/1_Dataset_Generate/audio/fold{:d}/'.format(int(r['fold']))
    audiofile = audiofolder + r['slice_file_name']

    #Make sure the replacement exists BEFORE touching the dataset, so a missing
    #speech slice can never leave a fold with its original clip already deleted
    speech_file_name =  'speech' + str(speech_file_idx) + '-2.wav'
    speechfile = speechfolder + speech_file_name
    if not os.path.isfile(speechfile):
        raise FileNotFoundError('missing speech slice {} needed to replace {}'.format(speechfile, audiofile))

    #put in the speech audio file
    shutil.copy(speechfile, audiofolder)

    #Delete original audio file (skipped if it is already gone, e.g. when this cell is re-run)
    if os.path.exists(audiofile):
        os.remove(audiofile)

    #Change the corresponding metadata info
    children_playing_metadata.loc[i,'slice_file_name'] = speech_file_name
    children_playing_metadata.loc[i,'class'] = 'speech'

    #These fields described the original recording and do not apply to the speech slice
    children_playing_metadata.loc[i,'fsID'] = 0
    children_playing_metadata.loc[i,'start'] = 0
    children_playing_metadata.loc[i,'end'] = 0
    children_playing_metadata.loc[i,'salience'] = 0
    #update parameters
    speech_file_idx += 1

#Untouched rows of every other class, followed by the rewritten (now speech) rows
new_frames = [metadata[metadata.classID != 2],children_playing_metadata]
new_metadata = pd.concat(new_frames, sort=False)
#Write next to the source CSV: the soundbank cell reads
#'/1_Dataset_Generate/metadata/metadata_with_speech.csv', not a file in the working directory
new_metadata.to_csv(os.path.join(os.path.dirname(metadatafile), 'metadata_with_speech.csv'),index=False)

In [195]:
## Create soundbank for TRAIN (folds 1-6)
#Generate Dataset using pre-sorted data

import pandas as pd
import numpy as np
import shutil
import os
from tqdm import tqdm
import glob

# Metadata in which the children_playing rows have been replaced by speech rows.
metadatafile = os.getcwd() + '/1_Dataset_Generate/metadata/metadata_with_speech.csv'
metadata = pd.read_csv(metadatafile)
label_list = sorted(metadata['class'].unique())
print(label_list)

folds = [1, 2, 3, 4, 5, 6]
splitname = 'train'
soundbankfolder = os.getcwd() + '/1_Dataset_Generate/audio/soundbanks/'
soundbankfolder = os.path.join(soundbankfolder, splitname)

# File name -> class label, built once so each file is a dictionary lookup instead of
# a scan over the whole metadata table. drop_duplicates keeps the first row per file name.
label_by_file = (metadata.drop_duplicates('slice_file_name')
                         .set_index('slice_file_name')['class']
                         .to_dict())

# Number of files copied per label across all folds of this split.
label_count = dict.fromkeys(label_list, 0)

for fold in folds:

    print('FOLD {:d}'.format(fold))
    audiofolder = os.getcwd() + '/1_Dataset_Generate/audio/fold{:d}/'.format(fold)
    audiofiles = glob.glob(os.path.join(audiofolder, '*.wav'))

    # Number of files copied per label for this fold only.
    fold_label_count = dict.fromkeys(label_list, 0)

    for af in tqdm(audiofiles):
        filename = os.path.basename(af)
        if filename not in label_by_file:
            # Fail with the offending file name rather than an opaque indexing error.
            raise KeyError('{} has no entry in {}'.format(filename, metadatafile))
        label = label_by_file[filename]
        label_count[label] += 1
        fold_label_count[label] += 1
        destfolder = os.path.join(soundbankfolder, 'foreground', label)
        # makedirs also creates the missing '<split>/foreground' parents; os.mkdir would fail on them.
        os.makedirs(destfolder, exist_ok=True)
        destfile = os.path.join(destfolder, filename)
        shutil.copyfile(af, destfile)

    # Print fold report
    print('   Fold {:d} labels:'.format(fold))
    for label in label_list:
        print('   {:s}:\t{:d}'.format(label, fold_label_count[label]))

# Print overall report
print('\n\nOVERALL labels:')
for label in label_list:
    print('{:s}:\t{:d}'.format(label, label_count[label]))

  2%|▏         | 18/873 [00:00<00:04, 177.57it/s]

['air_conditioner', 'car_horn', 'dog_bark', 'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'speech', 'street_music']
FOLD 1


100%|██████████| 873/873 [00:04<00:00, 182.84it/s]
  2%|▏         | 16/888 [00:00<00:05, 155.97it/s]

   Fold 1 labels:
   air_conditioner:	100
   car_horn:	36
   dog_bark:	100
   drilling:	100
   engine_idling:	96
   gun_shot:	35
   jackhammer:	120
   siren:	86
   speech:	100
   street_music:	100
FOLD 2


100%|██████████| 888/888 [00:04<00:00, 182.19it/s]
  2%|▏         | 22/925 [00:00<00:04, 213.56it/s]

   Fold 2 labels:
   air_conditioner:	100
   car_horn:	42
   dog_bark:	100
   drilling:	100
   engine_idling:	100
   gun_shot:	35
   jackhammer:	120
   siren:	91
   speech:	100
   street_music:	100
FOLD 3


100%|██████████| 925/925 [00:04<00:00, 194.38it/s]
  2%|▏         | 22/990 [00:00<00:04, 214.99it/s]

   Fold 3 labels:
   air_conditioner:	100
   car_horn:	43
   dog_bark:	100
   drilling:	100
   engine_idling:	107
   gun_shot:	36
   jackhammer:	120
   siren:	119
   speech:	100
   street_music:	100
FOLD 4


100%|██████████| 990/990 [00:05<00:00, 190.60it/s]
  2%|▏         | 19/936 [00:00<00:05, 181.87it/s]

   Fold 4 labels:
   air_conditioner:	100
   car_horn:	59
   dog_bark:	100
   drilling:	100
   engine_idling:	107
   gun_shot:	38
   jackhammer:	120
   siren:	166
   speech:	100
   street_music:	100
FOLD 5


100%|██████████| 936/936 [00:04<00:00, 188.25it/s]
  2%|▏         | 19/823 [00:00<00:04, 165.21it/s]

   Fold 5 labels:
   air_conditioner:	100
   car_horn:	98
   dog_bark:	100
   drilling:	100
   engine_idling:	107
   gun_shot:	40
   jackhammer:	120
   siren:	71
   speech:	100
   street_music:	100
FOLD 6


100%|██████████| 823/823 [00:04<00:00, 195.10it/s]

   Fold 6 labels:
   air_conditioner:	100
   car_horn:	28
   dog_bark:	100
   drilling:	100
   engine_idling:	107
   gun_shot:	46
   jackhammer:	68
   siren:	74
   speech:	100
   street_music:	100


OVERALL labels:
air_conditioner:	600
car_horn:	306
dog_bark:	600
drilling:	600
engine_idling:	624
gun_shot:	230
jackhammer:	668
siren:	607
speech:	600
street_music:	600





## Create soundbank for VALIDATE (folds 7-8)

In [200]:
# Copy the clips of folds 7-8 into the 'validate' soundbank, one sub-folder per class label.
# `metadata` and `label_list` come from the cell that loads metadata_with_speech.csv.
folds = [7, 8]
splitname = 'validate'

soundbankfolder = os.getcwd() + '/1_Dataset_Generate/audio/soundbanks/'
soundbankfolder = os.path.join(soundbankfolder, splitname)

# File name -> class label, built once so each file is a dictionary lookup instead of
# a scan over the whole metadata table. drop_duplicates keeps the first row per file name.
label_by_file = (metadata.drop_duplicates('slice_file_name')
                         .set_index('slice_file_name')['class']
                         .to_dict())

# Number of files copied per label across all folds of this split.
label_count = dict.fromkeys(label_list, 0)

for fold in folds:

    print('FOLD {:d}'.format(fold))
    audiofolder = os.getcwd() + '/1_Dataset_Generate/audio/fold{:d}/'.format(fold)
    audiofiles = glob.glob(os.path.join(audiofolder, '*.wav'))

    # Number of files copied per label for this fold only.
    fold_label_count = dict.fromkeys(label_list, 0)

    for af in tqdm(audiofiles):
        filename = os.path.basename(af)
        if filename not in label_by_file:
            # Fail with the offending file name rather than an opaque indexing error.
            raise KeyError('{} has no entry in the metadata'.format(filename))
        label = label_by_file[filename]
        label_count[label] += 1
        fold_label_count[label] += 1
        destfolder = os.path.join(soundbankfolder, 'foreground', label)
        # makedirs also creates the missing '<split>/foreground' parents; os.mkdir would fail on them.
        os.makedirs(destfolder, exist_ok=True)
        destfile = os.path.join(destfolder, filename)
        shutil.copyfile(af, destfile)

    # Print fold report
    print('   Fold {:d} labels:'.format(fold))
    for label in label_list:
        print('   {:s}:\t{:d}'.format(label, fold_label_count[label]))

# Print overall report
print('\n\nOVERALL labels:')
for label in label_list:
    print('{:s}:\t{:d}'.format(label, label_count[label]))

  2%|▏         | 17/838 [00:00<00:04, 168.21it/s]

FOLD 7


100%|██████████| 838/838 [00:04<00:00, 190.85it/s]
  2%|▏         | 18/806 [00:00<00:04, 176.62it/s]

   Fold 7 labels:
   air_conditioner:	100
   car_horn:	28
   dog_bark:	100
   drilling:	100
   engine_idling:	106
   gun_shot:	51
   jackhammer:	76
   siren:	77
   speech:	100
   street_music:	100
FOLD 8


100%|██████████| 806/806 [00:04<00:00, 191.40it/s]

   Fold 8 labels:
   air_conditioner:	100
   car_horn:	30
   dog_bark:	100
   drilling:	100
   engine_idling:	88
   gun_shot:	30
   jackhammer:	78
   siren:	80
   speech:	100
   street_music:	100


OVERALL labels:
air_conditioner:	200
car_horn:	58
dog_bark:	200
drilling:	200
engine_idling:	194
gun_shot:	81
jackhammer:	154
siren:	157
speech:	200
street_music:	200





## Create soundbank for TEST (folds 9-10)

In [201]:
# Copy the clips of folds 9-10 into the 'test' soundbank, one sub-folder per class label.
# `metadata` and `label_list` come from the cell that loads metadata_with_speech.csv.
folds = [9, 10]
splitname = 'test'

soundbankfolder = os.getcwd() + '/1_Dataset_Generate/audio/soundbanks/'
soundbankfolder = os.path.join(soundbankfolder, splitname)

# File name -> class label, built once so each file is a dictionary lookup instead of
# a scan over the whole metadata table. drop_duplicates keeps the first row per file name.
label_by_file = (metadata.drop_duplicates('slice_file_name')
                         .set_index('slice_file_name')['class']
                         .to_dict())

# Number of files copied per label across all folds of this split.
label_count = dict.fromkeys(label_list, 0)

for fold in folds:

    print('FOLD {:d}'.format(fold))
    audiofolder = os.getcwd() + '/1_Dataset_Generate/audio/fold{:d}/'.format(fold)
    audiofiles = glob.glob(os.path.join(audiofolder, '*.wav'))

    # Number of files copied per label for this fold only.
    fold_label_count = dict.fromkeys(label_list, 0)

    for af in tqdm(audiofiles):
        filename = os.path.basename(af)
        if filename not in label_by_file:
            # Fail with the offending file name rather than an opaque indexing error.
            raise KeyError('{} has no entry in the metadata'.format(filename))
        label = label_by_file[filename]
        label_count[label] += 1
        fold_label_count[label] += 1
        destfolder = os.path.join(soundbankfolder, 'foreground', label)
        # makedirs also creates the missing '<split>/foreground' parents; os.mkdir would fail on them.
        os.makedirs(destfolder, exist_ok=True)
        destfile = os.path.join(destfolder, filename)
        shutil.copyfile(af, destfile)

    # Print fold report
    print('   Fold {:d} labels:'.format(fold))
    for label in label_list:
        print('   {:s}:\t{:d}'.format(label, fold_label_count[label]))

# Print overall report
print('\n\nOVERALL labels:')
for label in label_list:
    print('{:s}:\t{:d}'.format(label, label_count[label]))

  2%|▏         | 17/816 [00:00<00:04, 164.56it/s]

FOLD 9


100%|██████████| 816/816 [00:04<00:00, 178.61it/s]
  3%|▎         | 22/837 [00:00<00:03, 210.41it/s]

   Fold 9 labels:
   air_conditioner:	100
   car_horn:	32
   dog_bark:	100
   drilling:	100
   engine_idling:	89
   gun_shot:	31
   jackhammer:	82
   siren:	82
   speech:	100
   street_music:	100
FOLD 10


100%|██████████| 837/837 [00:04<00:00, 193.96it/s]

   Fold 10 labels:
   air_conditioner:	100
   car_horn:	33
   dog_bark:	100
   drilling:	100
   engine_idling:	93
   gun_shot:	32
   jackhammer:	96
   siren:	83
   speech:	100
   street_music:	100


OVERALL labels:
air_conditioner:	200
car_horn:	65
dog_bark:	200
drilling:	200
engine_idling:	182
gun_shot:	63
jackhammer:	178
siren:	165
speech:	200
street_music:	200





## Copy noise excerpt for all files

In [203]:
# Copy the same noise excerpt into the 'background/noise' folder of every split's soundbank.
noisefile = os.getcwd() + '/1_Dataset_Generate/audio/noise/brownian.wav'
soundbankfolder = os.getcwd() + '/1_Dataset_Generate/audio/soundbanks/'

for split in ['train', 'validate', 'test']:

    destfolder = os.path.join(soundbankfolder, split, 'background', 'noise')
    # makedirs also creates the missing '<split>/background' parent; os.mkdir would fail on it.
    os.makedirs(destfolder, exist_ok=True)

    destfile = os.path.join(destfolder, os.path.basename(noisefile))
    shutil.copyfile(noisefile, destfile)
