In [1]:
import sys
import os
import subprocess
import glob
import numpy as np
import wavio
import time;
import random;
from common import opts;
import common.utils as U;
import re

In [2]:
import argparse
from datetime import datetime

In [3]:
src_positive_wav_dir = "./datasets/forOneClassModel_alarm/alarm_positive_21/"
src_negative_wav_dir = "./datasets/forOneClassModel_alarm/alarm_negtive_99/"
test_positive_wav_dir = "./datasets/forOneClassModel_alarm/test_data_p_52/"
test_negative_wav_dir = "./datasets/forOneClassModel_alarm/test_data_n_99/"
train_save_name = "./datasets/forOneClassModel_alarm/train_test_npz/"
test_save_name = "./datasets/processing_tmp_audios/test/"

In [4]:
def getOpts():
    parser = argparse.ArgumentParser(description='Transfer Learning for ACDNet');
    parser.add_argument('--netType', default='ACDNet_TL_Model_Extend',  required=False);
    parser.add_argument('--data', default='../datasets/processed/',  required=False);
    parser.add_argument('--dataset', required=False, default='uec_iot', choices=['10']);
    parser.add_argument('--BC', default=True, action='store_true', help='BC learning');
    parser.add_argument('--strongAugment', default=True,  action='store_true', help='Add scale and gain augmentation');
    #在ipynb中，不能使用parser.parse，要改用parser.parse_known_args()
    opt, unknown = parser.parse_known_args()
    #Leqarning settings
    opt.batchSize = 88;
    opt.weightDecay = 5e-4;
    opt.momentum = 0.09;
    opt.nEpochs = 10;#2000;
    opt.LR = 0.1;
    opt.schedule = [0.3, 0.6, 0.9];
    opt.warmup = 10;
    opt.device = 'cpu';#torch.device("cuda:0" if torch.cuda.is_available() else "cpu");
    #Basic Net Settings
    opt.nClasses = 12#50;
    opt.nFolds = 5;
    opt.splits = [i for i in range(1, opt.nFolds + 1)];
    opt.sr = 20000;
    opt.inputLength = 30225;
    #Test data
    opt.nCrops = 4;
    return opt

In [5]:
def convert_sr(src_path, dst_path, sr):
    # print('* {} -> {}'.format(src_path, dst_path))
    if not os.path.exists(dst_path):
        os.mkdir(dst_path);
    for src_file in sorted(glob.glob(os.path.join(src_path, '*.wav'))):
        dst_file = src_file.replace(src_path, dst_path);
        subprocess.call('ffmpeg -i {} -ac 1 -ar {} -loglevel error -y {}'.format(
            src_file, sr, dst_file), shell=True);

In [6]:
def display_info(opt):
    print('+------------------------------+');
    print('| {} Sound classification'.format(opt.netType));
    print('+------------------------------+');
    print('| dataset  : {}'.format(opt.dataset));
    print('| nEpochs  : {}'.format(opt.nEpochs));
    print('| LRInit   : {}'.format(opt.LR));
    print('| schedule : {}'.format(opt.schedule));
    print('| warmup   : {}'.format(opt.warmup));
    print('| batchSize: {}'.format(opt.batchSize));
    print('| Splits: {}'.format(opt.splits));
    print('+------------------------------+');

In [7]:
def genDataTimeStr():
    return datetime.today().strftime('%Y-%m-%d %H:%M:%S').replace('-',"").replace(' ',"").replace(':',"");

In [8]:
def getFileList(srcDir,regex='.*\.wav'):
    # regex = '.*\.mp3'
    results = os.listdir(srcDir)
    out_files = []
    cnt_files = 0
    for file in results:
        if os.path.isdir(os.path.join(srcDir, file)):
            out_files += getMp3List(os.path.join(srcDir, file))
        elif re.match(regex, file,  re.I):  # file.startswith(startExtension) or file.endswith(".txt") or file.endswith(endExtension):
            out_files.append(os.path.join(srcDir, file))
            cnt_files = cnt_files + 1
    return out_files

In [9]:
def create_dataset(src_path, dst_path, classes_dict):
    # print('* {} -> {}'.format(src_path, dst_path))
    my_dataset = {};
    print(f"--Start to preparing training dataset...---------------");
    my_dataset['fold1'] = {}
    my_sounds = []
    my_labels = []
    wav_list = None
    for k in classes_dict:
        if k == 52:
            cur_src_dir = os.path.join(src_path,classes_dict[k]);
            wav_list = getFileList(cur_src_dir);
        elif k == 99:
            cur_src_dir = os.path.join(src_path,classes_dict[k]);
            wav_list = getFileList(cur_src_dir);
        # print(wav_list)
        # print(f"current source directory:{cur_src_dir}");
        for wav_file in wav_list:
            sound = wavio.read(wav_file).data.T[0]
            start = sound.nonzero()[0].min()
            end = sound.nonzero()[0].max()
            sound = sound[start: end + 1]  # Remove silent sections
            label = int(k);#int(os.path.splitext(wav_file)[0].split('-')[-1])
            my_sounds.append(sound)
            my_labels.append(k)
            print(f"sound:{os.path.basename(wav_file)} is chopped:\n   lable:{k}\n   from {start} to {end} \n   len:{(end-start)/20000}\n");
            # print(f"sound:{wav_file}\nlabel:{k}") 
    print(f"--End of preparing training dataset-------------------");

    my_dataset['fold1']['sounds'] = my_sounds
    my_dataset['fold1']['labels'] = my_labels
    npzname = dst_path.format(genDataTimeStr());
    np.savez(npzname, **my_dataset)
    print(f"npz file:{npzname}")

In [10]:
map_dict_train = {
        52:"alarm_positive_52", #alarm
        99:"alarm_negtive_99", #other_sounds
    };

map_dict_test = {
        52:"test_data_p_52", #alarm
        99:"test_data_n_99", #other_sounds
    };
wav_src_dir = "./datasets/forOneClassModel_alarm/";
# create train npz
dest_npz_save_dir = "./datasets/forOneClassModel_alarm/train_test_npz/trainData_{}.npz"
create_dataset(src_path=wav_src_dir, dst_path=dest_npz_save_dir, classes_dict=map_dict_train);

# creat test npz
# dest_test_npz_save_dir = "./datasets/forOneClassModel_alarm/train_test_npz/testData_{}.npz"
# create_dataset(src_path=wav_src_dir, dst_path=dest_test_npz_save_dir, classes_dict=map_dict);

--Start to preparing training dataset...---------------
sound:fire-alarm-6708_p_1.wav is chopped:
   lable:52
   from 0 to 119993 
   len:5.99965

sound:33747_p_1.wav is chopped:
   lable:52
   from 0 to 83695 
   len:4.18475

sound:emergency-alarm-old-fire-house-alarm-02-160194_p_1.wav is chopped:
   lable:52
   from 1 to 99999 
   len:4.9999

sound:400583_p_2.wav is chopped:
   lable:52
   from 8691 to 99999 
   len:4.5654

sound:alarm-76369_p_2.wav is chopped:
   lable:52
   from 0 to 99999 
   len:4.99995

sound:171515_p_1.wav is chopped:
   lable:52
   from 0 to 77954 
   len:3.8977

sound:207751_p_2.wav is chopped:
   lable:52
   from 0 to 65999 
   len:3.29995

sound:35433_p_3.wav is chopped:
   lable:52
   from 0 to 99999 
   len:4.99995

sound:111707_p_1.wav is chopped:
   lable:52
   from 0 to 96555 
   len:4.82775

sound:33732_p_1.wav is chopped:
   lable:52
   from 0 to 99999 
   len:4.99995

sound:fire-alarm-from-reverse-beeps-29525_p_1.wav is chopped:
   lable:52
   from 

In [11]:
# def create_test_data_src_npz(src_path, dst_path):
#     # print('* {} -> {}'.format(src_path, dst_path))
#     classes_dict = {
#         17:"17_pouring_water", #pouring_water
#         18:"18_toilet_flushing", #toilet_flushing
#         21:"21_sneezing", #snezzing
#         24:"24_coughing", #coughing
#         51:"51_kettle_sound", #kettle_sound
#         52:"52_alarm", #alarm
#         #53:"53_boiling_water_bubble_sound", #boiling_water_bubble_sound
#         54:"54_ringtone", #rington
#         55:"55_shower_water", #shower_water
#         56:"56_pain_sounds", #pain_sounds
#         57:"57_footsteps", #footsteps
#         98:"98_silence", #silence
#         99:"99_other_sounds", #other_sounds
#     };
    
#     my_dataset = {};
#     my_sounds = []
#     my_labels = []
#     my_dataset['testdata'] = {}
#     for k in classes_dict:
#         cur_src_dir = os.path.join(src_path,classes_dict[k]);
#         print(f"current source directory:{cur_src_dir}");
#         for wav_file in sorted(glob.glob(os.path.join(cur_src_dir, '*.wav'))):
#             sound = wavio.read(wav_file).data.T[0]
#             start = sound.nonzero()[0].min()
#             end = sound.nonzero()[0].max()
#             sound = sound[start: end + 1]  # Remove silent sections
#             # label = k;#int(os.path.splitext(wav_file)[0].split('-')[-1])
#             my_sounds.append(sound)
#             my_labels.append(k)
#             print(f"sound:{wav_file}\nlabel:{k}") 
#     print(f"--End of preparing test dataset-------------------");

#     my_dataset['testdata']['sounds'] = my_sounds
#     my_dataset['testdata']['labels'] = my_labels

#     np.savez(dst_path, **my_dataset)

In [12]:
def test_npz(train_npz):
    dataset = np.load(train_npz, allow_pickle=True);
    print(f"len of sound:{len(dataset['fold1'].item()['sounds'][123])}");
    print(f"label:{dataset['fold1'].item()['labels'][123]}");
    # print(f"sound:{dataset['fold3'].item()['sounds'][123]}");
    # print(f"label:{dataset['fold3'].item()['labels'][123]}");
    # train_sounds = []
    # train_labels = []
    # for i in range(1, opt.nFolds + 1):
    #     sounds = dataset['fold{}'.format(i)].item()['sounds']
    #     labels = dataset['fold{}'.format(i)].item()['labels']
    #     if i != split:
    #         train_sounds.extend(sounds)
    #         train_labels.extend(labels)

In [13]:
#load_train_npz("../../../RLRepo/Works/Projects/TransferLearning_for_ACDNet/datasets/fsd50k_processed_audios/train_fsd50_20K__202401041450.npz")

In [14]:
class ValGenerator():
    #Generates data for Keras
    def __init__(self, samples, labels, options):
        random.seed(42);
        #Initialization
        print(len(samples));
        self.data = [(samples[i], labels[i]) for i in range (0, len(samples))];
        self.opt = options;
        self.batch_size = len(samples);#88;#options.batchSize // options.nCrops;
        print(f"batch_size:{self.batch_size}");
        self.preprocess_funcs = self.preprocess_setup();
        self.map_dict= {
            52:1, #alarm
            99:2,
            # 17:1, #pouring_water
            # 18:2, #toilet_flushing
            # 21:3, #snezzing
            # 24:4, #coughing
            # 51:5, #kettle_sound
            # 52:6, #alarm
            # #53:"53_boiling_water_bubble_sound", #boiling_water_bubble_sound
            # 54:7, #rington
            # 55:8, #shower_water
            # 56:9, #pain_sounds
            # 57:10, #footsteps
            # 98:11, #silence
            # 99:12, #other_sounds
        };

    def get_data(self):
        #Generate one batch of data
        x, y = self.generate();
        x = np.expand_dims(x, axis=1)
        x = np.expand_dims(x, axis=3)
        # print(x.shape);
        # print(y.shape);
        return x, y

    def generate(self):
        #Generates data containing batch_size samples
        sounds = [];
        labels = [];
        indexes = None;
        for i in range(self.batch_size):
            sound, target = self.data[i];
            target = self.map_dict[target] - 1;
            sound = self.preprocess(sound).astype(np.float32)
            # print(sound)
            label = np.zeros((self.opt.nCrops, self.opt.nClasses));
            label[:,target] = 1;
            print(f"nCrops:{self.opt.nCrops}, nClasses:{self.opt.nClasses}")
            sounds.append(sound);
            labels.append(label);
        """
        #dtype="object" for ValueError: setting an array element with a sequence. 
        The requested array has an inhomogeneous shape after 1 dimensions. 
        The detected shape was (58,) + inhomogeneous part.
        """
        sounds = np.asarray(sounds,dtype="object")
        # expand_sounds = np.expand_dims(np.asarray(sounds,dtype="object"),axis=1); 
        labels = np.asarray(labels);
        # print(f"shape of sounds:{expand_sounds.shape}")
        sounds = sounds.reshape(sounds.shape[0]*sounds.shape[1], sounds.shape[2]);
        labels = labels.reshape(labels.shape[0]*labels.shape[1], labels.shape[2]);

        return sounds, labels;

    def preprocess_setup(self):
        funcs = []
        funcs += [U.padding(self.opt.inputLength // 2),
                  U.normalize(32768.0),
                  U.multi_crop(self.opt.inputLength, 2)] # we use single crop here.

        return funcs

    def preprocess(self, sound):
        for f in self.preprocess_funcs:
            sound = f(sound)

        return sound;



### ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (58,) + inhomogeneous part.
### 把諸存的型態改成dtype='object'

In [15]:
def create_test_compress_npz(test_src_npz=None, dest_name=None):
    opt = getOpts();#opts.parse();
    opts.display_info(opt);
    # opt.batchSize=88;
    opt.nCrops = 2;
    opt.nClasses=2;
    for sr in [20000]:
        opt.sr = sr;
        opt.inputLength = 30225;
        val_sounds = [];
        val_labels = [];
        dataset = np.load(test_src_npz, allow_pickle=True);
        # for s in opt.splits:
        start_time = time.perf_counter();
        sounds = dataset['fold1'].item()['sounds'];
        labels = dataset['fold1'].item()['labels'];
        print(f"len of sounds:{len(sounds)}, len of labels:{len(labels)}")
        
        # print(sounds)
        val_sounds.extend(sounds);
        val_labels.extend(labels);
        print(f"len of val_sounds:{len(val_sounds)}, len of val_labels:{len(val_labels)}")
        
        valGen = ValGenerator(sounds, labels, opt);
        valX, valY = valGen.get_data();

        np.savez_compressed(dest_name, x=valX, y=valY);
        print('compressed npz generated with\n  shape x:{}\n  y:{}\n  took {:.2f} secs'.format(valX.shape, valY.shape, time.perf_counter()-start_time));
        sys.stdout.flush();

In [18]:
# for validata npz in training processing
# test_npz_name = "./datasets/forOneClassModel_alarm/train_test_npz/testData_20240108152701.npz"
# compressed_test_npz_name = "./datasets/forOneClassModel_alarm/train_test_npz/compressed_val_npz_{}".format(genDataTimeStr())
# create_test_compress_npz(test_src_npz=test_npz_name, dest_name=compressed_test_npz_name);
# for test npz in quantization processing
src_npz_name = "./datasets/forOneClassModel_alarm/train_test_npz/trainData_20240108153111.npz"
compress_quant_test_npz_name = "./datasets/forOneClassModel_alarm/train_test_npz/compressed_quant_test_npz_{}".format(genDataTimeStr())
create_test_compress_npz(test_src_npz=src_npz_name, dest_name=compress_quant_test_npz_name);

+------------------------------+
| ACDNet_TL_Model_Extend Sound classification
+------------------------------+
| dataset  : uec_iot
| nEpochs  : 10
| LRInit   : 0.1
| schedule : [0.3, 0.6, 0.9]
| warmup   : 10
| batchSize: 88
| Splits: [1, 2, 3, 4, 5]
+------------------------------+
len of sounds:167, len of labels:167
len of val_sounds:167, len of val_labels:167
167
batch_size:167
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCrops:2, nClasses:2
nCro

In [32]:
# loading compressed npz
def load_compressed_npz(test_npz):
    dataset = np.load(test_npz, allow_pickle=True);
    # sound_ary = np.asarray();
    sounds = dataset['x'];
    labels = dataset['y']
    print(f"shape of test labels is {labels.shape}")
    # print(f"")
        # extend(repeat(x, 100))
    # sound_data = []
    # sound_data.extend(sound_tmp, len(sound_tmp))
    # print(dataset['x'])

In [49]:
load_compressed_npz("./datasets/forOneClassModel_alarm/train_test_npz/compressed_val_npz_20240108191921.npz");

shape of test labels is (116, 2)


In [9]:
# create_test_npz(".");