## 測試項目
1. 倒水聲(15)及馬桶沖水聲(18)
2. 使用acdnet20_20khz_fold4 keras h5-format model
3. export to cc file

In [1]:
import os
import sys
import datetime
import shutil
import numpy as np
import tensorflow as tf
from tensorflow import keras
import zipfile
import wavio
from common import utils as U

### Loading Keras Model

In [2]:
# Location of the saved Keras .h5 model used for the tests in this notebook.
keras_model_path = "./models/keras_h5/acdnet20_20khz_fold4.h5"
# Load the saved model from disk; later cells use `model` for inference.
model = keras.models.load_model(keras_model_path)

In [3]:
# Print the layer-by-layer output shapes and parameter counts of the loaded model.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1, 30225, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 1, 15109, 4)       36        
                                                                 
 batch_normalization (Batch  (None, 1, 15109, 4)       16        
 Normalization)                                                  
                                                                 
 re_lu (ReLU)                (None, 1, 15109, 4)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 1, 7553, 32)       640       
                                                                 
 batch_normalization_1 (Bat  (None, 1, 7553, 32)       128       
 chNormalization)                                            

### loading dataset from npz format
1. ACDNet input length is 30225
2. sr is 44100 and 20000
3. need to convert 16K to 20000
### ACDNet Config Setting
#### Training Parameters
1. opt.batchSize = 64;
2. opt.weightDecay = 5e-4;
3. opt.momentum = 0.9;
4. opt.nEpochs = 2000;
5. opt.LR = 0.1;
6. opt.schedule = [0.3, 0.6, 0.9];
7. opt.warmup = 10; 
#### Basic Net Configuration
- nClasses = 50
- nFolds = 5
- splits = \[i for i in range(1, nFolds + 1)\]
- sr = 20000
- inputLength = 30225
### How to convert a sound file to 16-bit PCM with python and sox

If using sox, the command is as follows: <br />
    sox old.wav -b 16 new.wav

If using Python, you can do the following: <br />
    import soundfile

    data, samplerate = soundfile.read('old.wav')
    soundfile.write('new.wav', data, samplerate, subtype='PCM_16')

In [4]:
### convert to 16-bit
# NOTE: kept for reference only; uncommenting it also requires `import soundfile`.
# def convertAllFilesInDirectoryTo16Bit(directory):
#     for file in os.listdir(directory):
#         if file.endswith('.wav'):
#             nameSolo = file.rsplit('.', 1)[0]
#             print(directory + nameSolo)
#             data, samplerate = soundfile.read(directory + file)
#             soundfile.write('/Users/yournamehere/Desktop/folderwhereyouwanttosae/' + nameSolo + '16BIT.wav', data, samplerate, subtype='PCM_16')
#             print("converting " + file + "to 16 - bit")

In [5]:
def npz_headers(npz):
    """
    Yield the header of every array stored in an .npz archive.

    An .npz file is a Zip archive of .npy files. Only the header of each
    member is parsed, so the arrays themselves are not loaded.

    Args:
        npz: Path to (or open file object of) the .npz archive.

    Yields:
        (name, shape, np.dtype) tuples, where name is the member name
        without its '.npy' suffix.
    """
    with zipfile.ZipFile(npz) as archive:
        for name in archive.namelist():
            if not name.endswith('.npy'):
                continue

            # Close each member as soon as its header has been parsed instead
            # of leaving one open handle per array until garbage collection.
            with archive.open(name) as npy:
                version = np.lib.format.read_magic(npy)
                # NOTE: _read_array_header is a private NumPy helper; it is
                # kept because it dispatches on the format version for us.
                shape, _fortran, dtype = np.lib.format._read_array_header(npy, version)
            yield name[:-4], shape, dtype

In [6]:
# List (name, shape, dtype) for every array stored in the wav44.npz archive.
print(list(npz_headers("./datasets/esc50/wav44.npz")))

[('fold1', (), dtype('O')), ('fold2', (), dtype('O')), ('fold3', (), dtype('O')), ('fold4', (), dtype('O')), ('fold5', (), dtype('O'))]


In [7]:
# Load the dataset archive. allow_pickle=True is needed because each fold is
# stored as a pickled 0-d object array (the headers report shape () and
# dtype('O')); only enable it for files you trust.
data = np.load("./datasets/esc50/wav44.npz", allow_pickle=True)

# NpzFile re-reads and unpickles a member on every data[...] access, so fetch
# the fold-1 dictionary once and reuse it for both fields.
fold1 = data['fold1'].item()
fold1_sounds_list = fold1['sounds']
fold1_labels_list = fold1['labels']

# dtype="object" is what works here (presumably because the clips differ in
# length, so they cannot form a regular 2-D array -- TODO confirm).
fold1_sounds_ary = np.asarray(fold1_sounds_list, dtype="object")

# Last expression of the cell: display the first clip of fold 1.
fold1_sounds_ary[0]

array([-1,  0,  0, ...,  0,  0, -1], dtype=int16)

In [8]:
# NOTE(review): the path below appears to be a directory (trailing slash), not
# an .npz/.npy file; the recorded run of this cell failed on this line with
# PermissionError. TODO: point np.load at the intended file inside the folder.
test_data = np.load("./datasets/esc50/test_44khz/", allow_pickle=True);
# test_20_X = data['x'];
# test_20_Y = data['y'];
# NOTE(review): everything below reads `data` (the archive loaded in the
# previous cell), not `test_data` -- this looks like a copy-paste of that
# cell; confirm which archive is meant.
list(data.keys())
data['fold1'].size
data['fold1'].shape
fold1_sounds_list = data['fold1'].item()['sounds']
fold1_labels_list = data['fold1'].item()['labels']
type(fold1_sounds_list)
len(fold1_labels_list)
fold1_sounds_ary = np.asarray(fold1_sounds_list,dtype="object") # setting dtype="object" works here
# Last expression of the cell: display the first clip.
fold1_sounds_ary[0]

PermissionError: [Errno 13] Permission denied: './datasets/esc50/test_44khz/'

### Read Test Wav File

In [9]:
def _read_trimmed_channel(wav_path):
    """Read the first channel of a wav file and strip leading/trailing zeros.

    Args:
        wav_path: Path of the wav file to read.

    Returns:
        (samples, start, end): the trimmed samples plus the indices of the
        first and last non-zero sample in the untrimmed channel.

    Raises:
        ValueError: If the channel contains no non-zero sample.
    """
    channel = wavio.read(wav_path).data.T[0]
    # nonzero() returns indices in ascending order, so the first and last
    # entries are the min and max; computing it once avoids a second scan.
    nonzero_idx = channel.nonzero()[0]
    if nonzero_idx.size == 0:
        raise ValueError(f"{wav_path} contains only zero samples")
    first, last = nonzero_idx[0], nonzero_idx[-1]
    return channel[first:last + 1], first, last


test_sound_file = "./test_sounds/toilet_flushing/flush-toilet-lid-down-68571.wav"
ec50_18_sound = "./test_sounds/toilet_flushing/ec50_files/1-20736-A-18.wav"

# Both clips are toilet-flush recordings, i.e. class 18 in this notebook.
sound, start, end = _read_trimmed_channel(test_sound_file)
# Hard-coded: this file name ends in "68571", not the class id, so it cannot
# be parsed with int(os.path.splitext(test_sound_file)[0].split('-')[-1]).
label = 18

ec50_sound1, start_ec50, end_ec50 = _read_trimmed_channel(ec50_18_sound)
ec50_18_label = 18

In [10]:
# Show each trimmed waveform followed by its length in samples.
print(sound, len(sound), sep="\n")
print(ec50_sound1, len(ec50_sound1), sep="\n")

[ 1 -2  2 ...  0  0  1]
467132
[1434 1648 1945 ... -809  523 -730]
220500


In [35]:
_inputLen = 30225  # samples per model input (matches the input layer shown by model.summary())
_nCrops = 6        # crop count handed to U.multi_crop


def preprocess_setup():
    """Build the ordered list of preprocessing callables for a raw waveform.

    The callables come from common.utils (imported as U) and are created in
    this order: padding, normalize, multi_crop.
    """
    # Argument semantics are defined in common.utils; presumably this pads by
    # half an input window and scales int16 samples by 32768 -- TODO confirm.
    half_input = _inputLen // 2
    return [
        U.padding(half_input),
        U.normalize(32768.0),
        U.multi_crop(_inputLen, _nCrops),
    ]

def preprocess(sound, funcs):
    """Run `sound` through every callable in `funcs`, in order.

    Args:
        sound: The value handed to the first callable.
        funcs: Iterable of one-argument callables; each receives the output
            of the previous one.

    Returns:
        The output of the last callable, or `sound` itself if `funcs` is empty.
    """
    processed = sound
    for step in funcs:
        processed = step(processed)
    return processed

In [36]:
# Build the preprocessing pipeline once so it can be reused for every clip.
_funcs = preprocess_setup()

In [37]:
# NOTE(review): this overwrites `sound` with its preprocessed form, so the cell
# is not safely re-runnable. The ValueError recorded for this cell presumably
# comes from feeding an already-preprocessed array back into the pipeline --
# confirm, and re-run the wav-loading cell before running this one again.
sound = preprocess(sound, _funcs)

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [31]:
# Sanity-check the array layout before it is passed to the model.
print(sound.shape)

(10, 1, 30225, 1)


In [15]:
# Insert singleton axes at positions 1 and 3 in a single call so the batch
# matches the model's (None, 1, 30225, 1) input layout.
sound = np.expand_dims(sound, axis=(1, 3))
print(sound.shape)

(10, 1, 30225, 1)


In [27]:
# Run all crops through the model in one batch; each row of `scores` holds the
# per-class outputs for one crop.
scores = model.predict(sound, batch_size=len(sound), verbose=0)
print(type(scores))
print(scores.shape)

# Report the top-scoring class and the full score vector for every crop.
for res in scores:
    max_index = np.argmax(res)
    max_value = res[max_index]
    print(f"max value:{max_value} and index is {max_index}")
    print(res)

<class 'numpy.ndarray'>
(10, 50)
max value:0.0944233313202858 and index is 19
[0.01429683 0.01973213 0.06096708 0.01488105 0.01055669 0.01067765
 0.00539606 0.01035632 0.00598663 0.00746061 0.00821741 0.01575299
 0.00387039 0.00398408 0.00446692 0.02424019 0.0075449  0.02043411
 0.04602816 0.09442333 0.00678634 0.03983502 0.01447995 0.0133396
 0.02781055 0.01735046 0.00897249 0.0033093  0.07652918 0.06278949
 0.01773822 0.00447612 0.00611844 0.03766908 0.02824607 0.02932074
 0.01057659 0.01666032 0.01336543 0.03609875 0.01466191 0.01037799
 0.00387617 0.01406603 0.01068579 0.01535874 0.00735426 0.01363882
 0.0304396  0.01879515]
max value:0.6246333718299866 and index is 18
[7.9890713e-03 3.8701585e-03 7.6974113e-03 3.5915098e-03 3.5662581e-03
 2.7691871e-03 3.5793013e-03 3.2686781e-02 2.7590583e-03 4.9294992e-03
 1.5218394e-01 6.5829814e-03 8.0095470e-04 6.2212897e-03 3.8084278e-03
 1.7339818e-03 6.0829316e-04 1.7410167e-03 6.2463337e-01 1.3246996e-02
 4.2147213e-03 3.3158907e-03 3.345