In [1]:
import os
import sys
import datetime
import shutil
import numpy as np
import tensorflow as tf
from tensorflow import keras
import zipfile
import wavio
from common import utils as U

## Loading Model

In [2]:
# Load the trained Keras model from its .h5 file and print the layer table.
# NOTE(review): the file name suggests an ACDNet-20 checkpoint trained on
# 20 kHz audio (fold 4) — confirm against the training run.
model_path = "./models/keras_h5/acdnet20_20khz_fold4.h5"
model = keras.models.load_model(model_path)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1, 30225, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 1, 15109, 4)       36        
                                                                 
 batch_normalization (Batch  (None, 1, 15109, 4)       16        
 Normalization)                                                  
                                                                 
 re_lu (ReLU)                (None, 1, 15109, 4)       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 1, 7553, 32)       640       
                                                                 
 batch_normalization_1 (Bat  (None, 1, 7553, 32)       128       
 chNormalization)                                            

### Setting global variables

In [32]:
# Input configuration shared by the cells below.
_inputLen = 30225    # samples per crop; equals the width of the model's input layer
_nCrops = 10         # number of crops requested from U.multi_crop
default_sr = 20000   # sampling rate (Hz) the clip is converted to before inference

# Expected class index for the clip under test.
# Label ids: 17 = pouring water, 18 = toilet_flushing
ground_true_tag = 17

# One copy of the expected label per crop: first as a plain list, then as an
# (_nCrops, 1) column vector.
taglst = [ground_true_tag] * _nCrops
print(taglst)
target_tags = np.array(taglst)[:, np.newaxis]

[18, 18, 18, 18, 18, 18, 18, 18, 18, 18]


### Setting Test Sound Path

In [41]:
# Alternative test clips — uncomment one pair (and comment out the active one)
# to classify a different recording.
# original_sound_path = "./test_sounds/toilet_flushing/toilet_flushing_02.wav"
# converted_sound_path = "./test_sounds/toilet_flushing/toilet_flushing_02_20000hz.wav"

# original_sound_path = "./test_sounds/pouring_water/pouring_coffee_test01.wav"
# converted_sound_path = "./test_sounds/pouring_water/pouring_coffee_test01_20000hz.wav"

original_sound_path = "./test_sounds/pouring_water/pouring_water_in_official.wav"
converted_sound_path = "./test_sounds/pouring_water/pouring_water_in_official_20000hz.wav"

# Read the clip and report its native sampling rate.
wobj = wavio.read(original_sound_path)
print(wobj.rate)

# If the clip is not already at default_sr, write a converted copy to
# converted_sound_path and continue with that file instead.
needs_resampling = wobj.rate != default_sr
if needs_resampling:
    U.convert_sr_for_single_file(original_sound_path, converted_sound_path, default_sr)
    wobj = wavio.read(converted_sound_path)
    print(f"Converted wav sampling rate is {wobj.rate}")

# Keep only column 0 of the sample array
# (presumably the first channel, given wavio's (samples, channels) layout — confirm).
sound_sig = wobj.data.T[0]


48000
* ./test_sounds/pouring_water/pouring_water_in_official.wav -> ./test_sounds/pouring_water/pouring_water_in_official_20000hz.wav
Converted wav sampling rate is 20000


In [42]:
# Trim leading/trailing exact-zero samples, then cap the clip length.

# Indices of all non-zero samples, computed once. For a 1-D signal numpy
# returns them in ascending order, so the first/last entries are the bounds.
nonzero_idx = sound_sig.nonzero()[0]
if nonzero_idx.size == 0:
    # Without this guard an all-zero clip fails below with numpy's opaque
    # "zero-size array to reduction operation" ValueError.
    raise ValueError("sound_sig contains only zero samples; there is nothing to classify")

start = nonzero_idx[0]
end = nonzero_idx[-1]
sound_sig = sound_sig[start: end + 1]
print(f"Original length of sound_sig is {len(sound_sig)}")

# Keep at most 220500 samples; slicing past the end is a no-op, so no length
# check is needed.
# NOTE(review): 220500 equals 5 s at 44.1 kHz, but the clip is at default_sr
# here — confirm this cap is the intended one.
sound_sig = sound_sig[:220500]
print(f"sound_sig length is {len(sound_sig)}")

Original length of sound_sig is 151449
sound_sig length is 151449


### Preprocessing sound

In [35]:
def preprocess_setup():
    """Build the ordered list of preprocessing callables.

    Returns:
        list: three callables obtained from the project's ``U`` helpers, in
        application order: ``U.padding(_inputLen // 2)``,
        ``U.normalize(32768.0)`` and ``U.multi_crop(_inputLen, _nCrops)``.
    """
    pad = U.padding(_inputLen // 2)
    # NOTE(review): 32768.0 is 2**15, presumably the int16 full-scale value — confirm.
    scale = U.normalize(32768.0)
    # Alternative normalisers that were tried in place of U.normalize:
    #   U.rms_normalize(rms_level=2)
    #   U.minmax_normalize()
    crop = U.multi_crop(_inputLen, _nCrops)

    return [pad, scale, crop]

def preprocess(sound, funcs):
    """Apply each callable in ``funcs`` to ``sound``, in order.

    Args:
        sound: the value to transform (passed to the first callable).
        funcs: iterable of one-argument callables; each receives the result
            of the previous one.

    Returns:
        The output of the last callable, or ``sound`` unchanged when
        ``funcs`` is empty.
    """
    result = sound
    for transform in funcs:
        result = transform(result)
    return result

In [36]:
# Build the transform list and run the trimmed waveform through it.
# NOTE(review): this rebinds sound_sig to the preprocessed result, so running
# the cell a second time without re-running the loading/trimming cells would
# feed already-preprocessed data back into the pipeline.
_funcs = preprocess_setup()
sound_sig = preprocess(sound_sig, _funcs)
print(f"The sound_sig shape after preprocessing is {sound_sig.shape}")

The sound_sig shape after preprocessing is (10, 30225)


### Expanding sound dimensions for model input

In [37]:
# Arrange the crops in the model's input layout: (crops, 1, _inputLen, 1).
# Reshaping to a fixed target shape, instead of inserting two new axes each
# time, keeps this cell safe to re-run: an array that is already 4-D comes
# out unchanged rather than gaining another pair of axes that the model
# would reject.
sound_sig = np.reshape(sound_sig, (-1, 1, _inputLen, 1))
print(sound_sig.shape)

(10, 1, 30225, 1)


### Performing prediction

In [38]:
# def compute_accuracy(y_pred, y_target):
#     # Reshape y_pred so that the _nCrops crops of each sample are grouped together and can be averaged.
#     if _nCrops > 1:
#         y_pred = (y_pred.reshape(y_pred.shape[0]//_nCrops, _nCrops, y_pred.shape[1])).mean(axis=1);
#         y_target = (y_target.reshape(y_target.shape[0]//_nCrops, _nCrops, y_target.shape[1])).mean(axis=1);

#     loss = keras.losses.KLD(y_target, y_pred).numpy().mean();

#     #Get the indices that has highest average value for each sample
#     y_pred = y_pred.argmax(axis=1);
#     y_target = y_target.argmax(axis=1);
#     accuracy = (y_pred==y_target).mean()*100;

#     return accuracy, loss;

In [39]:
# Score every crop in one forward pass; row i of `scores` holds the per-class
# outputs for crop i.
scores = model.predict(sound_sig, batch_size=len(sound_sig), verbose=0)
# Debug helpers kept for reference (compute_accuracy is currently commented out):
# print(type(scores))
# print(scores.shape)
# print(target_tags.shape)
# acc, loss = compute_accuracy(scores, target_tags)
# print(f"Accuracy:{acc} and loss:{loss}")

# Top-1 class index and its score for each crop, computed for all rows at once.
top_indices = scores.argmax(axis=1)
top_values = scores.max(axis=1)

# Per-crop report: the winning class first, then the full score vector.
for res, max_value, max_index in zip(scores, top_values, top_indices):
    print(f"max value:{max_value:.5f} and index is {max_index}")
    print('\n'.join('{}: {:.5f}'.format(*k) for k in enumerate(res)))

# Number of crops whose top-1 class equals the expected label, reported as a
# percentage of _nCrops.
acc_count = int((top_indices == ground_true_tag).sum())
print(f"The final accuracy is {(acc_count/_nCrops)*100}%.")

max value:0.18811 and index is 28
0: 0.01647
1: 0.03125
2: 0.01130
3: 0.01677
4: 0.00669
5: 0.03434
6: 0.00494
7: 0.06340
8: 0.00626
9: 0.00568
10: 0.00439
11: 0.01018
12: 0.00350
13: 0.01863
14: 0.03067
15: 0.02670
16: 0.00658
17: 0.01506
18: 0.02836
19: 0.03367
20: 0.00995
21: 0.05631
22: 0.00212
23: 0.02815
24: 0.01856
25: 0.00689
26: 0.01656
27: 0.01571
28: 0.18811
29: 0.02430
30: 0.01825
31: 0.02431
32: 0.00629
33: 0.02679
34: 0.01803
35: 0.02973
36: 0.00624
37: 0.01357
38: 0.01125
39: 0.01038
40: 0.00795
41: 0.00456
42: 0.00412
43: 0.00939
44: 0.00714
45: 0.00821
46: 0.00302
47: 0.00423
48: 0.02511
49: 0.01994
max value:0.63350 and index is 18
0: 0.00183
1: 0.00553
2: 0.04594
3: 0.00303
4: 0.00352
5: 0.00124
6: 0.00151
7: 0.00228
8: 0.00422
9: 0.00197
10: 0.02327
11: 0.00510
12: 0.00027
13: 0.00588
14: 0.00916
15: 0.00093
16: 0.00083
17: 0.00360
18: 0.63350
19: 0.01823
20: 0.00277
21: 0.00960
22: 0.00146
23: 0.01250
24: 0.01201
25: 0.00433
26: 0.00296
27: 0.00302
28: 0.06755
29: 