## Training with spectrogram

In [1]:
import tensorflow as tf
import tensorflow_io as tfio
from functools import partial
from glob import glob
from time import time
import numpy as np

2022-12-01 17:56:21.472424: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-01 17:56:21.784440: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-01 17:56:21.784483: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-01 17:56:21.845262: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-01 17:56:23.910044: W tensorflow/stream_executor/platform/de

In [2]:
LABELS = ["go", "stop"]

In [3]:
train_ds = tf.data.Dataset.list_files(["msc-train/go*", "msc-train/stop*"])
test_ds = tf.data.Dataset.list_files(["msc-test/go*", "msc-test/stop*"])
val_ds = tf.data.Dataset.list_files(["msc-val/go*", "msc-val/stop*"])

2022-12-01 17:56:26.652577: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-01 17:56:26.652613: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-01 17:56:26.652644: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (sfanigliulo): /proc/driver/nvidia/version does not exist
2022-12-01 17:56:26.653055: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Hyper parameters-to be tuned

In [4]:
PREPROCESSING_ARGS = {
    'downsampling_rate': 16000,
    'frame_length_in_s': 0.04,
    'frame_step_in_s': 0.02,
}

TRAINING_ARGS = {
    'batch_size': 20,
    'initial_learning_rate': 0.01,
    'end_learning_rate': 1.e-5,
    'epochs': 10
}

In [5]:
def get_audio_and_label(filename):
    audio_binary = tf.io.read_file(filename)
    audio, sampling_rate = tf.audio.decode_wav(audio_binary) 

    path_parts = tf.strings.split(filename, '/')
    path_end = path_parts[-1]
    file_parts = tf.strings.split(path_end, '_')
    label = file_parts[0]

    audio = tf.squeeze(audio)
    zero_padding = tf.zeros(sampling_rate - tf.shape(audio), dtype=tf.float32)
    audio_padded = tf.concat([audio, zero_padding], axis=0)

    return audio_padded, sampling_rate, label


def get_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s):
    # TODO: Write your code here
    audio_padded, sampling_rate, label = get_audio_and_label(filename)
    
    if downsampling_rate != sampling_rate:
        sampling_rate_int64 = tf.cast(sampling_rate, tf.int64)
        audio_padded = tfio.audio.resample(audio_padded, sampling_rate_int64, downsampling_rate)

    sampling_rate_float32 = tf.cast(downsampling_rate, tf.float32)
    frame_length = int(frame_length_in_s * sampling_rate_float32)
    frame_step = int(frame_step_in_s * sampling_rate_float32)

    spectrogram = stft = tf.signal.stft(
        audio_padded, 
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=frame_length
    )
    spectrogram = tf.abs(stft)

    return spectrogram, downsampling_rate, label

def get_spectrogram_and_label(filename, downsampling_rate, frame_length_in_s, frame_step_in_s):
    spectrogram, sampling_rate, label = get_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s)
    
    return spectrogram, label

get_frozen_spectrogram = partial(get_spectrogram_and_label, **PREPROCESSING_ARGS)

for spectrogram, label in train_ds.map(get_frozen_spectrogram).take(1):
    SHAPE = spectrogram.shape

def preprocess(filename):
    signal, label = get_frozen_spectrogram(filename)

    signal.set_shape(SHAPE)
    signal = tf.expand_dims(signal, -1)
    signal = tf.image.resize(signal, [32, 32])

    label_id = tf.argmax(label == LABELS)

    return signal, label_id

2022-12-01 17:56:27.886404: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX2 FMA




2022-12-01 17:56:28.860250: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:28.865848: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:28.866433: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library


In [6]:
batch_size = TRAINING_ARGS['batch_size']
epochs = TRAINING_ARGS['epochs']

train_ds = train_ds.map(preprocess).batch(batch_size).cache()
val_ds = val_ds.map(preprocess).batch(batch_size)
test_ds = test_ds.map(preprocess).batch(batch_size)



2022-12-01 17:56:30.704257: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:30.712506: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:30.713463: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library




2022-12-01 17:56:31.570432: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:31.577060: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:31.577502: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library




2022-12-01 17:56:32.379741: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:32.389612: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-01 17:56:32.390060: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library


In [7]:
for example_batch, example_labels in train_ds.take(1):
  print('Batch Shape:', example_batch.shape)
  print('Data Shape:', example_batch.shape[1:])
  print('Labels:', example_labels)

Batch Shape: (20, 32, 32, 1)
Data Shape: (32, 32, 1)
Labels: tf.Tensor([1 1 1 1 0 0 1 0 0 1 1 0 1 1 0 1 0 1 1 1], shape=(20,), dtype=int64)


2022-12-01 17:56:33.299274: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


##### Create and train the model

In [8]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=example_batch.shape[1:]),
    tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[2, 2], use_bias=False, padding='valid'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(units=len(LABELS)),
    tf.keras.layers.Softmax()
])

In [9]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
initial_learning_rate = TRAINING_ARGS['initial_learning_rate']
end_learning_rate = TRAINING_ARGS['end_learning_rate']

linear_decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    end_learning_rate=end_learning_rate,
    decay_steps=len(train_ds) * epochs,
)
optimizer = tf.optimizers.Adam(learning_rate=linear_decay)
metrics = [tf.metrics.SparseCategoricalAccuracy()]
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##### Test and save the model

In [10]:
test_loss, test_accuracy = model.evaluate(test_ds)



In [11]:
training_loss = history.history['loss'][-1]
training_accuracy = history.history['sparse_categorical_accuracy'][-1]
val_loss = history.history['val_loss'][-1]
val_accuracy = history.history['val_sparse_categorical_accuracy'][-1]

print(f'Training Loss: {training_loss:.4f}')
print(f'Training Accuracy: {training_accuracy*100.:.2f}%')
print()
print(f'Validation Loss: {val_loss:.4f}')
print(f'Validation Accuracy: {val_accuracy*100.:.2f}%')
print()
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy*100.:.2f}%')

Training Loss: 0.1587
Training Accuracy: 94.25%

Validation Loss: 0.3270
Validation Accuracy: 90.00%

Test Loss: 0.2262
Test Accuracy: 91.00%


In [12]:
timestamp = int(time())

saved_model_dir = f'./saved_models/{timestamp}'
if not os.path.exists(saved_model_dir):
    os.makedirs(saved_model_dir)
model.save(saved_model_dir)



INFO:tensorflow:Assets written to: ./saved_models/1669914094/assets


INFO:tensorflow:Assets written to: ./saved_models/1669914094/assets


In [13]:
############## NON RUNNARE - PER SALVARE gli iperparametri e i risultati
# import pandas as pd

# output_dict = {
#     'timestamp': timestamp,
#     **PREPROCESSING_ARGS,
#     **TRAINING_ARGS,
#     'test_accuracy': test_accuracy
# }

# df = pd.DataFrame([output_dict])

# output_path='./spectrogram_results.csv'
# df.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)

## TFLite conversion

In [14]:
MODEL_NAME = timestamp
converter = tf.lite.TFLiteConverter.from_saved_model(f'./saved_models/{MODEL_NAME}')
tflite_model = converter.convert()

2022-12-01 18:01:39.446336: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2022-12-01 18:01:39.446391: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2022-12-01 18:01:39.447678: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: ./saved_models/1669914094
2022-12-01 18:01:39.452899: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2022-12-01 18:01:39.452946: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: ./saved_models/1669914094
2022-12-01 18:01:39.468105: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
2022-12-01 18:01:39.473404: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2022-12-01 18:01:39.676985: I tensorflow/cc/saved_model/loader.cc:213] Running initialization op on SavedModel bundle at path: ./save

In [15]:
import os
tflite_models_dir = './tflite_models'
if not os.path.exists(tflite_models_dir):
    os.makedirs(tflite_models_dir)
tflite_model_name = os.path.join(tflite_models_dir, f'{MODEL_NAME}.tflite')
with open(tflite_model_name, 'wb') as fp:
    fp.write(tflite_model)

## Inference with log mel spectrogram

In [16]:
PREPROCESSING_ARGS = {
    'downsampling_rate': 16000,
    'frame_length_in_s': 0.04,
    'frame_step_in_s': 0.02,
    'num_mel_bins': 40,
    'lower_frequency': 20,
    'upper_frequency': 4000,
}

LABELS = ["go", "stop"]

downsampling_rate = PREPROCESSING_ARGS['downsampling_rate']
sampling_rate_int64 = tf.cast(downsampling_rate, tf.int64)
frame_length = int(downsampling_rate * PREPROCESSING_ARGS['frame_length_in_s'])
frame_step = int(downsampling_rate * PREPROCESSING_ARGS['frame_step_in_s'])
spectrogram_width = (16000 - frame_length) // frame_step + 1
num_spectrogram_bins = frame_length // 2 + 1

linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
    PREPROCESSING_ARGS['num_mel_bins'],
    num_spectrogram_bins,
    downsampling_rate,
    PREPROCESSING_ARGS['lower_frequency'],
    PREPROCESSING_ARGS['upper_frequency']
)

##### Load the TFLite model

In [17]:
interpreter = tf.lite.Interpreter(model_path=f'tflite_models/{MODEL_NAME}.tflite')
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print("Number of inputs:", len(input_details))
print("Number of outputs:", len(output_details))
print("Input name:", input_details[0]['name'])
print("Input shape:", input_details[0]['shape'])
print("Output name:", output_details[0]['name'])
print("Output shape:", output_details[0]['shape'])

Number of inputs: 1
Number of outputs: 1
Input name: serving_default_input_1:0
Input shape: [ 1 32 32  1]
Output name: StatefulPartitionedCall:0
Output shape: [1 2]


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


##### Test the TFLite model

In [18]:
filenames = glob('msc-test/go*') + glob('msc-test/stop*')


avg_preprocessing_latency = 0.0
avg_model_latency = 0.0
latencies = []
accuracy = 0.0

for filename in filenames:
    audio_binary = tf.io.read_file(filename)
    path_parts = tf.strings.split(filename, '/')
    path_end = path_parts[-1]
    file_parts = tf.strings.split(path_end, '_')
    true_label = file_parts[0]
    true_label = true_label.numpy().decode()
    
    start_preprocess = time()
    audio, sampling_rate = tf.audio.decode_wav(audio_binary) 
    audio = tf.squeeze(audio)
    zero_padding = tf.zeros(sampling_rate - tf.shape(audio), dtype=tf.float32)
    audio_padded = tf.concat([audio, zero_padding], axis=0)

    if downsampling_rate != sampling_rate:
        audio_padded = tfio.audio.resample(audio_padded, sampling_rate_int64, downsampling_rate)

    stft = tf.signal.stft(
        audio_padded, 
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=frame_length
    )
    spectrogram = tf.abs(stft)

    mel_spectrogram = tf.matmul(spectrogram, linear_to_mel_weight_matrix)
    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1.e-6)
    log_mel_spectrogram = tf.expand_dims(log_mel_spectrogram, 0)
    log_mel_spectrogram = tf.expand_dims(log_mel_spectrogram, -1)
    end_preprocess = time()
    
    #log_mel_spectrogram ha shape (1,49,40,1), ma dovrebbe essere (1,32,32,1)
    interpreter.set_tensor(input_details[0]['index'], log_mel_spectrogram) 
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])

    end_inference = time()

    top_index = np.argmax(output[0])
    predicted_label = LABELS[top_index]

    accuracy += true_label == predicted_label
    avg_preprocessing_latency += end_preprocess - start_preprocess
    avg_model_latency += end_inference - end_preprocess
    latencies.append(end_inference - start_preprocess) 

ValueError: Cannot set tensor: Dimension mismatch. Got 49 but expected 32 for dimension 1 of input 0.

## Results

In [None]:
accuracy = len(filenames)
avg_preprocessing_latency = len(filenames)
avg_model_latency = len(filenames)
median_total_latency = np.median(latencies)

import os

model_size = os.path.getsize(f'tflite_models/{MODEL_NAME}.tflite')

In [None]:
print(f'Accuracy: {100 * accuracy:.3f}%')
print(f'Model size: {model_size / 2 ** 10:.1f}KB')
print(f'Preprocessing Latency: {1000 * avg_preprocessing_latency:.1f}ms')
print(f'Model Latency: {1000 * avg_model_latency:.1f}ms')
print(f'Total Latency: {1000 * median_total_latency:.1f}ms')