## Training with spectrogram

In [100]:
!pip install tensorflow
!pip install tensorflow_io
!pip install numpy



In [101]:
import tensorflow as tf
import tensorflow_io as tfio
from functools import partial
from glob import glob
from time import time
import numpy as np
import os

In [102]:
LABELS = ["go", "stop"]

In [103]:
train_ds = tf.data.Dataset.list_files(["msc-train/go*", "msc-train/stop*"])
test_ds = tf.data.Dataset.list_files(["msc-test/go*", "msc-test/stop*"])
val_ds = tf.data.Dataset.list_files(["msc-val/go*", "msc-val/stop*"])

### Hyper parameters-to be tuned

In [104]:
# PREPROCESSING_ARGS = {
#     'downsampling_rate': 16000,
#     'frame_length_in_s': 0.04,
#     'frame_step_in_s': 0.02,
#     # AGGIUNGI ROBA
# }

PREPROCESSING_ARGS = {
    'downsampling_rate': 16000,
    'frame_length_in_s': 0.04,
    'frame_step_in_s': 0.02,
    'num_mel_bins': 40,
    'lower_frequency': 20,
    'upper_frequency': 4000,
}

TRAINING_ARGS = {
    'batch_size': 20,
    'initial_learning_rate': 0.01,
    'end_learning_rate': 1.e-5,
    'epochs': 10
}

In [105]:
# def get_audio_and_label(filename):
#     audio_binary = tf.io.read_file(filename)
#     audio, sampling_rate = tf.audio.decode_wav(audio_binary) 

#     path_parts = tf.strings.split(filename, '/')
#     path_end = path_parts[-1]
#     file_parts = tf.strings.split(path_end, '_')
#     label = file_parts[0]

#     audio = tf.squeeze(audio)
#     zero_padding = tf.zeros(sampling_rate - tf.shape(audio), dtype=tf.float32)
#     audio_padded = tf.concat([audio, zero_padding], axis=0)

#     return audio_padded, sampling_rate, label


# def get_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s):
#     audio_padded, sampling_rate, label = get_audio_and_label(filename)
    
#     if downsampling_rate != sampling_rate:
#         sampling_rate_int64 = tf.cast(sampling_rate, tf.int64)
#         audio_padded = tfio.audio.resample(audio_padded, sampling_rate_int64, downsampling_rate)

#     sampling_rate_float32 = tf.cast(downsampling_rate, tf.float32)
#     frame_length = int(frame_length_in_s * sampling_rate_float32)
#     frame_step = int(frame_step_in_s * sampling_rate_float32)

#     spectrogram = stft = tf.signal.stft(
#         audio_padded, 
#         frame_length=frame_length,
#         frame_step=frame_step,
#         fft_length=frame_length
#     )
#     spectrogram = tf.abs(stft)

#     return spectrogram, downsampling_rate, label

# def get_spectrogram_and_label(filename, downsampling_rate, frame_length_in_s, frame_step_in_s):
#     spectrogram, sampling_rate, label = get_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s)
    
#     return spectrogram, label

# get_frozen_spectrogram = partial(get_spectrogram_and_label, **PREPROCESSING_ARGS)

# for spectrogram, label in train_ds.map(get_frozen_spectrogram).take(1):
#     SHAPE = spectrogram.shape

# def preprocess(filename):
#     signal, label = get_frozen_spectrogram(filename)

#     signal.set_shape(SHAPE)
#     signal = tf.expand_dims(signal, -1)
#     signal = tf.image.resize(signal, [32, 32])

#     label_id = tf.argmax(label == LABELS)

#     return signal, label_id

In [106]:
def get_audio_and_label(filename):
    audio_binary = tf.io.read_file(filename)
    audio, sampling_rate = tf.audio.decode_wav(audio_binary) 

    path_parts = tf.strings.split(filename, '/')
    path_end = path_parts[-1]
    file_parts = tf.strings.split(path_end, '_')
    label = file_parts[0]

    audio = tf.squeeze(audio)
    zero_padding = tf.zeros(sampling_rate - tf.shape(audio), dtype=tf.float32)
    audio_padded = tf.concat([audio, zero_padding], axis=0)

    return audio_padded, sampling_rate, label

def get_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s):
    audio_padded, sampling_rate, label = get_audio_and_label(filename)
    
    if downsampling_rate != sampling_rate:
        sampling_rate_int64 = tf.cast(sampling_rate, tf.int64)
        audio_padded = tfio.audio.resample(audio_padded, sampling_rate_int64, downsampling_rate)

    sampling_rate_float32 = tf.cast(downsampling_rate, tf.float32)
    frame_length = int(frame_length_in_s * sampling_rate_float32)
    frame_step = int(frame_step_in_s * sampling_rate_float32)

    spectrogram = stft = tf.signal.stft(
        audio_padded, 
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=frame_length
    )
    spectrogram = tf.abs(stft)

    return spectrogram, downsampling_rate, label

def get_log_mel_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s, num_mel_bins, lower_frequency, upper_frequency):
    # TODO: Write your code here
    spectrogram, sampling_rate, label = get_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s)

    sampling_rate_float32 = tf.cast(sampling_rate, tf.float32)
    frame_length = int(frame_length_in_s * sampling_rate_float32)
    num_spectrogram_bins = frame_length // 2 + 1

    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=sampling_rate,
        lower_edge_hertz=lower_frequency,
        upper_edge_hertz=upper_frequency
    )

    mel_spectrogram = tf.matmul(spectrogram, linear_to_mel_weight_matrix)

    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1.e-6)

    return log_mel_spectrogram, label

def get_log_mel_spectrogram_and_label(filename, downsampling_rate, frame_length_in_s, frame_step_in_s, num_mel_bins, lower_frequency, upper_frequency):
    log_mel_spectrogram, label = get_log_mel_spectrogram(filename, downsampling_rate, frame_length_in_s, frame_step_in_s, num_mel_bins, lower_frequency, upper_frequency)
    
    return log_mel_spectrogram, label

get_frozen_log_mel_spectrogram = partial(get_log_mel_spectrogram_and_label, **PREPROCESSING_ARGS)

for log_mel_spectrogram, label in train_ds.map(get_frozen_log_mel_spectrogram).take(1):
    SHAPE = log_mel_spectrogram.shape


def preprocess_with_mel(filename):
    signal, label = get_frozen_log_mel_spectrogram(filename)
    signal = tf.expand_dims(signal, -1)
    label_id = tf.argmax(label == LABELS)

    return signal, label_id



2022-12-06 11:57:52.421225: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:52.422342: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:52.422496: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library


In [107]:
batch_size = TRAINING_ARGS['batch_size']
epochs = TRAINING_ARGS['epochs']

# train_ds = train_ds.map(preprocess_with_mel).batch(batch_size).cache()
# val_ds = val_ds.map(preprocess_with_mel).batch(batch_size)
# test_ds = test_ds.map(preprocess_with_mel).batch(batch_size)

In [108]:
def preprocess_with_resized_mel(filename):
    signal, label = get_frozen_log_mel_spectrogram(filename)
    signal.set_shape(SHAPE)
    signal = tf.expand_dims(signal, -1)
    signal = tf.image.resize(signal, [32, 32])
    label_id = tf.argmax(label == LABELS)

    return signal, label_id

train_ds = train_ds.map(preprocess_with_resized_mel).batch(batch_size).cache()
val_ds = val_ds.map(preprocess_with_resized_mel).batch(batch_size)
test_ds = test_ds.map(preprocess_with_resized_mel).batch(batch_size)



2022-12-06 11:57:52.956452: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:52.957544: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:52.957673: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library




2022-12-06 11:57:53.188560: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:53.189804: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:53.189965: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library




2022-12-06 11:57:53.406317: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:53.407504: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library
2022-12-06 11:57:53.407648: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at functional_ops.cc:373 : INTERNAL: No function library


In [109]:
for example_batch, example_labels in train_ds.take(1):
  print('Batch Shape:', example_batch.shape)
  print('Data Shape:', example_batch.shape[1:])
  print('Labels:', example_labels)

Batch Shape: (20, 32, 32, 1)
Data Shape: (32, 32, 1)
Labels: tf.Tensor([0 0 0 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 1 1], shape=(20,), dtype=int64)


2022-12-06 11:57:53.754210: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


##### Create and train the model

In [110]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=example_batch.shape[1:]),
    tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[2, 2], use_bias=False, padding='valid'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], use_bias=False, padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(units=len(LABELS)),
    tf.keras.layers.Softmax()
])

In [111]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=False)
initial_learning_rate = TRAINING_ARGS['initial_learning_rate']
end_learning_rate = TRAINING_ARGS['end_learning_rate']

linear_decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    end_learning_rate=end_learning_rate,
    decay_steps=len(train_ds) * epochs,
)
optimizer = tf.optimizers.Adam(learning_rate=linear_decay)
metrics = [tf.metrics.SparseCategoricalAccuracy()]
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##### Test and save the model

In [112]:
test_loss, test_accuracy = model.evaluate(test_ds)



In [113]:
training_loss = history.history['loss'][-1]
training_accuracy = history.history['sparse_categorical_accuracy'][-1]
val_loss = history.history['val_loss'][-1]
val_accuracy = history.history['val_sparse_categorical_accuracy'][-1]

print(f'Training Loss: {training_loss:.4f}')
print(f'Training Accuracy: {training_accuracy*100.:.2f}%')
print()
print(f'Validation Loss: {val_loss:.4f}')
print(f'Validation Accuracy: {val_accuracy*100.:.2f}%')
print()
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy*100.:.2f}%')

Training Loss: 0.1739
Training Accuracy: 93.25%

Validation Loss: 0.2681
Validation Accuracy: 93.00%

Test Loss: 0.2074
Test Accuracy: 91.00%


In [114]:
timestamp = int(time())

saved_model_dir = f'./saved_models/{timestamp}'
if not os.path.exists(saved_model_dir):
    os.makedirs(saved_model_dir)
model.save(saved_model_dir)



INFO:tensorflow:Assets written to: ./saved_models/1670324313/assets


INFO:tensorflow:Assets written to: ./saved_models/1670324313/assets


In [115]:
############## NON RUNNARE - PER SALVARE gli iperparametri e i risultati
# import pandas as pd

# output_dict = {
#     'timestamp': timestamp,
#     **PREPROCESSING_ARGS,
#     **TRAINING_ARGS,
#     'test_accuracy': test_accuracy
# }

# df = pd.DataFrame([output_dict])

# output_path='./spectrogram_results.csv'
# df.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)

## TFLite conversion

In [116]:
MODEL_NAME = timestamp
converter = tf.lite.TFLiteConverter.from_saved_model(f'./saved_models/{MODEL_NAME}')
tflite_model = converter.convert()

2022-12-06 11:58:35.008101: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2022-12-06 11:58:35.008153: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2022-12-06 11:58:35.008385: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: ./saved_models/1670324313
2022-12-06 11:58:35.012783: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2022-12-06 11:58:35.012825: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: ./saved_models/1670324313
2022-12-06 11:58:35.025005: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2022-12-06 11:58:35.126104: I tensorflow/cc/saved_model/loader.cc:213] Running initialization op on SavedModel bundle at path: ./saved_models/1670324313
2022-12-06 11:58:35.150155: I tensorflow/cc/saved_model/loader.cc:305] SavedModel load for tags { serve }; Status

In [117]:
import os
tflite_models_dir = './tflite_models'
if not os.path.exists(tflite_models_dir):
    os.makedirs(tflite_models_dir)
tflite_model_name = os.path.join(tflite_models_dir, f'{MODEL_NAME}.tflite')
with open(tflite_model_name, 'wb') as fp:
    fp.write(tflite_model)

## Inference with log mel spectrogram

In [118]:
PREPROCESSING_ARGS = {
    'downsampling_rate': 16000,
    'frame_length_in_s': 0.04,
    'frame_step_in_s': 0.02,
    'num_mel_bins': 40,
    'lower_frequency': 20,
    'upper_frequency': 4000,
}

LABELS = ["go", "stop"]

downsampling_rate = PREPROCESSING_ARGS['downsampling_rate']
sampling_rate_int64 = tf.cast(downsampling_rate, tf.int64)
frame_length = int(downsampling_rate * PREPROCESSING_ARGS['frame_length_in_s'])
frame_step = int(downsampling_rate * PREPROCESSING_ARGS['frame_step_in_s'])
spectrogram_width = (16000 - frame_length) // frame_step + 1
num_spectrogram_bins = frame_length // 2 + 1

linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
    PREPROCESSING_ARGS['num_mel_bins'],
    num_spectrogram_bins,
    downsampling_rate,
    PREPROCESSING_ARGS['lower_frequency'],
    PREPROCESSING_ARGS['upper_frequency']
)

##### Load the TFLite model

In [119]:
interpreter = tf.lite.Interpreter(model_path=f'tflite_models/{MODEL_NAME}.tflite')
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print("Number of inputs:", len(input_details))
print("Number of outputs:", len(output_details))
print("Input name:", input_details[0]['name'])
print("Input shape:", input_details[0]['shape'])
print("Output name:", output_details[0]['name'])
print("Output shape:", output_details[0]['shape'])

Number of inputs: 1
Number of outputs: 1
Input name: serving_default_input_2:0
Input shape: [ 1 32 32  1]
Output name: StatefulPartitionedCall:0
Output shape: [1 2]


##### Test the TFLite model

In [132]:
filenames = glob('msc-test/go*') + glob('msc-test/stop*')

In [133]:
filenames = tf.data.Dataset.list_files(filenames)
# filenames = filenames.map(preprocess_with_resized_mel).batch(batch_size).cache()


In [135]:
avg_preprocessing_latency = 0.0
avg_model_latency = 0.0
latencies = []
accuracy = 0.0

for filename in filenames:
    audio_binary = tf.io.read_file(filename)
    path_parts = tf.strings.split(filename, '/')
    path_end = path_parts[-1]
    file_parts = tf.strings.split(path_end, '_')
    true_label = file_parts[0]
    true_label = true_label.numpy().decode()
    
    start_preprocess = time()
    audio, sampling_rate = tf.audio.decode_wav(audio_binary) 
    audio = tf.squeeze(audio)
    zero_padding = tf.zeros(sampling_rate - tf.shape(audio), dtype=tf.float32)
    audio_padded = tf.concat([audio, zero_padding], axis=0)

    if downsampling_rate != sampling_rate:
        audio_padded = tfio.audio.resample(audio_padded, sampling_rate_int64, downsampling_rate)

    stft = tf.signal.stft(
        audio_padded, 
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=frame_length
    )
    print(stft.shape)  ###
    spectrogram = tf.abs(stft)
    print(spectrogram.shape)   ###
    print(linear_to_mel_weight_matrix.shape)  ###
    mel_spectrogram = tf.matmul(spectrogram, linear_to_mel_weight_matrix)
    print(mel_spectrogram.shape)   ###
    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1.e-6)
    log_mel_spectrogram = tf.expand_dims(log_mel_spectrogram, 0)
    log_mel_spectrogram = tf.expand_dims(log_mel_spectrogram, -1)
    log_mel_spectrogram = tf.image.resize(log_mel_spectrogram, [32, 32])
    end_preprocess = time()
    
    #log_mel_spectrogram ha shape (1,49,40,1), ma dovrebbe essere (1,32,32,1)
    interpreter.set_tensor(input_details[0]['index'], log_mel_spectrogram) 
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])

    end_inference = time()

    top_index = np.argmax(output[0])
    predicted_label = LABELS[top_index]

    accuracy += true_label == predicted_label
    avg_preprocessing_latency += end_preprocess - start_preprocess
    avg_model_latency += end_inference - end_preprocess
    latencies.append(end_inference - start_preprocess) 

(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321, 40)
(49, 40)
(49, 321)
(49, 321)
(321,

## Results

In [138]:
accuracy /= len(filenames)
avg_preprocessing_latency /= len(filenames)
avg_model_latency /= len(filenames)
median_total_latency = np.median(latencies)

import os

model_size = os.path.getsize(f'tflite_models/{MODEL_NAME}.tflite')

In [139]:
print(f'Accuracy: {100 * accuracy:.3f}%')
print(f'Model size: {model_size / 2 ** 10:.1f}KB')
print(f'Preprocessing Latency: {1000 * avg_preprocessing_latency:.1f}ms')
print(f'Model Latency: {1000 * avg_model_latency:.1f}ms')
print(f'Total Latency: {1000 * median_total_latency:.1f}ms')

Accuracy: 100.000%
Model size: 1162.1KB
Preprocessing Latency: 1000.0ms
Model Latency: 1000.0ms
Total Latency: 8.5ms
