# Supervised Representation Learning 

In this tutorial we show how to perform supervised representation learning using a VGGish Network.

In [None]:
%xmode minimal

import os
import json

# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # disable GPU devices
os.environ["TFDS_DATA_DIR"] = os.path.expanduser("~/tensorflow_datasets")  # default location of tfds database

import os
# os.environ["KERAS_BACKEND"] = "tensorflow"
os.environ["KERAS_BACKEND"] = "jax"

import keras
from keras import layers, models

import tensorflow as tf
import tensorflow_datasets as tfds

import librosa
import librosa.display

import numpy as np
from matplotlib import pyplot as plt

from pathlib import Path

from IPython.display import Audio

# Turn off logging for TF
import logging
tf.get_logger().setLevel(logging.ERROR)

# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

In [None]:
import dpmhm
# dpmhm.datasets.get_dataset_list()

from dpmhm.datasets import preprocessing, feature, utils, transformer, query_parameters

In [None]:
outdir = Path('/home/han/tmp/vggish/paderborn')
os.makedirs(outdir, exist_ok=True)

## Load dataset

In [None]:
ds0, ds_info = tfds.load(
    'Paderborn',
    split='healthy+artificial',
    # split=['healthy', 'artificial', 'lifetime'],
    with_info=True,
)

ds_parms = query_parameters('Paderborn')
print(ds_parms)

In [None]:
eles = list(ds0.take(10).as_numpy_iterator())

In [None]:
for k,v in eles[0]['signal'].items():
    print(k, v.shape)

### Preprocessing pipeline

In [None]:
compactor = transformer.DatasetCompactor(
    ds0,
    # channels=list(ds_parms['signal'].keys()), # select all channels simultaneously
    channels=['vibration', 'current'],
    # channels=[],
    keys=list(ds_parms['keys'].keys()),    
    # resampling_rate=64000,
    # split=True,  # split multidimensional signals into 1d signals, incompatible with the pretrained VGGish model
)

# Feature extractor
# Spectrogram is computed on a time window of 0.025 second every 0.0125 second, then converted to decibel scale.
_func = lambda x, sr: feature.spectral_features(
    x, sr, 'spectrogram',
    # n_mfcc=256,
    time_window=0.025, hop_step=0.0125, 
    # n_fft=512,
    normalize=False, to_db=True)[0]

extractor = transformer.FeatureExtractor(compactor.dataset, _func)

# A window of width w correspond to w*0.0125 seconds
window = transformer.WindowSlider(extractor.dataset, window_size=(64,64), hop_size=(32,32))
# window = transformer.WindowSlider(extractor.dataset, window_size=(256, 80), hop_size=40)  # 1s, full bandwidth
# window = transformer.WindowSlider(extractor.dataset, window_size=64, hop_size=32)

# compactor.dataset.element_spec

Print the dictionary of all labels

In [None]:
labels = list(compactor.full_label_dict.keys())  # need the whole list of labels
print("Number of classes:", len(labels))

compactor.full_label_dict
# compactor.label_dict

Here's how the spectrogram patches look like:

In [None]:
eles = list(window.dataset.take(10).as_numpy_iterator())

fig, axes = plt.subplots(1,2,figsize=(10,5))

axes[0].matshow(eles[0]['feature'][0])
axes[1].matshow(eles[9]['feature'][0])

As the last step of preprocessing, apply a preprocessing mapping to transform the dataset to the tuple `(feature, label)` compatible for supervised learning, with the field `feature` in the channel-last format. Finally the preprocessed dataset can be exported to disk for reuse.

In [None]:
preproc = preprocessing.get_mapping_supervised(labels)

with open(outdir/'labels.json', 'w') as fp:
    json.dump(compactor.full_label_dict,fp)
    
ds_window = utils.restore_shape(
    window.dataset.map(preproc, num_parallel_calls=tf.data.AUTOTUNE)
)
ds_size = utils.get_dataset_size(ds_window)

# ds_window.save(str(outdir/'dataset'))

In [22]:
ds_size

401574

### Load the preprocessed dataset

The preprocessed dataset can be loaded from the disk.

In [41]:
ds_window = tf.data.Dataset.load(str(outdir/'dataset'))
ds_size = ds_window.cardinality()

with open(outdir/'labels.json', 'r') as fp:
    labels = list(json.load(fp).keys())

In [10]:
print(f"Total number of elements: {ds_window.cardinality()}")
ds_window.element_spec

Total number of elements: -2


(TensorSpec(shape=(64, 64, 3), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int32, name=None))

## Train a VGGish network

Note that the Keras preprocessing model included an outlier class (of label `0`) in the final dataset, we must accordingly increase the number of classes by 1.

In [25]:
# eles = list(ds_window.take(1).as_numpy_iterator())
# input_shape = eles[0][0].shape
input_shape = ds_window.element_spec[0].shape

n_classes = len(labels)+1  # must add one for the outlier class, Keras uses zero-based class labels.
print(f"Number of class: {n_classes}")

splits = {'train':0.7, 'val':0.2, 'test':0.1}
ds_split = utils.split_dataset(ds_window, splits, ds_size=ds_size)

Number of class: 10


Create the training/validation/test set with mini-batches. Notice that the method `.batch()` increases by 1 the rank of the dataset: it is the dimension of the mini-batch.

In [26]:
batch_size = 32

ds_train = ds_split['train'].shuffle(ds_size, reshuffle_each_iteration=True).batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
ds_val = ds_split['val'].batch(batch_size, drop_remainder=True)
ds_test = ds_split['test'].batch(1, drop_remainder=True)

ds_train.element_spec

(TensorSpec(shape=(32, 64, 64, 3), dtype=tf.float32, name=None),
 TensorSpec(shape=(32,), dtype=tf.int32, name=None))

### Implementation 1: Keras pretrained model with transfer learning

Keras comes with a VGG16 model pretrained on Imagenet that can be reused for feature embedding. We load and freeze the pretrained weights and perform transfer learning only on final classification layers.

In [27]:
from keras.applications import VGG16

base_model = VGG16(include_top=False, weights='imagenet', input_shape=input_shape, classes=n_classes)

base_model.trainable = False

In [28]:
x = layers.Input(input_shape)

adapt_model = models.Sequential([
    layers.Flatten(name="flatten"),
    layers.Dense(4096, activation="relu", name="fc1"),
    layers.Dense(4096, activation="relu", name="fc2"),
    layers.Dense(n_classes, activation=None, name="predictions")
])

y = adapt_model(base_model(x))

# Equivalent:
# x = base_model(x)
# x = layers.Flatten(name="flatten")(x)
# x = layers.Dense(4096, activation="relu", name="fc1")(x)
# x = layers.Dense(4096, activation="relu", name="fc2")(x)
# y = layers.Dense(n_classes, activation=None, name="predictions")(x)

In [29]:
model = models.Model(x, y)

from_logits = 'softmax' not in str(model.layers[-1].get_layer('predictions').activation)

model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits),
    metrics=['accuracy'],
)

In [None]:
history = model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=5,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=3),
)

2024-06-13 23:56:21.347139: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1/5


In [15]:
model.evaluate(ds_test)

[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.9777 - loss: 0.0619


[0.05482785403728485, 0.9822086095809937]

#### Fine tuning

After training, we can perform several steps of fine tuning with a small learning rate.

In [18]:
base_model.trainable = True

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits),
    metrics=['accuracy'],
)

In [19]:
history = model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=2,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=3),
)

Epoch 1/2
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 946ms/step - accuracy: 0.9744 - loss: 0.0925 - val_accuracy: 0.9957 - val_loss: 0.0106
Epoch 2/2
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 940ms/step - accuracy: 0.9939 - loss: 0.0201 - val_accuracy: 0.9932 - val_loss: 0.0199


In [20]:
model.evaluate(ds_test)

[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.9941 - loss: 0.0338


[0.027062658220529556, 0.9926380515098572]

In [17]:
model.save(str(outdir / 'vggish_model_transfer.keras'))

### Implementation 2: from scratch

We provide an implementation of the VGGish network. Unlike the Keras implementation, weights are not pretrained here so the training may be longer. 

In [49]:
from dpmhm.models.sl import vggish

config = vggish.Config(input_shape=input_shape, n_embedding=128, n_classes=n_classes, padding='same', activation='relu',)
model = vggish.VGG11(config)

print(config)

model.summary()

# l=model.get_layer('classifier')

Config(input_shape=(64, 64, 1), batch_size=256, epochs=100, training_steps=1000, n_classes=30, n_embedding=128, kernel_size=(3, 3), activation='relu', activation_classifier=None, padding='same', pool_size=(2, 2), strides=(2, 2))


In [50]:
from_logits = 'softmax' not in str(model.get_layer('classifier').activation)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=from_logits),
    metrics=['accuracy'],
)

In [51]:
history = model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=5,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=3),
)


Epoch 1/5
[1m1068/1069[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 623ms/step - accuracy: 0.0490 - loss: 4.1578

2024-06-13 22:31:21.488156: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[1m1069/1069[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m702s[0m 655ms/step - accuracy: 0.0491 - loss: 4.1562 - val_accuracy: 0.1702 - val_loss: 2.6486
Epoch 2/5


2024-06-13 22:31:56.031446: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


[1m  19/1069[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10:47[0m 617ms/step - accuracy: 0.1712 - loss: 2.7256

KeyboardInterrupt: 

In [25]:
history = model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=5,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=3),
)


Epoch 1/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 537ms/step - accuracy: 0.9153 - loss: 0.3731 - val_accuracy: 0.9368 - val_loss: 0.2900
Epoch 2/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 532ms/step - accuracy: 0.9170 - loss: 0.4103 - val_accuracy: 0.9810 - val_loss: 0.0912
Epoch 3/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 537ms/step - accuracy: 0.9695 - loss: 0.1564 - val_accuracy: 0.8846 - val_loss: 0.4811
Epoch 4/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 528ms/step - accuracy: 0.8788 - loss: 0.5810 - val_accuracy: 0.9712 - val_loss: 0.1216
Epoch 5/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 526ms/step - accuracy: 0.9617 - loss: 0.1724 - val_accuracy: 0.9537 - val_loss: 0.2963
Epoch 5: early stopping


In [26]:
model.evaluate(ds_test)

[1m1630/1630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 10ms/step - accuracy: 0.9563 - loss: 0.3218


[0.26855865120887756, 0.9527607560157776]

In [27]:
model.save(str(outdir / 'vggish_model_scratch.keras'))

# EOF