# MoCo

Using the torch backend with GPU device may result in some strange error: dataloader seems to generate elements of random size.

In [1]:
# %xmode minimal

from pathlib import Path
import os

# os.environ["KERAS_BACKEND"] = "jax"
os.environ["KERAS_BACKEND"] = "torch"
# os.environ["KERAS_BACKEND"] = "tensorflow"

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # disable GPU devices
os.environ["TFDS_DATA_DIR"] = os.path.expanduser("~/tensorflow_datasets")  # default location of tfds database

# Turn off logging for TF
import logging
logging.basicConfig(level=logging.INFO) 
# logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow_datasets as tfds
tf.get_logger().setLevel(logging.ERROR)

# import librosa
# import librosa.display
# from IPython.display import Audio

import numpy as np
from matplotlib import pyplot as plt

# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

import keras
from keras import layers, models, ops, losses, metrics
# from keras.applications import resnet, vgg16

# tf.config.experimental_run_functions_eagerly(True)
# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

# import torch
# torch.autograd.set_detect_anomaly(True)

In [2]:
import dpmhm
# dpmhm.datasets.get_dataset_list()

from dpmhm.datasets import preprocessing, transformer, feature, utils, spectral_window_pipeline, spectral_pipeline
from dpmhm.models import simclr

workdir = Path(os.path.expanduser("~/tmp/dpmhm/MoCo"))
os.makedirs(workdir, exist_ok=True)

dbdir = Path(os.path.expanduser('~/Projects/HIASCI/Data/MetaTwins'))
os.makedirs(dbdir, exist_ok=True)

# dpmhm.datasets.query_parameters('Paderborn')

## Build a meta-dataset

First we load several datasets and extract spectrogram patches of fixed dimension `(128,128)`. This dimension is large enough to accomodate the random crop of shape `(64,64)` that will be created later by spec-augmentation. Also we skip the resampling step which will considerable slow down the loading of datasets, and take randomly the first `Nmax` elements to reduce the size of the final dataset.

In [None]:
n_fft = 1024  # number of frequency bins
Nmax = 2500  # maximum number of elements per dataset
shuffle_size = 10000

ds_all = {}

foo = dpmhm.datasets.spectral_window_pipeline('DIRG', 51200, split='variation', channels=[], n_fft=n_fft)
# ds_all['DIRG'] = foo
ds_all['DIRG'] = utils.restore_cardinality(foo.shuffle(shuffle_size).take(Nmax))

foo = dpmhm.datasets.spectral_window_pipeline('Paderborn', 64000, split='healthy[:10%]+artificial[:10%]', channels=['vibration'], n_fft=n_fft)
# ds_all['Paderborn'] = foo
ds_all['Paderborn'] = utils.restore_cardinality(foo.shuffle(shuffle_size).take(Nmax))

foo = dpmhm.datasets.spectral_window_pipeline('Ottawa', 200000, split='all', channels=[], n_fft=n_fft)
# ds_all['Ottawa'] = foo
ds_all['Ottawa'] = utils.restore_cardinality(foo.shuffle(shuffle_size).take(Nmax))

foo = dpmhm.datasets.spectral_window_pipeline('Phmap2021', 10544, split='train[:50%]', channels=[], n_fft=n_fft)
# ds_all['Phmap2021'] = foo
ds_all['Phmap2021'] = utils.restore_cardinality(foo.shuffle(shuffle_size).take(Nmax))

# eles = list(foo.take(10).as_numpy_iterator())

# ds_size = 0
# for k, foo in ds_all.items():
#     print(k, int(foo.cardinality()))
#     ds_size += int(foo.cardinality())

INFO:absl:Load dataset info from /home/han/tensorflow_datasets/dirg/1.0.0
INFO:absl:Reusing dataset dirg (/home/han/tensorflow_datasets/dirg/1.0.0)
INFO:absl:Creating a tf.data.Dataset reading 16 files located in folders: /home/han/tensorflow_datasets/dirg/1.0.0.
2024-07-25 10:38:36.273431: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
INFO:absl:Constructing tf.data.Dataset dirg for split variation, from /home/han/tensorflow_datasets/dirg/1.0.0


In [None]:
# Serialization for better performance.
# Note that this should be done before the random augmentation.

ds1, ds2, ds3, ds4 = ds_all['DIRG'], ds_all['Paderborn'], ds_all['Ottawa'], ds_all['Phmap2021']

ds0 = ds1.concatenate(ds2).concatenate(ds3).concatenate(ds4)

ds0.save(str(dbdir/'dataset'))

In [3]:
ds0 = tf.data.Dataset.load(str(dbdir/'dataset'))

window_shape = (64, 64)

dt = transformer.SpecAugmentTwins(
    ds0,
    output_shape=window_shape,
    crop_kwargs={'prob':0.5},
    flip_kwargs={'axis':-1, 'prob':0.5},
    blur_kwargs={'sigma':1., 'prob':0.},
    fade_kwargs={'prob':0},
).dataset.map(
    lambda y1, y2: (tf.transpose(y1, perm=(1,2,0)), tf.transpose(y2, perm=(1,2,0)))  # to channel-last
)

ds = tf.data.Dataset.zip(dt, utils.constant_dataset())

# ds = utils.restore_cardinality(ds, ds_size)
input_shape = ds.element_spec[0][0].shape

ds_size = int(dt.cardinality())
ds.element_spec

2024-07-25 15:11:09.406567: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


((TensorSpec(shape=(64, 64, 1), dtype=tf.float32, name=None),
  TensorSpec(shape=(64, 64, 1), dtype=tf.float32, name=None)),
 TensorSpec(shape=(), dtype=tf.float32, name=None))

In [4]:
# # Serialization may disable the random augmentation

# ds.save(str(workdir/'simclr_dataset'))
# ds = tf.data.Dataset.load(str(workdir/'simclr_dataset'))

## Base MoCo model


In [5]:
batch_size = 32
shuffle_size = 10000

# ds_train = ds.batch(batch_size, drop_remainder=True)
ds_train = ds.shuffle(shuffle_size, reshuffle_each_iteration=True)\
    .repeat()\
    .batch(batch_size, drop_remainder=True)\
    .prefetch(tf.data.AUTOTUNE)

ds_train.element_spec

((TensorSpec(shape=(32, 64, 64, 1), dtype=tf.float32, name=None),
  TensorSpec(shape=(32, 64, 64, 1), dtype=tf.float32, name=None)),
 TensorSpec(shape=(32,), dtype=tf.float32, name=None))

In [6]:
# encoder_kwargs = dict(include_top=False, weights='imagenet', pooling='avg')
encoder_kwargs = dict(include_top=False, weights=None, pooling='avg')

model = dpmhm.models.MoCo(
    input_shape, 
    name='ResNet50', tau=0.1, 
    memsize=100*32, encoder_kwargs=encoder_kwargs
)

model._encoder.trainable = True

model.compile(
    optimizer=keras.optimizers.Adam(),
)
cb_ema = dpmhm.models.MoCo_Callback()

model.summary()

# # Manually build the model, most of time not necessary
# eles = list(ds_train.take(1))
# model(eles[0][0])

In [7]:
hh = model.fit(ds_train,
               callbacks=[cb_ema],
               steps_per_epoch=ds_size // batch_size,
               epochs=100)

model.save(workdir/'moco_base.keras')

Epoch 1/100


INFO:dpmhm.models.ssl.moco:Create EMA instance...


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 1s/step - loss: 205.4393
Epoch 2/100
[1m179/312[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m2:19[0m 1s/step - loss: 226.0592

KeyboardInterrupt: 

In [None]:
fig, ax = plt.subplots()
ax.plot(model.history.history['loss'])
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')

fig.savefig(workdir/'convergence.png')

From the trained model, we extract the feature transformation part which includes the base encoder and the first two dense layers of the projection head. 

In [9]:
x = layers.Input(input_shape)

# same same
# _proj = models.Model(inputs=model._projector.inputs, outputs=model._projector.layers[3].output)
_proj = models.Model(inputs=model._projector.layers[0].input, outputs=model._projector.layers[3].output)

# _proj.summary()  # shows a concrete value for batch

f = _proj(model._encoder(x))

model_feature = models.Model(inputs=x, outputs=f, name='Moco_feature')

model_feature.summary()  # shows `None` for batch

# model_feature.save(str(workdir/'simclr_feature.keras'))

## Transfer Learning

In [12]:
foo, full_labels_dict = dpmhm.datasets.spectral_window_pipeline(
    'CWRU', 12000, split='all', 
    channels=[], 
    keys=['FaultLocation', 'FaultComponent', 'FaultSize'], 
    n_fft=1024, ws=64, labels=True
)

labels = list(full_labels_dict.keys())  
n_classes = len(labels) + 1

INFO:absl:Load dataset info from /home/han/tensorflow_datasets/cwru/1.0.0
INFO:absl:Reusing dataset cwru (/home/han/tensorflow_datasets/cwru/1.0.0)
INFO:absl:Creating a tf.data.Dataset reading 4 files located in folders: /home/han/tensorflow_datasets/cwru/1.0.0.
INFO:absl:Constructing tf.data.Dataset cwru for split all, from /home/han/tensorflow_datasets/cwru/1.0.0


In [13]:
preproc = preprocessing.get_mapping_supervised(labels)

dw = utils.restore_cardinality(
    utils.restore_shape(
        foo.map(preproc, num_parallel_calls=tf.data.AUTOTUNE),
        key=0
    )
)

dw_size = int(dw.cardinality())

### Supervised training of the classification head

We add a classification head to the feature transformation network and fine tune the model on some new data.

In [15]:
splits = {'train':0.7, 'val':0.2, 'test':0.1}
batch_size = 32

dw_split = utils.split_dataset(
    dw, splits, 
    ds_size=dw_size, 
    # labels=np.arange(n_classes)
)

dw_train = dw_split['train']\
    .shuffle(dw_size, reshuffle_each_iteration=True)\
    .repeat()\
    .batch(batch_size, drop_remainder=True)\
    .prefetch(tf.data.AUTOTUNE)
dw_val = dw_split['val'].batch(batch_size, drop_remainder=True)
dw_test = dw_split['test'].batch(batch_size, drop_remainder=True)

The classification head here is a simple MLP. The weights of the feature transformation network are frozen for the training.

In [16]:
model_feature.trainable = False

class_head = models.Sequential([
    layers.Dense(128, activation='relu'),
    # layers.BatchNormalization(),
    layers.Dense(n_classes, activation=None) # nb labels
], name='Classification_head')

x = layers.Input(input_shape)

model_fine = models.Model(inputs=x, outputs=class_head(model_feature(x)))

model_fine.compile(
    optimizer=keras.optimizers.Adam(),
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[metrics.SparseCategoricalAccuracy()]
)

In [None]:
hh = model_fine.fit(
    dw_train,
    validation_data=dw_val,
    epochs=10
)

Epoch 1/10
    710/Unknown [1m116s[0m 125ms/step - loss: 3.2900 - sparse_categorical_accuracy: 0.0719

In [None]:
model_fine.evaluate(dw_test)

#### Fine tuning

In [None]:
model_feature.trainable = True

model_fine.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[metrics.SparseCategoricalAccuracy()]
)

hh = model_fine.fit(
    dw_train,
    validation_data=dw_val,
    epochs=2
)

model_fine.evaluate(dw_test)

# EOF

In [32]:
X = keras.random.normal((32,256))
Y = keras.random.normal((32,256))
# K = keras.random.normal((1, 32,256))
K = ops.expand_dims(Y, 0)
tau = 0.1
axis = -1
S = losses.cosine_similarity(X, Y, axis=axis) / tau  # X and Y have the same dimension, no need for broadcast. Result has shape `batch`.
N = losses.cosine_similarity(X, K, axis=axis) /  tau  # has shape `(memlen, batch)`
M = ops.transpose(ops.vstack([ops.expand_dims(S,0), N]))
L = losses.sparse_categorical_crossentropy(
    ops.zeros_like(S),  # batch size
    M,  # has shape `(batch, memlen+1)`
    from_logits=True
)
L
# -ops.sum(L)

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472,
       0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472,
       0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472,
       0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472,
       0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472, 0.6931472,
       0.6931472, 0.6931472], dtype=float32)>