In [1]:
from pathlib import Path
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import StandardScaler

In [2]:
# Working directories, all relative to the notebook's location:
# input features, saved embeddings, model weights and label arrays.
featdir, embdir, weightsdir, lbldir = (
    Path(dirname)
    for dirname in ('audio_features', 'embeddings', 'weights', 'labels')
)

In [3]:
# Number of leading feature columns kept from each audio feature matrix.
NFEATURES = 207
# Dimensionality of the embedding produced by the clusterer. Reducing it below
# NFEATURES was tried, but it made the clustering much more difficult, so for
# now the embedding keeps the input size (207 features --> 207-D embedding).
NEMBEDD = 207

# Seed the global NumPy and TensorFlow generators so that the stochastic parts
# of the notebook (weight initialisation, dropout, batch shuffling) are as
# repeatable as the backend allows across "Restart & Run All".
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Load Features

In [4]:
# Load the audio feature matrix of each split, keeping the first NFEATURES columns.
tr_audio = np.load(featdir / 'tr_audio_features.npy')[:, :NFEATURES]
tst_audio = np.load(featdir / 'tst_audio_features.npy')[:, :NFEATURES]
val_audio = np.load(featdir / 'val_audio_features.npy')[:, :NFEATURES]

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test and validation splits.
ss = StandardScaler()
tr_audio = ss.fit_transform(tr_audio)
tst_audio = ss.transform(tst_audio)
val_audio = ss.transform(val_audio)

# Labels for each split (assumed to be non-negative integer class IDs, as
# required by to_categorical -- TODO confirm against the label files).
tr_lbl_audio = np.load(lbldir / 'tr_lbl.npy')
tst_lbl_audio = np.load(lbldir / 'tst_lbl.npy')
val_lbl_audio = np.load(lbldir / 'val_lbl.npy')

# to_categorical infers the one-hot width from the largest label it is given.
# Use one shared width for all splits so that a split which happens to lack
# the highest class ID does not end up with a narrower one-hot matrix.
n_classes = int(max(tr_lbl_audio.max(), tst_lbl_audio.max(), val_lbl_audio.max())) + 1

tr_onehot_audio = to_categorical(tr_lbl_audio, num_classes=n_classes)
tst_onehot_audio = to_categorical(tst_lbl_audio, num_classes=n_classes)
val_onehot_audio = to_categorical(val_lbl_audio, num_classes=n_classes)

## Define Dense Clusterer

In [5]:
def get_clusterer():
    """Build the dense network that maps audio features to embeddings.

    The model takes ``NFEATURES``-wide inputs and produces ``NEMBEDD``-wide
    outputs through two ReLU ``Dense`` layers separated by 20 % dropout.

    Returns:
        An uncompiled ``tf.keras.models.Sequential`` model.
    """
    model = tf.keras.models.Sequential()

    # The first layer consumes the feature vectors, so its input width must be
    # NFEATURES rather than NEMBEDD; the two only coincide while both are 207.
    model.add(Dense(NEMBEDD, input_shape=(NFEATURES,), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(NEMBEDD, activation='relu'))

    return model

In [6]:
clusterer = get_clusterer()
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line after the table).
clusterer.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 207)               43056     
                                                                 
 dropout (Dropout)           (None, 207)               0         
                                                                 
 dense_1 (Dense)             (None, 207)               43056     
                                                                 
Total params: 86,112
Trainable params: 86,112
Non-trainable params: 0
_________________________________________________________________
None


2022-05-31 14:21:13.405829: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Training

In [7]:
# Set to True to fit the clusterer from scratch; otherwise stored weights are loaded.
trainclust = False
if not trainclust:
    # Reuse the weights stored on disk instead of training.
    clusterer.load_weights(weightsdir / 'audio_clusterer_trimmed.h5')
else:
    # tensorflow_addons is only needed on the training path.
    import tensorflow_addons as tfa

    # The triplet loss is fed the raw labels, not the one-hot encoding.
    train = (tr_audio, tr_lbl_audio)
    val = (val_audio, val_lbl_audio)

    loss = tfa.losses.TripletSemiHardLoss(margin=2.0)
    clusterer.compile(
        loss=loss,
        optimizer=tf.keras.optimizers.Adam(0.005))

    history = clusterer.fit(
        x=train[0],
        y=train[1],
        epochs=10,
        batch_size=500,
        validation_data=val)

## Compute Embeddings from Dense Clusterer

In [8]:
# Run the clusterer in inference mode over each split. predict() processes the
# data in batches and returns NumPy arrays, which avoids pushing a whole split
# through the network as one giant batch and keeps the saved files plain arrays.
EMBED_BATCH_SIZE = 1024
tr_out = clusterer.predict(tr_audio, batch_size=EMBED_BATCH_SIZE, verbose=0)
val_out = clusterer.predict(val_audio, batch_size=EMBED_BATCH_SIZE, verbose=0)
tst_out = clusterer.predict(tst_audio, batch_size=EMBED_BATCH_SIZE, verbose=0)

# np.save does not create missing directories, so make sure the target exists.
embdir.mkdir(parents=True, exist_ok=True)

np.save(embdir / 'embeddings_val_audio.npy', val_out)
np.save(embdir / 'embeddings_test_audio.npy', tst_out)
np.save(embdir / 'embeddings_train_audio.npy', tr_out)

## Classifying the Embeddings using a Dense Network

In [9]:
# Training / validation pairs for the classifier: clusterer embeddings as
# inputs, one-hot encoded labels as targets.
train = (tr_out, tr_onehot_audio)
val = (val_out, val_onehot_audio)

def get_classifier(n_classes=49):
    """Build the dense classifier that maps embeddings to class probabilities.

    Architecture: Dense(NEMBEDD) -> Dropout(0.5) -> Dense(128) -> Dropout(0.6)
    -> Dense(128) -> Dropout(0.7) -> Dense(n_classes, softmax). All hidden
    layers use ReLU.

    Args:
        n_classes: Number of output units of the softmax layer. Defaults to
            49, the value that was previously hard-coded.

    Returns:
        An uncompiled ``tf.keras.models.Sequential`` model.
    """
    model = tf.keras.models.Sequential()

    # Only the first layer needs an explicit input shape; the following layers
    # infer theirs from the previous layer's output.
    model.add(Dense(NEMBEDD, input_shape=(NEMBEDD,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.6))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.7))
    model.add(Dense(n_classes, activation='softmax'))

    return model

classifier = get_classifier()

#### Training

In [10]:
loss = tf.keras.losses.CategoricalCrossentropy()
# Set to False to load stored classifier weights instead of training.
trainclass = True
if not trainclass:
    classifier.load_weights(weightsdir / 'audio_classifier.h5')
else:
    # Configure the model for training with a default Adam optimizer.
    classifier.compile(
        loss=loss,
        metrics=['accuracy'],
        optimizer=tf.keras.optimizers.Adam())

    # Fit on the (embeddings, one-hot labels) pair, validating on `val`.
    history = classifier.fit(
        x=train[0],
        y=train[1],
        epochs=400,
        batch_size=1000,
        validation_data=val)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

## Testing

In [11]:
# Predicted class = index of the largest classifier output for each test embedding.
pred = np.argmax(classifier(tst_out), axis=1)
# Flatten the labels before comparing: a column vector of shape (N, 1) would
# otherwise broadcast against pred (N,) into an N x N matrix and silently
# produce a wrong accuracy.
true_lbl = np.ravel(tst_lbl_audio)
print(f"Test accuracy : {np.mean(pred == true_lbl)*100:.01f}%")

Test accuracy : 77.4%
