# Combined Classifier

In [3]:
from pathlib import Path
import numpy as np 
import tensorflow as tf

from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.utils import to_categorical

np.random.seed(42)
tf.keras.utils.set_random_seed(42)

In [4]:
embdir = Path('embeddings')
lbldir = Path('labels')

NClass = 49

# noisy: to load embeddings of a lesser quality for images (80% accuracy
# on their own)
NOISY = False

# load facenet generated encodings for testing by defining `add = '_ultimate'`
add = '' # '_ultimate'
# the embeddings of facenet have size 128:
embsize = 128 if add else 49
# (our embeddings have size 49 because of 
# the way we trained our image clusterer)

# embedding size for the audio, can vary depending on how many
# MFC coefficients and other audio features we use :
audioembsize = 207

if NOISY:
    noisy = 'noisy'
else:
    noisy = ''

## Load Embeddings

In [5]:
embeddings_val_img = np.load(embdir / f'{noisy}embeddings_val_img{add}.npy')
embeddings_test_img = np.load(embdir / f'{noisy}embeddings_test_img{add}.npy')
embeddings_train_img = np.load(embdir / f'{noisy}embeddings_train_img{add}.npy')

# dividing the audio embeddings by a manually tuned value (`norm`) to make 
# them comparable in intensity to the image embeddings.
norm = 25.
embeddings_val_audio = np.load(embdir / 'embeddings_val_audio.npy') / norm
embeddings_test_audio = np.load(embdir / 'embeddings_test_audio.npy') / norm
embeddings_train_audio = np.load(embdir / 'embeddings_train_audio.npy') / norm

tr_lbl  = np.load(lbldir / 'tr_lbl.npy')
tst_lbl = np.load(lbldir / 'tst_lbl.npy')
val_lbl = np.load(lbldir / 'val_lbl.npy')

tr_onehot = to_categorical(tr_lbl)
tst_onehot = to_categorical(tst_lbl)
val_onehot = to_categorical(val_lbl)

## Create the combined embeddings

In [6]:
embeddings_val   = np.concatenate((embeddings_val_img, 
                                   embeddings_val_audio), axis=1)

embeddings_test  = np.concatenate((embeddings_test_img, 
                                   embeddings_test_audio), axis=1)
                                
# (we do it only for val and test, as for the training set we do some 
# data augmentation, see the generator `generateTrain` below.)

### Incremented class selector used by the generator below

In [7]:
COUNT = 0
def modulo():
  global COUNT
  while True:
    if COUNT == 0:
      yield 0
    if COUNT >= NClass-1:
      COUNT = 0
      yield COUNT
    COUNT += 1

    yield COUNT

In [8]:
choicemodulo = modulo()
meta_batch_size = 2850

## Define Generator

In [9]:
def generateTrain():
    """
    here we define a generator which
    - selects a class 
    - randomly chooses an image embedding of this class
    - combines it with a randomly chosen embedding of an audio of this class. 
    (repeats it `batch_size` times.)

    Yields
    ------
    tuple of numpy arrays
        tuple[0]: labels 
        tuple[1]: embeddings 
    """
    while True:
        labels = []
        a = np.zeros((meta_batch_size, audioembsize), dtype=np.float32)
        b = np.zeros((meta_batch_size, embsize), dtype=np.float32)
        for i in range(meta_batch_size):
            cl = next(choicemodulo)
            labels.append(cl)
            alls = np.where(tr_lbl==cl)
            imgchoice = (np.random.choice(alls[0]), )
            audchoice = (np.random.choice(alls[0]), )
            imgs = embeddings_train_img[imgchoice]
            auds = embeddings_train_audio[audchoice]
            a[i, :] = auds
            b[i, :] = imgs
        yield np.array(labels), np.concatenate([b, a], axis=1)


### Putting together our validation set

In [10]:
val = (embeddings_val, val_onehot)

# Our combination classifier

In [11]:
def get_classifier():
    model = tf.keras.models.Sequential()
    
    model.add(Dense(audioembsize + embsize, 
                    input_shape=(audioembsize + embsize,),
                    activation = 'relu'))

    model.add(Dropout(0.5)) 
    model.add(Dense(NClass, activation = 'sigmoid'))
    
    print(model.summary())

    return model

In [12]:
classifier = get_classifier()

loss = tf.keras.losses.CategoricalCrossentropy()
gen = generateTrain()

# Compile the model
classifier.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=loss,
    metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               65792     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 49)                12593     
                                                                 
Total params: 78,385
Trainable params: 78,385
Non-trainable params: 0
_________________________________________________________________
None


2022-05-31 13:46:40.313787: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Training

In [13]:
# train in Ncycles cycles, each time generating new combinations
# of randomly concatenated image and audio embeddings.

Ncycles = 30

for j in range(Ncycles):
    lbls, tr = next(gen)
    lbls = to_categorical(lbls)
    train = (tr, lbls)

    epochs_per_cycle = 3
    
    history = classifier.fit(train[0], train[1],
                             validation_data=val,
                             batch_size=2000,
                             epochs=epochs_per_cycle,
                             shuffle=True, 
                             verbose=0)
    floss = classifier.history.history['accuracy'][-1]
    vloss = classifier.history.history['val_accuracy'][-1]
    print(f"accuracy: {floss*100:.01f}%. val accuracy: {100*vloss:.01f}%")

accuracy: 17.5%. val accuracy: 48.0%
accuracy: 65.5%. val accuracy: 85.8%
accuracy: 90.5%. val accuracy: 90.8%
accuracy: 96.2%. val accuracy: 91.8%
accuracy: 98.5%. val accuracy: 92.6%
accuracy: 98.9%. val accuracy: 92.9%
accuracy: 99.5%. val accuracy: 93.6%
accuracy: 99.5%. val accuracy: 94.1%
accuracy: 99.8%. val accuracy: 94.6%
accuracy: 100.0%. val accuracy: 95.1%
accuracy: 99.9%. val accuracy: 95.4%
accuracy: 100.0%. val accuracy: 95.4%
accuracy: 100.0%. val accuracy: 95.4%
accuracy: 100.0%. val accuracy: 95.6%
accuracy: 100.0%. val accuracy: 95.8%
accuracy: 100.0%. val accuracy: 95.9%
accuracy: 100.0%. val accuracy: 95.9%
accuracy: 100.0%. val accuracy: 95.9%
accuracy: 100.0%. val accuracy: 95.9%
accuracy: 100.0%. val accuracy: 95.9%
accuracy: 100.0%. val accuracy: 96.0%
accuracy: 100.0%. val accuracy: 96.0%
accuracy: 100.0%. val accuracy: 95.9%
accuracy: 100.0%. val accuracy: 96.0%
accuracy: 100.0%. val accuracy: 96.0%
accuracy: 100.0%. val accuracy: 96.0%
accuracy: 100.0%. val 

# Testing

In [14]:
pred = np.argmax(classifier(embeddings_test), axis=1)
print(f"Test accuracy : {np.sum(pred==tst_lbl)/pred.size*100:.01f}%")

Test accuracy : 95.4%
