### Model Details
Details:
- Transformer Encoders, integrates Multi-Head Attention and Position Embedding
- Main components:
  - `PositionalEmbedding`, adds position-based information to the input features
  - `Transformer Encoder`: each block applies Multi-Head Attention (with `num_heads` attention heads), followed by a simple feed-forward network that further processes the result (`ff_dim` determines the number of units in this layer)
  - `Global Pooling Layer`: `GlobalMaxPooling1D` is used, condensing the sequence into a single vector
  - `Dense Layer`, classification, followed by a `softmax` layer, output 4 class probabilities

Hyperparameters:
- `input_shape`
- `head_size`
- `num_heads`
- `ff_dim`
- `dropout`,`mlp_dropout`

### Model training

Make sure that:
- `openface.ipynb` and `data_prep.ipynb` run without errors before this notebook
- note that `data_prep.py` and `data_prep.ipynb` are identical

Current Accuracy (5 Oct 1pm):
- total data (train,test,val): 5443,1638,845
- val acc: 60%
- **test acc: 69%**

### Ref from config.py

`config.N_SEGMENTS = 50`\
`config.GAZE_HP_AU = 'engage_gaze+hp+au'`\
`config.SNP: SNP (Subject Not Present)`\
`config.LABEL_MAP`:
- 0: Not-Engaged
- 1: Barely-engaged
- 2: Engaged
- 3: Highly-Engaged

In [4]:
# Trainer application code
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import normalize
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

from keras.layers import Flatten, LSTM, Dense, Conv2D, Conv3D, GlobalAveragePooling1D, Dropout, MaxPooling2D

from tensorflow import keras
from tensorflow.keras import layers

from data_prep import data_loader_v1
import config
import utils

In [5]:
class PositionalEmbedding(layers.Layer):
    """Add a learned embedding of each frame index to the input features.

    Args:
        sequence_length: Number of rows in the position embedding table.
        output_dim: Width of each position embedding. The embedding is added
            directly to the inputs, so it must be compatible with the
            inputs' feature axis.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.output_dim = output_dim
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length,
            output_dim=output_dim,
        )
        # Helper layer that derives the mask returned by `compute_mask`.
        self.mask_layer = MaskComputationLayer()

    def call(self, inputs):
        """Return ``inputs`` plus the embedding of every frame position.

        The inputs are of shape ``(batch_size, frames, num_features)``.
        """
        num_frames = tf.shape(inputs)[1]
        frame_indices = tf.range(num_frames)
        return inputs + self.position_embeddings(frame_indices)

    def compute_mask(self, inputs, mask=None):
        """Return the mask produced by the mask layer; ``mask`` is ignored."""
        return self.mask_layer(inputs)

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "sequence_length": self.sequence_length,
            "output_dim": self.output_dim,
        }

class MaskComputationLayer(layers.Layer):
    """Derive a boolean mask by collapsing the last axis of a tensor."""

    def call(self, inputs):
        """Return True wherever any value along the last axis casts to True."""
        as_bool = tf.cast(inputs, "bool")
        return tf.reduce_any(as_bool, axis=-1)

class TransformerEncoder(layers.Layer):
    """Transformer encoder block: self-attention then a feed-forward projection.

    Each sub-block is wrapped in a residual connection followed by layer
    normalization.

    Args:
        embed_dim: Used as ``key_dim`` of the attention layer and as the
            output width of the feed-forward projection.
        dense_dim: Number of units in the hidden (GELU) feed-forward layer.
        num_heads: Number of attention heads.
        dropout: Dropout probability passed to ``MultiHeadAttention``.
            Defaults to 0.3, the value that used to be hard-coded.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, dropout=0.3, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_dropout = dropout
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=dropout
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply self-attention and the feed-forward projection to ``inputs``.

        Args:
            inputs: Sequence tensor; used as both query and value.
            mask: Optional mask. An axis is inserted at position 1 before it
                is handed to the attention layer as ``attention_mask``
                (presumably ``(batch, frames)`` -> ``(batch, 1, frames)``).
        """
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the layer config including every constructor argument."""
        base_config = super().get_config()
        base_config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
            # Serialized so a reloaded layer keeps a non-default rate.
            "dropout": self.attention_dropout,
        })
        return base_config

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Functional encoder block: normalize-then-attend, then a Conv1D feed-forward.

    Args:
        inputs: Sequence tensor; its last dimension sets the output width.
        head_size: ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Number of filters in the hidden feed-forward convolution.
        dropout: Rate used inside attention and by both Dropout layers.

    Returns:
        Tensor with the same last dimension as ``inputs``.
    """
    # Attention sub-block with a residual connection back to the raw inputs.
    normed = layers.LayerNormalization(epsilon=1e-6)(inputs)
    attended = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(normed, normed)
    attended = layers.Dropout(dropout)(attended)
    attention_residual = attended + inputs

    # Feed-forward sub-block; kernel_size=1 convolutions act per timestep.
    hidden = layers.LayerNormalization(epsilon=1e-6)(attention_residual)
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(hidden)
    return projected + attention_residual

def build_model(
        input_shape,
        head_size,
        num_heads,
        ff_dim,
        num_transformer_blocks,
        mlp_units,
        dropout=0,
        mlp_dropout=0,
        n_classes=4
    ):
    """Assemble the positional-embedding + transformer-encoder classifier.

    Pipeline: ``PositionalEmbedding`` -> one ``TransformerEncoder`` ->
    ``GlobalMaxPooling1D`` -> Dense/Dropout stack -> softmax output.

    Args:
        input_shape: Shape of one sample; index 0 is used as the sequence
            length and index 1 as the embedding width.
        head_size: Accepted for interface compatibility; not used here.
        num_heads: Number of attention heads in the encoder block.
        ff_dim: Accepted for interface compatibility; not used here.
        num_transformer_blocks: Accepted for interface compatibility; not
            used here (exactly one encoder block is built).
        mlp_units: Iterable of Dense layer widths applied after pooling.
        dropout: Accepted for interface compatibility; not used here.
        mlp_dropout: Dropout rate applied after each Dense layer.
        n_classes: Number of softmax outputs.

    Returns:
        An uncompiled ``keras.Model``.
    """
    # NOTE(review): the encoder's feed-forward width is fixed at 8 rather
    # than taken from `ff_dim` - confirm this is intentional.
    dense_dim = 8
    sequence_length = input_shape[0]
    embed_dim = input_shape[1]

    inputs = keras.Input(shape=input_shape)

    embedded = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(embedded)

    features = layers.GlobalMaxPooling1D()(encoded)
    for units in mlp_units:
        features = layers.Dense(units, activation="relu")(features)
        features = layers.Dropout(mlp_dropout)(features)

    outputs = layers.Dense(n_classes, activation='softmax')(features)
    return keras.Model(inputs, outputs)

# Shuffle buffer size used by `make_ds`.
BUFFER_SIZE = 100_000


def make_ds(features, labels):
    """Return a shuffled, endlessly repeating dataset of (feature, label) pairs."""
    return (
        tf.data.Dataset.from_tensor_slices((features, labels))
        .shuffle(BUFFER_SIZE)
        .repeat()
    )

def get_best_weights(checkpoint_dir='./checkpoints/'):
    """Return the file name of the checkpoint with the highest epoch number.

    Checkpoint names are expected to contain ``epoch<N>-`` (for example
    ``model.epoch07-acc0.61.keras``). Entries without that marker, such as
    editor or OS metadata files, are ignored instead of crashing the parse.

    Args:
        checkpoint_dir: Directory to scan. Defaults to ``./checkpoints/``.

    Returns:
        The bare file name (not the full path) of the latest-epoch checkpoint.

    Raises:
        FileNotFoundError: If the directory is missing or holds no
            checkpoint files.
    """
    import os
    import re

    epoch_pattern = re.compile(r'epoch(\d+)-')

    def get_epoch(filename):
        # Take the last match so a model name that itself contains
        # "epoch<digits>-" cannot shadow the real epoch marker.
        return int(epoch_pattern.findall(filename)[-1])

    checkpoints = [
        name for name in os.listdir(checkpoint_dir) if epoch_pattern.search(name)
    ]
    if not checkpoints:
        raise FileNotFoundError(
            f"no checkpoint files matching 'epoch<N>-' found in {checkpoint_dir!r}"
        )
    # Compare epochs numerically so that epoch100 sorts after epoch99.
    return sorted(checkpoints, key=get_epoch)[-1]

def train(model_name, val=True):
    """Train the classifier for ``model_name`` and print evaluation metrics.

    Steps: load the splits with ``data_loader_v1``, build and compile the
    model, fit with checkpointing and early stopping, reload the checkpoint
    selected by ``get_best_weights``, then evaluate on every split and print
    classification reports for the validation and test splits.

    Args:
        model_name: Passed to ``data_loader_v1`` and used as the checkpoint
            file name prefix.
        val: Forwarded to ``data_loader_v1``. When the loader returns only
            two splits, the test split doubles as the validation split.
    """
    data = data_loader_v1(model_name, val=val, scale=True)

    # Decide from what the loader actually returned; the boolean parameter
    # is never rebound, so later checks cannot be confused by a data tuple.
    has_val_split = len(data) == 3
    if has_val_split:
        train_split, val_split, test_split = data
        val_x, val_y = val_split
    else:
        train_split, test_split = data

    train_x, train_y = train_split
    test_x, test_y = test_split

    total = train_y.shape[0]

    print("train stats: ")
    print(train_x.shape, train_y.shape)

    if has_val_split:
        print("val stats: ")
        print(val_x.shape, val_y.shape)

    print("test stats: ")
    print(test_x.shape, test_y.shape)

    # Inverse-frequency class weights. Currently NOT passed to `fit`; add
    # `class_weight=class_weight` to the fit call to enable them.
    class_weight = {
        k: (1 / train_y[train_y==k].shape[0]) * (total / 4) for k in np.unique(train_y)
    }

    BATCH_SIZE = 32

    input_shape = train_x.shape[1:]

    model = build_model(
        input_shape,
        head_size=256,
        num_heads=8,
        ff_dim=4,
        num_transformer_blocks=4,
        mlp_units=[128],
        mlp_dropout=0.5,
        dropout=0.3,
        n_classes=4
    )

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        metrics=["accuracy"],
    )

    # `{epoch}` / `{val_accuracy}` are filled in by ModelCheckpoint at save time.
    filepath = "checkpoints/" + model_name + ".epoch{epoch:02d}-acc{val_accuracy:.2f}.keras"
    checkpoint = ModelCheckpoint(filepath=filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')

    callbacks = [
        keras.callbacks.EarlyStopping(patience=50, restore_best_weights=True),
        checkpoint,
    ]

    if not has_val_split:
        # No dedicated validation split: validate on the test split.
        val_x, val_y = test_x, test_y

    model.fit(train_x, train_y,
              validation_data=(val_x, val_y),
              batch_size=BATCH_SIZE,
              epochs=200,
              callbacks=callbacks)

    # Evaluate the checkpointed weights rather than the in-memory ones.
    model.load_weights(f'checkpoints/{get_best_weights()}')
    print("Evaluating on train set: ")
    model.evaluate(train_x, train_y)

    print("Evaluating on valid set: ")
    model.evaluate(val_x, val_y)

    print("Evaluating on test set: ")
    model.evaluate(test_x, test_y)

    y_pred_val = np.argmax(model.predict(val_x), axis=1)
    y_pred_test = np.argmax(model.predict(test_x), axis=1)

    print("Classification report (val): ")
    print(classification_report(val_y, y_pred_val))

    print("Classification report (test): ")
    print(classification_report(test_y, y_pred_test))
    



In [6]:
# import sys
import shutil, os
if __name__ == '__main__':
    # Start from an empty checkpoint directory so only this run's files are
    # present. Guard the removal: on a first run the directory does not
    # exist yet and an unconditional rmtree would raise FileNotFoundError.
    if os.path.isdir('checkpoints/'):
        shutil.rmtree('checkpoints/')
    os.mkdir('checkpoints')
    train(config.GAZE_HP_AU, val=True) # GAZE_HP_AU, MARLIN

                                chunk           label
0   subject_68_0ng3yqwrg6_vid_0_0.mp4         Engaged
1   subject_68_0ng3yqwrg6_vid_0_1.mp4  Highly-Engaged
2  subject_68_0ng3yqwrg6_vid_0_10.mp4  Highly-Engaged
3  subject_68_0ng3yqwrg6_vid_0_11.mp4  Highly-Engaged
4  subject_68_0ng3yqwrg6_vid_0_12.mp4         Engaged
<class 'list'> 7664
TrainXy: ['subject_68_0ng3yqwrg6_vid_0_0.mp4', 'subject_68_0ng3yqwrg6_vid_0_1.mp4', 'subject_68_0ng3yqwrg6_vid_0_10.mp4', 'subject_68_0ng3yqwrg6_vid_0_11.mp4', 'subject_68_0ng3yqwrg6_vid_0_12.mp4']
len TrainXy: 5752
TestXy: ['subject_96_dh18u00dyu_vid_0_0.mp4', 'subject_96_dh18u00dyu_vid_0_1.mp4', 'subject_96_dh18u00dyu_vid_0_10.mp4', 'subject_96_dh18u00dyu_vid_0_11.mp4', 'subject_96_dh18u00dyu_vid_0_12.mp4']
len TestXy: 1698
ValXy: ['subject_85_rda0o4n8zs_vid_0_0.mp4', 'subject_85_rda0o4n8zs_vid_0_1.mp4', 'subject_85_rda0o4n8zs_vid_0_10.mp4', 'subject_85_rda0o4n8zs_vid_0_11.mp4', 'subject_85_rda0o4n8zs_vid_0_12.mp4']
len ValXy: 892
trainXy sample:



Epoch 1/200




[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.3554 - loss: 1.8763
Epoch 1: val_accuracy improved from -inf to 0.54201, saving model to checkpoints/engage_gaze+hp+au.epoch01-acc0.54.keras
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.3558 - loss: 1.8744 - val_accuracy: 0.5420 - val_loss: 1.0679
Epoch 2/200
[1m169/171[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.5488 - loss: 1.0899
Epoch 2: val_accuracy improved from 0.54201 to 0.59290, saving model to checkpoints/engage_gaze+hp+au.epoch02-acc0.59.keras
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.5491 - loss: 1.0890 - val_accuracy: 0.5929 - val_loss: 1.0198
Epoch 3/200
[1m169/171[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.6191 - loss: 0.9231
Epoch 3: val_accuracy improved from 0.59290 to 0.59763, saving model to checkpoints/engage_gaze+