# Part 5: Building the Multimodal Model

We incorporate our learnings from part 4 into building a Multimodal Model with different processing pathways for the various modalities, and then merge their processed features for the final classification. 

In [1]:
# Import necessary libraries

import os
import sys
import copy
import pickle
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import keras.api._v2.keras as keras
import keras.backend as K
from keras.optimizers.legacy import Adam
from keras.optimizers import SGD
from keras import callbacks
from keras.models import Model
from keras.layers import Layer, Dense, Input, Activation, Lambda, BatchNormalization, Conv1D, SpatialDropout1D, add, GlobalAveragePooling1D, LSTM, Dense, concatenate, TimeDistributed, Bidirectional, Dropout, Embedding, Attention, MultiHeadAttention, LayerNormalization, Flatten, Concatenate
from keras.activations import sigmoid
from sklearn.model_selection import KFold
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, accuracy_score
import tensorflow as tf
from data_load import load_data
from basemodel import BaseModel

%load_ext autoreload
%autoreload 2

In [3]:
# Load data
train_data, train_label, test_data, test_label, train_text, train_audio, train_visual, test_text, test_audio, test_visual = load_data()

Number of training samples:  120
Number of testing samples:  31
Max length of sequences:  110

No. of samples per class: {'neu': 1708, 'fru': 1849, 'ang': 1103, 'sad': 1084, 'exc': 1041, 'hap': 648}

Train text shape: 120 samples, 110 timesteps, 100 features
Train audio shape: 120 samples, 110 timesteps, 100 features
Train visual shape: 120 samples, 110 timesteps, 512 features

Test text shape: 31 samples, 110 timesteps, 100 features
Test audio shape: 31 samples, 110 timesteps, 100 features
Test visual shape: 31 samples, 110 timesteps, 512 features



## Transformer Block and Cross-Modal Attention

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(dff, activation='relu'),
            Dense(d_model)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.mha(inputs, inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

class CrossModalAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.multi_head_attn = MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
    
    def call(self, query, key_value):
        attn_output = self.multi_head_attn(query, key_value, key_value)
        return attn_output

## General Multimodal Transformer Model

In [15]:
class MultimodalModel(Model):
    def __init__(self, d_model, num_heads, dff, rate=0.1, num_classes=10):
        super(MultimodalModel, self).__init__()
        # Transformer block for processing video features
        self.video_transformer = TransformerBlock(d_model=512, num_heads=8, dff=dff, rate=rate)
        
        # Reducing dimensionality of processed video features to align with audio and text features
        self.dimensionality_reduction = Dense(d_model)
        
        # Text feature processing using bi-LSTM
        self.text_lstm = Bidirectional(LSTM(50, return_sequences=True))

        # Audio feature processing using 1D CNN with padding
        self.audio_cnn = Conv1D(filters=100, kernel_size=3, activation='relu', padding='same')

        # Cross-modal attention layers
        self.text_audio_attention = CrossModalAttentionLayer(d_model=d_model, num_heads=num_heads)
        self.text_video_attention = CrossModalAttentionLayer(d_model=d_model, num_heads=num_heads)

        # Final processing and classification layers
        self.concat = Concatenate()
        self.global_pool = GlobalAveragePooling1D()
        self.classifier = Dense(num_classes, activation='softmax')

    def call(self, inputs, training):
        audio_features, text_features, video_features = inputs
        
        # Process video features and reduce dimensionality
        video_features_transformed = self.video_transformer(video_features, training=training)
        video_features_reduced = self.dimensionality_reduction(video_features_transformed)

        # Process text features
        text_features = self.text_lstm(text_features)

        # Process audio features
        audio_features = self.audio_cnn(audio_features)
        
        # Apply cross-modal attention between audio-video and text-video
        text_audio_attn = self.text_audio_attention(text_features, audio_features)
        text_video_attn = self.text_video_attention(text_features, video_features_reduced)
        
        # Combine features from both attention mechanisms
        combined_features = self.concat([text_features, text_audio_attn, text_video_attn])
        outputs = self.classifier(combined_features)

        # Return the final prediction
        return outputs
    
    def evaluate(self, x_test, y_test):
        predictions = self.predict(x_test)
        y_pred = np.argmax(predictions, axis=-1).flatten()
        y_true = np.argmax(y_test, axis=-1).flatten()

        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

        # Return a dictionary of metrics
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
        }


In [6]:
# Instantiate the model
multimodal_model = MultimodalModel(d_model=100, num_heads=4, dff=2048, rate=0.1, num_classes=6)

# Compile the model
multimodal_model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
multimodal_model.fit([train_audio, train_text, train_visual], train_label, epochs=10, batch_size=32)

# Evaluate the model
multimodal_model.evaluate([test_audio, test_text, test_visual], test_label)


Epoch 1/10


2024-03-31 22:50:05.545646: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-03-31 22:50:05.545690: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-03-31 22:50:05.545709: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-03-31 22:50:05.545748: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-31 22:50:05.545763: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-03-31 22:50:07.255855: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 22:50:07.671922: I t

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2024-03-31 22:50:19.523416: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 22:50:19.652896: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 22:50:19.676680: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




{'accuracy': 0.7926686217008798,
 'precision': 0.7005309195404089,
 'recall': 0.6409352371358086,
 'f1_score': 0.6609940912578373}

In [22]:
class MultimodalAttentionModel(Model):
    def __init__(self, text_input_shape, audio_input_shape, video_input_shape, num_classes, d_model=128, num_heads=4):
        super(MultimodalAttentionModel, self).__init__()
        
        # Text Pathway - Bi-LSTM with Attention
        self.text_attention = MultiHeadAttention(num_heads=4, key_dim=d_model // num_heads)
        self.text_bi_lstm1 = Bidirectional(LSTM(64, return_sequences=True))
        self.text_bi_lstm2 = Bidirectional(LSTM(64, return_sequences=True))
        self.text_dropout = Dropout(0.5)
        self.text_dense = Dense(64, activation='relu')
        
        # Audio Pathway - CNN with Attention
        self.audio_attention = MultiHeadAttention(num_heads=8, key_dim=d_model // num_heads)
        self.audio_conv1 = Conv1D(64, kernel_size=10, activation='relu', padding='same')
        self.audio_conv2 = Conv1D(64, kernel_size=9, activation='relu', padding='same')
        self.audio_conv3 = Conv1D(64, kernel_size=8, activation='relu', padding='same')
        self.audio_dense = Dense(64, activation='relu')
        
        # Video Pathway - Similar to Audio
        self.video_attention = MultiHeadAttention(num_heads=8, key_dim=d_model // num_heads)
        self.video_conv1 = Conv1D(64, kernel_size=10, activation='relu', padding='same')
        self.video_conv2 = Conv1D(64, kernel_size=9, activation='relu', padding='same')
        self.video_conv3 = Conv1D(64, kernel_size=8, activation='relu', padding='same')
        self.video_dense = Dense(64, activation='relu')
        
        # Integration and Classification
        self.concat = Concatenate()
        self.global_pool = GlobalAveragePooling1D()
        self.final_classifier = Dense(num_classes, activation='softmax')

        # Cross-modal attention layers
        self.text_audio_attention = CrossModalAttentionLayer(d_model=d_model, num_heads=num_heads)
        self.text_video_attention = CrossModalAttentionLayer(d_model=d_model, num_heads=num_heads)

        # Final processing and classification layers
        self.concat = Concatenate()
        self.global_pool = GlobalAveragePooling1D()
        self.classifier = Dense(num_classes, activation='softmax')
        
    def call(self, inputs, training=False):
        text_inputs, audio_inputs, video_inputs = inputs
        
        # Text Pathway
        x_text = self.text_attention(text_inputs, text_inputs)
        x_text = self.text_bi_lstm1(x_text)
        x_text = self.text_bi_lstm2(x_text)
        x_text = self.text_dropout(x_text, training=training)
        x_text = self.text_dense(x_text)
        
        # Audio Pathway
        x_audio = self.audio_attention(audio_inputs, audio_inputs)
        x_audio = self.audio_conv1(x_audio)
        x_audio = self.audio_conv2(x_audio)
        x_audio = self.audio_conv3(x_audio)
        x_audio = self.audio_dense(x_audio)
        
        # Video Pathway
        x_video = self.video_attention(video_inputs, video_inputs)
        x_video = self.video_conv1(x_video)
        x_video = self.video_conv2(x_video)
        x_video = self.video_conv3(x_video)
        x_video = self.video_dense(x_video)

        # Apply cross-modal attention between audio-video and text-video
        text_audio_attn = self.text_audio_attention(x_text, x_audio)
        text_video_attn = self.text_video_attention(x_text, x_video)
        
        # Combine features from both attention mechanisms
        combined_features = self.concat([x_text, text_audio_attn, text_video_attn])
        outputs = self.final_classifier(combined_features)

        # Return the final prediction
        return outputs
    
    def evaluate(self, x_test, y_test):
        predictions = self.predict(x_test)
        y_pred = np.argmax(predictions, axis=-1).flatten()
        y_true = np.argmax(y_test, axis=-1).flatten()

        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

        self.test_metrics =  {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'cm': confusion_matrix(y_true, y_pred)
        }

        print("Test Metrics:")
        print(f"Accuracy: {self.test_metrics['accuracy']:.4f}")
        # print(f"Precision: {test_metrics['precision']:.4f}")
        # print(f"Recall: {test_metrics['recall']:.4f}")
        print(f"F1 Score: {self.test_metrics['f1_score']:.4f}")
       
    
    def print_metrics(self):
        print("Test Metrics:")
        print(f"Accuracy: {self.test_metrics['accuracy']:.4f}")
        # print(f"Precision: {metrics['precision']:.4f}")
        # print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1 Score: {self.test_metrics['f1_score']:.4f}")

        class_labels = ['hap', 'sad', 'neu', 'ang', 'exc', 'fru']
        print("Confusion Matrix:")
        plt.figure(figsize=(8,6))
        sns.heatmap(self.test_metrics['cm'], annot=True, fmt='g', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
        plt.xlabel('Predicted labels')
        plt.ylabel('True labels')
        plt.title('Confusion Matrix')
        plt.show()

In [23]:
# Instantiate the model
multimodal_model = MultimodalAttentionModel(text_input_shape=(train_text.shape[1], train_text.shape[2]), audio_input_shape=(train_audio.shape[1], train_audio.shape[2]), video_input_shape=(train_visual.shape[1], train_visual.shape[2]), num_classes=6)

# Compile the model
multimodal_model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
multimodal_model.fit([train_audio, train_text, train_visual], train_label, epochs=10, batch_size=32)

# Evaluate the model
multimodal_model.evaluate([test_audio, test_text, test_visual], test_label)

Epoch 1/10


2024-03-31 23:09:22.601278: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:23.337751: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:23.368610: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:23.546741: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:23.562206: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:23.934354: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:23.979518: I tensorflow/core/grappler/optimizers/cust

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2024-03-31 23:09:33.876496: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:34.159294: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:34.185118: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:34.366837: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-03-31 23:09:34.379276: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Test Metrics:
Accuracy: 0.7079
F1 Score: 0.5066
