In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import Input, Bidirectional, Dense, Conv1D, Conv3D, LSTM, Flatten, Attention, MultiHeadAttention, GlobalAveragePooling1D, Concatenate, Add, Dropout, Softmax
from tensorflow.keras.regularizers import l2
from utilz import *
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


2022-10-18 12:29:41.979937: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 12:29:42.009939: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 12:29:42.010051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [3]:
visual_clip = load_features('./data/visual_clip.pkl')
acoustic = load_features('./data/acoustic_wav2vec.pkl')
bert_embs = load_features('./data/textual_bert.pkl')
label = load_features('./data/labels.pkl')

# From scratch

# Model 1

In [4]:
class Attention_Self(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(Attention_Self, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.S = tf.keras.layers.Dense(1)
        self.units = units

    def call(self, features):
        features_ = tf.expand_dims(features, 1)
        v = self.W1(features)
        q =  self.W2(features_)
        score = tf.nn.tanh(q + v)
        attention_weights = tf.nn.softmax(self.S(score), axis=1)
        ATTN = attention_weights * (v)
        ATTN = tf.reduce_sum(ATTN, axis=1)
        
        return ATTN
        
    def get_config(self):
        config = super(Attention_Self, self).get_config()
        config.update({"units": self.units})
        return config

In [None]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
vis_h = Conv1D(64, 3, 1, 'same')(vis_ipt)
vis_h = Conv1D(64, 1, 1, 'same')(vis_h)
vis_h = Conv1D(64, 3, 1, 'same')(vis_h)
vis_h = LSTM(64, activation='relu')(vis_h)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
aud_h = Conv1D(64, 3, 2)(aud_ipt)
aud_h = Attention_Self(32)(aud_h)
aud_h = Bidirectional(LSTM(32))(aud_h)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
tex_q = Conv1D(64, 3, 1)(tex_ipt)
tex_v = Conv1D(64, 3, 1)(tex_ipt)
tex_qv_attention = Attention()([tex_q, tex_v])
tex_q = GlobalAveragePooling1D()(tex_q)
tex_qv_attention = GlobalAveragePooling1D()(tex_qv_attention)
# h = Concatenate()([q, qv_attention])

h = Concatenate()([vis_h, aud_h, tex_q, tex_qv_attention])
h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()



2022-10-12 22:11:51.359712: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-12 22:11:51.380725: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-12 22:11:51.380833: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-12 22:11:51.381670: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compi

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 10, 64)       98368       ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                              

In [5]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_concate.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30


2022-10-12 13:35:26.290913: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8500
2022-10-12 13:35:26.634847: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.






INFO:tensorflow:Assets written to: ./res/multi_model_concate.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_concate.tf/assets


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7feb34f05120>

In [6]:
model = tf.keras.models.load_model('./res/multi_model_concate.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.7206    0.7177    0.7192       248
         Pos     0.4901    0.7071    0.5789       140
         Neu     0.5000    0.0580    0.1039        69

    accuracy                         0.6149       457
   macro avg     0.5702    0.4943    0.4673       457
weighted avg     0.6167    0.6149    0.5833       457

[[178  67   3]
 [ 40  99   1]
 [ 29  36   4]]


# Model 2

In [7]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
vis_h = Conv1D(64, 3, 1, 'same')(vis_ipt)
vis_h = Conv1D(64, 1, 1, 'same')(vis_h)
vis_h = Conv1D(64, 3, 1, 'same')(vis_h)
vis_h = LSTM(64, activation='relu')(vis_h)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
aud_h = Conv1D(64, 3, 2)(aud_ipt)
aud_h = Attention_Self(32)(aud_h)
aud_h = Bidirectional(LSTM(32))(aud_h)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
tex_q = Conv1D(64, 3, 1)(tex_ipt)
tex_v = Conv1D(64, 3, 1)(tex_ipt)
tex_qv_attention = Attention()([tex_q, tex_v])
tex_q = GlobalAveragePooling1D()(tex_q)
tex_qv_attention = GlobalAveragePooling1D()(tex_qv_attention)
# h = Concatenate()([q, qv_attention])

h = Concatenate()([vis_h, aud_h, tex_q, tex_qv_attention])
h = Attention_Self(256)(h)
h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 conv1d_6 (Conv1D)              (None, 10, 64)       98368       ['input_4[0][0]']                
                                                                                                  
 input_5 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                            

In [8]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_selfatt.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


Epoch 2/30
Epoch 3/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


Epoch 8/30
Epoch 9/30
Epoch 10/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


Epoch 11/30
Epoch 12/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


Epoch 13/30
Epoch 14/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt.tf/assets


Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fea742fd780>

In [9]:
model = tf.keras.models.load_model('./res/multi_model_selfatt.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.6158    0.8468    0.7131       248
         Pos     0.4914    0.4071    0.4453       140
         Neu     0.0000    0.0000    0.0000        69

    accuracy                         0.5842       457
   macro avg     0.3691    0.4180    0.3861       457
weighted avg     0.4847    0.5842    0.5234       457

[[210  38   0]
 [ 83  57   0]
 [ 48  21   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
vis_h = Conv1D(64, 3, 1, 'same')(vis_ipt)
vis_h = Conv1D(64, 1, 1, 'same')(vis_h)
vis_h = Conv1D(64, 3, 1, 'same')(vis_h)
vis_h = LSTM(64, activation='relu')(vis_h)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
aud_h = Conv1D(64, 3, 2)(aud_ipt)
aud_h = Attention_Self(32)(aud_h)
aud_h = Bidirectional(LSTM(32))(aud_h)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
tex_q = Conv1D(64, 3, 1)(tex_ipt)
tex_v = Conv1D(64, 3, 1)(tex_ipt)
tex_qv_attention = Attention()([tex_q, tex_v])
tex_q = GlobalAveragePooling1D()(tex_q)
tex_qv_attention = GlobalAveragePooling1D()(tex_qv_attention)
# h = Concatenate()([q, qv_attention])

h = Concatenate()([vis_h, aud_h, tex_q, tex_qv_attention])
h = Attention(256)([h,h])
h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 conv1d_12 (Conv1D)             (None, 10, 64)       98368       ['input_7[0][0]']                
                                                                                                  
 input_8 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                                  
 input_9 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                            

In [11]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_selfattv2.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfattv2.tf/assets


Epoch 2/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfattv2.tf/assets


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7feac4183160>

In [12]:
model = tf.keras.models.load_model('./res/multi_model_selfattv2.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.6700    0.8024    0.7303       248
         Pos     0.5380    0.6071    0.5705       140
         Neu     0.0000    0.0000    0.0000        69

    accuracy                         0.6214       457
   macro avg     0.4027    0.4699    0.4336       457
weighted avg     0.5284    0.6214    0.5711       457

[[199  48   1]
 [ 54  85   1]
 [ 44  25   0]]


# Model 3

In [13]:
class Attention_Cross(tf.keras.layers.Layer):
    
    def __init__(self, units, **kwargs):
        super(Attention_Cross, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.S = tf.keras.layers.Dense(1)
        self.units = units

    def call(self, features1, features2):
        features2_ = tf.expand_dims(features2, 1)
        v = self.W1(features1)
        # v_ = self.W1(features2)
        q =  self.W2(features2_)
        score = tf.nn.tanh(q + v)
        attention_weights = tf.nn.softmax(self.S(score), axis=1)
        # ATTN = attention_weights * (v+v_)
        ATTN = attention_weights * v
        ATTN = tf.reduce_sum(ATTN, axis=1)
        
#         features2_ = tf.expand_dims(features2, 1)
#         score = tf.nn.tanh(self.W1(features1) + self.W2(features2_))
#         attention_weights = tf.nn.softmax(self.S(score), axis=1)
#         ATTN = attention_weights * (features1 + features2)
#         ATTN = tf.reduce_sum(ATTN, axis=1)
        
        return ATTN
        
    def get_config(self):
        config = super(Attention_Cross, self).get_config()
        config.update({"units": self.units})
        return config


In [14]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
vis_h = Conv1D(64, 3, 1, 'same')(vis_ipt)
vis_h = Conv1D(64, 1, 1, 'same')(vis_h)
vis_h = Conv1D(64, 3, 1, 'same')(vis_h)
vis_h = LSTM(64, activation='relu')(vis_h)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
aud_h = Conv1D(64, 3, 2)(aud_ipt)
aud_h = Attention_Self(32)(aud_h)
aud_h = Bidirectional(LSTM(32))(aud_h)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
tex_q = Conv1D(64, 3, 1)(tex_ipt)
tex_v = Conv1D(64, 3, 1)(tex_ipt)
tex_qv_attention = Attention()([tex_q, tex_v])
tex_q = GlobalAveragePooling1D()(tex_q)
tex_qv_attention = GlobalAveragePooling1D()(tex_qv_attention)
# h = Concatenate()([q, qv_attention])

v = Concatenate()([tex_q, tex_qv_attention])
q = Concatenate()([vis_h, aud_h])
h = Attention_Cross(128)(v, q)
# h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, 10, 512)]    0           []                               
                                                                                                  
 input_12 (InputLayer)          [(None, 36, 768)]    0           []                               
                                                                                                  
 conv1d_18 (Conv1D)             (None, 10, 64)       98368       ['input_10[0][0]']               
                                                                                                  
 input_11 (InputLayer)          [(None, 128, 512)]   0           []                               
                                                                                            

In [15]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_crossatt.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


Epoch 2/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


Epoch 3/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


Epoch 4/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


Epoch 27/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


Epoch 28/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt.tf/assets


Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fea203bd2d0>

In [16]:
model = tf.keras.models.load_model('./res/multi_model_crossatt.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.6961    0.7944    0.7420       248
         Pos     0.5115    0.6357    0.5669       140
         Neu     0.0000    0.0000    0.0000        69

    accuracy                         0.6258       457
   macro avg     0.4025    0.4767    0.4363       457
weighted avg     0.5345    0.6258    0.5763       457

[[197  51   0]
 [ 51  89   0]
 [ 35  34   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
vis_h = Conv1D(64, 3, 1, 'same')(vis_ipt)
vis_h = Conv1D(64, 1, 1, 'same')(vis_h)
vis_h = Conv1D(64, 3, 1, 'same')(vis_h)
vis_h = LSTM(64, activation='relu')(vis_h)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
aud_h = Conv1D(64, 3, 2)(aud_ipt)
aud_h = Attention_Self(32)(aud_h)
aud_h = Bidirectional(LSTM(32))(aud_h)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
tex_q = Conv1D(64, 3, 1)(tex_ipt)
tex_v = Conv1D(64, 3, 1)(tex_ipt)
tex_qv_attention = Attention()([tex_q, tex_v])
tex_q = GlobalAveragePooling1D()(tex_q)
tex_qv_attention = GlobalAveragePooling1D()(tex_qv_attention)
# h = Concatenate()([q, qv_attention])

v = Concatenate()([tex_q, tex_qv_attention])
q = Concatenate()([vis_h, aud_h])
h = Attention(128)([v, q])
# h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 10, 512)]    0           []                               
                                                                                                  
 input_15 (InputLayer)          [(None, 36, 768)]    0           []                               
                                                                                                  
 conv1d_24 (Conv1D)             (None, 10, 64)       98368       ['input_13[0][0]']               
                                                                                                  
 input_14 (InputLayer)          [(None, 128, 512)]   0           []                               
                                                                                            

In [18]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_crossattv2.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


Epoch 2/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


Epoch 3/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


Epoch 4/30
Epoch 5/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


Epoch 6/30
Epoch 7/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


Epoch 8/30
Epoch 9/30
Epoch 10/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2.tf/assets


Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fea743aecb0>

In [19]:
model = tf.keras.models.load_model('./res/multi_model_crossattv2.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.6107    0.7339    0.6667       248
         Pos     0.4277    0.4857    0.4548       140
         Neu     0.0000    0.0000    0.0000        69

    accuracy                         0.5470       457
   macro avg     0.3461    0.4065    0.3738       457
weighted avg     0.4624    0.5470    0.5011       457

[[182  66   0]
 [ 72  68   0]
 [ 44  25   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Model 4

In [5]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        
        
class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output, attn_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x
    

class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [14]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
vis_h = Conv1D(64, 3, 1, 'same')(vis_ipt)
vis_h = Conv1D(64, 1, 1, 'same')(vis_h)
vis_h = Conv1D(64, 3, 1, 'same')(vis_h)
vis_h = LSTM(64, activation='relu')(vis_h)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
aud_h = Conv1D(64, 3, 2)(aud_ipt)
aud_h = Attention_Self(32)(aud_h)
aud_h = Bidirectional(LSTM(32))(aud_h)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
tex_q = Conv1D(64, 3, 1)(tex_ipt)
tex_v = Conv1D(64, 3, 1)(tex_ipt)
tex_qv_attention = Attention()([tex_q, tex_v])
tex_q = GlobalAveragePooling1D()(tex_q)
tex_qv_attention = GlobalAveragePooling1D()(tex_qv_attention)
# h = Concatenate()([q, qv_attention])

v = Concatenate(axis=1)([tf.expand_dims(tex_q,1), tf.expand_dims(tex_qv_attention,1)])
q = Concatenate(axis=1)([tf.expand_dims(vis_h,1), tf.expand_dims(aud_h,1)])

# h = MultiHeadAttention(num_heads=4, key_dim=32, dropout=0.2)(v, q)
h = CrossAttention(num_heads=4, key_dim=32, dropout=0.2)(v, q)

# h = Dense(64)(h)
# res = Dense(3, activation='softmax')(h)
res = Dense(3, activation='softmax')(tf.reduce_mean(h, 1))

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                                  
 conv1d_6 (Conv1D)              (None, 10, 64)       98368       ['input_4[0][0]']                
                                                                                                  
 input_5 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                            

In [15]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_multiheadatt_cross.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], 
          y=np.asarray(label['train']), batch_size=16, epochs=30,
          validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])],np.asarray(label['valid'])],
          callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross.tf/assets


Epoch 2/30
Epoch 3/30



INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross.tf/assets


Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f1eeec5d540>

In [16]:
model = tf.keras.models.load_model('./res/multi_model_multiheadatt_cross.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.7041    0.7581    0.7301       248
         Pos     0.5833    0.6500    0.6149       140
         Neu     0.2941    0.1449    0.1942        69

    accuracy                         0.6324       457
   macro avg     0.5272    0.5177    0.5130       457
weighted avg     0.6052    0.6324    0.6139       457

[[188  43  17]
 [ 42  91   7]
 [ 37  22  10]]


In [6]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
vis_h = Conv1D(64, 3, 1, 'same')(vis_ipt)
vis_h = Conv1D(64, 1, 1, 'same')(vis_h)
vis_h = Conv1D(64, 3, 1, 'same')(vis_h)
vis_h = LSTM(64, activation='relu')(vis_h)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
aud_h = Conv1D(64, 3, 2)(aud_ipt)
aud_h = Attention_Self(32)(aud_h)
aud_h = Bidirectional(LSTM(32))(aud_h)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
tex_q = Conv1D(64, 3, 1)(tex_ipt)
tex_v = Conv1D(64, 3, 1)(tex_ipt)
tex_qv_attention = Attention()([tex_q, tex_v])
tex_q = GlobalAveragePooling1D()(tex_q)
tex_qv_attention = GlobalAveragePooling1D()(tex_qv_attention)

h = Concatenate(axis=1)([tf.expand_dims(tex_q,1), tf.expand_dims(tex_qv_attention,1), tf.expand_dims(vis_h,1), tf.expand_dims(aud_h,1)])

h = GlobalSelfAttention(num_heads=4, key_dim=32, dropout=0.2)(h)

# h = Dense(64)(h)
# res = Dense(3, activation='softmax')(h)
res = Dense(3, activation='softmax')(tf.reduce_mean(h, 1))

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()



2022-10-18 12:30:25.491768: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-18 12:30:25.492593: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 12:30:25.492737: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 12:30:25.492804: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 10, 64)       98368       ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                              

In [7]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_multiheadatt_global.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], 
          y=np.asarray(label['train']), batch_size=16, epochs=30,
          validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])],np.asarray(label['valid'])],
          callbacks=callback_list)

Epoch 1/30


2022-10-18 12:30:30.087430: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8500
2022-10-18 12:30:30.444872: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.






INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_global.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_global.tf/assets


Epoch 2/30



INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_global.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_global.tf/assets


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f0ad9474be0>

In [8]:
model = tf.keras.models.load_model('./res/multi_model_multiheadatt_global.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.6803    0.8750    0.7654       248
         Pos     0.6667    0.5286    0.5896       140
         Neu     0.2222    0.0870    0.1250        69

    accuracy                         0.6499       457
   macro avg     0.5230    0.4968    0.4934       457
weighted avg     0.6069    0.6499    0.6149       457

[[217  25   6]
 [ 51  74  15]
 [ 51  12   6]]
