In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import Input, Bidirectional, Dense, Conv1D, Conv3D, LSTM, Flatten, Attention, MultiHeadAttention, GlobalAveragePooling1D, Concatenate, Add, Dropout, Softmax
from tensorflow.keras.regularizers import l2
from utilz import *
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [3]:
visual_clip = load_features('./data/visual_clip.pkl')
acoustic = load_features('./data/acoustic_wav2vec.pkl')
bert_embs = load_features('./data/textual_bert.pkl')
label = load_features('./data/labels.pkl')

# From pretrained

# Model 1

In [4]:
class Attention_Self(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(Attention_Self, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.S = tf.keras.layers.Dense(1)
        self.units = units

    def call(self, features):
        features_ = tf.expand_dims(features, 1)
        v = self.W1(features)
        q =  self.W2(features_)
        score = tf.nn.tanh(q + v)
        attention_weights = tf.nn.softmax(self.S(score), axis=1)
        ATTN = attention_weights * (v)
        ATTN = tf.reduce_sum(ATTN, axis=1)
        
        return ATTN
        
    def get_config(self):
        config = super(Attention_Self, self).get_config()
        config.update({"units": self.units})
        return config

In [5]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
v_model = tf.keras.models.load_model('./res/V_model_CNNLSTM_clip.tf/')
v_model = Model(inputs=v_model.inputs, outputs=v_model.layers[-2].output)
for l in v_model.layers:
    l.trainable = False
vis_h = v_model(vis_ipt)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
a_model = tf.keras.models.load_model('./res/A_model_Att-BLSTM_wav2vec_v2.tf/')
a_model = Model(inputs=a_model.inputs, outputs=a_model.layers[-2].output)
for l in a_model.layers:
    l.trainable = False
aud_h = a_model(aud_ipt)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
t_model = tf.keras.models.load_model('./res/T_model_AttCNN_bert.tf/')
t_model = Model(inputs=t_model.inputs, outputs=[t_model.layers[-4].output,t_model.layers[-3].output])
for l in t_model.layers:
    l.trainable = False
tex_q, tex_qv_attention = t_model(tex_ipt)

h = Concatenate()([vis_h, aud_h, tex_q, tex_qv_attention])
h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()



2022-10-18 12:12:47.170728: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-18 12:12:47.172087: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 12:12:47.172237: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-18 12:12:47.172313: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                                  
 model (Functional)             (None, 64)           147904      ['input_1[0][0]']                
                                                                                            

In [6]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_concate_pretrained.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30


2022-10-18 12:12:53.571905: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8500


 6/86 [=>............................] - ETA: 0s - loss: 1.0731 - acc: 0.5625  

2022-10-18 12:12:53.913731: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.






INFO:tensorflow:Assets written to: ./res/multi_model_concate_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_concate_pretrained.tf/assets


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe8005a5480>

In [7]:
model = tf.keras.models.load_model('./res/multi_model_concate_pretrained.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.7109    0.8427    0.7712       248
         Pos     0.6288    0.5929    0.6103       140
         Neu     0.3548    0.1594    0.2200        69

    accuracy                         0.6630       457
   macro avg     0.5648    0.5317    0.5338       457
weighted avg     0.6320    0.6630    0.6387       457

[[209  29  10]
 [ 47  83  10]
 [ 38  20  11]]


# Model 2

In [8]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
v_model = tf.keras.models.load_model('./res/V_model_CNNLSTM_clip.tf/')
v_model = Model(inputs=v_model.inputs, outputs=v_model.layers[-2].output)
for l in v_model.layers:
    l.trainable = False
vis_h = v_model(vis_ipt)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
a_model = tf.keras.models.load_model('./res/A_model_Att-BLSTM_wav2vec_v2.tf/')
a_model = Model(inputs=a_model.inputs, outputs=a_model.layers[-2].output)
for l in a_model.layers:
    l.trainable = False
aud_h = a_model(aud_ipt)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
t_model = tf.keras.models.load_model('./res/T_model_AttCNN_bert.tf/')
t_model = Model(inputs=t_model.inputs, outputs=[t_model.layers[-4].output,t_model.layers[-3].output])
for l in t_model.layers:
    l.trainable = False
tex_q, tex_qv_attention = t_model(tex_ipt)

h = Concatenate()([vis_h, aud_h, tex_q, tex_qv_attention])
h = Attention_Self(256)(h)
h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                                  
 model_4 (Functional)           (None, 64)           147904      ['input_4[0][0]']                
                                                                                            

In [9]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_selfatt_pretrained.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


Epoch 2/30
Epoch 3/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


Epoch 4/30
Epoch 5/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


Epoch 6/30
Epoch 7/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfatt_pretrained.tf/assets


Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe77e1d95a0>

In [10]:
model = tf.keras.models.load_model('./res/multi_model_selfatt_pretrained.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.7319    0.8145    0.7710       248
         Pos     0.5759    0.6500    0.6107       140
         Neu     0.3913    0.1304    0.1957        69

    accuracy                         0.6608       457
   macro avg     0.5664    0.5317    0.5258       457
weighted avg     0.6327    0.6608    0.6350       457

[[202  42   4]
 [ 39  91  10]
 [ 35  25   9]]


In [11]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
v_model = tf.keras.models.load_model('./res/V_model_CNNLSTM_clip.tf/')
v_model = Model(inputs=v_model.inputs, outputs=v_model.layers[-2].output)
for l in v_model.layers:
    l.trainable = False
vis_h = v_model(vis_ipt)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
a_model = tf.keras.models.load_model('./res/A_model_Att-BLSTM_wav2vec_v2.tf/')
a_model = Model(inputs=a_model.inputs, outputs=a_model.layers[-2].output)
for l in a_model.layers:
    l.trainable = False
aud_h = a_model(aud_ipt)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
t_model = tf.keras.models.load_model('./res/T_model_AttCNN_bert.tf/')
t_model = Model(inputs=t_model.inputs, outputs=[t_model.layers[-4].output,t_model.layers[-3].output])
for l in t_model.layers:
    l.trainable = False
tex_q, tex_qv_attention = t_model(tex_ipt)

h = Concatenate()([vis_h, aud_h, tex_q, tex_qv_attention])
h = Attention(256)([h,h])
h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 10, 512)]    0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 128, 512)]   0           []                               
                                                                                                  
 input_9 (InputLayer)           [(None, 36, 768)]    0           []                               
                                                                                                  
 model_8 (Functional)           (None, 64)           147904      ['input_7[0][0]']                
                                                                                           

In [12]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_selfattv2_pretrained.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_selfattv2_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_selfattv2_pretrained.tf/assets


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe359ef6770>

In [13]:
model = tf.keras.models.load_model('./res/multi_model_selfattv2_pretrained.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.6906    0.7742    0.7300       248
         Pos     0.5306    0.5571    0.5436       140
         Neu     0.3750    0.1739    0.2376        69

    accuracy                         0.6171       457
   macro avg     0.5321    0.5017    0.5037       457
weighted avg     0.5940    0.6171    0.5986       457

[[192  48   8]
 [ 50  78  12]
 [ 36  21  12]]


# Model 3

In [14]:
class Attention_Cross(tf.keras.layers.Layer):
    
    def __init__(self, units, **kwargs):
        super(Attention_Cross, self).__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.S = tf.keras.layers.Dense(1)
        self.units = units

    def call(self, features1, features2):
        features2_ = tf.expand_dims(features2, 1)
        v = self.W1(features1)
        # v_ = self.W1(features2)
        q =  self.W2(features2_)
        score = tf.nn.tanh(q + v)
        attention_weights = tf.nn.softmax(self.S(score), axis=1)
        # ATTN = attention_weights * (v+v_)
        ATTN = attention_weights * v
        ATTN = tf.reduce_sum(ATTN, axis=1)
        
#         features2_ = tf.expand_dims(features2, 1)
#         score = tf.nn.tanh(self.W1(features1) + self.W2(features2_))
#         attention_weights = tf.nn.softmax(self.S(score), axis=1)
#         ATTN = attention_weights * (features1 + features2)
#         ATTN = tf.reduce_sum(ATTN, axis=1)
        
        return ATTN
        
    def get_config(self):
        config = super(Attention_Cross, self).get_config()
        config.update({"units": self.units})
        return config


In [15]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
v_model = tf.keras.models.load_model('./res/V_model_CNNLSTM_clip.tf/')
v_model = Model(inputs=v_model.inputs, outputs=v_model.layers[-2].output)
for l in v_model.layers:
    l.trainable = False
vis_h = v_model(vis_ipt)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
a_model = tf.keras.models.load_model('./res/A_model_Att-BLSTM_wav2vec_v2.tf/')
a_model = Model(inputs=a_model.inputs, outputs=a_model.layers[-2].output)
for l in a_model.layers:
    l.trainable = False
aud_h = a_model(aud_ipt)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
t_model = tf.keras.models.load_model('./res/T_model_AttCNN_bert.tf/')
t_model = Model(inputs=t_model.inputs, outputs=[t_model.layers[-4].output,t_model.layers[-3].output])
for l in t_model.layers:
    l.trainable = False
tex_q, tex_qv_attention = t_model(tex_ipt)

v = Concatenate()([tex_q, tex_qv_attention])
q = Concatenate()([vis_h, aud_h])
h = Attention_Cross(128)(v, q)
# h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_12 (InputLayer)          [(None, 36, 768)]    0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, 10, 512)]    0           []                               
                                                                                                  
 input_11 (InputLayer)          [(None, 128, 512)]   0           []                               
                                                                                                  
 model_14 (Functional)          [(None, 64),         295040      ['input_12[0][0]']               
                                 (None, 64)]                                               

In [16]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_crossatt_pretrained.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


Epoch 2/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


Epoch 3/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


Epoch 4/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossatt_pretrained.tf/assets


Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe3c07512d0>

In [17]:
model = tf.keras.models.load_model('./res/multi_model_crossatt_pretrained.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.6717    0.8911    0.7660       248
         Pos     0.6320    0.5643    0.5962       140
         Neu     0.3333    0.0145    0.0278        69

    accuracy                         0.6586       457
   macro avg     0.5457    0.4900    0.4633       457
weighted avg     0.6085    0.6586    0.6025       457

[[221  27   0]
 [ 59  79   2]
 [ 49  19   1]]


In [18]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
v_model = tf.keras.models.load_model('./res/V_model_CNNLSTM_clip.tf/')
v_model = Model(inputs=v_model.inputs, outputs=v_model.layers[-2].output)
for l in v_model.layers:
    l.trainable = False
vis_h = v_model(vis_ipt)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
a_model = tf.keras.models.load_model('./res/A_model_Att-BLSTM_wav2vec_v2.tf/')
a_model = Model(inputs=a_model.inputs, outputs=a_model.layers[-2].output)
for l in a_model.layers:
    l.trainable = False
aud_h = a_model(aud_ipt)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
t_model = tf.keras.models.load_model('./res/T_model_AttCNN_bert.tf/')
t_model = Model(inputs=t_model.inputs, outputs=[t_model.layers[-4].output,t_model.layers[-3].output])
for l in t_model.layers:
    l.trainable = False
tex_q, tex_qv_attention = t_model(tex_ipt)


v = Concatenate()([tex_q, tex_qv_attention])
q = Concatenate()([vis_h, aud_h])
print(v.shape, q.shape)
h = Attention(128)([v, q])
# h = Dense(64)(h)
res = Dense(3, activation='softmax')(h)

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





(None, 128) (None, 128)
Model: "model_19"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_15 (InputLayer)          [(None, 36, 768)]    0           []                               
                                                                                                  
 input_13 (InputLayer)          [(None, 10, 512)]    0           []                               
                                                                                                  
 input_14 (InputLayer)          [(None, 128, 512)]   0           []                               
                                                                                                  
 model_18 (Functional)          [(None, 64),         295040      ['input_15[0][0]']               
                                 (None, 64)]                       

In [19]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_crossattv2_pretrained.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], y=np.asarray(label['train']), batch_size=16, epochs=30, 
            validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])], np.asarray(label['valid'])],
            callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


Epoch 2/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


Epoch 3/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


Epoch 4/30
Epoch 5/30
Epoch 6/30



INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_crossattv2_pretrained.tf/assets


Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe371e515a0>

In [20]:
model = tf.keras.models.load_model('./res/multi_model_crossattv2_pretrained.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.5754    0.9234    0.7090       248
         Pos     0.5085    0.2143    0.3015       140
         Neu     0.0000    0.0000    0.0000        69

    accuracy                         0.5667       457
   macro avg     0.3613    0.3792    0.3368       457
weighted avg     0.4680    0.5667    0.4771       457

[[229  19   0]
 [110  30   0]
 [ 59  10   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Model 4

In [21]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        
        
class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output, attn_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x
    

class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [28]:
# Multi_head Attention : Cross, Global

In [25]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
v_model = tf.keras.models.load_model('./res/V_model_CNNLSTM_clip.tf/')
v_model = Model(inputs=v_model.inputs, outputs=v_model.layers[-2].output)
for l in v_model.layers:
    l.trainable = False
vis_h = v_model(vis_ipt)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
a_model = tf.keras.models.load_model('./res/A_model_Att-BLSTM_wav2vec_v2.tf/')
a_model = Model(inputs=a_model.inputs, outputs=a_model.layers[-2].output)
for l in a_model.layers:
    l.trainable = False
aud_h = a_model(aud_ipt)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
t_model = tf.keras.models.load_model('./res/T_model_AttCNN_bert.tf/')
t_model = Model(inputs=t_model.inputs, outputs=[t_model.layers[-4].output,t_model.layers[-3].output])
for l in t_model.layers:
    l.trainable = False
tex_q, tex_qv_attention = t_model(tex_ipt)

v = Concatenate(axis=1)([tf.expand_dims(tex_q,1), tf.expand_dims(tex_qv_attention,1)])
q = Concatenate(axis=1)([tf.expand_dims(vis_h,1), tf.expand_dims(aud_h,1)])

# h = MultiHeadAttention(num_heads=4, key_dim=32, dropout=0.2)(v, q)
h = CrossAttention(num_heads=4, key_dim=32, dropout=0.2)(v, q)

# h = Dense(64)(h)
# res = Dense(3, activation='softmax')(h)
res = Dense(3, activation='softmax')(tf.reduce_mean(h, 1))

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_27"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_21 (InputLayer)          [(None, 36, 768)]    0           []                               
                                                                                                  
 input_19 (InputLayer)          [(None, 10, 512)]    0           []                               
                                                                                                  
 input_20 (InputLayer)          [(None, 128, 512)]   0           []                               
                                                                                                  
 model_26 (Functional)          [(None, 64),         295040      ['input_21[0][0]']               
                                 (None, 64)]                                               

In [26]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_multiheadatt_cross_pretrained.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], 
          y=np.asarray(label['train']), batch_size=16, epochs=30,
          validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])],np.asarray(label['valid'])],
          callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross_pretrained.tf/assets


Epoch 2/30



INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_cross_pretrained.tf/assets


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe38a573340>

In [27]:
model = tf.keras.models.load_model('./res/multi_model_multiheadatt_cross_pretrained.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.7276    0.8508    0.7844       248
         Pos     0.6838    0.5714    0.6226       140
         Neu     0.4000    0.2899    0.3361        69

    accuracy                         0.6805       457
   macro avg     0.6038    0.5707    0.5810       457
weighted avg     0.6647    0.6805    0.6671       457

[[211  22  15]
 [ 45  80  15]
 [ 34  15  20]]


In [32]:
# './res/V_model_CNNLSTM_clip.tf'
vis_ipt = Input((10, 512))
v_model = tf.keras.models.load_model('./res/V_model_CNNLSTM_clip.tf/')
v_model = Model(inputs=v_model.inputs, outputs=v_model.layers[-2].output)
for l in v_model.layers:
    l.trainable = False
vis_h = v_model(vis_ipt)

# './res/A_model_Att-BLSTM_wav2vec_v2.tf'
aud_ipt = Input((128,512))
a_model = tf.keras.models.load_model('./res/A_model_Att-BLSTM_wav2vec_v2.tf/')
a_model = Model(inputs=a_model.inputs, outputs=a_model.layers[-2].output)
for l in a_model.layers:
    l.trainable = False
aud_h = a_model(aud_ipt)

# './res/T_model_AttCNN_bert.tf'
tex_ipt = Input((36,768))
t_model = tf.keras.models.load_model('./res/T_model_AttCNN_bert.tf/')
t_model = Model(inputs=t_model.inputs, outputs=[t_model.layers[-4].output,t_model.layers[-3].output])
for l in t_model.layers:
    l.trainable = False
tex_q, tex_qv_attention = t_model(tex_ipt)

h = Concatenate(axis=1)([tf.expand_dims(tex_q,1), tf.expand_dims(tex_qv_attention,1), tf.expand_dims(vis_h,1), tf.expand_dims(aud_h,1)])

h = GlobalSelfAttention(num_heads=4, key_dim=32, dropout=0.2)(h)

# h = Dense(64)(h)
# res = Dense(3, activation='softmax')(h)
res = Dense(3, activation='softmax')(tf.reduce_mean(h, 1))

model = Model(inputs=[vis_ipt, aud_ipt, tex_ipt], outputs=res)
model.compile(optimizer='Adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics='acc')
model.summary()





Model: "model_35"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_27 (InputLayer)          [(None, 36, 768)]    0           []                               
                                                                                                  
 input_25 (InputLayer)          [(None, 10, 512)]    0           []                               
                                                                                                  
 input_26 (InputLayer)          [(None, 128, 512)]   0           []                               
                                                                                                  
 model_34 (Functional)          [(None, 64),         295040      ['input_27[0][0]']               
                                 (None, 64)]                                               

In [33]:
callback_list = [ModelCheckpoint(filepath='./res/multi_model_multiheadatt_global_pretrained.tf', monitor='val_loss', save_best_only=True, save_freq='epoch')]

model.fit(x=[np.asarray(visual_clip['train']), np.asarray(acoustic['train']), np.asarray(bert_embs['train'])], 
          y=np.asarray(label['train']), batch_size=16, epochs=30,
          validation_data=[[np.asarray(visual_clip['valid']), np.asarray(acoustic['valid']), np.asarray(bert_embs['valid'])],np.asarray(label['valid'])],
          callbacks=callback_list)

Epoch 1/30



INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_global_pretrained.tf/assets


INFO:tensorflow:Assets written to: ./res/multi_model_multiheadatt_global_pretrained.tf/assets


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe39b1aeef0>

In [34]:
model = tf.keras.models.load_model('./res/multi_model_multiheadatt_global_pretrained.tf/')
pred = model.predict([np.asarray(visual_clip['test']), np.asarray(acoustic['test']), np.asarray(bert_embs['test'])])
predicted_test_labels = pred.argmax(axis=1)
numeric_test_labels = np.array(label['test'])
            
eval_res = classification_report(numeric_test_labels, predicted_test_labels, 
                                    target_names = ['Neg', 'Pos', 'Neu'], 
                                    digits=4, output_dict=False)

print(eval_res)

cm = confusion_matrix(y_true=numeric_test_labels.tolist(), y_pred=predicted_test_labels.tolist())
print(cm)





              precision    recall  f1-score   support

         Neg     0.7529    0.7742    0.7634       248
         Pos     0.5965    0.7286    0.6559       140
         Neu     0.3871    0.1739    0.2400        69

    accuracy                         0.6696       457
   macro avg     0.5788    0.5589    0.5531       457
weighted avg     0.6498    0.6696    0.6515       457

[[192  48   8]
 [ 27 102  11]
 [ 36  21  12]]
