In [1]:
import numpy as np
import pandas as pd
import pickle

In [3]:
import pickle

# Load the pre-extracted per-video ResNet features.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only use this with a feature file you produced or otherwise trust.
with open('../../video_features/resnet_features_same_num_frames.pkl', 'rb') as features_file:
    video_features_dict = pickle.load(features_file)

In [6]:
# Utterance metadata: one row per utterance id, with its text and sarcasm flag.
data = pd.read_csv('../../datasets/mustard_dataset/sarcasm_with_id.csv')

ids = list(data['id'])

# Keep only the utterances whose id ends in "_1". The video feature dictionary
# is keyed by the id with that two-character suffix removed.
video_features = np.array(
    [video_features_dict[utt_id[:-2]] for utt_id in ids if utt_id.endswith("_1")]
)
video_features.shape

(690, 100, 2048)

In [7]:
import h5py

# Read every dataset of the BERT feature file into memory, keyed by dataset name.
# The file is opened explicitly read-only (older h5py releases default to a
# writable mode when no mode is given) and inside a context manager so the
# handle is closed even if reading one of the datasets raises.
with h5py.File("../../bert embeddings/bert_features.h5", "r") as f:
    text_features_dict = {key: np.array(f[key]) for key in f.keys()}

In [8]:
# Stack the text features of the "_1" utterances, in the same order as `ids`.
# Unlike the video dictionary, the text dictionary is keyed by the full id.
text_features = np.array(
    [text_features_dict[utt_id] for utt_id in ids if utt_id.endswith("_1")]
)
text_features.shape

(690, 20, 768)

In [9]:
# Preview the first rows of the utterance table (id, utterance text, sarcasm flag).
data.head()

Unnamed: 0.1,Unnamed: 0,id,utterance,sarcasm
0,0,1_60_1,It's just a privilege to watch your mind at work.,True
1,1,1_60_2,It's just a privilege to look at your mind at ...,True
2,2,1_70_1,I don't think I'll be able to stop thinking ab...,True
3,3,1_70_2,I don't think I can stop thinking about it.,True
4,4,1_80_1,"Since it's not bee season, you can have my epi...",False


In [10]:
# Map every utterance id to its sarcasm flag.
sarcasm = list(data['sarcasm'])
labelDict = dict(zip(ids, sarcasm))

# Integer labels (True -> 1, False -> 0) for the "_1" utterances, in the same
# order as `ids`.
labels = np.array(
    [int(labelDict[utt_id]) for utt_id in ids if utt_id.endswith("_1")]
)
labels.shape

(690,)

In [14]:
import keras.backend as K
from tensorflow.keras.layers import Layer, Dense

class coAttention_para(Layer):
    """
    Parallel co-attention layer (weight names follow the VQA co-attention paper).

    inputs: a list of two tensors [tFeature, iFeature]
        tFeature: (batch_size, seq_length, embedding_size)
        iFeature: (batch_size, num_img_region, embedding_size)
      Both inputs must share the same last dimension: every weight is sized
      from the last dimension of tFeature, Wv is applied to iFeature, and the
      two attention-pooled vectors are added together at the end.
    outputs: (batch_size, embedding_size)
      sum of the attention-pooled tFeature and the attention-pooled iFeature.

    dim_k is the width of the internal projection used to score attention.
    """
    def __init__(self, dim_k, **kwargs):
        super(coAttention_para, self).__init__(**kwargs)
        self.dim_k = dim_k  # internal tensor dimension
        self.supports_masking = True

    def build(self, input_shape):
        """Validate the two input shapes and create the trainable weights.

        Raises ValueError when the layer is not called on a list of exactly
        two inputs, or when the two inputs have different (known) last
        dimensions.
        """
        if not isinstance(input_shape, list):
            raise ValueError('A Co-Attention_para layer should be called '
                             'on a list of inputs.')
        if len(input_shape) != 2:
            raise ValueError('A Co-Attention_para layer should be called on a list of 2 inputs. '
                             'Got ' + str(len(input_shape)) + ' inputs.')
        self.embedding_size = input_shape[0][-1]
        self.num_region = input_shape[1][1]
        self.seq_len = input_shape[0][1]

        # Wv (embedding_size x dim_k) is multiplied with iFeature in call(), so a
        # mismatch would otherwise surface later as an opaque matmul shape error.
        region_size = input_shape[1][-1]
        if (self.embedding_size is not None and region_size is not None
                and region_size != self.embedding_size):
            raise ValueError('A Co-Attention_para layer requires both inputs to have the same '
                             'last dimension. Got ' + str(self.embedding_size) + ' and '
                             + str(region_size) + '.')

        # naming variables following the VQA paper
        self.Wb = self.add_weight(name="Wb",
                                  initializer="random_normal",
                                  shape=(self.embedding_size, self.embedding_size),
                                  trainable=True)
        self.Wq = self.add_weight(name="Wq",
                                  initializer="random_normal",
                                  shape=(self.embedding_size, self.dim_k),
                                  trainable=True)
        self.Wv = self.add_weight(name="Wv",
                                  initializer="random_normal",
                                  shape=(self.embedding_size, self.dim_k),
                                  trainable=True)
        self.Whv = self.add_weight(name="Whv",
                                   initializer="random_normal",
                                   shape=(self.dim_k, 1),
                                   trainable=True)
        self.Whq = self.add_weight(name="Whq",
                                   initializer="random_normal",
                                   shape=(self.dim_k, 1),
                                   trainable=True)

        super(coAttention_para, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, inputs, mask=None):
        """Return the co-attended feature; the `mask` argument is not used."""
        tFeature = inputs[0]
        iFeature = inputs[1]
        # affinity matrix C
        affi_mat = K.dot(tFeature, self.Wb)
        affi_mat = K.batch_dot(affi_mat, K.permute_dimensions(iFeature, (0, 2, 1)))  # (batch_size, seq_len, num_region)

        # Each projection feeds both Hv and Hq, so compute it only once.
        t_proj = K.dot(tFeature, self.Wq)  # (batch_size, seq_len, dim_k)
        i_proj = K.dot(iFeature, self.Wv)  # (batch_size, num_region, dim_k)

        # Hv / av: attention weights over the iFeature positions
        Hv = K.tanh(i_proj + K.batch_dot(K.permute_dimensions(affi_mat, (0, 2, 1)), t_proj))
        av = K.softmax(K.squeeze(K.dot(Hv, self.Whv), axis=-1))

        # Hq / aq: attention weights over the tFeature positions
        Hq = K.tanh(t_proj + K.batch_dot(affi_mat, i_proj))
        aq = K.softmax(K.squeeze(K.dot(Hq, self.Whq), axis=-1))

        # Copy each attention weight across the embedding axis so it can scale
        # the matching feature vector element-wise.
        av = K.permute_dimensions(K.repeat(av, self.embedding_size), (0, 2, 1))
        aq = K.permute_dimensions(K.repeat(aq, self.embedding_size), (0, 2, 1))

        # Attention-weighted sums over the position axis, then fuse by addition.
        tfeature = K.sum(aq * tFeature, axis=1)
        ifeature = K.sum(av * iFeature, axis=1)

        return tfeature+ifeature

    def get_config(self):
        """Return the layer config, including `dim_k`.

        `dim_k` is a required constructor argument, so it has to be part of
        the config for the layer to be re-created from it.
        """
        config = super(coAttention_para, self).get_config()
        config.update({"dim_k": self.dim_k})
        return config

    def compute_mask(self, inputs, mask=None):
        # The position axis is reduced away in call(), so no mask is propagated.
        return None

    def compute_output_shape(self, input_shape):
        # (batch_size, embedding_size), both taken from the first input.
        output_shape = (input_shape[0][0], input_shape[0][-1])
        return output_shape
    
# Smoke test: run the layer on constant inputs and display the output shape.
T = np.ones(shape=(32, 20, 768))
V = np.ones(shape=(32, 100, 768))
coAttention_para(dim_k=300)([T, V]).shape

TensorShape([32, 768])

In [15]:
from tensorflow.keras.layers import Input, Concatenate, Dropout, Dense
from tensorflow.keras.models import Model

In [16]:
import tensorflow as tf
def myModel():
    """Build and compile the video/text co-attention binary classifier.

    The model takes two inputs, in this order: video features of shape
    (100, 2048) and text features of shape (20, 768). The video features are
    projected to 768 dimensions with a Dense layer, fused with the text
    features by `coAttention_para(300)`, passed through Dropout(0.5) and
    mapped to a single sigmoid unit.

    Returns the model compiled with Adam, binary cross-entropy and the
    binary accuracy, precision and recall metrics (in that order).
    """
    video_in = Input(shape=(100, 2048))
    text_in = Input(shape=(20, 768))

    # Bring the video features to the same width as the text features, which
    # is what the co-attention layer is fed with.
    video_proj = Dense(768)(video_in)

    fused = coAttention_para(300)([text_in, video_proj])
    regularised = Dropout(0.5)(fused)
    prediction = Dense(1, activation="sigmoid", use_bias=True)(regularised)

    classifier = Model(inputs=[video_in, text_in], outputs=prediction)
    classifier.compile(
        optimizer="adam",
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
        ],
    )
    return classifier

# Build a throwaway model just to inspect its architecture.
model = myModel()
# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

# Drop the throwaway instance; the training cells build their own model.
del model

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100, 2048)]  0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 20, 768)]    0           []                               
                                                                                                  
 dense_1 (Dense)                (None, 100, 768)     1573632     ['input_1[0][0]']                
                                                                                                  
 co_attention_para_1 (coAttenti  (None, 768)         1051224     ['input_2[0][0]',                
 on_para)                                                         'dense_1[0][0]']            

In [17]:
from sklearn.utils import shuffle
# Seed passed to every shuffle performed by create_train_valid.
RANDOM_STATE = 50

def create_train_valid(features,labels,train_fraction = 0.7,max_valid=1000):
    """Shuffle features and labels together and split them into train/validation.

    The shuffle is seeded with RANDOM_STATE. The training part holds the first
    `train_fraction` of the samples, or more when that would leave over
    `max_valid` samples for validation (the validation part is capped at
    `max_valid` samples).

    Returns (train_features, valid_features, train_labels, valid_labels),
    each as a NumPy array.
    """
    shuffled_features, shuffled_labels = shuffle(features, labels, random_state=RANDOM_STATE)

    n_samples = len(shuffled_labels)
    split_at = max(int(train_fraction * n_samples), n_samples - max_valid)

    train_features, valid_features = (
        np.asarray(shuffled_features[:split_at]),
        np.asarray(shuffled_features[split_at:]),
    )
    train_labels, valid_labels = (
        np.asarray(shuffled_labels[:split_at]),
        np.asarray(shuffled_labels[split_at:]),
    )

    return train_features, valid_features, train_labels, valid_labels


# Split each modality with its own call. Both calls shuffle with the same
# RANDOM_STATE and the same `labels`; the training cells pair the video and
# text arrays row by row, so they depend on both calls producing the same order.
x_train_video, x_valid_video, y_train_video, y_valid_video = create_train_valid(
    features=video_features, labels=labels
)
x_train_text, x_valid_text, y_train_text, y_valid_text = create_train_valid(
    features=text_features, labels=labels
)

In [20]:
# Number of single-epoch fit() calls made by each training loop below.
EPOCHS = 20
# Batch size used for both fit() and evaluate().
BATCH_SIZE = 64

In [21]:
# Train one epoch at a time and keep the weights of the epoch with the highest
# validation accuracy, measured on the 10% split that fit() holds out.
model = myModel()
best_val_accuracy = 0

for epoch in range(1, EPOCHS + 1):
    history = model.fit(
        x=[x_train_video, x_train_text],
        y=y_train_video,
        batch_size=BATCH_SIZE,
        epochs=1,
        verbose=2,
        validation_split=0.1,
    )

    # Each fit() call runs a single epoch, so the history holds one value.
    epoch_val_accuracy = history.history["val_binary_accuracy"][0]
    if epoch_val_accuracy > best_val_accuracy:
        best_val_accuracy = epoch_val_accuracy
        model.save_weights("model_v2_weights.h5")

7/7 - 5s - loss: 0.8011 - binary_accuracy: 0.5335 - precision_2: 0.5294 - recall_2: 0.5442 - val_loss: 0.7035 - val_binary_accuracy: 0.6531 - val_precision_2: 0.6667 - val_recall_2: 0.4545 - 5s/epoch - 697ms/step
7/7 - 3s - loss: 0.6218 - binary_accuracy: 0.6767 - precision_2: 0.6984 - recall_2: 0.6140 - val_loss: 0.7192 - val_binary_accuracy: 0.5306 - val_precision_2: 0.4815 - val_recall_2: 0.5909 - 3s/epoch - 485ms/step
7/7 - 4s - loss: 0.6489 - binary_accuracy: 0.7090 - precision_2: 0.7014 - recall_2: 0.7209 - val_loss: 0.6978 - val_binary_accuracy: 0.6327 - val_precision_2: 0.6111 - val_recall_2: 0.5000 - 4s/epoch - 597ms/step
7/7 - 4s - loss: 0.6138 - binary_accuracy: 0.7344 - precision_2: 0.7427 - recall_2: 0.7116 - val_loss: 0.7110 - val_binary_accuracy: 0.6327 - val_precision_2: 0.5833 - val_recall_2: 0.6364 - 4s/epoch - 590ms/step
7/7 - 4s - loss: 0.5670 - binary_accuracy: 0.7390 - precision_2: 0.7277 - recall_2: 0.7581 - val_loss: 0.7756 - val_binary_accuracy: 0.6531 - val_pr

In [29]:
# Train one epoch at a time on the full training split and score every epoch
# on the hold-out split. A checkpoint is written whenever the epoch beats the
# best one so far, ranking by precision first, then recall, then accuracy.
model = myModel()
best_precision = 0
best_recall = 0
best_accuracy = 0

for epoch in range(1, EPOCHS + 1):
    history = model.fit(
        x=[x_train_video, x_train_text],
        y=y_train_video,
        batch_size=BATCH_SIZE,
        epochs=1,
        verbose=2,
    )

    # evaluate() returns [loss, binary_accuracy, precision, recall], matching
    # the metric order given to compile() in myModel().
    _, accuracy, precision, recall = model.evaluate(
        x=[x_valid_video, x_valid_text],
        y=y_valid_video,
        batch_size=BATCH_SIZE,
    )

    # Tuples compare lexicographically, which is exactly the tie-breaking order
    # precision -> recall -> accuracy.
    if (precision, recall, accuracy) > (best_precision, best_recall, best_accuracy):
        best_accuracy, best_precision, best_recall = accuracy, precision, recall
        model.save_weights("model_v2_weights.h5")

8/8 - 15s - loss: 1.0543 - binary_accuracy: 0.5207 - precision_9: 0.5143 - recall_9: 0.4557 - 15s/epoch - 2s/step
8/8 - 11s - loss: 0.8077 - binary_accuracy: 0.6224 - precision_9: 0.6291 - recall_9: 0.5654 - 11s/epoch - 1s/step
8/8 - 9s - loss: 0.7712 - binary_accuracy: 0.6556 - precision_9: 0.6485 - recall_9: 0.6540 - 9s/epoch - 1s/step
8/8 - 14s - loss: 0.6766 - binary_accuracy: 0.7116 - precision_9: 0.6929 - recall_9: 0.7426 - 14s/epoch - 2s/step
8/8 - 17s - loss: 0.6941 - binary_accuracy: 0.7012 - precision_9: 0.6914 - recall_9: 0.7089 - 17s/epoch - 2s/step
8/8 - 12s - loss: 0.6559 - binary_accuracy: 0.7178 - precision_9: 0.7095 - recall_9: 0.7215 - 12s/epoch - 1s/step
8/8 - 11s - loss: 0.7215 - binary_accuracy: 0.7033 - precision_9: 0.7217 - recall_9: 0.6456 - 11s/epoch - 1s/step
8/8 - 10s - loss: 0.4670 - binary_accuracy: 0.7884 - precision_9: 0.7897 - recall_9: 0.7764 - 10s/epoch - 1s/step
8/8 - 8s - loss: 0.5029 - binary_accuracy: 0.7905 - precision_9: 0.7833 - recall_9: 0.7932

In [30]:
# Rebuild the architecture from scratch and restore the saved checkpoint.
newModel = myModel()
newModel.load_weights("model_v2_weights.h5")

# Score the restored weights on the hold-out split.
newModel.evaluate(x=[x_valid_video, x_valid_text], y=y_valid_video, batch_size=BATCH_SIZE)



[0.7499933838844299,
 0.6394230723381042,
 0.7796609997749329,
 0.42592594027519226]

In [None]:
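# Evaluation output noted down from another run, presumably in the same
# [loss, binary_accuracy, precision, recall] order as evaluate() above
# (TODO: record which run/checkpoint produced it):
# [0.6902378797531128, 0.6634615659713745, 0.737500011920929, 0.5462962985038757]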