In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the pre-computed video feature dictionary from disk (the file name suggests
# ResNet features with an equal number of frames per video -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../video_features/resnet_features_same_num_frames.pkl', 'rb') as f:
    video_features_dict = pickle.load(f)

In [3]:
# Read the utterance table; its ids drive the alignment of every modality.
data = pd.read_csv('../datasets/mustard_dataset/sarcasm_with_id.csv')
ids = list(data['id'])

# Keep one entry per id ending in "_1"; the video dictionary is keyed by the id
# with that two-character suffix removed.
video_features = np.array(
    [video_features_dict[sample_id[:-2]] for sample_id in ids if sample_id.endswith("_1")]
)
video_features.shape

(690, 100, 2048)

In [4]:
import h5py

# Load every top-level dataset of the BERT feature file into an in-memory dict of
# NumPy arrays, keyed by dataset name.
# The file is opened explicitly read-only (older h5py releases defaulted to a
# read/write mode when no mode was given) and through a context manager so the
# handle is closed even if reading one of the datasets fails.
with h5py.File("../../bert embeddings/bert_features.h5", "r") as f:
    text_features_dict = {k: np.array(f[k]) for k in f.keys()}

In [5]:
# Collect the text features for every id ending in "_1" (looked up by the full id),
# in the order the ids appear in the table.
text_features = np.array(
    [text_features_dict[sample_id] for sample_id in ids if sample_id.endswith("_1")]
)
text_features.shape

(690, 20, 768)

In [6]:
# Preview the first rows of the utterance table.
data.head()

Unnamed: 0.1,Unnamed: 0,id,utterance,sarcasm
0,0,1_60_1,It's just a privilege to watch your mind at work.,True
1,1,1_60_2,It's just a privilege to look at your mind at ...,True
2,2,1_70_1,I don't think I'll be able to stop thinking ab...,True
3,3,1_70_2,I don't think I can stop thinking about it.,True
4,4,1_80_1,"Since it's not bee season, you can have my epi...",False


In [7]:
# Map every utterance id to its sarcasm flag.
sarcasm = list(data['sarcasm'])
labelDict = dict(zip(ids, sarcasm))

# Integer labels for the ids ending in "_1", in the order the ids appear in the table.
labels = np.array(
    [int(labelDict[sample_id]) for sample_id in ids if sample_id.endswith("_1")]
)
labels.shape

(690,)

In [8]:
import keras.backend as K
from tensorflow.keras.layers import Layer, Dense

class coAttention_para(Layer):
    """
    Parallel co-attention layer (weight names follow the VQA co-attention paper).

    Call it on a list of two tensors ``[tFeature, iFeature]``:
        tFeature: (batch_size, seq_length, embedding_size)
        iFeature: (batch_size, num_img_region, embedding_size)
    Both inputs must share the same last dimension: ``Wb``, ``Wq`` and ``Wv`` are all
    created with ``embedding_size`` rows (taken from the first input) and the two
    attended vectors are added element-wise.

    Output: (batch_size, embedding_size) -- the attention-weighted sum over the text
    positions plus the attention-weighted sum over the image regions.

    Args:
        dim_k: size of the internal projection used to compute the attention scores.
    """
    def __init__(self, dim_k, **kwargs):
        super(coAttention_para, self).__init__(**kwargs)
        self.dim_k = dim_k  # internal tensor dimension
        self.supports_masking = True

    def build(self, input_shape):
        """Validate the two input shapes and create the trainable weights.

        Raises:
            ValueError: if the layer is not called on a list of exactly two inputs,
                or if the two inputs have different (known) last dimensions.
        """
        if not isinstance(input_shape, (list, tuple)):
            raise ValueError('A Co-Attention_para layer should be called '
                             'on a list of inputs.')
        if len(input_shape) != 2:
            raise ValueError('A Co-Attention_para layer should be called on a list of 2 inputs. '
                             'Got ' + str(len(input_shape)) + ' inputs.')
        self.embedding_size = input_shape[0][-1]
        self.num_region = input_shape[1][1]
        self.seq_len = input_shape[0][1]

        # Wv is sized from the text embedding but applied to the image features, so a
        # mismatch would otherwise only surface later as an opaque matmul shape error.
        image_size = input_shape[1][-1]
        if (self.embedding_size is not None and image_size is not None
                and image_size != self.embedding_size):
            raise ValueError('A Co-Attention_para layer requires both inputs to have the same '
                             'last dimension. Got ' + str(self.embedding_size) +
                             ' and ' + str(image_size) + '.')

        # naming variables following the VQA paper
        self.Wb = self.add_weight(name="Wb",
                                  initializer="random_normal",
                                  shape=(self.embedding_size, self.embedding_size),
                                  trainable=True)
        self.Wq = self.add_weight(name="Wq",
                                  initializer="random_normal",
                                  shape=(self.embedding_size, self.dim_k),
                                  trainable=True)
        self.Wv = self.add_weight(name="Wv",
                                  initializer="random_normal",
                                  shape=(self.embedding_size, self.dim_k),
                                  trainable=True)
        self.Whv = self.add_weight(name="Whv",
                                   initializer="random_normal",
                                   shape=(self.dim_k, 1),
                                   trainable=True)
        self.Whq = self.add_weight(name="Whq",
                                   initializer="random_normal",
                                   shape=(self.dim_k, 1),
                                   trainable=True)

        super(coAttention_para, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, inputs, mask=None):
        """Compute the co-attended feature for ``inputs == [tFeature, iFeature]``."""
        tFeature = inputs[0]
        iFeature = inputs[1]

        # affinity matrix C: (batch_size, seq_len, num_region)
        affi_mat = K.dot(tFeature, self.Wb)
        affi_mat = K.batch_dot(affi_mat, K.permute_dimensions(iFeature, (0, 2, 1)))

        # Projections shared by both attention branches; computed once each.
        t_proj = K.dot(tFeature, self.Wq)  # (batch_size, seq_len, dim_k)
        i_proj = K.dot(iFeature, self.Wv)  # (batch_size, num_region, dim_k)

        # Attention over image regions (av) and over text positions (aq).
        Hv = K.tanh(i_proj + K.batch_dot(K.permute_dimensions(affi_mat, (0, 2, 1)), t_proj))
        av = K.softmax(K.squeeze(K.dot(Hv, self.Whv), axis=-1))

        Hq = K.tanh(t_proj + K.batch_dot(affi_mat, i_proj))
        aq = K.softmax(K.squeeze(K.dot(Hq, self.Whq), axis=-1))

        # Repeat each attention weight across the embedding axis so it can scale the features.
        av = K.permute_dimensions(K.repeat(av, self.embedding_size), (0, 2, 1))
        aq = K.permute_dimensions(K.repeat(aq, self.embedding_size), (0, 2, 1))

        tfeature = K.sum(aq * tFeature, axis=1)
        ifeature = K.sum(av * iFeature, axis=1)

        return tfeature+ifeature

    def get_config(self):
        """Return the layer config including ``dim_k`` so the layer can be re-created from it."""
        config = super(coAttention_para, self).get_config()
        config.update({"dim_k": self.dim_k})
        return config

    def compute_mask(self, inputs, mask=None):
        # The output has no sequence axis, so no mask is propagated.
        return None

    def compute_output_shape(self, input_shape):
        # (batch_size, embedding_size), both taken from the first (text) input.
        output_shape = (input_shape[0][0], input_shape[0][-1])
        return output_shape
    
# Smoke test: run the layer on all-ones dummy inputs (batch of 32; 20 text steps and
# 100 image regions, both 768-dimensional) and display the resulting output shape.
T = np.ones((32,20,768))
V = np.ones((32,100,768))
coAttention_para(300)([T,V]).shape

TensorShape([32, 768])

In [9]:
from tensorflow.keras.layers import Input, Concatenate, Dropout, Dense, LSTM
from tensorflow.keras.models import Model
import tensorflow as tf

In [10]:
def myModel(video=True, text=True, video_shape=(100, 2048), text_shape=(20, 768)):
    """
    Build and compile the sarcasm classifier for the selected modalities.

    * video and text: each input goes through its own sequence-returning LSTM(768) and
      the two sequences are fused by ``coAttention_para(300)``.
    * video only / text only: a single LSTM(768) returning its last state.

    The fused vector then passes through Dense(128) -> Dense(64) (both with the default
    linear activation) -> Dropout(0.5) -> a single sigmoid unit.

    Args:
        video: include the video input.
        text: include the text input.
        video_shape: per-sample shape of the video input, (frames, feature_dim).
        text_shape: per-sample shape of the text input, (tokens, feature_dim).

    Returns:
        A compiled Keras ``Model``. Its inputs are ordered [video, text] when both are
        enabled, otherwise it has the single enabled input.

    Raises:
        ValueError: if both ``video`` and ``text`` are False.
    """
    if not (video or text):
        raise ValueError("Model can't have 0 inputs")

    finalInputs = []

    if video and text:
        V = Input(shape=video_shape)
        T = Input(shape=text_shape)

        Tf = LSTM(768, return_sequences=True)(T)
        Vf = LSTM(768, return_sequences=True)(V)

        f = coAttention_para(300)([Tf, Vf])

        finalInputs.append(V)
        finalInputs.append(T)

    elif video:
        V = Input(shape=video_shape)
        f = LSTM(768, return_sequences=False)(V)
        finalInputs.append(V)

    else:
        # Text only. The original hard-coded the video feature size (2048) here, which
        # does not match the 768-dimensional text features used everywhere else.
        T = Input(shape=text_shape)
        f = LSTM(768, return_sequences=False)(T)
        finalInputs.append(T)

    f = Dense(128)(f)
    f = Dense(64)(f)
    dropout = Dropout(0.5)(f)
    output = Dense(1, activation="sigmoid", use_bias=True)(dropout)

    model = Model(inputs=finalInputs, outputs=output)
    # Metric order matters to callers: evaluate() returns
    # [loss, binary_accuracy, precision, recall].
    model.compile(
        optimizer="adam",
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()
        ])
    return model

# Build the two-modality model once to inspect its architecture, then release it.
model = myModel()
# summary() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line.
model.summary()

del model

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 20, 768)]    0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 100, 2048)]  0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 20, 768)      4721664     ['input_2[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 100, 768)     8653824     ['input_1[0][0]']                
                                                                                              

In [17]:
# Hold-out split. The single-split training and evaluation cells further down read
# x_train_video, x_valid_video, x_train_text, x_valid_text, y_train_video and
# y_valid_video, which are only assigned here, so this cell has to be active for the
# notebook to run from a fresh kernel.
from sklearn.utils import shuffle

RANDOM_STATE = 50  # fixed seed: both calls below shuffle equally long inputs identically


def create_train_valid(features, labels, train_fraction=0.7, max_valid=1000):
    """
    Shuffle ``features``/``labels`` together and split them into train and validation parts.

    Args:
        features: indexable collection of samples.
        labels: labels aligned with ``features``.
        train_fraction: fraction of samples assigned to the training part.
        max_valid: upper bound on the number of validation samples; when the fraction
            would leave more than this, the surplus goes to the training part.

    Returns:
        (train_features, valid_features, train_labels, valid_labels) as NumPy arrays.
    """
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    train_end = max(int(train_fraction * len(labels)), len(labels) - max_valid)

    train_features = np.asarray(features[:train_end])
    valid_features = np.asarray(features[train_end:])

    train_labels = np.asarray(labels[:train_end])
    valid_labels = np.asarray(labels[train_end:])

    return train_features, valid_features, train_labels, valid_labels


x_train_video, x_valid_video, y_train_video, y_valid_video = create_train_valid(video_features, labels)
x_train_text, x_valid_text, y_train_text, y_valid_text = create_train_valid(text_features, labels)

In [11]:
# Training hyper-parameters shared by the training cells below.
EPOCHS = 20  # number of single-epoch fit/evaluate rounds per training run
BATCH_SIZE = 64  # batch size passed to both model.fit and model.evaluate

In [14]:
from sklearn.model_selection import KFold

def getF1Score(pre, rec):
    """Return the harmonic mean of precision ``pre`` and recall ``rec`` (0 if either is 0)."""
    if pre != 0 and rec != 0:
        return (2*pre*rec)/(pre+rec)
    # At least one of the two scores is zero, so F1 is defined as 0.
    return 0

def kFoldResults(video_feat, text_feat, labels, video=True, text=True, n_splits=5):
    """
    Train and evaluate a fresh model on each of ``n_splits`` K-fold splits.

    For every fold a new ``myModel(video, text)`` is trained for EPOCHS rounds of one
    epoch each. After each round it is evaluated on the held-out fold and, whenever the
    F1 score improves, the weights are written to "model_v2_weights_<fold>.h5" (with a
    0-based fold index).

    Note: the checkpoint is selected on the same fold it is scored on, so the recorded
    best scores are optimistic estimates.

    Args:
        video_feat: video features, indexable by sample along the first axis
            (only read when ``video`` is True).
        text_feat: text features, indexable by sample along the first axis
            (only read when ``text`` is True).
        labels: one label per sample.
        video: feed the video features to the model.
        text: feed the text features to the model.
        n_splits: number of folds.

    Returns:
        A list with one dict per fold holding the metrics of the best epoch:
        ``fold`` (1-based), ``best_epoch``, ``f1``, ``loss``, ``binary_accuracy``,
        ``precision`` and ``recall``. The metric entries stay None (and ``f1`` 0)
        when no epoch reached an F1 above 0.
    """
    labels = np.asarray(labels)
    # Convert once so each fold can be selected with fancy indexing instead of a
    # per-sample Python loop.
    video_arr = np.asarray(video_feat) if video else None
    text_arr = np.asarray(text_feat) if text else None

    kf = KFold(n_splits)
    results = []

    # KFold only uses the number of samples, so splitting on the labels yields the
    # same folds and does not require an unused modality to be supplied.
    for count, (train_index, test_index) in enumerate(kf.split(labels)):
        print("Fold " + str(count + 1) + ":\n")
        model = myModel(video, text)

        train_x = []
        test_x = []

        # Input order must match myModel: video first, then text.
        if video:
            train_x.append(video_arr[train_index].astype('float64', copy=False))
            test_x.append(video_arr[test_index].astype('float64', copy=False))
        if text:
            train_x.append(text_arr[train_index].astype('float64', copy=False))
            test_x.append(text_arr[test_index].astype('float64', copy=False))

        train_y = labels[train_index]
        test_y = labels[test_index]

        best = {"fold": count + 1, "best_epoch": None, "f1": 0, "loss": None,
                "binary_accuracy": None, "precision": None, "recall": None}

        for epoch in range(1, EPOCHS+1):
            model.fit(x=train_x,
                      y=train_y,
                      batch_size=BATCH_SIZE,
                      epochs=1,
                      verbose=2)

            # evaluate() returns [loss, binary_accuracy, precision, recall]
            loss, accuracy, precision, recall = model.evaluate(
                x=test_x,
                y=test_y,
                batch_size=BATCH_SIZE
            )

            f1 = getF1Score(precision, recall)

            if f1 > best["f1"]:
                best.update({"best_epoch": epoch, "f1": f1, "loss": loss,
                             "binary_accuracy": accuracy, "precision": precision,
                             "recall": recall})
                model.save_weights("model_v2_weights_" + str(count) + ".h5")

        results.append(best)

    return results

In [None]:
# Run the cross-validation with the defaults (video=True, text=True, n_splits=5).
kFoldResults(video_features, text_features, labels)

Fold 1:

2
(552, 100, 2048)
(552, 20, 768)
9/9 - 93s - loss: 2.2719 - binary_accuracy: 0.4891 - precision_2: 0.4800 - recall_2: 0.4871 - 93s/epoch - 10s/step


In [None]:
# Single hold-out training run: train one epoch at a time, evaluate on the validation
# split after every epoch and keep the weights of the best epoch.
# NOTE(review): x_train_video, x_train_text, y_train_video, x_valid_video, x_valid_text
# and y_valid_video are only assigned in the create_train_valid hold-out split cell;
# that cell must be active and executed before this one.
model = myModel()
# Best validation scores so far: s1 = precision, s2 = recall, s3 = binary accuracy.
s1 = s2 = s3 = 0

for i in range(1, EPOCHS+1):
    history = model.fit(
        x=[x_train_video, x_train_text],
        y=y_train_video,
        batch_size=BATCH_SIZE,
        epochs=1,
        verbose=2,
    )

    a = model.evaluate(x=[x_valid_video, x_valid_text], y=y_valid_video, batch_size=BATCH_SIZE)

    # a == [loss, binary_accuracy, precision, recall]. Rank epochs lexicographically by
    # precision, then recall, then accuracy; only a strict improvement is saved.
    if (a[2], a[3], a[1]) > (s1, s2, s3):
        s3, s1, s2 = a[1:]
        model.save_weights("model_v2_weights.h5")

In [None]:
# Rebuild the two-input model, restore the weights saved to "model_v2_weights.h5" and
# re-evaluate them on the validation split.
# NOTE(review): x_valid_video, x_valid_text and y_valid_video are only assigned in the
# create_train_valid hold-out split cell; that cell must be active and executed first.
newModel = myModel()
newModel.load_weights("model_v2_weights.h5")

# Displays [loss, binary_accuracy, precision, recall] (metric order from myModel's compile call).
newModel.evaluate(
    x=[x_valid_video, x_valid_text], 
    y=y_valid_video,
    batch_size=BATCH_SIZE
)

In [None]:
# Previously recorded evaluate() output, presumably [loss, binary_accuracy, precision, recall]:
# [0.6902378797531128, 0.6634615659713745, 0.737500011920929, 0.5462962985038757]