In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the pickled per-video ResNet features.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('../../video_features/resnet_features.pkl', 'rb') as f:
    video_features_dict = pickle.load(f)

# Convert every entry to a NumPy array so `.shape` is available downstream.
# The loop variable keeps the name `k`: it stays bound after the loop and a
# later cell reads it.
for k, frames in video_features_dict.items():
    video_features_dict[k] = np.array(frames)

In [3]:
# Number of frames in each video, summarised as min / quartiles / max.
a = [frames.shape[0] for frames in video_features_dict.values()]
np.percentile(a, [0, 25, 50, 75, 100])

array([  9.,  72., 120., 168., 480.])

In [10]:
import random
def getUpdatedFeatures(a, nf):
    """Return a copy of ``a`` resampled along axis 0 to exactly ``nf`` rows.

    When ``a`` has too few rows, randomly chosen rows are duplicated next to
    their source; when it has too many, randomly chosen rows are dropped.
    Either way the relative order of the remaining rows is preserved.
    Positions are drawn from the global ``random`` module, so call
    ``random.seed`` beforehand for a reproducible result.

    Args:
        a: Array-like whose first axis is the one being resized.
        nf: Target number of rows; a non-negative integer.

    Returns:
        A new ``numpy.ndarray`` with ``nf`` rows. ``a`` is not modified.

    Raises:
        ValueError: If ``nf`` is negative, if ``a`` is zero-dimensional, or
            if ``a`` has no rows while ``nf`` is positive (nothing to
            duplicate).
    """
    b = np.array(a)  # always a fresh copy; also lets plain lists through
    if b.ndim == 0:
        raise ValueError("a must have at least one dimension")
    if nf < 0:
        raise ValueError("nf must be non-negative, got %r" % (nf,))
    if b.shape[0] == 0 and nf > 0:
        raise ValueError("cannot grow an input with no rows to nf=%r" % (nf,))

    # Too short: duplicate a random row, inserting the copy at its own index.
    while b.shape[0] < nf:
        i = random.randint(0, b.shape[0] - 1)
        b = np.insert(b, i, b[i], axis=0)

    # Too long: drop a random row.
    while b.shape[0] > nf:
        i = random.randint(0, b.shape[0] - 1)
        b = np.delete(b, i, axis=0)

    return b

# Target number of frames per video: every clip is padded or trimmed to this
# length. It has to agree with the frame axis of the model's video input.
numFrames = 100
# Smoke test on one video (`k` is the key left bound by the loading loop).
getUpdatedFeatures(video_features_dict[k], numFrames).shape

(100, 2048)

In [11]:
# Resample every video to the common frame count, keyed exactly as the input.
video_features_dict2 = {
    video_id: getUpdatedFeatures(frames, numFrames)
    for video_id, frames in video_features_dict.items()
}

In [12]:
# Persist the fixed-length features. The context manager closes the file even
# if pickling raises part-way through.
with open('../../video_features/resnet_features_same_num_frames.pkl', 'wb') as output:
    pickle.dump(video_features_dict2, output)

In [30]:
data = pd.read_csv('../../datasets/mustard_dataset/sarcasm_with_id.csv')
ids = list(data['id'])

# Keep one feature array per id that ends in "_1"; the video key is the id
# with those last two characters removed.
video_features = np.array(
    [video_features_dict2[sample_id[:-2]] for sample_id in ids if sample_id[-2:] == "_1"]
)
video_features.shape

(690, 100, 2048)

In [13]:
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('../../bert embeddings/sarcasm_aug_embeddings.pkl', 'rb') as f:
    text_features_dict = pickle.load(f)

# Shape of the first stored embedding, as a sanity check.
first_key = list(text_features_dict)[0]
text_features_dict[first_key].shape

(768,)

In [14]:
# Walk the text embeddings in one fixed order so row i of both arrays refers
# to the same sample. The video key is the text key minus its last two
# characters.
sample_keys = list(text_features_dict)

video_features = np.array([video_features_dict2[key[:-2]] for key in sample_keys])
text_features = np.array([text_features_dict[key] for key in sample_keys])
video_features.shape, text_features.shape

((1322, 100, 2048), (1322, 768))

In [17]:
data = pd.read_csv('../../datasets/mustard_dataset/sarcasm_with_id.csv')
idToLabel = {}  # never populated in this cell

ids = list(data['id'])
lab = [int(value) for value in data['sarcasm']]

# Sample id -> integer sarcasm label (a repeated id keeps its last label).
iToL = dict(zip(ids, lab))

# Labels in the same iteration order as the text embeddings.
labels = np.array([iToL[key] for key in text_features_dict])
labels.shape

(1322,)

In [18]:
import keras.backend as K
from tensorflow.keras.layers import Layer, Dense

class TextBasedAttention(Layer):
    """Pool video frames into one vector using text-conditioned frame weights.

    Called on a list ``[V, T]``.

    Inputs:
        V: (batch, nf, vd) per-frame video features
        T: (batch, td) text embedding
    Outputs:
        Vp: (batch, vd) weighted sum of the frames of V
    """
    def __init__(self, **kwargs):
        super(TextBasedAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape is [shape of V, shape of T].
        # Bilinear weight of shape (vd, td) relating frame features to text.
        self.w = self.add_weight(name='w',
                                   shape=(input_shape[0][-1], input_shape[1][-1]),
                                   initializer='random_normal',
                                   trainable=True)
        # Maps the nf raw frame scores to nf softmax-normalised weights.
        self.dense = Dense(input_shape[0][-2], activation='softmax')
        super(TextBasedAttention, self).build(input_shape)

    def call(self, x, mask=None):
        # `mask` is accepted for API compatibility but is not used.
        V = x[0]
        T = K.expand_dims(x[1], -1)                              # (batch, td, 1)

        # Project the text into the video feature space: (batch, vd, 1).
        p1 = K.permute_dimensions(K.dot(self.w,T), (1, 0, 2))
        # One raw score per frame: (batch, nf).
        prod = K.squeeze(K.batch_dot(V, p1), axis=-1)

        scores = self.dense(prod)                                # (batch, nf)
        # Repeat each frame weight across the feature axis: (batch, nf, vd).
        scores = K.permute_dimensions(K.repeat(scores, V.shape[-1]), (0, 2, 1))

        # Weighted sum over the frame axis: (batch, vd).
        vp = K.sum(V*scores, axis=-2)

        return vp

    def compute_output_shape(self, input_shape):
        # The frame axis is summed out, leaving (batch, vd).
        return (input_shape[0][0], input_shape[0][-1])

    def get_config(self):
        return super(TextBasedAttention, self).get_config()
    
# Smoke test: 32 samples of 100 frames x 2048 features plus 768-d text.
V = np.ones((32, 100, 2048))
T = np.ones((32, 768))
TextBasedAttention()([V, T]).shape

TensorShape([32, 2048])

In [19]:
from tensorflow.keras.layers import Input, Concatenate, Dropout, Dense
from tensorflow.keras.models import Model

In [20]:
import tensorflow as tf
def myModel(num_frames=100, video_dim=2048, text_dim=768, dropout_rate=0.5):
    """Build and compile the text-attended video + text binary classifier.

    The video frames are pooled by ``TextBasedAttention``, concatenated with
    the text embedding, passed through dropout and fed to a single sigmoid
    unit.

    Args:
        num_frames: Number of frames per video in the video input.
        video_dim: Feature size of each video frame.
        text_dim: Size of the text embedding.
        dropout_rate: Dropout rate applied to the fused features.

    Returns:
        A compiled Keras ``Model`` taking ``[video, text]`` inputs. Metrics
        are registered in the order binary accuracy, precision, recall.
    """
    V = Input(shape=(num_frames, video_dim))
    T = Input(shape=(text_dim,))

    # Text-conditioned pooling over the frame axis.
    Vp = TextBasedAttention()([V, T])

    f = Concatenate()([Vp, T])
    dropout = Dropout(dropout_rate)(f)
    output = Dense(1, activation="sigmoid", use_bias=True)(dropout)

    model = Model(inputs=[V, T], outputs=output)
    model.compile(
        optimizer="adam",
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()
        ])
    return model

# Build a throwaway model only to inspect its architecture.
model = myModel()
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None".
model.summary()

del model

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100, 2048)]  0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 768)]        0           []                               
                                                                                                  
 text_based_attention_1 (TextBa  (None, 2048)        1582964     ['input_1[0][0]',                
 sedAttention)                                                    'input_2[0][0]']                
                                                                                                  
 concatenate (Concatenate)      (None, 2816)         0           ['text_based_attention_1[0][0

In [21]:
from sklearn.utils import shuffle
RANDOM_STATE = 50

def create_train_valid(features,labels,train_fraction = 0.7,max_valid=1000):
    """Shuffle ``features`` and ``labels`` together and split them in two.

    The shuffle is seeded with the module-level ``RANDOM_STATE``.

    Args:
        features: Indexable collection of samples.
        labels: Labels aligned with ``features``.
        train_fraction: Minimum share of the samples used for training.
        max_valid: Upper bound on the number of validation samples.

    Returns:
        ``(train_features, valid_features, train_labels, valid_labels)``,
        each converted with ``np.asarray``.
    """
    shuffled_x, shuffled_y = shuffle(features, labels, random_state=RANDOM_STATE)

    n_total = len(shuffled_y)
    # Train on at least `train_fraction` of the data, and on more when that is
    # needed to keep validation at or below `max_valid` samples.
    cut = max(int(train_fraction * n_total), n_total - max_valid)

    train_part = slice(None, cut)
    valid_part = slice(cut, None)

    return (
        np.asarray(shuffled_x[train_part]),
        np.asarray(shuffled_x[valid_part]),
        np.asarray(shuffled_y[train_part]),
        np.asarray(shuffled_y[valid_part]),
    )


# Both calls shuffle with the same RANDOM_STATE, which is what keeps the video
# and text rows aligned after the split.
# NOTE(review): text keys are mapped to videos by stripping a two-character
# suffix, so several samples may share one video. A purely random split can
# then put variants of the same video in both train and validation. Confirm
# this is intended.
video_split = create_train_valid(video_features, labels)
text_split = create_train_valid(text_features, labels)

x_train_video, x_valid_video, y_train_video, y_valid_video = video_split
x_train_text, x_valid_text, y_train_text, y_valid_text = text_split

In [25]:
# Number of single-epoch fit/evaluate rounds run by the training loop.
EPOCHS = 100
# Mini-batch size used for both fitting and evaluation.
BATCH_SIZE = 64

In [31]:
model = myModel()
# Best validation (accuracy, precision, recall) seen so far.
s1 = s2 = s3 = 0

for epoch in range(1, EPOCHS + 1):
    # Fit one epoch at a time so the validation set can be scored after each.
    history = model.fit(
        x=[x_train_video, x_train_text],
        y=y_train_video,
        batch_size=BATCH_SIZE,
        epochs=1,
        verbose=2,
    )

    # evaluate() returns the loss followed by the compiled metrics; drop the loss.
    S = model.evaluate(
        x=[x_valid_video, x_valid_text],
        y=y_valid_video,
        batch_size=BATCH_SIZE,
    )[1:]

    # Lexicographic comparison: accuracy first, ties broken by precision,
    # then by recall. Only a strict improvement overwrites the checkpoint.
    if tuple(S) > (s1, s2, s3):
        s1, s2, s3 = S
        model.save_weights("model_aug_weights.h5")

15/15 - 3s - loss: 0.7313 - binary_accuracy: 0.5405 - precision_8: 0.5577 - recall_8: 0.5542 - 3s/epoch - 200ms/step
15/15 - 2s - loss: 0.6940 - binary_accuracy: 0.5978 - precision_8: 0.5993 - recall_8: 0.6792 - 2s/epoch - 157ms/step
15/15 - 2s - loss: 0.6337 - binary_accuracy: 0.6465 - precision_8: 0.6638 - recall_8: 0.6458 - 2s/epoch - 157ms/step
15/15 - 2s - loss: 0.6121 - binary_accuracy: 0.6703 - precision_8: 0.6834 - recall_8: 0.6792 - 2s/epoch - 159ms/step
15/15 - 2s - loss: 0.5872 - binary_accuracy: 0.6908 - precision_8: 0.6932 - recall_8: 0.7250 - 2s/epoch - 158ms/step
15/15 - 2s - loss: 0.5608 - binary_accuracy: 0.7103 - precision_8: 0.7154 - recall_8: 0.7333 - 2s/epoch - 155ms/step
15/15 - 2s - loss: 0.5382 - binary_accuracy: 0.7373 - precision_8: 0.7328 - recall_8: 0.7771 - 2s/epoch - 154ms/step
15/15 - 2s - loss: 0.5367 - binary_accuracy: 0.7373 - precision_8: 0.7328 - recall_8: 0.7771 - 2s/epoch - 155ms/step
15/15 - 2s - loss: 0.5238 - binary_accuracy: 0.7384 - precision_

15/15 - 2s - loss: 0.3516 - binary_accuracy: 0.8627 - precision_8: 0.8639 - recall_8: 0.8729 - 2s/epoch - 153ms/step
15/15 - 2s - loss: 0.3520 - binary_accuracy: 0.8562 - precision_8: 0.8563 - recall_8: 0.8687 - 2s/epoch - 153ms/step
15/15 - 2s - loss: 0.3577 - binary_accuracy: 0.8595 - precision_8: 0.8586 - recall_8: 0.8729 - 2s/epoch - 154ms/step
15/15 - 2s - loss: 0.3551 - binary_accuracy: 0.8616 - precision_8: 0.8478 - recall_8: 0.8938 - 2s/epoch - 154ms/step
15/15 - 2s - loss: 0.3421 - binary_accuracy: 0.8659 - precision_8: 0.8589 - recall_8: 0.8875 - 2s/epoch - 153ms/step
15/15 - 2s - loss: 0.3408 - binary_accuracy: 0.8605 - precision_8: 0.8774 - recall_8: 0.8500 - 2s/epoch - 154ms/step
15/15 - 2s - loss: 0.3323 - binary_accuracy: 0.8692 - precision_8: 0.8763 - recall_8: 0.8708 - 2s/epoch - 153ms/step
15/15 - 2s - loss: 0.3455 - binary_accuracy: 0.8584 - precision_8: 0.8483 - recall_8: 0.8854 - 2s/epoch - 157ms/step
15/15 - 2s - loss: 0.3414 - binary_accuracy: 0.8638 - precision_

15/15 - 2s - loss: 0.2965 - binary_accuracy: 0.8941 - precision_8: 0.8851 - recall_8: 0.9146 - 2s/epoch - 156ms/step
15/15 - 2s - loss: 0.3065 - binary_accuracy: 0.8703 - precision_8: 0.8673 - recall_8: 0.8854 - 2s/epoch - 155ms/step
15/15 - 2s - loss: 0.2799 - binary_accuracy: 0.8973 - precision_8: 0.8969 - recall_8: 0.9062 - 2s/epoch - 155ms/step
15/15 - 2s - loss: 0.2908 - binary_accuracy: 0.8908 - precision_8: 0.8859 - recall_8: 0.9062 - 2s/epoch - 157ms/step
15/15 - 2s - loss: 0.2823 - binary_accuracy: 0.8962 - precision_8: 0.8887 - recall_8: 0.9146 - 2s/epoch - 157ms/step
15/15 - 2s - loss: 0.2897 - binary_accuracy: 0.8865 - precision_8: 0.8981 - recall_8: 0.8813 - 2s/epoch - 155ms/step
15/15 - 2s - loss: 0.2909 - binary_accuracy: 0.8865 - precision_8: 0.8882 - recall_8: 0.8938 - 2s/epoch - 155ms/step
15/15 - 2s - loss: 0.3036 - binary_accuracy: 0.8832 - precision_8: 0.8827 - recall_8: 0.8938 - 2s/epoch - 158ms/step
15/15 - 2s - loss: 0.2994 - binary_accuracy: 0.8897 - precision_

15/15 - 2s - loss: 0.2827 - binary_accuracy: 0.8854 - precision_8: 0.8912 - recall_8: 0.8875 - 2s/epoch - 156ms/step


In [32]:
# Rebuild the architecture and restore the best checkpoint written during training.
newModel = myModel()
newModel.load_weights("model_aug_weights.h5")

# Returns [loss, binary accuracy, precision, recall] on the validation split.
validation_inputs = [x_valid_video, x_valid_text]
newModel.evaluate(x=validation_inputs, y=y_valid_video, batch_size=BATCH_SIZE)



[0.38396698236465454,
 0.8539043068885803,
 0.8275862336158752,
 0.8795811533927917]