In [2]:
import numpy as np
import pandas as pd
import pickle

In [21]:
# NOTE: pickle.load can execute arbitrary code - only open feature files from a trusted source.
with open('../../video_features/resnet_features.pkl', 'rb') as f:
    video_features_dict = pickle.load(f)

# Replace each stored value with an ndarray. Only values are reassigned (the key set
# never changes), so updating the dict while iterating over it is safe.
# `k` stays bound after the loop, exactly as with a plain `for k in ...`.
for k, frames in video_features_dict.items():
    video_features_dict[k] = np.array(frames)

In [24]:
# Frames per video, summarised as min / quartiles / max.
a = [frames.shape[0] for frames in video_features_dict.values()]
np.percentile(a, [0, 25, 50, 75, 100])

array([  9.,  72., 120., 168., 480.])

In [27]:
# Target clip length: each video's frame sequence is padded or trimmed to this many frames.
numFrames = 100

In [28]:
import random
def getUpdatedFeatures(a, nf):
    """Resample a per-frame feature array to exactly ``nf`` frames.

    Clips that are too short are padded by duplicating randomly chosen frames;
    each copy is inserted right next to its source, so temporal order is kept.
    Clips that are too long are trimmed by dropping randomly chosen frames.

    The random draws are the same as in a row-by-row concatenate/delete
    implementation, but they are applied to a list of row indices and the
    rows are gathered once at the end, so the feature data is copied a single
    time instead of once per inserted/removed frame.

    Args:
        a: array-like whose first axis indexes frames, e.g. (num_frames, feat_dim).
        nf: target number of frames (non-negative int).

    Returns:
        A new ndarray with ``nf`` rows along the first axis; ``a`` is never modified.

    Raises:
        ValueError: if ``nf`` is negative, or if ``a`` has no frames while ``nf`` > 0
            (there is nothing to duplicate).
    """
    frames = np.asarray(a)
    num_available = frames.shape[0]

    if nf < 0:
        raise ValueError("nf must be non-negative, got %r" % (nf,))
    if num_available == 0 and nf > 0:
        raise ValueError("cannot pad an input with no frames up to %r frames" % (nf,))

    # Row indices into `frames`; editing this small list is far cheaper than
    # rebuilding the full feature matrix on every step.
    keep = list(range(num_available))

    while len(keep) < nf:
        i = random.randint(0, len(keep) - 1)
        keep.insert(i, keep[i])      # duplicate frame i in place

    while len(keep) > nf:
        i = random.randint(0, len(keep) - 1)
        del keep[i]                  # drop frame i

    # Fancy indexing always returns a fresh array, so the caller's data is untouched.
    return frames[np.asarray(keep, dtype=np.intp)]

# Sanity check on one video. The key is picked explicitly so this cell does not
# depend on a loop variable left over from an earlier cell.
sample_key = next(iter(video_features_dict))
getUpdatedFeatures(video_features_dict[sample_key], numFrames).shape

(100, 2048)

In [29]:
# Resample every video to numFrames frames, keeping the same keys.
video_features_dict2 = {
    k: getUpdatedFeatures(feats, numFrames)
    for k, feats in video_features_dict.items()
}

In [31]:
# Persist the fixed-length features. The context manager closes (and flushes) the
# file even if pickle.dump raises part-way through.
with open('../../video_features/resnet_features_same_num_frames.pkl', 'wb') as output:
    pickle.dump(video_features_dict2, output)

In [30]:
# Read the id column and stack the fixed-length video features of every id
# that ends in "_1"; the feature dict is keyed by the id without that suffix.
data = pd.read_csv('../../datasets/mustard_dataset/sarcasm_with_id.csv')
ids = list(data['id'])

video_features = np.array(
    [video_features_dict2[i[:-2]] for i in ids if i[-2:] == "_1"]
)
video_features.shape

(690, 100, 2048)

In [32]:
# Load the sarcasm CSV and discard every row that has a missing value in any column.
sarcasm_rows = pd.read_csv('../../datasets/mustard_dataset/sarcasm_data.csv')
data = sarcasm_rows.dropna(axis=0, how='any')

labels = data['sarcasm'].to_list()
features = data['text'].to_list()

# Precomputed text embeddings loaded from disk
# (presumably one row per remaining CSV row, in the same order - TODO confirm).
text_features = np.load('../../bert embeddings/sarcasm_data_embeddings.npy')
text_features.shape

(690, 768)

In [74]:
# Convert the label list to an ndarray.
labels = np.array(labels)

In [68]:
import keras.backend as K
from tensorflow.keras.layers import Layer, Dense

class TextBasedAttention(Layer):
    """
    Text-guided attention pooling over video frames.

    Called on a list ``[V, T]``. Shapes below omit the leading batch axis.

    Inputs:
        V: (nf, vd)  one feature vector per frame
        T: (td)      text feature vector
    Outputs:
        Vp: (vd)     sum of the frame features, each weighted by its attention score
    """
    def __init__(self, **kwargs):
        super(TextBasedAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape is [shape of V, shape of T].
        # w has shape (vd, td): it projects the text vector into video-feature space.
        self.w = self.add_weight(name='w',
                                   shape=(input_shape[0][-1], input_shape[1][-1]),
                                   initializer='random_normal',
                                   trainable=True)
        # Maps the nf raw frame/text products to nf softmax-normalised scores.
        self.dense = Dense(input_shape[0][-2], activation='softmax')
        super(TextBasedAttention, self).build(input_shape)

    def call(self, x, mask=None):
        V = x[0]                                  # (batch, nf, vd)
        T = K.expand_dims(x[1], -1)               # (batch, td, 1)

        # K.dot(w, T) yields (vd, batch, 1); move the batch axis to the front.
        p1 = K.permute_dimensions(K.dot(self.w, T), (1, 0, 2))   # (batch, vd, 1)
        prod = K.squeeze(K.batch_dot(V, p1), axis=-1)            # (batch, nf)

        scores = self.dense(prod)                 # (batch, nf)
        # A trailing singleton axis lets the per-frame score broadcast across
        # all vd features of that frame.
        scores = K.expand_dims(scores, -1)        # (batch, nf, 1)

        # Weighted sum over the frame axis.
        vp = K.sum(V * scores, axis=-2)           # (batch, vd)

        return vp

    def compute_output_shape(self, input_shape):
        # call() reduces over the frame axis, so the result is (batch, vd).
        video_shape = input_shape[0]
        return (video_shape[0], video_shape[-1])

    def get_config(self):
        # No constructor arguments beyond the base Layer's, so the base config suffices.
        return super(TextBasedAttention, self).get_config()
    
# Smoke test: run the layer on dummy batches and display the output shape.
V = np.ones((32, 100, 2048))
T = np.ones((32, 768))
TextBasedAttention()([V, T]).shape

TensorShape([32, 2048])

In [70]:
from tensorflow.keras.layers import Input, Concatenate, Dropout, Dense
from tensorflow.keras.models import Model

In [105]:
import tensorflow as tf
def myModel(num_frames=100, video_dim=2048, text_dim=768):
    """Build and compile the binary classifier over video and text features.

    The video frames are pooled by TextBasedAttention (guided by the text
    vector), the pooled vector is concatenated with the text vector, and a
    single sigmoid unit follows a 0.5 dropout.

    Args:
        num_frames: number of frames per video input.
        video_dim: size of each per-frame feature vector.
        text_dim: size of the text feature vector.

    Returns:
        A compiled Model taking ``[video, text]`` inputs, trained with Adam on
        binary cross-entropy and reporting binary accuracy, precision and recall.
    """
    V = Input(shape=(num_frames, video_dim))
    T = Input(shape=(text_dim,))

    Vp = TextBasedAttention()([V,T])

    f = Concatenate()([Vp, T])
    dropout = Dropout(0.5)(f)
    output = Dense(1, activation="sigmoid", use_bias=True)(dropout)

    model = Model(inputs=[V,T], outputs=output)
    model.compile(
        optimizer="adam",
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()
        ])
    return model

# Build a throwaway model only to inspect the architecture.
model = myModel()
model.summary()  # prints the table itself and returns None, so no print() wrapper

del model

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_25 (InputLayer)          [(None, 100, 2048)]  0           []                               
                                                                                                  
 input_26 (InputLayer)          [(None, 768)]        0           []                               
                                                                                                  
 text_based_attention_47 (TextB  (None, 2048)        1582964     ['input_25[0][0]',               
 asedAttention)                                                   'input_26[0][0]']               
                                                                                                  
 concatenate_12 (Concatenate)   (None, 2816)         0           ['text_based_attention_47[

In [102]:
from sklearn.utils import shuffle
RANDOM_STATE = 50

def create_train_valid(features,labels,train_fraction = 0.7,max_valid=1000):
    """Shuffle features and labels together, then split them into train and validation parts.

    The shuffle uses the fixed RANDOM_STATE seed. The training part ends at
    whichever index is larger: ``train_fraction`` of the data, or the index that
    leaves ``max_valid`` samples for validation - so validation never exceeds
    ``max_valid`` samples.

    Returns:
        (train_features, valid_features, train_labels, valid_labels) as ndarrays.
    """
    shuffled_x, shuffled_y = shuffle(features, labels, random_state=RANDOM_STATE)

    total = len(shuffled_y)
    split_at = max(int(train_fraction * total), total - max_valid)

    return (
        np.asarray(shuffled_x[:split_at]),
        np.asarray(shuffled_x[split_at:]),
        np.asarray(shuffled_y[:split_at]),
        np.asarray(shuffled_y[split_at:]),
    )


# Split each modality with the same labels. Both calls shuffle with the same fixed
# RANDOM_STATE, so the video and text splits are expected to stay row-aligned.
# NOTE(review): this relies on video_features, text_features and labels having the
# same length and row order - confirm.
x_train_video, x_valid_video, y_train_video, y_valid_video = create_train_valid(video_features, labels)
x_train_text, x_valid_text, y_train_text, y_valid_text = create_train_valid(text_features, labels)

In [None]:
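# Shape notes (presumably about the text embeddings - TODO confirm):
#   32  sentences
#   10  words per sentence
#   768 values per word
#
# (32, 10, 768) => mean over the words axis => (32, 768)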

In [138]:
# Training hyper-parameters.
EPOCHS = 50       # number of single-epoch fit() rounds in the training loop
BATCH_SIZE = 64   # batch size used for both fit() and evaluate()

In [139]:
model = myModel()
best_val_acc = 0

# Train one epoch per fit() call so the validation accuracy can be inspected
# after each one; the weights are saved whenever it strictly improves.
for epoch in range(EPOCHS):
    history = model.fit(
        x=[x_train_video, x_train_text],
        y=y_train_video,
        batch_size=BATCH_SIZE,
        epochs=1,
        verbose=2,
        validation_split=0.1,
    )

    val_acc = history.history["val_binary_accuracy"][0]
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        model.save_weights("model_weights.h5")

7/7 - 2s - loss: 0.7413 - binary_accuracy: 0.5381 - precision_28: 0.5333 - recall_28: 0.5581 - val_loss: 0.7780 - val_binary_accuracy: 0.3673 - val_precision_28: 0.3333 - val_recall_28: 0.4091 - 2s/epoch - 311ms/step
7/7 - 1s - loss: 0.6473 - binary_accuracy: 0.6328 - precision_28: 0.6647 - recall_28: 0.5256 - val_loss: 0.7686 - val_binary_accuracy: 0.4694 - val_precision_28: 0.4091 - val_recall_28: 0.4091 - 1s/epoch - 194ms/step
7/7 - 1s - loss: 0.6705 - binary_accuracy: 0.6166 - precision_28: 0.6231 - recall_28: 0.5767 - val_loss: 0.7711 - val_binary_accuracy: 0.4082 - val_precision_28: 0.3793 - val_recall_28: 0.5000 - 1s/epoch - 179ms/step
7/7 - 1s - loss: 0.5834 - binary_accuracy: 0.6767 - precision_28: 0.6556 - recall_28: 0.7349 - val_loss: 0.7531 - val_binary_accuracy: 0.4898 - val_precision_28: 0.4400 - val_recall_28: 0.5000 - 1s/epoch - 172ms/step
7/7 - 1s - loss: 0.6260 - binary_accuracy: 0.6513 - precision_28: 0.6569 - recall_28: 0.6233 - val_loss: 0.7390 - val_binary_accurac

7/7 - 1s - loss: 0.3578 - binary_accuracy: 0.8637 - precision_28: 0.8514 - recall_28: 0.8791 - val_loss: 0.7258 - val_binary_accuracy: 0.6122 - val_precision_28: 0.5652 - val_recall_28: 0.5909 - 1s/epoch - 184ms/step
7/7 - 1s - loss: 0.3319 - binary_accuracy: 0.8868 - precision_28: 0.8640 - recall_28: 0.9163 - val_loss: 0.7172 - val_binary_accuracy: 0.5918 - val_precision_28: 0.5556 - val_recall_28: 0.4545 - 1s/epoch - 176ms/step
7/7 - 1s - loss: 0.3659 - binary_accuracy: 0.8453 - precision_28: 0.8491 - recall_28: 0.8372 - val_loss: 0.7178 - val_binary_accuracy: 0.5918 - val_precision_28: 0.5556 - val_recall_28: 0.4545 - 1s/epoch - 182ms/step
7/7 - 1s - loss: 0.3527 - binary_accuracy: 0.8637 - precision_28: 0.8421 - recall_28: 0.8930 - val_loss: 0.7137 - val_binary_accuracy: 0.5918 - val_precision_28: 0.5556 - val_recall_28: 0.4545 - 1s/epoch - 172ms/step
7/7 - 1s - loss: 0.3439 - binary_accuracy: 0.8707 - precision_28: 0.8383 - recall_28: 0.9163 - val_loss: 0.7190 - val_binary_accurac

In [140]:
# Rebuild the architecture, restore the saved checkpoint and score it on the
# held-out validation split.
newModel = myModel()
newModel.load_weights("model_weights.h5")

valid_inputs = [x_valid_video, x_valid_text]
newModel.evaluate(x=valid_inputs, y=y_valid_video, batch_size=BATCH_SIZE)



[0.5931963324546814, 0.7067307829856873, 0.723809540271759, 0.7037037014961243]