In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, MaxPool3D, Conv2D, LSTM
import pickle
from sklearn.metrics import roc_auc_score
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [3]:
f_name_tr = 'Train'
f_name_cv = 'CV'
f_name_te = 'Test'

In [4]:
def data_loader(f_name):   
    lst = []
    for i in tqdm(os.listdir(f_name)):
        with open(f_name + '/' + i, 'rb') as f:
            data = pickle.load(f)
        t_place = tf.reshape(data['place'], (1, -1, 2048))
        t_cast = tf.reshape(data['cast'], (1, -1, 512))
        t_action = tf.reshape(data['action'], (1, -1, 512))
        t_audio = tf.reshape(data['audio'], (1, -1, 512))
        t_labels = tf.reshape(data['labels'], (1, -1, 1))
        
        d_tuple = (t_place, t_cast, t_action, t_audio, t_labels)
        lst.append(d_tuple)
        
    return lst

In [5]:
data_train = data_loader(f_name_tr)
data_cv = data_loader(f_name_cv)
data_test = data_loader(f_name_te)

100%|██████████████████████████████████████████████████████████████████████████████████| 56/56 [00:12<00:00,  4.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.75it/s]


In [6]:
data_train[1][0].shape

TensorShape([1, 1144, 2048])

## Model Creation and Training

In [7]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights"""

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [8]:
class MultiHead_Self_Attention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHead_Self_Attention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)  
        v = self.wv(v)  

        q = self.split_heads(q, batch_size)  
        k = self.split_heads(k, batch_size)  
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

In [9]:
class cosine_similarity(tf.keras.layers.Layer):
    def __init__(self):
        super(cosine_similarity, self).__init__()
        self.conv1 = Conv2D(124, (4), data_format='channels_last', padding='same')
        
    def call(self, x):
        
        if x.shape[1] % 2 == 0:
            x1, x2 = tf.split(x, 2, 1)
        else:
            x = x[:, 1:, :]
            x1, x2 = tf.split(x, 2, 1)
        
        x1 = tf.reshape(x1, (-1, 1, x1.shape[1], x1.shape[2]))
        cv1 = self.conv1(x1)
        
        x2 = tf.reshape(x2, (-1, 1, x2.shape[1], x2.shape[2]))
        cv2 = self.conv1(x2)
        
        cs = tf.keras.layers.dot( [cv1, cv2],axes=2)
        cs = tf.reshape(cs, (-1, 1, cs.shape[2], cs.shape[4]))
        cs = tf.add(cs[:,:,2], cs[:,:,3])
        
        return cs

In [34]:
class BNet_place(tf.keras.layers.Layer):
    def __init__(self):
        super(BNet_place, self).__init__()
        self.conv1 = Conv2D(128, 4, data_format='channels_last', padding='same')        
        self.maxpool = MaxPool3D((1,1,5),1, data_format = 'channels_last')
        self.cos = cosine_similarity()
        self.lstm = LSTM(128, return_sequences=True)
        self.dense1 = Dense(64, activation='relu', kernel_initializer='he_normal')
        self.dense2 = Dense(1, activation='sigmoid')
        self.self_attention = MultiHead_Self_Attention(d_model=128, num_heads=4)
        
    def call(self, x):
        
        x1 = tf.reshape(x, (-1, 1, x.shape[1], x.shape[2]))  
        cv1 = self.conv1(x1)        
        cv1 = tf.reshape(cv1, (-1, 1,1, cv1.shape[3], cv1.shape[2]))
        
        m1 = self.maxpool(cv1)
        m1 = tf.reshape(m1, (-1, m1.shape[4], m1.shape[3]))        
        cosine_sim = self.cos(x)
        
        add = tf.add(m1, cosine_sim)

        s_att, _ = self.self_attention(add,add,add,None)

        fc1 = self.dense1(s_att)
        fc2 = self.dense2(fc1)
        print('FC2: ', fc2.shape)

        return fc2[:,:-1,:]
    
class BNet(tf.keras.layers.Layer):
    def __init__(self):
        super(BNet, self).__init__()
        self.conv1 = Conv2D(128, 4, data_format='channels_last', padding='same')        
        self.maxpool = MaxPool3D((1,1,5),1, data_format = 'channels_last')
        self.cos = cosine_similarity()
        self.lstm = LSTM(128, return_sequences=True)
        self.dense1 = Dense(64, activation='relu', kernel_initializer='he_normal')
        self.dense2 = Dense(1, activation='sigmoid')
        self.self_attention = MultiHead_Self_Attention(d_model=128, num_heads=4)
        
    def call(self, x):
        

        x1 = tf.reshape(x, (-1, 1, x.shape[1], x.shape[2]))
        cv1 = self.conv1(x1)
        
        cv1 = tf.reshape(cv1, (-1, 1,1, cv1.shape[3], cv1.shape[2]))
        
        m1 = self.maxpool(cv1)
        m1 = tf.reshape(m1, (-1, m1.shape[4], m1.shape[3]))
        
        cosine_sim = self.cos(x)
        
        add = tf.add(m1, cosine_sim)

        s_att, _ = self.self_attention(add,add,add,None)
        
        fc1 = self.dense1(s_att)
        fc2 = self.dense2(fc1)
        
        return fc2[:,:-1,:]

In [35]:
class LGSS(tf.keras.layers.Layer):
    def __init__(self):
        super(LGSS, self).__init__()
        self.bnet_place = BNet_place()
        self.bnet_cast = BNet()
        self.bnet_action = BNet()
        self.bnet_audio = BNet()
    def call(self, place_feat, cast_feat, action_feat, audio_feat):
        
        output = 0
       # print('\nPlace Feature: ', place_feat.shape)
        p_bnet = self.bnet_place(place_feat)
        output += 0.5*p_bnet
       # print('\nCast Feature: ', cast_feat.shape)
        c_bnet = self.bnet_cast(cast_feat)
        output += 0.2*c_bnet
       # print('\nAction Feature: ',action_feat.shape)
        ac_bnet = self.bnet_action(action_feat)
        output += 0.2*ac_bnet
       # print('\nAudio Feature: ', audio_feat.shape)
        a_bnet = self.bnet_audio(audio_feat)
        output += 0.1*a_bnet
        
        return output

In [36]:
class Local_to_Global_Model(tf.keras.Model):
    def __init__(self):
        super(Local_to_Global_Model, self).__init__()
        self.lgss = LGSS()
    
    def call(self, data):
        place_features = data[0]
        cast_features = data[1]
        action_features = data[2]
        audio_features = data[3]
        
        output = self.lgss(place_features, cast_features, action_features, audio_features)
        
        return output

In [37]:
LGSS_model = Local_to_Global_Model()

In [38]:
optimizer = tf.keras.optimizers.Adam(0.001)
LGSS_model.compile(optimizer=optimizer, loss='binary_crossentropy')

In [39]:
epochs = 12
epoch_train_loss = []
epoch_cv_loss = []

In [40]:
data_train[3][0].shape

TensorShape([1, 1244, 2048])

In [41]:
for epoch in range(epochs):
    print('EPOCH: ', epoch+1)
    start = time.time()
    t_loss = 0
    c_loss = 0
    c_ = 0
    for i in range(len(data_train)):
        t_loss += LGSS_model.train_on_batch(data_train[i][:4], data_train[i][4])
        c_+=1
        print(c_)
    train_loss = t_loss/len(data_train)

    for i in range(len(data_cv)):
        c_loss += LGSS_model.test_on_batch(data_cv[i][:4], data_cv[i][4])
    cv_loss = c_loss/len(data_cv)

    epoch_train_loss.append(train_loss)
    epoch_cv_loss.append(cv_loss)

    print('Training Loss: {},  Validation Loss: {}'.format(train_loss, cv_loss))
    print('Time Taken for this Epoch : {} sec'.format(time.time()-start))   
    LGSS_model.save_weights('Weights_Model2/epoch_'+ str(epoch+1) + '.h5')

EPOCH:  1
Add:  (None, 1101, 124)
S_att:  (None, None, 128)
FC2:  (None, None, 1)
Add:  (None, 1101, 124)
S_att:  (None, None, 128)
FC2:  (None, None, 1)
Add:  (None, 1101, 124)
S_att:  (None, None, 128)
FC2:  (None, None, 1)
Add:  (None, 1101, 124)
S_att:  (None, None, 128)
FC2:  (None, None, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
Add:  (1, 1101, 124)
S_att:  (1, 1101, 128)
FC2:  (1, 1101, 1)
1
Add:  (1, 1144, 124)
S_att:  (1, 1144, 128)
FC2:  (1, 1144, 1)
Add:  (1, 1144, 124)
S_att:  (1, 1144, 128)
FC2:  (1, 1144, 1)
Add:  (1, 1144, 124)
S_att:  (1, 1144, 128)
FC2:  (1, 1144, 1)
Add:  (

TypeError: in converted code:

    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py:305 train_on_batch  *
        outs, total_loss, output_losses, masks = (
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py:253 _process_single_batch
        training=training))
    <ipython-input-36-3fe5e39cc309>:15 call  *
        output = self.lgss(place_features, cast_features, action_features, audio_features)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py:778 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    <ipython-input-11-46b80cbdf15b>:12 call  *
        p_bnet = self.bnet_place(place_feat)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py:778 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    <ipython-input-34-4c7c333ecb5f>:15 call  *
        x1 = tf.reshape(x, (-1, 1, x.shape[1], x.shape[2]))  # hereeeeeeeeeee
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\ops\array_ops.py:193 reshape
        result = gen_array_ops.reshape(tensor, shape, name)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\ops\gen_array_ops.py:7443 reshape
        "Reshape", tensor=tensor, shape=shape, name=name)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\framework\op_def_library.py:471 _apply_op_helper
        raise err
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\framework\op_def_library.py:468 _apply_op_helper
        preferred_dtype=default_dtype)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\framework\ops.py:1314 convert_to_tensor
        ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\framework\constant_op.py:317 _constant_tensor_conversion_function
        return constant(v, dtype=dtype, name=name)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\framework\constant_op.py:258 constant
        allow_broadcast=True)
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\framework\constant_op.py:296 _constant_impl
        allow_broadcast=allow_broadcast))
    D:\Anaconda\envs\py3-TF2.0\lib\site-packages\tensorflow_core\python\framework\tensor_util.py:547 make_tensor_proto
        "supported type." % (type(values), values))

    TypeError: Failed to convert object of type <class 'tuple'> to Tensor. Contents: (-1, 1, None, 2048). Consider casting elements to a supported type.


In [None]:
LGSS_model.fit(data_train[:4], data_train[4], batch_size=1, epochs=100, verbose=1, 
               validation_data=(data_cv[:4], data_cv[4]),
               callbacks=[AUC, checkpoint, lr_schedule, early_stopping])

Train on 56 samples, validate on 4 samples
Epoch 1/100
