In [1]:
# from tensorflow_docs.vis import embed
from tensorflow.keras import layers
from tensorflow import keras

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
# import imageio
import cv2
import os

2023-07-06 15:38:17.564000: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-06 15:38:17.739246: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-06 15:38:17.788341: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-06 15:38:18.703387: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

In [2]:
MAX_SEQ_LENGTH = 32
NUM_FEATURES = 768
# IMG_SIZE = 128

EPOCHS = 5

In [3]:

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask


In [4]:

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


## Utility functions for training

In [5]:

def get_compiled_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 2
    num_heads = 2
    classes = 14

    inputs = keras.Input(shape=(None, None))
    x = PositionalEmbedding(sequence_length, embed_dim, name="frame_position_embedding")(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
#     x = layers.Dense(64, activation="relu")(x)
    outputs = layers.Dense(classes, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
    )
    return model





In [6]:
model = get_compiled_model()
model.summary()

2023-07-06 15:38:23.634727: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-06 15:38:24.463075: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5755 MB memory:  -> device: 0, name: Quadro RTX 4000, pci bus id: 0000:b3:00.0, compute capability: 7.5


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None)]      0         
                                                                 
 frame_position_embedding (P  (None, None, 768)        24576     
 ositionalEmbedding)                                             
                                                                 
 transformer_layer (Transfor  (None, None, 768)        4730882   
 merEncoder)                                                     
                                                                 
 global_max_pooling1d (Globa  (None, 768)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 768)               0         
                                                             

In [7]:
clips_16_train = np.load("train_feat_array_lavila_16.npy")
clips_16_val = np.load("val_feat_array_lavila_16.npy")
clips_16_train.shape, clips_16_val.shape

((29955, 32, 768), (11495, 32, 768))

In [8]:
labels = np.array(pd.read_csv("train_samples_updated.csv"))[:, 5:]
labels

array([[1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [9]:

filepath = "/tmp/video_classifier"
# checkpoint = keras.callbacks.ModelCheckpoint(filepath, save_weights_only=True, save_best_only=True, verbose=1)

history = model.fit(clips_16_train, labels,validation_split = 0.15, batch_size = 4, epochs=10)
# model.load_weights(filepath)
#     # _, accuracy = model.evaluate(test_data, test_labels)
#     # print(f"Test accuracy: {round(accuracy * 100, 2)}%")

#     return model

Epoch 1/10
   5/6366 [..............................] - ETA: 1:26 - loss: 0.5487 - accuracy: 0.0000e+00   

2023-07-06 15:38:43.341076: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8201


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
predict = model.predict([clips_16_val])



In [11]:
predict

array([[2.00718746e-01, 2.24956102e-03, 7.26699736e-03, ...,
        4.90410477e-02, 9.21745785e-03, 2.05192477e-01],
       [1.17740430e-01, 3.32313664e-02, 7.82964472e-03, ...,
        1.14554241e-01, 6.62131759e-04, 3.79990637e-02],
       [5.06367795e-02, 9.76582229e-01, 2.47752145e-01, ...,
        9.33358967e-02, 9.64717381e-03, 1.13172494e-01],
       ...,
       [1.87345762e-02, 9.96279180e-01, 8.56871232e-02, ...,
        5.13514364e-03, 1.43548299e-03, 2.12646723e-02],
       [2.03205153e-01, 9.92361307e-01, 2.37280247e-03, ...,
        2.38737464e-01, 2.13764273e-02, 9.01310369e-02],
       [3.52495722e-02, 5.18394472e-06, 7.32414308e-04, ...,
        5.06485952e-03, 1.93260998e-01, 5.17681718e-01]], dtype=float32)

In [12]:
train_csv_file = pd.read_csv("val_samples_updated.csv")

col_names = ['rec_no', 'subject_pos', 'start_time', 'end_time' ]
new_train_csv = train_csv_file.drop(col_names, axis=1)

#preparing train and test csv
test_csv = new_train_csv
# train_csv = new_train_csv[3123:]
print(len(test_csv))

Column_names = ['Settle','Legs crossed','Groom','Hand-mouth','Fold arms','Leg movement','Scratch','Gesture','Hand-face','Adjusting clothing','Fumble','Shrug','Stretching','Smearing hands']
# Column_names = ['Hand-face','Hand-mouth','Gesture','Fumble','Scratch','Stretching','Smearing hands','Shrug','Adjusting clothing','Groom','Fold arms','Leg movement','Settle','Legs crossed']
extracted_col = test_csv["sample_id"]
test_pred_csv = pd.DataFrame(predict, columns = Column_names)
test_pred_csv.insert(0, "sample_id", extracted_col)
test_pred_csv.to_csv("test_predicted_transformer"  + ".csv", index=False)

11495


In [13]:
test_pred_csv

Unnamed: 0,sample_id,Settle,Legs crossed,Groom,Hand-mouth,Fold arms,Leg movement,Scratch,Gesture,Hand-face,Adjusting clothing,Fumble,Shrug,Stretching,Smearing hands
0,42222,0.200719,0.002250,0.007267,0.000456,0.054291,0.161696,0.059026,0.396869,0.004555,0.130969,0.028862,0.049041,0.009217,0.205192
1,5609,0.117740,0.033231,0.007830,0.000112,0.003384,0.050054,0.027142,0.113054,0.000151,0.026434,0.073629,0.114554,0.000662,0.037999
2,41081,0.050637,0.976582,0.247752,0.244747,0.365770,0.047252,0.219640,0.415477,0.494850,0.156899,0.014261,0.093336,0.009647,0.113172
3,6959,0.001635,0.155199,0.000358,0.000020,0.000034,0.039488,0.008709,0.285547,0.000308,0.001226,0.047742,0.001250,0.001737,0.022995
4,2744,0.633117,0.000903,0.016505,0.901506,0.022425,0.230416,0.026623,0.075590,0.504317,0.009208,0.008474,0.019357,0.002199,0.020083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11490,7675,0.131903,0.995654,0.009558,0.002090,0.004686,0.055797,0.021817,0.178705,0.014835,0.008297,0.080293,0.059472,0.001076,0.017341
11491,38967,0.008134,0.999186,0.000900,0.296096,0.000042,0.002938,0.005457,0.021391,0.037675,0.022670,0.371772,0.001566,0.001370,0.013296
11492,39394,0.018735,0.996279,0.085687,0.970206,0.011674,0.007154,0.031017,0.123173,0.411897,0.083956,0.056419,0.005135,0.001435,0.021265
11493,5006,0.203205,0.992361,0.002373,0.001442,0.246542,0.282288,0.036477,0.401045,0.001590,0.092185,0.043743,0.238737,0.021376,0.090131


In [14]:
test_csv

Unnamed: 0,sample_id,Settle,Legs crossed,Groom,Hand-mouth,Fold arms,Leg movement,Scratch,Gesture,Hand-face,Adjusting clothing,Fumble,Shrug,Stretching,Smearing hands
0,42222,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1,5609,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,41081,0,1,0,0,0,1,0,0,1,0,0,0,0,0
3,6959,0,1,0,0,0,0,0,0,0,0,1,0,0,1
4,2744,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11490,7675,0,1,0,0,0,0,0,0,0,0,0,0,0,0
11491,38967,0,1,0,0,0,0,0,0,0,0,0,0,0,0
11492,39394,0,1,0,0,0,0,0,0,0,0,0,0,0,0
11493,5006,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
import pandas as pd, numpy as np
from sklearn.metrics import average_precision_score


CLASSES = ['Hand-face','Hand-mouth','Gesture','Fumble','Scratch','Stretching','Smearing hands','Shrug','Adjusting clothing','Groom','Fold arms','Leg movement','Settle','Legs crossed']


def evaluate(test_annotation_file,user_submission_file):
#     test = pd.read_csv(test_annotation_file,index_col="sample_id").sort_values('sample_id')
#     user = pd.read_csv(user_submission_file,index_col="sample_id").sort_values('sample_id')
    
    test = test_annotation_file.sort_values('sample_id')
    
    user = user_submission_file.sort_values('sample_id')
    if not(np.all(test.index==user.index)):
        raise ValueError("Indexes of test and prediction files do not agree.")
        
    scores = []
    for behaviour in CLASSES:
        cur_score = average_precision_score(test[behaviour].values,user[behaviour].values)
        scores.append(cur_score)
    per_class_scores = pd.DataFrame({'behaviour':CLASSES,'score':scores}).set_index('behaviour')
    macro_average = np.mean(scores)
    return {'macro_average':macro_average,'per_class_scores':per_class_scores}



if __name__=='__main__':
    # example usage of evaluate function
    test_annotation_file = test_csv
    user_submission_file = test_pred_csv # use your own predictions here
    results = evaluate(test_annotation_file,user_submission_file)
    print('')
    print('--------------- MACRO AVERAGE: -----------------')
    print('')
    print(str(results['macro_average']))
    print('')
    print('--------------- PER CLASS: ---------------------')
    print(str(results['per_class_scores']))



--------------- MACRO AVERAGE: -----------------

0.25827088901385153

--------------- PER CLASS: ---------------------
                       score
behaviour                   
Hand-face           0.677132
Hand-mouth          0.400678
Gesture             0.514173
Fumble              0.284973
Scratch             0.075085
Stretching          0.006666
Smearing hands      0.016856
Shrug               0.016208
Adjusting clothing  0.074836
Groom               0.490816
Fold arms           0.160758
Leg movement        0.038847
Settle              0.069537
Legs crossed        0.789227


In [17]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None)]      0         
                                                                 
 frame_position_embedding (P  (None, None, 768)        24576     
 ositionalEmbedding)                                             
                                                                 
 transformer_layer (Transfor  (None, None, 768)        4730882   
 merEncoder)                                                     
                                                                 
 global_max_pooling1d (Globa  (None, 768)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 768)               0         
                                                             

In [16]:
from keras.models import Model
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer("global_max_pooling1d").output)

In [19]:
intermediate_layer_model.summary()

# intermediate_layer_model.save("transformer")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, None)]      0         
                                                                 
 frame_position_embedding (P  (None, None, 768)        24576     
 ositionalEmbedding)                                             
                                                                 
 transformer_layer (Transfor  (None, None, 768)        4730882   
 merEncoder)                                                     
                                                                 
 global_max_pooling1d (Globa  (None, 768)              0         
 lMaxPooling1D)                                                  
                                                                 
Total params: 4,755,458
Trainable params: 4,755,458
Non-trainable params: 0
_________________________________________________

In [17]:
# val = intermediate_layer_model.predict()
val = intermediate_layer_model.predict(clips_16_val)
np.save("val_transformer_10.npy", val)



In [25]:
intermediate_layer_model.save("intermediate_layer_model_transformer_0.258")





INFO:tensorflow:Assets written to: intermediate_layer_model_transformer_0.258/assets


INFO:tensorflow:Assets written to: intermediate_layer_model_transformer_0.258/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


In [18]:
train_list = []
for j in range(0,29955,50):
    train_list.append(intermediate_layer_model.predict(clips_16_train[j:j+50]))
    

# train = intermediate_layer_model.predict(clips_16_train, batch_size = 1)









In [19]:
train = []
for k in range(len(train_list)):
    arr = train_list[k]
    for r in range(arr.shape[0]):
        train.append(arr[r])


In [20]:
np.array(train)

array([[-0.05605034, -0.19405366, -0.07525194, ..., -0.15538219,
        -0.09632658, -0.11747314],
       [-0.05914288, -0.18418866, -0.08713749, ..., -0.14331588,
        -0.11057828, -0.16816646],
       [-0.06623034, -0.13931355, -0.07413673, ..., -0.05528142,
        -0.10654216,  0.2577652 ],
       ...,
       [ 0.597189  , -0.12341958,  0.2897927 , ..., -0.0393292 ,
        -0.1147424 , -0.16316684],
       [ 0.54958826, -0.10242663,  0.18978652, ...,  0.18037584,
        -0.12815897, -0.1624589 ],
       [-0.0136978 , -0.12506744,  0.00122154, ...,  0.06337672,
         0.0326833 , -0.15327793]], dtype=float32)

In [21]:
np.array(train).shape

(29955, 768)

In [22]:
# train.shape
np.save("train_transformer_10.npy", np.array(train))

In [23]:
test_features = np.load("test_feat_array_lavila_16.npy")

test_features.shape

(995, 32, 768)

In [24]:
val = intermediate_layer_model.predict(test_features)
np.save("test_transformer_10.npy", val)



## Model training and inference

In [None]:
trained_model = run_experiment()

**Note**: This model has ~4.23 Million parameters, which is way more than the sequence
model (99918 parameters) we used in the prequel of this example.  This kind of
Transformer model works best with a larger dataset and a longer pre-training schedule.

In [None]:

def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate(frames, padding)

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                frame_features[i, j, :] = 0.0

    return frame_features


def predict_action(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path))
    frame_features = prepare_single_video(frames)
    probabilities = trained_model.predict(frame_features)[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


# This utility is for visualization.
# Referenced from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return embed.embed_file("animation.gif")


test_video = np.random.choice(test_df["video_name"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = predict_action(test_video)
to_gif(test_frames[:MAX_SEQ_LENGTH])

In [8]:
from tensorflow import keras
model = keras.models.load_model('transformer')


ValueError: Exception encountered when calling layer "transformer_layer" (type TransformerEncoder).

Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (2 total):
    * <tf.Tensor 'inputs:0' shape=(None, None, 768) dtype=float32>
    * None
  Keyword arguments: {'training': False}

 Expected these arguments to match one of the following 2 option(s):

Option 1:
  Positional arguments (2 total):
    * TensorSpec(shape=(None, None, 768), dtype=tf.float32, name='inputs')
    * TensorSpec(shape=(None, None), dtype=tf.bool, name='mask')
  Keyword arguments: {'training': False}

Option 2:
  Positional arguments (2 total):
    * TensorSpec(shape=(None, None, 768), dtype=tf.float32, name='inputs')
    * TensorSpec(shape=(None, None), dtype=tf.bool, name='mask')
  Keyword arguments: {'training': True}

Call arguments received by layer "transformer_layer" (type TransformerEncoder):
  • args=('tf.Tensor(shape=(None, None, 768), dtype=float32)',)
  • kwargs=<class 'inspect._empty'>

In [9]:
model.summary()

NameError: name 'model' is not defined

The performance of our model is far from optimal, because it was trained on a
small dataset.