## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt 
# import skvideo.io  
import os 
import cv2

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import io
import imageio
import ipywidgets
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K
from tqdm import tqdm


## Install Scikit-Video

In [None]:
!pip install scikit-video
import skvideo.io

## Add surveillance fight dataset from GitHub

In [None]:
# !git clone https://github.com/seymanurakti/fight-detection-surv-dataset.git

In [None]:
def frame_crop_center(video, cropf):
    """Return the `cropf` temporally central frames of a video.

    Args:
        video: 4-D array whose first axis indexes frames; the other three
            axes are passed through untouched.
        cropf: number of consecutive frames to keep.

    Returns:
        A slice of `video` along the first axis. When the clip has at least
        `cropf` frames the result has exactly `cropf` frames; when it is
        shorter, all of its frames are returned.
    """
    f, _, _, _ = video.shape
    # Clamp to 0: for clips shorter than `cropf` the unclamped start index is
    # negative, which Python slicing interprets as "from the end" and would
    # silently return only the last few frames of the clip.
    startf = max(f // 2 - cropf // 2, 0)
    return video[startf:startf + cropf, :, :, :]

## Load, resize and trim the videos

In [None]:
from tqdm import tqdm 
import cv2

def extract_tarian(path, frame_size, seq_len):
    """Load, trim and resize every video found in a list of class folders.

    Args:
        path: iterable of directory paths. The position of a directory in
            this iterable is used as the integer label of its videos.
        frame_size: side length (pixels) of the square frames produced.
        seq_len: number of frames requested per video (temporal center crop
            via `frame_crop_center`).

    Returns:
        (list_video, list_label, video_dims) where `list_video` holds one
        array of resized frames per video, `list_label` the matching folder
        index, and `video_dims` the shape of each video as read from disk.
    """
    list_video = []
    list_label = []
    video_dims = []
    for label_index, folder in enumerate(path):
        # Sorted so the output order does not depend on os.listdir ordering.
        for name in tqdm(sorted(os.listdir(folder))):
            f = os.path.join(folder, name)
            # Only regular files can be decoded; skip sub-directories etc.
            if not os.path.isfile(f):
                continue

            video = skvideo.io.vread(f)
            video_dims.append(video.shape)

            # Trim in time first so that only the frames that are kept get
            # resized; the result is the same as resizing and then trimming.
            video = frame_crop_center(video, seq_len)

            # Resize each remaining frame to frame_size x frame_size.
            resized = [
                cv2.resize(frame, (frame_size, frame_size), interpolation=cv2.INTER_CUBIC)
                for frame in video
            ]

            list_video.append(np.asarray(resized))
            list_label.append(label_index)

    return list_video, list_label, video_dims

In [None]:
# # images, labels = load_video(path)
import h5py

# Read both datasets fully into memory. The context manager closes the file
# even if one of the reads raises.
with h5py.File("/kaggle/input/ucf101-dalam-format-h5/dataset_ucf50_80_15.h5", "r") as hf:
    videos, labels = hf["videos"][()], hf["labels"][()]



In [None]:
# np.shape reads the shape without copying the (large) video array,
# whereas np.array(videos) would duplicate it first.
print(np.shape(videos))

In [None]:
# Setting seed for reproducibility
SEED = 77
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"
tf.random.set_seed(SEED)
# Seed NumPy's global RNG as well: code that is not given an explicit
# random_state/seed falls back to it.
np.random.seed(SEED)

# DATA
# NOTE(review): this label looks stale -- the file loaded above is named
# "dataset_ucf50_80_15.h5", not a fight/no-fight dataset. Confirm.
DATASET_NAME = "fight/nofights"
BATCH_SIZE = 4
AUTO = tf.data.AUTOTUNE
# Read the shape once, without copying the dataset (np.array(videos) would).
_VIDEOS_SHAPE = np.shape(videos)
FRAME_SIZE = _VIDEOS_SHAPE[2]
SEQ_LEN = _VIDEOS_SHAPE[1]
# INPUT_SHAPE = (3, 60, 60, 3)
INPUT_SHAPE = (SEQ_LEN, FRAME_SIZE, FRAME_SIZE, 3)


# OPTIMIZER
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-5

# TRAINING
EPOCHS = 50

# TUBELET EMBEDDING
# PATCH_SIZE = (8, 8, 8)
PATCH_SIZE = (8, 8, 8)
# Number of non-overlapping tubelets: one factor per patched axis
# (frames, height, width). Squaring only the frame-axis count is wrong
# whenever the input is not a cube.
NUM_PATCHES = int(
    np.prod([dim // patch for dim, patch in zip(INPUT_SHAPE[:3], PATCH_SIZE)])
)

# ViViT ARCHITECTURE
LAYER_NORM_EPS = 1e-6
# PROJECTION_DIM = 30
PROJECTION_DIM = 64
NUM_HEADS = 2
NUM_LAYERS = 2

# Load Video and Write H5

In [None]:
# import gc

# # del images
# # del labels
# gc.collect()

# path=[]
# dir_path = "/kaggle/input/tarian/tari"
# for d in os.listdir(dir_path):
#     f_path = os.path.join("/kaggle/input/tarian/tari",d)
#     path.append(f_path)
# list_video, list_label, video_dims = extract_tarian(path, FRAME_SIZE, SEQ_LEN)



In [None]:
# print(video_dims)

In [None]:
# import h5py

# with h5py.File("dataset_tarian.h5", "w") as f:
#     f.create_dataset("images", data=np.asarray(images))
#     f.create_dataset("labels", data=np.asarray(labels))

In [None]:
# Count of distinct label values; this is the default number of output units
# of the classifier head built by create_vivit_classifier.
NUM_CLASSES = np.unique(labels).size

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test  = train_test_split(images, labels, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2


## Video duration and dimension analysis 

The mean number of frames in the videos is 54, which corresponds to around 2 seconds. 
To keep the video duration consistent without losing much data, the videos are trimmed to 42 frames in total with a center crop. 

Using a smaller frame number results in poorer performance. 

In [None]:
# data= pd.DataFrame(video_dims, columns=['frame_length', 'height', 'width', 'channels'])
# data.describe()

# del data

## Aggregate data and create labels 

In [None]:
# surv_fights = [video for video in surv_fights if video.shape[0] == 42]
# surv_no_fights = [video for video in surv_no_fights if video.shape[0] == 42]

# videos = fights + surv_fights + nofights + surv_no_fights
# videos = np.asarray(videos)

# labels = np.concatenate([np.ones(len(fights)+len(surv_fights)) , np.zeros(len(nofights)+len(surv_no_fights))])

# del fights
# del nofights
# del surv_fights
# del surv_no_fights

In [None]:
# [video for video in surv_fights if video.shape[0] == 42]
# videos, labels = [video, lb for video, lb in zip(list_video, list_label) if video.shape[0] == 42]
# videos = []
# labels = []
# for video,label in zip(list_video,list_label):
#     if video.shape[0] == SEQ_LEN:
#         videos.append(video)
#         labels.append(label)
# #         print(np.array(video).shape)

# # print(np.array(videos).shape)
# # print(np.array(labels).shape)
# videos = np.asarray(videos)
# labels = np.asarray(labels)

# del list_video
# del list_label

In [None]:
# print(videos.shape)
# import h5py

# with h5py.File("dataset_tarian"+str(FRAME_SIZE)+"_"+str(SEQ_LEN)+".h5", "w") as f:
#     f.create_dataset("videos", data=np.asarray(videos))
#     f.create_dataset("labels", data=np.asarray(labels))

## Train, test, val split 

In [None]:
# First hold out 20% of all samples for testing ...
X_train, X_test, y_train, y_test = train_test_split(
    videos,
    labels,
    test_size=0.2,
    random_state=2334,
)
# ... then carve 10% of the remaining training samples off for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=124567,
)

# One shape per line: train, validation, test.
print(X_train.shape, X_val.shape, X_test.shape, sep="\n")

## Set ViViT model hyperparameters 

## Preprocess and prepare dataloader

Takes around 5–10 minutes to execute.

In [None]:
@tf.function
def preprocess(frames: tf.Tensor, label: tf.Tensor):
    """Convert one (frames, label) pair into float32 tensors.

    A trailing axis of size 1 is appended to `frames` before the dtype
    conversion, and `label` is cast to float32.

    NOTE(review): INPUT_SHAPE already ends with a channel axis of 3, so the
    extra trailing axis looks like a leftover from a single-channel pipeline;
    confirm the model input tolerates it.
    """
    # Equivalent to indexing with frames[..., tf.newaxis].
    with_extra_axis = tf.expand_dims(frames, axis=-1)
    frames = tf.image.convert_image_dtype(with_extra_axis, tf.float32)

    label = tf.cast(label, tf.float32)
    return frames, label


def prepare_dataloader(
    videos: np.ndarray,
    labels: np.ndarray,
    loader_type: str = "train",
    batch_size: int = BATCH_SIZE,
):
    """Build a batched, prefetching `tf.data.Dataset` from in-memory arrays.

    Args:
        videos: array of videos, sliced along its first axis.
        labels: array of labels aligned with `videos`.
        loader_type: "train" enables shuffling; any other value keeps the
            samples in their original order (needed so predictions can be
            matched back to `labels`).
        batch_size: number of samples per batch.

    Returns:
        A dataset yielding `(frames, label)` batches run through `preprocess`.
    """
    dataset = tf.data.Dataset.from_tensor_slices((videos, labels))

    if loader_type == "train":
        # Use a buffer covering the whole split. A tiny buffer (the previous
        # BATCH_SIZE * 2) only permutes neighbouring samples, so data that is
        # stored grouped by class would reach the model almost class by class.
        # Trade-off: the buffer holds up to one extra copy of the split.
        dataset = dataset.shuffle(buffer_size=max(len(videos), 1))

    dataloader = (
        dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    return dataloader


## Define some model classes 

In [None]:
class TubeletEmbedding(layers.Layer):
    """Turn a video into a sequence of tubelet tokens.

    A Conv3D whose kernel size and stride both equal `patch_size` (VALID
    padding) projects each non-overlapping patch to `embed_dim` features; the
    resulting grid is then reshaped to `(-1, embed_dim)` per sample.
    """

    def __init__(self, embed_dim, patch_size, **kwargs):
        super().__init__(**kwargs)
        conv_options = dict(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding="VALID",
        )
        self.projection = layers.Conv3D(**conv_options)
        self.flatten = layers.Reshape(target_shape=(-1, embed_dim))

    def call(self, videos):
        """Project `videos` patch-wise and flatten the patch grid into tokens."""
        return self.flatten(self.projection(videos))

class PositionalEncoder(layers.Layer):
    """Add a learned position embedding to every token of a sequence."""

    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        """Create the embedding table once the token count is known.

        `input_shape` must unpack into three entries; the middle one is the
        number of tokens.
        """
        _, num_tokens, _ = input_shape
        self.position_embedding = layers.Embedding(
            input_dim=num_tokens,
            output_dim=self.embed_dim,
        )
        # Indices 0 .. num_tokens - 1, looked up in the table on every call.
        self.positions = tf.range(num_tokens)

    def call(self, encoded_tokens):
        """Return `encoded_tokens` plus the embedding of each position."""
        return encoded_tokens + self.position_embedding(self.positions)

## Create model 

In [None]:
def create_vivit_classifier(
    tubelet_embedder,
    positional_encoder,
    input_shape=INPUT_SHAPE,
    transformer_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    embed_dim=PROJECTION_DIM,
    layer_norm_eps=LAYER_NORM_EPS,
    num_classes=NUM_CLASSES,
):
    """Assemble the ViViT classification model.

    Args:
        tubelet_embedder: layer mapping the input video to patch tokens.
        positional_encoder: layer adding position information to the tokens.
        input_shape: per-sample input shape (without the batch axis).
        transformer_layers: number of transformer encoder blocks.
        num_heads: attention heads per block.
        embed_dim: token feature size; also sizes the MLP (4x expansion).
        layer_norm_eps: epsilon used by every LayerNormalization in the model.
        num_classes: number of softmax output units.

    Returns:
        An uncompiled `keras.Model` mapping videos to class probabilities.
    """
    # Get the input layer
    inputs = layers.Input(shape=input_shape)
    # Create patches.
    patches = tubelet_embedder(inputs)
    # Encode patches.
    encoded_patches = positional_encoder(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization and MHSA. The epsilon comes from the argument
        # (it was previously hard-coded, so `layer_norm_eps` was ignored here).
        x1 = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=0.1
        )(x1, x1)

        # Skip connection
        x2 = layers.Add()([attention_output, encoded_patches])

        # Layer Normalization and MLP
        x3 = layers.LayerNormalization(epsilon=layer_norm_eps)(x2)
        x3 = keras.Sequential(
            [
                layers.Dense(units=embed_dim * 4, activation=tf.nn.gelu),
                layers.Dense(units=embed_dim, activation=tf.nn.gelu),
            ]
        )(x3)

        # Skip connection
        encoded_patches = layers.Add()([x3, x2])

    # Layer normalization and Global average pooling.
    representation = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
    representation = layers.GlobalAvgPool1D()(representation)

    # Classify outputs.
    outputs = layers.Dense(units=num_classes, activation="softmax")(representation)

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

## Define metrics, build, train and save ViViT model

In [None]:
def _as_indicator_pair(y_true, y_pred):
    """Bring targets and predictions into matching 0/1 indicator form.

    The precision/recall formulas below need `y_true` and `y_pred` to have
    the same shape. When `y_pred` has more than one class column and `y_true`
    holds integer class ids (lower rank, or a last axis of size 1), both are
    converted to one-hot: `y_true` from its ids, `y_pred` from its argmax.
    Otherwise (binary or already one-hot targets) both are returned unchanged.
    """
    n_classes = K.int_shape(y_pred)[-1]
    sparse_targets = (
        n_classes is not None
        and n_classes > 1
        and (K.ndim(y_true) < K.ndim(y_pred) or K.int_shape(y_true)[-1] == 1)
    )
    if not sparse_targets:
        return y_true, y_pred

    true_ids = K.cast(K.flatten(y_true), "int32")
    pred_ids = K.flatten(K.argmax(y_pred, axis=-1))
    return K.one_hot(true_ids, n_classes), K.one_hot(pred_ids, n_classes)


def recall_m(y_true, y_pred):
    """Batch recall: true positives / actual positives.

    With sparse integer targets and multi-class predictions the value is
    micro-averaged over classes (which, for single-label data, equals the
    batch accuracy); per-class figures should be read off a confusion matrix.
    """
    y_true, y_pred = _as_indicator_pair(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    # epsilon guards against division by zero when a batch has no positives.
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    """Batch precision: true positives / predicted positives.

    Sparse multi-class inputs are handled as described in `recall_m`.
    """
    y_true, y_pred = _as_indicator_pair(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # epsilon guards against division by zero when nothing is predicted positive.
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    """Batch F1: harmonic mean of `precision_m` and `recall_m`."""
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

def run_experiment():
    """Build, compile, train and evaluate a single ViViT classifier.

    Reads the module-level `trainloader`, `validloader` and `testloader`
    datasets. NOTE(review): no visible cell defines `validloader`; confirm
    the loaders exist before calling this.

    Returns:
        (model, vivit_scores) where `vivit_scores` is
        `[accuracy, recall, precision, f1]` measured on `testloader`.
    """
    # Model pieces are created in the same order as they are wired together.
    embedder = TubeletEmbedding(embed_dim=PROJECTION_DIM, patch_size=PATCH_SIZE)
    encoder = PositionalEncoder(embed_dim=PROJECTION_DIM)
    model = create_vivit_classifier(
        tubelet_embedder=embedder,
        positional_encoder=encoder,
    )

    # Optimizer, loss and the metrics reported by fit/evaluate.
    tracked_metrics = [
        keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
        recall_m,
        precision_m,
        f1_m,
    ]
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="sparse_categorical_crossentropy",
        metrics=tracked_metrics,
    )

    # Train; the returned History object is not needed here.
    model.fit(trainloader, epochs=EPOCHS, validation_data=validloader)

    # evaluate() returns the loss first, then the metrics in compile order.
    _, accuracy, recall, precision, f1 = model.evaluate(testloader)
    report = (
        ("accuracy", accuracy),
        ("recall", recall),
        ("precision", precision),
        ("F1", f1),
    )
    for title, score in report:
        print(f"Test {title}: {round(score * 100, 2)}%")

    vivit_scores = [accuracy, recall, precision, f1]
    return model, vivit_scores





In [None]:
from sklearn.model_selection import KFold


# Fix the shuffle with the notebook-wide SEED so every run produces the same
# five folds; shuffle=True without random_state gives different folds per run.
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [None]:
# Per-fold results: one entry is appended to each list for every fold.
fold_no = 1
history_acc = []
history_pre = []
history_rec = []
history_f1 = []
history_loss = []
histories = []
# kfold.split is a generator without a length, so tell tqdm how many folds
# to expect to get a real progress bar.
for train, test in tqdm(kfold.split(videos, labels), total=kfold.get_n_splits()):
    print(fold_no)
    # Release the graph/layer state of the previous fold's model before
    # building a new one, so memory does not grow with every fold.
    keras.backend.clear_session()
    model = create_vivit_classifier(
        tubelet_embedder=TubeletEmbedding(
            embed_dim=PROJECTION_DIM, patch_size=PATCH_SIZE
        ),
        positional_encoder=PositionalEncoder(embed_dim=PROJECTION_DIM),
    )

    # Compile the model with the optimizer, loss function
    # and the metrics.
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
            recall_m,
            precision_m,
            f1_m,
        ],
    )

    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Samples selected for training in this fold.
    X_train, y_train = videos[train], labels[train]
    trainloader = prepare_dataloader(X_train, y_train, "train")

    # Held-out samples of this fold. "valid" keeps them unshuffled, so
    # predictions made from `testloader` line up with `y_val`.
    X_val, y_val = videos[test], labels[test]
    testloader = prepare_dataloader(X_val, y_val, "valid")

    print(X_train.shape)
    print(X_val.shape)

    # Train the model.
    history = model.fit(trainloader, epochs=EPOCHS)
    histories.append(history.history)

    # evaluate() returns the loss first, then the metrics in compile order.
    loss, accuracy, recall, precision, f1 = model.evaluate(testloader)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"Test recall: {round(recall * 100, 2)}%")
    print(f"Test precision: {round(precision * 100, 2)}%")
    print(f"Test F1: {round(f1 * 100, 2)}%")

    vivit_scores = [accuracy, recall, precision, f1]
    history_loss.append(loss)  # previously declared but never filled
    history_acc.append(accuracy)
    history_pre.append(precision)
    history_rec.append(recall)
    history_f1.append(f1)

    fold_no += 1

In [None]:
# Pretty-print the collected training histories: `histories` holds the
# `history.history` object returned by model.fit for each fold, in fold order.
import pprint as pp
pp.pprint(histories)

In [None]:
# model, vivit_scores = run_experiment()

# With the run_experiment() call above commented out, `model` is whatever the
# k-fold loop bound last, i.e. the model trained on the final fold.
model.save('vivit_model')

### Plot Confusion Matrix

In [None]:
# `testloader` was built (unshuffled) from the last fold's held-out samples
# (X_val, y_val), so `y_val` -- not the unrelated hold-out `y_test` -- holds
# the ground truth that matches these predictions.
y_preds = np.argmax(model.predict(testloader), axis=1)

#Generate the confusion matrix
cf_matrix = confusion_matrix(y_val, y_preds)

fig, ax = plt.subplots()
# fmt="d" prints the integer counts as-is instead of in scientific notation.
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap='Blues', ax=ax)

ax.set_title('Seaborn Confusion Matrix with labels\n\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

## Tick labels - List must be in alphabetical order
# ax.xaxis.set_ticklabels(['False','True'])
# ax.yaxis.set_ticklabels(['False','True'])

# Save before showing, so the file is written from the fully drawn figure
# regardless of how the backend treats the figure after plt.show().
fig.savefig("vivit_cf.png")

## Display the visualization of the Confusion Matrix.
plt.show()
