In [None]:
!pip install -qq medmnist

In [None]:
!pip install --upgrade keras
!pip install albumentations

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset archive is read from there
# and the training plot and saved model are written back under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip "/content/drive/MyDrive/Bakis/cnnlstm/split.zip" -d "/content/dataset"

Archive:  /content/drive/MyDrive/Bakis/cnnlstm/split.zip
  inflating: /content/dataset/training/labas/1-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/2-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/3-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/4-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/5-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/6-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/7-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/8-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/9-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/training/labas/10-653c0632-adc0-42be-8453-98dc71a55361.webm  
  inflating: /content/dataset/tr

In [None]:
import os
import io
import imageio
import pandas as pd
import ipywidgets
import numpy as np
import tensorflow as tf  # for data preprocessing only
import keras
from keras import layers, ops
import cv2
from sklearn.preprocessing import LabelEncoder
import albumentations as A
import random

In [None]:
# Number of frames kept per video (second axis of the arrays built by process_dataset).
grab_frames = 30
# Side length in pixels that every frame is resized to (target_shape x target_shape).
target_shape = 160

In [None]:
# Root folders of the three dataset splits under /content/dataset.
# process_dataset treats every sub-folder name inside a root as a class label.
train_dir = '/content/dataset/training'
test_dir = '/content/dataset/test'
val_dir = '/content/dataset/validation'

In [None]:
# Class names. process_dataset labels each video with its sub-folder name and
# the LabelEncoder is fit on this array, so the entries must match the
# dataset's sub-folder names.
labels = np.array(['aciu', 'berniukas', 'kamuolys', 'koks', 'labas', 'mama', 'namas', 'tevas', 'valgyti', 'vardas'])

In [None]:
def _load_clip(video_path, transform=None):
    """Decode one video into a ``(grab_frames, target_shape, target_shape, 3)`` uint8 array.

    Every second decoded frame is kept, converted from BGR to RGB and resized.
    When ``transform`` (an ``A.ReplayCompose``) is given, the augmentation
    parameters drawn for the first kept frame are replayed on all later frames
    of the same clip.

    Returns ``None`` when no frame could be decoded. Clips that yield fewer
    than ``grab_frames`` frames are padded by repeating the last frame.
    """
    video = cv2.VideoCapture(video_path)
    frames = []
    replay = None
    keep = True  # toggled after every decoded frame -> keeps frames 0, 2, 4, ...

    try:
        while len(frames) < grab_frames:
            ret, frame = video.read()
            if not ret:
                break

            if keep:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (target_shape, target_shape))

                if transform is not None:
                    if replay is None:
                        # First frame: sample the augmentation parameters.
                        result = transform(image=frame)
                        replay = result['replay']
                    else:
                        # Later frames: reuse exactly the same parameters.
                        result = A.ReplayCompose.replay(replay, image=frame)
                    frame = result['image']

                frames.append(frame)

            keep = not keep
    finally:
        # Release the decoder even if a frame fails to convert or augment.
        video.release()

    if not frames:
        return None

    # Short clip: repeat the last frame so the result has the fixed length.
    while len(frames) < grab_frames:
        frames.append(frames[-1])

    return np.array(frames)


def process_dataset(train_dir, set_size, augment=False):
    """Load all videos below ``train_dir`` into fixed-size arrays.

    ``train_dir`` must contain one sub-folder per class; every file inside a
    sub-folder is read as a video and labelled with the sub-folder name.

    Args:
        train_dir: Root directory of the split to load.
        set_size: Number of sample rows to preallocate (maximum number of videos).
        augment: When True, each clip is passed through a randomly sampled
            albumentations pipeline (the same parameters for every frame of a clip).

    Returns:
        ``(X, y)`` where ``X`` is a uint8 array of shape
        ``(n, grab_frames, target_shape, target_shape, 3)`` and ``y`` is an
        object array of ``n`` folder names. ``n`` is the number of videos
        actually loaded, so ``n == set_size`` when the split holds exactly
        ``set_size`` readable videos.

    Raises:
        IndexError: If the split contains more readable videos than ``set_size``.
    """
    X_data = np.zeros((set_size, grab_frames, target_shape, target_shape, 3), dtype=np.uint8)
    y_temp = np.zeros((set_size,), dtype=object)

    # NOTE(review): presumably albumentations draws from Python's `random`
    # module; newer releases may need their own seed — confirm.
    random.seed(7)

    transform = A.ReplayCompose([
        A.HorizontalFlip(p=0.5),
        A.MotionBlur(blur_limit=5, p=0.3),
        A.ShiftScaleRotate(shift_limit=0.02, scale_limit=0.02, rotate_limit=10, p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.4),
        A.CLAHE(clip_limit=2, p=0.3),
        A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.3),
        A.ISONoise(color_shift=(0.01, 0.03), intensity=(0.1, 0.3), p=0.3),
    ])

    source_directory = os.path.abspath(train_dir)
    index = 0

    # Sorted listings keep the sample order (and therefore the sequence of
    # random augmentations) the same from run to run.
    for folder in sorted(os.listdir(source_directory)):
        folder_path = os.path.join(source_directory, folder)
        if not os.path.isdir(folder_path):
            continue

        for file in sorted(os.listdir(folder_path)):
            video_path = os.path.join(folder_path, file)

            if index >= set_size:
                raise IndexError(
                    f"{source_directory} holds more than set_size={set_size} videos"
                )

            clip = _load_clip(video_path, transform if augment else None)
            if clip is None:
                print(f"Skipping unreadable video: {video_path}")
                continue

            X_data[index] = clip
            y_temp[index] = folder
            index += 1

            # Light-weight progress report instead of one line per video.
            if index % 100 == 0:
                print(index)

    print(f"Loaded {index} videos from {source_directory}")

    # Drop rows that were never filled so no placeholder label (0) leaks out.
    return X_data[:index], y_temp[:index]

In [None]:
# Load the training split with augmentation enabled (third argument);
# 2100 is the number of sample rows process_dataset preallocates.
X_train, y_train = process_dataset(train_dir, 2100, True)

In [None]:
# Load the validation split without augmentation (augment defaults to False);
# 200 is the number of sample rows process_dataset preallocates.
X_valid, y_valid = process_dataset(val_dir, 200)

In [None]:
# Load the test split without augmentation (augment defaults to False);
# 200 is the number of sample rows process_dataset preallocates.
X_test, y_test = process_dataset(test_dir, 200)

In [None]:
# Learn the class-name -> integer mapping from the canonical `labels` array.
le = LabelEncoder().fit(labels)

# Encode the folder-name labels of every split with that one shared mapping.
y_train, y_valid, y_test = [
    le.transform(split_labels) for split_labels in (y_train, y_valid, y_test)
]

In [None]:
# Seed passed to keras.utils.set_random_seed below.
SEED = 42
# NOTE(review): presumably requests deterministic cuDNN kernels from
# TensorFlow — confirm the installed version still honours this variable.
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"
keras.utils.set_random_seed(SEED)

In [None]:
# DATA
BATCH_SIZE = 128
AUTO = tf.data.AUTOTUNE
# One clip: (frames, height, width, RGB channels).
INPUT_SHAPE = (grab_frames, target_shape, target_shape, 3)
# Derived from the class list so the classifier head cannot drift from it.
NUM_CLASSES = len(labels)

# OPTIMIZER
LEARNING_RATE = 0.0001
# WEIGHT_DECAY = 1e-5

# TRAINING
EPOCHS = 1000

# TUBELET EMBEDDING
# (frames, height, width) covered by one tubelet; also used as the Conv3D stride.
PATCH_SIZE = (16, 16, 16)
# NOTE(review): with 30 frames, a temporal patch of 16 and "VALID" padding,
# 30 // 16 == 1, so only the first 16 frames of each clip reach the model.
# Consider a temporal patch size that divides grab_frames (e.g. 15 or 10).
# Tokens per clip = product of the patch counts along frames, height and width.
NUM_PATCHES = (
    (INPUT_SHAPE[0] // PATCH_SIZE[0])
    * (INPUT_SHAPE[1] // PATCH_SIZE[1])
    * (INPUT_SHAPE[2] // PATCH_SIZE[2])
)

# ViViT ARCHITECTURE
LAYER_NORM_EPS = 1e-6
PROJECTION_DIM = 128
NUM_HEADS = 8
NUM_LAYERS = 8

In [None]:
def preprocess(frames: tf.Tensor, label: tf.Tensor):
    """Convert one clip to float32 and cast its label.

    Args:
        frames: A single clip. RGB clips of shape (frames, height, width, 3)
            are passed through unchanged in shape; channel-less volumes of
            shape (frames, height, width) get a trailing channel axis.
        label: Integer class index of the clip.

    Returns:
        ``(frames, label)`` with ``frames`` as float32 (integer inputs are
        rescaled by ``tf.image.convert_image_dtype``) and ``label`` as float32.
    """
    # Only channel-less volumes need an extra axis for Conv3D. Adding one to
    # an RGB clip would produce a rank-5 sample that no longer matches the
    # model's (frames, height, width, channels) input.
    if frames.shape.rank == 3:
        frames = frames[..., tf.newaxis]

    frames = tf.image.convert_image_dtype(frames, tf.float32)

    # Parse label
    label = tf.cast(label, tf.float32)
    return frames, label


def prepare_dataloader(
    videos: np.ndarray,
    labels: np.ndarray,
    loader_type: str = "train",
    batch_size: int = BATCH_SIZE,
):
    """Build a batched, prefetched ``tf.data`` pipeline over ``(videos, labels)``.

    Args:
        videos: Array of clips, one per sample along the first axis.
        labels: Array of labels aligned with ``videos``.
        loader_type: Only ``"train"`` enables shuffling; any other value
            keeps the samples in their original order.
        batch_size: Number of samples per batch. Also sizes the shuffle
            buffer (twice the batch size).

    Returns:
        A ``tf.data.Dataset`` yielding preprocessed ``(frames, label)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((videos, labels))

    if loader_type == "train":
        # Size the buffer from the argument, not the global constant, so a
        # custom batch_size is respected.
        dataset = dataset.shuffle(batch_size * 2)

    dataloader = (
        dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    return dataloader

In [None]:
# Training loader; "train" is the only loader_type that gets shuffled.
X_train_loader = prepare_dataloader(X_train, y_train, "train")

In [None]:
# Validation loader; not shuffled because loader_type is not "train".
X_valid_loader = prepare_dataloader(X_valid, y_valid, "valid")

In [None]:
# Test loader; the test videos must be paired with the test labels
# (not the validation labels) for evaluation to be meaningful.
X_test_loader = prepare_dataloader(X_test, y_test, "test")

In [None]:
# Rebind the raw split variables to empty lists so the large NumPy arrays are
# no longer referenced from the notebook namespace.
X_train, y_train = [], []
X_valid, y_valid = [], []
X_test, y_test = [], []

In [None]:
class TubeletEmbedding(layers.Layer):
    """Cut a video into non-overlapping tubelets and embed each one.

    A ``Conv3D`` whose kernel size and stride both equal ``patch_size``
    projects every tubelet to ``embed_dim`` channels; the result is then
    reshaped into a sequence of ``embed_dim``-sized tokens.
    """

    def __init__(self, embed_dim, patch_size, **kwargs):
        super().__init__(**kwargs)
        conv_options = dict(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding="VALID",
        )
        self.projection = layers.Conv3D(**conv_options)
        # Collapse all remaining non-batch axes into one token axis.
        self.flatten = layers.Reshape(target_shape=(-1, embed_dim))

    def call(self, videos):
        """Return the token sequence for a batch of videos."""
        return self.flatten(self.projection(videos))

In [None]:
class PositionalEncoder(layers.Layer):
    """Add a per-position embedding to a sequence of tokens."""

    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        """Create the position table once the number of tokens is known."""
        # Expects a rank-3 shape; only the middle entry (token count) is used.
        _, token_count, _ = input_shape
        self.positions = ops.arange(0, token_count, 1)
        self.position_embedding = layers.Embedding(
            input_dim=token_count, output_dim=self.embed_dim
        )

    def call(self, encoded_tokens):
        """Return the tokens with their position embeddings added."""
        return encoded_tokens + self.position_embedding(self.positions)

In [None]:
def create_vivit_classifier(
    tubelet_embedder,
    positional_encoder,
    input_shape=INPUT_SHAPE,
    transformer_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    embed_dim=PROJECTION_DIM,
    layer_norm_eps=LAYER_NORM_EPS,
    num_classes=NUM_CLASSES,
):
    """Assemble the ViViT classifier as a functional Keras model.

    Args:
        tubelet_embedder: Layer that turns a video batch into a token sequence.
        positional_encoder: Layer that adds position information to the tokens.
        input_shape: Shape of one input clip (without the batch axis).
        transformer_layers: Number of stacked Transformer encoder blocks.
        num_heads: Attention heads per block; each head uses
            ``embed_dim // num_heads`` as its key dimension.
        embed_dim: Token width used throughout the encoder.
        layer_norm_eps: Epsilon used by every ``LayerNormalization`` layer.
        num_classes: Number of output classes of the softmax head.

    Returns:
        An uncompiled ``keras.Model`` mapping clips to class probabilities.
    """
    # Get the input layer
    inputs = layers.Input(shape=input_shape)
    # Create patches.
    patches = tubelet_embedder(inputs)
    # Encode patches.
    encoded_patches = positional_encoder(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization and MHSA. The epsilon comes from the argument
        # so that callers overriding layer_norm_eps affect every norm layer.
        x1 = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=0.1
        )(x1, x1)

        # Skip connection
        x2 = layers.Add()([attention_output, encoded_patches])

        # Layer Normalization and MLP
        x3 = layers.LayerNormalization(epsilon=layer_norm_eps)(x2)
        x3 = keras.Sequential(
            [
                layers.Dense(units=embed_dim * 4, activation=ops.gelu),
                layers.Dense(units=embed_dim, activation=ops.gelu),
            ]
        )(x3)

        # Skip connection
        encoded_patches = layers.Add()([x3, x2])

    # Layer normalization and Global average pooling.
    representation = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
    representation = layers.GlobalAvgPool1D()(representation)

    # Classify outputs.
    outputs = layers.Dense(units=num_classes, activation="softmax")(representation)

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [None]:
def run_experiment():
    """Build, train and evaluate the ViViT classifier.

    Trains on ``X_train_loader`` for ``EPOCHS`` epochs while validating on
    ``X_valid_loader``, saves a plot of the training history to Google Drive,
    and evaluates on ``X_test_loader``.

    Returns:
        The trained ``keras.Model``.
    """
    plot_path = "/content/drive/MyDrive/Bakis/vivit/train.jpg"
    # Create the output folder up front so a missing directory fails before
    # the training run instead of after it.
    os.makedirs(os.path.dirname(plot_path), exist_ok=True)

    # Initialize model
    model = create_vivit_classifier(
        tubelet_embedder=TubeletEmbedding(
            embed_dim=PROJECTION_DIM, patch_size=PATCH_SIZE
        ),
        positional_encoder=PositionalEncoder(embed_dim=PROJECTION_DIM),
    )

    # Compile the model with the optimizer, loss function
    # and the metrics.
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
        ],
    )

    # Train the model.
    history = model.fit(X_train_loader, epochs=EPOCHS, validation_data=X_valid_loader)

    # Persist the per-epoch loss/metric curves.
    hist = pd.DataFrame(history.history)
    plot = hist.plot()
    plot.figure.savefig(plot_path)

    # Report held-out performance (named so the builtin `eval` is not shadowed).
    test_metrics = model.evaluate(X_test_loader, return_dict=True)
    print(f"Test metrics: {test_metrics}")

    return model

In [None]:
# Train and evaluate via run_experiment, then save the returned model to
# Google Drive as an .h5 file.
model = run_experiment()
model.save("/content/drive/MyDrive/Bakis/vivit/model.h5")

In [None]:
# Final cell, run after the model has been saved.
# NOTE(review): presumably disconnects and releases the Colab runtime so it
# stops consuming resources — confirm against the Colab documentation.
from google.colab import runtime
runtime.unassign()