In [None]:
!pip install --qq medmnist

In [None]:
import os 
import io 
import imageio 
import medmnist 
import ipywidgets 
import numpy as np 
import tensorflow as tf 
from tensorflow import keras 
from keras import layers 

# Single seed value reused for every seeding call in this notebook.
SEED = 42 
# NOTE(review): presumably asks TensorFlow for deterministic cuDNN kernels;
# confirm the variable is still honoured by the installed TensorFlow version.
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
# Hand the seed to Keras' seeding utility before any model or dataset is built.
keras.utils.set_random_seed(SEED)


In [None]:
# DATA
# Key into medmnist.INFO; the 3D organ dataset is registered as 'organmnist3d'.
DATASET_NAME = 'organmnist3d'
BATCH_SIZE = 32
AUTO = tf.data.AUTOTUNE
# Shape of one model input: three spatial axes followed by a single channel.
INPUT_SHAPE = (28, 28, 28, 1)
NUM_CLASSES = 11

# OPTIMIZER
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-5

# TRAINING
EPOCHS = 60

# TUBELET EMBEDDING
PATCH_SIZE = (8, 8, 8)
# The volume is tiled along all three spatial axes, so the number of tubelets
# is the product of the per-axis counts (not the square of one of them).
NUM_PATCHES = (
    (INPUT_SHAPE[0] // PATCH_SIZE[0])
    * (INPUT_SHAPE[1] // PATCH_SIZE[1])
    * (INPUT_SHAPE[2] // PATCH_SIZE[2])
)

# VIT ARCHITECTURE
LAYER_NORM_EPS = 1e-6
PROJECTION_DIM = 128
NUM_HEADS = 8
NUM_LAYERS = 8

In [None]:
def dowload_and_prepare_dataset(data_infor: dict):
    """Download the dataset archive and return its train/val/test splits.

    Args:
        data_infor: Metadata mapping; its 'url' entry is the download origin
            and its 'MD5' entry is the checksum passed to the downloader.

    Returns:
        A 3-tuple ``((train_videos, train_labels), (valid_videos, valid_labels),
        (test_videos, test_labels))``. Labels are flattened to 1-D arrays.
    """
    archive_path = keras.utils.get_file(
        origin=data_infor['url'],
        md5_hash=data_infor['MD5'],
    )

    # (images key, labels key) for each split, in the order they are returned.
    split_keys = (
        ('train_images', 'train_labels'),
        ('val_images', 'val_labels'),
        ('test_images', 'test_labels'),
    )

    # Arrays are read while the archive is open; the context manager closes it.
    with np.load(archive_path) as archive:
        splits = tuple(
            (archive[images_key], archive[labels_key].flatten())
            for images_key, labels_key in split_keys
        )

    return splits

# Look up this dataset's metadata entry in the medmnist registry.
infor = medmnist.INFO[DATASET_NAME]

# Fetch the data, then unpack the three (videos, labels) splits in one step.
prepare_dataste = dowload_and_prepare_dataset(infor)
(
    (train_videos, train_labels),
    (valid_videos, valid_labels),
    (test_videos, test_labels),
) = prepare_dataste



In [None]:
@tf.function
def preprocess(frame: tf.Tensor, label: tf.Tensor):
    """Prepare one sample (volume and label) for the model.

    Args:
        frame: Volume tensor without a channel axis.
        label: Label tensor for the volume.

    Returns:
        A ``(frames, label)`` pair: ``frames`` is ``frame`` with a trailing
        axis of size 1 appended and converted to ``tf.float32`` through
        ``tf.image.convert_image_dtype``; ``label`` is cast to ``tf.float32``.
    """
    # Read from the `frame` argument: the previous body indexed `frames`
    # before it was assigned, which failed as soon as the function was traced.
    frames = tf.image.convert_image_dtype(
        frame[..., tf.newaxis],  # append the channel axis
        tf.float32,
    )
    label = tf.cast(label, tf.float32)
    return frames, label


def prepare_dataloader(
        videos: np.ndarray,
        labels: np.ndarray,
        loader_type: str = "train",
        batch_size: int = BATCH_SIZE,
):
    """Build a batched, prefetched ``tf.data`` pipeline over (video, label) pairs.

    Args:
        videos: Array of volumes; sliced along its first axis into samples.
        labels: Array of labels aligned with ``videos``.
        loader_type: Only the value "train" enables shuffling.
        batch_size: Number of samples per batch; also sizes the shuffle buffer.

    Returns:
        A ``tf.data.Dataset`` that yields preprocessed batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((videos, labels))

    if loader_type == "train":
        # Size the buffer from the argument so a custom batch_size is honoured
        # (identical to the old behaviour when the default is used).
        dataset = dataset.shuffle(batch_size * 2)

    data_loader = (
        dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )
    return data_loader

# One loader per split; only the 'train' loader type triggers shuffling.
trainloader = prepare_dataloader(
    videos=train_videos, labels=train_labels, loader_type='train'
)
validloader = prepare_dataloader(
    videos=valid_videos, labels=valid_labels, loader_type="valid"
)
testloader = prepare_dataloader(
    videos=test_videos, labels=test_labels, loader_type="test"
)

Tubelet Embedding

In [None]:
# Linear embedding of the input videos.
class TubeletEmbedding(layers.Layer):
    """Turn a video volume into a sequence of embedded tubelets.

    A ``Conv3D`` whose kernel size and stride both equal ``patch_size``
    projects each non-overlapping patch to ``embed_dim`` channels; the result
    is then reshaped to ``(batch, num_tokens, embed_dim)``.

    Args:
        embed_dim: Number of output channels per tubelet.
        patch_size: Kernel size and stride of the projection convolution.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.projection_embedding = layers.Conv3D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding='VALID',
        )
        # Reshape (not Flatten) is the layer that accepts `target_shape`;
        # -1 collapses every spatial position into a single token axis.
        self.flatten = layers.Reshape(target_shape=(-1, embed_dim))

    def call(self, videos):
        """Project `videos` into patches and flatten them into tokens."""
        projection_patches = self.projection_embedding(videos)
        flatten_patches = self.flatten(projection_patches)
        return flatten_patches

Positional Embedding 

In [None]:
class PositionalEncoder(layers.Layer):
    """Add a position embedding to every token of a sequence.

    Args:
        embed_dim: Output dimension of the position embedding; must match the
            last axis of the tokens it is added to.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        """Create the embedding table once the token count is known.

        This must be named ``build`` for Keras to invoke it before the first
        ``call``; under any other name the attributes below are never created.
        """
        # input_shape is unpacked as (batch, num_tokens, embed_dim).
        _, num_tokens, _ = input_shape
        self.position_embedding = layers.Embedding(
            input_dim=num_tokens, output_dim=self.embed_dim
        )
        # Position indices 0 .. num_tokens - 1, looked up on every call.
        self.positional = tf.range(start=0, limit=num_tokens, delta=1)

    def call(self, encoded_tokens):
        """Return the token encodings plus their position encodings."""
        encoded_positions = self.position_embedding(self.positional)
        encoded_tokens = encoded_tokens + encoded_positions
        return encoded_tokens
    

Video Vision Transformer 

In [None]:
def create_vitvit_classifier(
        tublet_embed,
        positional_encoder,
        input_shape = INPUT_SHAPE,
        transformer_layers = NUM_LAYERS,
        num_heads = NUM_HEADS,
        embed_dim = PROJECTION_DIM,
        layers_norm_eps = LAYER_NORM_EPS,
        num_classes = NUM_CLASSES,
):
    """Assemble the video transformer classifier as a Keras functional model.

    Args:
        tublet_embed: Layer that maps the input volume to a token sequence.
        positional_encoder: Layer applied to the tokens right after embedding.
        input_shape: Shape of one input sample (without the batch axis).
        transformer_layers: Number of transformer blocks to stack.
        num_heads: Attention heads per block; each head uses
            ``embed_dim // num_heads`` as its key dimension.
        embed_dim: Token width; the MLP expands to ``4 * embed_dim`` and back.
        layers_norm_eps: Epsilon used by every LayerNormalization in the model.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.Model`` mapping inputs to class probabilities.
    """
    # Input, tubelet tokens, and position-aware tokens.
    inputs = layers.Input(shape=input_shape)
    patches = tublet_embed(inputs)
    encoded_patches = positional_encoder(patches)

    for _ in range(transformer_layers):
        # Pre-norm self-attention followed by the first residual connection.
        x1 = layers.LayerNormalization(epsilon=layers_norm_eps)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim // num_heads,
            dropout=0.1,
        )(x1, x1)
        x2 = layers.Add()([attention_output, encoded_patches])

        # Pre-norm MLP followed by the second residual connection.
        x3 = layers.LayerNormalization(epsilon=layers_norm_eps)(x2)
        x3 = keras.Sequential(
            [
                layers.Dense(units=embed_dim * 4, activation=tf.nn.gelu),
                layers.Dropout(0.2),
                layers.Dense(units=embed_dim, activation=tf.nn.gelu),
            ]
        )(x3)
        encoded_patches = layers.Add()([x3, x2])

    # Final normalization, then average over the token axis.
    representation = layers.LayerNormalization(epsilon=layers_norm_eps)(encoded_patches)
    representation = layers.GlobalAvgPool1D()(representation)

    # Classification head.
    outputs = layers.Dense(units=num_classes, activation='softmax')(representation)

    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

Train experiment

In [None]:
def run_experiment():
    """Build, train, and evaluate the classifier.

    The model is trained on ``trainloader`` for ``EPOCHS`` epochs with
    ``validloader`` as validation data, then evaluated on ``testloader``;
    test accuracy and top-5 accuracy are printed.

    Returns:
        The trained ``keras.Model``.
    """
    # Initialize the model with fresh embedding layers.
    model = create_vitvit_classifier(
        tublet_embed=TubeletEmbedding(
            embed_dim=PROJECTION_DIM,
            patch_size=PATCH_SIZE,
        ),
        positional_encoder=PositionalEncoder(embed_dim=PROJECTION_DIM),
    )

    # Compile with the optimizer, loss function, and metrics.
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(
        optimizer=optimizer,
        # The loss is looked up by name, so the identifier must be spelled exactly.
        loss="sparse_categorical_crossentropy",
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
            keras.metrics.SparseTopKCategoricalAccuracy(5, name='top-5-accuracy'),
        ],
    )

    # Train, then report the test metrics.
    model.fit(trainloader, epochs=EPOCHS, validation_data=validloader)
    # evaluate() returns the loss followed by the metrics in compile order.
    _, accuracy, top_5_accuracy = model.evaluate(testloader)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%")

    return model


# Run the full experiment and bind the model it returns to `model`.
model = run_experiment()