# Image classification with Swin Transformers

This example implements [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
by Liu et al. for image classification, and demonstrates it on the
[CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html).

Swin Transformer (**S**hifted **Win**dow Transformer) can serve as a general-purpose backbone
for computer vision. Swin Transformer is a hierarchical Transformer whose
representations are computed with _shifted windows_. The shifted window scheme
brings greater efficiency by limiting self-attention computation to
non-overlapping local windows while also allowing for cross-window connections.
This architecture has the flexibility to model information at various scales and has
a linear computational complexity with respect to image size.

This example requires TensorFlow 2.5 or higher, as well as TensorFlow Addons,
which can be installed using the following commands:

In [1]:
!pip install -U tensorflow-addons

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: tensorflow-addons
  Attempting uninstall: tensorflow-addons
    Found existing installation: tensorflow-addons 0.14.0
    Uninstalling tensorflow-addons-0.14.0:
      Successfully uninstalled tensorflow-addons-0.14.0
Successfully installed tensorflow-addons-0.19.0
[0m

## Setup

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
import os

 The versions of TensorFlow you are currently using is 2.6.3 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [3]:
import numpy as np
import os
from sklearn.metrics import confusion_matrix
#import seaborn as sn; sn.set(font_scale=1.4)
from sklearn.utils import shuffle           
import matplotlib.pyplot as plt             
import cv2                                 
import tensorflow as tf                
from tqdm import tqdm
#from sklearn.metrics import classification_report, log_loss, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten
from keras.utils import np_utils
import tensorflow as tf
import datetime
import numpy as np
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
class_names = ['glioma_tumor','meningioma_tumor','no_tumor','pituitary_tumor']
class_names_label = {class_name:i for i, class_name in enumerate(class_names)}

nb_classes = len(class_names)

IMAGE_SIZE = (128, 128)

In [5]:
def load_data():
    """
        Load the data:
            - 14,034 images to train the network.
            - 3,000 images to evaluate how accurately the network learned to classify images.
    """
    TrainF=r"/kaggle/input/brain-tumor-classification-mri/Training"
    TestF=r"/kaggle/input/brain-tumor-classification-mri/Testing"
    datasets =  [TrainF,TestF]
    output = []
    
    # Iterate through training and test sets
    for dataset in datasets:
        
        images = []
        labels = []
        
        print("Loading {}".format(dataset))
        
        # Iterate through each folder corresponding to a category
        for folder in os.listdir(dataset):
            label = class_names_label[folder]
            
            # Iterate through each image in our folder
            for file in tqdm(os.listdir(os.path.join(dataset, folder))):
                
                # Get the path name of the image
                img_path = os.path.join(os.path.join(dataset, folder), file)
                
                # Open and resize the img
                image = cv2.imread(img_path)
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, IMAGE_SIZE) 
                
                # Append the image and its corresponding label to the output
                images.append(image)
                labels.append(label)
                
        images = np.array(images, dtype = 'float32')
        labels = np.array(labels, dtype = 'int32')   
        
        output.append((images, labels))

    return output

In [6]:
num_classes=4
(x_train, y_train), (x_test, y_test) = load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")

#plt.figure(figsize=(10, 10))
#for i in range(25):
#    plt.subplot(5, 5, i + 1)
#    plt.xticks([])
#    plt.yticks([])
#    plt.grid(False)
#    plt.imshow(x_train[i])
#plt.show()

Loading /kaggle/input/brain-tumor-classification-mri/Training


100%|██████████| 395/395 [00:03<00:00, 117.00it/s]
100%|██████████| 827/827 [00:08<00:00, 96.72it/s] 
100%|██████████| 822/822 [00:07<00:00, 115.96it/s]
100%|██████████| 826/826 [00:07<00:00, 113.40it/s]


Loading /kaggle/input/brain-tumor-classification-mri/Testing


100%|██████████| 105/105 [00:00<00:00, 163.68it/s]
100%|██████████| 74/74 [00:00<00:00, 78.32it/s]
100%|██████████| 115/115 [00:00<00:00, 128.53it/s]
100%|██████████| 100/100 [00:00<00:00, 110.89it/s]


x_train shape: (2870, 128, 128, 3) - y_train shape: (2870, 4)
x_test shape: (394, 128, 128, 3) - y_test shape: (394, 4)


## Configure the hyperparameters

A key parameter to pick is the `patch_size`, the size of the input patches.
In order to use each pixel as an individual input, you can set `patch_size` to `(1, 1)`.
Below, we take inspiration from the original paper settings
for training on ImageNet-1K, keeping most of the original settings for this example.

In [7]:
from keras.preprocessing.image import ImageDataGenerator
import pandas as pd

In [38]:
tumor_dir=r'/kaggle/input/brain-tumor-mri-dataset/Training/glioma'
tumor_dir1=r'/kaggle/input/brain-tumor-mri-dataset/Training/meningioma'
tumor_dir2=r'/kaggle/input/brain-tumor-mri-dataset/Training/pituitary'
healthy_dir=r'/kaggle/input/brain-tumor-mri-dataset/Training/notumor'
filepaths = []
labels= []
dict_list = [tumor_dir,tumor_dir1, tumor_dir2, healthy_dir]
for i, j in enumerate(dict_list):
    flist=os.listdir(j)
    for f in flist:
        fpath=os.path.join(j,f)
        filepaths.append(fpath)
        if i==0:
          labels.append('glioma_tumor')
        elif i==1:
          labels.append('meningioma_tumor')
        elif i==2:
          labels.append('pituitary_tumor')
        else:
          labels.append('no_tumor')
    
Fseries = pd.Series(filepaths, name="filepaths")
Lseries = pd.Series(labels, name="labels")
tumor_data = pd.concat([Fseries,Lseries], axis=1)
tumor_df = pd.DataFrame(tumor_data)
print(tumor_df.head())
print(tumor_df["labels"].value_counts())

#shape of datatset
tumor_df.shape

                                           filepaths        labels
0  /kaggle/input/brain-tumor-mri-dataset/Training...  glioma_tumor
1  /kaggle/input/brain-tumor-mri-dataset/Training...  glioma_tumor
2  /kaggle/input/brain-tumor-mri-dataset/Training...  glioma_tumor
3  /kaggle/input/brain-tumor-mri-dataset/Training...  glioma_tumor
4  /kaggle/input/brain-tumor-mri-dataset/Training...  glioma_tumor
no_tumor            1595
pituitary_tumor     1457
meningioma_tumor    1339
glioma_tumor        1321
Name: labels, dtype: int64


(5712, 2)

In [39]:
from sklearn.model_selection import train_test_split 
train_images, test_images = train_test_split(tumor_df, test_size=0.15, random_state=42)
train_set, val_set = train_test_split(tumor_df, test_size=0.2, random_state=42)

In [10]:
input_shape=(128,128,3)

In [51]:
patch_size = (32, 32)  # 2-by-2 sized patches
dropout_rate = 0.03  # Dropout rate
num_heads =8  # Attention heads
embed_dim = 64  # Embedding dimension
num_mlp = 256  # MLP layer size
qkv_bias = True  # Convert embedded patches to query, key, and values with a learnable additive value
window_size = 2  # Size of attention window
shift_size = 1  # Size of shifting window
image_dimension = 128  # Initial image size

num_patch_x = input_shape[0] // patch_size[0]
num_patch_y = input_shape[1] // patch_size[1]

learning_rate = 1e-3
batch_size = 64
num_epochs = 40
validation_split = 0.1
weight_decay = 0.0001
label_smoothing = 0.1

In [14]:
patch_size = (10, 10)  # Larger patch size to capture more context
dropout_rate = 0.2  # Slightly higher dropout rate for larger input
num_heads = 8  # Keep the same number of attention heads
embed_dim = 128  # Increase embedding dimension for larger input
num_mlp = 512  # Increase MLP layer size for more complex data
qkv_bias = True  # Keep the same
window_size = 5  # Increase attention window size for larger context
shift_size = 2  # Increase shifting window for more overlap
image_dimension = 150  # Update initial image size to match the new input

num_patch_x = input_shape[0] // patch_size[0]
num_patch_y = input_shape[1] // patch_size[1]

learning_rate = 1e-3
batch_size = 32  # Reduce batch size for larger input (adjust as needed)
num_epochs = 40
validation_split = 0.1
weight_decay = 0.0001
label_smoothing = 0.1


## Helper functions

We create two helper functions to help us get a sequence of
patches from the image, merge patches, and apply dropout.

In [12]:

def window_partition(x, window_size):
    _, height, width, channels = x.shape
    patch_num_y = height // window_size
    patch_num_x = width // window_size
    x = tf.reshape(
        x, shape=(-1, patch_num_y, window_size, patch_num_x, window_size, channels)
    )
    x = tf.transpose(x, (0, 1, 3, 2, 4, 5))
    windows = tf.reshape(x, shape=(-1, window_size, window_size, channels))
    return windows


def window_reverse(windows, window_size, height, width, channels):
    patch_num_y = height // window_size
    patch_num_x = width // window_size
    x = tf.reshape(
        windows,
        shape=(-1, patch_num_y, patch_num_x, window_size, window_size, channels),
    )
    x = tf.transpose(x, perm=(0, 1, 3, 2, 4, 5))
    x = tf.reshape(x, shape=(-1, height, width, channels))
    return x


class DropPath(layers.Layer):
    def __init__(self, drop_prob=None, **kwargs):
        super(DropPath, self).__init__(**kwargs)
        self.drop_prob = drop_prob

    def call(self, x):
        input_shape = tf.shape(x)
        batch_size = input_shape[0]
        rank = x.shape.rank
        shape = (batch_size,) + (1,) * (rank - 1)
        random_tensor = (1 - self.drop_prob) + tf.random.uniform(shape, dtype=x.dtype)
        path_mask = tf.floor(random_tensor)
        output = tf.math.divide(x, 1 - self.drop_prob) * path_mask
        return output


## Window based multi-head self-attention

Usually Transformers perform global self-attention, where the relationships between
a token and all other tokens are computed. The global computation leads to quadratic
complexity with respect to the number of tokens. Here, as the [original paper](https://arxiv.org/abs/2103.14030)
suggests, we compute self-attention within local windows, in a non-overlapping manner.
Global self-attention leads to quadratic computational complexity in the number of patches,
whereas window-based self-attention leads to linear complexity and is easily scalable.

In [13]:

class WindowAttention(layers.Layer):
    def __init__(
        self, dim, window_size, num_heads, qkv_bias=True, dropout_rate=0.0, **kwargs
    ):
        super(WindowAttention, self).__init__(**kwargs)
        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5
        self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias)
        self.dropout = layers.Dropout(dropout_rate)
        self.proj = layers.Dense(dim)

    def build(self, input_shape):
        num_window_elements = (2 * self.window_size[0] - 1) * (
            2 * self.window_size[1] - 1
        )
        self.relative_position_bias_table = self.add_weight(
            shape=(num_window_elements, self.num_heads),
            initializer=tf.initializers.Zeros(),
            trainable=True,
        )
        coords_h = np.arange(self.window_size[0])
        coords_w = np.arange(self.window_size[1])
        coords_matrix = np.meshgrid(coords_h, coords_w, indexing="ij")
        coords = np.stack(coords_matrix)
        coords_flatten = coords.reshape(2, -1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.transpose([1, 2, 0])
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)

        self.relative_position_index = tf.Variable(
            initial_value=tf.convert_to_tensor(relative_position_index), trainable=False
        )

    def call(self, x, mask=None):
        _, size, channels = x.shape
        head_dim = channels // self.num_heads
        x_qkv = self.qkv(x)
        x_qkv = tf.reshape(x_qkv, shape=(-1, size, 3, self.num_heads, head_dim))
        x_qkv = tf.transpose(x_qkv, perm=(2, 0, 3, 1, 4))
        q, k, v = x_qkv[0], x_qkv[1], x_qkv[2]
        q = q * self.scale
        k = tf.transpose(k, perm=(0, 1, 3, 2))
        attn = q @ k

        num_window_elements = self.window_size[0] * self.window_size[1]
        relative_position_index_flat = tf.reshape(
            self.relative_position_index, shape=(-1,)
        )
        relative_position_bias = tf.gather(
            self.relative_position_bias_table, relative_position_index_flat
        )
        relative_position_bias = tf.reshape(
            relative_position_bias, shape=(num_window_elements, num_window_elements, -1)
        )
        relative_position_bias = tf.transpose(relative_position_bias, perm=(2, 0, 1))
        attn = attn + tf.expand_dims(relative_position_bias, axis=0)

        if mask is not None:
            nW = mask.get_shape()[0]
            mask_float = tf.cast(
                tf.expand_dims(tf.expand_dims(mask, axis=1), axis=0), tf.float32
            )
            attn = (
                tf.reshape(attn, shape=(-1, nW, self.num_heads, size, size))
                + mask_float
            )
            attn = tf.reshape(attn, shape=(-1, self.num_heads, size, size))
            attn = keras.activations.softmax(attn, axis=-1)
        else:
            attn = keras.activations.softmax(attn, axis=-1)
        attn = self.dropout(attn)

        x_qkv = attn @ v
        x_qkv = tf.transpose(x_qkv, perm=(0, 2, 1, 3))
        x_qkv = tf.reshape(x_qkv, shape=(-1, size, channels))
        x_qkv = self.proj(x_qkv)
        x_qkv = self.dropout(x_qkv)
        return x_qkv


## The complete Swin Transformer model

Finally, we put together the complete Swin Transformer by replacing the standard multi-head
attention (MHA) with shifted windows attention. As suggested in the
original paper, we create a model comprising of a shifted window-based MHA
layer, followed by a 2-layer MLP with GELU nonlinearity in between, applying
`LayerNormalization` before each MSA layer and each MLP, and a residual
connection after each of these layers.

Notice that we only create a simple MLP with 2 Dense and
2 Dropout layers. Often you will see models using ResNet-50 as the MLP which is
quite standard in the literature. However in this paper the authors use a
2-layer MLP with GELU nonlinearity in between.

In [14]:

class SwinTransformer(layers.Layer):
    def __init__(
        self,
        dim,
        num_patch,
        num_heads,
        window_size=7,
        shift_size=0,
        num_mlp=1024,
        qkv_bias=True,
        dropout_rate=0.0,
        **kwargs,
    ):
        super(SwinTransformer, self).__init__(**kwargs)

        self.dim = dim  # number of input dimensions
        self.num_patch = num_patch  # number of embedded patches
        self.num_heads = num_heads  # number of attention heads
        self.window_size = window_size  # size of window
        self.shift_size = shift_size  # size of window shift
        self.num_mlp = num_mlp  # number of MLP nodes

        self.norm1 = layers.LayerNormalization(epsilon=1e-5)
        self.attn = WindowAttention(
            dim,
            window_size=(self.window_size, self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            dropout_rate=dropout_rate,
        )
        self.drop_path = DropPath(dropout_rate)
        self.norm2 = layers.LayerNormalization(epsilon=1e-5)

        self.mlp = keras.Sequential(
            [
                layers.Dense(num_mlp),
                layers.Activation(keras.activations.gelu),
                layers.Dropout(dropout_rate),
                layers.Dense(dim),
                layers.Dropout(dropout_rate),
            ]
        )

        if min(self.num_patch) < self.window_size:
            self.shift_size = 0
            self.window_size = min(self.num_patch)

    def build(self, input_shape):
        if self.shift_size == 0:
            self.attn_mask = None
        else:
            height, width = self.num_patch
            h_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            w_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            mask_array = np.zeros((1, height, width, 1))
            count = 0
            for h in h_slices:
                for w in w_slices:
                    mask_array[:, h, w, :] = count
                    count += 1
            mask_array = tf.convert_to_tensor(mask_array)

            # mask array to windows
            mask_windows = window_partition(mask_array, self.window_size)
            mask_windows = tf.reshape(
                mask_windows, shape=[-1, self.window_size * self.window_size]
            )
            attn_mask = tf.expand_dims(mask_windows, axis=1) - tf.expand_dims(
                mask_windows, axis=2
            )
            attn_mask = tf.where(attn_mask != 0, -100.0, attn_mask)
            attn_mask = tf.where(attn_mask == 0, 0.0, attn_mask)
            self.attn_mask = tf.Variable(initial_value=attn_mask, trainable=False)

    def call(self, x):
        height, width = self.num_patch
        _, num_patches_before, channels = x.shape
        x_skip = x
        x = self.norm1(x)
        x = tf.reshape(x, shape=(-1, height, width, channels))
        if self.shift_size > 0:
            shifted_x = tf.roll(
                x, shift=[-self.shift_size, -self.shift_size], axis=[1, 2]
            )
        else:
            shifted_x = x

        x_windows = window_partition(shifted_x, self.window_size)
        x_windows = tf.reshape(
            x_windows, shape=(-1, self.window_size * self.window_size, channels)
        )
        attn_windows = self.attn(x_windows, mask=self.attn_mask)

        attn_windows = tf.reshape(
            attn_windows, shape=(-1, self.window_size, self.window_size, channels)
        )
        shifted_x = window_reverse(
            attn_windows, self.window_size, height, width, channels
        )
        if self.shift_size > 0:
            x = tf.roll(
                shifted_x, shift=[self.shift_size, self.shift_size], axis=[1, 2]
            )
        else:
            x = shifted_x

        x = tf.reshape(x, shape=(-1, height * width, channels))
        x = self.drop_path(x)
        x = x_skip + x
        x_skip = x
        x = self.norm2(x)
        x = self.mlp(x)
        x = self.drop_path(x)
        x = x_skip + x
        return x


## Model training and evaluation

### Extract and embed patches

We first create 3 layers to help us extract, embed and merge patches from the
images on top of which we will later use the Swin Transformer class we built.

In [15]:

class PatchExtract(layers.Layer):
    def __init__(self, patch_size, **kwargs):
        super(PatchExtract, self).__init__(**kwargs)
        self.patch_size_x = patch_size[0]
        self.patch_size_y = patch_size[0]

    def call(self, images):
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=(1, self.patch_size_x, self.patch_size_y, 1),
            strides=(1, self.patch_size_x, self.patch_size_y, 1),
            rates=(1, 1, 1, 1),
            padding="VALID",
        )
        patch_dim = patches.shape[-1]
        patch_num = patches.shape[1]
        return tf.reshape(patches, (batch_size, patch_num * patch_num, patch_dim))


class PatchEmbedding(layers.Layer):
    def __init__(self, num_patch, embed_dim, **kwargs):
        super(PatchEmbedding, self).__init__(**kwargs)
        self.num_patch = num_patch
        self.proj = layers.Dense(embed_dim)
        self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim)

    def call(self, patch):
        pos = tf.range(start=0, limit=self.num_patch, delta=1)
        return self.proj(patch) + self.pos_embed(pos)


class PatchMerging(tf.keras.layers.Layer):
    def __init__(self, num_patch, embed_dim):
        super(PatchMerging, self).__init__()
        self.num_patch = num_patch
        self.embed_dim = embed_dim
        self.linear_trans = layers.Dense(2 * embed_dim, use_bias=False)

    def call(self, x):
        height, width = self.num_patch
        _, _, C = x.get_shape().as_list()
        x = tf.reshape(x, shape=(-1, height, width, C))
        x0 = x[:, 0::2, 0::2, :]
        x1 = x[:, 1::2, 0::2, :]
        x2 = x[:, 0::2, 1::2, :]
        x3 = x[:, 1::2, 1::2, :]
        x = tf.concat((x0, x1, x2, x3), axis=-1)
        x = tf.reshape(x, shape=(-1, (height // 2) * (width // 2), 4 * C))
        return self.linear_trans(x)


### Build the model

We put together the Swin Transformer model.

In [52]:
num_classes=4
input = layers.Input(input_shape)
x = layers.RandomCrop(image_dimension, image_dimension)(input)
x = layers.RandomFlip("horizontal")(x)
x = PatchExtract(patch_size)(x)
x = PatchEmbedding(num_patch_x * num_patch_y, embed_dim)(x)
x = SwinTransformer(
    dim=embed_dim,
    num_patch=(num_patch_x, num_patch_y),
    num_heads=num_heads,
    window_size=window_size,
    shift_size=0,
    num_mlp=num_mlp,
    qkv_bias=qkv_bias,
    dropout_rate=dropout_rate,
)(x)
x = SwinTransformer(
    dim=embed_dim,
    num_patch=(num_patch_x, num_patch_y),
    num_heads=num_heads,
    window_size=window_size,
    shift_size=shift_size,
    num_mlp=num_mlp,
    qkv_bias=qkv_bias,
    dropout_rate=dropout_rate,
)(x)
x = PatchMerging((num_patch_x, num_patch_y), embed_dim=embed_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
output = layers.Dense(num_classes, activation="softmax")(x)

### Train on CIFAR-100

We train the model on CIFAR-100. Here, we only train the model
for 40 epochs to keep the training time short in this example.
In practice, you should train for 150 epochs to reach convergence.

In [48]:
image_gen = ImageDataGenerator(preprocessing_function= tf.keras.applications.mobilenet_v2.preprocess_input)
train = image_gen.flow_from_dataframe(dataframe= train_set,x_col="filepaths",y_col="labels",
                                      target_size=(128,128),
                                      color_mode='rgb',
                                      class_mode="categorical", #used for Sequential Model
                                      batch_size=32,
                                      shuffle=False            #do not shuffle data
                                     )
test = image_gen.flow_from_dataframe(dataframe= test_images,x_col="filepaths", y_col="labels",
                                     target_size=(128,128),
                                     color_mode='rgb',
                                     class_mode="categorical",
                                     batch_size=32,
                                     shuffle= False
                                    )
val = image_gen.flow_from_dataframe(dataframe= val_set,x_col="filepaths", y_col="labels",
                                    target_size=(128,128),
                                    color_mode= 'rgb',
                                    class_mode="categorical",
                                    batch_size=32,
                                    shuffle=False
                                   )

Found 4569 validated image filenames belonging to 4 classes.
Found 857 validated image filenames belonging to 4 classes.
Found 1143 validated image filenames belonging to 4 classes.


In [53]:
model = keras.Model(input, output)
model.compile(
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing),
    #optimizer=tfa.optimizers.AdamW(
    #    learning_rate=learning_rate, weight_decay=weight_decay
    #),
    optimizer=keras.optimizers.Adam(learning_rate=1e-3, decay=weight_decay),
    metrics=[
        keras.metrics.CategoricalAccuracy(name="accuracy"),
        keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
    ],
)

history = model.fit(
    train,
    #y_train,
    #batch_size=128,
    epochs=num_epochs,
    #validation_split=validation_split,
    #validation_data=(x_test,y_test),
    validation_data=val,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [35]:
model = keras.Model(input, output)
model.compile(
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing),
    optimizer=tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    ),
    #optimizer=keras.optimizers.Adam(learning_rate=1e-2, decay=weight_decay),
    metrics=[
        keras.metrics.CategoricalAccuracy(name="accuracy"),
        keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
    ],
)

history = model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=num_epochs,
    #validation_split=validation_split,
    validation_data=(x_test,y_test),
    #validation_data=val,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


Let's visualize the training progress of the model.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the data
ax.plot(history.history["accuracy"], label="Training Accuracy")
ax.plot(history.history["val_accuracy"], label="Validation Accuracy")

# Set the plot title and axis labels
#ax.set_title('ARIMA Performance')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')

# Remove the grid lines
ax.grid(False)

# Set the legend
ax.legend(loc='lower right')
ax.set_facecolor('white')
ax.xaxis.label.set_color('black')
ax.yaxis.label.set_color('black')

# Set the color of tick labels to black
ax.tick_params(axis='x', colors='black')
ax.tick_params(axis='y', colors='black')
#fig.set_facecolor('white')
# Show the plot
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the data
ax.plot(history.history["loss"], label="Training Loss")
ax.plot(history.history["val_loss"], label="Validation Loss")

# Set the plot title and axis labels
#ax.set_title('ARIMA Performance')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')

# Remove the grid lines
ax.grid(False)

# Set the legend
ax.legend(loc='upper right')
ax.set_facecolor('white')
ax.xaxis.label.set_color('black')
ax.yaxis.label.set_color('black')

# Set the color of tick labels to black
ax.tick_params(axis='x', colors='black')
ax.tick_params(axis='y', colors='black')
#fig.set_facecolor('white')
# Show the plot
plt.show()

Let's display the final results of the training on CIFAR-100.

In [None]:
loss, accuracy, top_5_accuracy = model.evaluate(x_test, y_test)
#loss, accuracy, top_5_accuracy = model.evaluate(x_test)
print(f"Test loss: {round(loss, 2)}")
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%")

In [None]:
pred = model.predict(test)
#pred = np.argmax(pred, axis=1) #pick class with highest  probability

#pred

In [None]:
vit_y_pred = [np.argmax(probas) for probas in pred]


In [None]:
pred = model.predict(test)
pred = np.argmax(pred, axis=1) #pick class with highest  probability
labels = (train.class_indices)
labels = dict((v,k) for k,v in labels.items())
pred2 = [labels[k] for k in pred]
from sklearn.metrics import classification_report,accuracy_score

In [None]:
pred = model.predict(test)
pred = np.argmax(pred, axis=1) #pick class with highest  probability

labels = (train.class_indices)
labels = dict((v,k) for k,v in labels.items())
pred2 = [labels[k] for k in pred]

In [None]:
from sklearn.metrics import classification_report,accuracy_score

y_test = test_images.labels # set y_test to the expected output
print(classification_report(y_test, pred2))
print("Accuracy of the Model:",accuracy_score(y_test, pred2)*100,"%")

In [None]:
classification_report.strip()

In [None]:
#from sklearn.metrics import classification_report
report_lines = classification_report.strip().split('\n')
class_names = []
precisions = []
recalls = []
supports = []

for line in report_lines[2:-5]:  # Exclude headers and footer lines
    parts = line.split()
    class_names.append(parts[0])
    precisions.append(float(parts[1]))
    recalls.append(float(parts[2]))
    supports.append(int(parts[-1]))

# Calculate TP, FP, TN, FN for each class
TP = [int(recalls[i] * supports[i]) for i in range(len(class_names))]
FN = [supports[i] - TP[i] for i in range(len(class_names))]
FP = [int((TP[i] / precisions[i]) - TP[i]) for i in range(len(class_names))]
TN = [sum(supports) - (TP[i] + FP[i] + FN[i]) for i in range(len(class_names))]

# Print the results
for i in range(len(class_names)):
    print(f'Class: {class_names[i]} - TP: {TP[i]}, FP: {FP[i]}, TN: {TN[i]}, FN: {FN[i]}')

In [None]:
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix

# Assuming your model is called 'model' and you have the test data 'x_test' and 'y_test'
predictions = model.predict(test)
predicted_labels = (predictions > 0.5).astype(int)  # Thresholding predicted probabilities

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
y_test = test_images.labels 
# Calculate multilabel confusion matrix
confusion_matrix = multilabel_confusion_matrix(y_test, predicted_labels)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
print("Multilabel Confusion Matrix:")
print(confusion_matrix)



In [None]:
from sklearn.metrics import classification_report

# Assuming your model is called 'model' and you have the test data 'x_test' and 'y_test'
predictions = model.predict(test)
predicted_labels = (predictions > 0.5).astype(int)  # Thresholding predicted probabilities

# Reshape the predicted_labels to match the shape of y_test
predicted_labels = predicted_labels.reshape(y_test.shape)

# Generate classification report
classification_report = classification_report(y_test, predicted_labels)
print(classification_report)


In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Assuming your model is called 'model' and you have the test data 'x_test' and 'y_test'
predictions = model.predict(x_test)
predicted_labels = (predictions > 0.5).astype(int)  # Thresholding predicted probabilities

# Reshape the predicted_labels to match the shape of y_test
predicted_labels = predicted_labels.reshape(y_test.shape)

# Calculate multilabel confusion matrix
confusion_matrix = multilabel_confusion_matrix(y_test, predicted_labels)
print("Multilabel Confusion Matrix:")
for i, matrix in enumerate(confusion_matrix):
    print(f"Class {i + 1}:")
    print(matrix)


In [None]:
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

# Assuming your model is called 'model' and you have the test data 'x_test' and 'y_test'
predictions = model.predict(x_test)
predicted_labels = (predictions > 0.5).astype(int)  # Thresholding predicted probabilities

# Reshape the predicted_labels to match the shape of y_test
predicted_labels = predicted_labels.reshape(y_test.shape)

# Calculate multilabel confusion matrix
confusion_matrix = multilabel_confusion_matrix(y_test, predicted_labels)

# Aggregate the individual confusion matrices into a single 4x4 confusion matrix
overall_confusion_matrix = np.vstack((
    np.hstack((confusion_matrix[0], confusion_matrix[1])),
    np.hstack((confusion_matrix[2], confusion_matrix[3]))
))

print("Overall Confusion Matrix:")
print(overall_confusion_matrix)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Assuming your model is called 'model' and you have the test data 'x_test' and 'y_test'
predictions = model.predict(x_test)
predicted_labels = np.argmax(predictions, axis=1)  # Get the predicted labels

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)

# Calculate confusion matrix
confusion_matrix = confusion_matrix(y_test, predicted_labels)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")
print("Confusion Matrix:")
print(confusion_matrix)


In [None]:
#labels = (x_train.class_indices)
labels = dict((v,k) for k,v in class_names_label.items())
pred2 = [labels[k] for k in pred]

In [None]:
y_test.labels

In [None]:
from sklearn.metrics import classification_report,accuracy_score

#y_test = test_images.labels # set y_test to the expected output
print(classification_report(y_test, vit_y_pred))
print("Accuracy of the Model:",accuracy_score(y_test, pred2)*100,"%")

In [None]:
cnf_matrix = confusion_matrix(y_test, vit_y_pred)

The Swin Transformer model we just trained has just 152K parameters, and it gets
us to ~75% test top-5 accuracy within just 40 epochs without any signs of overfitting
as well as seen in above graph. This means we can train this network for longer
(perhaps with a bit more regularization) and obtain even better performance.
This performance can further be improved by additional techniques like cosine
decay learning rate schedule, other data augmentation techniques. While experimenting,
I tried training the model for 150 epochs with a slightly higher dropout and greater
embedding dimensions which pushes the performance to ~72% test accuracy on CIFAR-100
as you can see in the screenshot.

![Results of training for longer](https://i.imgur.com/9vnQesZ.png)

The authors present a top-1 accuracy of 87.3% on ImageNet. The authors also present
a number of experiments to study how input sizes, optimizers etc. affect the final
performance of this model. The authors further present using this model for object detection,
semantic segmentation and instance segmentation as well and report competitive results
for these. You are strongly advised to also check out the
[original paper](https://arxiv.org/abs/2103.14030).

This example takes inspiration from the official
[PyTorch](https://github.com/microsoft/Swin-Transformer) and
[TensorFlow](https://github.com/VcampSoldiers/Swin-Transformer-Tensorflow) implementations.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

input_shape = (128, 128, 3)  # Specify the desired input shape

# Swin Transformer block
class SwinTransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, mlp_dim, qkv_bias, dropout_rate=0.0):
        super(SwinTransformerBlock, self).__init__()

        self.mlp_dim = mlp_dim
        self.qkv_bias = qkv_bias

        self.att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=dropout_rate
        )
        self.mlp = keras.Sequential(
            [
                layers.Dense(units=mlp_dim, activation=keras.activations.gelu),
                layers.Dropout(rate=dropout_rate),
                layers.Dense(units=embed_dim),
                layers.Dropout(rate=dropout_rate),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-5)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-5)
        self.dropout1 = layers.Dropout(rate=dropout_rate)
        self.dropout2 = layers.Dropout(rate=dropout_rate)

    def call(self, inputs, training=False):
        # Cross-attention
        x = inputs
        x = self.layernorm1(x)
        attention_output = self.att(x, x, x)
        attention_output = self.dropout1(attention_output, training=training)
        out1 = x + attention_output

        # MLP
        x = out1
        x = self.layernorm2(x)
        x = self.mlp(x)
        x = self.dropout2(x, training=training)
        out2 = out1 + x

        return out2

# Swin Transformer model
class SwinTransformer(keras.Model):
    def __init__(
        self,
        input_shape=input_shape,
        patch_size=(2, 2),
        num_heads=4,
        embed_dim=64,
        num_mlp_layers=2,
        mlp_dim=256,
        qkv_bias=True,
        dropout_rate=0.0,
        num_classes=1000,
    ):
        super(SwinTransformer, self).__init__()

        self.num_classes = num_classes
        self.patch_size = patch_size
        self.embed_dim = embed_dim

        num_patches = (input_shape[0] // patch_size[0]) * (input_shape[1] // patch_size[1])
        self.patch_proj = layers.Conv2D(embed_dim, patch_size, strides=patch_size, padding="valid")
        self.pos_emb = self.add_weight(
            "pos_emb", shape=(1, num_patches + 1, embed_dim), initializer=keras.initializers.RandomNormal(), trainable=True
        )
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.blocks = [
            SwinTransformerBlock(
                embed_dim=embed_dim,
                num_heads=num_heads,
                mlp_dim=mlp_dim,
                qkv_bias=qkv_bias,
                dropout_rate=dropout_rate,
            )
            for _ in range(num_mlp_layers)
        ]
        self.layernorm = layers.LayerNormalization(epsilon=1e-5)
        self.mlp_head = keras.Sequential(
            [
                layers.Dense(units=mlp_dim, activation=keras.activations.gelu),
                layers.Dropout(rate=dropout_rate),
                layers.Dense(units=num_classes),
            ]
        )

    def call(self, inputs, training=False):
        # Patch projection
        x = self.patch_proj(inputs)
        x = tf.reshape(x, shape=(-1, x.shape[1] * x.shape[2], x.shape[3]))

        # Positional embedding
        x = x + self.pos_emb

        # Dropout
        x = self.dropout(x, training=training)

        # Transformer blocks
        for block in self.blocks:
            x = block(x, training=training)

        # Layer normalization
        x = self.layernorm(x)

        # MLP head
        x = tf.reduce_mean(x, axis=1)
        x = self.mlp_head(x)

        return x

# Create the Swin Transformer model
model = SwinTransformer()

# Print model summary
model.build(input_shape=(None, *input_shape))
model.summary()


In [22]:
!pip install timm

Collecting timm
  Downloading timm-0.9.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting safetensors
  Downloading safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, timm
Successfully installed safetensors-0.3.3 timm-0.9.5
[0m

In [24]:
import torchvision
import torch
from PIL import Image
import os
import numpy as np
import matplotlib.pyplot as plt
import random
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torchvision.transforms as transforms
import cv2
import glob
import math
import timm
from PIL import ImageFilter
#from einops import rearrange
from timm.loss import LabelSmoothingCrossEntropy

In [25]:
timm.list_models('swin*', pretrained=True)

['swin_base_patch4_window7_224.ms_in1k',
 'swin_base_patch4_window7_224.ms_in22k',
 'swin_base_patch4_window7_224.ms_in22k_ft_in1k',
 'swin_base_patch4_window12_384.ms_in1k',
 'swin_base_patch4_window12_384.ms_in22k',
 'swin_base_patch4_window12_384.ms_in22k_ft_in1k',
 'swin_large_patch4_window7_224.ms_in22k',
 'swin_large_patch4_window7_224.ms_in22k_ft_in1k',
 'swin_large_patch4_window12_384.ms_in22k',
 'swin_large_patch4_window12_384.ms_in22k_ft_in1k',
 'swin_s3_base_224.ms_in1k',
 'swin_s3_small_224.ms_in1k',
 'swin_s3_tiny_224.ms_in1k',
 'swin_small_patch4_window7_224.ms_in1k',
 'swin_small_patch4_window7_224.ms_in22k',
 'swin_small_patch4_window7_224.ms_in22k_ft_in1k',
 'swin_tiny_patch4_window7_224.ms_in1k',
 'swin_tiny_patch4_window7_224.ms_in22k',
 'swin_tiny_patch4_window7_224.ms_in22k_ft_in1k',
 'swinv2_base_window8_256.ms_in1k',
 'swinv2_base_window12_192.ms_in22k',
 'swinv2_base_window12to16_192to256.ms_in22k_ft_in1k',
 'swinv2_base_window12to24_192to384.ms_in22k_ft_in1k',


In [27]:
!pip install huggingface_hub

[0m

In [28]:
model = timm.create_model('swin_base_patch4_window7_224', pretrained=True)
model.head = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, class_len)) # Modify head according to this task

model = model.to(device)

criterion = LabelSmoothingCrossEntropy() # this is better than nn.CrossEntropyLoss
criterion = criterion.to(device)

optimizer = torch.optim.AdamW(model.head.parameters(), lr=lr) # Setting for transfer learning

RuntimeError: Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.