All the imports

In [None]:
import torch
import matplotlib.pyplot as plt
from PIL import Image
from torchsummary import summary
import torch.nn.functional as F
from torch import nn
from torchvision.transforms import Compose, Resize, ToTensor
import torchvision
import numpy as np
from tqdm import tqdm
from torch.optim import Adam, SGD
from torch.nn import CrossEntropyLoss
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader

import pandas as pd
import os
import time

This cell connects the notebook to Google Drive. The zipped dataset is kept on Drive and extracted from there, and saved artifacts are stored back on Drive.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so later cells can read
# the dataset archive and the model checkpoint stored under MyDrive.
from google.colab import drive

# NOTE(review): force_remount=True presumably re-mounts even when Drive is
# already attached, so re-running this cell is safe - confirm against the Colab docs.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


This is used to extract zipped data folder

In [None]:
import shutil  # package used to extract data

ARCHIVE_PATH = "/content/drive/MyDrive/rvl-cdip.tar.gz"
DATA_ROOT = "/data"

# Unpacking the whole archive is slow, so skip it when an earlier run of this
# cell already produced the two directories the later cells read from.
# Delete DATA_ROOT to force a fresh extraction (e.g. after an interrupted run).
_already_extracted = all(
    os.path.isdir(os.path.join(DATA_ROOT, sub_dir)) for sub_dir in ("labels", "images")
)
if _already_extracted:
    print(f"{DATA_ROOT} already contains labels/ and images/ - skipping extraction")
else:
    shutil.unpack_archive(ARCHIVE_PATH, DATA_ROOT)

Data preprocessing of testing data and cross-validation data

In [None]:
# Directory that the relative image paths in the label files are resolved against.
IMAGE_ROOT = "/data/images/"
# Test image that is corrupt and therefore left out of the test split.
CORRUPT_TEST_IMAGE = "/data/images/imagese/e/j/e/eje42e00/2500126531_2500126536.tif"


def _read_label_lines(label_file):
    """Return every line of ``label_file``; the file is closed before returning."""
    with open(label_file) as handle:
        return handle.readlines()


def _parse_label_lines(lines, skip_paths=()):
    """Convert ``"<relative image path> <label>"`` lines into two parallel lists.

    Args:
        lines: Raw lines of a label file.
        skip_paths: Full image paths that must be left out of the result.

    Returns:
        ``(paths, labels)`` where ``paths`` holds ``IMAGE_ROOT + relative path``
        and ``labels`` holds the matching labels as ``int``.

    Raises:
        ValueError: If a non-blank line has no space-separated label or the
            label is not an integer.
    """
    paths, labels = [], []
    for line in lines:
        entry = line.strip()
        if not entry:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        rel_path, label = entry.rsplit(" ", 1)
        full_path = IMAGE_ROOT + rel_path
        if full_path in skip_paths:
            continue
        paths.append(full_path)
        labels.append(int(label))
    return paths, labels


# Testing split: image paths in `test`, integer labels in `y_test`.
test_p = _read_label_lines("/data/labels/test.txt")
test, y_test = _parse_label_lines(test_p, skip_paths=(CORRUPT_TEST_IMAGE,))

# Cross-validation (cv) split: image paths in `cv`, integer labels in `y_cv`.
cv_p = _read_label_lines("/data/labels/val.txt")
cv, y_cv = _parse_label_lines(cv_p)

Storing the data into Dataframes of image paths and labels as columns

In [None]:
# One row per image: the file path and its class label stored as an integer.
df_test = pd.DataFrame({"paths": test, "labels": y_test}).astype({"labels": int})

df_cv = pd.DataFrame({"paths": cv, "labels": y_cv}).astype({"labels": int})

Creating subfolders for each class and storing images for that particular class within them, used later with ImageFolder for generating DataLoader object

In [None]:
def _move_into_class_folders(frame, split_root, num_classes=16):
    """Move each image listed in ``frame`` into ``split_root/<label>/``.

    The per-class sub-folders are created when missing. Moving stays
    best-effort: a file that cannot be moved is counted and skipped rather
    than aborting the whole cell.

    Args:
        frame: DataFrame with a ``paths`` and a ``labels`` column.
        split_root: Directory that receives one sub-folder per class.
        num_classes: Number of class folders; labels are ``0 .. num_classes - 1``.

    Returns:
        Number of files that could not be moved.
    """
    skipped = 0
    for class_id in range(num_classes):
        class_dir = os.path.join(split_root, str(class_id))
        os.makedirs(class_dir, exist_ok=True)
        for image_path in frame.loc[frame.labels == class_id, "paths"]:
            try:
                shutil.move(image_path, class_dir)
            except (shutil.Error, OSError):
                # Only file-system failures are tolerated, e.g. the source is
                # gone or the target already exists after a previous run.
                skipped += 1
    return skipped


# move the images to the test and cv folders
for _split_name, _split_frame in (("test", df_test), ("cv", df_cv)):
    _skipped = _move_into_class_folders(_split_frame, "/dataset/" + _split_name + "/")
    if _skipped:
        print(f"{_split_name}: {_skipped} image(s) could not be moved and were skipped")

Transformation that is applied to the image

In [None]:
# Preprocessing applied to every image before it is fed to the model.
_INPUT_SIZE = (224, 224)  # target height and width in pixels
_CHANNEL_MEAN = [0.5, 0.5, 0.5]  # subtracted from each of the three channels
_CHANNEL_STD = [0.5, 0.5, 0.5]  # each channel is divided by this afterwards

_resize = transforms.Resize(_INPUT_SIZE)
_to_tensor = transforms.ToTensor()  # image -> PyTorch tensor
_normalize = transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD)

transform_t = transforms.Compose([_resize, _to_tensor, _normalize])

Creating dataset and DataLoader objects of testing data

In [None]:
# Both splits are read with identical loader settings.
# shuffle=True randomises the order in which samples are visited, and therefore
# the row order of anything collected batch by batch from these loaders.
_loader_options = dict(batch_size=32, shuffle=True, num_workers=4, pin_memory=False)

# NOTE(review): ImageFolder presumably assigns class indices from the sorted
# folder names ("0", "1", "10", ...), which would differ from the numeric folder
# label - confirm the checkpoint was trained with the same mapping.
test_dataset = torchvision.datasets.ImageFolder(
    root="/dataset/test/", transform=transform_t
)
test_data_loader = DataLoader(test_dataset, **_loader_options)

cv_dataset = torchvision.datasets.ImageFolder(
    root="/dataset/cv/", transform=transform_t
)
cv_data_loader = DataLoader(cv_dataset, **_loader_options)

Install timm

In [None]:
!pip install timm     # install timm module to import pretrained ViT models
import timm

Train and validate functions

In [None]:
# Function to train the model for one epoch.
def train(model, trainloader, optimizer, criterion):
    """Run one training epoch and report its loss and accuracy.

    Batches are moved to the notebook-level ``device`` before the forward pass.

    Args:
        model: Network to optimise; it is switched to train mode.
        trainloader: DataLoader yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(epoch_loss, epoch_acc)``: the loss averaged over the batches and the
        accuracy in percent over ``len(trainloader.dataset)`` samples.
    """
    model.train()
    print("Training")

    loss_sum = 0.0  # sum of the per-batch losses
    n_correct = 0  # number of correctly classified samples
    n_batches = 0

    for batch in tqdm(trainloader, total=len(trainloader)):
        n_batches += 1
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous batch
        outputs = model(inputs)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        optimizer.step()

        # Bookkeeping uses the outputs computed before the parameter update.
        loss_sum += batch_loss.item()
        predicted = outputs.argmax(dim=1)
        n_correct += (predicted == targets).sum().item()

    epoch_loss = loss_sum / n_batches
    epoch_acc = 100.0 * (n_correct / len(trainloader.dataset))
    return epoch_loss, epoch_acc


def validate(model, testloader, criterion):
    """Evaluate ``model`` on ``testloader`` without updating any parameter.

    The model is put in eval mode and gradient tracking is disabled. Batches
    are moved to the notebook-level ``device`` before the forward pass.

    Args:
        model: Network to evaluate.
        testloader: DataLoader yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(epoch_loss, epoch_acc)``: the loss averaged over the batches and the
        accuracy in percent over ``len(testloader.dataset)`` samples.
    """
    model.eval()
    print("Validation")

    loss_sum = 0.0  # sum of the per-batch losses
    n_correct = 0  # number of correctly classified samples
    n_batches = 0

    with torch.no_grad():
        for batch in tqdm(testloader, total=len(testloader)):
            n_batches += 1
            inputs, targets = batch
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss_sum += criterion(outputs, targets).item()

            predicted = outputs.argmax(dim=1)
            n_correct += (predicted == targets).sum().item()

    epoch_loss = loss_sum / n_batches
    epoch_acc = 100.0 * (n_correct / len(testloader.dataset))
    return epoch_loss, epoch_acc

Setting GPU/CPU for training

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
_cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if _cuda_available else "cpu")
if _cuda_available:
    print("Training on GPU")

Training on GPU


Loading models so that we have the same architecture

In [None]:
# Build the ViT architecture with a 16-class head. pretrained=False means no
# pretrained weights are fetched here, so the model starts with freshly
# initialised parameters.
VIT_VARIANT = "vit_base_patch16_224.augreg2_in21k_ft_in1k"
model_valid = timm.create_model(VIT_VARIANT, pretrained=False, num_classes=16)
model_valid = model_valid.to(device)

# Print the layer-by-layer architecture for a 3 x 224 x 224 input.
summary(model_valid, (3, 224, 224))

Loading the weights from the pretrained model

In [None]:
# SECURITY NOTE: torch.load unpickles the file, which can run arbitrary code;
# only load checkpoints that come from a trusted source.
# map_location=device remaps the stored tensors onto the device selected above,
# so a checkpoint saved on a GPU also loads on a CPU-only runtime.
check_point = torch.load(
    "/content/drive/MyDrive/model_DOC_VIT_6.pth",  # path to the saved checkpoint
    map_location=device,
)

# Copy the stored weights into the freshly built model.
model_valid.load_state_dict(check_point["model_state_dict"])

<All keys matched successfully>

Function to extract features

In [None]:
import joblib  # package to dump files
import numpy


def feature_extraction(model, testloader, file_name):
    """Extract one feature vector per sample and dump them to ``file_name``.

    For every batch, ``model.forward_features`` is evaluated on the
    notebook-level ``device`` and the slice ``[:, 0]`` of its output is kept.
    NOTE(review): index 0 along dim 1 is presumably the ViT class token -
    confirm for the model in use.

    A batch that raises is reported and skipped. Its features and labels are
    both left out, so the two collections stay aligned.

    Args:
        model: Model exposing ``forward_features``; it is switched to eval mode.
        testloader: DataLoader yielding ``(image, labels)`` batches.
        file_name: Path the list of per-batch NumPy feature arrays is written
            to with ``joblib.dump``.

    Returns:
        List with one label tensor per processed batch, in processing order.
        The tensors are returned as yielded by ``testloader`` (not moved to
        ``device``); calling ``.cpu()`` on them remains valid.
    """
    model.eval()  # inference behaviour for layers that differ between train/eval
    print("Feature extraction")
    total_features = []  # one NumPy array per batch
    total_labels = []  # one label tensor per batch
    skipped_batches = 0
    with torch.no_grad():
        for batch_index, data in tqdm(enumerate(testloader), total=len(testloader)):
            try:
                image, labels = data
                outputs = model.forward_features(image.to(device))
                # Move to host memory and convert to NumPy before dumping.
                batch_features = outputs[:, 0].cpu().numpy()
            except Exception as e:  # best effort: report the batch and carry on
                skipped_batches += 1
                print(f"Skipping batch {batch_index}: {e}")
                continue
            total_features.append(batch_features)
            total_labels.append(labels)
    if skipped_batches:
        print(f"{skipped_batches} batch(es) were skipped during feature extraction")
    # Persist the list of per-batch feature arrays.
    joblib.dump(total_features, file_name)
    return total_labels

Extracting features for the test and cross-validation data and storing them with joblib in the files `test_feat` and `cv_feat`

In [None]:
# Test split: features are written to the file "test_feat"; the labels come back
# as a list with one tensor per batch.
test_labels = feature_extraction(
    model=model_valid, testloader=test_data_loader, file_name="test_feat"
)
# Cross-validation split: features are written to the file "cv_feat".
valid_labels = feature_extraction(
    model=model_valid, testloader=cv_data_loader, file_name="cv_feat"
)

Feature extraction


100%|██████████| 1250/1250 [07:36<00:00,  2.74it/s]


Feature extraction


100%|██████████| 1250/1250 [07:42<00:00,  2.70it/s]


concatenating the features and labels

In [None]:
def _concat_label_batches(label_batches):
    """Join per-batch label tensors into one 1-D float64 NumPy array.

    The leading empty float array keeps the result dtype at float64 and makes
    an empty ``label_batches`` yield an empty array instead of raising.
    """
    pieces = [np.zeros(0)]
    pieces.extend(batch.cpu().numpy() for batch in label_batches)
    return np.concatenate(pieces, axis=0)


# Features were dumped as a list of per-batch arrays in the files "test_feat"
# and "cv_feat"; stack each list into a single 2-D array (one row per sample).
features_test = joblib.load("test_feat")
features_numpy_test = np.concatenate(features_test, axis=0)
print(features_numpy_test.shape)
features_valid = joblib.load("cv_feat")
features_numpy_valid = np.concatenate(features_valid, axis=0)
print(features_numpy_valid.shape)

# A single concatenate call per split instead of growing the array batch by batch.
labels_test = _concat_label_batches(test_labels)
labels_valid = _concat_label_batches(valid_labels)

print(labels_valid.shape)
print(labels_test.shape)

# Every feature row needs exactly one label; fail early if they diverged.
assert features_numpy_test.shape[0] == labels_test.shape[0], "test features/labels misaligned"
assert features_numpy_valid.shape[0] == labels_valid.shape[0], "cv features/labels misaligned"

(39996, 768)
(39995, 768)
(39995,)
(39996,)


**Meta-Classifiers**

SVM - Linear Classifier - KNN

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# The classifiers are fitted on the cv features/labels and scored on the test features.
X_train, Y_train = features_numpy_valid, labels_valid
X_test = features_numpy_test

# NOTE: this rebinds the name SVC from the scikit-learn class to the fitted
# one-vs-rest SVM; the rest of the notebook uses `SVC` as that fitted model.
SVC = OneVsRestClassifier(SVC()).fit(X_train, Y_train)


def evaluate(classifier, test_data, true_labels):
    """Score ``classifier`` on ``test_data``.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        test_data: Samples passed to ``classifier.predict``.
        true_labels: Ground-truth labels for ``test_data``.

    Returns:
        The value of ``accuracy_score(true_labels, predictions)``.
    """
    predictions = classifier.predict(test_data)
    return accuracy_score(true_labels, predictions)


# SVM: `SVC` holds the model fitted earlier in this cell.
acc_SVC = evaluate(SVC, X_test, labels_test)
print("SVM accuracy:{}".format(acc_SVC * 100), "%")

# Linear classifier (LC): one-vs-rest logistic regression.
LC = OneVsRestClassifier(LogisticRegression())
LC.fit(X_train, Y_train)
acc_LC = evaluate(LC, X_test, labels_test)
print("LC accuracy:{}".format(acc_LC * 100), "%")

# KNN: one-vs-rest k-nearest-neighbours with the estimator's default settings.
KNN = OneVsRestClassifier(KNeighborsClassifier())
KNN.fit(X_train, Y_train)
acc_KNN = evaluate(KNN, X_test, labels_test)
print("KNN accuracy:{}".format(acc_KNN * 100), "%")

MLP

In [None]:
# MLP classifier: two hidden layers (512 and 256 units) followed by an output
# layer with num_classes units.
class mlp(nn.Module):
    """Fully connected classification head: in_features -> 512 -> 256 -> num_classes.

    ``forward`` returns raw, unnormalised logits. ``nn.CrossEntropyLoss``, which
    this notebook trains with, applies log-softmax itself; feeding it softmax
    probabilities would normalise twice and flatten the gradients. Apply
    ``torch.softmax(logits, dim=1)`` to the output when probabilities are needed.

    Args:
        num_classes: Number of output classes.
        in_features: Length of each input feature vector (default 768).
    """

    def __init__(self, num_classes, in_features=768):
        super().__init__()
        self.ln1 = nn.Linear(in_features, 512)
        self.ln2 = nn.Linear(512, 256)
        self.ln4 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, num_classes)`` logits."""
        x = torch.relu(self.ln1(x))
        x = torch.relu(self.ln2(x))
        return self.ln4(x)

Dataloader for MLP classifier

In [None]:
num_classes = 16
from torch.utils.data import TensorDataset, DataLoader

mlp_model = mlp(num_classes).to(device)  # initialize the mlp model

# cv split: the data the MLP is trained on.
valid_mlp_data = torch.tensor(features_numpy_valid, dtype=torch.float32)
# Class-index targets must be integer (long) tensors for the cross-entropy loss.
valid_mlp_lab = torch.tensor(labels_valid, dtype=torch.long)
valid_mlp_dataset = TensorDataset(valid_mlp_data, valid_mlp_lab)
# shuffle=True draws a fresh sample order every epoch, so SGD does not depend on
# the order in which the features happened to be extracted.
valid_mlp_dataloader = DataLoader(valid_mlp_dataset, batch_size=32, shuffle=True)

# test split: only used for evaluation, so its order is left untouched.
test_mlp_data = torch.tensor(features_numpy_test, dtype=torch.float32)
test_mlp_lab = torch.tensor(labels_test, dtype=torch.long)
test_mlp_dataset = TensorDataset(test_mlp_data, test_mlp_lab)
test_mlp_dataloader = DataLoader(test_mlp_dataset, batch_size=32)

MLP training

In [None]:
# Plain SGD with cross-entropy loss for the MLP head.
mlp_loss = nn.CrossEntropyLoss()
mlp_optimizer = SGD(mlp_model.parameters(), lr=0.1)

# Per-epoch history: "train" is measured on the cv-split features the MLP is
# fitted on, "valid" on the test-split features.
train_loss, train_acc = [], []
valid_loss, valid_acc = [], []

for i in range(30):
    # One pass over the cv features, then an evaluation pass over the test features.
    train_epoch_loss, train_epoch_acc = train(
        mlp_model, valid_mlp_dataloader, mlp_optimizer, mlp_loss
    )
    valid_epoch_loss, valid_epoch_acc = validate(
        mlp_model, test_mlp_dataloader, mlp_loss
    )

    # Record this epoch's metrics.
    train_loss.append(train_epoch_loss)
    train_acc.append(train_epoch_acc)
    valid_loss.append(valid_epoch_loss)
    valid_acc.append(valid_epoch_acc)

    # Report this epoch's metrics.
    print(f"Training loss: {train_epoch_loss:.3f}, training acc: {train_epoch_acc:.3f}")
    print(
        f"Validation loss: {valid_epoch_loss:.3f}, validation acc: {valid_epoch_acc:.3f}"
    )
    print("-" * 50)

Evaluation Metric - Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix


def evaluate(classifier, test_data, true_labels):
    """Score ``classifier`` and also hand back what is needed for a confusion matrix.

    NOTE: this redefines the ``evaluate`` from the meta-classifier cell; this
    version returns a 3-tuple instead of a single accuracy value.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        test_data: Samples passed to ``classifier.predict``.
        true_labels: Ground-truth labels for ``test_data``.

    Returns:
        ``(accuracy, true_labels, y_pred)`` where ``accuracy`` is
        ``accuracy_score(true_labels, y_pred)``.
    """
    predictions = classifier.predict(test_data)
    return accuracy_score(true_labels, predictions), true_labels, predictions


import seaborn as sns

# Ground truth and predictions of the fitted SVM on the test features.
_, gt_test, pred_test = evaluate(SVC, X_test, labels_test)
confusion = confusion_matrix(gt_test.ravel(), pred_test.ravel())

# Draw the matrix as an annotated heatmap (integer counts, no colour bar).
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
heatmap_ax = sns.heatmap(confusion, annot=True, fmt="d", cmap="Blues", cbar=False)
heatmap_ax.set_xlabel("Predicted Labels")
heatmap_ax.set_ylabel("True Labels")
heatmap_ax.set_title("Confusion Matrix")
plt.show()