This code provides a comprehensive setup to pretrain a ViT model on ImageNet and fine-tune it on the BRACS dataset, including data preprocessing, augmentation, training, evaluation, and visualization of results. Adjust paths, hyperparameters, and dataset details as need:

## Step 1: Setup Colab Environment
I ran out of GPU and my Colab environment is set up to use a cpu.



In [1]:
# Ensure GPU is enabled
import torch
print(torch.cuda.is_available())  # Should return True if GPU is available


False


## Step 2: Install Required Libraries
Install the necessary libraries including PyTorch, Hugging Face transformers, and torchvision.

In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()


In [None]:
!pip install torch torchvision transformers huggingface_hub

# Authenticate with Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()

In [2]:
!pip install torch torchvision transformers


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

## Step 3: Load and Preprocess the ImageNet Dataset for Pretraining


In [3]:
from transformers import ViTFeatureExtractor, ViTModel, ViTForImageClassification
from PIL import Image
import requests

# Load a sample image from ImageNet (this is for demonstration; ImageNet should be loaded properly for real pretraining)
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

# Initialize processor and model
processor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

# Process image
inputs = processor(images=image, return_tensors="pt")

# Forward pass
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

## Step 4: Prepare the BRACS Dataset

In [1]:
!pip install torch torchvision transformers huggingface_hub

# Authenticate with Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()

from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm.notebook import tqdm
from google.colab import drive
import os
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split

# Mount Google Drive
drive.mount('/content/drive')

# Data preparation
data_dir = '/content/drive/My Drive/BRACS'

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the dataset
dataset = ImageFolder(root=data_dir, transform=transform)

# Split dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 5: Define and Fine-Tune the Model on BRACS


In [None]:
from transformers import ViTForImageClassification
import torch
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm.notebook import tqdm

# Load the pre-trained ViT model for image classification
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=7)
model.to('cpu')  # Move the model to CPU

# Define optimizer and loss function
optimizer = Adam(model.parameters(), lr=1e-4)
criterion = CrossEntropyLoss()

# Training loop
num_epochs = 10
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in tqdm(train_loader):
        images, labels = images.to('cpu'), labels.to('cpu')  # Move images and labels to CPU

        # Ensure labels are in the correct range
        labels = labels.long()  # Ensure labels are long type

        # Forward pass
        outputs = model(images).logits
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/114 [00:00<?, ?it/s]

Epoch 1/10, Loss: 0.7302


  0%|          | 0/114 [00:00<?, ?it/s]

## Step 6: Evaluate the Model

In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to('cuda'), labels.to('cuda')
        outputs = model(images).logits
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total:.2f}%')


## Step 6_1:Adding Interpretability

In [None]:
# Example: Applying GradCAM to validation images

model.eval()  # Ensure the model is in evaluation mode

# Choose a few validation images for interpretation
num_images = 5
images, labels = next(iter(val_loader))
images = images[:num_images]
labels = labels[:num_images]

for i, image in enumerate(images):
    input_tensor = image.unsqueeze(0)  # Add batch dimension
    mask = grad_cam.generate_cam(input_tensor, target_class=labels[i].item())
    image = image.permute(1, 2, 0).numpy()  # Convert to HWC format
    cam_image = show_cam_on_image(image, mask)
    plt.figure()
    plt.title(f'Label: {labels[i].item()}')
    plt.imshow(cam_image)
    plt.show()


## Step 7: Plot Training and Validation Loss and Accuracy

In [None]:
import matplotlib.pyplot as plt

# Assuming you've stored loss and accuracy values during training and validation
epochs = range(1, num_epochs + 1)
train_losses = [...]  # Replace with actual training loss values
val_losses = [...]    # Replace with actual validation loss values
train_accuracies = [...]  # Replace with actual training accuracy values
val_accuracies = [...]    # Replace with actual validation accuracy values

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, train_accuracies, label='Training Accuracy')
plt.plot(epochs, val_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


## Step 8: Display Confusion Matrix


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Get predictions and true labels
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to('cuda'), labels.to('cuda')
        outputs = model(images).logits
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Compute confusion matrix
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['no cancer', 'cancer'])
disp.plot(cmap=plt.cm.Blues)
plt.show()
