In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import numpy as np
from transformers import AutoImageProcessor, AutoModelForImageClassification


In [6]:
# Load the image array and its label matrix from disk.
data = np.load('data/images.npy')
label = np.load('data/labels.npy')

# Collapse each label row to the index of its largest entry
# (presumably the rows are one-hot class vectors -- TODO confirm).
labels = label.argmax(axis=1)

In [7]:
# Processor matching the ViT checkpoint loaded below; NumpyDataset reads its
# image_mean / image_std to normalize inputs.
image_processor = AutoImageProcessor.from_pretrained(
    "google/vit-base-patch16-224",
    use_fast=True,
)

In [8]:
class NumpyDataset(Dataset):
    """Map-style dataset pairing in-memory images with their labels.

    Each image is converted to a PIL image, resized to 224x224, turned into a
    tensor and normalized with the processor's ``image_mean`` / ``image_std``.

    Args:
        images: Indexable collection of images; each item must be accepted by
            ``transforms.ToPILImage`` (assumed to be HxWxC arrays -- TODO confirm).
        labels: Indexable collection of labels, aligned with ``images``.
        processor: Object exposing ``image_mean`` and ``image_std``.
    """

    def __init__(self, images, labels, processor):
        self.images = images
        self.labels = labels
        self.processor = processor

        # Build the preprocessing steps once; they are applied per item on access.
        steps = [
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=processor.image_mean, std=processor.image_std),
        ]
        self.transform = transforms.Compose(steps)

    def __len__(self):
        """Return the number of images held by the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for position ``idx``."""
        return self.transform(self.images[idx]), self.labels[idx]

In [9]:
# Wrap the arrays and split 80/20 into training and validation subsets.
dataset = NumpyDataset(data, labels, image_processor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)

# Seed the split so Restart & Run All yields the same train/validation
# membership (and therefore comparable validation accuracy) every time.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)

In [10]:
# Batches of 16 for both splits; only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [11]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained ViT classifier and move it to the chosen device.
model = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224"
)
# Left as the cell's last expression, so the notebook displays the module repr.
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

In [12]:
# Freeze every parameter under model.vit so no gradients are computed for them.
for backbone_param in model.vit.parameters():
    backbone_param.requires_grad_(False)


In [13]:
# Swap the classification head for a fresh linear layer with num_classes
# outputs; its parameters are new, so they remain trainable.
num_classes = 10
model.classifier = nn.Linear(
    in_features=model.config.hidden_size,
    out_features=num_classes,
).to(device)

In [14]:
# Cross-entropy over the logits, taking integer class indices as targets.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters with a learning rate of 1e-4.
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
# Train for a fixed number of epochs and report the mean per-sample loss.
epochs = 15
for epoch in range(epochs):
    model.train()
    running_loss = 0.0   # sum of per-sample losses seen this epoch
    samples_seen = 0     # number of samples seen this epoch
    for images, labels_batch in train_loader:
        images = images.to(device)
        labels_batch = labels_batch.to(device)

        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean (default reduction), so weight it by
        # the batch size; otherwise a smaller final batch is over-represented
        # in the epoch average.
        batch_count = labels_batch.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/max(samples_seen, 1):.4f}")

In [None]:
# Measure top-1 accuracy on the validation split with gradients disabled.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels_batch in val_loader:
        images, labels_batch = images.to(device), labels_batch.to(device)
        outputs = model(images).logits
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        total += labels_batch.shape[0]
        correct += torch.eq(predicted, labels_batch).sum().item()

print(f"Validation Accuracy: {100*correct/total:.2f}%")