In [6]:
! pip install torch numpy timm==0.5.4 tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [None]:
!git clone https://github.com/tsungchiehchen/Vision-Transformer.git

In [None]:
%cd ./Vision-Transformer

In [7]:
import argparse
import datetime
import os
import sys
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn

from timm.models import create_model

from engine import train_one_epoch, train_one_epoch_distillation, evaluate
from utils import get_training_dataloader, get_test_dataloader, get_imagenet_train_dataloader, get_imagenet_test_dataloader
import models



In [4]:
MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)
CHECKPOINT_PATH = './checkpoint'
MODEL_NAME = 'vit_tiny_patch16_224'
num_classes = 10
EPOCHS = 5
LR = 0.0001
WD = 0.0
shots = 1000

print(f"Creating model: {MODEL_NAME}")
model = create_model(
        MODEL_NAME,
        pretrained=False,
        num_classes=10,
        img_size=224)
device = 'cuda:0' # device = 'cpu'
model = model.to(device)

cifar10_training_loader = get_training_dataloader(
    MEAN,
    STD,
    num_workers=2,
    batch_size=16,
    shuffle=True,
    shots=shots
)

assert (shots*num_classes == len(cifar10_training_loader.dataset))

cifar10_test_loader = get_test_dataloader(
    MEAN,
    STD,
    num_workers=4,
    batch_size=256,
    shuffle=False
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WD)


n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('number of params:', n_parameters)

Creating model: vit_tiny_patch16_224
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:16<00:00, 10475902.77it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
number of params: 5526346


In [5]:
print(f"Start training for {EPOCHS} epochs")

for epoch in range(1, EPOCHS+1):
    train_stats = train_one_epoch(
        model, criterion, cifar10_training_loader,
        optimizer, device, epoch)
    if epoch % 10 == 9: 
        test_stats = evaluate(cifar10_test_loader, model, criterion, device)
        print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")        
        
test_stats = evaluate(cifar10_test_loader, model, criterion, device)
print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")            

Start training for 5 epochs




AssertionError: Torch not compiled with CUDA enabled

In [None]:
# Calculate througput 
start_time = time.time()
test_stats = evaluate(cifar10_test_loader, model, criterion, device)
end_time = time.time()
num_samples = len(cifar10_test_loader.dataset)
throughput = num_samples / (end_time - start_time)
print("Throughput: {}".format(throughput))

# Q2 ImageNet Dataset

In [None]:
# Define paths and parameters
batch_size = 256
num_workers = 4

# Load ImageNet data
train_loader = get_imagenet_train_dataloader(batch_size=batch_size, num_workers=num_workers)
val_loader = get_imagenet_test_dataloader(batch_size=batch_size, num_workers=num_workers)

# Define model, criterion, optimizer, etc.
MODEL_NAME = 'vit_base_patch16_224'
num_classes = 1000  # ImageNet has 1000 classes
EPOCHS = 5
LR = 0.0001
WD = 0.0

print(f"Creating model: {MODEL_NAME}")
model = create_model(
        MODEL_NAME,
        pretrained=True,
        num_classes=num_classes,
        img_size=224)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WD)

# Training loop
print(f"Start training for {EPOCHS} epochs")
for epoch in range(1, EPOCHS+1):
    train_stats = train_one_epoch(
        model, criterion, train_loader,
        optimizer, device, epoch)
    if epoch % 1 == 0: 
        test_stats = evaluate(val_loader, model, criterion, device)
        print(f"Accuracy of the network on the validation images: {test_stats['acc1']:.1f}%")        
        
test_stats = evaluate(val_loader, model, criterion, device)
print(f"Accuracy of the network on the validation images: {test_stats['acc1']:.1f}%")

In [None]:
# Calculate througput 
start_time = time.time()
test_stats = evaluate(cifar10_test_loader, model, criterion, device)
end_time = time.time()
num_samples = len(cifar10_test_loader.dataset)
throughput = num_samples / (end_time - start_time)
print("Throughput: {}".format(throughput))

# Q3 ViT model on a small device

In [None]:
MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)
CHECKPOINT_PATH = './checkpoint'
MODEL_NAME = 'vit_tiny_patch16_224'
num_classes = 10
EPOCHS = 5
LR = 0.0001
WD = 0.0
shots = 1000

print(f"Creating model: {MODEL_NAME}")
model = create_model(
        MODEL_NAME,
        pretrained=False,
        num_classes=10,
        img_size=224)
device = 'cuda:0' # device = 'cpu'
model = model.to(device)

cifar10_training_loader = get_training_dataloader(
    MEAN,
    STD,
    num_workers=2,
    batch_size=16,
    shuffle=True,
    shots=shots
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WD)


n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('number of params:', n_parameters)

In [None]:
print(f"Start training for {EPOCHS} epochs")

for epoch in range(1, EPOCHS+1):
    train_stats = train_one_epoch(
        model, criterion, cifar10_training_loader,
        optimizer, device, epoch)
    if epoch % 10 == 9: 
        test_stats = evaluate(cifar10_test_loader, model, criterion, device)
        print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")        
        
test_stats = evaluate(cifar10_test_loader, model, criterion, device)
print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")

In [None]:
# Calculate througput 
start_time = time.time()
test_stats = evaluate(cifar10_test_loader, model, criterion, device)
end_time = time.time()
num_samples = len(cifar10_test_loader.dataset)
throughput = num_samples / (end_time - start_time)
print("Throughput: {}".format(throughput))

# Q4 Knowledge Distillation

In [None]:
# Step 1: Train the teacher

MODEL_NAME = 'vit_base_patch16_224'
num_classes = 10
EPOCHS = 5
LR = 0.0001
WD = 0.0

print(f"Creating model: {MODEL_NAME}")
teacher = create_model(
        MODEL_NAME,
        pretrained=True,
        num_classes=10,
        img_size=224)
device = 'cuda:0' # device = 'cpu'
teacher = teacher.to(device)



criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(teacher.parameters(), lr=LR, weight_decay=WD)


n_parameters = sum(p.numel() for p in teacher.parameters() if p.requires_grad)
print('number of params:', n_parameters)


In [None]:
print(f"Start training for {EPOCHS} epochs")

for epoch in range(1, EPOCHS+1):
    train_stats = train_one_epoch(
        teacher, criterion, cifar10_training_loader,
        optimizer, device, epoch)
    if epoch % 10 == 9: 
        test_stats = evaluate(cifar10_test_loader, teacher, criterion, device)
        print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")        
        
test_stats = evaluate(cifar10_test_loader, teacher, criterion, device)
print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")       


In [None]:
# save finetuned teacher model
torch.save(teacher.state_dict(), './teacher.pth')

In [None]:

teacher = create_model(
        'vit_base_patch16_224',
        pretrained=True,
        num_classes=10,
        img_size=224)
device = 'cuda:0' # device = 'cpu'
teacher = teacher.to(device)
teacher.load_state_dict(torch.load('./teacher.pth'))

test_stats = evaluate(cifar10_test_loader, teacher, criterion, device)
print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")            

# Train the student
for p in teacher.parameters(): 
    p.requires_grad = False

MODEL_NAME = 'vit_tiny_patch16_224'

model = create_model(
        MODEL_NAME,
        pretrained=True,
        num_classes=10,
        img_size=224)
device = 'cuda:0' # device = 'cpu'
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WD)


n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('number of params:', n_parameters)


print(f"Start training for {EPOCHS} epochs")

for epoch in range(1, EPOCHS+1):
    train_stats = train_one_epoch_distillation(
        teacher, model, criterion, cifar10_training_loader,
        optimizer, device, epoch, alpha=2.0, temp=1.0)
    if epoch % 2 == 1: 
        test_stats = evaluate(cifar10_test_loader, model, criterion, device)
        print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")        
        
test_stats = evaluate(cifar10_test_loader, model, criterion, device)
print(f"Accuracy of the network on the {len(cifar10_test_loader)} test images: {test_stats['acc1']:.1f}%")            