<a href="https://colab.research.google.com/github/tusharvatsa32/VisTransformers/blob/main/Code/ViT_imagenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [148]:
!nvidia-smi

Fri Apr 30 00:34:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    42W / 300W |  15539MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [149]:
!pip install -q einops

In [150]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch import einsum
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm
from einops import rearrange, repeat
from einops.layers.torch import Rearrange

In [151]:
print(f"Torch: {torch.__version__}")

Torch: 1.8.1+cu101


In [152]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(11785)

In [153]:
device = 'cuda'

### Training

In [126]:
from google.colab import drive
import json
drive.mount("/content/gdrive", force_remount=True)

#Kaggle
!pip install kaggle
!mkdir .kaggle

# Kaggle data
token = {"username":"kaggle_username","key":"kaggle_key"}
with open('/content/.kaggle/kaggle.json', 'w') as file:
  json.dump(token, file)

Mounted at /content/gdrive


In [128]:
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
!kaggle config set -n path -v{/content}
!chmod 600 /root/.kaggle/kaggle.json

- path is now set to: {/content}


In [129]:
!pip install -qU kaggle==1.5.3

[?25l[K     |██████                          | 10kB 21.7MB/s eta 0:00:01[K     |████████████                    | 20kB 27.0MB/s eta 0:00:01[K     |██████████████████              | 30kB 32.0MB/s eta 0:00:01[K     |████████████████████████        | 40kB 27.0MB/s eta 0:00:01[K     |██████████████████████████████  | 51kB 17.7MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 6.5MB/s 
[?25h  Building wheel for kaggle (setup.py) ... [?25l[?25hdone


In [130]:
!kaggle datasets download -d ifigotin/imagenetmini-1000 -p /content/

Downloading imagenetmini-1000.zip to /content
100% 3.91G/3.92G [01:20<00:00, 75.1MB/s]
100% 3.92G/3.92G [01:20<00:00, 52.0MB/s]


In [132]:
!unzip -q \*.zip

In [156]:
img_size = ((224, 224)) 

transforms_train = transforms.Compose([
    transforms.Resize(img_size),
    transforms.ColorJitter(hue=.05, saturation=.05),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.RandomVerticalFlip(p=0.3),
    transforms.RandomRotation(10, interpolation=torchvision.transforms.InterpolationMode.BILINEAR),
    transforms.RandomCrop(img_size, fill=0),
    transforms.RandomAffine(10, scale=(0.8, 1.2)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

transforms_val = transforms.Compose([
    transforms.Resize(img_size),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

In [157]:
train_data = torchvision.datasets.ImageFolder(root='imagenet-mini/train', transform=transforms_train) 
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True, num_workers=8, pin_memory=True)

valid_data = torchvision.datasets.ImageFolder(root='imagenet-mini/val', transform=transforms_val)
valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=32, shuffle=False, num_workers=8, pin_memory=True)

  cpuset_checked))


In [164]:
print(len(train_data), len(valid_data))

34745 3923


In [158]:
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        b, n, _, h = *x.shape, self.heads
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        attn = self.attend(dots)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

In [159]:
model = ViT(
    image_size=224,
    patch_size=16,
    num_classes=1000,
    dim=512,
    depth=6,
    heads=8,
    mlp_dim=2048,
    dropout=0.2,
    emb_dropout=0.1
).to(device)

In [160]:
numEpochs = 100
in_features = 3 # RGB channels

learningRate = 0.1
weightDecay = 2e-5

num_classes = len(train_data.classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate, weight_decay=weightDecay, momentum=0.9, nesterov=True)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.4, patience=5, threshold=0.002, verbose=True)

In [161]:
my_acc = []
my_loss = []

In [162]:
# Train!
for epoch in range(numEpochs):
    
    # Train
    model.train()
    train_loss = 0.0
    train_acc = 0.0
    correct = 0

    for batch_num, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        
        x, y = x.to(device), y.to(device)

        outputs = model(x)

        correct += (torch.argmax(outputs, axis=1) == y).sum().item()

        loss = criterion(outputs, y.long())
        loss.backward()
        optimizer.step()

        del(outputs)

        train_loss += loss.item()

        if batch_num % 100 == 0:
            print('Epoch: {}\tBatch: {}\tAvg-Loss: {:.4f}'.format(epoch, batch_num+1, train_loss/(batch_num+1)))

    train_accuracy = correct / len(train_data)

    # Validate
    model.eval()
    num_correct = 0
    for batch_num1, (x, y) in enumerate(valid_loader):
        x, y = x.to(device), y.to(device)
        outputs = model(x)

        num_correct += (torch.argmax(outputs, axis=1) == y).sum().item()

    val_accuracy = num_correct / len(valid_data)
    my_acc.append(val_accuracy)
    my_loss.append(train_loss/(batch_num+1))
    print('Epoch: {}\t Training Accuracy: {:.4f}\t Validation Accuracy: {:.4f}\t Avg-Loss: {:.4f}'.format(epoch, train_accuracy*100, val_accuracy * 100, train_loss/(batch_num+1)))
    scheduler.step(val_accuracy)

    #torch.save(network.state_dict(),'/content/drive/MyDrive/DL_CMU/HW2_P2/ResNet_Plateau_d3/Net_'+str(epoch)+'_'+str(val_accuracy)+'_checkpoint.t7')

  cpuset_checked))


Epoch: 0	Batch: 1	Avg-Loss: 7.0549
Epoch: 0	Batch: 101	Avg-Loss: 4.9342
Epoch: 0	Batch: 201	Avg-Loss: 4.5550
Epoch: 0	Batch: 301	Avg-Loss: 4.3501
Epoch: 0	 Training Accuracy: 10.9742	 Validation Accuracy: 33.6477	 Avg-Loss: 4.2269
Epoch: 1	Batch: 1	Avg-Loss: 3.9204
Epoch: 1	Batch: 101	Avg-Loss: 3.7086
Epoch: 1	Batch: 201	Avg-Loss: 3.6766
Epoch: 1	Batch: 301	Avg-Loss: 3.6614
Epoch: 1	 Training Accuracy: 20.8174	 Validation Accuracy: 43.6146	 Avg-Loss: 3.6328
Epoch: 2	Batch: 1	Avg-Loss: 3.7152
Epoch: 2	Batch: 101	Avg-Loss: 3.4975
Epoch: 2	Batch: 201	Avg-Loss: 3.4952
Epoch: 2	Batch: 301	Avg-Loss: 3.4861
Epoch: 2	 Training Accuracy: 24.3719	 Validation Accuracy: 48.5853	 Avg-Loss: 3.4786
Epoch: 3	Batch: 1	Avg-Loss: 3.4658
Epoch: 3	Batch: 101	Avg-Loss: 3.3782
Epoch: 3	Batch: 201	Avg-Loss: 3.3838
Epoch: 3	Batch: 301	Avg-Loss: 3.3846
Epoch: 3	 Training Accuracy: 26.6542	 Validation Accuracy: 54.4226	 Avg-Loss: 3.3828
Epoch: 4	Batch: 1	Avg-Loss: 3.1411
Epoch: 4	Batch: 101	Avg-Loss: 3.2992
Epoc

KeyboardInterrupt: ignored

In [None]:
# Train!
for epoch in range(numEpochs):
    
    # Train
    model.train()
    train_loss = 0.0
    train_acc = 0.0
    correct = 0

    for batch_num, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        
        x, y = x.to(device), y.to(device)

        outputs = model(x)

        correct += (torch.argmax(outputs, axis=1) == y).sum().item()

        loss = criterion(outputs, y.long())
        loss.backward()
        optimizer.step()

        del(outputs)

        train_loss += loss.item()

        if batch_num % 100 == 0:
            print('Epoch: {}\tBatch: {}\tAvg-Loss: {:.4f}'.format(epoch, batch_num+1, train_loss/(batch_num+1)))

    train_accuracy = correct / len(train_data)

    # Validate
    model.eval()
    num_correct = 0
    for batch_num1, (x, y) in enumerate(valid_loader):
        x, y = x.to(device), y.to(device)
        outputs = model(x)

        num_correct += (torch.argmax(outputs, axis=1) == y).sum().item()

    val_accuracy = num_correct / len(valid_data)
    my_acc.append(val_accuracy)
    my_loss.append(train_loss/(batch_num+1))
    print('Epoch: {}\t Training Accuracy: {:.4f}\t Validation Accuracy: {:.4f}\t Avg-Loss: {:.4f}'.format(epoch, train_accuracy*100, val_accuracy * 100, train_loss/(batch_num+1)))
    scheduler.step(val_accuracy)

    #torch.save(network.state_dict(),'/content/drive/MyDrive/DL_CMU/HW2_P2/ResNet_Plateau_d3/Net_'+str(epoch)+'_'+str(val_accuracy)+'_checkpoint.t7')

  cpuset_checked))


Epoch: 0	Batch: 1	Avg-Loss: 2.1580
Epoch: 0	Batch: 101	Avg-Loss: 2.2533
Epoch: 0	Batch: 201	Avg-Loss: 2.2834
Epoch: 0	Batch: 301	Avg-Loss: 2.2795
Epoch: 0	 Training Accuracy: 57.0643	 Validation Accuracy: 100.6628	 Avg-Loss: 2.2886
Epoch: 1	Batch: 1	Avg-Loss: 2.3322
Epoch: 1	Batch: 101	Avg-Loss: 2.2069
Epoch: 1	Batch: 201	Avg-Loss: 2.2226
Epoch: 1	Batch: 301	Avg-Loss: 2.2375
Epoch: 1	 Training Accuracy: 58.1724	 Validation Accuracy: 101.2490	 Avg-Loss: 2.2544
Epoch: 2	Batch: 1	Avg-Loss: 1.7115
Epoch: 2	Batch: 101	Avg-Loss: 2.1885
Epoch: 2	Batch: 201	Avg-Loss: 2.2158
Epoch: 2	Batch: 301	Avg-Loss: 2.2295
Epoch: 2	 Training Accuracy: 58.6012	 Validation Accuracy: 100.8157	 Avg-Loss: 2.2380
Epoch: 3	Batch: 1	Avg-Loss: 2.0872
Epoch: 3	Batch: 101	Avg-Loss: 2.1582
Epoch: 3	Batch: 201	Avg-Loss: 2.1768
Epoch: 3	Batch: 301	Avg-Loss: 2.1940
Epoch: 3	 Training Accuracy: 59.7870	 Validation Accuracy: 103.3393	 Avg-Loss: 2.2069
Epoch: 4	Batch: 1	Avg-Loss: 2.0969
Epoch: 4	Batch: 101	Avg-Loss: 2.1191
