## VGG

Building on the work of AlexNet, VGG focuses on another crucial aspect of Convolutional Neural Networks (CNNs): depth. It was developed by Simonyan and Zisserman. Its most common configuration has 16 weight layers (13 convolutional and 3 fully connected), and a deeper configuration has 19 weight layers (16 convolutional and 3 fully connected), hence the two versions, VGG-16 and VGG-19. All the convolutional layers use 3x3 filters. You can read more about the network in the official paper [here](https://arxiv.org/pdf/1409.1556.pdf).

### Building the VGG Architecture:

- Import the necessary libraries, such as PyTorch or TensorFlow.
- Define VGG blocks, which consist of convolutional layers followed by max-pooling.
- Create the complete VGG architecture by stacking multiple VGG blocks.
- Add fully connected layers at the end for classification.
### Key Concepts and Layers in VGG:

- VGG architectures have a straightforward structure with repeated convolutional blocks.
- Convolutional layers use small 3x3 kernels with padding.
- Max-pooling layers downsample feature maps.
- VGG models come in different variants, such as VGG-16 and VGG-19.
- The final fully connected layers perform classification.

### Code Examples and Implementations:
- Use code to define VGG blocks and the complete VGG architecture.
- Specify the number of layers, channels, and classes as needed.
- Implement forward propagation to process input data.

### Training and Evaluation:

- Train the VGG model on a dataset of your choice.
- Evaluate model performance using test images.
- Monitor metrics like accuracy, loss, and more.

### Further Exploration and Improvements:
- Experiment with different VGG variants (e.g., VGG-16, VGG-19).
- Fine-tune the model on specific tasks or datasets.
- Explore transfer learning by using pretrained VGG models.
- Consider architectural modifications for specialized tasks.

In [None]:
import os
import numpy as np
from tqdm import tqdm

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms
from torchvision.models import list_models, get_model, get_model_weights, get_weight
import transformers

# Optional: uncomment to restrict which GPUs are visible to this process.
#os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class ConvolutionNet(nn.Module):
    """Double-convolution block with batch norm and an optional 1x1 convolution.

    Data flows through ``conv1 -> batch_norm1 -> ReLU -> conv2 -> batch_norm2
    -> [one_kernel] -> ReLU``. ``conv1`` keeps the channel count at
    ``in_channels``; ``conv2`` maps it to ``out_channels``. Both convolutions
    share the same kernel size, stride, dilation, padding and padding mode.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        kernel_size: Kernel size used by both main convolutions.
        stride: Stride used by both main convolutions.
        dilation: Dilation used by both main convolutions.
        bias: Whether the convolutions carry a bias term.
        padding: Padding used by both main convolutions.
        padding_mode: Padding mode forwarded to ``nn.Conv2d``.
        add_conv1: When true, a 1x1 convolution is applied after
            ``batch_norm2`` and before the final activation.
    """

    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int, stride: int=1, dilation: int=1,
                 bias: bool=True, padding: int=1, padding_mode: str='zeros', add_conv1: bool=True):
        super().__init__()
        self.add_conv1 = add_conv1

        # Settings shared by the two main convolutions.
        shared = dict(stride=stride, dilation=dilation, bias=bias,
                      padding=padding, padding_mode=padding_mode)
        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, **shared)
        self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size, **shared)
        if self.add_conv1:
            self.one_kernel = nn.Conv2d(out_channels, out_channels, kernel_size=1, bias=bias)

        self.batch_norm1 = nn.BatchNorm2d(in_channels)
        self.batch_norm2 = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply the block to ``x`` and return the activated feature map."""
        hidden = self.activation(self.batch_norm1(self.conv1(x)))
        out = self.batch_norm2(self.conv2(hidden))
        if self.add_conv1:
            out = self.one_kernel(out)
        return self.activation(out)


class Pooling(nn.Module):
    """Max-pooling layer; a thin wrapper around ``nn.MaxPool2d``.

    Args:
        kernel_size: Size of the pooling window.
        stride: Step between pooling windows (defaults to 1).
        padding: Implicit padding added on both sides (defaults to 0).
    """

    def __init__(self, kernel_size, stride=1, padding=0):
        super().__init__()
        self.pooler = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=padding)

    def forward(self, x):
        """Return the max-pooled version of ``x``."""
        pooled = self.pooler(x)
        return pooled


class FCNet(nn.Module):
    """Fully connected classification head that outputs raw class logits.

    Three layouts are built depending on the arguments:

    * ``number > 1`` and a truthy ``inter_dim``:
      ``Linear(input_dim, inter_dim) -> ReLU``, then ``number - 2`` repetitions
      of ``Dropout -> Linear(inter_dim, inter_dim) -> ReLU``, then
      ``Linear(inter_dim, num_classes)``.
    * ``number > 1`` and a falsy ``inter_dim`` (``None`` or ``0``):
      ``number - 1`` repetitions of
      ``Dropout -> Linear(input_dim, input_dim) -> ReLU``, then
      ``Linear(input_dim, num_classes)``.
    * otherwise: ``Dropout -> Linear(input_dim, num_classes)``.

    The last layer is always a plain ``nn.Linear``: no activation is applied
    to the logits, because a ReLU there would clamp every negative logit to
    zero (and zero its gradient) before the loss is computed.

    Args:
        input_dim: Size of the flattened input features.
        num_classes: Number of output logits.
        bias: Whether the linear layers carry a bias term.
        inter_dim: Width of the hidden layers; a falsy value keeps the hidden
            layers at ``input_dim``.
        number: Total number of linear layers.
    """

    def __init__(self, input_dim, num_classes, bias: bool=True, inter_dim: int=4096, number: int=3):
        super().__init__()
        self.layers = nn.Sequential()
        if number > 1 and inter_dim:
            self.layers.append(nn.Linear(input_dim, inter_dim, bias=bias))
            self.layers.append(nn.ReLU())
            for _ in range(number - 2):
                self.layers.append(nn.Dropout(0.2))
                self.layers.append(nn.Linear(inter_dim, inter_dim, bias=bias))
                self.layers.append(nn.ReLU())
            self.layers.append(nn.Linear(inter_dim, num_classes, bias=bias))
        elif number > 1:
            for _ in range(number - 1):
                self.layers.append(nn.Dropout(0.2))
                self.layers.append(nn.Linear(input_dim, input_dim, bias=bias))
                self.layers.append(nn.ReLU())
            # Hidden layers stay at input_dim in this branch, so the output
            # layer must read input_dim features (inter_dim is falsy here).
            self.layers.append(nn.Linear(input_dim, num_classes, bias=bias))
        else:
            self.layers.append(nn.Dropout(0.2))
            self.layers.append(nn.Linear(input_dim, num_classes, bias=bias))

    def forward(self, x):
        """Return the class logits for the flattened features ``x``."""
        return self.layers(x)

class ImgClassifier(nn.Module):
    """VGG-style image classifier built from convolution/pooling stages.

    One stage is created per entry of ``out_channels``: a ``ConvolutionNet``
    followed by ``Pooling(kernel_size=3, stride=2, padding=1)``. The stage
    output is flattened per sample and passed to a single-layer ``FCNet``.

    Args:
        num_classes: Number of outputs of the classification head.
        in_channels: Number of channels of the input images.
        input_dim_fc: Size of the flattened feature map fed to the head; it
            must equal channels * height * width of the last stage's output.
        out_channels: Output channel count of each stage.
        kernel_sizes: Convolution kernel size of each stage.
        strides: Convolution stride of each stage; defaults to 1 everywhere.
        dilations: Convolution dilation of each stage; defaults to 1 everywhere.

    Raises:
        AssertionError: If ``out_channels``, ``kernel_sizes``, ``strides`` and
            ``dilations`` do not all have the same length.
    """

    def __init__(self, num_classes, in_channels, input_dim_fc, out_channels, kernel_sizes, strides=None, dilations=None):
        super().__init__()
        self.block = nn.Sequential()
        K = len(kernel_sizes)
        if not strides:
            strides = [1] * K
        if not dilations:
            dilations = [1] * K
        # out_channels is checked too: zip() below would otherwise silently
        # drop stages when the per-stage lists differ in length.
        assert len(out_channels) == len(kernel_sizes) == len(strides) == len(dilations), \
            "lengths of out_channels, kernel_sizes, strides, dilations should be equal"

        in_channel_curr = in_channels
        for out_channel, kernel, stride, dilation in zip(out_channels, kernel_sizes, strides, dilations):
            conv = ConvolutionNet(in_channels=in_channel_curr, out_channels=out_channel,
                                  kernel_size=kernel, stride=stride, dilation=dilation)
            pool = Pooling(kernel_size=3, stride=2, padding=1)
            self.block.append(conv)
            self.block.append(pool)
            # The next stage consumes what this stage produced.
            in_channel_curr = out_channel
        self.fc = FCNet(input_dim=input_dim_fc, num_classes=num_classes, number=1)

    def forward(self, x):
        """Run the stages, flatten each sample and return the head's output."""
        x = self.block(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc(x)
        return x

In [3]:
# Base preprocessing shared by every split: resize each image to 224x224 and
# convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# CIFAR-100 train/test splits, downloaded to the given directory if missing.
train_dataset = datasets.CIFAR100("/archive/turganbay/cifar", train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR100("/archive/turganbay/cifar", train=False, download=True, transform=transform)
# Hold out 10% of the training split for validation.
# NOTE(review): no generator is passed to random_split, so the split depends on
# the global RNG state and may differ between runs.
N = len(train_dataset)
T = int(N*0.9)
train_dataset, val_dataset = random_split(train_dataset, [T, N-T])

Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Per-channel mean and standard deviation of the training images, computed in
# a single pass from the running sums of x and x**2 (var = E[x^2] - E[x]^2).
# Sums are kept in float64 to limit rounding error over the whole dataset.
psum = torch.zeros(3, dtype=torch.float64)
psum_sq = torch.zeros(3, dtype=torch.float64)
# Number of pixels seen per channel; derived from the batches themselves so it
# stays correct if the image size in the transform changes.
count = 0

for inputs, _ in DataLoader(train_dataset, 64):
    inputs = inputs.double()
    # Reduce over batch, height and width, leaving one value per channel.
    psum += inputs.sum(dim=[0, 2, 3])
    psum_sq += (inputs ** 2).sum(dim=[0, 2, 3])
    count += inputs.size(0) * inputs.size(2) * inputs.size(3)

mean64 = psum / count
var64 = (psum_sq / count) - (mean64 ** 2)

# Expose the results as float32 tensors.
total_mean = mean64.float()
total_var  = var64.float()
total_std  = torch.sqrt(var64).float()

print('Training data stats:')
print('- mean: {}'.format(total_mean))
print('- std:  {}'.format(total_std))

In [4]:
class MyDataset(Dataset):
    """Dataset wrapper that applies an optional transform to each sample.

    ``subset`` is any indexable collection of ``(sample, label)`` pairs with a
    length. When ``transform`` is truthy it is applied to the sample only; the
    label is returned unchanged.
    """

    def __init__(self, subset, transform=None):
        self.transform = transform
        self.subset = subset

    def __len__(self):
        return len(self.subset)

    def __getitem__(self, index):
        sample, label = self.subset[index]
        if not self.transform:
            return sample, label
        return self.transform(sample), label

    
# Per-channel mean/std used to standardise the image tensors.
# NOTE(review): presumably the values printed by the statistics cell — confirm.
normalize = transforms.Normalize(mean=[0.5073, 0.4868, 0.4410], std=[0.2623, 0.2515, 0.2716])
# Training-time pipeline: a small random rotation followed by normalisation.
# The commented-out entries are augmentations that are currently disabled.
transform_aug = transforms.Compose([
    #transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=5),
    #transforms.RandomResizedCrop(size=(224, 224), antialias=True),
    normalize,
])


# Rebind the split names to wrapped datasets: augmentation + normalisation for
# training, normalisation only for validation and test.
train_dataset = MyDataset(train_dataset, transform=transform_aug)
val_dataset = MyDataset(val_dataset, transform=normalize)
test_dataset = MyDataset(test_dataset, transform=normalize)

In [5]:
# Training hyper-parameters.
bs = 16
num_classes = 100
epochs = 40

# Alternative: train the custom ImgClassifier from scratch instead of
# fine-tuning a pretrained VGG.
#out_channels = [64, 128, 256, 512]
#kernel_sizes = [3] * len(out_channels)
#model = ImgClassifier(num_classes, 3, 100352, out_channels, kernel_sizes).to(device)

# Reshuffle the training samples every epoch so batches differ between epochs;
# evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=bs)
test_loader = DataLoader(test_dataset, batch_size=bs)

# Load VGG-19 (batch-norm variant) with the first weight set listed for it,
# then swap its classifier for a fresh head sized for our classes.
weights = torch.hub.load("pytorch/vision", "get_model_weights", name="vgg19_bn")
weight_name = next(iter(weights))
weights = get_weight(str(weight_name))
model = torch.hub.load("pytorch/vision", "vgg19_bn", weights=weight_name)
# 25088 = 512 * 7 * 7, the in_features of torchvision's stock VGG classifier.
model.classifier = FCNet(input_dim=25088, num_classes=num_classes, number=3)

loss_fct = nn.CrossEntropyLoss()
# NOTE(review): lr=5e-2 is unusually large for AdamW fine-tuning — confirm it
# is intentional.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-02)

# One optimizer step is taken every grad_accum_steps batches, so the scheduler
# is sized in optimizer steps rather than batches.
grad_accum_steps = 4
num_train_epochs = epochs
total_iters = len(train_loader) // grad_accum_steps * epochs
scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=50, num_training_steps=total_iters
)

# These lines configure data parallelism (use multiple GPUs if available) and specify the target device (e.g., CPU or GPU).
model = nn.DataParallel(model)
model.to(device)

Using cache found in /home/turganbay/.cache/torch/hub/pytorch_vision_main
Using cache found in /home/turganbay/.cache/torch/hub/pytorch_vision_main


DataParallel(
  (module): VGG(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
      (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (9): ReLU(inplace=True)
      (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (12): ReLU(inplace=True)
      (13): MaxPool2d(kernel_size=2, stride=2, padding=0

In [6]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Display the number of trainable parameters of the model as the cell output.
count_parameters(model)

139990948

In [None]:
# Train for `epochs` epochs with gradient accumulation, reporting the loss of
# the last training batch and the validation accuracy after every epoch.
for epoch in range(epochs):
    # Training mode: Dropout is active and BatchNorm uses batch statistics.
    model.train()
    # Discard gradients left over from an incomplete accumulation window at
    # the end of the previous epoch so they do not leak into this one.
    optimizer.zero_grad()
    for i, (images, labels) in tqdm(enumerate(train_loader), desc=f"Epoch {epoch}/{epochs}",
                            unit="batch", total=len(train_loader)):
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        # Scale the loss so the accumulated gradient is the mean over the
        # grad_accum_steps batches of one window.
        loss = loss_fct(outputs, labels) / grad_accum_steps
        loss.backward()
        if (i + 1) % grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    # Undo the accumulation scaling so the printed value is the batch loss.
    print(f'Loss: {round(loss.item()*grad_accum_steps, 3)}')

    # Evaluation mode: Dropout is disabled and BatchNorm uses its running
    # statistics, which are not updated by the validation batches.
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f'Accuracy {round(correct/total*100, 3)}')

Epoch 0/40:  57%|█████████████████████████████████████████████████████████████████████▌                                                    | 1603/2813 [03:22<02:24,  8.38batch/s]