<a href="https://colab.research.google.com/github/surendar-283/DA6401-Assignment-2/blob/main/DA6401_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split
import wandb
from tqdm import tqdm
import os

class NatureNet(nn.Module):
    """Configurable CNN classifier: a stack of conv blocks followed by dense layers.

    Each conv block is ``Conv2d(padding='same') -> activation -> [BatchNorm2d]
    -> MaxPool2d(2, 2) -> [Dropout]``. Dropout is omitted after the last conv
    block. The conv output is flattened and fed through one ``Linear ->
    activation -> [Dropout]`` group per entry of ``dense_units``, then a final
    ``Linear`` producing ``num_classes`` logits.

    Args:
        conv_filters: Output channels of each conv block; its length sets the
            number of conv blocks.
        kernel_sizes: Kernel size of each conv block; must have the same
            length as ``conv_filters``.
        dense_units: Width of each hidden dense layer (may be empty).
        dropout_rate: Dropout probability; ``0`` disables all dropout layers.
        use_batchnorm: Whether to add ``BatchNorm2d`` after each activation.
        activation: One of ``'relu'``, ``'leaky_relu'``, ``'gelu'``. Any other
            value falls back to ReLU.
        input_shape: Input size as ``(height, width, channels)``.
        num_classes: Number of output logits.

    Raises:
        ValueError: If ``conv_filters`` and ``kernel_sizes`` differ in length.
    """

    def __init__(self,
                 conv_filters=(32, 32, 32, 32, 32),
                 kernel_sizes=(3, 3, 3, 3, 3),
                 dense_units=(128,),
                 dropout_rate=0.2,
                 use_batchnorm=True,
                 activation='relu',
                 input_shape=(224, 224, 3),
                 num_classes=10):
        super().__init__()

        conv_filters = list(conv_filters)
        kernel_sizes = list(kernel_sizes)
        if len(conv_filters) != len(kernel_sizes):
            raise ValueError(
                "conv_filters and kernel_sizes must have the same length, "
                f"got {len(conv_filters)} and {len(kernel_sizes)}"
            )
        num_blocks = len(conv_filters)

        # Activation factories; unknown names fall back to ReLU.
        activation_factories = {
            'relu': nn.ReLU,
            'leaky_relu': lambda: nn.LeakyReLU(0.1),
            'gelu': nn.GELU,
        }
        # A single (stateless) activation module is shared by every layer.
        self.activation = activation_factories.get(activation, nn.ReLU)()

        height, width, channels = input_shape[0], input_shape[1], input_shape[2]

        # Create convolutional blocks
        self.conv_blocks = nn.ModuleList()
        in_channels = channels
        for i, (out_channels, kernel_size) in enumerate(zip(conv_filters, kernel_sizes)):
            block = [
                nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    padding='same'
                ),
                self.activation,
            ]

            if use_batchnorm:
                block.append(nn.BatchNorm2d(out_channels))

            block.append(nn.MaxPool2d(kernel_size=2, stride=2))

            # No dropout after the final conv block.
            if dropout_rate > 0 and i < num_blocks - 1:
                block.append(nn.Dropout(dropout_rate))

            self.conv_blocks.append(nn.Sequential(*block))
            in_channels = out_channels

        # Calculate flattened size by probing the conv stack with a dummy
        # input. The probe runs in eval mode so it does not update BatchNorm
        # running statistics or draw dropout masks, then training mode (the
        # default state of a freshly built module) is restored.
        self.conv_blocks.eval()
        with torch.no_grad():
            probe = torch.zeros(1, channels, height, width)
            for block in self.conv_blocks:
                probe = block(probe)
            flattened_size = probe.view(1, -1).shape[1]
        self.conv_blocks.train()

        # Create dense layers
        dense_layers = []
        prev_neurons = flattened_size

        for units in dense_units:
            dense_layers.append(nn.Linear(prev_neurons, units))
            dense_layers.append(self.activation)
            if dropout_rate > 0:
                dense_layers.append(nn.Dropout(dropout_rate))
            prev_neurons = units

        # Output layer
        dense_layers.append(nn.Linear(prev_neurons, num_classes))

        self.dense_layers = nn.Sequential(
            nn.Flatten(),
            *dense_layers
        )

    def forward(self, x):
        """Return class logits for a batch ``x`` (indexed as batch, channels, height, width)."""
        for block in self.conv_blocks:
            x = block(x)
        x = self.dense_layers(x)
        return x

def get_data_loaders(data_dir, batch_size=32, val_split=0.2, augment=False, seed=None):
    """Build train/validation/test loaders from an ImageFolder-style directory.

    ``data_dir/train`` is split randomly into training and validation subsets;
    ``data_dir/val`` is used as the held-out test set.

    Args:
        data_dir: Root directory containing ``train`` and ``val`` subfolders.
        batch_size: Batch size used for all three loaders.
        val_split: Fraction of ``data_dir/train`` reserved for validation.
        augment: If True, apply random resized crop and horizontal flip to
            the training subset only.
        seed: Optional integer seed for the train/validation split. ``None``
            (default) draws the split from torch's global RNG.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader, classes)`` where
        ``classes`` is the class-name list of the training ImageFolder.
    """
    # Local import: the file-level import only brings in DataLoader/random_split.
    from torch.utils.data import Subset

    # Deterministic preprocessing shared by every split.
    base_transform = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ]
    eval_transform = transforms.Compose(base_transform)

    if augment:
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            *base_transform
        ])
    else:
        train_transform = eval_transform

    # Load datasets
    train_dir = os.path.join(data_dir, 'train')
    full_train = ImageFolder(train_dir, transform=train_transform)
    test_data = ImageFolder(os.path.join(data_dir, 'val'),
                          transform=eval_transform)

    # Split train into train and validation
    val_size = int(val_split * len(full_train))
    train_size = len(full_train) - val_size
    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train_data, val_data = random_split(full_train, [train_size, val_size],
                                        **split_kwargs)

    if augment:
        # Both subsets returned by random_split share full_train's augmenting
        # transform. Re-wrap the validation indices around a second view of
        # the same folder that uses the deterministic transform, so
        # validation metrics are not computed on randomly cropped/flipped
        # images.
        val_view = ImageFolder(train_dir, transform=eval_transform)
        val_data = Subset(val_view, val_data.indices)

    # Create dataloaders. Pinned memory only helps host-to-GPU copies.
    pin_memory = torch.cuda.is_available()
    train_loader = DataLoader(train_data, batch_size=batch_size,
                            shuffle=True, num_workers=4, pin_memory=pin_memory)
    val_loader = DataLoader(val_data, batch_size=batch_size,
                          shuffle=False, num_workers=4, pin_memory=pin_memory)
    test_loader = DataLoader(test_data, batch_size=batch_size,
                           shuffle=False, num_workers=4, pin_memory=pin_memory)

    return train_loader, val_loader, test_loader, full_train.classes

def train_and_validate(config=None,
                       data_dir='/content/drive/MyDrive/inaturalist_12K',
                       num_epochs=20):
    """Train a NatureNet inside a wandb run and log per-epoch metrics.

    Can be called with no arguments (e.g. as the ``function`` of
    ``wandb.agent``); hyperparameters are then read from ``wandb.config``.

    Args:
        config: Optional config passed through to ``wandb.init``. The
            following keys are read from ``wandb.config``: ``conv_filters``,
            ``kernel_sizes``, ``dense_units``, ``dropout_rate``,
            ``use_batchnorm``, ``activation``, ``learning_rate``,
            ``weight_decay``, ``batch_size`` and ``data_augmentation``.
        data_dir: Dataset root handed to ``get_data_loaders``.
        num_epochs: Number of training epochs to run.
    """
    with wandb.init(config=config) as run:
        config = wandb.config

        # Set descriptive run name
        run.name = (f"filters_{'-'.join(map(str, config.conv_filters))}_"
                   f"dense_{'-'.join(map(str, config.dense_units))}_"
                   f"lr_{config.learning_rate:.0e}_"
                   f"bs_{config.batch_size}")

        # Load data (the test loader is not used during the sweep)
        train_loader, val_loader, _, classes = get_data_loaders(
            data_dir=data_dir,
            batch_size=config.batch_size,
            augment=config.data_augmentation
        )

        # Initialize model
        model = NatureNet(
            conv_filters=config.conv_filters,
            kernel_sizes=config.kernel_sizes,
            dense_units=config.dense_units,
            dropout_rate=config.dropout_rate,
            use_batchnorm=config.use_batchnorm,
            activation=config.activation,
            num_classes=len(classes)
        )

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)

        # Initialize optimizer with weight decay
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate,
            weight_decay=config.weight_decay
        )
        criterion = nn.CrossEntropyLoss()

        # Training loop
        for epoch in range(1, num_epochs + 1):
            # evaluate_model leaves the model in eval mode, so training mode
            # has to be re-enabled at the start of every epoch.
            model.train()
            train_loss, correct, total = 0.0, 0, 0

            with tqdm(train_loader, unit="batch") as train_bar:
                for inputs, labels in train_bar:
                    inputs, labels = inputs.to(device), labels.to(device)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()
                    _, predicted = outputs.max(1)
                    total += labels.size(0)
                    correct += predicted.eq(labels).sum().item()

                    # Progress bar shows the latest batch loss and the
                    # running accuracy over the epoch so far.
                    train_bar.set_postfix({
                        'loss': f'{loss.item():.4f}',
                        'acc': f'{100.*correct/total:.1f}%'
                    })

            # Calculate epoch metrics (loss is the mean of per-batch losses)
            train_acc = 100. * correct / total
            train_loss /= len(train_loader)

            # Validation
            val_loss, val_acc = evaluate_model(model, val_loader, criterion, device)

            # Log metrics
            wandb.log({
                'epoch': epoch,
                'train_loss': train_loss,
                'train_acc': train_acc,
                'val_loss': val_loss,
                'val_acc': val_acc
            })

def evaluate_model(model, loader, criterion, device):
    """Score ``model`` on every batch of ``loader`` with gradients disabled.

    The model is switched to eval mode and is not switched back.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the mean of the per-batch
        ``criterion`` values, and the percentage of samples whose highest
        scoring output index equals the label.
    """
    model.eval()
    running_loss = 0.0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            running_loss += criterion(logits, batch_labels).item()

            # Index of the largest output along dim 1 is the predicted class.
            predictions = logits.max(1)[1]
            num_seen += batch_labels.size(0)
            num_correct += predictions.eq(batch_labels).sum().item()

    mean_loss = running_loss / len(loader)
    accuracy = 100. * num_correct / num_seen
    return mean_loss, accuracy

if __name__ == '__main__':
    # Discrete options explored for each hyperparameter.
    search_space = {
        'conv_filters': [
            [32, 32, 32, 32, 32],
            [64, 64, 64, 64, 64],
            [16, 32, 64, 128, 256],
            [256, 128, 64, 32, 16],
        ],
        'kernel_sizes': [
            [3, 3, 3, 3, 3],
            [5, 5, 5, 5, 5],
            [3, 5, 3, 5, 3],
        ],
        'dense_units': [
            [64],
            [128],
            [64, 128],
            [256, 128],
        ],
        'learning_rate': [1e-3, 1e-4],
        'weight_decay': [0, 0.0001, 0.001, 0.01],
        'dropout_rate': [0.0, 0.2, 0.3, 0.5],
        'use_batchnorm': [True, False],
        'batch_size': [32, 64, 128],
        'data_augmentation': [True, False],
        'activation': ['relu', 'leaky_relu', 'gelu'],
    }

    # Bayesian sweep that maximizes the logged validation accuracy; each
    # hyperparameter is wrapped in the {'values': [...]} form.
    sweep_config = {
        'method': 'bayes',
        'metric': {'name': 'val_acc', 'goal': 'maximize'},
        'parameters': {
            name: {'values': options}
            for name, options in search_space.items()
        },
    }

    # Authenticate with wandb before registering the sweep
    wandb.login()

    # Register the sweep, then let one agent execute 30 runs of it
    sweep_id = wandb.sweep(sweep_config, project='DA6401_A2')
    wandb.agent(sweep_id, function=train_and_validate, count=30)

Create sweep with ID: 5fyrhqnv
Sweep URL: https://wandb.ai/surendarmohan283-indian-institute-of-technology-madras/DA6401_A2/sweeps/5fyrhqnv


[34m[1mwandb[0m: Agent Starting Run: hgybznr6 with config:
[34m[1mwandb[0m: 	activation: leaky_relu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	conv_filters: [64, 64, 64, 64, 64]
[34m[1mwandb[0m: 	data_augmentation: False
[34m[1mwandb[0m: 	dense_units: [128]
[34m[1mwandb[0m: 	dropout_rate: 0.3
[34m[1mwandb[0m: 	kernel_sizes: [5, 5, 5, 5, 5]
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	use_batchnorm: False
[34m[1mwandb[0m: 	weight_decay: 0


100%|██████████| 251/251 [01:59<00:00,  2.10batch/s, loss=2.2590, acc=14.1%]
100%|██████████| 251/251 [01:50<00:00,  2.28batch/s, loss=2.1465, acc=16.1%]
100%|██████████| 251/251 [01:52<00:00,  2.23batch/s, loss=2.0210, acc=15.1%]
100%|██████████| 251/251 [01:55<00:00,  2.18batch/s, loss=2.6807, acc=19.3%]
100%|██████████| 251/251 [01:57<00:00,  2.13batch/s, loss=2.2421, acc=22.2%]
100%|██████████| 251/251 [01:54<00:00,  2.19batch/s, loss=2.6362, acc=23.1%]
100%|██████████| 251/251 [01:55<00:00,  2.18batch/s, loss=1.8629, acc=24.7%]
100%|██████████| 251/251 [01:56<00:00,  2.16batch/s, loss=2.3063, acc=25.6%]
100%|██████████| 251/251 [01:59<00:00,  2.10batch/s, loss=1.9266, acc=26.7%]
100%|██████████| 251/251 [01:56<00:00,  2.16batch/s, loss=1.6313, acc=27.3%]
100%|██████████| 251/251 [01:55<00:00,  2.16batch/s, loss=2.1270, acc=28.6%]
100%|██████████| 251/251 [01:57<00:00,  2.13batch/s, loss=1.8506, acc=29.1%]
100%|██████████| 251/251 [01:57<00:00,  2.13batch/s, loss=1.7611, acc=30.3%]

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▂▁▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇██
train_loss,▃█▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁
val_acc,▃▁▄▄▄▄▄▅▆▆▆▆▇▇▆▇▇▇▇█
val_loss,▃█▃▂▃▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁

0,1
epoch,20.0
train_acc,37.94955
train_loss,1.77479
val_acc,34.48276
val_loss,1.89692


[34m[1mwandb[0m: Agent Starting Run: r0jvffwo with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	conv_filters: [256, 128, 64, 32, 16]
[34m[1mwandb[0m: 	data_augmentation: True
[34m[1mwandb[0m: 	dense_units: [64]
[34m[1mwandb[0m: 	dropout_rate: 0.3
[34m[1mwandb[0m: 	kernel_sizes: [5, 5, 5, 5, 5]
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	use_batchnorm: False
[34m[1mwandb[0m: 	weight_decay: 0.0001


100%|██████████| 251/251 [02:04<00:00,  2.01batch/s, loss=2.2610, acc=15.2%]
100%|██████████| 251/251 [02:01<00:00,  2.06batch/s, loss=1.9664, acc=20.8%]
100%|██████████| 251/251 [01:57<00:00,  2.14batch/s, loss=1.9696, acc=22.9%]
100%|██████████| 251/251 [01:57<00:00,  2.13batch/s, loss=2.0252, acc=23.7%]
100%|██████████| 251/251 [02:00<00:00,  2.08batch/s, loss=1.8070, acc=24.5%]
100%|██████████| 251/251 [01:55<00:00,  2.17batch/s, loss=1.8428, acc=26.1%]
100%|██████████| 251/251 [01:58<00:00,  2.11batch/s, loss=1.8896, acc=26.8%]
100%|██████████| 251/251 [01:59<00:00,  2.10batch/s, loss=2.1532, acc=27.4%]
100%|██████████| 251/251 [01:58<00:00,  2.12batch/s, loss=2.3810, acc=28.2%]
100%|██████████| 251/251 [02:01<00:00,  2.06batch/s, loss=1.9311, acc=29.7%]
100%|██████████| 251/251 [01:55<00:00,  2.17batch/s, loss=2.4487, acc=30.4%]
100%|██████████| 251/251 [01:59<00:00,  2.09batch/s, loss=1.6467, acc=30.5%]
100%|██████████| 251/251 [01:57<00:00,  2.13batch/s, loss=1.3628, acc=31.3%]

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇███
train_loss,█▇▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁
val_acc,▁▁▂▃▃▄▄▅▅▅▆▇▇▇▇▇▇███
val_loss,█▇▇▆▆▅▅▄▄▄▃▃▂▂▂▃▂▁▁▁

0,1
epoch,20.0
train_acc,34.95255
train_loss,1.84006
val_acc,36.08196
val_loss,1.83449


[34m[1mwandb[0m: Agent Starting Run: jlsqidfq with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	conv_filters: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	data_augmentation: True
[34m[1mwandb[0m: 	dense_units: [64]
[34m[1mwandb[0m: 	dropout_rate: 0
[34m[1mwandb[0m: 	kernel_sizes: [3, 5, 3, 5, 3]
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	use_batchnorm: True
[34m[1mwandb[0m: 	weight_decay: 0.0001


100%|██████████| 63/63 [01:50<00:00,  1.75s/batch, loss=2.1398, acc=22.5%]
100%|██████████| 63/63 [01:49<00:00,  1.74s/batch, loss=2.0561, acc=27.4%]
100%|██████████| 63/63 [01:52<00:00,  1.78s/batch, loss=1.9418, acc=29.1%]
100%|██████████| 63/63 [01:53<00:00,  1.80s/batch, loss=1.9049, acc=30.8%]
100%|██████████| 63/63 [01:46<00:00,  1.70s/batch, loss=1.9000, acc=31.3%]
100%|██████████| 63/63 [01:42<00:00,  1.63s/batch, loss=1.9387, acc=32.6%]
100%|██████████| 63/63 [01:44<00:00,  1.65s/batch, loss=1.9469, acc=34.6%]
100%|██████████| 63/63 [01:45<00:00,  1.68s/batch, loss=1.8709, acc=34.8%]
100%|██████████| 63/63 [01:46<00:00,  1.68s/batch, loss=1.7360, acc=35.2%]
100%|██████████| 63/63 [01:41<00:00,  1.62s/batch, loss=1.9548, acc=37.5%]
100%|██████████| 63/63 [01:47<00:00,  1.71s/batch, loss=1.7201, acc=37.8%]
100%|██████████| 63/63 [01:47<00:00,  1.70s/batch, loss=1.7250, acc=38.2%]
100%|██████████| 63/63 [01:44<00:00,  1.66s/batch, loss=1.9525, acc=38.7%]
100%|██████████| 63/63 [0

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇████
train_loss,█▇▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁
val_acc,▁▂▂▃▄▃▃▄▅▄▅▆▄▇▇▆█▅▇▇
val_loss,█▇▇▆▅▄▆▄▃▆▄▃▅▁▁▂▁▅▂▁

0,1
epoch,20.0
train_acc,43.03197
train_loss,1.62283
val_acc,38.03098
val_loss,1.7828


[34m[1mwandb[0m: Agent Starting Run: 9sy6k60j with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	conv_filters: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	data_augmentation: True
[34m[1mwandb[0m: 	dense_units: [128]
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	kernel_sizes: [3, 5, 3, 5, 3]
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	use_batchnorm: True
[34m[1mwandb[0m: 	weight_decay: 0.001


 70%|██████▉   | 44/63 [01:20<00:21,  1.15s/batch, loss=2.1625, acc=19.4%]