In [1]:
import zipfile
import os

zip_path = "/content/drive/MyDrive/nature_12K.zip"
extract_dir = "/content/nature"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [None]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import os

import wandb
from pytorch_lightning.loggers import WandbLogger

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

import matplotlib.pyplot as plt

In [2]:
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.5.1-py3-none-any.whl.metadata (20 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch_lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch_lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.1.0->pytorch_lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.1.0->pytorch_lightning)
  Downloadi

## CNN Model


In [3]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F

class LitCNN(pl.LightningModule):
    def __init__(self, config):
        super().__init__()

        config_dict = dict(config)
        self.save_hyperparameters(config_dict)

        activation_fn = {
            "relu": nn.ReLU(),
            "gelu": nn.GELU(),
            "silu": nn.SiLU(),
            "mish": nn.Mish()
        }[config_dict["activation"]]

        filters = config_dict["filters"]
        kernel_size = config_dict["kernel_size"]
        dropout = config_dict["dropout"]

        layers = []
        in_channels = 3

        for f in filters:
            layers.append(nn.Conv2d(in_channels, f, kernel_size, padding=1))
            layers.append(activation_fn)
            layers.append(nn.MaxPool2d(2))
            in_channels = f

        self.conv = nn.Sequential(*layers)
        self.gap = nn.AdaptiveAvgPool2d((1, 1))  # Global Average Pooling
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(filters[-1], config_dict["dense_neurons"])
        self.output = nn.Linear(config_dict["dense_neurons"], 10)

        # Store final metrics for printing
        self.final_train_loss = 0.0
        self.final_train_acc = 0.0
        self.final_val_loss = 0.0
        self.final_val_acc = 0.0

    def forward(self, x):
        x = self.conv(x)
        x = self.gap(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        return self.output(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = F.cross_entropy(preds, y)
        acc = (preds.argmax(dim=1) == y).float().mean()
        self.final_train_loss = loss.item()
        self.final_train_acc = acc.item()
        self.log("train_loss", loss)
        self.log("train_acc", acc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = F.cross_entropy(preds, y)
        acc = (preds.argmax(dim=1) == y).float().mean()
        self.final_val_loss = loss.item()
        self.final_val_acc = acc.item()
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = F.cross_entropy(preds, y)
        acc = (preds.argmax(dim=1) == y).float().mean()
        self.log("test_loss", loss, prog_bar=True)
        self.log("test_acc", acc, prog_bar=True)
        return {"test_loss": loss, "test_acc": acc}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])

    def on_train_end(self):
        # Print final stats at the end of training
        print(f"\nTrain Loss={self.final_train_loss:.4f}, Train Acc={self.final_train_acc:.4f}, "
              f"Val Loss={self.final_val_loss:.4f}, Val Acc={self.final_val_acc:.4f}")


## Dataset Preparation

In [4]:
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import os
import pytorch_lightning as pl

class INaturalistDataModule(pl.LightningDataModule):
    def __init__(self, data_dir, batch_size=64, augment=False, num_workers=2):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.augment = augment
        self.num_workers = num_workers

    def setup(self, stage=None):
        train_path = os.path.join(self.data_dir, "train")
        test_path = os.path.join(self.data_dir, "test")

        base_transform = transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.ToTensor()
        ])

        augment_transform = transforms.Compose([
            transforms.Resize((128, 128)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ToTensor()
        ])

        full_train_dataset = datasets.ImageFolder(train_path, transform=augment_transform if self.augment else base_transform)
        test_dataset = datasets.ImageFolder(test_path, transform=base_transform)

        targets = np.array(full_train_dataset.targets)
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        for train_idx, val_idx in sss.split(np.zeros(len(targets)), targets):
            self.train_dataset = Subset(full_train_dataset, train_idx)
            self.val_dataset = Subset(full_train_dataset, val_idx)

        self.test_dataset = test_dataset

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)


In [None]:
import wandb
from pytorch_lightning.loggers import WandbLogger

def train_model(config=None):
    with wandb.init(config=config) as run:
        config = wandb.config

        run.name = (
            f"lr_{config.lr}_bs_{config.batch_size}_do_{config.dropout}_"
            f"{config.activation}_filters_{'-'.join(map(str, config.filters))}_"
            f"ks_{config.kernel_size}_dn_{config.dense_neurons}_"
            f"aug_{config.augment}_bn_{config.batch_norm}_fp16_{config.fp16}"
        )
        run.save()

        datamodule = INaturalistDataModule(
            data_dir="/content/nature/inaturalist_12K",
            batch_size=config.batch_size,
            augment=config.augment
        )
        datamodule.setup()

        model = LitCNN(config)

        wandb_logger = WandbLogger(project="cnn-nature", log_model="all")

        trainer = pl.Trainer(
            accelerator='gpu',
            devices=1,
            max_epochs=10,
            logger=wandb_logger,
            precision=16 if config.get("fp16", False) else 32,
        )

        trainer.fit(model, datamodule=datamodule)

In [None]:
sweep_config = {
    "method": "random",
    "metric": {"name": "val_acc", "goal": "maximize"},
    "parameters": {
        "lr": {"values": [1e-2, 1e-3, 1e-4]},
        "batch_size": {"values": [32, 64]},
        "dropout": {"values": [0.2, 0.3]},
        "activation": {"values": ["relu", "gelu", "silu", "mish"]},
        "filters": {"values": [[32]*5, [32, 64, 128, 128, 256], [64]*5]},
        "kernel_size": {"values": [3, 5]},
        "dense_neurons": {"values": [128, 256]},
        "augment": {"values": [True, False]},
        "batch_norm": {"values": [True, False]},
        "fp16": {"values": [True, False]}
    }
}

In [None]:
sweep_id = wandb.sweep(sweep_config, project="Assignment2_Attempt1")
wandb.agent(sweep_id, function=train_model, count=100)

Create sweep with ID: kdpswt7u
Sweep URL: https://wandb.ai/vinyk-sd-indian-institute-of-technology-madras/Assignment2_Attempt1/sweeps/kdpswt7u


[34m[1mwandb[0m: Agent Starting Run: tzy8h5bj with config:
[34m[1mwandb[0m: 	activation: mish
[34m[1mwandb[0m: 	augment: True
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 5
[34m[1mwandb[0m: 	lr: 0.0001


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 104 K  | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=1.9751, Train Acc=0.1613, Val Loss=2.2250, Val Acc=0.1875


0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇█████
train_acc,▁▂▃▃▃▄▂▅▅▂▅▃▅▄▄▆▅▅▅█▅▅▆▇▅▇▄▅▇▅▆▆▆▇▅▅▆▇▅▄
train_loss,▇▇▇▆▆▆▆▇▅▅▆▆▅█▅▄▅▆▅▃▆▄▆▃▄▃▅▁▅▃▄▅▃▄▅▄▃▄▅▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▂▄▅▅▆▇▇█▇
val_loss,██▆▄▄▃▂▂▁▂

0,1
epoch,9.0
train_acc,0.16129
train_loss,1.97514
trainer/global_step,2499.0
val_acc,0.2475
val_loss,2.07862


[34m[1mwandb[0m: Agent Starting Run: dol1wkdo with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	augment: True
[34m[1mwandb[0m: 	batch_norm: False
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [64, 64, 64, 64, 64]
[34m[1mwandb[0m: 	fp16: True
[34m[1mwandb[0m: 	kernel_size: 3
[34m[1mwandb[0m: 	lr: 0.01


/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightni

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=2.3034, Train Acc=0.1111, Val Loss=2.2970, Val Acc=0.1250


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train_acc,▃▇▃█▁▃▄▅▇▆█▃▅▁▆▂▂▇▅▁▁▅▂▄▅
train_loss,▄▁█▂▄█▄▂▃▂▂▅▄▅▃▄▄▃▃▄▅▂▃▄▃
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▁▁▁▁▁▁▁▁▁
val_loss,▅▁█▄▄▄▂▄▂▇

0,1
epoch,9.0
train_acc,0.11111
train_loss,2.30339
trainer/global_step,1249.0
val_acc,0.1
val_loss,2.30332


[34m[1mwandb[0m: Agent Starting Run: l0r935p7 with config:
[34m[1mwandb[0m: 	activation: mish
[34m[1mwandb[0m: 	augment: True
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 5
[34m[1mwandb[0m: 	lr: 0.01


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 104 K  | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=2.3095, Train Acc=0.0323, Val Loss=2.2992, Val Acc=0.1875


0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇█████
train_acc,▄▁█▄▃▅▃▆▃▁▃▇▄▃▅▄▃▃▃▅▄▆▄▂▃▅▂▂▂▇▄█▄▆▃▅█▅▃▂
train_loss,▅▇▄▄▅▄▆▃▄▅▄▃▃▄▄▃▆▅▃▄█▅▄▄▄▅▄▆▄▃▅▃▄▃▅▅▁▄▅▅
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██
val_acc,▁▁▁▁▁▁▁▁▁▁
val_loss,▄▁▄▄█▅▄▄▃▅

0,1
epoch,9.0
train_acc,0.03226
train_loss,2.30948
trainer/global_step,2499.0
val_acc,0.1
val_loss,2.30371


[34m[1mwandb[0m: Agent Starting Run: n5zghpme with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	augment: True
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	fp16: True
[34m[1mwandb[0m: 	kernel_size: 5
[34m[1mwandb[0m: 	lr: 0.001


/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightni

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=1.8046, Train Acc=0.3333, Val Loss=1.8836, Val Acc=0.3125


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train_acc,▁▂▃▁▄▃▄▅▃▃▅▄▆▅▆█▆▇▆▆██▇▇▇
train_loss,█▇▇█▇▆▅▅▇▅▃▄▄▄▂▃▃▂▃▃▁▁▄▄▁
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▃▅▄▆▆▇▆██
val_loss,█▇▆▅▄▄▃▄▂▁

0,1
epoch,9.0
train_acc,0.33333
train_loss,1.80458
trainer/global_step,1249.0
val_acc,0.329
val_loss,1.8819


[34m[1mwandb[0m: Agent Starting Run: nsw3th8e with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	augment: False
[34m[1mwandb[0m: 	batch_norm: False
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	filters: [64, 64, 64, 64, 64]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 3
[34m[1mwandb[0m: 	lr: 0.0001


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 149 K  | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=2.0533, Train Acc=0.3226, Val Loss=2.1438, Val Acc=0.1250


0,1
epoch,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇█
train_acc,▂▂▂▃▂▃▄▄▃▅▄▄▃▇▅▃▄█▂▄▃▄▄▄▁▅▃▃▃▆▄▅▇▅▂█▄▆▅▆
train_loss,▇▇▇█▇▆▅▅▅▄▂▄▃▄▅▃▃▅▄▅▄█▅▅▇▅▄▆▄▁▄▄▃▄▄▁▁▁▃▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
val_acc,▁▅▆▆▇▆█▇█▇
val_loss,█▄▃▃▃▂▂▂▁▁

0,1
epoch,9.0
train_acc,0.32258
train_loss,2.05333
trainer/global_step,2499.0
val_acc,0.2235
val_loss,2.10367


[34m[1mwandb[0m: Agent Starting Run: bt7q8jjm with config:
[34m[1mwandb[0m: 	activation: mish
[34m[1mwandb[0m: 	augment: False
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dense_neurons: 128
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	filters: [64, 64, 64, 64, 64]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 3
[34m[1mwandb[0m: 	lr: 0.01


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 149 K  | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=2.3059, Train Acc=0.1111, Val Loss=2.2935, Val Acc=0.1875


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train_acc,▂▅▅▄▁▄▅▃▆▄█▂▆▂▅▅▆▂▇▁▂▆▅▇▅
train_loss,▆▄▆▆▅▅▂▆▄▅▁▅▄▅▅▄▃█▃▄▆▂▇▄▅
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▁▁▁▁▁▁▁▁▁
val_loss,▁▁▁▁▁▁▁▁█▁

0,1
epoch,9.0
train_acc,0.11111
train_loss,2.30589
trainer/global_step,1249.0
val_acc,0.1
val_loss,2.30299


[34m[1mwandb[0m: Agent Starting Run: 7euy9rj2 with config:
[34m[1mwandb[0m: 	activation: mish
[34m[1mwandb[0m: 	augment: False
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 3
[34m[1mwandb[0m: 	lr: 0.001


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 37.9 K | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=1.9403, Train Acc=0.2698, Val Loss=2.0478, Val Acc=0.0625


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train_acc,▁▂▁▄▄▁▅▅▅▄▄▆▅▄██▅▆▆▃▇▅█▅▅
train_loss,█▇█▅▇▅▅▅▄▄▆▃▄▅▁▃▃▂▃▅▂▅▂▅▂
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
val_acc,▁▄▅▅▆▇█▇██
val_loss,█▅▅▄▃▂▂▂▁▁

0,1
epoch,9.0
train_acc,0.26984
train_loss,1.94026
trainer/global_step,1249.0
val_acc,0.2645
val_loss,2.02239


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: tbm8ji53 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	augment: True
[34m[1mwandb[0m: 	batch_norm: False
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [32, 32, 32, 32, 32]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 5
[34m[1mwandb[0m: 	lr: 0.001


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 104 K  | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=2.0595, Train Acc=0.4194, Val Loss=2.0295, Val Acc=0.2500


0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
train_acc,▃▂▁▃▁▂▂▃▃▃▆▁▂▂▄▆▃▄▆▂▃▄▅▃▃▃▅▄▅▃▅▄▆▄▃▆▄▃██
train_loss,██▇▆▇▇▇▇▇▅█▆▇▆▇▄▇▅▇▇▇▅▇▅▄▆▆▆▅▄▅▅▄▄▃▄▃▃▁▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val_acc,▁▁▃▃▅▆▅███
val_loss,█▇▆▆▅▄▄▂▁▁

0,1
epoch,9.0
train_acc,0.41935
train_loss,2.05948
trainer/global_step,2499.0
val_acc,0.255
val_loss,2.04174


[34m[1mwandb[0m: Agent Starting Run: 770je7va with config:
[34m[1mwandb[0m: 	activation: silu
[34m[1mwandb[0m: 	augment: True
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [32, 64, 128, 128, 256]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 3
[34m[1mwandb[0m: 	lr: 0.001


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 536 K  | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.



Train Loss=1.8166, Train Acc=0.3226, Val Loss=1.8113, Val Acc=0.4375


0,1
epoch,▁▁▁▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇█████
train_acc,▁▂▁▄▄▅▃▅▅▃▅▄▅▄▃▄▃▆▃▃▅▅▅▂▄▃▃█▅▃▅▅█▆▅▆▇▆▅▆
train_loss,█▆▇▆▅▆▆▅▅▅▄▄▄▆▆▆▆▄▅▅▅▅▅▇▃▅▃▄▆▄▃▂▄▃▁▁▂▂▅▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_acc,▁▂▃▃▂▅▅▆▇█
val_loss,█▇▆▆▇▃▄▂▂▁

0,1
epoch,9.0
train_acc,0.32258
train_loss,1.81661
trainer/global_step,2499.0
val_acc,0.3145
val_loss,1.94967


[34m[1mwandb[0m: Agent Starting Run: 6a8n0on0 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	augment: False
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	filters: [64, 64, 64, 64, 64]
[34m[1mwandb[0m: 	fp16: False
[34m[1mwandb[0m: 	kernel_size: 3
[34m[1mwandb[0m: 	lr: 0.01


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 149 K  | train
1 | gap     | AdaptiveAvgP

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
!find /content/nature/inaturalist_12K/test -type d -name '.ipynb_checkpoints' -exec rm -r {} +

In [None]:
!find /content/nature/inaturalist_12K/train -type d -name '.ipynb_checkpoints' -exec rm -r {} +

In [None]:
import os
from PIL import Image

def check_image_sizes(folder_path, num_samples=5):
    image_sizes = []
    count = 0

    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                img_path = os.path.join(root, file)
                try:
                    with Image.open(img_path) as img:
                        print(f"{img_path} => {img.size}")  # (width, height)
                        image_sizes.append(img.size)
                        count += 1
                        if count >= num_samples:
                            return image_sizes
                except Exception as e:
                    print(f"Error loading image {img_path}: {e}")
    return image_sizes

# Example usage:
check_image_sizes("/content/nature/inaturalist_12K/test")
check_image_sizes("/content/nature/inaturalist_12K/train")


/content/nature/inaturalist_12K/test/Plantae/28cddd6b16eab7480b225f933ed7e272.jpg => (800, 600)
/content/nature/inaturalist_12K/test/Plantae/f00e0e5784cd21a641e7f41ae264ae17.jpg => (800, 534)
/content/nature/inaturalist_12K/test/Plantae/53951922033e90dd818752f35ba1fec4.jpg => (800, 452)
/content/nature/inaturalist_12K/test/Plantae/62053681fd63fe4a208ebc5e33021537.jpg => (600, 800)
/content/nature/inaturalist_12K/test/Plantae/b7f98d16c84f8000b7e6db55ae87c64b.jpg => (600, 800)
/content/nature/inaturalist_12K/train/Plantae/fe9389352b5945eff27a90b901a6bde1.jpg => (600, 800)
/content/nature/inaturalist_12K/train/Plantae/05c1f3d558d86223c73de780fd74ec78.jpg => (600, 800)
/content/nature/inaturalist_12K/train/Plantae/e15f2a119c4d9e544cf5f3a0247a7049.jpg => (597, 800)
/content/nature/inaturalist_12K/train/Plantae/e7a016514af9f921d35b002b50962f9e.jpg => (600, 800)
/content/nature/inaturalist_12K/train/Plantae/1be031e041cdf2a0467762c52187b811.jpg => (600, 800)


[(600, 800), (600, 800), (597, 800), (600, 800), (600, 800)]

In [None]:
print(torch.cuda.is_available())  # True if GPU is available
print(torch.cuda.current_device())  # Print current GPU device
print(torch.cuda.get_device_name(torch.cuda.current_device()))  # Print GPU name

True
0
Tesla T4


## Testing best Model

Once the best hyperparameters are obtained from the sweep, we use it to train the same model and test it on the test dataset.

In [11]:
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

# 1. Define the best config
best_config = {
    "lr": 0.01,
    "batch_size": 64,
    "dropout": 0.3,
    "activation": "gelu",
    "filters": [32, 64, 128, 128, 256],
    "kernel_size": 5,
    "dense_neurons": 256,
    "augment": True,
    "fp16": True
}

# 2. Create the DataModule
datamodule = INaturalistDataModule(
    data_dir="/content/nature/inaturalist_12K",
    batch_size=best_config["batch_size"],
    augment=best_config["augment"]
)

datamodule.setup()

# 3. Create the model
model = LitCNN(best_config)

In [6]:
# 4. Add test_step to the model if not already present
def test_step(self, batch, batch_idx):
    x, y = batch
    preds = self(x)
    loss = F.cross_entropy(preds, y)
    acc = (preds.argmax(dim=1) == y).float().mean()
    self.log("test_loss", loss, prog_bar=True)
    self.log("test_acc", acc, prog_bar=True)
    return {"test_loss": loss, "test_acc": acc}
LitCNN.test_step = test_step  # Dynamically add it to your class

# 5. Save best model using checkpoint
checkpoint_callback = ModelCheckpoint(
    monitor="val_acc",
    mode="max",
    filename="best-cnn-model",
    save_top_k=1
)

logger = CSVLogger("lightning_logs", name="cnn_best_run")

trainer = Trainer(
    max_epochs=20,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    precision=16 if best_config["fp16"] else 32,
    callbacks=[checkpoint_callback],
    logger=logger
)

/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:513: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
INFO:pytorch_lightning.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [10]:
# 6. Train the model
trainer.fit(model, datamodule=datamodule)

# 7. Save the trained model manually as .pth
torch.save(model.state_dict(), "best_model.pth")

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | conv    | Sequential        | 1.5 M  | train
1 | gap     | AdaptiveAvgPool2d | 0      | train
2 | dropout | Dropout           | 0      | train
3 | fc1     | Linear            | 65.8 K | train
4 | output  | Linear            | 2.6 K  | train
------------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.225     Total estimated model params size (MB)
16        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.



Train Loss=2.3006, Train Acc=0.0952, Val Loss=2.3030, Val Acc=0.1250


In [12]:
model.load_state_dict(torch.load("best_model.pth"))

<All keys matched successfully>

In [13]:
trainer.test(model, dataloaders=datamodule.test_dataloader())

Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.3028059005737305, 'test_acc': 0.10000000149011612}]