# AlexNet on Imagenet Dataset

Code is running, but hyper parameters are bad, following method need to apply to improve results.

1. Increase training epoches, 1000 classes need more training epoches to lower loss.
2. Learning rate may be too high, lower to 0.003 or 0.001 may have better result.

It's not wise to increase test count without enough understand of lightning framework. Plan to use small data sets to continue learning, and then come back to improve after familiar with lightning framework and tensorboard.


In [1]:
import torch
from torch import nn
import pytorch_lightning as L
from torch.utils import data
from torchvision.transforms import v2
import torchvision
import random
import matplotlib.pyplot as plt
from torchmetrics import Metric
import torchmetrics
from pytorch_lightning.tuner import Tuner

In [None]:
class ImagenetDataset(L.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.train_dataset_path = "./data/imagenet/training_images/"
        self.test_dataset_path = "./data/imagenet/validation_images/"
        self.image_transforms = v2.Compose(
            [v2.ToImage(), v2.ToDtype(torch.float32, True), v2.Resize((224, 224))])
        self.batch_size = 32
        self.num_workers = 16

    def prepare_data(self):
        return super().prepare_data()

    def setup(self, stage):
        all_train_data = torchvision.datasets.ImageFolder(
            self.train_dataset_path, self.image_transforms)
        train_dataset, validation_dataset = data.random_split(
            all_train_data, [0.8, 0.2])
        self.train_dataset = train_dataset
        self.validation_dataset = validation_dataset

        test_dataset = torchvision.datasets.ImageFolder(
            self.test_dataset_path, self.image_transforms)
        self.test_dataset = test_dataset

    def train_dataloader(self):
        return data.DataLoader(self.train_dataset, self.batch_size, True, num_workers=self.num_workers, persistent_workers=True)

    def val_dataloader(self):
        return data.DataLoader(self.validation_dataset, self.batch_size, False, num_workers=self.num_workers, persistent_workers=True)

    def test_dataloader(self):
        return data.DataLoader(self.test_dataset, self.batch_size, False, num_workers=self.num_workers, persistent_workers=True)

In [3]:
class AlexNet(L.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.net = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Flatten(),
            nn.Linear(6400, 4096), nn.ReLU(), nn.Dropout(p=0.5),
            nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
            nn.Linear(4096, 1000)
        )

        self.learning_rate = 0.005

        self.loss = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(
            task="multiclass", num_classes=1000)

    def forward(self, X):
        return self.net(X)

    def training_step(self, batch, batch_idx):
        X, y = batch
        output = self(X)
        loss = self.loss(output, y)
        accuracy = self.accuracy(output, y)
        self.log_dict({"training_loss": loss, "train_accuracy": accuracy},
                      on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        output = self(X)
        loss = self.loss(output, y)
        accuracy = self.accuracy(output, y)
        self.log_dict({"validation_loss": loss,
                      "validation_accuracy": accuracy})
        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.learning_rate)

    def train_dataloader(self):
        return data.DataLoader(torchvision.datasets.ImageFolder(
            "./data/imagenet/training_images/", v2.Compose(
                [v2.ToImage(), v2.ToDtype(torch.float32, True), v2.Resize((224, 224))])), 32, True, num_workers=16)

In [4]:
model = AlexNet()
trainer = L.Trainer(accelerator='gpu', max_epochs=10)


Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
f:\code\deep-learning\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [5]:
my_dataset = ImagenetDataset()
trainer.fit(model, my_dataset)

You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti SUPER') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type               | Params | Mode 
--------------------------------------------------------
0 | net      | Sequential         | 50.8 M | train
1 | loss     | CrossEntropyLoss   | 0      | train
2 | accuracy | MulticlassAccuracy | 0      | train
--------------------------------------------------------
50.8 M    Trainable params
0         Non-trainable params
50.8 M    Total params
203.376   Total estimated model params size (MB)
24        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
