<a href="https://colab.research.google.com/github/turab45/mastering-pytorch-and-pytorch-lightining-/blob/master/MNIST_classification%20-%20Lightning%20DataModule.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#installation
!pip install lightning

In [37]:
# imports
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision import datasets
from torch.utils.data.dataset import random_split
from torch.utils.data.dataloader import DataLoader
import lightning as L
import torchmetrics

# Report whether PyTorch can see a CUDA device
print(f"GPU Available: {torch.cuda.is_available()}")

GPU Available: True


In [40]:
# Use CUDA when it is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Data Loader (Custom Dataset)

* The `batch_size` parameter is important because it affects both the speed and the outcome of training. A larger batch size typically makes each epoch faster, but it needs more memory and can lead to out-of-memory errors. A smaller batch size typically makes training slower, but its noisier gradient updates can sometimes lead to more accurate results.

* The `data_dir` parameter is important because it specifies the directory where the data is stored. (In this case, it means where we want to store the data after downloading)

* `num_workers` is a parameter that specifies the number of subprocesses that will be used to load data. By default, `num_workers` is set to 0, which means that the data will be loaded in the main process. However, if you set `num_workers` to a positive integer, then the data will be loaded in parallel by multiple subprocesses. This can significantly speed up the data loading process, especially if you have a large dataset.
  * `num_workers` should not be set too high, as this can lead to memory issues.



In [47]:
class MnistDataModule(L.LightningDataModule):
  """Lightning data module that downloads MNIST and serves train/val/test loaders.

  Args:
    data_dir: Directory the MNIST files are downloaded to and read from.
    batch_size: Number of samples per batch for every dataloader.
    num_workers: Number of subprocesses each dataloader uses to load data.
    seed: Seed for the train/validation split, so that every call to
      ``setup`` reproduces the same partition.
  """

  def __init__(self, data_dir, batch_size, num_workers, seed=42):
    super().__init__()
    self.data_dir = data_dir
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.seed = seed

  def prepare_data(self):
    """Download the MNIST train and test files into ``data_dir``.

    Tip: if the dataset is already present in ``data_dir``, this step is not
    necessary and can be skipped.
    """
    # download, tokenize (text data), etc...
    datasets.MNIST(self.data_dir, train=True, download=True)
    datasets.MNIST(self.data_dir, train=False, download=True)

  def setup(self, stage: str):  # stage: fit, validate, test, predict etc...
    """Build the train, validation and test datasets.

    Args:
      stage: Name of the stage being set up. It is accepted for API
        compatibility; all three datasets are built regardless of its value.
    """
    # transform, split, etc...
    entire_dataset = datasets.MNIST(root=self.data_dir,
                                    train=True,
                                    transform=transforms.ToTensor(),
                                    download=False)
    # setup() runs again for each of trainer.fit / validate / test. An unseeded
    # random_split would draw a new partition every time, so a later validate
    # run could score samples the model was trained on. A generator seeded
    # identically on every call keeps the split fixed.
    split_generator = torch.Generator().manual_seed(self.seed)
    self.train_ds, self.val_ds = random_split(entire_dataset,
                                              [50000, 10000],
                                              generator=split_generator)
    self.test_ds = datasets.MNIST(root=self.data_dir,
                                  train=False,
                                  transform=transforms.ToTensor(),
                                  download=False)

  def train_dataloader(self):
    """Return the shuffled dataloader over the training split."""
    return DataLoader(self.train_ds,
                      batch_size=self.batch_size,
                      num_workers=self.num_workers,
                      shuffle=True)

  def val_dataloader(self):
    """Return the unshuffled dataloader over the validation split."""
    return DataLoader(self.val_ds,
                      batch_size=self.batch_size,
                      num_workers=self.num_workers,
                      shuffle=False)

  def test_dataloader(self):
    """Return the unshuffled dataloader over the MNIST test set."""
    return DataLoader(self.test_ds,
                      batch_size=self.batch_size,
                      num_workers=self.num_workers,
                      shuffle=False)

In [48]:
class NN(L.LightningModule):
  """Two-layer fully connected classifier trained with cross-entropy loss.

  Args:
    input_size: Number of features per flattened input sample.
    num_classes: Number of output classes.
    learning_rate: Learning rate passed to the Adam optimizer.
  """

  def __init__(self, input_size, num_classes, learning_rate=1e-3) -> None:
    super().__init__()
    # Previously hard-coded inside configure_optimizers; kept as the default.
    self.learning_rate = learning_rate
    # add network layers
    self.fc1 = nn.Linear(in_features=input_size, out_features=50)
    self.fc2 = nn.Linear(in_features=50, out_features=num_classes)
    self.loss_fn = nn.CrossEntropyLoss()
    self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
    self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)

  def forward(self, x):
    """Return the class scores for already-flattened inputs ``x``."""
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

  def training_step(self, batch, batch_idx):
    """Compute the training loss and log loss, accuracy and F1 per epoch."""
    loss, scores, y = self._common_step(batch, batch_idx)
    accuracy = self.accuracy(scores, y)
    f1_score = self.f1_score(scores, y)
    # Metric names are kept exactly as they were so existing logs stay comparable.
    self.log_dict({"train_loss: ":loss, "train_accuracy:":accuracy, "train_f1_score:":f1_score},
                  on_step=False, on_epoch=True, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss for one batch."""
    loss, scores, y = self._common_step(batch, batch_idx)
    self.log("Validation loss: ", loss)
    return loss

  def test_step(self, batch, batch_idx):
    """Compute and log the test loss for one batch."""
    loss, scores, y = self._common_step(batch, batch_idx)
    self.log("Test loss: ", loss)
    return loss

  def predict_step(self, batch, batch_idx):
    """Return the predicted class index for every sample in the batch."""
    x, y = batch
    # Flatten each sample to a vector before the linear layers.
    x = x.reshape(x.size(0), -1)
    scores = self.forward(x)
    preds = torch.argmax(scores, dim=1)
    return preds

  # Optional, just to make the code look clean
  def _common_step(self, batch, batch_idx):
    """Flatten the inputs, run the forward pass and return (loss, scores, y)."""
    x, y = batch
    x = x.reshape(x.size(0), -1)
    scores = self.forward(x)
    loss = self.loss_fn(scores, y)
    return loss, scores, y

  def configure_optimizers(self):
    """Return an Adam optimizer over all parameters using ``learning_rate``."""
    optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
    return optimizer

In [49]:
# Hyperparameters
input_size = 784       # features per flattened input sample
num_classes = 10       # number of output classes
learning_rate = 1e-3   # 0.001
batch_size = 64        # samples per batch
num_epochs = 3         # number of training epochs

In [50]:
# Data module: MNIST files live in "dataset/", loaded by 4 worker subprocesses
dm = MnistDataModule(
    data_dir="dataset/",
    batch_size=batch_size,
    num_workers=4,
)

In [51]:
# Initialize the network, then place it on the selected device
model = NN(input_size=input_size, num_classes=num_classes)
model = model.to(device)

In [52]:
# Trainer

# Train on the first GPU with 16-bit precision. The epoch cap comes from the
# `num_epochs` hyperparameter rather than a duplicated literal.
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    min_epochs=1,
    max_epochs=num_epochs,
    enable_model_summary=True,
    precision=16,
)
# trainer.tune() -> find the optimal hyperparameters (lr, batch size, etc.)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name     | Type               | Params
------------------------------------------------
0 | fc1      | Linear             | 39.2 K
1 | fc2      | Linear             | 510   
2 | loss_fn  | CrossEntropyLoss   | 0     
3 | 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=3` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'Test loss: ': 0.1750074177980423}]