# [Level 2: Add a validation and test set](https://pytorch-lightning.readthedocs.io/en/stable/levels/basic_level_2.html)

### 1. [Validate and test a model](https://pytorch-lightning.readthedocs.io/en/stable/common/evaluation_basic.html)

### 2. [Save your model progress](https://pytorch-lightning.readthedocs.io/en/stable/common/checkpointing_basic.html#save-a-checkpoint)

### 3. [Enable early stopping](https://pytorch-lightning.readthedocs.io/en/stable/common/early_stopping.html)

### Add imports

In [1]:
import os

import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torch.utils.data as data
from torch import nn
from torch.utils.data import DataLoader, random_split
from torchvision import datasets
from torchvision import transforms
from torchvision.datasets import MNIST

### Define the PyTorch nn.Modules

In [2]:
class Encoder(nn.Module):
    """Two-layer MLP that maps a flattened input to a low-dimensional code.

    Args:
        input_dim: Size of the last dimension of the input (default 28 * 28).
        hidden_dim: Width of the hidden layer (default 64).
        latent_dim: Size of the produced code (default 3).

    The defaults reproduce the original fixed 784 -> 64 -> 3 architecture.
    """

    def __init__(self, input_dim=28 * 28, hidden_dim=64, latent_dim=3):
        super().__init__()
        # Attribute name `l1` is kept so existing state_dict keys stay valid.
        self.l1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        )

    def forward(self, x):
        """Return the code for `x`, whose last dimension must equal `input_dim`."""
        return self.l1(x)


class Decoder(nn.Module):
    """Two-layer MLP that maps a low-dimensional code back to a flattened output.

    Args:
        latent_dim: Size of the last dimension of the input code (default 3).
        hidden_dim: Width of the hidden layer (default 64).
        output_dim: Size of the reconstructed vector (default 28 * 28).

    The defaults reproduce the original fixed 3 -> 64 -> 784 architecture.
    """

    def __init__(self, latent_dim=3, hidden_dim=64, output_dim=28 * 28):
        super().__init__()
        # Attribute name `l1` is kept so existing state_dict keys stay valid.
        self.l1 = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return the reconstruction for `x`, whose last dimension must equal `latent_dim`."""
        return self.l1(x)

### Define a LightningModule

In [3]:
class LitAutoEncoder(pl.LightningModule):
    """LightningModule wrapping an encoder/decoder pair trained on reconstruction MSE.

    Args:
        encoder: Module applied to the flattened input.
        decoder: Module applied to the encoder output; its output is compared
            against the flattened input with `F.mse_loss`.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def _reconstruction_loss(self, batch):
        """Flatten the inputs of `batch`, reconstruct them and return the MSE.

        `batch` is unpacked as a pair; only the first element is used.
        """
        x, _ = batch  # second element (the labels for MNIST) is not needed
        x = x.view(x.size(0), -1)  # keep the batch dimension, flatten the rest
        x_hat = self.decoder(self.encoder(x))
        return F.mse_loss(x_hat, x)

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        return self._reconstruction_loss(batch)

    def test_step(self, batch, batch_idx):
        # this is the test loop
        self.log("test_loss", self._reconstruction_loss(batch))

    def validation_step(self, batch, batch_idx):
        # this is the validation loop
        self.log("val_loss", self._reconstruction_loss(batch))

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

### Define the train and test datasets

In [4]:
# Load data sets
# DataLoader defaults to batch_size=1 (one image per step); batch the samples
# so a pass over the data takes a practical amount of time.
train_set = MNIST(root=os.getcwd(), download=True, train=True, transform=transforms.ToTensor())
train_loader = DataLoader(train_set, batch_size=64)

test_set = MNIST(root=os.getcwd(), download=True, train=False, transform=transforms.ToTensor())
test_loader = DataLoader(test_set, batch_size=64)

print(train_set)
print()
print(test_set)

Dataset MNIST
    Number of datapoints: 60000
    Root location: /home/tkyen/opencv_practice/metavision/pytorch_practice/pytorch_lightning
    Split: Train
    StandardTransform
Transform: ToTensor()

Dataset MNIST
    Number of datapoints: 10000
    Root location: /home/tkyen/opencv_practice/metavision/pytorch_practice/pytorch_lightning
    Split: Test
    StandardTransform
Transform: ToTensor()


### Run the test loop

In [5]:
# model
model = LitAutoEncoder(Encoder(), Decoder())

# initialize the Trainer
trainer = pl.Trainer()

# test the model
# NOTE: `model` has not been fitted at this point, so the reported test_loss
# is that of the freshly initialised weights.
trainer.test(model, dataloaders=test_loader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  "GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.13455133140087128}
--------------------------------------------------------------------------------


[{'test_loss': 0.13455133140087128}]

### Split the training data
As a rule of thumb, we use 20% of the training set as the validation set. This number varies from dataset to dataset.

In [6]:
# `train_set` is rebound below, so re-running this cell would otherwise split the
# already-split subset again. Recover the underlying full dataset first so the
# cell gives the same result however many times it is executed.
full_train_set = train_set.dataset if isinstance(train_set, torch.utils.data.Subset) else train_set

# use 20% of training data for validation
train_set_size = int(len(full_train_set) * 0.8)
valid_set_size = len(full_train_set) - train_set_size

# split the train set into two
# (`seed` is a seeded torch.Generator, which makes the split reproducible)
seed = torch.Generator().manual_seed(42)
train_set, valid_set = random_split(full_train_set, [train_set_size, valid_set_size], generator=seed)

assert len(train_set) + len(valid_set) == len(full_train_set)

### Train with the validation loop

In [7]:
# DataLoader defaults to batch_size=1 (one optimizer step per image); batch the
# samples so an epoch completes in a practical amount of time.
train_loader = DataLoader(train_set, batch_size=64)
valid_loader = DataLoader(valid_set, batch_size=64)

# train with both splits
# Bound the run explicitly: without max_epochs/max_steps the Trainer falls back
# to its documented default of 1000 epochs, which is impractical for "Run All".
trainer = pl.Trainer(max_epochs=5)
trainer.fit(model, train_loader, valid_loader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 50.4 K
1 | decoder | Decoder | 51.2 K
------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
