# Pytorch Lightning Tutorial
This tutorial demonstrates integration of ModelBox with PyTorch Lightning.

In [10]:
pip install pytorch-lightning

Note: you may need to restart the kernel to use updated packages.


In [11]:
import os

import pandas as pd
import seaborn as sn
import torch
import random
from IPython.core.display import display
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from pytorch_lightning.loggers import CSVLogger
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.datasets import MNIST
from modelbox.lightning_logger import ModelBoxLogger

PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")
BATCH_SIZE = 256 if torch.cuda.is_available() else 64

  from IPython.core.display import display


In [3]:
class MNISTModel(LightningModule):
    def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
        super().__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)
        
        self.data_dir = data_dir
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ]
        )
        self.val_accuracy = Accuracy()
        self.test_accuracy = Accuracy()
        self.save_hyperparameters()

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_nb):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.val_accuracy.update(preds, y)

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", self.val_accuracy, prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.test_accuracy.update(preds, y)

        self.log("test_loss", loss, prog_bar=True)
        self.log("test_acc", self.test_accuracy, prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.02)

    def prepare_data(self):
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        if stage == "fit" or stage is None:
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=BATCH_SIZE)



In [4]:
mnist_model = MNISTModel()

train_ds = MNIST(PATH_DATASETS, train=True, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE)

# Intialize ModelBoxLogger
experiment_name = f"lid_quartznet-{random.randint(1, 10000)}"
mbox_logger = ModelBoxLogger(namespace="langtech", experiment_name=experiment_name, owner="owner@pytorch.com", server_addr="172.21.0.2:8085")

trainer = Trainer(
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None, 
    max_epochs=3,
    logger=[mbox_logger],
    callbacks=[TQDMProgressBar(refresh_rate=20)],
)

trainer.fit(mnist_model, train_loader)
trainer.test()


  return new_rank_zero_deprecation(*args, **kwargs)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
modelbox - attempting to create a project
modelbox - created experiment with id: 27dc27aceeb99d6344c63cd8584ad004eae8bd6e

  | Name          | Type     | Params
-------------------------------------------
0 | l1            | Linear   | 7.9 K 
1 | val_accuracy  | Accuracy | 0     
2 | test_accuracy | Accuracy | 0     
-------------------------------------------
7.9 K     Trainable params
0         Non-trainable params
7.9 K     Total params
0.031     Total estimated model params size (MB)
modelbox - log hpraams params "data_dir":      .
"hidden_size":   64
"learning_rate": 0.0002
modelbox - log hpraams metrics None


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

modelbox - log metrics, step: 937, metrics: {'val_loss': 31.517704010009766, 'val_acc': 0.444599986076355}


Validation: 0it [00:00, ?it/s]

modelbox - log metrics, step: 1875, metrics: {'val_loss': 41.080623626708984, 'val_acc': 0.4016000032424927}


Validation: 0it [00:00, ?it/s]

modelbox - log metrics, step: 2813, metrics: {'val_loss': 51.86833572387695, 'val_acc': 0.38420000672340393}
`Trainer.fit` stopped: `max_epochs=3` reached.
  rank_zero_warn(
Restoring states from the checkpoint path at /home/jovyan/work/tutorials/lid_quartznet-8929/0.1/checkpoints/epoch=2-step=2814.ckpt
modelbox - log hpraams params "data_dir":      .
"hidden_size":   64
"learning_rate": 0.0002
modelbox - log hpraams metrics None
Loaded model weights from checkpoint at /home/jovyan/work/tutorials/lid_quartznet-8929/0.1/checkpoints/epoch=2-step=2814.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

modelbox - log metrics, step: 2814, metrics: {'test_loss': 51.214683532714844, 'test_acc': 0.3817000091075897}


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.3817000091075897
        test_loss           51.214683532714844
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 51.214683532714844, 'test_acc': 0.3817000091075897}]

In [5]:
from modelbox.modelbox import ModelBox
client = ModelBox(addr="172.21.0.2:8085")


Once an experiment is logged into ModelBox from a lightning trainer, it can then be accessed via the client API.

In [8]:
experiment = client.experiment('27dc27aceeb99d6344c63cd8584ad004eae8bd6e')
experiment.metadata()

ListMetadataResponse(metadata={'hyperparams': {'hidden_size': 64.0, 'learning_rate': 0.0002, 'data_dir': '.'}})

In [9]:
experiment.all_metrics()

{'val_loss': [MetricValue(step=937, wallclock_time=1665546472, value=31.517704010009766),
  MetricValue(step=1875, wallclock_time=1665546472, value=41.080623626708984),
  MetricValue(step=2813, wallclock_time=1665546472, value=51.86833572387695)],
 'val_acc': [MetricValue(step=937, wallclock_time=1665546472, value=0.444599986076355),
  MetricValue(step=1875, wallclock_time=1665546472, value=0.4016000032424927),
  MetricValue(step=2813, wallclock_time=1665546472, value=0.38420000672340393)],
 'test_loss': [MetricValue(step=2814, wallclock_time=1665546472, value=51.214683532714844)],
 'test_acc': [MetricValue(step=2814, wallclock_time=1665546472, value=0.3817000091075897)]}