## Deep Learning Model Training on a Local Machine

The purpose of this notebook is to demonstrate how to train a computer vision model locally using PyTorch Lightning, with Weights & Biases (W&B) for full traceability and reproducibility.

In [1]:
# Load environment variables from ../.env (one directory above the notebook's
# working directory) before anything talks to W&B.
# NOTE(review): presumably this file supplies WANDB_API_KEY — confirm.
from dotenv import load_dotenv
load_dotenv("../.env")

True

In [2]:
# Authenticate with Weights & Biases up front so the logger created later can
# start a run without prompting.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mtom-5610[0m ([33mtom-5610-aws[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Setup the dataloader

In [3]:
import torch
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader, random_split

# Seed for the train/validation partition so the same samples land in each
# split on every run (an unseeded random_split reshuffles on each execution).
SPLIT_SEED = 42

# Convert PIL images to tensors, then standardise the single channel.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST training-set mean and
# standard deviation — confirm.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

dataset = MNIST(root="./data/MNIST", download=True, transform=transform)

# 55,000 samples for training, 5,000 held out for validation.
training_set, validation_set = random_split(
    dataset,
    [55_000, 5000],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [4]:
# One batch size shared by both loaders.
BATCH_SIZE = 64

# Only the training split is shuffled; the validation loader keeps the
# DataLoader default ordering.
training_loader = DataLoader(
    training_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
validation_loader = DataLoader(
    validation_set,
    batch_size=BATCH_SIZE,
)

### Defining the model

**Tips**:
* Call `self.save_hyperparameters()` in `__init__` to automatically log your hyperparameters to **W&B**.
* Call `self.log` in `training_step` and `validation_step` to log the metrics.

In [5]:
import lightning.pytorch as pl


In [6]:
import torch
from torch.nn import Linear, CrossEntropyLoss, functional as F
from torch.optim import Adam
from torchmetrics.functional import accuracy

class MNIST_LitModule(pl.LightningModule):
    '''
    Fully connected classifier with two hidden layers.

    Each sample is flattened to 28 * 28 features and passed through
    linear -> relu -> linear -> relu -> linear, producing one logit per class.
    Loss and accuracy are logged in the train/validation/test steps.
    '''

    def __init__(self, n_classes=10, n_layer_1=128, n_layer_2=256, lr=1e-3):
        '''
        Build the layers and record the hyperparameters.

        Args:
            n_classes: number of output classes (width of the last layer and
                the class count used by the accuracy metric).
            n_layer_1: width of the first hidden layer.
            n_layer_2: width of the second hidden layer.
            lr: learning rate handed to Adam in `configure_optimizers`.
        '''
        super().__init__()

        # every sample is flattened to 28 * 28 features before layer_1
        self.layer_1 = Linear(28 * 28, n_layer_1)
        self.layer_2 = Linear(n_layer_1, n_layer_2)
        self.layer_3 = Linear(n_layer_2, n_classes)

        # loss
        self.loss = CrossEntropyLoss()

        # optimizer parameters
        self.lr = lr

        # kept so the accuracy metric always matches the width of layer_3
        # instead of assuming 10 classes
        self.n_classes = n_classes

        # save hyperparameters to self.hparams (auto-logged by W&B)
        self.save_hyperparameters()

    def forward(self, x):
        '''
        Inference path: input batch -> class logits.

        Args:
            x: batch tensor whose non-batch dimensions hold 28 * 28 values per
                sample, e.g. (b, 1, 28, 28).

        Returns:
            Tensor of shape (b, n_classes) with unnormalised logits.
        '''
        # (b, 1, 28, 28) -> (b, 1*28*28); only the batch dimension is kept, so
        # the input does not have to be exactly 4-D
        x = x.flatten(start_dim=1)

        # two hidden layers with relu, then a plain linear output layer
        # (no activation: CrossEntropyLoss expects raw logits)
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        return self.layer_3(x)

    def _get_preds_loss_accuracy(self, batch):
        '''
        Shared computation for the train/valid/test steps.

        Args:
            batch: an `(x, y)` pair of inputs and integer class targets.

        Returns:
            Tuple `(preds, loss, acc)`: argmax class predictions, the
            cross-entropy loss, and the multiclass accuracy for the batch.
        '''
        x, y = batch
        logits = self(x)
        preds = torch.argmax(logits, dim=1)
        loss = self.loss(logits, y)
        acc = accuracy(preds, y, task='multiclass', num_classes=self.n_classes)
        return preds, loss, acc

    def training_step(self, batch, batch_idx):
        '''Log the batch loss/accuracy and return the loss to optimise.'''
        _, loss, acc = self._get_preds_loss_accuracy(batch)

        # log loss and metric
        self.log('train_loss', loss)
        self.log('training_accuracy', acc)

        return loss

    def validation_step(self, batch, batch_idx):
        '''
        Log validation loss/accuracy.

        Returns the predictions so that callbacks receive them as `outputs`
        in `on_validation_batch_end`.
        '''
        preds, loss, acc = self._get_preds_loss_accuracy(batch)

        # log
        self.log('val_loss', loss)
        self.log('val_accuracy', acc)

        return preds

    def test_step(self, batch, batch_idx):
        '''Log test loss/accuracy for one batch.'''
        _, loss, acc = self._get_preds_loss_accuracy(batch)

        # log loss and metric
        self.log('test_loss', loss)
        self.log('test_accuracy', acc)

    def configure_optimizers(self):
        '''Return an Adam optimizer over all parameters using `self.lr`.'''
        return Adam(self.parameters(), lr=self.lr)

    def on_save_checkpoint(self, checkpoint):
        '''
        Debug hook: print logger details every time a checkpoint is saved.

        The attributes read here are private to the logger (leading
        underscore), so each lookup falls back to None rather than raising
        when no logger is attached or the logger does not expose them.
        '''
        super().on_save_checkpoint(checkpoint)

        logger = self.logger
        experiment = getattr(logger, "_experiment", None)

        print("!!!LightningModule-Checkpoint!!!")
        print("checkpoint name", getattr(logger, "_checkpoint_name", None))
        print("project", getattr(logger, "_project", None))
        print("name", getattr(logger, "_name", None))
        print("entity", getattr(experiment, "entity", None))
        





In [7]:
# Both hidden layers are 128 wide here (the class default for n_layer_2 is 256).
hidden_layer_sizes = {"n_layer_1": 128, "n_layer_2": 128}
model = MNIST_LitModule(**hidden_layer_sizes)

### Save model checkpoints

In [9]:
from lightning.pytorch.callbacks import ModelCheckpoint

# Checkpoints are written to ./checkpoint and named after the zero-padded epoch.
# `val_accuracy` is the metric logged in `validation_step`; with save_top_k=-1
# every checkpoint is kept rather than only the best ones.
checkpoint_options = {
    "dirpath": "./checkpoint",
    "filename": "{epoch:03d}",
    "monitor": "val_accuracy",
    "save_top_k": -1,
    "mode": "max",
}
checkpoint_callback = ModelCheckpoint(**checkpoint_options)

#### Logging images

In [10]:
from lightning.pytorch.callbacks import Callback
 
class LogPredictionsCallback(Callback):
    '''
    Log sample validation images with their predictions to W&B.

    On the first validation batch of each validation run, the first
    `n_samples` images are logged twice: as captioned images and as rows of a
    table.

    Args:
        n_samples: how many images from the first batch to log.
    '''

    def __init__(self, n_samples=20):
        super().__init__()
        self.n_samples = n_samples

    def on_validation_batch_end(
        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0):
        """Called when the validation batch ends.

        `outputs` is whatever `LightningModule.validation_step` returned; this
        callback assumes it is a tensor of class predictions aligned with the
        batch.
        """
        # only the first batch is logged
        if batch_idx != 0:
            return

        # Use the logger attached to the trainer rather than a notebook-level
        # global, so the callback does not depend on cell execution order.
        # Skip quietly when that logger cannot log images/tables.
        logger = trainer.logger
        if not (hasattr(logger, 'log_image') and hasattr(logger, 'log_table')):
            return

        n = self.n_samples
        x, y = batch
        images = list(x[:n])
        # plain Python scalars, so captions and table cells hold values
        # rather than tensor objects
        truths = y[:n].tolist()
        predictions = outputs[:n].tolist()

        # Option 1: log images with `WandbLogger.log_image`
        captions = [
            f'Ground Truth: {truth} - Prediction: {pred}'
            for truth, pred in zip(truths, predictions)
        ]
        logger.log_image(key='sample_images', images=images, caption=captions)

        # Option 2: log predictions as a Table
        columns = ['image', 'ground truth', 'prediction']
        data = [
            [wandb.Image(image), truth, pred]
            for image, truth, pred in zip(images, truths, predictions)
        ]
        logger.log_table(key='sample_table', columns=columns, data=data)

log_predictions_callback = LogPredictionsCallback()

### Train Your Model

In [11]:
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch import Trainer

# W&B logger for the "MNIST" project; log_model="all" asks it to upload model
# checkpoints as artifacts during training.
wandb_logger = WandbLogger(project="MNIST", log_model="all")

# accelerator="auto" uses the GPU when one is available and falls back to CPU
# otherwise, so the notebook also runs on machines without CUDA.
trainer = Trainer(
    logger=wandb_logger,
    callbacks=[log_predictions_callback, checkpoint_callback],
    accelerator="auto",
    max_epochs=5,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [12]:
# Train for the configured number of epochs, validating on the held-out split.
trainer.fit(
    model=model,
    train_dataloaders=training_loader,
    val_dataloaders=validation_loader,
)

You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


/home/ubuntu/workspace/deep-learning/sagemaker-training-job-wandb-samples/.venv/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:701: Checkpoint directory /home/ubuntu/workspace/deep-learning/sagemaker-training-job-wandb-samples/notebooks/checkpoint exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | layer_1 | Linear           | 100 K  | train
1 | layer_2 | Linear           | 16.5 K | train
2 | layer_3 | Linear           | 1.3 K  | train
3 | loss    | CrossEntropyLoss | 0      | train
-----------------------------------------------------
118 K     Trainable params
0         Non-trainable params
118 K     Total params
0.473     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/ubuntu/workspace/deep-learning/sagemaker-training-job-wandb-samples/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                           

/home/ubuntu/workspace/deep-learning/sagemaker-training-job-wandb-samples/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 860/860 [00:11<00:00, 73.69it/s, v_num=yoxk]!!!LightningModule-Checkpoint!!!
checkpoint name None
project MNIST
name None
entity tom-5610-aws
Epoch 1: 100%|██████████| 860/860 [00:11<00:00, 75.51it/s, v_num=yoxk]!!!LightningModule-Checkpoint!!!
checkpoint name model-a15oyoxk
project MNIST
name None
entity tom-5610-aws
Epoch 2: 100%|██████████| 860/860 [00:11<00:00, 74.03it/s, v_num=yoxk]!!!LightningModule-Checkpoint!!!
checkpoint name model-a15oyoxk
project MNIST
name None
entity tom-5610-aws
Epoch 3: 100%|██████████| 860/860 [00:11<00:00, 74.89it/s, v_num=yoxk]!!!LightningModule-Checkpoint!!!
checkpoint name model-a15oyoxk
project MNIST
name None
entity tom-5610-aws
Epoch 4: 100%|██████████| 860/860 [00:11<00:00, 73.74it/s, v_num=yoxk]!!!LightningModule-Checkpoint!!!
checkpoint name model-a15oyoxk
project MNIST
name None
entity tom-5610-aws


`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 860/860 [00:11<00:00, 72.39it/s, v_num=yoxk]


In [None]:
# Example W&B model artifact reference: tom-5610-aws/MNIST/model-nhrn51pr:v3

In [13]:
# Close the active W&B run; the output below shows its run history and summary.
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆█████
train_loss,█▄▆▄▆▂▃▂▂▄▂▄▃▂▃▂▁▂▁▁▂▂▃▂▂▁▁▂▂▂▁▁▂▄▁▁▂▁▂▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇█████
training_accuracy,▂▃▆▁▁▂▃▅▆▂▆▇█▇▅█▃▃▆▃▆▇▇███▆▇▅█▇▆▇█▇█▅█▇█
val_accuracy,▁▅███
val_loss,█▄▁▁▂

0,1
epoch,4.0
train_loss,0.04064
trainer/global_step,4299.0
training_accuracy,0.95833
val_accuracy,0.9724
val_loss,0.10177


In [None]:
# Debug peek at two private WandbLogger attributes (leading underscore).
# NOTE(review): these are not public API and may change between versions.
wandb_logger._checkpoint_name, wandb_logger._save_dir

In [None]:
# Example W&B model artifact versions:
# tom-5610-aws/MNIST/model-cj5s8odg:v0
# tom-5610-aws/MNIST/model-cj5s8odg:v1

In [None]:
# List every artifact collection of type "model" in the MNIST project.
api = wandb.Api()

model_artifact_type = api.artifact_type(type_name="model", project="MNIST")
collections = list(model_artifact_type.collections())

# NOTE(review): `aliases` is never populated, so the second print always shows
# an empty set.
aliases = set()

print(collections)
print("aliases", aliases)

In [None]:
# Query the versions of one logged model artifact and print the source
# qualified name of the first result, or 'not found' when the query is falsy.
ARTIFACT_NAME = "tom-5610-aws/MNIST/model-ano4gslu"
artifacts = api.artifacts(type_name="model", name=ARTIFACT_NAME)

print(artifacts.next().source_qualified_name if artifacts else 'not found')

In [12]:
# Fetch the latest version of the model artifact into ./checkpoint; the call is
# the last expression so the cell displays its return value.
checkpoint_reference = "tom-5610-aws/MNIST/model-ano4gslu:latest"
wandb_logger.download_artifact(
    checkpoint_reference,
    save_dir="./checkpoint",
    artifact_type="model",
)


[34m[1mwandb[0m:   1 of 1 files downloaded.  


'./checkpoint'