# Saving model, hyperparameters

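> A minimal PyTorch Lightning autoencoder on MNIST, used to show how checkpoints store model weights and hyperparameters, and how to load them back or resume training from them.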

In [None]:
#| default_exp basic_val

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms, datasets
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl

In [None]:
#| export
# Fetch the MNIST train/test splits into ../data; every image is converted to a tensor
transform = transforms.ToTensor()
train_set = MNIST(root="../data", train=True, download=True, transform=transform)
test_set = MNIST(root="../data", train=False, download=True, transform=transform)

In [None]:
! ls ../data/MNIST/raw

t10k-images-idx3-ubyte	   train-images-idx3-ubyte
t10k-images-idx3-ubyte.gz  train-images-idx3-ubyte.gz
t10k-labels-idx1-ubyte	   train-labels-idx1-ubyte
t10k-labels-idx1-ubyte.gz  train-labels-idx1-ubyte.gz


In [None]:
type(train_set)

torchvision.datasets.mnist.MNIST

In [None]:
#| export
# Keep 80% of the training data for fitting; the remaining 20% becomes the validation split.
# Run this before the split cell below, which rebinds `train_set` to the training subset.
train_set_size = int(0.8 * len(train_set))
valid_set_size = len(train_set) - train_set_size
(train_set_size, valid_set_size)

(48000, 12000)

In [None]:
len(test_set)

10000

In [None]:
#| export
# Despite its name, `seed` is a torch.Generator seeded with 42; it makes the split reproducible.
seed = torch.Generator().manual_seed(42)
# NOTE: `train_set` is rebound here from the full training set to the training subset.
train_set, valid_set = random_split(
    dataset=train_set,
    lengths=[train_set_size, valid_set_size],
    generator=seed,
)

In [None]:
#| export
# Reshuffle the training samples every epoch; without `shuffle=True` the loader
# serves the batches in the same order on every pass over the data.
train_loader = DataLoader(train_set, batch_size=10, shuffle=True)

In [None]:
#| export
# Batches of 10 samples drawn from the validation split (no shuffling requested)
valid_loader = DataLoader(dataset=valid_set, batch_size=10)

In [None]:
#| export
# Batches of 10 samples drawn from the MNIST test set (no shuffling requested)
test_loader = DataLoader(dataset=test_set, batch_size=10)

In [None]:
#| export
class Encoder(nn.Module):
    """MLP that flattens each sample and compresses it to a low-dimensional code.

    Args:
        input_dim: Number of features per sample after flattening (default ``28 * 28``).
        hidden_dim: Width of the single hidden layer (default 64).
        latent_dim: Size of the code produced for each sample (default 3).
    """

    def __init__(self, input_dim=28 * 28, hidden_dim=64, latent_dim=3):
        super().__init__()
        # The attribute name `l1` and the layer order are kept as-is so that
        # state-dict keys such as `l1.1.weight` still match existing checkpoints.
        self.l1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        )

    def forward(self, x):
        """Return the latent code for `x`; every dimension after the first is flattened."""
        return self.l1(x)

In [None]:
#| export
class Decoder(nn.Module):
    """MLP that expands a low-dimensional code back into a flat reconstruction.

    Args:
        latent_dim: Size of the incoming code per sample (default 3).
        hidden_dim: Width of the single hidden layer (default 64).
        output_dim: Number of features in each reconstruction (default ``28 * 28``).
    """

    def __init__(self, latent_dim=3, hidden_dim=64, output_dim=28 * 28):
        super().__init__()
        # The attribute name `l1` and the layer order are kept as-is so that
        # state-dict keys such as `l1.0.weight` still match existing checkpoints.
        self.l1 = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return the flat reconstruction for a batch of latent codes `x`."""
        return self.l1(x)

In [None]:
#| export
class LitAutoEncoder(pl.LightningModule):
    """Lightning module that trains an encoder/decoder pair to reconstruct its input.

    Args:
        encoder: Module mapping an input batch to a latent code.
        decoder: Module mapping a latent code back to a reconstruction that can
            be viewed in the input's shape.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Records the constructor arguments (here the encoder and decoder modules
        # themselves); they end up in checkpoints under `hyper_parameters`.
        self.save_hyperparameters()

    def _reconstruction_loss(self, batch):
        """Encode then decode the batch inputs and return the MSE against the originals."""
        x, _ = batch  # the second element of each batch is not used
        x_hat = self.decoder(self.encoder(x))
        # View the decoder output in the input's shape before comparing.
        return F.mse_loss(x_hat.view(x.shape), x)

    def training_step(self, batch, batch_idx):
        """Return the reconstruction loss for one training batch."""
        return self._reconstruction_loss(batch)

    def test_step(self, batch, batch_idx):
        """Log the reconstruction loss of one test batch as `test_loss`."""
        self.log("test_loss", self._reconstruction_loss(batch))

    def validation_step(self, batch, batch_idx):
        """Log the reconstruction loss of one validation batch as `val_loss`."""
        self.log("val_loss", self._reconstruction_loss(batch))

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

In [None]:

# Wrap a freshly initialised encoder/decoder pair in the Lightning module
autoencoder = LitAutoEncoder(encoder=Encoder(), decoder=Decoder())

  rank_zero_warn(
  rank_zero_warn(


Lightning automatically saves a checkpoint for you in your current working directory, with the state of your last training epoch. This makes sure you can resume training in case it was interrupted.

In [None]:
! ls lightning_logs/version_0/checkpoints/

3534.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


'epoch=1-step=12000.ckpt'


In [None]:

# Rebuild the Lightning module from a saved checkpoint, supplying new
# encoder/decoder instances for the constructor arguments.
LitAutoEncoder.load_from_checkpoint(
    checkpoint_path="lightning_logs/version_0/checkpoints/epoch=1-step=12000.ckpt",
    encoder=Encoder(),
    decoder=Decoder(),
)

LitAutoEncoder(
  (encoder): Encoder(
    (l1): Sequential(
      (0): Flatten(start_dim=1, end_dim=-1)
      (1): Linear(in_features=784, out_features=64, bias=True)
      (2): ReLU()
      (3): Linear(in_features=64, out_features=3, bias=True)
    )
  )
  (decoder): Decoder(
    (l1): Sequential(
      (0): Linear(in_features=3, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=784, bias=True)
    )
  )
)

In [None]:

# Fit for a single epoch on the training loader, validating on the held-out split
trainer = pl.Trainer(max_epochs=1)
trainer.fit(autoencoder, train_loader, valid_loader)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 50.4 K
1 | decoder | Decoder | 51.2 K
------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)


                                                                                                                     

  rank_zero_warn(
  rank_zero_warn(


Epoch 0:  80%|████████████████████████████████████▊         | 4800/6000 [01:53<00:28, 42.18it/s, loss=0.043, v_num=5]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|                                                                           | 0/1200 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|                                                              | 0/1200 [00:00<?, ?it/s][A
Epoch 0:  80%|████████████████████████████████████▊         | 4801/6000 [01:53<00:28, 42.18it/s, loss=0.043, v_num=5][A
Epoch 0:  80%|████████████████████████████████████▊         | 4802/6000 [01:53<00:28, 42.18it/s, loss=0.043, v_num=5][A
Epoch 0:  80%|████████████████████████████████████▊         | 4803/6000 [01:53<00:28, 42.19it/s, loss=0.043, v_num=5][A
Epoch 0:  80%|████████████████████████████████████▊         | 4804/6000 [01:53<00:28, 42.19it/s, loss=0.043, v_num=5][A
Epoch 0:  80%|████████████████████████████████████▊         | 4805/6000 [01:53<00:28, 42.18it/s, loss=0.043, v_num=5][A
E

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████████████████████████████████████████| 6000/6000 [02:24<00:00, 41.38it/s, loss=0.043, v_num=5]


Model weights can be read from the checkpoints as well.

In [None]:
!ls lightning_logs/version_4/checkpoints/

3689.44s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


'epoch=0-step=4800.ckpt'


In [None]:

# torch.load unpickles the file, so only open checkpoints from a trusted source.
# map_location="cpu" places every tensor on the CPU, so the checkpoint can be
# inspected even on a machine without the device it was saved from.
# NOTE(review): newer PyTorch releases may default to weights_only=True, which would
# reject the pickled modules stored under 'hyper_parameters' — confirm for the installed version.
checkpoint = torch.load(
    "lightning_logs/version_4/checkpoints/epoch=0-step=4800.ckpt",
    map_location="cpu",
)

In [None]:
checkpoint.keys()

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters'])

In [None]:
checkpoint['state_dict'].keys()

odict_keys(['encoder.l1.1.weight', 'encoder.l1.1.bias', 'encoder.l1.3.weight', 'encoder.l1.3.bias', 'decoder.l1.0.weight', 'decoder.l1.0.bias', 'decoder.l1.2.weight', 'decoder.l1.2.bias'])

In [None]:
checkpoint['state_dict']['encoder.l1.1.weight'].mean()

tensor(-0.0088)

In [None]:
list(checkpoint['hyper_parameters']["encoder"].parameters())[0].mean()

tensor(-0.0088, grad_fn=<MeanBackward0>)

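Because `save_hyperparameters()` stored the constructor arguments, the encoder module itself is also available under the `encoder` key of `hyper_parameters`, and it carries the same weights as the `state_dict`.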

In [None]:
checkpoint['hyper_parameters']['encoder']

Encoder(
  (l1): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=64, bias=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=3, bias=True)
  )
)

In [None]:

# `.parameters()` returns a generator, so each of these can be iterated only once.
encoder_weights, decoder_weights = (
    checkpoint['hyper_parameters'][part].parameters()
    for part in ("encoder", "decoder")
)

In [None]:
encoder_weights

<generator object Module.parameters>

In [None]:
decoder_weights

<generator object Module.parameters>

To resume training directly from where we stopped, pass the checkpoint to `trainer.fit` via `ckpt_path`. Even though `max_epochs=2`, this trains for only one more epoch, since the checkpointed model has already been trained for 1 epoch.

In [None]:
# Passing `ckpt_path` automatically restores model, epoch, step, LR schedulers, etc.
# before training continues.
trainer = pl.Trainer(max_epochs=2)
# Pass the validation loader as well so the resumed run keeps validating,
# matching the initial fit; without it no validation loop is run.
trainer.fit(
    autoencoder,
    train_dataloaders=train_loader,
    val_dataloaders=valid_loader,
    ckpt_path="lightning_logs/version_4/checkpoints/epoch=0-step=4800.ckpt",
)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(
Restoring states from the checkpoint path at lightning_logs/version_4/checkpoints/epoch=0-step=4800.ckpt

  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 50.4 K
1 | decoder | Decoder | 51.2 K
------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)
Restored all states from the checkpoint file at lightning_logs/version_4/checkpoints/epoch=0-step=4800.ckpt
  rank_zero_warn(


Epoch 1: 100%|█████████████████████████████████████████████| 4800/4800 [01:00<00:00, 78.91it/s, loss=0.0397, v_num=6]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|█████████████████████████████████████████████| 4800/4800 [01:00<00:00, 78.87it/s, loss=0.0397, v_num=6]


In [None]:
#| hide
# Run nbdev's export step for this notebook
import nbdev
nbdev.nbdev_export()