In [1]:
import time
import pyrootutils
import pytorch_lightning as pl
import wandb

# Set pythonpath and working directory to the folder containing the .project-root file.
# This call sits before the `src.*` imports below; with pythonpath=True it is
# presumably what makes the `src` package importable from here — TODO confirm.
# `root` is also used later to build the data and log directories.
root = pyrootutils.setup_root(".", indicator=".project-root", pythonpath=True, cwd=True)

from src.data.face_age_datamodule import FaceAgeDataModule
from src.models.face_age_module import FaceAgeModule


def main(num_seeds=3, base_seed=42):
    """Train and test the face-age model once per random seed.

    Every run shares the experiment configuration selected below and differs
    only in its seed (``base_seed + i``). Each run writes to its own
    timestamped directory under ``root / "logs"``, keeps the checkpoint with
    the lowest ``val/loss`` (plus the last one), and evaluates that best
    checkpoint on the test set. When ``use_wandb`` is enabled, every run is
    logged to the "face-age" Weights & Biases project under a shared
    name/group, and the W&B run is closed even if training or testing fails.

    Args:
        num_seeds: Number of independent runs to execute (default 3).
        base_seed: Seed of the first run; run ``i`` uses ``base_seed + i``
            (default 42).
    """
    # ---- experiment configuration (identical for every seed) ----
    data_dir = root / "data"

    use_wandb = True
    age_norm_value = 80
    use_augmented_dataset = False
    loss_fn = "MSELoss"  # "SmoothL1Loss"

    # 1
    net = "SimpleConvNet_100x100"
    img_size = (100, 100)
    imagenet_normalization = False
    exp_name = f"SimpleConvNet+{img_size}+augmented={use_augmented_dataset}+{loss_fn}"

    # 2
    # net = "SimpleConvNet_224x224"
    # img_size = (224, 224)
    # imagenet_normalization = False
    # exp_name = f"SimpleConvNet+{img_size}+augmented={use_augmented_dataset}+{loss_fn}"

    # 3
    # net = "EffNet_224x224"
    # img_size = (224, 224)
    # imagenet_normalization = True
    # exp_name = f"EffNet+{img_size}+augmented={use_augmented_dataset}+{loss_fn}"

    # ---- one full train/test cycle per seed ----
    for i in range(num_seeds):
        # set seed for reproducibility
        pl.seed_everything(base_seed + i)

        # the timestamp is taken per run so that every seed gets its own log directory
        log_dir = root / "logs" / time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())

        datamodule = FaceAgeDataModule(
            data_dir=data_dir,
            img_size=img_size,
            imagenet_normalization=imagenet_normalization,
            use_augmented_dataset=use_augmented_dataset,
            normalize_age_by=age_norm_value,
            num_workers=12,
            batch_size=32,
            pin_memory=False,
        )

        # the same constant is passed to the datamodule (normalize_age_by) and
        # to the model (rescale_age_by), keeping both sides consistent
        model = FaceAgeModule(net=net, rescale_age_by=age_norm_value, loss_fn=loss_fn)

        callbacks = []
        loggers = []

        # this controls how checkpoints are saved
        callbacks.append(
            pl.callbacks.ModelCheckpoint(
                monitor="val/loss",
                dirpath=log_dir / "checkpoints",
                save_top_k=1,  # save the best checkpoint
                save_last=True,  # additionally save the last checkpoint
                mode="min",
                save_weights_only=True,
                filename="best-checkpoint",
            )
        )

        # this configures the optional weights&biases logger
        if use_wandb:
            loggers.append(
                pl.loggers.WandbLogger(
                    project="face-age",
                    save_dir=log_dir,
                    name=exp_name,
                    group=exp_name,
                )
            )

        try:
            # trainer setup
            trainer = pl.Trainer(
                accelerator="gpu",
                default_root_dir=log_dir,
                callbacks=callbacks,
                logger=loggers,
                max_epochs=10,
                val_check_interval=0.2,  # frequency of validation epoch
            )

            # train
            trainer.fit(model=model, datamodule=datamodule)

            # test on the best checkpoint selected by the ModelCheckpoint callback
            trainer.test(model=model, datamodule=datamodule, ckpt_path="best")
        finally:
            # Close the W&B run even when fit/test raises; otherwise the run
            # stays open in the notebook kernel after a failure.
            if use_wandb:
                wandb.finish()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Launch the multi-seed training/testing sweep defined in main() (requires a GPU).
main()

Global seed set to 0
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhobglob[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                  | Params
-------------------------------------------------------
0 | net          | SimpleConvNet_100x100 | 5.1 M 
1 | criterion    | MSELoss               | 0     
2 | train_mae    | MeanAbsoluteError     | 0     
3 | val_mae      | MeanAbsoluteError     | 0     
4 | test_mae     | MeanAbsoluteError     | 0     
5 | train_loss   | MeanMetric            | 0     
6 | val_loss     | MeanMetric            | 0     
7 | test_loss    | MeanMetric            | 0     
8 | val_mae_best | MinMetric             | 0     
-------------------------------------------------------
5.1 M     Trainable params
0         Non-trainable params
5.1 M     Total params
20.502    Total estimated model params size (MB)


Epoch 9: 100%|██████████| 767/767 [00:13<00:00, 56.66it/s, loss=0.0216, v_num=ng35, val/loss=0.0384, val/mae=12.10, val/mae_best=11.00, train/loss=0.0236, train/mae=9.060]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 767/767 [00:13<00:00, 56.65it/s, loss=0.0216, v_num=ng35, val/loss=0.0384, val/mae=12.10, val/mae_best=11.00, train/loss=0.0236, train/mae=9.060]


Restoring states from the checkpoint path at /home/ukasz/Projects/IntroToML/logs/2023-02-04_20-08-03/checkpoints/best-checkpoint.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/ukasz/Projects/IntroToML/logs/2023-02-04_20-08-03/checkpoints/best-checkpoint.ckpt


Testing DataLoader 0: 100%|██████████| 49/49 [00:00<00:00, 97.65it/s] 


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█
test/loss,▁
test/mae,▁
train/loss,█▁▁▁▁▁▁▁▁▁
train/mae,█▄▃▂▂▂▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val/loss,█▄▄▃▅▃▃▂▂▂▃▂▄▂▁▂▂▂▂▂▁▂▂▁▄▂▂▂▁▁▂▃▂▁▁▁▁▂▂▂
val/mae,█▅▄▄▅▃▄▃▃▂▃▂▄▂▂▂▂▂▂▂▁▂▂▁▄▂▂▂▁▁▂▄▃▁▁▁▁▂▂▂
val/mae_best,█▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
test/loss,0.03261
test/mae,11.0122
train/loss,0.0236
train/mae,9.06211
trainer/global_step,5170.0
val/loss,0.03839
val/mae,12.08442
val/mae_best,10.96982


Global seed set to 1


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                  | Params
-------------------------------------------------------
0 | net          | SimpleConvNet_100x100 | 5.1 M 
1 | criterion    | MSELoss               | 0     
2 | train_mae    | MeanAbsoluteError     | 0     
3 | val_mae      | MeanAbsoluteError     | 0     
4 | test_mae     | MeanAbsoluteError     | 0     
5 | train_loss   | MeanMetric            | 0     
6 | val_loss     | MeanMetric            | 0     
7 | test_loss    | MeanMetric            | 0     
8 | val_mae_best | MinMetric             | 0     
-------------------------------------------------------
5.1 M     Trainable params
0         Non-trainable params
5.1 M     Total params
20.502    Total estimated model params size (MB)


Epoch 9: 100%|██████████| 767/767 [00:14<00:00, 52.22it/s, loss=0.0515, v_num=eqsg, val/loss=0.087, val/mae=20.10, val/mae_best=19.80, train/loss=0.0581, train/mae=15.00] 

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 767/767 [00:14<00:00, 52.21it/s, loss=0.0515, v_num=eqsg, val/loss=0.087, val/mae=20.10, val/mae_best=19.80, train/loss=0.0581, train/mae=15.00]


Restoring states from the checkpoint path at /home/ukasz/Projects/IntroToML/logs/2023-02-04_20-10-32/checkpoints/best-checkpoint.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/ukasz/Projects/IntroToML/logs/2023-02-04_20-10-32/checkpoints/best-checkpoint.ckpt


Testing DataLoader 0: 100%|██████████| 49/49 [00:00<00:00, 124.58it/s]


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█
test/loss,▁
test/mae,▁
train/loss,█▁▁▁▁▁▁▁▁▁
train/mae,█▂▁▃▂▃▂▂▂▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val/loss,▂▁▂▃▄▇▃▂▃▂▃▁▄▄▃▂▃▄▃▆█▄█▁▂▆▂▃▂▄▄▆▂▃▂▄▃▃▃▂
val/mae,▂▁▂▃▄▇▃▂▃▂▃▁▄▄▃▂▄▄▃▆█▄█▁▃▆▂▃▃▄▄▆▂▃▂▄▃▃▃▂
val/mae_best,█▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
test/loss,0.07979
test/mae,19.31039
train/loss,0.05812
train/mae,14.97241
trainer/global_step,5170.0
val/loss,0.08703
val/mae,20.1125
val/mae_best,19.81214


Global seed set to 2


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                  | Params
-------------------------------------------------------
0 | net          | SimpleConvNet_100x100 | 5.1 M 
1 | criterion    | MSELoss               | 0     
2 | train_mae    | MeanAbsoluteError     | 0     
3 | val_mae      | MeanAbsoluteError     | 0     
4 | test_mae     | MeanAbsoluteError     | 0     
5 | train_loss   | MeanMetric            | 0     
6 | val_loss     | MeanMetric            | 0     
7 | test_loss    | MeanMetric            | 0     
8 | val_mae_best | MinMetric             | 0     
-------------------------------------------------------
5.1 M     Trainable params
0         Non-trainable params
5.1 M     Total params
20.502    Total estimated model params size (MB)


Epoch 9: 100%|██████████| 767/767 [00:16<00:00, 47.22it/s, loss=0.0571, v_num=6ew9, val/loss=0.0947, val/mae=20.80, val/mae_best=19.90, train/loss=0.0578, train/mae=14.90]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 767/767 [00:16<00:00, 47.21it/s, loss=0.0571, v_num=6ew9, val/loss=0.0947, val/mae=20.80, val/mae_best=19.90, train/loss=0.0578, train/mae=14.90]


Restoring states from the checkpoint path at /home/ukasz/Projects/IntroToML/logs/2023-02-04_20-12-56/checkpoints/best-checkpoint.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/ukasz/Projects/IntroToML/logs/2023-02-04_20-12-56/checkpoints/best-checkpoint.ckpt


Testing DataLoader 0: 100%|██████████| 49/49 [00:00<00:00, 108.29it/s]


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█
test/loss,▁
test/mae,▁
train/loss,█▁▁▁▁▁▁▁▁▁
train/mae,█▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val/loss,▃▃▃▃▄▃▂▄▅▁▅▂▃▂▄▄▆▄▄▂█▃▄▂▆▄▁▃▃▅▂▄▂▁▂▁▃▄▄▄
val/mae,▄▃▃▃▄▃▂▄▅▁▅▂▃▂▄▄▆▄▄▂█▃▄▂▆▄▁▃▃▅▂▄▂▁▂▁▃▄▄▄
val/mae_best,█▇▆▆▆▆▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,10.0
test/loss,0.07999
test/mae,19.33448
train/loss,0.05781
train/mae,14.93408
trainer/global_step,5170.0
val/loss,0.09472
val/mae,20.78312
val/mae_best,19.85258
