In [1]:
from typing import Callable, List, Tuple
import collections
import os
import torch
import catalyst

from catalyst.dl import utils
from torch import nn

from catalyst.contrib.nn import DiceLoss, IoULoss
from catalyst.dl import SupervisedRunner
import segmentation_models_pytorch as smp

from pathlib import Path
from models.unets import unet_resnet
from dataflow.dataloaders import get_train_val_loaders
from dataflow.visualisations import tensor_to_rgb
from dataflow.transforms import get_train_augmentation, get_validation_augmentation, prepare_batch_fp32, get_preprocessing

import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

  from pandas import Panel

Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.



In [2]:
data_dir = Path('/media/wwymak/Storage/spacenet/AOI_3_Paris_Train')
image_dir = data_dir / 'RGB-PanSharpen'
mask_dir = data_dir / 'masks'
summary_data_filepath = data_dir / 'summaryData' / 'AOI_3_Paris_Train_Building_Solutions.csv'

In [3]:
debug=False
batch_size = 8 
val_batch_size = batch_size * 2
num_workers = 12
val_interval = 3
accumulation_steps = 4

train_loader, val_loader, train_eval_loader = get_train_val_loaders(
    image_dir=image_dir,
    mask_dir=mask_dir,
    summary_data_filepath=summary_data_filepath,
    train_transforms=get_train_augmentation(512),
    val_transforms=get_validation_augmentation(512),
    train_ratio=0.8,
    batch_size=batch_size,
    num_workers=num_workers,
    limit_train_num_samples=100 if debug else None,
    limit_val_num_samples=100 if debug else None,
)

loaders = collections.OrderedDict()
loaders["train"] = train_loader
loaders["valid"] = val_loader

(17148, 4)
(16633, 4)


In [4]:
def report_checkpoint(checkpoint):
    """
    Print checkpoint metrics and epoch number
    :param checkpoint:
    """
    print("Epoch          :", checkpoint["epoch"])

    skip_fields = [
        "_base/lr",
        "_base/momentum",
        "_timers/data_time",
        "_timers/model_time",
        "_timers/batch_time",
        "_timers/_fps",
    ]
    print(
        "Metrics (Train):", [(k, v) for k, v, in checkpoint["epoch_metrics"].items() if k.startswith('train')]
    )
    print(
        "Metrics (Valid):", [(k, v) for k, v, in checkpoint["epoch_metrics"].items() if k.startswith('valid')]
    )

In [5]:
model = smp.Unet(encoder_name="efficientnet-b0", classes=1)

best_checkpoint = data_dir / "logs" /"unet_resnet" / "checkpoints"/"best_full.pth"
checkpoint = utils.load_checkpoint(best_checkpoint)


In [6]:

criterion = {
    "dice": DiceLoss(),
    "iou": IoULoss(),
    "bce": nn.BCEWithLogitsLoss()
}

from torch import optim

from catalyst.contrib.nn import RAdam, Lookahead

learning_rate = 0.001
encoder_learning_rate = 0.005

# model, preprocessing_function = unet_resnet('efficientnet-b0')
# Since we use a pre-trained encoder, we will reduce the learning rate on it.
layerwise_params = {"encoder*": dict(lr=encoder_learning_rate, weight_decay=0.00003)}

# This function removes weight_decay for biases and applies our layerwise_params
model_params = utils.process_model_params(model, layerwise_params=layerwise_params)

# Catalyst has new SOTA optimizers out of box
base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003)
optimizer = Lookahead(base_optimizer)

# scheduler = optim.lr_scheduler.CyclicLR(optimizer, factor=0.25, patience=2)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=[x['lr'] * 10 for x in model_params], steps_per_epoch=len(train_loader), epochs=50)

In [7]:
checkpoint.keys()

dict_keys(['epoch_metrics', 'valid_metrics', 'stage_name', 'epoch', 'loader_name', 'loader_step', 'global_epoch', 'checkpoint_data', 'main_metric', 'minimize_metric', 'valid_loader', 'model_state_dict', 'criterion_dice_state_dict', 'criterion_iou_state_dict', 'criterion_bce_state_dict', 'optimizer_state_dict', 'scheduler_state_dict'])

In [12]:
# utils.unpack_checkpoint(checkpoint, model=model,criterion=criterion, optimizer=optimizer, scheduler=scheduler)
device='cuda:0'
checkpoint = torch.load(best_checkpoint, map_location='cuda:0')
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
epoch = checkpoint['epoch']
model.train()

Unet(
  (encoder): EfficientNetEncoder(
    (_conv_stem): Conv2dStaticSamePadding(
      3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
      (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
    )
    (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
    (_blocks): ModuleList(
      (0): MBConvBlock(
        (_depthwise_conv): Conv2dStaticSamePadding(
          32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
          (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
        )
        (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
        (_se_reduce): Conv2dStaticSamePadding(
          32, 8, kernel_size=(1, 1), stride=(1, 1)
          (static_padding): Identity()
        )
        (_se_expand): Conv2dStaticSamePadding(
          8, 32, kernel_size=(1, 1), stride=(1, 1)
          (static_padding): Identity()
        )
        (_

In [15]:
num_epochs = 50
logdir = data_dir /"logs"/"unet_resnet"/"resume_training"

device = utils.get_device()
runner = SupervisedRunner(device=device, input_key="image", input_target_key="mask")

In [16]:
%load_ext tensorboard.notebook
%tensorboard --logdir {logdir}

The tensorboard.notebook extension is already loaded. To reload it, use:
  %reload_ext tensorboard.notebook


Reusing TensorBoard on port 6006 (pid 28932), started 0:21:20 ago. (Use '!kill 28932' to kill it.)

In [17]:
from catalyst.dl.callbacks import DiceCallback, IouCallback, \
  CriterionCallback, MetricAggregationCallback

callbacks = [
    # Each criterion is calculated separately.
    CriterionCallback(
        input_key="mask",
        prefix="loss_dice",
        criterion_key="dice"
    ),
    CriterionCallback(
        input_key="mask",
        prefix="loss_iou",
        criterion_key="iou"
    ),
    CriterionCallback(
        input_key="mask",
        prefix="loss_bce",
        criterion_key="bce"
    ),

    # And only then we aggregate everything into one loss.
    MetricAggregationCallback(
        prefix="loss",
        mode="weighted_sum", # can be "sum", "weighted_sum" or "mean"
        # because we want weighted sum, we need to add scale for each loss
        metrics={"loss_dice": 1.0, "loss_iou": 1.0, "loss_bce": 0.8},
    ),

    # metrics
    DiceCallback(input_key="mask"),
    IouCallback(input_key="mask"),
]


runner.train(
    model=model.cuda(),
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    # our dataloaders
    loaders=loaders,
    # We can specify the callbacks list for the experiment;
    callbacks=callbacks,
    # path to save logs
    logdir=logdir,
    num_epochs=num_epochs,
    # save our best checkpoint by IoU metric
    main_metric="iou",
    # IoU needs to be maximized.
    minimize_metric=False,
    # for FP16. It uses the variable from the very first cell
#     fp16=fp16_params,
    # prints train logs
    verbose=True,
)

1/50 * Epoch (train):   2% 1/63 [00:01<01:36,  1.56s/it, dice=0.806, iou=0.675, loss=0.637, loss_bce=0.147, loss_dice=0.194, loss_iou=0.325]


This overload of add is deprecated:
	add(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add(Tensor other, *, Number alpha)



1/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.40it/s, dice=0.809, iou=0.679, loss=0.659, loss_bce=0.184, loss_dice=0.191, loss_iou=0.321]
1/50 * Epoch (valid): 100% 16/16 [00:05<00:00,  3.18it/s, dice=0.826, iou=0.704, loss=0.593, loss_bce=0.155, loss_dice=0.174, loss_iou=0.296]
[2020-07-30 22:22:10,966] 
1/50 * Epoch 1 (_base): lr=0.0002 | momentum=0.9494
1/50 * Epoch 1 (train): dice=0.7981 | iou=0.6649 | loss=0.6675 | loss_bce=0.1631 | loss_dice=0.2019 | loss_iou=0.3351
1/50 * Epoch 1 (valid): dice=0.8197 | iou=0.6948 | loss=0.5913 | loss_bce=0.1322 | loss_dice=0.1803 | loss_iou=0.3052



To get the last learning rate computed by the scheduler, please use `get_last_lr()`.



2/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.46it/s, dice=0.776, iou=0.634, loss=0.787, loss_bce=0.246, loss_dice=0.224, loss_iou=0.366]
2/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  8.66it/s, dice=0.825, iou=0.703, loss=0.592, loss_bce=0.150, loss_dice=0.175, loss_iou=0.297]
[2020-07-30 22:22:31,350] 
2/50 * Epoch 2 (_base): lr=0.0002 | momentum=0.9493
2/50 * Epoch 2 (train): dice=0.7992 | iou=0.6664 | loss=0.6663 | loss_bce=0.1649 | loss_dice=0.2008 | loss_iou=0.3336
2/50 * Epoch 2 (valid): dice=0.8222 | iou=0.6983 | loss=0.5792 | loss_bce=0.1247 | loss_dice=0.1778 | loss_iou=0.3017
3/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.47it/s, dice=0.786, iou=0.648, loss=0.673, loss_bce=0.133, loss_dice=0.214, loss_iou=0.352]
3/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  8.80it/s, dice=0.822, iou=0.698, loss=0.606, loss_bce=0.158, loss_dice=0.178, loss_iou=0.302]
[2020-07-30 22:22:53,516] 
3/50 * Epoch 3 (_base): lr=0.0002 | momentum=0.9493
3/50 * Epoch 3 (train): dice=0.7975 | 

16/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.40it/s, dice=0.807, iou=0.677, loss=0.619, loss_bce=0.129, loss_dice=0.193, loss_iou=0.323]
16/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  8.56it/s, dice=0.825, iou=0.702, loss=0.602, loss_bce=0.162, loss_dice=0.175, loss_iou=0.298]
[2020-07-30 22:27:34,722] 
16/50 * Epoch 16 (_base): lr=0.0003 | momentum=0.9489
16/50 * Epoch 16 (train): dice=0.8143 | iou=0.6876 | loss=0.6207 | loss_bce=0.1533 | loss_dice=0.1857 | loss_iou=0.3124
16/50 * Epoch 16 (valid): dice=0.8243 | iou=0.7014 | loss=0.5814 | loss_bce=0.1339 | loss_dice=0.1757 | loss_iou=0.2986
17/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.40it/s, dice=0.800, iou=0.666, loss=0.648, loss_bce=0.143, loss_dice=0.200, loss_iou=0.334]
17/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  8.64it/s, dice=0.828, iou=0.706, loss=0.590, loss_bce=0.156, loss_dice=0.172, loss_iou=0.294]
[2020-07-30 22:27:56,333] 
17/50 * Epoch 17 (_base): lr=0.0003 | momentum=0.9489
17/50 * Epoch 17 (train): 

30/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.42it/s, dice=0.782, iou=0.641, loss=0.677, loss_bce=0.124, loss_dice=0.218, loss_iou=0.359]
30/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  8.61it/s, dice=0.830, iou=0.709, loss=0.586, loss_bce=0.157, loss_dice=0.170, loss_iou=0.291]
[2020-07-30 22:32:39,963] 
30/50 * Epoch 30 (_base): lr=0.0003 | momentum=0.9484
30/50 * Epoch 30 (train): dice=0.8217 | iou=0.6981 | loss=0.5975 | loss_bce=0.1467 | loss_dice=0.1783 | loss_iou=0.3019
30/50 * Epoch 30 (valid): dice=0.8298 | iou=0.7094 | loss=0.5653 | loss_bce=0.1305 | loss_dice=0.1702 | loss_iou=0.2906
31/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.40it/s, dice=0.869, iou=0.768, loss=0.486, loss_bce=0.153, loss_dice=0.131, loss_iou=0.232]
31/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  8.60it/s, dice=0.830, iou=0.709, loss=0.591, loss_bce=0.163, loss_dice=0.170, loss_iou=0.291]
[2020-07-30 22:33:02,551] 
31/50 * Epoch 31 (_base): lr=0.0003 | momentum=0.9483
31/50 * Epoch 31 (train): 

44/50 * Epoch (train): 100% 63/63 [00:18<00:00,  3.38it/s, dice=0.873, iou=0.775, loss=0.473, loss_bce=0.151, loss_dice=0.127, loss_iou=0.225]
44/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  8.71it/s, dice=0.835, iou=0.717, loss=0.571, loss_bce=0.154, loss_dice=0.165, loss_iou=0.283]
[2020-07-30 22:37:46,369] 
44/50 * Epoch 44 (_base): lr=0.0003 | momentum=0.9477
44/50 * Epoch 44 (train): dice=0.8264 | iou=0.7051 | loss=0.5824 | loss_bce=0.1424 | loss_dice=0.1736 | loss_iou=0.2949
44/50 * Epoch 44 (valid): dice=0.8297 | iou=0.7094 | loss=0.5660 | loss_bce=0.1314 | loss_dice=0.1703 | loss_iou=0.2906
45/50 * Epoch (train): 100% 63/63 [00:17<00:00,  3.61it/s, dice=0.794, iou=0.659, loss=0.701, loss_bce=0.193, loss_dice=0.206, loss_iou=0.341]
45/50 * Epoch (valid): 100% 16/16 [00:01<00:00,  9.04it/s, dice=0.836, iou=0.718, loss=0.572, loss_bce=0.157, loss_dice=0.164, loss_iou=0.282]
[2020-07-30 22:38:07,434] 
45/50 * Epoch 45 (_base): lr=0.0003 | momentum=0.9477
45/50 * Epoch 45 (train): 

In [25]:
scheduler

<torch.optim.lr_scheduler.OneCycleLR at 0x7f0f422d8910>