In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

# import

In [2]:
# export
from fastai.basic_train import LearnerCallback

In [3]:
# export
from fastai.callbacks.general_sched import *
from fastai.callback import *

In [4]:
# export
from fastai.core import *

In [5]:
# export
from IPython.core import debugger as idb

In [6]:
# export
from exp import nb_resnet_unet
from exp import nb_loss_metrics
from exp import nb_optimizer
from exp import nb_tensorboard_callback
from exp import nb_scheduling_train
from exp import nb_databunch

In [7]:
# export
import torch

In [8]:
# export
from torch import tensor

In [9]:
# export
from torch import nn

In [10]:
# export
from torch.nn import Sequential, ModuleList

In [11]:
# export
from fastai.basic_train import Learner

In [12]:
# export
from fastai.torch_core import bn_types,bias_types

In [13]:
# export
from functools import partial

In [14]:
# export
import os,shutil

In [15]:
# export
from fastai.callbacks import CSVLogger

In [16]:
# export
from fastai.callbacks.tracker import SaveModelCallback

# functions

In [17]:
# export
def txt_write(fh, i, opt, lr, path, csv_fname):
    '''
    Append the summary record of one training run to an open text log.

    fh        -- writable text handle; only its `write` method is used.
    i         -- index of the run, used in the header and in the model file name.
    opt       -- optimizer factory of the run, rendered through an f-string.
    lr        -- learning rate of the run.
    path      -- base directory under which the csv log and the model are reported.
    csv_fname -- csv log name relative to `path`, without the `.csv` suffix.
    '''
    record = (
        '===================================',
        f'run_{i}',
        '-----------------------------------',
        f'--opt_func: {opt}',
        f'--lr: {lr}',
        f'--csv_log: {path}/{csv_fname}.csv',
        f'--best model: {path}/models/run_{i}.pth',
    )
    # One write per line, each terminated by a newline.
    for entry in record:
        fh.write(entry + '\n')

In [18]:
# export
def multi_train(get_learn, epoch_len, epochs, opts, lrs, checkpoints, tb_log_root,autoSave=True):
    '''
    Train one learner per (optimizer, lr, checkpoint) triple and log every run.

    Training can be resumed from a checkpoint; to keep training continuous the
    lr has to be set manually to the value in use when the checkpoint was saved.

    get_learn   -- zero-argument callable returning a fresh learner.
    epoch_len   -- forwarded unchanged to `fit`.
    epochs      -- forwarded unchanged to `fit`.
    opts        -- optimizer factories, one per run (assigned to `learn.opt_func`).
    lrs         -- learning rates, one per run.
    checkpoints -- one entry per run: None to start from the learner as built,
                   otherwise a file path that is opened in binary mode and
                   handed to `learn.load`.
    tb_log_root -- tensorboard log directory. WARNING: it is deleted and
                   recreated on every call.
    autoSave    -- when truthy, a SaveModelCallback monitoring `valid_loss`
                   saves the best model of each run under the name `run_{i}`.

    `opts`, `lrs` and `checkpoints` are zipped, so the number of runs equals
    the length of the shortest of the three.
    '''
    # Start from an empty tensorboard log dir so stale runs do not mix in.
    if os.path.exists(tb_log_root): shutil.rmtree(tb_log_root)
    os.makedirs(tb_log_root)

    os.makedirs('./run_log/', exist_ok=True)
    # The context manager closes the text log even when a run raises.
    with open('./run_log/log.txt',mode='w') as txtlog:
        for i,(opt,lr,checkpoint) in enumerate(zip(opts,lrs,checkpoints)):
            # create a learner
            learn = get_learn()

            # set optimizer
            learn.opt_func = opt

            # load checkpoint
            if checkpoint is not None:
                with open(checkpoint,'rb') as f:
                    learn.load(f)

            # record the run in the txt log
            csv_log_dir = 'csv_log/'
            os.makedirs(learn.path/csv_log_dir, exist_ok=True)
            csv_fname = csv_log_dir+f'run_{i}'
            txt_write(txtlog,i,opt,lr,learn.path,csv_fname)
            # Training can take long; make the record visible on disk right away.
            txtlog.flush()

            # csv logger callback
            callbacks = [CSVLogger(learn,filename=csv_fname)]

            if autoSave:
                # Best-model callback. Kept in its own local so the `autoSave`
                # flag is not overwritten for the following runs.
                save_cb = SaveModelCallback(learn,monitor='valid_loss',mode='min',every='improvement',name=f'run_{i}')
                callbacks.append(save_cb)

            # Tensorboard callback. os.path.join keeps the run directory inside
            # tb_log_root whether or not the root ends with a path separator.
            callbacks.append(get_tbCb(learn,os.path.join(tb_log_root,f'run_{i}')))

            # train
            fit(learn=learn, epoch_len=epoch_len, epochs=epochs, lr=lr, callbacks=callbacks)

In [19]:
# export
def split_model(model):
    '''
    Partition the modules of `model` into two layer groups.

    Returns `[group0, group1]`, both `ModuleList`s:
      group0 -- modules found under `model.down_blocks` (the pretrained part)
                that are instances of `bias_types` but not of `bn_types`.
      group1 -- the `bn_types` modules found under `model.down_blocks`,
                followed by the non-pretrained parts `model.bridge`,
                `model.side_layers`, `model.up_blocks` and `model.head`.
    '''
    pretrained_rest, bn_and_new = ModuleList(), ModuleList()

    # Pretrained layers: batchnorm modules go to the second group, the other
    # bias-carrying modules to the first one; anything else is skipped.
    for layer in model.down_blocks.modules():
        if isinstance(layer, bn_types):
            bn_and_new.append(layer)
            continue
        if isinstance(layer, bias_types):
            pretrained_rest.append(layer)

    # Non-pretrained layers all join the second group.
    fresh_parts = Sequential(model.bridge, model.side_layers, model.up_blocks, model.head)
    for part in fresh_parts.children():
        bn_and_new.append(part)

    return [pretrained_rest, bn_and_new]

In [20]:
# export
def get_learn(data, init_weights='./models/unet_res18_allres_init.pth', device_ids=(0,1,2,3)):
    '''
    Build the learner used for training: model, layer groups, loss and metrics.

    data         -- databunch handed to `Learner`; its `device` decides whether
                    the model is wrapped for multi-GPU training.
    init_weights -- path of the state dict loaded into the freshly built model.
    device_ids   -- GPU indices requested for `DataParallel`. Indices that do
                    not exist on the current machine are dropped, so the same
                    code runs on hosts with fewer GPUs.

    Returns the configured `Learner`.
    '''
    # create model
    # NOTE(review): meaning of the positional args (1, True) is defined in
    # nb_resnet_unet.get_unet_res18 -- confirm there.
    model = nb_resnet_unet.get_unet_res18(1,True)
    # map_location='cpu' lets weights saved from a GPU be read on any host;
    # load_state_dict then copies the values into the model's own parameters.
    model.load_state_dict(torch.load(init_weights, map_location='cpu'))

    # create learner
    learn = Learner(data,model)

    # split model (done before the DataParallel wrap, on the bare model)
    learn.layer_groups = split_model(learn.model)

    # set multi-gpu
    if data.device.type=='cuda':
        n_gpu = torch.cuda.device_count()
        usable_ids = [d for d in device_ids if d < n_gpu]
        # DataParallel rejects device ids that are not present, so only the
        # available ones are passed; with none usable the model stays unwrapped.
        if usable_ids:
            learn.model = torch.nn.DataParallel(learn.model,device_ids=usable_ids)

    # set loss func
#     learn.loss_func = partial(nb_loss_metrics.combo_loss, balance_ratio=1)
#     learn.loss_func = nb_loss_metrics.dice_loss
    learn.loss_func = partial(nb_loss_metrics.balance_bce, balance_ratio=1)

    # add metrics
    learn.metrics += [nb_loss_metrics.dice_loss]
    learn.metrics += [partial(nb_loss_metrics.balance_bce,balance_ratio=1)]
    learn.metrics += [nb_loss_metrics.mask_iou]

    return learn

In [21]:
# export
def get_tbCb(learn,log_dir):
    '''
    Create the tensorboard callback attached to a training run.

    learn   -- learner the callback is bound to.
    log_dir -- directory passed to the callback as its log location.

    Returns an `nb_tensorboard_callback.TensorBoardCallback` configured with
    the fixed plotting options below.
    '''
    # Parameter names passed as `hist_plots`.
    watched_weights = ['down_blocks.2.0.conv1.weight',
                       'up_blocks.2.conv1.conv.weight']
    return nb_tensorboard_callback.TensorBoardCallback(
        learn=learn,
        log_dir=log_dir,
        plot_net=False,
        plot_loss=True,
        metric_plots=['mask_iou'],
        hyper_plots=['lr'],
        hist_plots=watched_weights,
        hist_iters=50,
    )

In [22]:
# export
def fit(learn,epoch_len,epochs,lr,callbacks):
    '''
    Run `nb_scheduling_train.fit_with_warmup_multiAnnealPlat` with the fixed
    schedule settings of this script.

    learn     -- learner to train.
    epoch_len -- forwarded as `epoch_len`.
    epochs    -- forwarded as `num_epoch`.
    lr        -- forwarded as `lr_constant`; `lr_start` is set to `lr/10`.
    callbacks -- callbacks forwarded to the training function.
    '''
    # Warm-up part of the schedule: starts at a tenth of the target lr.
    warmup_cfg = dict(lr_start=lr/10, lr_constant=lr, warmup_iter=10)
    # Settings named after annealing / plateau handling; their exact meaning
    # is defined in nb_scheduling_train.
    anneal_cfg = dict(monitor='train_smooth',
                      worseN_thres=5,
                      annealRate=10,
                      duration_thres=30,
                      annealIte=10,
                      phaseMaxN=3,
                      finetune_stop=1)
    nb_scheduling_train.fit_with_warmup_multiAnnealPlat(
        learn,
        epoch_len=epoch_len,
        num_epoch=epochs,
        **warmup_cfg,
        **anneal_cfg,
        callbacks=callbacks,
    )

# test

In [23]:
# export
# Select the training device: the GPU when CUDA is available, otherwise the
# CPU, so the script does not depend on a GPU being present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
# export
# Folder of the dataset used for this training session.
ds = './data/dataset_20200708'
# Databunch for that dataset, requested with batch size 16 on `device`.
data = nb_databunch.get_databunch(ds, bs=16, device=device)

In [25]:
# export
# Run configuration: entry k of each list describes run k (multi_train zips them).
opts = [
    partial(nb_optimizer.Adam, betas=(0.9,0.99)),  # optimizer factory
]
lrs = [1e-3]          # learning rate
checkpoints = [None]  # None: no checkpoint is loaded before training

In [26]:
# learn = get_learn(data)
# learn.opt_func = partial(nb_optimizer.Adam, betas=(0.9,0.99))

# learn.lr_find()
# learn.recorder.plot()

In [27]:
# export
# Launch all configured runs. get_learn is bound to the databunch so that
# multi_train can call it without arguments to build a fresh learner per run.
# NOTE(review): epoch_len=1e9 is a float and far larger than a plausible
# dataset -- confirm how nb_scheduling_train interprets it.
multi_train(
    get_learn=partial(get_learn, data=data),
    epoch_len=1e9,
    epochs=500,
    opts=opts,
    lrs=lrs,
    checkpoints=checkpoints,
    tb_log_root='./tb_log/',
    autoSave=True,
)

ALLERT: You are using CumtomEpochLength, please make sure that your training dataloader is using random sampler, or this may cause problem.


epoch,train_loss,valid_loss,dice_loss,balance_bce,mask_iou,time
0,0.559014,0.696392,0.716869,0.696392,0.0,00:36
1,0.506591,0.663716,0.696999,0.663716,0.326093,00:31
2,0.484401,0.613259,0.664939,0.613259,0.385476,00:30
3,0.463973,0.537986,0.605159,0.537986,0.399828,00:31
4,0.439107,0.459481,0.538257,0.459481,0.46973,00:31
5,0.41274,0.410764,0.482823,0.410764,0.442997,00:31
6,0.37671,0.264881,0.315373,0.264881,0.708487,00:30
7,0.346964,0.288355,0.329674,0.288355,0.593468,00:31
8,0.317036,0.269857,0.285794,0.269857,0.684629,00:31
9,0.287659,0.19084,0.189843,0.19084,0.735,00:31


Better model found at epoch 0 with valid_loss value: 0.6963922381401062.
Better model found at epoch 1 with valid_loss value: 0.663715660572052.
Better model found at epoch 2 with valid_loss value: 0.613258957862854.
Better model found at epoch 3 with valid_loss value: 0.5379860997200012.
Better model found at epoch 4 with valid_loss value: 0.45948106050491333.
Better model found at epoch 5 with valid_loss value: 0.4107644557952881.
Better model found at epoch 6 with valid_loss value: 0.26488083600997925.
Better model found at epoch 9 with valid_loss value: 0.1908400058746338.
Better model found at epoch 13 with valid_loss value: 0.1696300208568573.
Better model found at epoch 14 with valid_loss value: 0.16637985408306122.
Better model found at epoch 15 with valid_loss value: 0.15656128525733948.
Better model found at epoch 16 with valid_loss value: 0.11723385006189346.
Better model found at epoch 25 with valid_loss value: 0.1056334599852562.
Better model found at epoch 30 with valid_l

# export

In [29]:
!python notebook2script.py --fname 'train_script_logger.ipynb' --outputDir './'

Converted train_script_logger.ipynb to nb_train_script_logger.py
