# BESNet

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
from datetime import datetime
import sys
import gc
sys.path.append('../../')

from sklearn.model_selection import KFold
from tqdm import tqdm

from dependencies import *
from settings import *
from reproducibility import *
from models.TGS_salt.BesNet import BesNet as Net

Using paths on kail-main

Importing numerical libraries...
Importing standard libraries...
Importing miscellaneous functions...
Importing constants...
Importing Neural Network dependencies...
	PyTorch
	Keras
	TensorFlow
	Metrics, Losses and LR Schedulers
	Kaggle Metrics
	Image augmentations
	Datasets
Importing external libraries...
	Lovasz Losses (elu+1)

Fixing random seed for reproducibility...
	Setting random seed to 35202.

Setting CUDA environment...
	torch.__version__              = 1.1.0
	torch.version.cuda             = 9.0.176
	torch.backends.cudnn.version() = 7501
	os['CUDA_VISIBLE_DEVICES']     = 0,1
	torch.cuda.device_count()      = 2



In [2]:
SIZE = 256
FACTOR = SIZE
ne = "ne"
initial_checkpoint = None
MODEL = "ResNet34"

batch_size = 8
n_acc = 256 / batch_size
nfolds = 4

noise_th = 75.0*(SIZE/128.0)**2 #threshold for the number of predicted pixels
best_thr0 = 0.2 #preliminary value of the threshold for metric calculation

data_root = '../../data/siim-pneumothorax'
torch.cuda.set_device('cuda:0')

In [3]:
def time_to_str(time, str):
    #if str == 'min':
    #	    return str(round(float(time)/60,5))+" min(s)"
    return round(time,4)

In [4]:
#TODO: Instead of directly printing to stdout, copy it into a txt file
class Logger():
    def __init__(self,name=MODEL+ne):
        super().__init__()
        self.model=name
        #if OHEM != "OHEM":
        #    self.model=MODEL+ne[ne.find("_")+1:]
        self.file = open(self.model+"_bes_log.txt","w+")
        self.file.close()
        
        self.debug_file = open(self.model + '_bes_debug.txt', 'w+')
        self.debug_file.close()
    def write(self, str):
        print(str)
        self.file = open(self.model+"_bes_log.txt","a+")
        self.file.write(str)
        self.file.close()
    def write2(self, str):
        print(str, end='',flush=True)
        self.file = open(self.model+"_bes_log.txt","a+")
        self.file.write(str)
        self.file.close()
    def debug(self, str):
        self.debug_file = open(self.model + '_bes_debug.txt', 'a+')
        self.debug_file.write(str)
        self.debug_file.close()
    def stop():
        self.file.close()
        self.debug_file.close()
        
log = Logger()

In [5]:
def valid_augment(image,mask,index):
    cache = Struct(image = image.copy(), mask = mask.copy())
    # image, mask = do_resize2(image, mask, SIZE, SIZE)
    # image, mask = do_center_pad_to_factor2(image, mask, factor = FACTOR)
    return image,mask,index,cache

def train_augment(image,mask,index):
    cache = Struct(image = image.copy(), mask = mask.copy())

    if np.random.rand() < 0.5:
         image, mask = do_horizontal_flip2(image, mask)
         pass

    if np.random.rand() < 0.2:
        c = np.random.choice(4)
        if c==0:
            image, mask = do_random_shift_scale_crop_pad2(image, mask, 0.1) #0.125

        if c==1:
            image, mask = do_horizontal_shear2( image, mask, dx=np.random.uniform(-0.02,0.02) )
            pass

        if c==2:
            image, mask = do_shift_scale_rotate2( image, mask, dx=0, dy=0, scale=1, angle=np.random.uniform(0,15))  #10

        if c==3:
            image, mask = do_elastic_transform2(image, mask, grid=10, distort=np.random.uniform(0,0.05))#0.10
            pass
    if np.random.rand() < 0.1:
        c = np.random.choice(3)
        if c==0:
            image = do_brightness_shift(image,np.random.uniform(-0.1,+0.1))
        if c==1:
            image = do_brightness_multiply(image,np.random.uniform(1-0.08,1+0.08))
        if c==2:
            image = do_gamma(image,np.random.uniform(1-0.08,1+0.08))
        # if c==1:
        #     image = do_invert_intensity(image)

    # image, mask = do_resize2(image, mask, SIZE, SIZE)
    # image, mask = do_center_pad_to_factor2(image, mask, factor = FACTOR)
    return image,mask,index,cache

In [6]:
def null_augment(image, mask, index):
    cache = Struct(image = image.copy(), mask = mask.copy())
    return image, mask, index, cache

def null_collate(batch):

    batch_size = len(batch)
    cache = []
    input = []
    truth = []
    index = []
    for b in range(batch_size):
        input.append(batch[b][0])
        truth.append(batch[b][1])
        index.append(batch[b][2])
        cache.append(batch[b][3])
    input = torch.from_numpy(np.array(input)).float().unsqueeze(1)

    if truth[0]!=[]:
        truth = torch.from_numpy(np.array(truth)).float().unsqueeze(1)

    return input, truth, index, cache

def get_weights_for_balanced_classes(cls_list, num_classes):
    # get count per class
    count = [0] * num_classes
    
    for cls in cls_list:
        count[cls] += 1

    # get weight per class
    weight_per_class = [0.] * num_classes
    N = float(len(cls_list))
    
    for i in range(num_classes):
        weight_per_class[i] = N / float(count[i])
        
    #　get weight per sample
    weights = [0] * len(cls_list)
    
    for i, cls in enumerate(cls_list):
        weights[i] = weight_per_class[cls]
        
    return weights

def get_boundary(masks):
    mask_arr = (masks.cpu().numpy() * 255).astype(np.uint8).squeeze()
    b_arr = []
    
    for mask in mask_arr:
        b_img = np.zeros(mask.shape)
        
        contours, hier = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
        cv2.drawContours(b_img, contours, -1, 255, 1)
        
        b_arr.append(b_img)
        
    b_arr = np.stack(b_arr)
    
    return torch.from_numpy(b_arr)

class SIIMDataset(Dataset):
    def __init__(self, data_root, fold, pos_neg_ratio=0.5, width=1024, height=1024, phase='train', augment=null_augment, random_state=2019, nfolds=4):
        self.data_root = data_root
        self.fold = fold
        self.height = width
        self.width = height
        self.phase = phase
        self.augment = augment
        
        kf = KFold(n_splits=nfolds, shuffle=True, random_state=random_state)
        train_list = os.listdir(os.path.join(data_root, 'train_png'))
        
        if phase == 'train':
            index_list = list(kf.split(list(range(len(train_list)))))[fold][0]
            self.filenames = [train_list[i] for i in index_list]
            
            # read masks for pos/neg ratio sampler
            train_df = pd.read_csv(os.path.join(self.data_root, 'train-rle.csv'))
            pos_ids = list(train_df[train_df[' EncodedPixels']!=' -1']['ImageId'])

            self.cls_list = [1 if filename.split('.png')[0] in pos_ids else 0 for filename in self.filenames]

        elif phase == 'val':
            index_list = list(kf.split(list(range(len(train_list)))))[fold][1]
            self.filenames = [train_list[i] for i in index_list]
        else: # test
            self.filenames = os.listdir(os.path.join(data_root, 'test_png'))

    def __getitem__(self, index):
        img_path = os.path.join(self.data_root, 'train_png/{}'.format(self.filenames[index]))
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.
        img = cv2.resize(img, (self.width, self.height), interpolation = cv2.INTER_AREA)
        
        if self.phase == 'test':
            mask = []
        else: # train and val
            mask_path = os.path.join(self.data_root, 'mask_png/{}'.format(self.filenames[index]))
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE).astype(np.float32) / 255.
            mask = cv2.resize(mask, (self.width, self.height), interpolation = cv2.INTER_AREA)
        
        return self.augment(img, mask, index)
    
    def __len__(self):
        return len(self.filenames)


In [7]:
def validation( net, valid_loader ):

    valid_num  = 0
    valid_loss = np.zeros(3, np.float32)
    
    logits = []
    truths = []
    for input, truth, index, cache in valid_loader:
        input = input.cuda()
        truth = truth.cuda()
        
        with torch.no_grad():
            b_masks = get_boundary(truth).float().cuda() / 255.
            
            m_logit, b_logit = net(input) #data_parallel(net,input)
            
            b_loss = net.boundary_criterion(b_logit, b_masks)
            m_loss = net.mask_criterion(m_logit, b_logit, truth, b_masks, alpha=5., beta=0.2)
            loss = b_loss + m_loss
            loss = loss.mean()
            
            dice  = net.metric(m_logit, truth, noise_th=0, threshold=0, logger=log)
            
            logits.append(m_logit.cpu())
            truths.append(truth.cpu())

        batch_size = len(index)
        valid_loss += batch_size * np.array(( loss.item(), dice.item(), 0))
        valid_num += batch_size
        
    valid_loss /= valid_num
    
    # find out optimal thr and dice
    log.debug('\nscan\n')
    logits = torch.cat(logits, dim=0)
    truths = torch.cat(truths, dim=0)
    
    gc.collect()
    torch.cuda.empty_cache()
    
    thrs = np.arange(0.05, 1, 0.05)
    
    th_dices = []
    for th in thrs:
        th_dice = net.metric(logits, truths, noise_th=0, threshold=th, logger=log)
        th_dices.append(th_dice)
        
    th_dices = np.array(th_dices)
    best_dice = th_dices.max()
    best_thr = thrs[th_dices.argmax()]
    
    valid_loss[1] = best_dice
    valid_loss[2] = best_thr
    
    gc.collect()
    torch.cuda.empty_cache()
        
    return valid_loss

In [8]:
def freeze(net):
    for p in net.conv1.parameters():
        p.requires_grad = False
        
    for p in net.encoder2.parameters():
        p.requires_grad = False
        
    for p in net.encoder3.parameters():
        p.requires_grad = False
        
    for p in net.encoder4.parameters():
        p.requires_grad = False
        
    for p in net.encoder5.parameters():
        p.requires_grad = False
        
    for p in net.center.parameters():
        p.requires_grad = False
        
    for p in net.decoder5.parameters():
        p.requires_grad = False
        
    for p in net.decoder4.parameters():
        p.requires_grad = False
        
    for p in net.decoder3.parameters():
        p.requires_grad = False
        
    for p in net.decoder2.parameters():
        p.requires_grad = False
        
    for p in net.decoder1.parameters():
        p.requires_grad = False
        
def unfreeze(net):
    for p in net.conv1.parameters():
        p.requires_grad = True
        
    for p in net.encoder2.parameters():
        p.requires_grad = True
        
    for p in net.encoder3.parameters():
        p.requires_grad = True
        
    for p in net.encoder4.parameters():
        p.requires_grad = True
        
    for p in net.encoder5.parameters():
        p.requires_grad = True
        
    for p in net.center.parameters():
        p.requires_grad = True
        
    for p in net.decoder5.parameters():
        p.requires_grad = True
        
    for p in net.decoder4.parameters():
        p.requires_grad = True
        
    for p in net.decoder3.parameters():
        p.requires_grad = True
        
    for p in net.decoder2.parameters():
        p.requires_grad = True
        
    for p in net.decoder1.parameters():
        p.requires_grad = True

def cosine_annealing_scheduler(num_iter, lr_init, lr_min):
    scheduler = lambda x: ((lr_init-lr_min)/2)*(np.cos(PI*(np.mod(x,num_iter)/(num_iter)))+1)+lr_min
    return scheduler
        
def set_BN_momentum(model, momentum=0.1*batch_size/64):
    for i, (name, layer) in enumerate(model.named_modules()):
        if isinstance(layer, nn.BatchNorm2d) or isinstance(layer, nn.BatchNorm1d):
            layer.momentum = momentum
            
def fit_one_cycle(epochs, net, train_loader, val_loader, lr_init=0.001, lr_min=0.000001, OHEM=''):
    # init learner
    iter_per_epoch = len(train_loader)
    num_iter = iter_per_epoch * epochs
    iter_smooth = 20
    iter_log    = 100
    iter_valid  = iter_per_epoch
    #iter_valid = 100
    
    #scheduler = None
    scheduler = cosine_annealing_scheduler(num_iter, lr_init, lr_min)
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
          lr=lr_init, momentum=0.9, weight_decay=0.0001
    )
    set_BN_momentum(net)
    
    start_iter = 0
    start_epoch= 0
    train_loss  = np.zeros(6,np.float32)
    valid_loss  = np.zeros(6,np.float32)
    batch_loss  = np.zeros(6,np.float32)
    rate = 0
    iter = 0
    epoch = 0
    
    #debug
    if 0: #debug  ##-------------------------------
        debug_num = 2
        debug_count = 0
        
        for input, truth, index, cache in train_loader:
            images = input.cpu().data.numpy().squeeze()
            masks  = truth.cpu().data.numpy().squeeze()
            
            batch_size = len(index)
            for b in range(batch_size):
                image = images[b]*255
                image = np.dstack([image,image,image])

                mask = masks[b]
                print(np.max(mask))
                
                # Plot some samples
                fig, (ax0, ax1, ax2) = plt.subplots(ncols=3, figsize=(12, 4))
                ax0.imshow(image.astype(np.uint8))
                ax1.imshow(mask, vmin=0, vmax=1)
                ax1.set_title('Targets')
                
                plt.show()
                
            debug_count += 1
            if debug_count > debug_num:
                break
    #--------------------------------------
    
    start = timer()
    while iter < num_iter:  # loop over the dataset multiple times
        sum_train_loss = np.zeros(6,np.float32)
        sum = 0

        log.write('\n rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          \n')
        log.write('-------------------------------------------------------------------------------------------------------------------------------\n')
            
        for input, truth, index, cache in train_loader:
            # validation
            if (iter + 1) % iter_valid == 0:
                log.debug('\nval\n')
                net.set_mode('valid')
                valid_loss = validation(net, val_loader)

                net.set_mode('train')
                log.debug('\ntrain\n')
                time.sleep(0.01)
            
            if scheduler is not None:
                lr = scheduler(iter)
                if lr<0 : break
                adjust_learning_rate(optimizer, lr)
                rate = get_learning_rate(optimizer)
            
            # ok, train
            net.set_mode('train')

            input = input.cuda()
            truth = truth.cuda()
            
            b_masks = get_boundary(truth).float().cuda() / 255.

            m_logit, b_logit = net(input) #data_parallel(net,input)

            b_loss = net.boundary_criterion(b_logit, b_masks)
            m_loss = net.mask_criterion(m_logit, b_logit, truth, b_masks, alpha=5., beta=0.2)
            # print('b_loss: {}, m_loss: {}'.format(b_loss.mean(), m_loss.mean()))
            loss = (b_loss + m_loss).sum()
                
            dice = net.metric(m_logit, truth, noise_th=0, threshold=best_thr0, logger=log)
            
            # learn with grad acc
            loss /= n_acc
            loss.backward()
            
            if ((iter + 1) % n_acc) == 0:
                optimizer.step()
                optimizer.zero_grad()
                # torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
            
            # print statistics  ------------
            batch_loss = np.array((
                           loss.item(),
                           dice.item(),
                           0, 0, 0, 0,
                         ))
            sum_train_loss += batch_loss
            sum += 1
            if iter%iter_smooth == 0:
                train_loss = sum_train_loss/sum
                sum_train_loss = np.zeros(6,np.float32)
                sum = 0

            log.write2('\r%0.4f  %5.1f  %6.1f  |  %0.3f  %0.3f  (%0.3f) |  %0.3f  %0.3f  |  %0.3f  %0.3f  | %s ' % (\
                         rate, iter/iter_per_epoch, epoch+1,
                         valid_loss[0], valid_loss[1], valid_loss[2],
                         train_loss[0], train_loss[1],
                         batch_loss[0], batch_loss[1],
                         time_to_str((timer() - start), 'min')))
            
            iter += 1
            epoch = iter // iter_per_epoch

In [9]:
def get_dataloaders(data_root, batch_size, fold, nfolds=4, width=1024, height=1024, train_augment=null_augment, val_augment=null_augment, random_state=SEED):
    train_dataset = SIIMDataset(
        data_root,
        fold,
        width=width, height=height,
        phase='train',
        augment=train_augment,
        random_state=random_state,
        nfolds=nfolds
    )
    
    weights = get_weights_for_balanced_classes(train_dataset.cls_list, 2)
    weights = torch.DoubleTensor(weights)
    balance_sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

    train_loader  = DataLoader(
        train_dataset,
        # sampler     = RandomSampler(train_dataset),
        sampler = balance_sampler,
        batch_size  = batch_size,
        drop_last   = True,
        num_workers = 8,
        pin_memory  = True,
        collate_fn  = null_collate
    )

    val_dataset = SIIMDataset(
        data_root,
        fold,
        width=width, height=height,
        phase='val',
        augment=val_augment,
        random_state=random_state,
        nfolds=nfolds
    )

    val_loader  = DataLoader(
        val_dataset,
        sampler     = RandomSampler(val_dataset),
        batch_size  = batch_size,
        drop_last   = False,
        num_workers = 8,
        pin_memory  = True,
        collate_fn  = null_collate
    )
    
    return train_loader, val_loader

## Train

In [10]:
# one fold test!

train_loader, val_loader = get_dataloaders(
    data_root,
    batch_size,
    0, nfolds=10,
    width=SIZE, height=SIZE,
    train_augment=train_augment, val_augment=valid_augment,
    random_state=SEED
)

net = Net().cuda()

lr = 0.01

In [11]:
# warm up
freeze(net)
fit_one_cycle(
    2, net,
    train_loader, val_loader,
    lr_init=lr, lr_min=lr/100,
    OHEM='WBCE'
)


 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0093    1.0     1.0  |  0.000  0.000  (0.000) |  0.030  0.021  |  0.028  0.017  | 217.4904 



0.0093    1.0     1.0  |  0.873  0.648  (0.950) |  0.030  0.021  |  0.031  0.015  | 241.4792 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0075    2.0     2.0  |  0.873  0.648  (0.950) |  0.027  0.028  |  0.035  0.021  | 458.7327 



0.0075    2.0     2.0  |  0.810  0.679  (0.950) |  0.027  0.028  |  0.030  0.032  | 482.6988 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0051    3.0     3.0  |  0.810  0.679  (0.950) |  0.026  0.021  |  0.024  0.049  | 699.0729 



0.0051    3.0     3.0  |  0.780  0.631  (0.950) |  0.026  0.021  |  0.026  0.037  | 723.5851 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0026    4.0     4.0  |  0.780  0.631  (0.950) |  0.027  0.036  |  0.034  0.021  | 940.1667 



0.0026    4.0     4.0  |  0.775  0.616  (0.950) |  0.027  0.036  |  0.031  0.068  | 964.0539 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0008    5.0     5.0  |  0.775  0.616  (0.950) |  0.026  0.035  |  0.023  0.004  | 1180.609 



0.0008    5.0     5.0  |  0.770  0.634  (0.950) |  0.026  0.035  |  0.022  0.036  | 1204.5911 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0001    6.0     6.0  |  0.770  0.634  (0.950) |  0.027  0.030  |  0.033  0.026  | 1421.7164 



0.0001    6.0     6.0  |  0.754  0.633  (0.950) |  0.027  0.030  |  0.023  0.025  | 1445.6355 

In [12]:
torch.save(net.state_dict(), 'bes-cp-1.pth')

In [11]:
net.load_state_dict(torch.load('bes-cp-1.pth'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [None]:
# ok, train
unfreeze(net)
fit_one_cycle(
    50, net,
    train_loader, val_loader,
    lr_init=lr/2, lr_min=lr/120,
    OHEM='WBCE'
)


 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0050    1.0     1.0  |  0.000  0.000  (0.000) |  0.018  0.048  |  0.017  0.059  | 443.7399 



0.0050    1.0     1.0  |  0.486  0.453  (0.950) |  0.018  0.048  |  0.017  0.021  | 469.595 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0050    2.0     2.0  |  0.486  0.453  (0.950) |  0.013  0.060  |  0.012  0.032  | 911.5028 



0.0050    2.0     2.0  |  0.406  0.559  (0.950) |  0.013  0.060  |  0.018  0.047  | 936.3455 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0050    3.0     3.0  |  0.406  0.559  (0.950) |  0.013  0.077  |  0.013  0.048  | 1378.0488 



0.0050    3.0     3.0  |  0.367  0.568  (0.950) |  0.013  0.077  |  0.023  0.035  | 1401.659 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0049    4.0     4.0  |  0.367  0.568  (0.950) |  0.011  0.106  |  0.014  0.041  | 1842.7547 



0.0049    4.0     4.0  |  0.347  0.556  (0.950) |  0.011  0.106  |  0.012  0.181  | 1866.3788 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0049    5.0     5.0  |  0.347  0.556  (0.950) |  0.010  0.101  |  0.008  0.089  | 2307.4074 



0.0049    5.0     5.0  |  0.383  0.618  (0.950) |  0.010  0.101  |  0.011  0.235  | 2330.4381 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0048    6.0     6.0  |  0.383  0.618  (0.950) |  0.008  0.105  |  0.007  0.164  | 2771.2772 



0.0048    6.0     6.0  |  0.378  0.585  (0.950) |  0.008  0.105  |  0.008  0.156  | 2794.8823 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0048    7.0     7.0  |  0.378  0.585  (0.950) |  0.008  0.098  |  0.007  0.048  | 3235.3451 



0.0048    7.0     7.0  |  0.414  0.634  (0.950) |  0.008  0.098  |  0.009  0.074  | 3258.3999 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0047    8.0     8.0  |  0.414  0.634  (0.950) |  0.007  0.113  |  0.009  0.174  | 3699.0776 



0.0047    8.0     8.0  |  0.405  0.651  (0.950) |  0.007  0.113  |  0.005  0.109  | 3722.0733 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0046    9.0     9.0  |  0.405  0.651  (0.950) |  0.007  0.127  |  0.007  0.212  | 4163.0898 



0.0046    9.0     9.0  |  0.378  0.623  (0.950) |  0.007  0.127  |  0.006  0.361  | 4186.0903 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0045   10.0    10.0  |  0.378  0.623  (0.950) |  0.006  0.130  |  0.005  0.119  | 4627.3218 



0.0045   10.0    10.0  |  0.411  0.629  (0.950) |  0.006  0.130  |  0.006  0.135  | 4650.1939 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0044   11.0    11.0  |  0.411  0.629  (0.950) |  0.006  0.144  |  0.005  0.132  | 5090.6027 



0.0044   11.0    11.0  |  0.436  0.622  (0.950) |  0.006  0.144  |  0.007  0.295  | 5114.1647 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0043   12.0    12.0  |  0.436  0.622  (0.950) |  0.006  0.193  |  0.006  0.122  | 5554.5568 



0.0043   12.0    12.0  |  0.425  0.641  (0.950) |  0.006  0.193  |  0.005  0.074  | 5577.5649 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0042   13.0    13.0  |  0.425  0.641  (0.950) |  0.006  0.161  |  0.005  0.079  | 6019.082 



0.0042   13.0    13.0  |  0.431  0.656  (0.950) |  0.006  0.161  |  0.005  0.034  | 6042.7907 
 rate    iter   epoch   | valid_loss               | train_loss               | batch_loss               |  time          

-------------------------------------------------------------------------------------------------------------------------------





0.0041   13.9    14.0  |  0.431  0.656  (0.950) |  0.006  0.172  |  0.006  0.387  | 6419.2045 