In [1]:
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

import torch
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

from data_utility import *
from data_utils import *
from loss import *
from train import *
from deeplab_model.deeplab import *
from dense_vnet.DenseVNet import DenseVNet
from sync_batchnorm import convert_model
import datetime

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
USE_GPU = True
NUM_WORKERS = 12
BATCH_SIZE = 2 

dtype = torch.float32 
# define dtype, float is space efficient than double

if USE_GPU and torch.cuda.is_available():
    
    device = torch.device("cuda:0")
    
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    # magic flag that accelerate
    
    print('using GPU for training')
else:
    device = torch.device('cpu')
    print('using CPU for training')

using GPU for training


In [3]:
train_dataset = get_full_resolution_dataset(data_type = 'nii_train', 
                transform=transforms.Compose([
                random_affine(90, 15),
                random_filp(0.5)]))
# do data augumentation on train dataset

validation_dataset = get_full_resolution_dataset(data_type = 'nii_test', 
                transform=None)
# no data augumentation on validation dataset

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                    num_workers=NUM_WORKERS)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True,
                    num_workers=NUM_WORKERS) # drop_last
# loaders come with auto batch division and multi-thread acceleration

In [None]:
from vnet import VNet
from bv_refinement_network.RefinementModel import RefinementModel_ELU
from refinenet import refine_net

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    
checkpoint_refine = torch.load('../refine_bv_resize_no_concat_save/2019-08-26 07:52:38.026946 epoch: 70.pth')
    
refine_model = refine_net(num_classes=1, in_channels=1)
#refine_model = nn.DataParallel(refine_model)
#refine_model = convert_model(refine_model)

refine_model.load_state_dict(checkpoint_refine['state_dict_1'])

refine_model = refine_model.to(device, dtype)

optimizer = optim.Adam(refine_model.parameters(), lr=1e-3)
optimizer.load_state_dict(checkpoint_refine['optimizer'])

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)
scheduler.load_state_dict(checkpoint_refine['scheduler'])

deeplab = DeepLab(output_stride=16)
deeplab = nn.DataParallel(deeplab)
deeplab = convert_model(deeplab)

checkpoint = torch.load('../deeplab_dilated_save/2019-08-10 09:28:43.844872 epoch: 1160.pth') # best one

deeplab.load_state_dict(checkpoint['state_dict_1'])
deeplab = deeplab.to(device, dtype)

epoch = checkpoint_refine['epoch']
#epoch = 0
print(epoch)

70


In [None]:
def get_bboxes(image, label, output, batchsize, box_size):
    image_final = torch.zeros((batchsize, 1, box_size, box_size, box_size)).to(device, dtype)
    label_final = torch.zeros((batchsize, 1, box_size, box_size, box_size)).to(device, dtype)
    output_final = torch.zeros((batchsize, 1, box_size, box_size, box_size)).to(device, dtype)
    half_size = int(box_size/2)
    image_size_x = int(image.shape[-3])
    image_size_y = int(image.shape[-2])
    image_size_z = int(image.shape[-1])
    for b in range(batchsize):
        out = output[b]
        x,y,z = loadbvcenter(binarize_output(out))
        x, y, z = np.clip([x, y, z], a_min=half_size, a_max=181)
        x1 = max(x-half_size, 0)
        x2 = min(x+half_size, image_size_x)
        y1 = max(y-half_size, 0)
        y2 = min(y+half_size, image_size_y)
        z1 = max(z-half_size, 0)
        z2 = min(z+half_size, image_size_z)
        image_final[b] = image[b, :, x1:x2, y1:y2, z1:z2]
        label_final[b] = label[b, :, x1:x2, y1:y2, z1:z2]
        output_final[b] = output[b, :, x1:x2, y1:y2, z1:z2]
    return image_final, label_final, output_final

In [None]:
epochs = 5000

record = open('train_bv_refine_resize_no_concat.txt','a+')

logger = {'train':[], 'validation_1': []}

min_val = 1

for e in tqdm(range(epoch + 1, epochs)):
# iter over epoches
    epoch_loss = 0
        
    for t, batch in enumerate(train_loader):
    # iter over the train mini batches
        train_losses=[]
        for minibatch in range(BATCH_SIZE):
            refine_model.train()
            deeplab.eval()
            # Set the model flag to train
            # 1. enable dropout
            # 2. batchnorm behave differently in train and test
            #print(batch['image1_data'])
            image_1 = batch['image1_data'][minibatch].to(device=device, dtype=dtype)
            image_1 = image_1.view(1,1,256,256,256)

            label_1 = batch['image1_label'][minibatch].to(device=device, dtype=dtype)
            label_1 = label_1.view(1,3,256,256,256)

            bv_label = label_1[:, 2, :, :, :]
            bv_label = bv_label.view(1,1,256,256,256)

            original_res = [a[minibatch].item() for a in batch['original_resolution']]

            image_1_resize = F.interpolate(image_1, size=original_res, mode='trilinear', align_corners=True)
            image_1_resize = image_1_resize.view(1,1,original_res[0], original_res[1], original_res[2])

            bv_label_resize = F.interpolate(bv_label, size=original_res, mode='trilinear', align_corners=True)

            # Get coarse output from deeplab model from 256 resolution input
            out_coarse = deeplab(image_1)
            out_coarse = out_coarse.view(1,3,256,256,256)

            bv_coarse = out_coarse[:, 2, :, :, :]
            bv_coarse = bv_coarse.view(1,1,256,256,256)

            bv_coarse_resize = F.interpolate(bv_coarse, size=original_res, mode='trilinear', align_corners=True)
            
            box_size = 192
            half_size = int(box_size / 2)
            
            image_size_x = int(image_1_resize.shape[-3])
            image_size_y = int(image_1_resize.shape[-2])
            image_size_z = int(image_1_resize.shape[-1])
            
            x,y,z = loadbvcenter(binarize_output(bv_coarse_resize).view([1] + original_res))
            x, y, z = np.clip([x, y, z], a_min=box_size-half_size, a_max=box_size+half_size)
            x1 = max(x-half_size, 0)
            x2 = min(x+half_size, image_size_x)
            y1 = max(y-half_size, 0)
            y2 = min(y+half_size, image_size_y)
            z1 = max(z-half_size, 0)
            z2 = min(z+half_size, image_size_z)
            
            
            bbox_bv = bv_coarse_resize.view(original_res)[x1:x2, y1:y2, z1:z2]
            bbox_bv = reshape_image(bbox_bv.squeeze(), box_size, box_size, box_size).to(device, dtype)
            bbox_bv = bbox_bv.view(1,1,box_size,box_size,box_size)
            
            bbox_bv_label = bv_label_resize.view(original_res)[x1:x2, y1:y2, z1:z2]
            bbox_bv_label = reshape_image(bbox_bv_label.squeeze(), box_size, box_size, box_size).to(device, dtype)
            bbox_bv_label = bbox_bv_label.view(1,1,box_size,box_size,box_size)

            #bbox_image = get_bounding_box_image(image_1, (256,256,256)).to(device, dtype)
            bbox_image = image_1_resize[:, :, x1:x2, y1:y2, z1:z2]
            bbox_image = reshape_image(bbox_image.squeeze(), box_size, box_size, box_size).to(device, dtype)
            bbox_image = bbox_image.view(1, 1, box_size, box_size, box_size)
            
            bbox_image_2 = F.interpolate(bbox_image, scale_factor=1/2, mode='trilinear', align_corners=True)
            bbox_image_4 = F.interpolate(bbox_image, scale_factor=1/4, mode='trilinear', align_corners=True)

            refine_out = refine_model(bbox_image, bbox_image_2, bbox_image_4)

            loss = dice_loss(refine_out, bbox_bv_label)
            print(loss)
            train_losses.append(loss)
        
        loss = sum(train_losses) / BATCH_SIZE
        train_losses=[]
        epoch_loss += loss.item()
        # record minibatch loss to epoch loss
        
        optimizer.zero_grad()
        # set the model parameter gradient to zero
        
        loss.backward()
        # calculate the gradient wrt loss
        optimizer.step()
        #scheduler.step(loss_1)
        # take a gradient descent step
        
    outstr = 'Epoch {0} finished ! Training Loss: {1:.4f}'.format(e, epoch_loss/(t+1)) + '\n'
    
    logger['train'].append(epoch_loss/(t+1))
    
    print(outstr)
    record.write(outstr)
    record.flush()

    if (e <= 100 and e%5 == 0) or (e > 100 and e%1 == 0):
    # do validation every 5 epoches
        deeplab.eval()
        refine_model.eval()
        # set model flag to eval
        # 1. disable dropout
        # 2. batchnorm behave differs

        with torch.no_grad():
        # stop taking gradient
        
            #valloss_4 = 0
            #valloss_2 = 0
            valloss_1 = 0
            
            for v, vbatch in enumerate(validation_loader):
            # iter over validation mini batches
                val_losses = []
                for minibatch in range(BATCH_SIZE):
                    image_1 = vbatch['image1_data'][minibatch].to(device=device, dtype=dtype)
                    image_1 = image_1.view(1,1,256,256,256)

                    label_1 = vbatch['image1_label'][minibatch].to(device=device, dtype=dtype)
                    label_1 = label_1.view(1,3,256,256,256)

                    bv_label = label_1[:, 2, :, :, :]
                    bv_label = bv_label.view(1,1,256,256,256)

                    original_res = [a[minibatch].item() for a in vbatch['original_resolution']]

                    image_1_resize = F.interpolate(image_1, size=original_res, mode='trilinear', align_corners=True)
                    image_1_resize = image_1_resize.view(1,1,original_res[0], original_res[1], original_res[2])

                    bv_label_resize = F.interpolate(bv_label, size=original_res, mode='trilinear', align_corners=True)

                    # Get coarse output from deeplab model from 256 resolution input
                    out_coarse = deeplab(image_1)
                    out_coarse = out_coarse.view(1,3,256,256,256)

                    bv_coarse = out_coarse[:, 2, :, :, :]
                    bv_coarse = bv_coarse.view(1,1,256,256,256)

                    bv_coarse_resize = F.interpolate(bv_coarse, size=original_res, mode='trilinear', align_corners=True)

                    box_size = 192
                    half_size = int(box_size / 2)

                    image_size_x = int(image_1_resize.shape[-3])
                    image_size_y = int(image_1_resize.shape[-2])
                    image_size_z = int(image_1_resize.shape[-1])

                    x,y,z = loadbvcenter(binarize_output(bv_coarse_resize).view([1] + original_res))
                    x, y, z = np.clip([x, y, z], a_min=box_size-half_size, a_max=box_size+half_size)
                    x1 = max(x-half_size, 0)
                    x2 = min(x+half_size, image_size_x)
                    y1 = max(y-half_size, 0)
                    y2 = min(y+half_size, image_size_y)
                    z1 = max(z-half_size, 0)
                    z2 = min(z+half_size, image_size_z)


                    bbox_bv = bv_coarse_resize.view(original_res)[x1:x2, y1:y2, z1:z2]
                    bbox_bv = reshape_image(bbox_bv.squeeze(), box_size, box_size, box_size).to(device, dtype)
                    bbox_bv = bbox_bv.view(1,1,box_size,box_size,box_size)

                    bbox_bv_label = bv_label_resize.view(original_res)[x1:x2, y1:y2, z1:z2]
                    bbox_bv_label = reshape_image(bbox_bv_label.squeeze(), box_size, box_size, box_size).to(device, dtype)
                    bbox_bv_label = bbox_bv_label.view(1,1,box_size,box_size,box_size)

                    bbox_image = image_1_resize[:, :, x1:x2, y1:y2, z1:z2]
                    bbox_image = reshape_image(bbox_image.squeeze(), box_size, box_size, box_size).to(device, dtype)
                    bbox_image = bbox_image.view(1, 1, box_size, box_size, box_size)
                    
                    bbox_image_2 = F.interpolate(bbox_image, scale_factor=1/2, mode='trilinear', align_corners=True)
                    bbox_image_4 = F.interpolate(bbox_image, scale_factor=1/4, mode='trilinear', align_corners=True)

                    refine_out = refine_model(bbox_image, bbox_image_2, bbox_image_4)
                    
                    loss = dice_loss(refine_out, bbox_bv_label)
                    val_losses.append(loss)
                
                avg_loss = sum(val_losses) / BATCH_SIZE
                val_losses = []
                print(avg_loss)
            
                # calculate loss
                valloss_1 += avg_loss.item()
                
            
            avg_val_loss = (valloss_1 / (v+1))
            outstr = '------- 1st valloss={0:.4f}'\
                .format(avg_val_loss) + '\n'
            
            logger['validation_1'].append(avg_val_loss)
            #scheduler.step(avg_val_loss)
            
            if avg_val_loss < min_val:
                min_val = avg_val_loss
                save_1('refine_bv_resize_no_concat_save', refine_model, optimizer, logger, e, scheduler)
            elif e % 10 == 0:
                save_1('refine_bv_resize_no_concat_save', refine_model, optimizer, logger, e, scheduler)
            
            print(outstr)
            record.write(outstr)
            record.flush()
    


record.close()

  0%|          | 0/4929 [00:00<?, ?it/s]

tensor(0.3003, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1477, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3370, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1428, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2110, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1135, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2979, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3518, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1122, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1305, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2347, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2092, device='cuda:0', grad_fn=<RsubBackward1>

  0%|          | 1/4929 [08:38<709:36:17, 518.38s/it]

tensor(0.3788, device='cuda:0', grad_fn=<RsubBackward1>)
Epoch 71 finished ! Training Loss: 0.4281

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2329, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1971, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3280, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2085, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3372, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3222, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2552, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1862, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4440, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2746, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2680, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4357, device='cuda:0', grad_fn=<RsubBackward1>)
tens

  0%|          | 2/4929 [16:50<698:39:50, 510.49s/it]

Epoch 72 finished ! Training Loss: 0.4274

tensor(0.1621, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2884, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3952, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1440, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3303, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2421, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1770, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2125, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5186, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1960, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2450, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1990, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2113, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2173, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1710, device='cuda:0', grad_fn=<RsubBackward

  0%|          | 3/4929 [24:57<689:04:00, 503.58s/it]

Epoch 73 finished ! Training Loss: 0.4289

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3285, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3074, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3897, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2949, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1222, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2143, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2764, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3339, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1895, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1438, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6873, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2063, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3499, device='cuda:0', grad_fn=<RsubBackward1>)
tens

  0%|          | 4/4929 [32:58<679:38:09, 496.79s/it]

Epoch 74 finished ! Training Loss: 0.4225

tensor(0.2588, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1439, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2041, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2222, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3344, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3559, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1077, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1477, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2645, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2436, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3333, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0944, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1222, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2887, device='cuda:0', grad_fn=<RsubBackward1>)


tensor(0.2614, device='cuda:0')
tensor(0.2917, device='cuda:0')
tensor(0.4388, device='cuda:0')
tensor(0.2072, device='cuda:0')
tensor(0.1884, device='cuda:0')
tensor(0.2587, device='cuda:0')
tensor(0.2402, device='cuda:0')
tensor(0.3027, device='cuda:0')
tensor(0.3297, device='cuda:0')
tensor(0.2470, device='cuda:0')


  0%|          | 5/4929 [42:09<701:35:59, 512.95s/it]

tensor(0.3180, device='cuda:0')
Checkpoint 75 saved !
------- 1st valloss=0.2980

tensor(0.4090, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6513, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3479, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2015, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3418, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1121, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3983, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2683, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1890, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2467, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2004, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2185, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cud

  0%|          | 6/4929 [50:17<691:23:58, 505.59s/it]

Epoch 76 finished ! Training Loss: 0.4283

tensor(0.7068, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4159, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1316, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2753, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1695, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4116, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2464, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3544, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2145, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2574, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2197, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3638, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0

  0%|          | 7/4929 [58:27<684:29:54, 500.65s/it]

Epoch 77 finished ! Training Loss: 0.4294

tensor(0.1321, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1249, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2458, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1137, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1952, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3512, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1972, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5590, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4055, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3140, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3321, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1851, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0

  0%|          | 8/4929 [1:06:37<679:59:55, 497.46s/it]

Epoch 78 finished ! Training Loss: 0.4284

tensor(0.1205, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4144, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2462, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0900, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2125, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1302, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1283, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1599, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1004, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1366, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4204, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1736, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0

  0%|          | 9/4929 [1:14:40<674:01:14, 493.19s/it]

Epoch 79 finished ! Training Loss: 0.4271

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5162, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4227, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4339, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6000, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2580, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3093, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2073, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2529, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4774, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4069, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1418, d

tensor(0.2513, device='cuda:0')
tensor(0.3919, device='cuda:0')
tensor(0.2400, device='cuda:0')
tensor(0.2534, device='cuda:0')
tensor(0.2774, device='cuda:0')
tensor(0.2002, device='cuda:0')
tensor(0.2383, device='cuda:0')
tensor(0.4457, device='cuda:0')
tensor(0.3323, device='cuda:0')


  0%|          | 10/4929 [1:23:44<694:52:37, 508.55s/it]

Checkpoint 80 saved !
------- 1st valloss=0.3085

tensor(0.4056, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2700, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1881, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1230, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2269, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0947, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4732, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2216, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1916, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1262, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3621, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2392, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3058, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1077, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2533, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5990, device='cuda:0', grad_fn=<R

  0%|          | 11/4929 [1:31:50<685:24:11, 501.72s/it]

Epoch 81 finished ! Training Loss: 0.4251

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2043, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2440, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4232, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3477, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2689, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3100, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1972, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1784, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1426, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3393, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2171, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., d

  0%|          | 12/4929 [1:39:51<676:58:59, 495.66s/it]

Epoch 82 finished ! Training Loss: 0.4278

tensor(0.1813, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3208, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2971, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2260, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1953, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1461, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1958, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3290, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2122, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5404, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3120, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3803, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0

  0%|          | 13/4929 [1:48:00<674:01:12, 493.59s/it]

Epoch 83 finished ! Training Loss: 0.4297

tensor(0.2842, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2994, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2026, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2424, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1383, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3599, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1495, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2078, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3733, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2388, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1586, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., d

  0%|          | 14/4929 [1:56:13<673:28:19, 493.29s/it]

Epoch 84 finished ! Training Loss: 0.4138

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2274, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3932, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2408, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3144, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1092, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1029, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4194, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2275, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2091, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3179, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1890, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4529, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1868, device='cuda:0', grad_fn=<RsubBackward1>)
tens

tensor(0.3701, device='cuda:0')
tensor(0.2700, device='cuda:0')
tensor(0.3264, device='cuda:0')
tensor(0.4038, device='cuda:0')
tensor(0.3429, device='cuda:0')
tensor(0.1658, device='cuda:0')
tensor(0.3926, device='cuda:0')
tensor(0.2993, device='cuda:0')
tensor(0.1919, device='cuda:0')
tensor(0.2630, device='cuda:0')
tensor(0.4742, device='cuda:0')


  0%|          | 15/4929 [2:05:29<699:09:39, 512.21s/it]

Checkpoint 85 saved !
------- 1st valloss=0.2855

tensor(0.1870, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2068, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2843, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2527, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4208, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2029, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2436, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4892, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1019, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1265, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1514, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2620, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1301, device='cuda:0', grad_fn=<RsubBackw

  0%|          | 16/4929 [2:13:35<688:04:44, 504.19s/it]

Epoch 86 finished ! Training Loss: 0.4215

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2320, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3970, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1508, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3007, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1981, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1593, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0773, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3120, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2112, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5708, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3205, d

  0%|          | 17/4929 [2:21:40<680:04:05, 498.42s/it]

Epoch 87 finished ! Training Loss: 0.4246

tensor(0.1435, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1674, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2898, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4487, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2106, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4863, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4072, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2259, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1513, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3365, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1163, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3093, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0

  0%|          | 18/4929 [2:29:39<672:18:56, 492.84s/it]

Epoch 88 finished ! Training Loss: 0.4380

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1527, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3541, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2959, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4453, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1935, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1769, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4386, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4632, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2965, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3377, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4959, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1155, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1178, device='cuda:0', grad_fn=<RsubBackward1>)


  0%|          | 19/4929 [2:37:44<668:41:19, 490.28s/it]

Epoch 89 finished ! Training Loss: 0.4195

tensor(0.4225, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3775, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1090, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1643, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2728, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2577, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2702, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0955, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1288, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0977, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3501, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1528, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3731, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4126, device='cuda:0', grad_fn=<RsubBackward1>)


tensor(0.1409, device='cuda:0')
tensor(0.3188, device='cuda:0')
tensor(0.3376, device='cuda:0')
tensor(0.3155, device='cuda:0')
tensor(0.4482, device='cuda:0')
tensor(0.3599, device='cuda:0')
tensor(0.2694, device='cuda:0')
tensor(0.2554, device='cuda:0')
tensor(0.2018, device='cuda:0')
tensor(0.2372, device='cuda:0')
tensor(0.3029, device='cuda:0')


  0%|          | 20/4929 [2:46:58<694:49:57, 509.55s/it]

Checkpoint 90 saved !
------- 1st valloss=0.2908

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1205, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1232, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2430, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2775, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2600, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1530, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2166, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6136, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2387, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1816, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4839, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tenso

  0%|          | 21/4929 [2:55:04<685:05:28, 502.51s/it]

Epoch 91 finished ! Training Loss: 0.4281

tensor(0.1856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2451, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2004, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1958, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2927, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2129, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1351, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1393, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5795, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2302, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1675, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2317, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3995, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2880, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2517, device='cuda:0', grad_fn=<RsubBackward

  0%|          | 22/4929 [3:03:07<676:51:08, 496.57s/it]

Epoch 92 finished ! Training Loss: 0.4210

tensor(0.2713, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1923, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3674, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4022, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1809, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6311, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1597, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1219, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3306, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2993, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1593, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1839, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4547, device='cuda:0', grad_fn=<RsubBackward1>)
tens

  0%|          | 23/4929 [3:11:16<673:42:43, 494.37s/it]

Epoch 93 finished ! Training Loss: 0.4357

tensor(0.2187, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2949, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3396, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1210, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4278, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2179, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2435, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4507, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2800, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1346, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2000, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.177

  0%|          | 24/4929 [3:19:23<670:31:12, 492.12s/it]

Epoch 94 finished ! Training Loss: 0.4298

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1779, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1584, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3525, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1931, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2555, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2847, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1720, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1905, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2564, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1998, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3391, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2020, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1084, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tens

tensor(0.2235, device='cuda:0')
tensor(0.2000, device='cuda:0')
tensor(0.1342, device='cuda:0')
tensor(0.2365, device='cuda:0')
tensor(0.4108, device='cuda:0')
tensor(0.2514, device='cuda:0')
tensor(0.2713, device='cuda:0')
tensor(0.4149, device='cuda:0')
tensor(0.5360, device='cuda:0')
tensor(0.3012, device='cuda:0')
tensor(0.2227, device='cuda:0')


  1%|          | 25/4929 [3:28:42<697:36:22, 512.11s/it]

------- 1st valloss=0.3098

tensor(0.4066, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1920, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1148, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1603, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2336, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1226, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1840, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5780, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1402, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1252, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1533, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1017, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1952, devi

  1%|          | 26/4929 [3:36:58<690:43:52, 507.17s/it]

Epoch 96 finished ! Training Loss: 0.4183

tensor(0.2162, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2624, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1728, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1629, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1176, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4539, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1891, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0729, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1572, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1897, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1493, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1909, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3290, device='cuda:0', grad_fn=<RsubBackward1>)


  1%|          | 27/4929 [3:45:05<682:36:48, 501.31s/it]

Epoch 97 finished ! Training Loss: 0.4099

tensor(0.1082, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1665, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2937, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3583, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4855, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2813, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1545, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2023, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1642, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3176, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1588, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1706, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1922, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1971, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)


  1%|          | 28/4929 [3:53:13<676:49:54, 497.16s/it]

Epoch 98 finished ! Training Loss: 0.4134

tensor(0.1603, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4541, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1634, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1778, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2085, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2347, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3487, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3572, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5806, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1805, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1573, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1008, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2349, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2443, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)


  1%|          | 29/4929 [4:01:17<671:33:41, 493.39s/it]

Epoch 99 finished ! Training Loss: 0.4139

tensor(0.1320, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3375, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2190, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1639, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4228, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2404, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4513, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1213, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2047, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2776, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3191, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.242

tensor(0.2646, device='cuda:0')
tensor(0.2105, device='cuda:0')
tensor(0.4974, device='cuda:0')
tensor(0.3540, device='cuda:0')
tensor(0.2677, device='cuda:0')
tensor(0.4353, device='cuda:0')
tensor(0.2828, device='cuda:0')
tensor(0.3060, device='cuda:0')
tensor(0.2432, device='cuda:0')
tensor(0.3033, device='cuda:0')
tensor(0.3568, device='cuda:0')


  1%|          | 30/4929 [4:10:29<695:24:50, 511.02s/it]

Checkpoint 100 saved !
------- 1st valloss=0.2907

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1472, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0804, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1274, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2610, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2144, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4409, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6538, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2151, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1530, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3138, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3225, device='cuda:0', grad_fn=<RsubBackward1>)
tens

tensor(0.2815, device='cuda:0')
tensor(0.4309, device='cuda:0')
tensor(0.1839, device='cuda:0')
tensor(0.3794, device='cuda:0')
tensor(0.4945, device='cuda:0')
tensor(0.4906, device='cuda:0')
tensor(0.3071, device='cuda:0')
tensor(0.3478, device='cuda:0')
tensor(0.3708, device='cuda:0')
tensor(0.2978, device='cuda:0')
tensor(0.3308, device='cuda:0')


  1%|          | 31/4929 [4:19:44<712:59:04, 524.04s/it]

------- 1st valloss=0.3585

tensor(0.5561, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2466, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1608, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5622, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2734, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1557, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2049, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1958, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1476, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3191, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1524, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2650, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4868, devi

tensor(0.3705, device='cuda:0')
tensor(0.2183, device='cuda:0')
tensor(0.2670, device='cuda:0')
tensor(0.2823, device='cuda:0')
tensor(0.2824, device='cuda:0')
tensor(0.2568, device='cuda:0')
tensor(0.4306, device='cuda:0')
tensor(0.3385, device='cuda:0')
tensor(0.3904, device='cuda:0')
tensor(0.3490, device='cuda:0')
tensor(0.3471, device='cuda:0')


  1%|          | 32/4929 [4:29:07<728:41:00, 535.69s/it]

------- 1st valloss=0.3029

tensor(0.4616, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2039, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2038, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3534, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2221, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3273, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1894, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2288, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2340, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4500, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3433, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2335, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1471, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1285, devi

tensor(0.1908, device='cuda:0')
tensor(0.2150, device='cuda:0')
tensor(0.2639, device='cuda:0')
tensor(0.2796, device='cuda:0')
tensor(0.3933, device='cuda:0')
tensor(0.2552, device='cuda:0')
tensor(0.2153, device='cuda:0')
tensor(0.2926, device='cuda:0')
tensor(0.3187, device='cuda:0')
tensor(0.2382, device='cuda:0')
tensor(0.4117, device='cuda:0')


  1%|          | 33/4929 [4:38:18<734:51:08, 540.33s/it]

------- 1st valloss=0.2884

tensor(0.4163, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1759, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2294, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1725, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2016, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1634, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1811, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1140, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2023, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1414, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1045, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2725, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1911, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2486, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3985, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

tensor(0.4164, device='cuda:0')
tensor(0.3245, device='cuda:0')
tensor(0.3414, device='cuda:0')
tensor(0.5264, device='cuda:0')
tensor(0.3258, device='cuda:0')
tensor(0.4529, device='cuda:0')
tensor(0.4855, device='cuda:0')
tensor(0.2260, device='cuda:0')
tensor(0.2466, device='cuda:0')
tensor(0.3093, device='cuda:0')
tensor(0.3813, device='cuda:0')


  1%|          | 34/4929 [4:47:40<743:38:41, 546.91s/it]

------- 1st valloss=0.3658

tensor(0.5642, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4390, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2583, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0908, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0873, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2780, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3521, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1826, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4940, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4045, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1103, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0812, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1309, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2073, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2978, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.35

tensor(0.2441, device='cuda:0')
tensor(0.4033, device='cuda:0')
tensor(0.3303, device='cuda:0')
tensor(0.3431, device='cuda:0')
tensor(0.2197, device='cuda:0')
tensor(0.3305, device='cuda:0')
tensor(0.3238, device='cuda:0')
tensor(0.1685, device='cuda:0')
tensor(0.3455, device='cuda:0')
tensor(0.2741, device='cuda:0')
tensor(0.2124, device='cuda:0')


  1%|          | 35/4929 [4:57:03<749:50:35, 551.58s/it]

------- 1st valloss=0.2976

tensor(0.1588, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1054, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3528, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6006, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1477, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1197, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2035, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2293, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1353, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3672, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2430, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1205, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda

tensor(0.2519, device='cuda:0')
tensor(0.1837, device='cuda:0')
tensor(0.2251, device='cuda:0')
tensor(0.3244, device='cuda:0')
tensor(0.2278, device='cuda:0')
tensor(0.1436, device='cuda:0')
tensor(0.2379, device='cuda:0')
tensor(0.3374, device='cuda:0')
tensor(0.2784, device='cuda:0')
tensor(0.2773, device='cuda:0')
tensor(0.3732, device='cuda:0')


  1%|          | 36/4929 [5:06:18<751:21:12, 552.80s/it]

Checkpoint 106 saved !
------- 1st valloss=0.2808

tensor(0.1202, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4296, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3206, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1401, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1335, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1080, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1443, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2183, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1400, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2953, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2898, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2716, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1503, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3571, device='cuda:0', grad_fn=<

tensor(0.3262, device='cuda:0')
tensor(0.3022, device='cuda:0')
tensor(0.2303, device='cuda:0')
tensor(0.2278, device='cuda:0')
tensor(0.2089, device='cuda:0')
tensor(0.2477, device='cuda:0')
tensor(0.1571, device='cuda:0')
tensor(0.3881, device='cuda:0')
tensor(0.2849, device='cuda:0')
tensor(0.2384, device='cuda:0')
tensor(0.2676, device='cuda:0')


  1%|          | 37/4929 [5:15:38<753:53:41, 554.79s/it]

Checkpoint 107 saved !
------- 1st valloss=0.2754

tensor(0.2898, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6607, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1053, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1495, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2441, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1095, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2886, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3245, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2070, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2546, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2680, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2640, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2282, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0869, device='cuda:0', grad_fn=<Rsub

tensor(0.2171, device='cuda:0')
tensor(0.2818, device='cuda:0')
tensor(0.1590, device='cuda:0')
tensor(0.2474, device='cuda:0')
tensor(0.3381, device='cuda:0')
tensor(0.2317, device='cuda:0')
tensor(0.3510, device='cuda:0')
tensor(0.1335, device='cuda:0')
tensor(0.3741, device='cuda:0')
tensor(0.2352, device='cuda:0')
tensor(0.1586, device='cuda:0')


  1%|          | 38/4929 [5:24:57<755:26:09, 556.04s/it]

Checkpoint 108 saved !
------- 1st valloss=0.2725

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1949, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1228, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0823, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1620, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2250, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2440, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4380, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0999, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1818, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2379, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0921, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2065, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2409, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1506, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2043, device='cuda:0', grad_fn=<Rsub

tensor(0.4191, device='cuda:0')
tensor(0.4173, device='cuda:0')
tensor(0.2617, device='cuda:0')
tensor(0.3539, device='cuda:0')
tensor(0.2438, device='cuda:0')
tensor(0.2884, device='cuda:0')
tensor(0.1826, device='cuda:0')
tensor(0.3156, device='cuda:0')
tensor(0.2247, device='cuda:0')
tensor(0.2043, device='cuda:0')
tensor(0.2721, device='cuda:0')


  1%|          | 39/4929 [5:34:14<755:56:13, 556.52s/it]

------- 1st valloss=0.3046

tensor(0.4944, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4899, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3190, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1213, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1029, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4620, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3054, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4516, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1137, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1702, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1604, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1458, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5794, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2849, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0959, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.29

tensor(0.4013, device='cuda:0')
tensor(0.1850, device='cuda:0')
tensor(0.3530, device='cuda:0')
tensor(0.2217, device='cuda:0')
tensor(0.1859, device='cuda:0')
tensor(0.2322, device='cuda:0')
tensor(0.2088, device='cuda:0')
tensor(0.3138, device='cuda:0')
tensor(0.2912, device='cuda:0')
tensor(0.3782, device='cuda:0')
tensor(0.3449, device='cuda:0')


  1%|          | 40/4929 [5:43:35<757:17:55, 557.63s/it]

Checkpoint 110 saved !
------- 1st valloss=0.2971

tensor(0.1305, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2801, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2700, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1626, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2089, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3071, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1470, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3679, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4905, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4163, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4340, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4684, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3394, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1350, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2265, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4652, device='cuda:0', grad_fn=<

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.4862, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3873, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1648, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1296, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4023, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1233, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1994, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2594, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0947, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1661, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2197, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2766, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3409, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2687, device='cuda:0', grad_fn=<Rs

tensor(0.3872, device='cuda:0')
tensor(0.4098, device='cuda:0')
tensor(0.1232, device='cuda:0')
tensor(0.2161, device='cuda:0')
tensor(0.3408, device='cuda:0')
tensor(0.3595, device='cuda:0')
tensor(0.3905, device='cuda:0')
tensor(0.2864, device='cuda:0')
tensor(0.3570, device='cuda:0')
tensor(0.3201, device='cuda:0')


  1%|          | 47/4929 [6:48:16<752:54:44, 555.20s/it]

------- 1st valloss=0.3030

tensor(0.2119, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4428, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3038, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0906, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3819, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1080, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2260, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3925, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1151, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3361, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0874, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1332, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2841, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2286, devi

tensor(0.3799, device='cuda:0')
tensor(0.1700, device='cuda:0')
tensor(0.3846, device='cuda:0')
tensor(0.4778, device='cuda:0')
tensor(0.3925, device='cuda:0')
tensor(0.3259, device='cuda:0')
tensor(0.5470, device='cuda:0')
tensor(0.3141, device='cuda:0')
tensor(0.5333, device='cuda:0')
tensor(0.3723, device='cuda:0')
tensor(0.2449, device='cuda:0')


  1%|          | 48/4929 [6:57:34<753:39:00, 555.86s/it]

------- 1st valloss=0.3581

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2372, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5162, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2979, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2974, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1002, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1787, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2564, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2562, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4192, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2964, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1681, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2079, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4409, device='

tensor(0.3121, device='cuda:0')
tensor(0.4259, device='cuda:0')
tensor(0.3661, device='cuda:0')
tensor(0.3212, device='cuda:0')
tensor(0.2820, device='cuda:0')
tensor(0.3976, device='cuda:0')
tensor(0.2261, device='cuda:0')
tensor(0.1954, device='cuda:0')
tensor(0.2477, device='cuda:0')
tensor(0.3191, device='cuda:0')
tensor(0.2725, device='cuda:0')


  1%|          | 49/4929 [7:06:47<752:22:24, 555.03s/it]

------- 1st valloss=0.3120

tensor(0.2663, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3033, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2753, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2632, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1959, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3979, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3388, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0655, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2448, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3652, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1721, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1678, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2078, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2998, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1889, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3113, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.2710, device='cuda:0')
tensor(0.2443, device='cuda:0')
tensor(0.3454, device='cuda:0')
tensor(0.2380, device='cuda:0')
tensor(0.3182, device='cuda:0')
tensor(0.2413, device='cuda:0')
tensor(0.2740, device='cuda:0')
tensor(0.3491, device='cuda:0')
tensor(0.2446, device='cuda:0')
tensor(0.1994, device='cuda:0')
tensor(0.2624, device='cuda:0')


  1%|          | 50/4929 [7:16:05<753:24:27, 555.91s/it]

Checkpoint 120 saved !
------- 1st valloss=0.3015

tensor(0.4088, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2231, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3498, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2683, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1968, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2731, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1656, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3380, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3130, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0997, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2126, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3776, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1176, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1403, device='cuda:0', grad_fn=<RsubBack

tensor(0.2613, device='cuda:0')
tensor(0.3861, device='cuda:0')
tensor(0.2630, device='cuda:0')
tensor(0.5228, device='cuda:0')
tensor(0.1865, device='cuda:0')
tensor(0.2727, device='cuda:0')
tensor(0.2587, device='cuda:0')
tensor(0.2571, device='cuda:0')
tensor(0.4288, device='cuda:0')
tensor(0.4546, device='cuda:0')
tensor(0.2747, device='cuda:0')


  1%|          | 51/4929 [7:25:17<751:38:43, 554.72s/it]

------- 1st valloss=0.3360

tensor(0.1952, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2176, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4240, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1782, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2627, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3184, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1164, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2540, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3631, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4502, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1961, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1916, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3636, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2218, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3392, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

tensor(0.2616, device='cuda:0')
tensor(0.2058, device='cuda:0')
tensor(0.2707, device='cuda:0')
tensor(0.3969, device='cuda:0')
tensor(0.3602, device='cuda:0')
tensor(0.2013, device='cuda:0')
tensor(0.2674, device='cuda:0')
tensor(0.2662, device='cuda:0')
tensor(0.4607, device='cuda:0')


  1%|          | 52/4929 [7:34:32<751:41:37, 554.87s/it]

------- 1st valloss=0.3277

tensor(0.3412, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1741, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1442, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3822, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6259, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1907, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3597, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1002, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2105, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1983, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2176, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2729, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0999, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2606, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4225, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.11

tensor(0.1603, device='cuda:0')
tensor(0.2550, device='cuda:0')
tensor(0.6065, device='cuda:0')
tensor(0.4729, device='cuda:0')
tensor(0.4787, device='cuda:0')
tensor(0.1094, device='cuda:0')
tensor(0.4550, device='cuda:0')
tensor(0.5012, device='cuda:0')
tensor(0.4962, device='cuda:0')
tensor(0.3394, device='cuda:0')
tensor(0.3039, device='cuda:0')


  1%|          | 53/4929 [7:43:45<750:38:58, 554.21s/it]

------- 1st valloss=0.3495

tensor(0.4454, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4751, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3411, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3131, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2192, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1561, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1220, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2740, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1476, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1044, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2880, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3090, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2497, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1547, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.33

tensor(0.2870, device='cuda:0')
tensor(0.2251, device='cuda:0')
tensor(0.3774, device='cuda:0')
tensor(0.3064, device='cuda:0')
tensor(0.3440, device='cuda:0')
tensor(0.1637, device='cuda:0')
tensor(0.2238, device='cuda:0')
tensor(0.3793, device='cuda:0')
tensor(0.2391, device='cuda:0')
tensor(0.2729, device='cuda:0')
tensor(0.4410, device='cuda:0')


  1%|          | 54/4929 [7:53:00<751:02:46, 554.62s/it]

------- 1st valloss=0.2865

tensor(0.2400, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1059, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1646, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3071, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1786, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1591, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2099, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0955, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0852, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3964, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1506, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3270, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1680, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1404, 

tensor(0.2610, device='cuda:0')
tensor(0.3132, device='cuda:0')
tensor(0.1972, device='cuda:0')
tensor(0.3407, device='cuda:0')
tensor(0.2350, device='cuda:0')
tensor(0.3538, device='cuda:0')
tensor(0.2931, device='cuda:0')
tensor(0.2740, device='cuda:0')
tensor(0.3147, device='cuda:0')
tensor(0.3235, device='cuda:0')
tensor(0.1985, device='cuda:0')


  1%|          | 55/4929 [8:02:21<753:29:33, 556.54s/it]

------- 1st valloss=0.2848

tensor(0.0757, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0753, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1511, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1456, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1129, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1683, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3722, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1820, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2647, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3231, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2479, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3919, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0852, device='

tensor(0.2115, device='cuda:0')
tensor(0.2912, device='cuda:0')
tensor(0.2299, device='cuda:0')
tensor(0.2063, device='cuda:0')
tensor(0.2230, device='cuda:0')
tensor(0.2921, device='cuda:0')
tensor(0.2704, device='cuda:0')
tensor(0.4080, device='cuda:0')
tensor(0.4995, device='cuda:0')
tensor(0.3629, device='cuda:0')
tensor(0.3518, device='cuda:0')


  1%|          | 56/4929 [8:11:42<755:13:41, 557.94s/it]

------- 1st valloss=0.2790

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1774, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1086, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3138, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1891, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1149, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2419, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0741, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2382, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1449, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4350, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4457, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2499, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1507, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1845, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1630, 

tensor(0.3954, device='cuda:0')
tensor(0.1285, device='cuda:0')
tensor(0.3951, device='cuda:0')
tensor(0.3345, device='cuda:0')
tensor(0.3052, device='cuda:0')
tensor(0.2606, device='cuda:0')
tensor(0.6156, device='cuda:0')
tensor(0.2013, device='cuda:0')
tensor(0.3967, device='cuda:0')
tensor(0.2290, device='cuda:0')
tensor(0.4492, device='cuda:0')


  1%|          | 57/4929 [8:21:00<754:44:36, 557.69s/it]

------- 1st valloss=0.3230

tensor(0.4635, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2026, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1832, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4239, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1570, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2833, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1902, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2187, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1263, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2217, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2197, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3143, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3328, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2338, devi

tensor(0.2868, device='cuda:0')
tensor(0.3195, device='cuda:0')
tensor(0.3064, device='cuda:0')
tensor(0.3094, device='cuda:0')
tensor(0.3782, device='cuda:0')
tensor(0.4194, device='cuda:0')
tensor(0.2417, device='cuda:0')
tensor(0.1636, device='cuda:0')
tensor(0.2473, device='cuda:0')
tensor(0.3025, device='cuda:0')
tensor(0.2459, device='cuda:0')


  1%|          | 58/4929 [8:30:19<755:20:45, 558.25s/it]

------- 1st valloss=0.2890

tensor(0.0794, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2468, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1092, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1448, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2625, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1173, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2968, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2634, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1653, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2582, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1348, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2891, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1850, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1065, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1056, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2058, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2979, device='cuda:0', grad_fn=<RsubBackward1>)
ten

tensor(0.3291, device='cuda:0')
tensor(0.2213, device='cuda:0')
tensor(0.5404, device='cuda:0')
tensor(0.3107, device='cuda:0')
tensor(0.2313, device='cuda:0')
tensor(0.2405, device='cuda:0')
tensor(0.3444, device='cuda:0')
tensor(0.4030, device='cuda:0')
tensor(0.2814, device='cuda:0')
tensor(0.4007, device='cuda:0')
tensor(0.4440, device='cuda:0')


  1%|          | 59/4929 [8:39:31<752:32:23, 556.29s/it]

------- 1st valloss=0.3528

tensor(0.0883, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.7231, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1279, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1153, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3839, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3978, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0929, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1452, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2974, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2002, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4143, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1183, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2504, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1431, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3286, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

tensor(0.1930, device='cuda:0')
tensor(0.3923, device='cuda:0')
tensor(0.3622, device='cuda:0')
tensor(0.3100, device='cuda:0')
tensor(0.4119, device='cuda:0')
tensor(0.2131, device='cuda:0')
tensor(0.3754, device='cuda:0')
tensor(0.2674, device='cuda:0')
tensor(0.1553, device='cuda:0')
tensor(0.3341, device='cuda:0')
tensor(0.3010, device='cuda:0')


  1%|          | 60/4929 [8:48:47<752:18:33, 556.24s/it]

Checkpoint 130 saved !
------- 1st valloss=0.2899

tensor(0.4200, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2420, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3504, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4063, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1787, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2614, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2272, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2289, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3148, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1619, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2586, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2941, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1282, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3184, device='cuda:0', grad_fn=<Rsub

tensor(0.2222, device='cuda:0')
tensor(0.3715, device='cuda:0')
tensor(0.3729, device='cuda:0')
tensor(0.2038, device='cuda:0')
tensor(0.2401, device='cuda:0')
tensor(0.1891, device='cuda:0')
tensor(0.2332, device='cuda:0')
tensor(0.1762, device='cuda:0')
tensor(0.3363, device='cuda:0')
tensor(0.2894, device='cuda:0')
tensor(0.2485, device='cuda:0')


  1%|          | 61/4929 [8:58:00<751:02:32, 555.41s/it]

Checkpoint 131 saved !
------- 1st valloss=0.2661

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1748, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1138, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0857, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1126, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4261, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0656, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1885, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1080, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3895, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1488, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2648, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2916, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)


tensor(0.4458, device='cuda:0')
tensor(0.2282, device='cuda:0')
tensor(0.3486, device='cuda:0')
tensor(0.2497, device='cuda:0')
tensor(0.3223, device='cuda:0')
tensor(0.3369, device='cuda:0')
tensor(0.4119, device='cuda:0')
tensor(0.3231, device='cuda:0')
tensor(0.2647, device='cuda:0')
tensor(0.2408, device='cuda:0')
tensor(0.2899, device='cuda:0')


  1%|▏         | 62/4929 [9:07:10<748:21:41, 553.54s/it]

------- 1st valloss=0.3024

tensor(0.1529, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1652, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2420, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0995, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0635, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2460, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2170, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2063, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3208, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4273, device='cuda:0', gra

tensor(0.3239, device='cuda:0')
tensor(0.4658, device='cuda:0')
tensor(0.3287, device='cuda:0')
tensor(0.2128, device='cuda:0')
tensor(0.2780, device='cuda:0')
tensor(0.2087, device='cuda:0')
tensor(0.2485, device='cuda:0')
tensor(0.1887, device='cuda:0')
tensor(0.2848, device='cuda:0')
tensor(0.1731, device='cuda:0')
tensor(0.1748, device='cuda:0')


  1%|▏         | 63/4929 [9:16:30<751:00:25, 555.62s/it]

------- 1st valloss=0.2674

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2189, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2635, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1686, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3172, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1922, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2030, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2090, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0954, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1081, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1230, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2316, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0875, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1213, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0954, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1102, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.20

tensor(0.3643, device='cuda:0')
tensor(0.3835, device='cuda:0')
tensor(0.2521, device='cuda:0')
tensor(0.3404, device='cuda:0')
tensor(0.2687, device='cuda:0')
tensor(0.1966, device='cuda:0')
tensor(0.3944, device='cuda:0')
tensor(0.3438, device='cuda:0')
tensor(0.2671, device='cuda:0')
tensor(0.4821, device='cuda:0')
tensor(0.3520, device='cuda:0')


  1%|▏         | 64/4929 [9:25:43<749:51:42, 554.88s/it]

------- 1st valloss=0.3039

tensor(0.0680, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2987, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0606, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1489, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0993, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2703, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1893, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1552, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1494, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2644, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2556, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3267, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3064, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='

tensor(0.2884, device='cuda:0')
tensor(0.2837, device='cuda:0')
tensor(0.5089, device='cuda:0')
tensor(0.2604, device='cuda:0')
tensor(0.3905, device='cuda:0')
tensor(0.4349, device='cuda:0')
tensor(0.1797, device='cuda:0')
tensor(0.3871, device='cuda:0')
tensor(0.4211, device='cuda:0')
tensor(0.3725, device='cuda:0')
tensor(0.5561, device='cuda:0')


  1%|▏         | 65/4929 [9:34:53<747:43:23, 553.41s/it]

------- 1st valloss=0.3656

tensor(0.2599, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4586, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1879, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0915, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2725, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2766, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3326, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3393, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2837, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2679, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2137, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2649, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3567, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4276, devi

tensor(0.2720, device='cuda:0')
tensor(0.4661, device='cuda:0')
tensor(0.4138, device='cuda:0')
tensor(0.3411, device='cuda:0')
tensor(0.3833, device='cuda:0')
tensor(0.2823, device='cuda:0')
tensor(0.2052, device='cuda:0')
tensor(0.2686, device='cuda:0')
tensor(0.3709, device='cuda:0')
tensor(0.2904, device='cuda:0')
tensor(0.4404, device='cuda:0')


  1%|▏         | 66/4929 [9:44:08<748:08:12, 553.83s/it]

------- 1st valloss=0.3283

tensor(0.2597, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2792, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3184, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1117, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2087, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1140, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4209, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2624, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3753, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2855, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5851, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3493, device='

tensor(0.2355, device='cuda:0')
tensor(0.1537, device='cuda:0')
tensor(0.3120, device='cuda:0')
tensor(0.3482, device='cuda:0')
tensor(0.4919, device='cuda:0')
tensor(0.3416, device='cuda:0')
tensor(0.1380, device='cuda:0')
tensor(0.2825, device='cuda:0')
tensor(0.3337, device='cuda:0')
tensor(0.2755, device='cuda:0')
tensor(0.2094, device='cuda:0')


  1%|▏         | 67/4929 [9:53:21<747:45:32, 553.67s/it]

------- 1st valloss=0.2717

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2993, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1325, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4037, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0755, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2322, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0755, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1207, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1700, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1919, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4874, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1344, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2937, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1898, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1624, devi

tensor(0.2165, device='cuda:0')
tensor(0.2941, device='cuda:0')
tensor(0.2006, device='cuda:0')
tensor(0.3867, device='cuda:0')
tensor(0.3032, device='cuda:0')
tensor(0.2708, device='cuda:0')
tensor(0.3305, device='cuda:0')
tensor(0.2665, device='cuda:0')
tensor(0.2626, device='cuda:0')
tensor(0.3363, device='cuda:0')
tensor(0.2882, device='cuda:0')


  1%|▏         | 68/4929 [10:02:36<748:05:52, 554.03s/it]

------- 1st valloss=0.2815

tensor(0.2858, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1731, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1720, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0968, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2152, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1702, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3163, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2577, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1051, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2524, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1189, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1393, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0926, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0980, devi

tensor(0.3043, device='cuda:0')
tensor(0.1532, device='cuda:0')
tensor(0.2641, device='cuda:0')
tensor(0.2561, device='cuda:0')
tensor(0.1235, device='cuda:0')
tensor(0.1512, device='cuda:0')
tensor(0.2764, device='cuda:0')
tensor(0.3316, device='cuda:0')
tensor(0.2848, device='cuda:0')
tensor(0.3079, device='cuda:0')
tensor(0.3430, device='cuda:0')


  1%|▏         | 69/4929 [10:11:53<748:58:43, 554.80s/it]

Checkpoint 139 saved !
------- 1st valloss=0.2630

tensor(0.1076, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1610, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1708, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5252, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3615, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3174, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3077, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0935, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1541, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2232, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2574, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.2142, device='cuda:0')
tensor(0.3675, device='cuda:0')
tensor(0.3720, device='cuda:0')
tensor(0.3125, device='cuda:0')
tensor(0.2466, device='cuda:0')
tensor(0.3071, device='cuda:0')
tensor(0.3953, device='cuda:0')
tensor(0.2682, device='cuda:0')
tensor(0.1516, device='cuda:0')
tensor(0.3879, device='cuda:0')
tensor(0.3040, device='cuda:0')
tensor(0.1955, device='cuda:0')
tensor(0.2752, device='cuda:0')
tensor(0.1884, device='cuda:0')
tensor(0.2156, device='cuda:0')
tensor(0.3220, device='cuda:0')
tensor(0.3017, device='cuda:0')
tensor(0.2577, device='cuda:0')
tensor(0.1855, device='cuda:0')
tensor(0.4074, device='cuda:0')
tensor(0.2299, device='cuda:0')
tensor(0.1233, device='cuda:0')
tensor(0.3376, device='cuda:0')


  2%|▏         | 116/4929 [17:27:48<744:50:01, 557.12s/it]

------- 1st valloss=0.2768

tensor(0.1175, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1284, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2516, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2543, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3006, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1536, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3139, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1455, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1054, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1829, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3690, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1222, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1928, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2305, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3877, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

tensor(0.3349, device='cuda:0')
tensor(0.2086, device='cuda:0')
tensor(0.1653, device='cuda:0')
tensor(0.3328, device='cuda:0')
tensor(0.2942, device='cuda:0')
tensor(0.2511, device='cuda:0')
tensor(0.2804, device='cuda:0')
tensor(0.2747, device='cuda:0')
tensor(0.3076, device='cuda:0')
tensor(0.2566, device='cuda:0')
tensor(0.2280, device='cuda:0')


  2%|▏         | 117/4929 [17:37:07<745:40:06, 557.86s/it]

------- 1st valloss=0.2638

tensor(0.3395, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1506, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1034, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1717, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1279, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1610, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2145, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2465, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1124, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1200, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1332, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3786, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3273, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1251, devi

tensor(0.2458, device='cuda:0')
tensor(0.2490, device='cuda:0')
tensor(0.2448, device='cuda:0')
tensor(0.2850, device='cuda:0')
tensor(0.2416, device='cuda:0')
tensor(0.1772, device='cuda:0')
tensor(0.1853, device='cuda:0')
tensor(0.3671, device='cuda:0')
tensor(0.3366, device='cuda:0')
tensor(0.2687, device='cuda:0')
tensor(0.1963, device='cuda:0')


  2%|▏         | 118/4929 [17:46:27<746:24:12, 558.52s/it]

Checkpoint 188 saved !
------- 1st valloss=0.2540

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1096, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1467, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1664, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1962, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1690, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2376, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1836, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2929, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1780, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0505, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1328, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0554, device='cuda:0', grad_fn=<RsubBackward1>)


tensor(0.2234, device='cuda:0')
tensor(0.1234, device='cuda:0')
tensor(0.3847, device='cuda:0')
tensor(0.3261, device='cuda:0')
tensor(0.3535, device='cuda:0')
tensor(0.2141, device='cuda:0')
tensor(0.2712, device='cuda:0')
tensor(0.1832, device='cuda:0')
tensor(0.2781, device='cuda:0')
tensor(0.1717, device='cuda:0')
tensor(0.2183, device='cuda:0')


  2%|▏         | 119/4929 [17:55:43<745:11:59, 557.74s/it]

------- 1st valloss=0.2577

tensor(0.2179, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1610, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4034, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1708, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1140, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2189, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0449, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0908, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2955, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1651, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2955, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1604, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1497, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4550, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5834, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0806, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.2908, device='cuda:0')
tensor(0.2744, device='cuda:0')
tensor(0.3399, device='cuda:0')
tensor(0.1617, device='cuda:0')
tensor(0.2664, device='cuda:0')
tensor(0.2753, device='cuda:0')
tensor(0.2989, device='cuda:0')
tensor(0.2514, device='cuda:0')
tensor(0.2498, device='cuda:0')
tensor(0.1686, device='cuda:0')
tensor(0.3393, device='cuda:0')


  2%|▏         | 120/4929 [18:05:01<745:16:19, 557.91s/it]

Checkpoint 190 saved !
------- 1st valloss=0.2659

tensor(0.2887, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0975, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2047, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0650, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1601, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1189, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1181, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1072, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4764, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1423, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4529, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tens

tensor(0.3342, device='cuda:0')
tensor(0.1618, device='cuda:0')
tensor(0.1957, device='cuda:0')
tensor(0.3591, device='cuda:0')
tensor(0.3704, device='cuda:0')
tensor(0.3112, device='cuda:0')
tensor(0.2770, device='cuda:0')
tensor(0.2876, device='cuda:0')
tensor(0.3022, device='cuda:0')
tensor(0.2490, device='cuda:0')
tensor(0.1963, device='cuda:0')


  2%|▏         | 121/4929 [18:14:42<754:09:52, 564.68s/it]

Checkpoint 191 saved !
------- 1st valloss=0.2483

tensor(0.2681, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2121, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1787, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2133, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2654, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3078, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1318, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2977, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2465, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0628, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1209, device='cuda:0', grad_fn=<RsubBackward1>)
tens

tensor(0.2767, device='cuda:0')
tensor(0.3888, device='cuda:0')
tensor(0.3679, device='cuda:0')
tensor(0.2956, device='cuda:0')
tensor(0.4767, device='cuda:0')
tensor(0.2702, device='cuda:0')
tensor(0.2297, device='cuda:0')
tensor(0.3442, device='cuda:0')
tensor(0.2126, device='cuda:0')
tensor(0.2698, device='cuda:0')
tensor(0.2635, device='cuda:0')


  2%|▏         | 122/4929 [18:24:18<758:25:40, 567.99s/it]

------- 1st valloss=0.2823

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2034, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1606, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2676, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0936, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0982, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1580, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1530, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4498, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0718, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2755, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2429, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2230, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2355, devi

tensor(0.2394, device='cuda:0')
tensor(0.3338, device='cuda:0')
tensor(0.2195, device='cuda:0')
tensor(0.3224, device='cuda:0')
tensor(0.1171, device='cuda:0')
tensor(0.2417, device='cuda:0')
tensor(0.2610, device='cuda:0')
tensor(0.2253, device='cuda:0')
tensor(0.1811, device='cuda:0')
tensor(0.3193, device='cuda:0')
tensor(0.2717, device='cuda:0')


  2%|▏         | 123/4929 [18:33:31<752:26:54, 563.63s/it]

------- 1st valloss=0.2592

tensor(0.3951, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2214, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4131, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0900, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1578, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2074, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2352, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1807, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2263, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1978, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1047, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1772, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2048, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2315, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1669, 

tensor(0.2453, device='cuda:0')
tensor(0.3374, device='cuda:0')
tensor(0.1333, device='cuda:0')
tensor(0.3424, device='cuda:0')
tensor(0.1821, device='cuda:0')
tensor(0.2851, device='cuda:0')
tensor(0.1655, device='cuda:0')
tensor(0.2275, device='cuda:0')
tensor(0.5148, device='cuda:0')
tensor(0.2162, device='cuda:0')
tensor(0.1662, device='cuda:0')


  3%|▎         | 124/4929 [18:42:57<753:17:04, 564.38s/it]

------- 1st valloss=0.2595

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2104, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1640, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3215, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0884, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1973, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1717, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1762, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2422, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1296, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0702, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2481, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0589, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3039, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='

tensor(0.2475, device='cuda:0')
tensor(0.1586, device='cuda:0')
tensor(0.3255, device='cuda:0')
tensor(0.3042, device='cuda:0')
tensor(0.1895, device='cuda:0')
tensor(0.2847, device='cuda:0')
tensor(0.1517, device='cuda:0')
tensor(0.2569, device='cuda:0')
tensor(0.2819, device='cuda:0')
tensor(0.2793, device='cuda:0')
tensor(0.3296, device='cuda:0')


  3%|▎         | 125/4929 [18:52:14<750:02:21, 562.06s/it]

------- 1st valloss=0.2747

tensor(0.2592, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4366, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3857, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2300, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1758, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2673, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3206, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3718, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4326, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1066, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1540, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3308, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1814, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0730, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0972, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1612, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.3012, device='cuda:0')
tensor(0.2566, device='cuda:0')
tensor(0.3048, device='cuda:0')
tensor(0.3032, device='cuda:0')
tensor(0.2057, device='cuda:0')
tensor(0.3175, device='cuda:0')
tensor(0.3644, device='cuda:0')
tensor(0.1100, device='cuda:0')
tensor(0.2723, device='cuda:0')
tensor(0.3077, device='cuda:0')
tensor(0.3944, device='cuda:0')


  3%|▎         | 126/4929 [19:01:29<747:03:16, 559.94s/it]

------- 1st valloss=0.2585

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4393, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2160, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1107, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1079, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3152, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1080, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3786, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3188, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2661, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1519, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1558, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2270, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2639, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0927, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0832, 

tensor(0.2257, device='cuda:0')
tensor(0.2739, device='cuda:0')
tensor(0.2971, device='cuda:0')
tensor(0.2436, device='cuda:0')
tensor(0.2949, device='cuda:0')
tensor(0.3613, device='cuda:0')
tensor(0.2113, device='cuda:0')
tensor(0.4357, device='cuda:0')
tensor(0.2661, device='cuda:0')
tensor(0.3939, device='cuda:0')
tensor(0.2730, device='cuda:0')


  3%|▎         | 127/4929 [19:10:49<746:51:25, 559.91s/it]

------- 1st valloss=0.2873

tensor(0.1086, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1829, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3217, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0796, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0649, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1678, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4696, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1583, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0919, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1150, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2281, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4140, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4028, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='

tensor(0.3202, device='cuda:0')
tensor(0.2238, device='cuda:0')
tensor(0.4941, device='cuda:0')
tensor(0.1934, device='cuda:0')
tensor(0.2808, device='cuda:0')
tensor(0.2516, device='cuda:0')
tensor(0.4072, device='cuda:0')
tensor(0.2403, device='cuda:0')
tensor(0.2887, device='cuda:0')
tensor(0.2103, device='cuda:0')
tensor(0.2249, device='cuda:0')


  3%|▎         | 128/4929 [19:19:59<742:50:06, 557.01s/it]

------- 1st valloss=0.2784

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2355, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0952, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1787, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2489, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2546, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2098, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0841, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1418, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1422, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3245, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0',

tensor(0.2966, device='cuda:0')
tensor(0.3379, device='cuda:0')
tensor(0.2117, device='cuda:0')
tensor(0.2840, device='cuda:0')
tensor(0.2595, device='cuda:0')
tensor(0.2324, device='cuda:0')
tensor(0.3704, device='cuda:0')
tensor(0.3523, device='cuda:0')
tensor(0.2244, device='cuda:0')
tensor(0.1102, device='cuda:0')
tensor(0.3858, device='cuda:0')


  3%|▎         | 129/4929 [19:29:20<744:10:53, 558.14s/it]

------- 1st valloss=0.2740

tensor(0.3859, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4110, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0743, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2509, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1766, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1698, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2288, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2089, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0917, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5268, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2382, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2180, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2233, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1357, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2003, 

tensor(0.2372, device='cuda:0')
tensor(0.1447, device='cuda:0')
tensor(0.1404, device='cuda:0')
tensor(0.1349, device='cuda:0')
tensor(0.2528, device='cuda:0')
tensor(0.3201, device='cuda:0')
tensor(0.2514, device='cuda:0')
tensor(0.1515, device='cuda:0')
tensor(0.2337, device='cuda:0')
tensor(0.3281, device='cuda:0')
tensor(0.3605, device='cuda:0')


  3%|▎         | 130/4929 [19:38:39<744:17:09, 558.33s/it]

Checkpoint 200 saved !
------- 1st valloss=0.2557

tensor(0.0547, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0866, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1545, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2050, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2346, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2521, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3281, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1331, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2556, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4079, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2178, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3386, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1157, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3596, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1179, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<Rsub

tensor(0.2976, device='cuda:0')
tensor(0.3284, device='cuda:0')
tensor(0.3620, device='cuda:0')
tensor(0.2562, device='cuda:0')
tensor(0.3328, device='cuda:0')
tensor(0.1356, device='cuda:0')
tensor(0.2218, device='cuda:0')
tensor(0.1976, device='cuda:0')
tensor(0.1985, device='cuda:0')
tensor(0.2748, device='cuda:0')
tensor(0.2283, device='cuda:0')


  3%|▎         | 131/4929 [19:47:58<744:44:36, 558.79s/it]

------- 1st valloss=0.2944

tensor(0.2029, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1577, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1573, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3826, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1126, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0991, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1268, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0868, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1400, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5116, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1370, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2926, devi

tensor(0.2890, device='cuda:0')
tensor(0.2672, device='cuda:0')
tensor(0.3399, device='cuda:0')
tensor(0.3505, device='cuda:0')
tensor(0.2724, device='cuda:0')
tensor(0.1154, device='cuda:0')
tensor(0.4125, device='cuda:0')
tensor(0.1622, device='cuda:0')
tensor(0.3116, device='cuda:0')
tensor(0.3816, device='cuda:0')
tensor(0.3686, device='cuda:0')


  3%|▎         | 132/4929 [19:57:12<742:31:42, 557.24s/it]

------- 1st valloss=0.2788

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6466, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1196, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2118, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5671, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0943, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2549, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0783, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2421, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1242, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1529, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2848, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0915, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3238, device='

tensor(0.3917, device='cuda:0')
tensor(0.4877, device='cuda:0')
tensor(0.3924, device='cuda:0')
tensor(0.3244, device='cuda:0')
tensor(0.1462, device='cuda:0')
tensor(0.2950, device='cuda:0')
tensor(0.4755, device='cuda:0')
tensor(0.2731, device='cuda:0')
tensor(0.3062, device='cuda:0')
tensor(0.3211, device='cuda:0')
tensor(0.2310, device='cuda:0')


  3%|▎         | 133/4929 [20:06:27<741:25:36, 556.53s/it]

------- 1st valloss=0.2971

tensor(0.1021, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5225, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4535, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1606, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1615, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2210, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1906, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1991, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1820, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0887, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4555, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1629, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2893, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0874, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.3533, device='cuda:0')
tensor(0.3961, device='cuda:0')
tensor(0.2822, device='cuda:0')
tensor(0.3332, device='cuda:0')
tensor(0.2775, device='cuda:0')
tensor(0.3634, device='cuda:0')
tensor(0.2319, device='cuda:0')
tensor(0.5338, device='cuda:0')
tensor(0.1069, device='cuda:0')
tensor(0.1664, device='cuda:0')
tensor(0.3832, device='cuda:0')


  3%|▎         | 134/4929 [20:15:55<745:46:49, 559.92s/it]

------- 1st valloss=0.2861

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1067, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1986, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0568, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1639, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6910, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4929, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3837, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0552, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2322, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1545, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1060, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2717, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1473, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.2380, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1095, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.8081, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1509, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0963, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1299, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1168, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4225, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1188, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2312, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0843, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2130, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1236, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4907, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1600, device='cuda:0', grad_fn=<Rs

  3%|▎         | 172/4929 [26:12:00<744:07:14, 563.14s/it]

------- 1st valloss=0.2626

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3680, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1109, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0891, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3130, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1474, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1515, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2081, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2316, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1760, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1362, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1521, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2113, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1693, devi

tensor(0.2671, device='cuda:0')
tensor(0.3725, device='cuda:0')
tensor(0.4234, device='cuda:0')
tensor(0.3485, device='cuda:0')
tensor(0.2207, device='cuda:0')
tensor(0.1190, device='cuda:0')
tensor(0.3185, device='cuda:0')
tensor(0.1369, device='cuda:0')
tensor(0.3095, device='cuda:0')
tensor(0.3025, device='cuda:0')
tensor(0.1226, device='cuda:0')


  4%|▎         | 173/4929 [26:21:20<742:53:20, 562.32s/it]

------- 1st valloss=0.2509

tensor(0.1748, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2636, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1975, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1794, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0900, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3389, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1185, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1081, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0908, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2331, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', gra

tensor(0.4101, device='cuda:0')
tensor(0.1370, device='cuda:0')
tensor(0.1332, device='cuda:0')
tensor(0.2224, device='cuda:0')
tensor(0.1307, device='cuda:0')
tensor(0.2227, device='cuda:0')
tensor(0.3332, device='cuda:0')
tensor(0.3278, device='cuda:0')
tensor(0.1234, device='cuda:0')
tensor(0.3238, device='cuda:0')
tensor(0.3271, device='cuda:0')


  4%|▎         | 174/4929 [26:30:41<742:17:02, 561.98s/it]

------- 1st valloss=0.2691

tensor(0.1146, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1166, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3763, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3861, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3863, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.6138, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1244, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1149, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1549, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2610, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0828, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1959, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='

tensor(0.1791, device='cuda:0')
tensor(0.3099, device='cuda:0')
tensor(0.3011, device='cuda:0')
tensor(0.1919, device='cuda:0')
tensor(0.3756, device='cuda:0')
tensor(0.3165, device='cuda:0')
tensor(0.2402, device='cuda:0')
tensor(0.2692, device='cuda:0')
tensor(0.2789, device='cuda:0')
tensor(0.3503, device='cuda:0')
tensor(0.1900, device='cuda:0')


  4%|▎         | 175/4929 [26:40:05<742:40:56, 562.40s/it]

------- 1st valloss=0.2712

tensor(0.1816, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2754, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3656, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3831, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2371, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3831, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1672, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1478, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1611, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0813, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1977, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0995, device='cuda

tensor(0.2978, device='cuda:0')
tensor(0.2254, device='cuda:0')
tensor(0.1377, device='cuda:0')
tensor(0.2292, device='cuda:0')
tensor(0.2722, device='cuda:0')
tensor(0.2915, device='cuda:0')
tensor(0.4068, device='cuda:0')
tensor(0.2740, device='cuda:0')
tensor(0.1471, device='cuda:0')
tensor(0.1909, device='cuda:0')
tensor(0.3252, device='cuda:0')


  4%|▎         | 176/4929 [26:49:29<743:28:26, 563.12s/it]

------- 1st valloss=0.2564

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3803, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1921, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1116, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0906, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2016, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2882, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1147, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0821, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2252, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2915, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2462, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1002, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2473, device='

tensor(0.2733, device='cuda:0')
tensor(0.2962, device='cuda:0')
tensor(0.1016, device='cuda:0')
tensor(0.4501, device='cuda:0')
tensor(0.2296, device='cuda:0')
tensor(0.2232, device='cuda:0')
tensor(0.3981, device='cuda:0')
tensor(0.1644, device='cuda:0')
tensor(0.3546, device='cuda:0')
tensor(0.2386, device='cuda:0')
tensor(0.2121, device='cuda:0')


  4%|▎         | 177/4929 [26:58:57<744:57:54, 564.37s/it]

------- 1st valloss=0.2796

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1530, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1402, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1212, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1430, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1777, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1524, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2718, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1211, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1001, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1358, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2185, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1546, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1780, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1604, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.08

tensor(0.3742, device='cuda:0')
tensor(0.2020, device='cuda:0')
tensor(0.1860, device='cuda:0')
tensor(0.3047, device='cuda:0')
tensor(0.2741, device='cuda:0')
tensor(0.2402, device='cuda:0')
tensor(0.4478, device='cuda:0')
tensor(0.2778, device='cuda:0')
tensor(0.1989, device='cuda:0')
tensor(0.2069, device='cuda:0')
tensor(0.3670, device='cuda:0')


  4%|▎         | 178/4929 [27:08:23<745:32:39, 564.93s/it]

------- 1st valloss=0.2660

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2296, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1734, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0882, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1465, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1448, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1631, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2493, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1102, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0952, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4671, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1052, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0897, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1410, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2408, devi

tensor(0.1418, device='cuda:0')
tensor(0.2468, device='cuda:0')
tensor(0.2301, device='cuda:0')
tensor(0.1234, device='cuda:0')
tensor(0.3260, device='cuda:0')
tensor(0.2602, device='cuda:0')
tensor(0.2781, device='cuda:0')
tensor(0.2433, device='cuda:0')
tensor(0.3496, device='cuda:0')
tensor(0.4219, device='cuda:0')
tensor(0.2251, device='cuda:0')


  4%|▎         | 179/4929 [27:17:53<747:21:54, 566.42s/it]

------- 1st valloss=0.2522

tensor(0.2745, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1095, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3741, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0877, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1350, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0791, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2011, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1336, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1173, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0868, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2519, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1427, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1101, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1481, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2205, 

tensor(0.2747, device='cuda:0')
tensor(0.2911, device='cuda:0')
tensor(0.2871, device='cuda:0')
tensor(0.2932, device='cuda:0')
tensor(0.3296, device='cuda:0')
tensor(0.3675, device='cuda:0')
tensor(0.2251, device='cuda:0')
tensor(0.3008, device='cuda:0')
tensor(0.3864, device='cuda:0')
tensor(0.2268, device='cuda:0')
tensor(0.5737, device='cuda:0')


  4%|▎         | 180/4929 [27:27:17<746:09:37, 565.63s/it]

Checkpoint 250 saved !
------- 1st valloss=0.2815

tensor(0.3185, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1519, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1689, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2329, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1664, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2798, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3955, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1482, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1798, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2425, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0934, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1150, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0750, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2225, device='cuda:0', grad_fn=<Rsub

tensor(0.3677, device='cuda:0')
tensor(0.4245, device='cuda:0')
tensor(0.2932, device='cuda:0')
tensor(0.1080, device='cuda:0')
tensor(0.1572, device='cuda:0')
tensor(0.2833, device='cuda:0')
tensor(0.2216, device='cuda:0')
tensor(0.1530, device='cuda:0')
tensor(0.3354, device='cuda:0')
tensor(0.2571, device='cuda:0')
tensor(0.2903, device='cuda:0')


  4%|▎         | 181/4929 [27:36:42<746:04:12, 565.68s/it]

------- 1st valloss=0.2608

tensor(0.1804, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5074, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0534, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1265, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2410, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1219, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0682, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2552, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1406, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3835, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2537, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2553, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5139, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1644, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3748, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1295, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.1480, device='cuda:0')
tensor(0.2559, device='cuda:0')
tensor(0.2147, device='cuda:0')
tensor(0.1686, device='cuda:0')
tensor(0.2369, device='cuda:0')
tensor(0.1791, device='cuda:0')
tensor(0.2265, device='cuda:0')
tensor(0.2363, device='cuda:0')
tensor(0.2557, device='cuda:0')
tensor(0.4894, device='cuda:0')
tensor(0.3144, device='cuda:0')


  4%|▎         | 182/4929 [27:46:09<746:10:23, 565.88s/it]

------- 1st valloss=0.2567

tensor(0.1068, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1666, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2978, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2715, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1134, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1019, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1225, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5600, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1343, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0914, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4530, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2036, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0883, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2404, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2393, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.15

tensor(0.2124, device='cuda:0')
tensor(0.2386, device='cuda:0')
tensor(0.2697, device='cuda:0')
tensor(0.3146, device='cuda:0')
tensor(0.3336, device='cuda:0')
tensor(0.2631, device='cuda:0')
tensor(0.2176, device='cuda:0')
tensor(0.1310, device='cuda:0')
tensor(0.2778, device='cuda:0')
tensor(0.2889, device='cuda:0')
tensor(0.2184, device='cuda:0')


  4%|▎         | 183/4929 [27:55:29<743:42:17, 564.13s/it]

Checkpoint 253 saved !
------- 1st valloss=0.2422

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2451, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2191, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1434, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1903, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2299, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2221, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2542, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3310, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2063, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0438, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tens

tensor(0.1503, device='cuda:0')
tensor(0.2684, device='cuda:0')
tensor(0.3064, device='cuda:0')
tensor(0.2307, device='cuda:0')
tensor(0.2412, device='cuda:0')
tensor(0.2706, device='cuda:0')
tensor(0.2821, device='cuda:0')
tensor(0.2663, device='cuda:0')
tensor(0.4225, device='cuda:0')
tensor(0.2871, device='cuda:0')
tensor(0.3236, device='cuda:0')


  4%|▎         | 184/4929 [28:04:52<743:19:00, 563.95s/it]

------- 1st valloss=0.2567

tensor(0.3164, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4352, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1060, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0454, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0880, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1564, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1427, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0882, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2089, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1518, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1334, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0754, device='

tensor(0.2365, device='cuda:0')
tensor(0.1681, device='cuda:0')
tensor(0.1647, device='cuda:0')
tensor(0.2757, device='cuda:0')
tensor(0.2436, device='cuda:0')
tensor(0.3018, device='cuda:0')
tensor(0.1049, device='cuda:0')
tensor(0.2398, device='cuda:0')
tensor(0.2737, device='cuda:0')
tensor(0.3264, device='cuda:0')
tensor(0.1536, device='cuda:0')


  4%|▍         | 185/4929 [28:14:15<742:32:15, 563.48s/it]

------- 1st valloss=0.2522

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2647, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2401, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2449, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0613, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1470, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2604, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1115, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1344, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1276, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda

tensor(0.1841, device='cuda:0')
tensor(0.1628, device='cuda:0')
tensor(0.1963, device='cuda:0')
tensor(0.3501, device='cuda:0')
tensor(0.2950, device='cuda:0')
tensor(0.1649, device='cuda:0')
tensor(0.1930, device='cuda:0')
tensor(0.3209, device='cuda:0')
tensor(0.2378, device='cuda:0')
tensor(0.1422, device='cuda:0')
tensor(0.2534, device='cuda:0')


  4%|▍         | 186/4929 [28:23:45<745:12:05, 565.62s/it]

------- 1st valloss=0.2490

tensor(0.0850, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2545, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1119, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2348, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4213, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2204, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1829, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2195, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0996, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2579, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2253, device='cuda:0',

tensor(0.3958, device='cuda:0')
tensor(0.3300, device='cuda:0')
tensor(0.3443, device='cuda:0')
tensor(0.3359, device='cuda:0')
tensor(0.2601, device='cuda:0')
tensor(0.2726, device='cuda:0')
tensor(0.2739, device='cuda:0')
tensor(0.1431, device='cuda:0')
tensor(0.3614, device='cuda:0')
tensor(0.1704, device='cuda:0')
tensor(0.1920, device='cuda:0')


  4%|▍         | 187/4929 [28:33:08<743:45:29, 564.64s/it]

------- 1st valloss=0.2862

tensor(0.2513, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3170, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3681, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1409, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1866, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1403, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3413, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2371, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1981, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1521, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3200, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2186, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3434, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1854, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.1693, device='cuda:0')
tensor(0.1819, device='cuda:0')
tensor(0.3313, device='cuda:0')
tensor(0.2253, device='cuda:0')
tensor(0.1476, device='cuda:0')
tensor(0.2814, device='cuda:0')
tensor(0.4227, device='cuda:0')
tensor(0.1288, device='cuda:0')
tensor(0.2049, device='cuda:0')
tensor(0.2985, device='cuda:0')
tensor(0.4165, device='cuda:0')


  4%|▍         | 188/4929 [28:42:28<741:57:59, 563.40s/it]

------- 1st valloss=0.2529

tensor(0.2680, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0589, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1287, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1089, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2262, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1242, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2899, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0892, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1396, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2403, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1701, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1066, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0962, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1189, devi

tensor(0.1968, device='cuda:0')
tensor(0.2790, device='cuda:0')
tensor(0.2439, device='cuda:0')
tensor(0.2555, device='cuda:0')
tensor(0.2270, device='cuda:0')
tensor(0.1728, device='cuda:0')
tensor(0.1143, device='cuda:0')
tensor(0.4382, device='cuda:0')
tensor(0.4496, device='cuda:0')
tensor(0.2598, device='cuda:0')
tensor(0.2321, device='cuda:0')


  4%|▍         | 189/4929 [28:51:56<743:36:50, 564.77s/it]

------- 1st valloss=0.2641

tensor(0.2216, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2413, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1559, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1239, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1505, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1715, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1128, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0915, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1726, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1718, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1418, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2052, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2401, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2901, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.2604, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1498, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1138, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3452, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2274, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1325, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1948, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1407, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1631, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0894, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2778, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2368, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3755, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1322, device='cuda:0', grad_fn=<RsubBa

  5%|▍         | 228/4929 [34:53:34<727:14:44, 556.92s/it]

------- 1st valloss=0.2637

tensor(0.1159, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2759, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4020, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1705, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1590, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3152, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2158, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1398, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1774, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2060, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1978, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2916, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1444, devi

tensor(0.1963, device='cuda:0')
tensor(0.1252, device='cuda:0')
tensor(0.3119, device='cuda:0')
tensor(0.1377, device='cuda:0')
tensor(0.3575, device='cuda:0')
tensor(0.2673, device='cuda:0')
tensor(0.3724, device='cuda:0')
tensor(0.1072, device='cuda:0')
tensor(0.3463, device='cuda:0')
tensor(0.4252, device='cuda:0')
tensor(0.3455, device='cuda:0')


  5%|▍         | 229/4929 [35:02:51<726:53:52, 556.77s/it]

------- 1st valloss=0.2737

tensor(0.4131, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3508, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2361, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1951, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0870, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1176, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0550, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0649, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2594, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0771, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4086, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2572, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0775, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2360, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.3560, device='cuda:0')
tensor(0.2513, device='cuda:0')
tensor(0.1862, device='cuda:0')
tensor(0.1113, device='cuda:0')
tensor(0.2266, device='cuda:0')
tensor(0.2862, device='cuda:0')
tensor(0.2185, device='cuda:0')
tensor(0.3340, device='cuda:0')
tensor(0.3084, device='cuda:0')
tensor(0.1345, device='cuda:0')
tensor(0.4550, device='cuda:0')


  5%|▍         | 230/4929 [35:12:10<727:48:48, 557.59s/it]

Checkpoint 300 saved !
------- 1st valloss=0.2491

tensor(0.0919, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1180, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2414, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2768, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1444, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2034, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1965, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4026, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1503, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3415, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1631, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1782, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0847, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0967, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBack

tensor(0.3409, device='cuda:0')
tensor(0.1977, device='cuda:0')
tensor(0.1789, device='cuda:0')
tensor(0.1918, device='cuda:0')
tensor(0.1778, device='cuda:0')
tensor(0.2560, device='cuda:0')
tensor(0.2397, device='cuda:0')
tensor(0.3166, device='cuda:0')
tensor(0.2232, device='cuda:0')
tensor(0.2775, device='cuda:0')
tensor(0.3105, device='cuda:0')


  5%|▍         | 231/4929 [35:21:30<728:23:17, 558.15s/it]

------- 1st valloss=0.2529

tensor(0.2968, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1728, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2757, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1571, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0803, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1319, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0900, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3889, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1639, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2439, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2384, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3158, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1653, devi

tensor(0.3496, device='cuda:0')
tensor(0.2903, device='cuda:0')
tensor(0.2945, device='cuda:0')
tensor(0.2592, device='cuda:0')
tensor(0.1736, device='cuda:0')
tensor(0.2488, device='cuda:0')
tensor(0.1903, device='cuda:0')
tensor(0.3142, device='cuda:0')
tensor(0.1957, device='cuda:0')
tensor(0.1817, device='cuda:0')
tensor(0.2859, device='cuda:0')


  5%|▍         | 232/4929 [35:30:42<726:10:47, 556.58s/it]

------- 1st valloss=0.2411

tensor(0.0933, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3600, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0575, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1465, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1254, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1489, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2988, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1812, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3722, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0590, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5630, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1271, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1435, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1444, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4986, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.14

tensor(0.1616, device='cuda:0')
tensor(0.1958, device='cuda:0')
tensor(0.2189, device='cuda:0')
tensor(0.2956, device='cuda:0')
tensor(0.2596, device='cuda:0')
tensor(0.2973, device='cuda:0')
tensor(0.3034, device='cuda:0')
tensor(0.2204, device='cuda:0')
tensor(0.2126, device='cuda:0')


  5%|▍         | 233/4929 [35:40:08<729:36:00, 559.32s/it]

------- 1st valloss=0.2629

tensor(0.0881, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2515, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0840, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2836, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1406, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1718, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1400, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2013, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2501, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2198, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0941, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0654, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1574, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1692, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4489, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.3441, device='cuda:0')
tensor(0.3609, device='cuda:0')
tensor(0.3913, device='cuda:0')
tensor(0.1611, device='cuda:0')
tensor(0.4468, device='cuda:0')
tensor(0.2646, device='cuda:0')
tensor(0.2719, device='cuda:0')
tensor(0.2190, device='cuda:0')
tensor(0.2368, device='cuda:0')
tensor(0.2782, device='cuda:0')
tensor(0.3236, device='cuda:0')


  5%|▍         | 234/4929 [35:49:26<728:47:21, 558.82s/it]

------- 1st valloss=0.2908

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2096, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0733, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1006, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2124, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1529, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0605, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3797, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4687, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0693, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2928, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0978, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1366, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1578, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1798, devi

tensor(0.2804, device='cuda:0')
tensor(0.1302, device='cuda:0')
tensor(0.3440, device='cuda:0')
tensor(0.2737, device='cuda:0')
tensor(0.2497, device='cuda:0')
tensor(0.1578, device='cuda:0')
tensor(0.2474, device='cuda:0')
tensor(0.2875, device='cuda:0')
tensor(0.1447, device='cuda:0')
tensor(0.2719, device='cuda:0')
tensor(0.2610, device='cuda:0')


  5%|▍         | 235/4929 [35:58:35<725:01:21, 556.05s/it]

Checkpoint 305 saved !
------- 1st valloss=0.2398

tensor(0.0908, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1289, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2516, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0682, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1357, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2347, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1017, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3310, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0784, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1614, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0902, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1630, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1218, device='cuda:0', grad_fn=<RsubBackward

tensor(0.1370, device='cuda:0')
tensor(0.3421, device='cuda:0')
tensor(0.1475, device='cuda:0')
tensor(0.2960, device='cuda:0')
tensor(0.3316, device='cuda:0')
tensor(0.1982, device='cuda:0')
tensor(0.3040, device='cuda:0')
tensor(0.4234, device='cuda:0')
tensor(0.3918, device='cuda:0')
tensor(0.2893, device='cuda:0')
tensor(0.2095, device='cuda:0')


  5%|▍         | 236/4929 [36:07:50<724:08:01, 555.48s/it]

------- 1st valloss=0.2669

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2673, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0884, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0913, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1210, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4068, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2448, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3842, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0812, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3472, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2754, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1220, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0455, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0872, device='

tensor(0.3665, device='cuda:0')
tensor(0.2396, device='cuda:0')
tensor(0.3175, device='cuda:0')
tensor(0.2072, device='cuda:0')
tensor(0.2288, device='cuda:0')
tensor(0.1217, device='cuda:0')
tensor(0.1885, device='cuda:0')
tensor(0.2185, device='cuda:0')
tensor(0.3342, device='cuda:0')
tensor(0.2172, device='cuda:0')
tensor(0.2171, device='cuda:0')


  5%|▍         | 237/4929 [36:17:04<723:40:55, 555.25s/it]

------- 1st valloss=0.2578

tensor(0.0909, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2488, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2495, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1017, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2183, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2120, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1067, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3589, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1610, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3635, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0911, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1353, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2299, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2518, devi

tensor(0.3241, device='cuda:0')
tensor(0.1429, device='cuda:0')
tensor(0.2331, device='cuda:0')
tensor(0.1985, device='cuda:0')
tensor(0.1916, device='cuda:0')
tensor(0.2377, device='cuda:0')
tensor(0.1681, device='cuda:0')
tensor(0.2120, device='cuda:0')
tensor(0.2433, device='cuda:0')
tensor(0.3269, device='cuda:0')
tensor(0.1477, device='cuda:0')


  5%|▍         | 238/4929 [36:26:24<725:18:39, 556.62s/it]

------- 1st valloss=0.2452

tensor(0.1384, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1776, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2284, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1364, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2376, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0663, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1078, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0826, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2837, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4215, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2464, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1796, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2476, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0898, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1174, 

tensor(0.2144, device='cuda:0')
tensor(0.2756, device='cuda:0')
tensor(0.3378, device='cuda:0')
tensor(0.3379, device='cuda:0')
tensor(0.3018, device='cuda:0')
tensor(0.3773, device='cuda:0')
tensor(0.3403, device='cuda:0')
tensor(0.1631, device='cuda:0')
tensor(0.1879, device='cuda:0')
tensor(0.3491, device='cuda:0')
tensor(0.1627, device='cuda:0')


  5%|▍         | 239/4929 [36:35:41<725:19:34, 556.75s/it]

------- 1st valloss=0.2579

tensor(0.1464, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1857, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4392, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0924, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0779, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1655, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0949, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2356, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0605, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1431, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0816, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2965, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0681, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0901, devi

tensor(0.1549, device='cuda:0')
tensor(0.1955, device='cuda:0')
tensor(0.2927, device='cuda:0')
tensor(0.1750, device='cuda:0')
tensor(0.2652, device='cuda:0')
tensor(0.2932, device='cuda:0')
tensor(0.4024, device='cuda:0')
tensor(0.3077, device='cuda:0')
tensor(0.3510, device='cuda:0')
tensor(0.1395, device='cuda:0')
tensor(0.1939, device='cuda:0')


  5%|▍         | 240/4929 [36:45:00<726:00:14, 557.39s/it]

Checkpoint 310 saved !
------- 1st valloss=0.2526

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1612, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1861, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1912, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1067, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1353, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0597, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2247, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1787, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0969, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4803, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1086, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1468, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1861, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4794, device='cuda:0', grad_fn=<RsubBack

tensor(0.3655, device='cuda:0')
tensor(0.3155, device='cuda:0')
tensor(0.2867, device='cuda:0')
tensor(0.3158, device='cuda:0')
tensor(0.2795, device='cuda:0')
tensor(0.2292, device='cuda:0')
tensor(0.3775, device='cuda:0')
tensor(0.2819, device='cuda:0')
tensor(0.4105, device='cuda:0')
tensor(0.1495, device='cuda:0')
tensor(0.3447, device='cuda:0')


  5%|▍         | 241/4929 [36:54:10<722:52:40, 555.11s/it]

------- 1st valloss=0.2728

tensor(0.3126, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1286, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1889, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1705, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2271, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0652, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3934, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2742, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1659, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1899, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1944, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1230, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2640, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3750, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1293, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1293, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.2704, device='cuda:0')
tensor(0.0934, device='cuda:0')
tensor(0.2278, device='cuda:0')
tensor(0.2062, device='cuda:0')
tensor(0.3794, device='cuda:0')
tensor(0.1919, device='cuda:0')
tensor(0.2279, device='cuda:0')
tensor(0.2693, device='cuda:0')
tensor(0.2353, device='cuda:0')
tensor(0.1862, device='cuda:0')
tensor(0.2249, device='cuda:0')


  5%|▍         | 242/4929 [37:03:22<721:31:32, 554.19s/it]

------- 1st valloss=0.2417

tensor(0.1447, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1903, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1961, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2105, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4648, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2211, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2112, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2328, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2397, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2520, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3443, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1436, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2235, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0849, devi

tensor(0.2104, device='cuda:0')
tensor(0.2357, device='cuda:0')
tensor(0.2000, device='cuda:0')
tensor(0.2614, device='cuda:0')
tensor(0.3100, device='cuda:0')
tensor(0.2108, device='cuda:0')
tensor(0.1971, device='cuda:0')
tensor(0.3126, device='cuda:0')
tensor(0.1158, device='cuda:0')
tensor(0.1955, device='cuda:0')
tensor(0.1775, device='cuda:0')


  5%|▍         | 243/4929 [37:12:38<722:02:58, 554.71s/it]

------- 1st valloss=0.2441

tensor(0.1523, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5298, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2548, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3992, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2563, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2086, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0629, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1791, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2019, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1298, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1221, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2740, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0762, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2372, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1930, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1888, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.2560, device='cuda:0')
tensor(0.4698, device='cuda:0')
tensor(0.3082, device='cuda:0')
tensor(0.3464, device='cuda:0')
tensor(0.4525, device='cuda:0')
tensor(0.2685, device='cuda:0')
tensor(0.1504, device='cuda:0')
tensor(0.1719, device='cuda:0')
tensor(0.3143, device='cuda:0')
tensor(0.2826, device='cuda:0')
tensor(0.2939, device='cuda:0')


  5%|▍         | 244/4929 [37:21:51<721:17:29, 554.25s/it]

------- 1st valloss=0.2972

tensor(0.1544, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3077, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1351, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1896, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1176, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3927, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0713, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1594, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2983, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1271, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0947, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0',

tensor(0.3720, device='cuda:0')
tensor(0.2281, device='cuda:0')
tensor(0.2250, device='cuda:0')
tensor(0.3459, device='cuda:0')
tensor(0.3882, device='cuda:0')
tensor(0.1609, device='cuda:0')
tensor(0.2092, device='cuda:0')
tensor(0.2590, device='cuda:0')
tensor(0.4305, device='cuda:0')
tensor(0.2869, device='cuda:0')
tensor(0.1922, device='cuda:0')


  5%|▍         | 245/4929 [37:31:10<722:52:27, 555.58s/it]

------- 1st valloss=0.2752

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0981, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1381, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1337, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0919, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0837, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0577, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3122, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1609, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2336, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1772, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1930, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2139, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1152, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1757, devi

tensor(0.2251, device='cuda:0')
tensor(0.1657, device='cuda:0')
tensor(0.2244, device='cuda:0')
tensor(0.3325, device='cuda:0')
tensor(0.3549, device='cuda:0')
tensor(0.1144, device='cuda:0')
tensor(0.3546, device='cuda:0')
tensor(0.2493, device='cuda:0')
tensor(0.1833, device='cuda:0')
tensor(0.2242, device='cuda:0')
tensor(0.2410, device='cuda:0')


  5%|▍         | 246/4929 [37:40:25<722:40:43, 555.55s/it]

------- 1st valloss=0.2612

tensor(0.1452, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1500, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1060, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2536, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0822, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1586, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0606, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2642, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1490, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1013, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2542, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2280, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3051, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1341, 

tensor(0.2260, device='cuda:0')
tensor(0.1810, device='cuda:0')
tensor(0.2546, device='cuda:0')
tensor(0.4269, device='cuda:0')
tensor(0.4475, device='cuda:0')
tensor(0.3469, device='cuda:0')
tensor(0.4345, device='cuda:0')
tensor(0.2429, device='cuda:0')
tensor(0.2228, device='cuda:0')
tensor(0.1886, device='cuda:0')
tensor(0.3611, device='cuda:0')


  5%|▌         | 247/4929 [37:49:39<721:49:43, 555.02s/it]

------- 1st valloss=0.2645

tensor(0.1028, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1128, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2735, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0670, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1793, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4763, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1209, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1181, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4026, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2930, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2827, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0480, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1243, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='

tensor(0.2987, device='cuda:0')
tensor(0.3899, device='cuda:0')
tensor(0.2580, device='cuda:0')
tensor(0.2446, device='cuda:0')
tensor(0.1888, device='cuda:0')
tensor(0.3016, device='cuda:0')
tensor(0.1532, device='cuda:0')
tensor(0.2982, device='cuda:0')
tensor(0.1684, device='cuda:0')
tensor(0.2588, device='cuda:0')
tensor(0.3392, device='cuda:0')


  5%|▌         | 248/4929 [37:59:00<724:10:13, 556.94s/it]

------- 1st valloss=0.2649

tensor(0.1317, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1211, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1609, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2010, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0862, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0608, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0956, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2208, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2263, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3850, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1546, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1791, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1136, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1977, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.4212, device='cuda:0')
tensor(0.2006, device='cuda:0')
tensor(0.3241, device='cuda:0')
tensor(0.2948, device='cuda:0')
tensor(0.2424, device='cuda:0')
tensor(0.3179, device='cuda:0')
tensor(0.1828, device='cuda:0')
tensor(0.1169, device='cuda:0')
tensor(0.3176, device='cuda:0')
tensor(0.1929, device='cuda:0')
tensor(0.2274, device='cuda:0')


  5%|▌         | 249/4929 [38:08:21<725:28:53, 558.06s/it]

------- 1st valloss=0.2569

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1739, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2877, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1744, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1518, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0972, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2360, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1431, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1643, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3372, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1648, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1576, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1105, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda

tensor(0.4013, device='cuda:0')
tensor(0.3250, device='cuda:0')
tensor(0.3615, device='cuda:0')
tensor(0.2368, device='cuda:0')
tensor(0.1605, device='cuda:0')
tensor(0.2868, device='cuda:0')
tensor(0.1665, device='cuda:0')
tensor(0.2716, device='cuda:0')
tensor(0.3175, device='cuda:0')
tensor(0.3246, device='cuda:0')
tensor(0.1459, device='cuda:0')


  5%|▌         | 250/4929 [38:17:37<724:32:58, 557.46s/it]

Checkpoint 320 saved !
------- 1st valloss=0.2438

tensor(0.1541, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4353, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2311, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1505, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0611, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0989, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0864, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2457, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1415, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2684, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4463, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0782, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1517, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1449, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1102, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0880, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1032, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1969, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0697, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1401, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1323, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4656, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1137, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2337, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0890, device='cuda:0', grad_fn=<Rs

  6%|▌         | 295/4929 [45:13:53<711:44:19, 552.93s/it]

------- 1st valloss=0.2449

tensor(0.2131, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1170, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2648, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3581, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0750, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2097, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2438, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3469, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2049, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2538, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1559, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0762, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2259, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2161, devi

tensor(0.2194, device='cuda:0')
tensor(0.2801, device='cuda:0')
tensor(0.2812, device='cuda:0')
tensor(0.3437, device='cuda:0')
tensor(0.3122, device='cuda:0')
tensor(0.3334, device='cuda:0')
tensor(0.2009, device='cuda:0')
tensor(0.3368, device='cuda:0')
tensor(0.1538, device='cuda:0')
tensor(0.2556, device='cuda:0')
tensor(0.3460, device='cuda:0')


  6%|▌         | 296/4929 [45:23:10<713:18:06, 554.26s/it]

------- 1st valloss=0.2738

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1735, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1770, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3309, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2166, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1700, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2084, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3309, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0846, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2383, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3290, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4545, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2678, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0563, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2715, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.1596, device='cuda:0')
tensor(0.3528, device='cuda:0')
tensor(0.4401, device='cuda:0')
tensor(0.3668, device='cuda:0')
tensor(0.1390, device='cuda:0')
tensor(0.2803, device='cuda:0')
tensor(0.1974, device='cuda:0')
tensor(0.2129, device='cuda:0')
tensor(0.1787, device='cuda:0')
tensor(0.3339, device='cuda:0')
tensor(0.2041, device='cuda:0')


  6%|▌         | 297/4929 [45:32:29<715:01:00, 555.71s/it]

------- 1st valloss=0.2626

tensor(0.4837, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0975, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2633, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1347, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1458, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1567, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2159, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0707, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1186, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2006, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0794, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2080, device='

tensor(0.1484, device='cuda:0')
tensor(0.1755, device='cuda:0')
tensor(0.1797, device='cuda:0')
tensor(0.2203, device='cuda:0')
tensor(0.3214, device='cuda:0')
tensor(0.3170, device='cuda:0')
tensor(0.3397, device='cuda:0')
tensor(0.3036, device='cuda:0')
tensor(0.2157, device='cuda:0')
tensor(0.1880, device='cuda:0')
tensor(0.3578, device='cuda:0')


  6%|▌         | 298/4929 [45:41:47<715:46:24, 556.42s/it]

------- 1st valloss=0.2421

tensor(0.0701, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1598, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0588, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1209, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4219, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1656, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1615, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1101, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1660, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2726, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0763, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1413, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0745, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4310, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1370, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1564, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1052, device='cuda:0', grad_fn=<RsubBackward1>)
ten

tensor(0.2710, device='cuda:0')
tensor(0.1507, device='cuda:0')
tensor(0.3220, device='cuda:0')
tensor(0.2562, device='cuda:0')
tensor(0.1036, device='cuda:0')
tensor(0.1956, device='cuda:0')
tensor(0.3359, device='cuda:0')
tensor(0.1852, device='cuda:0')
tensor(0.2092, device='cuda:0')
tensor(0.3128, device='cuda:0')
tensor(0.1888, device='cuda:0')


  6%|▌         | 299/4929 [45:51:03<715:22:57, 556.24s/it]

------- 1st valloss=0.2451

tensor(0.1034, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1247, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0928, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1724, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1209, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0812, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2037, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1325, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3697, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2305, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1298, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1723, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1011, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0797, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.2917, device='cuda:0')
tensor(0.2998, device='cuda:0')
tensor(0.2730, device='cuda:0')
tensor(0.2023, device='cuda:0')
tensor(0.1916, device='cuda:0')
tensor(0.3497, device='cuda:0')
tensor(0.2195, device='cuda:0')
tensor(0.1711, device='cuda:0')
tensor(0.2223, device='cuda:0')
tensor(0.2900, device='cuda:0')
tensor(0.3350, device='cuda:0')


  6%|▌         | 300/4929 [46:00:13<712:39:01, 554.23s/it]

Checkpoint 370 saved !
------- 1st valloss=0.2541

tensor(0.3195, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0612, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1473, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1081, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0784, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1406, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1881, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1033, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1075, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1873, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0633, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3082, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1299, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3365, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2889, device='cuda:0', grad_fn=<Rsub

tensor(0.4482, device='cuda:0')
tensor(0.2926, device='cuda:0')
tensor(0.2801, device='cuda:0')
tensor(0.4012, device='cuda:0')
tensor(0.1439, device='cuda:0')
tensor(0.2974, device='cuda:0')
tensor(0.2822, device='cuda:0')
tensor(0.1322, device='cuda:0')
tensor(0.1540, device='cuda:0')
tensor(0.3418, device='cuda:0')
tensor(0.1827, device='cuda:0')


  6%|▌         | 301/4929 [46:09:28<712:44:38, 554.42s/it]

------- 1st valloss=0.2610

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1046, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1331, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1861, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0900, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0584, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1489, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1666, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1060, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1065, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2594, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2891, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1518, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1018, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1545, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2377, 

tensor(0.3067, device='cuda:0')
tensor(0.1819, device='cuda:0')
tensor(0.2033, device='cuda:0')
tensor(0.1710, device='cuda:0')
tensor(0.3241, device='cuda:0')
tensor(0.2818, device='cuda:0')
tensor(0.1267, device='cuda:0')
tensor(0.1848, device='cuda:0')
tensor(0.2909, device='cuda:0')
tensor(0.2534, device='cuda:0')
tensor(0.2574, device='cuda:0')


  6%|▌         | 302/4929 [46:18:39<711:31:54, 553.60s/it]

------- 1st valloss=0.2522

tensor(0.0706, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1152, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3508, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1109, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1353, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2138, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1162, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2104, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1000, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0798, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1166, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2010, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1695, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2502, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0441, 

tensor(0.3553, device='cuda:0')
tensor(0.3523, device='cuda:0')
tensor(0.3712, device='cuda:0')
tensor(0.2797, device='cuda:0')
tensor(0.1611, device='cuda:0')
tensor(0.3020, device='cuda:0')
tensor(0.1587, device='cuda:0')
tensor(0.1066, device='cuda:0')
tensor(0.1834, device='cuda:0')
tensor(0.3539, device='cuda:0')
tensor(0.2725, device='cuda:0')


  6%|▌         | 303/4929 [46:27:52<710:56:54, 553.27s/it]

------- 1st valloss=0.2502

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4330, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0651, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0969, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1513, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0774, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1281, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1353, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0813, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1121, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0518, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3733, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2113, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1973, devi

tensor(0.1731, device='cuda:0')
tensor(0.4286, device='cuda:0')
tensor(0.2303, device='cuda:0')
tensor(0.2830, device='cuda:0')
tensor(0.3541, device='cuda:0')
tensor(0.1683, device='cuda:0')
tensor(0.3572, device='cuda:0')
tensor(0.0966, device='cuda:0')
tensor(0.1626, device='cuda:0')
tensor(0.1980, device='cuda:0')
tensor(0.2550, device='cuda:0')


  6%|▌         | 304/4929 [46:37:05<710:37:24, 553.13s/it]

------- 1st valloss=0.2459

tensor(0.1267, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1835, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0885, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1037, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2329, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0555, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1514, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3973, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2272, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1213, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2287, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4073, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1692, device='

tensor(0.2881, device='cuda:0')
tensor(0.3196, device='cuda:0')
tensor(0.3233, device='cuda:0')
tensor(0.2055, device='cuda:0')
tensor(0.1546, device='cuda:0')
tensor(0.2015, device='cuda:0')
tensor(0.2511, device='cuda:0')
tensor(0.1849, device='cuda:0')
tensor(0.3409, device='cuda:0')
tensor(0.3656, device='cuda:0')
tensor(0.2271, device='cuda:0')


  6%|▌         | 305/4929 [46:46:20<711:29:01, 553.92s/it]

------- 1st valloss=0.2720

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2728, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2327, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1987, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2076, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0718, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1940, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2093, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1370, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0514, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1151, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1890, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1044, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1112, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='

tensor(0.2236, device='cuda:0')
tensor(0.1828, device='cuda:0')
tensor(0.2275, device='cuda:0')
tensor(0.1798, device='cuda:0')
tensor(0.3289, device='cuda:0')
tensor(0.2001, device='cuda:0')
tensor(0.2117, device='cuda:0')
tensor(0.1764, device='cuda:0')
tensor(0.3315, device='cuda:0')
tensor(0.3056, device='cuda:0')
tensor(0.2875, device='cuda:0')


  6%|▌         | 306/4929 [46:55:39<712:59:03, 555.21s/it]

------- 1st valloss=0.2416

tensor(0.2055, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1118, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3094, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2505, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0950, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0605, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2379, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0501, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0764, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0888, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0901, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1858, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1932, device='

tensor(0.3456, device='cuda:0')
tensor(0.4758, device='cuda:0')
tensor(0.3191, device='cuda:0')
tensor(0.2816, device='cuda:0')
tensor(0.2903, device='cuda:0')
tensor(0.1100, device='cuda:0')
tensor(0.1490, device='cuda:0')
tensor(0.1979, device='cuda:0')
tensor(0.4575, device='cuda:0')
tensor(0.1530, device='cuda:0')
tensor(0.2044, device='cuda:0')


  6%|▌         | 307/4929 [47:04:48<710:28:06, 553.37s/it]

------- 1st valloss=0.2784

tensor(0.2332, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1880, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3773, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1106, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1401, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1611, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3377, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1106, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2055, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2584, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1637, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0849, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0795, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1441, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0511, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.1599, device='cuda:0')
tensor(0.3317, device='cuda:0')
tensor(0.3512, device='cuda:0')
tensor(0.2418, device='cuda:0')
tensor(0.3732, device='cuda:0')
tensor(0.2445, device='cuda:0')
tensor(0.3571, device='cuda:0')
tensor(0.2537, device='cuda:0')
tensor(0.4763, device='cuda:0')
tensor(0.2734, device='cuda:0')
tensor(0.2805, device='cuda:0')


  6%|▌         | 308/4929 [47:14:03<711:02:47, 553.94s/it]

------- 1st valloss=0.2765

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2857, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1078, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0645, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1560, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1462, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0799, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3449, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2476, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4811, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2498, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4052, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2199, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2551, device='

tensor(0.1365, device='cuda:0')
tensor(0.3976, device='cuda:0')
tensor(0.2074, device='cuda:0')
tensor(0.2606, device='cuda:0')
tensor(0.3134, device='cuda:0')
tensor(0.3033, device='cuda:0')
tensor(0.3625, device='cuda:0')
tensor(0.2676, device='cuda:0')
tensor(0.1406, device='cuda:0')
tensor(0.3736, device='cuda:0')
tensor(0.3158, device='cuda:0')


  6%|▋         | 309/4929 [47:23:15<710:07:33, 553.34s/it]

------- 1st valloss=0.2776

tensor(0.1425, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1927, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0661, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1745, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0946, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2109, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4054, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3699, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2693, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1014, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0829, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1166, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0913, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1165, devi

tensor(0.1811, device='cuda:0')
tensor(0.2570, device='cuda:0')
tensor(0.4104, device='cuda:0')
tensor(0.2453, device='cuda:0')
tensor(0.1842, device='cuda:0')
tensor(0.1972, device='cuda:0')
tensor(0.2224, device='cuda:0')
tensor(0.3215, device='cuda:0')
tensor(0.4454, device='cuda:0')
tensor(0.2774, device='cuda:0')
tensor(0.2060, device='cuda:0')


  6%|▋         | 310/4929 [47:32:31<710:59:32, 554.14s/it]

Checkpoint 380 saved !
------- 1st valloss=0.2875

tensor(0.0553, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1026, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1225, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1514, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2957, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1598, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1192, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0625, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1265, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0744, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1371, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2358, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1763, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2118, device='cuda:0', grad_fn=<RsubBack

tensor(0.2845, device='cuda:0')
tensor(0.1862, device='cuda:0')
tensor(0.3246, device='cuda:0')
tensor(0.2652, device='cuda:0')
tensor(0.3944, device='cuda:0')
tensor(0.1801, device='cuda:0')
tensor(0.2111, device='cuda:0')
tensor(0.3771, device='cuda:0')
tensor(0.3666, device='cuda:0')
tensor(0.3721, device='cuda:0')
tensor(0.3519, device='cuda:0')


  6%|▋         | 311/4929 [47:41:47<711:26:48, 554.61s/it]

------- 1st valloss=0.2824

tensor(0.3236, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2858, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1390, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1232, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3042, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1478, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0941, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3344, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0976, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1623, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1604, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4286, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1515, device='

tensor(0.2228, device='cuda:0')
tensor(0.3676, device='cuda:0')
tensor(0.1925, device='cuda:0')
tensor(0.3493, device='cuda:0')
tensor(0.2493, device='cuda:0')
tensor(0.1137, device='cuda:0')
tensor(0.3187, device='cuda:0')
tensor(0.2715, device='cuda:0')
tensor(0.1855, device='cuda:0')
tensor(0.0974, device='cuda:0')
tensor(0.2460, device='cuda:0')


  6%|▋         | 312/4929 [47:51:01<710:59:55, 554.39s/it]

------- 1st valloss=0.2462

tensor(0.0850, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1336, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1706, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1910, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0974, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1382, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1979, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1960, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3668, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2360, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1612, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1962, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2103, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1266, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.10

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.2011, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1053, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0909, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0957, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1348, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1718, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0440, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1234, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1098, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2036, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0584, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1237, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0916, device='cuda:0', grad_fn=<RsubBackward1>)
Epoch 384 finished ! Training Loss: 0.3523

tensor(0.2844, device='cuda:0')
tensor(0.3448, device='cuda:0')
tensor(0.2975, device='cuda:0')
tensor(0.2550, device='cuda:0')
tensor(0.2633, device='cuda:0')
te

  6%|▋         | 314/4929 [48:09:23<708:39:44, 552.80s/it]

------- 1st valloss=0.2468

tensor(0.1102, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1058, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0792, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1618, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1900, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1560, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2228, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2597, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2661, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2275, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0962, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1651, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1025, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1801, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1366, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1739, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.3009, device='cuda:0')
tensor(0.2478, device='cuda:0')
tensor(0.2788, device='cuda:0')
tensor(0.0963, device='cuda:0')
tensor(0.2619, device='cuda:0')
tensor(0.1920, device='cuda:0')
tensor(0.2463, device='cuda:0')
tensor(0.3328, device='cuda:0')
tensor(0.3617, device='cuda:0')
tensor(0.1679, device='cuda:0')


  6%|▋         | 315/4929 [48:18:41<710:30:24, 554.36s/it]

------- 1st valloss=0.2676

tensor(0.1379, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0901, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0659, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2318, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2436, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1429, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2747, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1214, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1259, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1986, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0851, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0656, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0943, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2132, devi

tensor(0.1669, device='cuda:0')
tensor(0.2988, device='cuda:0')
tensor(0.1412, device='cuda:0')
tensor(0.4226, device='cuda:0')
tensor(0.2525, device='cuda:0')
tensor(0.3119, device='cuda:0')
tensor(0.1967, device='cuda:0')
tensor(0.1962, device='cuda:0')
tensor(0.4162, device='cuda:0')
tensor(0.3212, device='cuda:0')
tensor(0.3274, device='cuda:0')


  6%|▋         | 316/4929 [48:27:53<709:17:26, 553.53s/it]

------- 1st valloss=0.2741

tensor(0.2785, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1736, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1206, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2162, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5034, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1988, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1702, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0784, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1094, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1232, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2217, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1424, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0685, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1645, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5039, 

tensor(0.3321, device='cuda:0')
tensor(0.3228, device='cuda:0')
tensor(0.2917, device='cuda:0')
tensor(0.1638, device='cuda:0')
tensor(0.1602, device='cuda:0')
tensor(0.3839, device='cuda:0')
tensor(0.2896, device='cuda:0')
tensor(0.2644, device='cuda:0')
tensor(0.1874, device='cuda:0')
tensor(0.2871, device='cuda:0')
tensor(0.1499, device='cuda:0')


  6%|▋         | 317/4929 [48:37:05<708:24:08, 552.96s/it]

------- 1st valloss=0.2497

tensor(0.1065, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1698, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2407, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0835, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1017, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0607, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1373, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1655, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1828, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2621, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0971, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1245, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0693, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2099, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1623, 

tensor(0.2002, device='cuda:0')
tensor(0.1825, device='cuda:0')
tensor(0.2919, device='cuda:0')
tensor(0.3307, device='cuda:0')
tensor(0.3132, device='cuda:0')
tensor(0.1222, device='cuda:0')
tensor(0.2542, device='cuda:0')
tensor(0.1936, device='cuda:0')
tensor(0.1850, device='cuda:0')
tensor(0.1567, device='cuda:0')
tensor(0.2882, device='cuda:0')


  6%|▋         | 318/4929 [48:46:17<707:56:12, 552.72s/it]

Checkpoint 388 saved !
------- 1st valloss=0.2311

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2128, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0952, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1253, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2102, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0797, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1231, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1973, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2273, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0963, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1803, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0866, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3093, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0674, device='cuda:0', grad_fn=<RsubBackward

tensor(0.2497, device='cuda:0')
tensor(0.2174, device='cuda:0')
tensor(0.2561, device='cuda:0')
tensor(0.2152, device='cuda:0')
tensor(0.2148, device='cuda:0')
tensor(0.2721, device='cuda:0')
tensor(0.4968, device='cuda:0')
tensor(0.1712, device='cuda:0')
tensor(0.2854, device='cuda:0')
tensor(0.1926, device='cuda:0')
tensor(0.2730, device='cuda:0')


  6%|▋         | 319/4929 [48:55:34<709:27:50, 554.03s/it]

------- 1st valloss=0.2490

tensor(0.0981, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1460, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0917, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1844, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3131, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1489, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0619, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0941, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1852, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0883, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1602, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2848, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0773, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1778, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3079, 

tensor(0.2970, device='cuda:0')
tensor(0.1812, device='cuda:0')
tensor(0.2095, device='cuda:0')
tensor(0.1916, device='cuda:0')
tensor(0.3073, device='cuda:0')
tensor(0.2213, device='cuda:0')
tensor(0.4157, device='cuda:0')
tensor(0.1248, device='cuda:0')
tensor(0.1430, device='cuda:0')
tensor(0.1237, device='cuda:0')
tensor(0.2487, device='cuda:0')


  6%|▋         | 320/4929 [49:04:44<707:42:37, 552.78s/it]

Checkpoint 390 saved !
------- 1st valloss=0.2378

tensor(0.0862, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2353, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0846, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0479, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1556, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2298, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0760, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1685, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2287, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2091, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0838, device='cuda:0', grad_fn=<RsubBackward1>)
tens

tensor(0.2659, device='cuda:0')
tensor(0.1309, device='cuda:0')
tensor(0.1774, device='cuda:0')
tensor(0.2330, device='cuda:0')
tensor(0.2244, device='cuda:0')
tensor(0.1741, device='cuda:0')
tensor(0.1499, device='cuda:0')
tensor(0.2333, device='cuda:0')
tensor(0.1799, device='cuda:0')
tensor(0.2083, device='cuda:0')
tensor(0.2222, device='cuda:0')


  7%|▋         | 321/4929 [49:13:52<705:56:18, 551.51s/it]

------- 1st valloss=0.2334

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1563, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1390, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0873, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0535, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1131, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3437, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3122, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0547, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1421, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1169, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2457, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1277, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1028, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3318, devi

tensor(0.1206, device='cuda:0')
tensor(0.2061, device='cuda:0')
tensor(0.2576, device='cuda:0')
tensor(0.2004, device='cuda:0')
tensor(0.4392, device='cuda:0')
tensor(0.3019, device='cuda:0')
tensor(0.2338, device='cuda:0')
tensor(0.2547, device='cuda:0')
tensor(0.4134, device='cuda:0')
tensor(0.1247, device='cuda:0')
tensor(0.3599, device='cuda:0')


  7%|▋         | 322/4929 [49:23:11<708:24:36, 553.57s/it]

------- 1st valloss=0.2753

tensor(0.1754, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1036, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1630, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0991, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3904, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2726, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1672, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1289, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1910, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1165, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1676, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0675, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1031, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1010, devi

tensor(0.2027, device='cuda:0')
tensor(0.2714, device='cuda:0')
tensor(0.1850, device='cuda:0')
tensor(0.2855, device='cuda:0')
tensor(0.2327, device='cuda:0')
tensor(0.2314, device='cuda:0')
tensor(0.3770, device='cuda:0')
tensor(0.1901, device='cuda:0')
tensor(0.3611, device='cuda:0')
tensor(0.1912, device='cuda:0')
tensor(0.1875, device='cuda:0')


  7%|▋         | 323/4929 [49:32:24<708:03:10, 553.41s/it]

------- 1st valloss=0.2441

tensor(0.1996, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0546, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2231, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2109, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0790, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0660, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1922, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0908, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0963, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1461, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0835, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2116, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1596, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2684, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2050, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.2483, device='cuda:0')
tensor(0.1403, device='cuda:0')
tensor(0.3613, device='cuda:0')
tensor(0.2557, device='cuda:0')
tensor(0.2560, device='cuda:0')
tensor(0.4116, device='cuda:0')
tensor(0.4041, device='cuda:0')
tensor(0.2587, device='cuda:0')
tensor(0.3232, device='cuda:0')
tensor(0.2718, device='cuda:0')
tensor(0.3060, device='cuda:0')


  7%|▋         | 324/4929 [49:41:33<706:18:23, 552.16s/it]

------- 1st valloss=0.2542

tensor(0.2279, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1443, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1001, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1018, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2172, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0848, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3728, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3477, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1081, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1023, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1003, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1253, devi

tensor(0.2683, device='cuda:0')
tensor(0.1353, device='cuda:0')
tensor(0.1548, device='cuda:0')
tensor(0.3530, device='cuda:0')
tensor(0.1694, device='cuda:0')
tensor(0.2718, device='cuda:0')
tensor(0.2198, device='cuda:0')
tensor(0.1741, device='cuda:0')
tensor(0.1586, device='cuda:0')
tensor(0.2435, device='cuda:0')
tensor(0.2485, device='cuda:0')


  7%|▋         | 325/4929 [49:50:47<707:02:11, 552.85s/it]

------- 1st valloss=0.2562

tensor(0.2270, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0717, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1479, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0794, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0889, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1832, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1083, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1030, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1982, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0713, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1528, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2364, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2587, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3203, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

tensor(0.3114, device='cuda:0')
tensor(0.2545, device='cuda:0')
tensor(0.2387, device='cuda:0')
tensor(0.2829, device='cuda:0')
tensor(0.1424, device='cuda:0')
tensor(0.2320, device='cuda:0')
tensor(0.1946, device='cuda:0')
tensor(0.1502, device='cuda:0')
tensor(0.2297, device='cuda:0')
tensor(0.2715, device='cuda:0')
tensor(0.3599, device='cuda:0')


  7%|▋         | 326/4929 [50:00:06<709:05:52, 554.58s/it]

------- 1st valloss=0.2777

tensor(0.1667, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1041, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1687, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2683, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1387, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1547, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1786, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0699, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0665, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0970, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3059, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5039, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1925, devi

tensor(0.2882, device='cuda:0')
tensor(0.1285, device='cuda:0')
tensor(0.2302, device='cuda:0')
tensor(0.3138, device='cuda:0')
tensor(0.1086, device='cuda:0')
tensor(0.1971, device='cuda:0')
tensor(0.2725, device='cuda:0')
tensor(0.2529, device='cuda:0')
tensor(0.2660, device='cuda:0')
tensor(0.2760, device='cuda:0')
tensor(0.1435, device='cuda:0')


  7%|▋         | 327/4929 [50:09:17<707:42:32, 553.62s/it]

Checkpoint 397 saved !
------- 1st valloss=0.2258

tensor(0.1524, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1513, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2018, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0924, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0962, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3732, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2166, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1139, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2658, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1436, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3052, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2314, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1209, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3491, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1402, device='cuda:0', grad_fn=<Rsub

tensor(0.3463, device='cuda:0')
tensor(0.3251, device='cuda:0')
tensor(0.3740, device='cuda:0')
tensor(0.2312, device='cuda:0')
tensor(0.2809, device='cuda:0')
tensor(0.3111, device='cuda:0')
tensor(0.3751, device='cuda:0')
tensor(0.3746, device='cuda:0')
tensor(0.2557, device='cuda:0')
tensor(0.1358, device='cuda:0')
tensor(0.3490, device='cuda:0')


  7%|▋         | 328/4929 [50:18:34<708:30:53, 554.37s/it]

------- 1st valloss=0.2795

tensor(0.1681, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1453, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3333, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1184, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0550, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0514, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1283, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1315, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1178, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0970, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0765, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1994, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3136, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0665, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0539, 

tensor(0.2598, device='cuda:0')
tensor(0.2704, device='cuda:0')
tensor(0.4422, device='cuda:0')
tensor(0.4152, device='cuda:0')
tensor(0.1523, device='cuda:0')
tensor(0.2604, device='cuda:0')
tensor(0.2559, device='cuda:0')
tensor(0.2869, device='cuda:0')
tensor(0.2108, device='cuda:0')
tensor(0.2718, device='cuda:0')
tensor(0.2354, device='cuda:0')


  7%|▋         | 329/4929 [50:27:43<706:31:48, 552.94s/it]

------- 1st valloss=0.2623

tensor(0.2011, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1899, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2470, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1025, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1488, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1174, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0588, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1724, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1071, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1660, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2153, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2444, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1859, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3739, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

tensor(0.2126, device='cuda:0')
tensor(0.3131, device='cuda:0')
tensor(0.3694, device='cuda:0')
tensor(0.2284, device='cuda:0')
tensor(0.1510, device='cuda:0')
tensor(0.2392, device='cuda:0')
tensor(0.2420, device='cuda:0')
tensor(0.1723, device='cuda:0')
tensor(0.2896, device='cuda:0')
tensor(0.2167, device='cuda:0')
tensor(0.2323, device='cuda:0')


  7%|▋         | 330/4929 [50:36:53<705:08:17, 551.97s/it]

Checkpoint 400 saved !
------- 1st valloss=0.2350

tensor(0.0748, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1816, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0717, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0914, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0695, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1395, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1031, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1460, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0807, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1356, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1113, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2463, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3970, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4087, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1526, device='cuda:0', grad_fn=<Rsub

tensor(0.4603, device='cuda:0')
tensor(0.3398, device='cuda:0')
tensor(0.1759, device='cuda:0')
tensor(0.3259, device='cuda:0')
tensor(0.3277, device='cuda:0')
tensor(0.2406, device='cuda:0')
tensor(0.2780, device='cuda:0')
tensor(0.3062, device='cuda:0')
tensor(0.2530, device='cuda:0')
tensor(0.2830, device='cuda:0')
tensor(0.1877, device='cuda:0')


  7%|▋         | 331/4929 [50:45:59<702:47:49, 550.25s/it]

------- 1st valloss=0.2731

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4190, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2454, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1071, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0795, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1027, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2562, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0745, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2459, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1759, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2045, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1870, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1378, device='cuda

tensor(0.1826, device='cuda:0')
tensor(0.1159, device='cuda:0')
tensor(0.2885, device='cuda:0')
tensor(0.2984, device='cuda:0')
tensor(0.3765, device='cuda:0')
tensor(0.1637, device='cuda:0')
tensor(0.1714, device='cuda:0')
tensor(0.2660, device='cuda:0')
tensor(0.1110, device='cuda:0')
tensor(0.2305, device='cuda:0')
tensor(0.1849, device='cuda:0')


  7%|▋         | 332/4929 [50:55:16<705:15:48, 552.31s/it]

------- 1st valloss=0.2344

tensor(0.1705, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3255, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0388, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1549, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2148, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0747, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1492, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0805, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0760, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2786, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2392, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1776, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1084, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1601, 

tensor(0.1740, device='cuda:0')
tensor(0.2697, device='cuda:0')
tensor(0.2799, device='cuda:0')
tensor(0.2585, device='cuda:0')
tensor(0.1368, device='cuda:0')
tensor(0.2975, device='cuda:0')
tensor(0.3698, device='cuda:0')
tensor(0.1935, device='cuda:0')
tensor(0.2370, device='cuda:0')
tensor(0.1548, device='cuda:0')
tensor(0.1644, device='cuda:0')


  7%|▋         | 333/4929 [51:04:30<705:39:11, 552.73s/it]

------- 1st valloss=0.2461

tensor(0.2127, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0873, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2454, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3490, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1416, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1442, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1017, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0759, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2113, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1535, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2074, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0872, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1114, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1467, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1839, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1464, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0904, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2240, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1027, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3214, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1949, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1168, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2292, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3057, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1301, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1462, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1754, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1317, device='cuda:0', grad_fn=<RsubBackwa

  7%|▋         | 366/4929 [56:10:19<701:13:27, 553.23s/it]

------- 1st valloss=0.2545

tensor(0.1312, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0890, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2442, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2078, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1367, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2720, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3377, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1002, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1422, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1308, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1838, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1757, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1982, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0834, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.2704, device='cuda:0')
tensor(0.2551, device='cuda:0')
tensor(0.2318, device='cuda:0')
tensor(0.1772, device='cuda:0')
tensor(0.1943, device='cuda:0')
tensor(0.1888, device='cuda:0')
tensor(0.2373, device='cuda:0')
tensor(0.2675, device='cuda:0')
tensor(0.1221, device='cuda:0')
tensor(0.3177, device='cuda:0')
tensor(0.2414, device='cuda:0')


  7%|▋         | 367/4929 [56:19:40<704:02:44, 555.58s/it]

------- 1st valloss=0.2491

tensor(0.2022, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2199, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1251, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1375, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0732, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1194, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2929, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0791, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1832, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2354, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1117, device='cuda

tensor(0.2459, device='cuda:0')
tensor(0.2486, device='cuda:0')
tensor(0.2806, device='cuda:0')
tensor(0.2130, device='cuda:0')
tensor(0.2222, device='cuda:0')
tensor(0.1845, device='cuda:0')
tensor(0.1169, device='cuda:0')
tensor(0.2198, device='cuda:0')
tensor(0.2165, device='cuda:0')
tensor(0.2653, device='cuda:0')
tensor(0.3285, device='cuda:0')


  7%|▋         | 368/4929 [56:28:53<702:58:11, 554.85s/it]

------- 1st valloss=0.2356

tensor(0.1470, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1765, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1235, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0839, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0423, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0958, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0768, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2865, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0527, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2572, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2943, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1368, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1514, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0989, devi

tensor(0.3970, device='cuda:0')
tensor(0.3620, device='cuda:0')
tensor(0.2683, device='cuda:0')
tensor(0.2088, device='cuda:0')
tensor(0.2906, device='cuda:0')
tensor(0.3820, device='cuda:0')
tensor(0.3455, device='cuda:0')
tensor(0.1166, device='cuda:0')
tensor(0.1872, device='cuda:0')
tensor(0.2637, device='cuda:0')
tensor(0.3055, device='cuda:0')


  7%|▋         | 369/4929 [56:38:10<703:45:25, 555.60s/it]

------- 1st valloss=0.2647

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0956, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1356, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1703, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2266, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2385, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1069, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3410, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1862, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0856, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1362, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1742, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1213, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1045, device='

tensor(0.2505, device='cuda:0')
tensor(0.1016, device='cuda:0')
tensor(0.2488, device='cuda:0')
tensor(0.1094, device='cuda:0')
tensor(0.1247, device='cuda:0')
tensor(0.2435, device='cuda:0')
tensor(0.2864, device='cuda:0')
tensor(0.2753, device='cuda:0')
tensor(0.3314, device='cuda:0')
tensor(0.3134, device='cuda:0')
tensor(0.3025, device='cuda:0')


  8%|▊         | 370/4929 [56:47:24<702:57:43, 555.09s/it]

Checkpoint 440 saved !
------- 1st valloss=0.2551

tensor(0.1450, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3530, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1637, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0743, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0885, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2471, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2225, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0702, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2708, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3068, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0830, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1387, device='cuda:0', grad_fn=<RsubBackward1>)


tensor(0.2619, device='cuda:0')
tensor(0.2616, device='cuda:0')
tensor(0.1786, device='cuda:0')
tensor(0.1567, device='cuda:0')
tensor(0.1153, device='cuda:0')
tensor(0.2558, device='cuda:0')
tensor(0.3086, device='cuda:0')
tensor(0.2297, device='cuda:0')
tensor(0.2915, device='cuda:0')
tensor(0.2065, device='cuda:0')
tensor(0.2402, device='cuda:0')


  8%|▊         | 371/4929 [56:56:37<701:52:25, 554.35s/it]

------- 1st valloss=0.2291

tensor(0.0969, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1525, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1666, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1831, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1414, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1254, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1881, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5206, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2344, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2265, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1022, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0838, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1381, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1769, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1717, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1784, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.3414, device='cuda:0')
tensor(0.1547, device='cuda:0')
tensor(0.3235, device='cuda:0')
tensor(0.2548, device='cuda:0')
tensor(0.2380, device='cuda:0')
tensor(0.3058, device='cuda:0')
tensor(0.1906, device='cuda:0')
tensor(0.1192, device='cuda:0')
tensor(0.1567, device='cuda:0')
tensor(0.3522, device='cuda:0')
tensor(0.1560, device='cuda:0')


  8%|▊         | 372/4929 [57:05:54<702:50:34, 555.24s/it]

------- 1st valloss=0.2391

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1103, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1644, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1995, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3195, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1485, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2593, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2993, device='cuda:0', grad_fn=<RsubBackward1>)


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.2169, device='cuda:0')
tensor(0.2309, device='cuda:0')
tensor(0.3522, device='cuda:0')
tensor(0.1770, device='cuda:0')
tensor(0.3086, device='cuda:0')
tensor(0.1423, device='cuda:0')


  8%|▊         | 375/4929 [57:33:37<702:41:09, 555.48s/it]

------- 1st valloss=0.2390

tensor(0.1255, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0547, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1736, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1581, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3586, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1251, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1318, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1528, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0739, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0654, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1788, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0913, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1369, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1016, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.2136, device='cuda:0')
tensor(0.2769, device='cuda:0')
tensor(0.2729, device='cuda:0')
tensor(0.1542, device='cuda:0')
tensor(0.1421, device='cuda:0')
tensor(0.3912, device='cuda:0')
tensor(0.2964, device='cuda:0')
tensor(0.3017, device='cuda:0')
tensor(0.1540, device='cuda:0')
tensor(0.3233, device='cuda:0')
tensor(0.2350, device='cuda:0')


  8%|▊         | 376/4929 [57:42:59<704:40:27, 557.18s/it]

------- 1st valloss=0.2439

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1658, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2003, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2586, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1229, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0768, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3505, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1169, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0966, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0780, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0992, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0995, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2035, device='cuda

tensor(0.2835, device='cuda:0')
tensor(0.2843, device='cuda:0')
tensor(0.3174, device='cuda:0')
tensor(0.2210, device='cuda:0')
tensor(0.1217, device='cuda:0')
tensor(0.1425, device='cuda:0')
tensor(0.3055, device='cuda:0')
tensor(0.3875, device='cuda:0')
tensor(0.3167, device='cuda:0')
tensor(0.2118, device='cuda:0')
tensor(0.2070, device='cuda:0')


  8%|▊         | 377/4929 [57:52:12<703:07:50, 556.08s/it]

------- 1st valloss=0.2448

tensor(0.2011, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3421, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3803, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1114, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2475, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1653, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1442, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0714, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0787, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0940, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1020, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1608, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1033, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1718, 

tensor(0.2960, device='cuda:0')
tensor(0.2827, device='cuda:0')
tensor(0.2727, device='cuda:0')
tensor(0.3978, device='cuda:0')
tensor(0.2443, device='cuda:0')
tensor(0.1964, device='cuda:0')
tensor(0.1637, device='cuda:0')
tensor(0.2305, device='cuda:0')
tensor(0.1874, device='cuda:0')
tensor(0.2838, device='cuda:0')
tensor(0.2310, device='cuda:0')


  8%|▊         | 378/4929 [58:01:40<707:36:43, 559.75s/it]

------- 1st valloss=0.2659

tensor(0.2016, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1019, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0911, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1461, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2036, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0820, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1033, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1512, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1252, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1268, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1400, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1911, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1256, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3728, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0738, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.06

tensor(0.2448, device='cuda:0')
tensor(0.2219, device='cuda:0')
tensor(0.4076, device='cuda:0')
tensor(0.3579, device='cuda:0')
tensor(0.3561, device='cuda:0')
tensor(0.1664, device='cuda:0')
tensor(0.2362, device='cuda:0')
tensor(0.3564, device='cuda:0')
tensor(0.3764, device='cuda:0')
tensor(0.3795, device='cuda:0')
tensor(0.1301, device='cuda:0')


  8%|▊         | 379/4929 [58:10:52<704:26:21, 557.36s/it]

------- 1st valloss=0.2687

tensor(0.0861, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1010, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2290, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1371, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0743, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0665, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2385, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2957, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1006, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0773, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1975, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1391, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1362, device='

tensor(0.3213, device='cuda:0')
tensor(0.1995, device='cuda:0')
tensor(0.4271, device='cuda:0')
tensor(0.1501, device='cuda:0')
tensor(0.1029, device='cuda:0')
tensor(0.3298, device='cuda:0')
tensor(0.2677, device='cuda:0')
tensor(0.2603, device='cuda:0')
tensor(0.1665, device='cuda:0')
tensor(0.2576, device='cuda:0')
tensor(0.4209, device='cuda:0')


  8%|▊         | 380/4929 [58:19:59<700:10:55, 554.11s/it]

Checkpoint 450 saved !
------- 1st valloss=0.2563

tensor(0.1508, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3426, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0801, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0785, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1420, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0819, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1670, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1244, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1115, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1549, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0

tensor(0.1827, device='cuda:0')
tensor(0.3558, device='cuda:0')
tensor(0.3343, device='cuda:0')
tensor(0.1440, device='cuda:0')
tensor(0.3106, device='cuda:0')
tensor(0.2770, device='cuda:0')
tensor(0.2878, device='cuda:0')
tensor(0.1696, device='cuda:0')
tensor(0.3375, device='cuda:0')
tensor(0.2045, device='cuda:0')
tensor(0.3169, device='cuda:0')


  8%|▊         | 381/4929 [58:29:09<698:30:13, 552.91s/it]

------- 1st valloss=0.2605

tensor(0.2639, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2294, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1365, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1682, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1503, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1628, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1564, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1966, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1888, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0674, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0765, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1420, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1377, devi

tensor(0.2364, device='cuda:0')
tensor(0.3325, device='cuda:0')
tensor(0.3615, device='cuda:0')
tensor(0.3889, device='cuda:0')
tensor(0.2026, device='cuda:0')
tensor(0.3703, device='cuda:0')
tensor(0.2956, device='cuda:0')
tensor(0.1316, device='cuda:0')
tensor(0.2298, device='cuda:0')
tensor(0.1900, device='cuda:0')
tensor(0.3408, device='cuda:0')


  8%|▊         | 382/4929 [58:38:24<699:07:53, 553.52s/it]

------- 1st valloss=0.2762

tensor(0.2175, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1296, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0860, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2221, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4209, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2009, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0943, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1906, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1324, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3554, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1780, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0733, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1810, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0785, devi

tensor(0.1664, device='cuda:0')
tensor(0.3342, device='cuda:0')
tensor(0.1813, device='cuda:0')
tensor(0.2204, device='cuda:0')
tensor(0.1463, device='cuda:0')
tensor(0.2894, device='cuda:0')
tensor(0.1839, device='cuda:0')
tensor(0.2402, device='cuda:0')
tensor(0.1888, device='cuda:0')
tensor(0.2078, device='cuda:0')
tensor(0.3788, device='cuda:0')


  8%|▊         | 383/4929 [58:47:44<701:30:25, 555.53s/it]

------- 1st valloss=0.2256

tensor(0.3486, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3582, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0801, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0913, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0862, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2354, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0796, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1973, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0793, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2465, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1259, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1759, device='cuda

tensor(0.2217, device='cuda:0')
tensor(0.3120, device='cuda:0')
tensor(0.2636, device='cuda:0')
tensor(0.1820, device='cuda:0')
tensor(0.2218, device='cuda:0')
tensor(0.1634, device='cuda:0')
tensor(0.2448, device='cuda:0')
tensor(0.1524, device='cuda:0')
tensor(0.2860, device='cuda:0')
tensor(0.1590, device='cuda:0')
tensor(0.0799, device='cuda:0')


  8%|▊         | 384/4929 [58:56:57<700:34:15, 554.91s/it]

------- 1st valloss=0.2348

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2374, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1255, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2368, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1754, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0521, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3445, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1942, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1930, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1328, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3521, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1007, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.5124, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3767, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='

tensor(0.3117, device='cuda:0')
tensor(0.2316, device='cuda:0')
tensor(0.3194, device='cuda:0')
tensor(0.3046, device='cuda:0')
tensor(0.2279, device='cuda:0')
tensor(0.1086, device='cuda:0')
tensor(0.1802, device='cuda:0')
tensor(0.2528, device='cuda:0')
tensor(0.1330, device='cuda:0')
tensor(0.3502, device='cuda:0')
tensor(0.1791, device='cuda:0')


  8%|▊         | 385/4929 [59:06:10<699:34:34, 554.24s/it]

------- 1st valloss=0.2443

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1403, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2108, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2227, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1688, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2044, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1795, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2029, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2991, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0836, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1101, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2127, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1997, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0490, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1375, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0752, 

tensor(0.1599, device='cuda:0')
tensor(0.3599, device='cuda:0')
tensor(0.2579, device='cuda:0')
tensor(0.3277, device='cuda:0')
tensor(0.2034, device='cuda:0')
tensor(0.1529, device='cuda:0')
tensor(0.3065, device='cuda:0')
tensor(0.1981, device='cuda:0')
tensor(0.1249, device='cuda:0')
tensor(0.2433, device='cuda:0')
tensor(0.2298, device='cuda:0')


  8%|▊         | 386/4929 [59:15:39<705:08:11, 558.77s/it]

------- 1st valloss=0.2430

tensor(0.0559, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1737, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0921, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4152, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1014, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1190, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2699, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1721, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2528, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1599, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2589, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1742, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1564, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0469, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3118, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.0847, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0877, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1086, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2423, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0418, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1679, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0802, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1104, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0807, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1522, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3573, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0963, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0512, device='cuda:0', grad_fn=<RsubBa

tensor(0.2777, device='cuda:0')
tensor(0.3238, device='cuda:0')
tensor(0.1793, device='cuda:0')
tensor(0.1401, device='cuda:0')
tensor(0.3808, device='cuda:0')
tensor(0.3627, device='cuda:0')
tensor(0.3518, device='cuda:0')
tensor(0.2476, device='cuda:0')


  9%|▊         | 423/4929 [64:58:20<694:51:12, 555.14s/it]

------- 1st valloss=0.2507

tensor(0.1496, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1697, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0791, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3024, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1171, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1868, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4072, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3425, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1289, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2306, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1450, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0768, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0867, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., devi

tensor(0.1987, device='cuda:0')
tensor(0.2490, device='cuda:0')
tensor(0.1777, device='cuda:0')
tensor(0.1465, device='cuda:0')
tensor(0.3256, device='cuda:0')
tensor(0.2640, device='cuda:0')
tensor(0.5205, device='cuda:0')
tensor(0.2206, device='cuda:0')
tensor(0.3510, device='cuda:0')
tensor(0.1695, device='cuda:0')
tensor(0.3193, device='cuda:0')


  9%|▊         | 424/4929 [65:07:46<698:37:29, 558.28s/it]

------- 1st valloss=0.2587

tensor(0.1357, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1996, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0563, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3543, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2108, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1286, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0916, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4650, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1357, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0741, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1293, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2761, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0582, device='

tensor(0.2815, device='cuda:0')
tensor(0.1726, device='cuda:0')
tensor(0.2120, device='cuda:0')
tensor(0.2267, device='cuda:0')
tensor(0.1876, device='cuda:0')
tensor(0.2211, device='cuda:0')
tensor(0.1815, device='cuda:0')
tensor(0.2726, device='cuda:0')
tensor(0.2162, device='cuda:0')
tensor(0.2509, device='cuda:0')
tensor(0.3177, device='cuda:0')


  9%|▊         | 425/4929 [65:17:00<697:01:57, 557.13s/it]

------- 1st valloss=0.2464

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2092, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3322, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2838, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2805, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1503, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2758, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3549, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1419, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0671, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1137, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1129, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1062, device='cuda

tensor(0.2238, device='cuda:0')
tensor(0.1830, device='cuda:0')
tensor(0.3102, device='cuda:0')
tensor(0.3465, device='cuda:0')
tensor(0.3936, device='cuda:0')
tensor(0.2234, device='cuda:0')
tensor(0.1120, device='cuda:0')
tensor(0.2624, device='cuda:0')
tensor(0.1735, device='cuda:0')
tensor(0.3327, device='cuda:0')
tensor(0.1184, device='cuda:0')


  9%|▊         | 426/4929 [65:26:13<695:27:15, 555.99s/it]

------- 1st valloss=0.2365

tensor(0.0415, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2947, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2512, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0509, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2060, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1564, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2720, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1633, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4487, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0755, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1543, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0631, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1503, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1752, devi

tensor(0.2966, device='cuda:0')
tensor(0.2487, device='cuda:0')
tensor(0.2301, device='cuda:0')
tensor(0.1626, device='cuda:0')
tensor(0.3173, device='cuda:0')
tensor(0.1302, device='cuda:0')
tensor(0.3187, device='cuda:0')
tensor(0.1941, device='cuda:0')
tensor(0.1578, device='cuda:0')
tensor(0.2631, device='cuda:0')
tensor(0.3141, device='cuda:0')


  9%|▊         | 427/4929 [65:35:27<694:22:26, 555.25s/it]

------- 1st valloss=0.2460

tensor(0.1101, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2601, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1291, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1743, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2616, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1001, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1331, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1017, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3382, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0845, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1419, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0904, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda

tensor(0.1850, device='cuda:0')
tensor(0.3001, device='cuda:0')
tensor(0.2838, device='cuda:0')
tensor(0.1044, device='cuda:0')
tensor(0.2742, device='cuda:0')
tensor(0.2861, device='cuda:0')
tensor(0.2372, device='cuda:0')
tensor(0.2051, device='cuda:0')
tensor(0.1945, device='cuda:0')
tensor(0.1389, device='cuda:0')
tensor(0.2045, device='cuda:0')


  9%|▊         | 428/4929 [65:44:43<694:42:44, 555.65s/it]

------- 1st valloss=0.2275

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1987, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1151, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1529, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3652, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1855, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1205, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0735, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0791, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1025, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1216, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2197, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2040, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1316, device='

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.0504, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3068, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2947, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0869, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0648, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0841, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1028, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1397, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2001, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1947, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0959, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0808, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1895, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2081, device='cuda:0', grad_fn=<RsubBa

  9%|▉         | 439/4929 [67:26:48<692:54:00, 555.55s/it]

------- 1st valloss=0.2390

tensor(0.0926, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1545, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2213, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3621, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2048, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0851, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1927, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0431, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0625, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1371, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1661, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1090, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1087, devi

tensor(0.2184, device='cuda:0')
tensor(0.2888, device='cuda:0')
tensor(0.2348, device='cuda:0')
tensor(0.1416, device='cuda:0')
tensor(0.2366, device='cuda:0')
tensor(0.1990, device='cuda:0')
tensor(0.1934, device='cuda:0')
tensor(0.3165, device='cuda:0')
tensor(0.1975, device='cuda:0')
tensor(0.2929, device='cuda:0')
tensor(0.3167, device='cuda:0')


  9%|▉         | 440/4929 [67:36:03<692:44:07, 555.55s/it]

Checkpoint 510 saved !
------- 1st valloss=0.2360

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3923, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0879, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1191, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0817, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1987, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2328, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0466, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3334, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2429, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1403, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1288, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1901, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward

tensor(0.2060, device='cuda:0')
tensor(0.2453, device='cuda:0')
tensor(0.3400, device='cuda:0')
tensor(0.2814, device='cuda:0')
tensor(0.2610, device='cuda:0')
tensor(0.3950, device='cuda:0')
tensor(0.1844, device='cuda:0')
tensor(0.1092, device='cuda:0')
tensor(0.2643, device='cuda:0')
tensor(0.2335, device='cuda:0')
tensor(0.1695, device='cuda:0')


  9%|▉         | 441/4929 [67:45:20<693:11:38, 556.04s/it]

------- 1st valloss=0.2669

tensor(0.0522, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1230, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0910, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1136, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2070, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0841, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1099, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1758, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2474, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2835, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3439, device='cuda:0',

tensor(0.3456, device='cuda:0')
tensor(0.1055, device='cuda:0')
tensor(0.1929, device='cuda:0')
tensor(0.2240, device='cuda:0')
tensor(0.1161, device='cuda:0')
tensor(0.2423, device='cuda:0')
tensor(0.2424, device='cuda:0')
tensor(0.2854, device='cuda:0')
tensor(0.2028, device='cuda:0')
tensor(0.2256, device='cuda:0')
tensor(0.1246, device='cuda:0')


  9%|▉         | 442/4929 [67:54:38<693:29:51, 556.41s/it]

------- 1st valloss=0.2387

tensor(0.1061, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0798, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1721, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3799, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3596, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1351, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0974, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1367, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1285, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3603, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2595, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1273, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2200, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0420, devi

tensor(0.2495, device='cuda:0')
tensor(0.3717, device='cuda:0')
tensor(0.3529, device='cuda:0')
tensor(0.2477, device='cuda:0')
tensor(0.2922, device='cuda:0')
tensor(0.2889, device='cuda:0')
tensor(0.1298, device='cuda:0')
tensor(0.2932, device='cuda:0')
tensor(0.1730, device='cuda:0')
tensor(0.2409, device='cuda:0')
tensor(0.2614, device='cuda:0')


  9%|▉         | 443/4929 [68:03:51<692:21:35, 555.62s/it]

------- 1st valloss=0.2455

tensor(0.3732, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1411, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0875, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1518, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1791, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2759, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1310, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1891, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0717, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2300, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0838, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2799, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0848, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2026, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0754, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3903, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.3103, device='cuda:0')
tensor(0.1763, device='cuda:0')
tensor(0.2543, device='cuda:0')
tensor(0.1198, device='cuda:0')
tensor(0.3007, device='cuda:0')
tensor(0.2938, device='cuda:0')
tensor(0.2771, device='cuda:0')
tensor(0.4677, device='cuda:0')
tensor(0.3075, device='cuda:0')
tensor(0.3341, device='cuda:0')
tensor(0.1603, device='cuda:0')


  9%|▉         | 444/4929 [68:13:11<693:33:12, 556.70s/it]

------- 1st valloss=0.2668

tensor(0.1566, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1275, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1541, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1986, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2452, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2432, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1691, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1074, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1613, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0857, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1580, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2131, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1472, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2435, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0939, 

tensor(0.3254, device='cuda:0')
tensor(0.2958, device='cuda:0')
tensor(0.2447, device='cuda:0')
tensor(0.1329, device='cuda:0')
tensor(0.1878, device='cuda:0')
tensor(0.1204, device='cuda:0')
tensor(0.1350, device='cuda:0')
tensor(0.1910, device='cuda:0')
tensor(0.2806, device='cuda:0')
tensor(0.2922, device='cuda:0')
tensor(0.3324, device='cuda:0')


  9%|▉         | 445/4929 [68:22:24<692:10:18, 555.71s/it]

------- 1st valloss=0.2340

tensor(0.0755, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0764, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0796, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1364, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1497, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0846, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1498, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2654, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1037, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1966, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1673, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1104, device='cuda

tensor(0.1910, device='cuda:0')
tensor(0.3360, device='cuda:0')
tensor(0.1818, device='cuda:0')
tensor(0.2855, device='cuda:0')
tensor(0.2594, device='cuda:0')
tensor(0.2212, device='cuda:0')
tensor(0.1850, device='cuda:0')
tensor(0.2004, device='cuda:0')
tensor(0.3042, device='cuda:0')
tensor(0.2406, device='cuda:0')
tensor(0.3576, device='cuda:0')


  9%|▉         | 446/4929 [68:31:39<691:46:36, 555.52s/it]

------- 1st valloss=0.2617

tensor(0.1475, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2505, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0871, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1247, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1846, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1714, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1637, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1280, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1445, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1000, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0826, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1790, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0830, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0778, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.21

tensor(0.1972, device='cuda:0')
tensor(0.2832, device='cuda:0')
tensor(0.2334, device='cuda:0')
tensor(0.2068, device='cuda:0')
tensor(0.3282, device='cuda:0')
tensor(0.2256, device='cuda:0')
tensor(0.2722, device='cuda:0')
tensor(0.2397, device='cuda:0')
tensor(0.2844, device='cuda:0')
tensor(0.3021, device='cuda:0')
tensor(0.3528, device='cuda:0')


  9%|▉         | 447/4929 [68:40:56<692:07:00, 555.92s/it]

------- 1st valloss=0.2633

tensor(0.0706, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2537, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0882, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0809, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0696, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0981, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3311, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0783, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1504, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0935, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0997, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2511, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2162, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3611, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1222, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1178, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.3193, device='cuda:0')
tensor(0.2878, device='cuda:0')
tensor(0.3437, device='cuda:0')
tensor(0.4090, device='cuda:0')
tensor(0.2659, device='cuda:0')
tensor(0.2999, device='cuda:0')
tensor(0.1583, device='cuda:0')
tensor(0.4349, device='cuda:0')
tensor(0.2347, device='cuda:0')
tensor(0.1818, device='cuda:0')
tensor(0.1684, device='cuda:0')


  9%|▉         | 448/4929 [68:50:16<693:28:26, 557.13s/it]

------- 1st valloss=0.3091

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1090, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1467, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1541, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2776, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2684, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0784, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0990, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1492, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3346, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0886, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2150, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2069, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda

tensor(0.2287, device='cuda:0')
tensor(0.1639, device='cuda:0')
tensor(0.1173, device='cuda:0')
tensor(0.2999, device='cuda:0')
tensor(0.1775, device='cuda:0')
tensor(0.2840, device='cuda:0')
tensor(0.2386, device='cuda:0')
tensor(0.2916, device='cuda:0')
tensor(0.3344, device='cuda:0')
tensor(0.1540, device='cuda:0')
tensor(0.2201, device='cuda:0')


  9%|▉         | 449/4929 [68:59:23<689:32:14, 554.09s/it]

------- 1st valloss=0.2478

tensor(0.1806, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0906, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0860, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3378, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1164, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2092, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2704, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0680, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1570, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0801, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0940, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0697, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1707, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0854, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1332, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., 

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(0.0618, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1837, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2085, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2789, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0811, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0969, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0788, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1733, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2826, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1965, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2014, device='cuda:0', grad_fn=<RsubBackward1>)
Epoch 549 finished ! Training Loss: 0.3447

tensor(0.2441, device='cuda:0')
tensor(0.1581, device='cuda:0')
tensor(0.1982, device='cuda:0')
tensor(0.1428, device='cuda:0')
tensor(0.3042, device='cuda:0')
tensor(0.1181, device='cuda:0')
tensor(0.0906, device='cuda:0')

 10%|▉         | 479/4929 [73:37:12<692:04:06, 559.88s/it]

------- 1st valloss=0.2356

tensor(0.1362, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1296, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1158, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0998, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1928, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1311, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1290, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1781, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1561, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0883, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1553, device='cuda:0',

tensor(0.2888, device='cuda:0')
tensor(0.2632, device='cuda:0')
tensor(0.2243, device='cuda:0')
tensor(0.1365, device='cuda:0')
tensor(0.1234, device='cuda:0')
tensor(0.2560, device='cuda:0')
tensor(0.2386, device='cuda:0')
tensor(0.2518, device='cuda:0')
tensor(0.3679, device='cuda:0')
tensor(0.2160, device='cuda:0')
tensor(0.2237, device='cuda:0')


 10%|▉         | 480/4929 [73:46:19<687:03:01, 555.94s/it]

Checkpoint 550 saved !
------- 1st valloss=0.2421

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3855, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2256, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1804, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1403, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0750, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3458, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2333, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0835, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1384, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0786, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1398, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0996, device='cuda:0', grad_fn=<RsubBackward1>)


tensor(0.2349, device='cuda:0')
tensor(0.1721, device='cuda:0')
tensor(0.1957, device='cuda:0')
tensor(0.2340, device='cuda:0')
tensor(0.1809, device='cuda:0')
tensor(0.3293, device='cuda:0')
tensor(0.2226, device='cuda:0')
tensor(0.2441, device='cuda:0')
tensor(0.3037, device='cuda:0')
tensor(0.2316, device='cuda:0')
tensor(0.2923, device='cuda:0')


 10%|▉         | 481/4929 [73:55:35<686:59:34, 556.02s/it]

------- 1st valloss=0.2356

tensor(0.1217, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1168, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0396, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0992, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2411, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2546, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1422, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0897, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0881, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0720, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0636, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1744, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0868, device='

tensor(0.1429, device='cuda:0')
tensor(0.1847, device='cuda:0')
tensor(0.2513, device='cuda:0')
tensor(0.2842, device='cuda:0')
tensor(0.3412, device='cuda:0')
tensor(0.2357, device='cuda:0')
tensor(0.2511, device='cuda:0')
tensor(0.1618, device='cuda:0')
tensor(0.2825, device='cuda:0')
tensor(0.2109, device='cuda:0')
tensor(0.2487, device='cuda:0')


 10%|▉         | 482/4929 [74:04:44<684:24:20, 554.05s/it]

------- 1st valloss=0.2337

tensor(0.1584, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1467, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0928, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0868, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1281, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0778, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0898, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3018, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2231, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0485, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2039, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2431, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1600, device='

tensor(0.2774, device='cuda:0')
tensor(0.3073, device='cuda:0')
tensor(0.3117, device='cuda:0')
tensor(0.2760, device='cuda:0')
tensor(0.2261, device='cuda:0')
tensor(0.2931, device='cuda:0')
tensor(0.2202, device='cuda:0')
tensor(0.1687, device='cuda:0')
tensor(0.1348, device='cuda:0')
tensor(0.2302, device='cuda:0')
tensor(0.2798, device='cuda:0')


 10%|▉         | 483/4929 [74:13:59<684:22:49, 554.15s/it]

------- 1st valloss=0.2405

tensor(0.0969, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0754, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1234, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4548, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1661, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1110, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1693, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0570, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2786, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1379, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1585, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3533, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda

tensor(0.3742, device='cuda:0')
tensor(0.2146, device='cuda:0')
tensor(0.2008, device='cuda:0')
tensor(0.3458, device='cuda:0')
tensor(0.2339, device='cuda:0')
tensor(0.1231, device='cuda:0')
tensor(0.1564, device='cuda:0')
tensor(0.2988, device='cuda:0')
tensor(0.2887, device='cuda:0')
tensor(0.3120, device='cuda:0')
tensor(0.2784, device='cuda:0')


 10%|▉         | 484/4929 [74:23:11<683:29:52, 553.56s/it]

------- 1st valloss=0.2609

tensor(0.1308, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1476, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1461, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0955, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1532, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0746, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2815, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0651, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0904, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1339, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0422, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1766, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda

tensor(0.2882, device='cuda:0')
tensor(0.3111, device='cuda:0')
tensor(0.1188, device='cuda:0')
tensor(0.1568, device='cuda:0')
tensor(0.1089, device='cuda:0')
tensor(0.2202, device='cuda:0')
tensor(0.2427, device='cuda:0')
tensor(0.2150, device='cuda:0')
tensor(0.3727, device='cuda:0')
tensor(0.1157, device='cuda:0')
tensor(0.2477, device='cuda:0')


 10%|▉         | 485/4929 [74:32:28<684:36:11, 554.58s/it]

------- 1st valloss=0.2469

tensor(0.1451, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3274, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1052, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1710, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1091, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1968, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1653, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1112, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0538, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1823, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0684, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0886, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0656, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1960, devi

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1886, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2165, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2220, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1015, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2278, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2062, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1310, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0910, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0947, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1720, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3119, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0799, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1282, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1388, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1111, device='cuda:0', grad_fn=<Rs

 10%|▉         | 491/4929 [75:27:48<680:39:43, 552.14s/it]

------- 1st valloss=0.2443

tensor(0.1015, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1360, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1677, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2122, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0587, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2964, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1832, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1159, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1357, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0677, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2522, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3567, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1405, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3072, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1522, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.10

tensor(0.2421, device='cuda:0')
tensor(0.2602, device='cuda:0')
tensor(0.2624, device='cuda:0')
tensor(0.2555, device='cuda:0')
tensor(0.3353, device='cuda:0')
tensor(0.1654, device='cuda:0')
tensor(0.1736, device='cuda:0')
tensor(0.2368, device='cuda:0')
tensor(0.2717, device='cuda:0')
tensor(0.2873, device='cuda:0')
tensor(0.1286, device='cuda:0')


 10%|▉         | 492/4929 [75:37:03<681:23:18, 552.85s/it]

------- 1st valloss=0.2424

tensor(0.1334, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1630, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0521, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0601, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1742, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4524, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.4258, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0906, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0697, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0729, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1416, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0740, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1883, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0925, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0826, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1349, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(

tensor(0.2267, device='cuda:0')
tensor(0.1662, device='cuda:0')
tensor(0.2671, device='cuda:0')
tensor(0.2239, device='cuda:0')
tensor(0.2717, device='cuda:0')
tensor(0.2705, device='cuda:0')
tensor(0.2090, device='cuda:0')
tensor(0.3818, device='cuda:0')
tensor(0.3593, device='cuda:0')
tensor(0.4693, device='cuda:0')
tensor(0.1372, device='cuda:0')


 10%|█         | 493/4929 [75:46:18<681:56:40, 553.43s/it]

------- 1st valloss=0.2605

tensor(0.1319, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1171, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1558, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0692, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0795, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1561, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1190, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1163, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2857, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0896, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0907, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0784, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0885, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1870, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1885, 

tensor(0.2495, device='cuda:0')
tensor(0.4154, device='cuda:0')
tensor(0.3033, device='cuda:0')
tensor(0.1792, device='cuda:0')
tensor(0.1825, device='cuda:0')
tensor(0.2585, device='cuda:0')
tensor(0.2002, device='cuda:0')
tensor(0.2433, device='cuda:0')
tensor(0.2399, device='cuda:0')
tensor(0.2479, device='cuda:0')
tensor(0.2806, device='cuda:0')


 10%|█         | 494/4929 [75:55:31<681:34:08, 553.25s/it]

------- 1st valloss=0.2666

tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0729, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2120, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1596, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0838, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.3664, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1223, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1239, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.2246, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.0588, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)
tensor(0.1526, device='cuda:0', grad_fn=<RsubBackward1>)
tensor(1., device='cuda:0', grad_fn=<RsubBackward1>)


In [None]:
deeplab.eval()
refine_model.eval()

with torch.no_grad():

    val_loss = 0
    
    for v, vbatch in tqdm(enumerate(validation_loader)):
        # move data to device, convert dtype to desirable dtype
        val_losses = []
        for minibatch in range(BATCH_SIZE):
            image_1 = vbatch['image1_data'][minibatch].to(device=device, dtype=dtype)
            image_1 = image_1.view(1,1,256,256,256)

            label_1 = vbatch['image1_label'][minibatch].to(device=device, dtype=dtype)
            label_1 = label_1.view(1,3,256,256,256)

            bv_label = label_1[:, 2, :, :, :]
            bv_label = bv_label.view(1,1,256,256,256)

            original_res = [a[minibatch].item() for a in vbatch['original_resolution']]

            image_1_resize = F.interpolate(image_1, size=original_res, mode='trilinear', align_corners=True)
            image_1_resize = image_1_resize.view(1,1,original_res[0], original_res[1], original_res[2])

            bv_label_resize = F.interpolate(bv_label, size=original_res, mode='trilinear', align_corners=True)

            # Get coarse output from deeplab model from 256 resolution input
            out_coarse = deeplab(image_1)
            out_coarse = out_coarse.view(1,3,256,256,256)

            bv_coarse = out_coarse[:, 2, :, :, :]
            bv_coarse = bv_coarse.view(1,1,256,256,256)

            bv_coarse_resize = F.interpolate(bv_coarse, size=original_res, mode='trilinear', align_corners=True)
            
            box_size = 192
            half_size = int(box_size / 2)
            
            image_size_x = int(image_1_resize.shape[-3])
            image_size_y = int(image_1_resize.shape[-2])
            image_size_z = int(image_1_resize.shape[-1])
            
            x,y,z = loadbvcenter(binarize_output(bv_coarse_resize).view([1] + original_res))
            x, y, z = np.clip([x, y, z], a_min=box_size-half_size, a_max=box_size+half_size)
            x1 = max(x-half_size, 0)
            x2 = min(x+half_size, image_size_x)
            y1 = max(y-half_size, 0)
            y2 = min(y+half_size, image_size_y)
            z1 = max(z-half_size, 0)
            z2 = min(z+half_size, image_size_z)
            
            
            bbox_bv = bv_coarse_resize.view(original_res)[x1:x2, y1:y2, z1:z2]
            bbox_bv = reshape_image(bbox_bv.squeeze(), box_size, box_size, box_size).to(device, dtype)
            bbox_bv = bbox_bv.view(1,1,box_size,box_size,box_size)
            
            bbox_bv_label = bv_label_resize.view(original_res)[x1:x2, y1:y2, z1:z2]
            bbox_bv_label = reshape_image(bbox_bv_label.squeeze(), box_size, box_size, box_size).to(device, dtype)
            bbox_bv_label = bbox_bv_label.view(1,1,box_size,box_size,box_size)

            #bbox_image = get_bounding_box_image(image_1, (256,256,256)).to(device, dtype)
            bbox_image = image_1_resize[:, :, x1:x2, y1:y2, z1:z2]
            bbox_image = reshape_image(bbox_image.squeeze(), box_size, box_size, box_size).to(device, dtype)
            bbox_image = bbox_image.view(1, 1, box_size, box_size, box_size)
            
            #bbox_iamge, bbox_bv_label, bbox_bv = get_bboxes(image_1_resize, bv_label_resize, bv_coarse_resize, 1, 200)
            
            bbox_concat = torch.cat([bbox_bv, bbox_image], dim=1)
            bbox_concat_2 = F.interpolate(bbox_concat, scale_factor=1/2, mode='trilinear', align_corners=True)
            bbox_concat_4 = F.interpolate(bbox_concat, scale_factor=1/4, mode='trilinear', align_corners=True)

            refine_out = refine_model(bbox_concat, bbox_concat_2, bbox_concat_4)

            loss = dice_loss(refine_out, bbox_bv_label)
            val_losses.append(loss)
            
            if loss.item() > .04:
                show_image_slice(image_1)
                show_image_slice(bv_label_resize)
                show_image_slice(bv_coarse)
                show_image_slice(bbox_image)
                show_image_slice(bbox_bv_label)
                show_image_slice(bbox_bv)
                show_image_slice(refine_out)
        
        loss = sum(val_losses) / BATCH_SIZE
        print(loss.item())
        val_loss += loss.item()
        val_losses = []
        '''
        if loss.item() > .05:
            show_image_slice(image_1)
            show_image_slice(label_1)
            show_image_slice(output)
        '''

    outstr = 'bv loss = {0:.4f}'\
        .format(val_loss/(v+1)) + '\n'
    print(outstr)