In [1]:
seed = 1
sparsity = 0.9
width = 32

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import torch
import torch.nn as nn
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import time
import copy
import sys

import random
import numpy as np
import torch
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import scipy.stats as ss
from timm.data import Mixup
from timm.loss import SoftTargetCrossEntropy
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable

import sys
import numpy as np
import torch.nn.utils.prune as prune
from datautils import *

In [4]:
def random_seed(seed=42, rank=0):
    torch.manual_seed(seed + rank)
    np.random.seed(seed + rank)
    random.seed(seed + rank)

random_seed(seed)

In [5]:
train_loader, val_loader = get_loaders(
    "imagenet", path="",
    batchsize=256, workers=8,
    nsamples=-1, seed=0,
    noaug=False
)

In [6]:
def train(train_loader, model, criterion, optimizer, scaler, epoch):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    #top1 = AverageMeter('Acc@1', ':6.2f')
    #top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with torch.cuda.amp.autocast():
            output = model(images)
            loss = criterion(output, target)

        # measure accuracy and record loss
        #acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        #top1.update(acc1[0], images.size(0))
        #top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        #loss.backward()
        #optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 50 == 0:
            progress.display(i)
        if epoch == -1 and i == 50:
            break

    return losses.avg


def validate(val_loader, model, criterion):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 50 == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        #print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
        #      .format(top1=top1, top5=top5))

    return top1.avg

In [7]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = LR * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [8]:
def get_res(epochs=20):
    from torchvision.models import resnet50
    model = resnet50(pretrained=True)
    #model = resnet20()
    #model.load_state_dict(torch.load("done_rn20_%s_%s_160_amp_fixed.pth" % (seed, width)))
    print(model, file=sys.stderr)
    model.cuda()
    
    
    #optimizer = torch.optim.AdamW(model.parameters(), 0.001)
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)
    opt0 = torch.optim.SGD(model.parameters(), 0.0, momentum=0.9, nesterov=True, weight_decay=1e-4)
    #scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2], gamma=0.1)
    scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=epochs, power=1)
    #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 0.004, epochs, cycle_momentum=False)
    criterion = nn.CrossEntropyLoss()#SoftTargetCrossEntropy()
    criterion_val = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler()
    
    total_params = 0
    for n, m in model.named_modules():
        if ("conv" in n or "downsample" in n) and "Conv" in str(type(m)) and m.weight.shape[1] > 3:
            print(n, m.weight.shape)
            total_params += m.weight.numel()
            prune.l1_unstructured(m, name='weight', amount=sparsity)
    print("tot", total_params)
    
    best_acc1 = 0
    acc1 = validate(val_loader, model, criterion_val).item()
    print("start acc no bn", acc1)
    train_loss = train(train_loader, model, criterion, opt0, scaler, -1)
    acc1 = validate(val_loader, model, criterion_val).item()
    total_active = 0
    for n, m in model.named_modules():
        if "conv" in n or "downsample" in n and "Conv" in str(type(m)) and m.weight.shape[1] > 3:
            total_active += (m.weight != 0).sum().item()
    print("start acc bn", acc1, total_active, total_active / total_params)

    for epoch in range(epochs):
        train_loss = train(train_loader, model, criterion, optimizer, scaler, epoch)
        acc1 = validate(val_loader, model, criterion_val).item()
        scheduler.step()
        
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        total_active = 0
        for n, m in model.named_modules():
            if "conv" in n or "downsample" in n and "Conv" in str(type(m)) and m.weight.shape[1] > 3:
                total_active += (m.weight != 0).sum().item()
        print("epoch", epoch, train_loss, acc1, optimizer.param_groups[0]['lr'], total_active, total_active / total_params)
    
    return acc1, copy.deepcopy(model.state_dict())

acc, end = get_res()

print("acc", acc)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

layer1.0.conv1 torch.Size([64, 64, 1, 1])
layer1.0.conv2 torch.Size([64, 64, 3, 3])
layer1.0.conv3 torch.Size([256, 64, 1, 1])
layer1.0.downsample.0 torch.Size([256, 64, 1, 1])
layer1.1.conv1 torch.Size([64, 256, 1, 1])
layer1.1.conv2 torch.Size([64, 64, 3, 3])
layer1.1.conv3 torch.Size([256, 64, 1, 1])
layer1.2.conv1 torch.Size([64, 256, 1, 1])
layer1.2.conv2 torch.Size([64, 64, 3, 3])
layer1.2.conv3 torch.Size([256, 64, 1, 1])
layer2.0.conv1 torch.Size([128, 256, 1, 1])
layer2.0.conv2 torch.Size([128, 128, 3, 3])
layer2.0.conv3 torch.Size([512, 128, 1, 1])
layer2.0.downsample.0 torch.Size([512, 256, 1, 1])
layer2.1.conv1 torch.Size([128, 512, 1, 1])
layer2.1.conv2 torch.Size([128, 128, 3, 3])
layer2.1.conv3 torch.Size([512, 128, 1, 1])
layer2.2.conv1 torch.Size([128, 512, 1, 1])
layer2.2.conv2 torch.Size([128, 128, 3, 3])
layer2.2.conv3 torch.Size([512, 128, 1, 1])
layer2.3.conv1 torch.Size([128, 512, 1, 1])
layer2.3.conv2 torch.Size([128, 128, 3, 3])
layer2.3.conv3 torch.Size([512, 

Epoch: [0][2550/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.3117e+00 (1.5793e+00)
Epoch: [0][2600/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.4314e+00 (1.5765e+00)
Epoch: [0][2650/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.2277e+00 (1.5737e+00)
Epoch: [0][2700/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.4547e+00 (1.5711e+00)
Epoch: [0][2750/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.5176e+00 (1.5685e+00)
Epoch: [0][2800/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.3230e+00 (1.5658e+00)
Epoch: [0][2850/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.2650e+00 (1.5634e+00)
Epoch: [0][2900/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.3520e+00 (1.5604e+00)
Epoch: [0][2950/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.4344e+00 (1.5578e+00)
Epoch: [0][3000/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.3306e+00 (1.5552e+00)
Epoch: [0][3050/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1

Epoch: [1][1650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.3333e+00 (1.3178e+00)
Epoch: [1][1700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.3678e+00 (1.3183e+00)
Epoch: [1][1750/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.4587e+00 (1.3182e+00)
Epoch: [1][1800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.4753e+00 (1.3185e+00)
Epoch: [1][1850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.4913e+00 (1.3181e+00)
Epoch: [1][1900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.2681e+00 (1.3180e+00)
Epoch: [1][1950/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.5629e+00 (1.3177e+00)
Epoch: [1][2000/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.4205e+00 (1.3184e+00)
Epoch: [1][2050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.0460e+00 (1.3180e+00)
Epoch: [1][2100/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2332e+00 (1.3174e+00)
Epoch: [1][2150/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [2][ 750/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.3027e+00 (1.2632e+00)
Epoch: [2][ 800/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.3378e+00 (1.2627e+00)
Epoch: [2][ 850/5005]	Time  0.560 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.2538e+00 (1.2632e+00)
Epoch: [2][ 900/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.2693e+00 (1.2627e+00)
Epoch: [2][ 950/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.2643e+00 (1.2617e+00)
Epoch: [2][1000/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.2918e+00 (1.2617e+00)
Epoch: [2][1050/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.3807e+00 (1.2629e+00)
Epoch: [2][1100/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.1302e+00 (1.2619e+00)
Epoch: [2][1150/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.3756e+00 (1.2629e+00)
Epoch: [2][1200/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.3058e+00 (1.2624e+00)
Epoch: [2][1250/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1

Test: [100/196]	Time  0.377 ( 0.407)	Loss 1.7339e+00 (1.1925e+00)	Acc@1  58.59 ( 69.48)	Acc@5  82.42 ( 90.14)
Test: [150/196]	Time  0.378 ( 0.397)	Loss 1.3621e+00 (1.3336e+00)	Acc@1  69.92 ( 66.82)	Acc@5  84.38 ( 88.06)
epoch 2 1.2687634622129418 66.0219955444336 0.0085 2353956 0.10040116859931866
Epoch: [3][   0/5005]	Time  3.278 ( 3.278)	Data  2.713 ( 2.713)	Loss 1.4356e+00 (1.4356e+00)
Epoch: [3][  50/5005]	Time  0.558 ( 0.612)	Data  0.000 ( 0.053)	Loss 1.1244e+00 (1.2279e+00)
Epoch: [3][ 100/5005]	Time  0.559 ( 0.586)	Data  0.000 ( 0.027)	Loss 1.0817e+00 (1.2346e+00)
Epoch: [3][ 150/5005]	Time  0.559 ( 0.577)	Data  0.000 ( 0.018)	Loss 1.2548e+00 (1.2405e+00)
Epoch: [3][ 200/5005]	Time  0.559 ( 0.573)	Data  0.000 ( 0.014)	Loss 1.2519e+00 (1.2257e+00)
Epoch: [3][ 250/5005]	Time  0.559 ( 0.570)	Data  0.000 ( 0.011)	Loss 1.3461e+00 (1.2256e+00)
Epoch: [3][ 300/5005]	Time  0.559 ( 0.568)	Data  0.000 ( 0.009)	Loss 1.3740e+00 (1.2235e+00)
Epoch: [3][ 350/5005]	Time  0.559 ( 0.567)	Data  0

Epoch: [3][4250/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1738e+00 (1.2364e+00)
Epoch: [3][4300/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1273e+00 (1.2366e+00)
Epoch: [3][4350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2666e+00 (1.2365e+00)
Epoch: [3][4400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3989e+00 (1.2365e+00)
Epoch: [3][4450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.4492e+00 (1.2365e+00)
Epoch: [3][4500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0786e+00 (1.2364e+00)
Epoch: [3][4550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3811e+00 (1.2363e+00)
Epoch: [3][4600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2835e+00 (1.2362e+00)
Epoch: [3][4650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3651e+00 (1.2362e+00)
Epoch: [3][4700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2580e+00 (1.2363e+00)
Epoch: [3][4750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [4][3350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0611e+00 (1.2050e+00)
Epoch: [4][3400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1410e+00 (1.2052e+00)
Epoch: [4][3450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3577e+00 (1.2051e+00)
Epoch: [4][3500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1849e+00 (1.2049e+00)
Epoch: [4][3550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1206e+00 (1.2048e+00)
Epoch: [4][3600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0442e+00 (1.2048e+00)
Epoch: [4][3650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3940e+00 (1.2050e+00)
Epoch: [4][3700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1926e+00 (1.2052e+00)
Epoch: [4][3750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1362e+00 (1.2053e+00)
Epoch: [4][3800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1744e+00 (1.2057e+00)
Epoch: [4][3850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [5][2450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0458e+00 (1.1827e+00)
Epoch: [5][2500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3112e+00 (1.1829e+00)
Epoch: [5][2550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0029e+00 (1.1829e+00)
Epoch: [5][2600/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1968e+00 (1.1835e+00)
Epoch: [5][2650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1211e+00 (1.1838e+00)
Epoch: [5][2700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3254e+00 (1.1843e+00)
Epoch: [5][2750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1805e+00 (1.1847e+00)
Epoch: [5][2800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3200e+00 (1.1849e+00)
Epoch: [5][2850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0836e+00 (1.1850e+00)
Epoch: [5][2900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.3514e+00 (1.1849e+00)
Epoch: [5][2950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [6][1550/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.2779e+00 (1.1592e+00)
Epoch: [6][1600/5005]	Time  0.560 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.1900e+00 (1.1579e+00)
Epoch: [6][1650/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0797e+00 (1.1580e+00)
Epoch: [6][1700/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.1289e+00 (1.1573e+00)
Epoch: [6][1750/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0323e+00 (1.1573e+00)
Epoch: [6][1800/5005]	Time  0.560 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0358e+00 (1.1572e+00)
Epoch: [6][1850/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0998e+00 (1.1581e+00)
Epoch: [6][1900/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.3044e+00 (1.1584e+00)
Epoch: [6][1950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.2267e+00 (1.1584e+00)
Epoch: [6][2000/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2579e+00 (1.1585e+00)
Epoch: [6][2050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [7][ 650/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 1.0122e+00 (1.1326e+00)
Epoch: [7][ 700/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 1.1959e+00 (1.1346e+00)
Epoch: [7][ 750/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 1.0631e+00 (1.1343e+00)
Epoch: [7][ 800/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 1.0941e+00 (1.1353e+00)
Epoch: [7][ 850/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 9.3405e-01 (1.1331e+00)
Epoch: [7][ 900/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.1820e+00 (1.1327e+00)
Epoch: [7][ 950/5005]	Time  0.560 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.0807e+00 (1.1351e+00)
Epoch: [7][1000/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.0538e+00 (1.1368e+00)
Epoch: [7][1050/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.1302e+00 (1.1372e+00)
Epoch: [7][1100/5005]	Time  0.560 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.1393e+00 (1.1382e+00)
Epoch: [7][1150/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1

Test: [ 50/196]	Time  0.377 ( 0.435)	Loss 5.4727e-01 (9.0674e-01)	Acc@1  84.77 ( 75.97)	Acc@5  97.27 ( 93.57)
Test: [100/196]	Time  0.378 ( 0.407)	Loss 1.4483e+00 (1.0343e+00)	Acc@1  62.50 ( 73.22)	Acc@5  87.89 ( 92.20)
Test: [150/196]	Time  0.378 ( 0.397)	Loss 1.4907e+00 (1.1656e+00)	Acc@1  70.70 ( 70.90)	Acc@5  84.38 ( 90.29)
epoch 7 1.148222963404673 69.8479995727539 0.006000000000000002 2353956 0.10040116859931866
Epoch: [8][   0/5005]	Time  3.235 ( 3.235)	Data  2.671 ( 2.671)	Loss 1.0094e+00 (1.0094e+00)
Epoch: [8][  50/5005]	Time  0.559 ( 0.612)	Data  0.000 ( 0.053)	Loss 1.1515e+00 (1.1202e+00)
Epoch: [8][ 100/5005]	Time  0.560 ( 0.586)	Data  0.000 ( 0.027)	Loss 1.1940e+00 (1.1163e+00)
Epoch: [8][ 150/5005]	Time  0.560 ( 0.577)	Data  0.000 ( 0.018)	Loss 1.0851e+00 (1.1119e+00)
Epoch: [8][ 200/5005]	Time  0.559 ( 0.572)	Data  0.000 ( 0.013)	Loss 1.1544e+00 (1.1166e+00)
Epoch: [8][ 250/5005]	Time  0.559 ( 0.570)	Data  0.000 ( 0.011)	Loss 1.0927e+00 (1.1218e+00)
Epoch: [8][ 300/5005

Epoch: [8][4200/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2927e+00 (1.1294e+00)
Epoch: [8][4250/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1103e+00 (1.1292e+00)
Epoch: [8][4300/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1517e+00 (1.1295e+00)
Epoch: [8][4350/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0783e+00 (1.1295e+00)
Epoch: [8][4400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1611e+00 (1.1296e+00)
Epoch: [8][4450/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2193e+00 (1.1299e+00)
Epoch: [8][4500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.4819e-01 (1.1298e+00)
Epoch: [8][4550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1265e+00 (1.1301e+00)
Epoch: [8][4600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2026e+00 (1.1302e+00)
Epoch: [8][4650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0718e+00 (1.1302e+00)
Epoch: [8][4700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [9][3300/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0247e+00 (1.1101e+00)
Epoch: [9][3350/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2064e+00 (1.1104e+00)
Epoch: [9][3400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0600e+00 (1.1101e+00)
Epoch: [9][3450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.6226e-01 (1.1101e+00)
Epoch: [9][3500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1552e+00 (1.1106e+00)
Epoch: [9][3550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0851e+00 (1.1107e+00)
Epoch: [9][3600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1079e+00 (1.1111e+00)
Epoch: [9][3650/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1230e+00 (1.1115e+00)
Epoch: [9][3700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0819e+00 (1.1113e+00)
Epoch: [9][3750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.0469e-01 (1.1111e+00)
Epoch: [9][3800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [10][2350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1255e+00 (1.0923e+00)
Epoch: [10][2400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.6689e-01 (1.0927e+00)
Epoch: [10][2450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.9029e-01 (1.0929e+00)
Epoch: [10][2500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0930e+00 (1.0928e+00)
Epoch: [10][2550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1045e+00 (1.0929e+00)
Epoch: [10][2600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1057e+00 (1.0937e+00)
Epoch: [10][2650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5363e-01 (1.0935e+00)
Epoch: [10][2700/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1154e+00 (1.0937e+00)
Epoch: [10][2750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1648e+00 (1.0936e+00)
Epoch: [10][2800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2588e+00 (1.0942e+00)
Epoch: [10][2850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [11][1400/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0428e+00 (1.0693e+00)
Epoch: [11][1450/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 9.8056e-01 (1.0697e+00)
Epoch: [11][1500/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0078e+00 (1.0701e+00)
Epoch: [11][1550/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0211e+00 (1.0704e+00)
Epoch: [11][1600/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 9.8310e-01 (1.0703e+00)
Epoch: [11][1650/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0105e+00 (1.0705e+00)
Epoch: [11][1700/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0085e+00 (1.0706e+00)
Epoch: [11][1750/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.001)	Loss 1.0363e+00 (1.0704e+00)
Epoch: [11][1800/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.001)	Loss 1.2299e+00 (1.0704e+00)
Epoch: [11][1850/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.001)	Loss 1.1624e+00 (1.0706e+00)
Epoch: [11][1900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [12][ 450/5005]	Time  0.560 ( 0.565)	Data  0.000 ( 0.006)	Loss 1.0309e+00 (1.0514e+00)
Epoch: [12][ 500/5005]	Time  0.559 ( 0.565)	Data  0.000 ( 0.006)	Loss 1.1085e+00 (1.0504e+00)
Epoch: [12][ 550/5005]	Time  0.558 ( 0.564)	Data  0.000 ( 0.005)	Loss 1.0110e+00 (1.0510e+00)
Epoch: [12][ 600/5005]	Time  0.559 ( 0.564)	Data  0.000 ( 0.005)	Loss 1.0070e+00 (1.0515e+00)
Epoch: [12][ 650/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 9.8414e-01 (1.0506e+00)
Epoch: [12][ 700/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 9.7527e-01 (1.0508e+00)
Epoch: [12][ 750/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 1.0330e+00 (1.0497e+00)
Epoch: [12][ 800/5005]	Time  0.560 ( 0.563)	Data  0.000 ( 0.004)	Loss 9.7572e-01 (1.0507e+00)
Epoch: [12][ 850/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.2793e+00 (1.0505e+00)
Epoch: [12][ 900/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.0758e+00 (1.0496e+00)
Epoch: [12][ 950/5005]	Time  0.560 ( 0.562)	Data  0.000 ( 0.

Epoch: [12][4850/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0023e+00 (1.0609e+00)
Epoch: [12][4900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0867e+00 (1.0610e+00)
Epoch: [12][4950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5410e-01 (1.0614e+00)
Epoch: [12][5000/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1866e+00 (1.0617e+00)
Test: [  0/196]	Time  3.375 ( 3.375)	Loss 6.5323e-01 (6.5323e-01)	Acc@1  81.25 ( 81.25)	Acc@5  96.48 ( 96.48)
Test: [ 50/196]	Time  0.378 ( 0.436)	Loss 5.2324e-01 (8.3940e-01)	Acc@1  87.50 ( 77.72)	Acc@5  96.48 ( 94.23)
Test: [100/196]	Time  0.378 ( 0.407)	Loss 1.4116e+00 (9.7806e-01)	Acc@1  63.28 ( 74.64)	Acc@5  87.89 ( 92.72)
Test: [150/196]	Time  0.378 ( 0.397)	Loss 1.1879e+00 (1.0970e+00)	Acc@1  73.83 ( 72.39)	Acc@5  89.45 ( 91.13)
epoch 12 1.0616191091124174 71.36199951171875 0.003500000000000001 2353956 0.10040116859931866
Epoch: [13][   0/5005]	Time  2.892 ( 2.892)	Data  2.331 ( 2.331)	Loss 1.1572e+00 (1.1572e

Epoch: [13][3900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0936e+00 (1.0416e+00)
Epoch: [13][3950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0880e+00 (1.0414e+00)
Epoch: [13][4000/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.2639e-01 (1.0414e+00)
Epoch: [13][4050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0282e+00 (1.0416e+00)
Epoch: [13][4100/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1386e+00 (1.0417e+00)
Epoch: [13][4150/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0535e+00 (1.0416e+00)
Epoch: [13][4200/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0368e+00 (1.0416e+00)
Epoch: [13][4250/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2879e+00 (1.0419e+00)
Epoch: [13][4300/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0673e+00 (1.0416e+00)
Epoch: [13][4350/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1610e+00 (1.0416e+00)
Epoch: [13][4400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [14][2950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.4935e-01 (1.0194e+00)
Epoch: [14][3000/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1270e+00 (1.0197e+00)
Epoch: [14][3050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1166e+00 (1.0197e+00)
Epoch: [14][3100/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1198e+00 (1.0199e+00)
Epoch: [14][3150/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0179e+00 (1.0197e+00)
Epoch: [14][3200/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5509e-01 (1.0202e+00)
Epoch: [14][3250/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.0393e-01 (1.0202e+00)
Epoch: [14][3300/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0269e+00 (1.0203e+00)
Epoch: [14][3350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.2353e-01 (1.0199e+00)
Epoch: [14][3400/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.3773e-01 (1.0203e+00)
Epoch: [14][3450/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.

Epoch: [15][2000/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 9.8435e-01 (9.9938e-01)
Epoch: [15][2050/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0656e+00 (9.9917e-01)
Epoch: [15][2100/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.9064e-01 (9.9885e-01)
Epoch: [15][2150/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.001)	Loss 9.1140e-01 (9.9893e-01)
Epoch: [15][2200/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5707e-01 (9.9848e-01)
Epoch: [15][2250/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.9017e-01 (9.9877e-01)
Epoch: [15][2300/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.6033e-01 (9.9895e-01)
Epoch: [15][2350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0441e+00 (9.9942e-01)
Epoch: [15][2400/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.5871e-01 (9.9938e-01)
Epoch: [15][2450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.5966e-01 (9.9918e-01)
Epoch: [15][2500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [16][1050/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.002)	Loss 1.1207e+00 (9.7553e-01)
Epoch: [16][1100/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.2031e+00 (9.7635e-01)
Epoch: [16][1150/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 9.8007e-01 (9.7719e-01)
Epoch: [16][1200/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.4168e-01 (9.7755e-01)
Epoch: [16][1250/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 7.5214e-01 (9.7743e-01)
Epoch: [16][1300/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0205e+00 (9.7657e-01)
Epoch: [16][1350/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0567e+00 (9.7721e-01)
Epoch: [16][1400/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.0068e+00 (9.7651e-01)
Epoch: [16][1450/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 7.0601e-01 (9.7722e-01)
Epoch: [16][1500/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 9.6612e-01 (9.7657e-01)
Epoch: [16][1550/5005]	Time  0.560 ( 0.561)	Data  0.000 ( 0.

Epoch: [17][ 100/5005]	Time  0.559 ( 0.582)	Data  0.000 ( 0.023)	Loss 9.2221e-01 (9.7387e-01)
Epoch: [17][ 150/5005]	Time  0.560 ( 0.574)	Data  0.000 ( 0.015)	Loss 7.8894e-01 (9.7570e-01)
Epoch: [17][ 200/5005]	Time  0.560 ( 0.570)	Data  0.000 ( 0.011)	Loss 1.0130e+00 (9.6431e-01)
Epoch: [17][ 250/5005]	Time  0.559 ( 0.568)	Data  0.000 ( 0.009)	Loss 1.0155e+00 (9.6444e-01)
Epoch: [17][ 300/5005]	Time  0.559 ( 0.567)	Data  0.000 ( 0.008)	Loss 1.0555e+00 (9.6541e-01)
Epoch: [17][ 350/5005]	Time  0.559 ( 0.566)	Data  0.000 ( 0.007)	Loss 1.0263e+00 (9.6509e-01)
Epoch: [17][ 400/5005]	Time  0.559 ( 0.565)	Data  0.000 ( 0.006)	Loss 9.2433e-01 (9.6767e-01)
Epoch: [17][ 450/5005]	Time  0.559 ( 0.564)	Data  0.000 ( 0.005)	Loss 9.8653e-01 (9.6747e-01)
Epoch: [17][ 500/5005]	Time  0.559 ( 0.564)	Data  0.000 ( 0.005)	Loss 9.5321e-01 (9.6836e-01)
Epoch: [17][ 550/5005]	Time  0.559 ( 0.563)	Data  0.000 ( 0.004)	Loss 9.0155e-01 (9.6855e-01)
Epoch: [17][ 600/5005]	Time  0.560 ( 0.563)	Data  0.000 ( 0.

Epoch: [17][4500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5495e-01 (9.6637e-01)
Epoch: [17][4550/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5907e-01 (9.6613e-01)
Epoch: [17][4600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.7828e-01 (9.6623e-01)
Epoch: [17][4650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.7708e-01 (9.6627e-01)
Epoch: [17][4700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1006e+00 (9.6629e-01)
Epoch: [17][4750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.8662e-01 (9.6645e-01)
Epoch: [17][4800/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.9863e-01 (9.6652e-01)
Epoch: [17][4850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.8229e-01 (9.6663e-01)
Epoch: [17][4900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.4706e-01 (9.6645e-01)
Epoch: [17][4950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.1920e-01 (9.6658e-01)
Epoch: [17][5000/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.

Epoch: [18][3550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.8466e-01 (9.4548e-01)
Epoch: [18][3600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.7771e-01 (9.4552e-01)
Epoch: [18][3650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5421e-01 (9.4554e-01)
Epoch: [18][3700/5005]	Time  0.562 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.9104e-01 (9.4536e-01)
Epoch: [18][3750/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.5887e-01 (9.4512e-01)
Epoch: [18][3800/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.4017e-01 (9.4517e-01)
Epoch: [18][3850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1595e+00 (9.4495e-01)
Epoch: [18][3900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1105e+00 (9.4500e-01)
Epoch: [18][3950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0718e+00 (9.4496e-01)
Epoch: [18][4000/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.6237e-01 (9.4479e-01)
Epoch: [18][4050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [19][2600/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.8835e-01 (9.2693e-01)
Epoch: [19][2650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.2079e-01 (9.2709e-01)
Epoch: [19][2700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.0275e-01 (9.2700e-01)
Epoch: [19][2750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0001e+00 (9.2702e-01)
Epoch: [19][2800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0702e+00 (9.2680e-01)
Epoch: [19][2850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0462e+00 (9.2708e-01)
Epoch: [19][2900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.1197e-01 (9.2731e-01)
Epoch: [19][2950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0258e+00 (9.2742e-01)
Epoch: [19][3000/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.1706e-01 (9.2714e-01)
Epoch: [19][3050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0082e+00 (9.2714e-01)
Epoch: [19][3100/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.