In [1]:
seed = 1
sparsity = 0.8
width = 32

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
import torch
import torch.nn as nn
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import time
import copy
import sys

import random
import numpy as np
import torch
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import scipy.stats as ss
from timm.data import Mixup
from timm.loss import SoftTargetCrossEntropy
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable

import sys
import numpy as np
import torch.nn.utils.prune as prune
from datautils import *

In [4]:
def random_seed(seed=42, rank=0):
    torch.manual_seed(seed + rank)
    np.random.seed(seed + rank)
    random.seed(seed + rank)

random_seed(seed)

In [5]:
train_loader, val_loader = get_loaders(
    "imagenet", path="",
    batchsize=256, workers=8,
    nsamples=-1, seed=0,
    noaug=False
)

In [6]:
def train(train_loader, model, criterion, optimizer, scaler, epoch):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    #top1 = AverageMeter('Acc@1', ':6.2f')
    #top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        with torch.cuda.amp.autocast():
            output = model(images)
            loss = criterion(output, target)

        # measure accuracy and record loss
        #acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        #top1.update(acc1[0], images.size(0))
        #top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        #loss.backward()
        #optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 50 == 0:
            progress.display(i)
        if epoch == -1 and i == 50:
            break

    return losses.avg


def validate(val_loader, model, criterion):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % 50 == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        #print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
        #      .format(top1=top1, top5=top5))

    return top1.avg

In [7]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = LR * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [8]:
def get_res(epochs=20):
    from torchvision.models import resnet50
    model = resnet50(pretrained=True)
    #model = resnet20()
    #model.load_state_dict(torch.load("done_rn20_%s_%s_160_amp_fixed.pth" % (seed, width)))
    print(model, file=sys.stderr)
    model.cuda()
    
    
    #optimizer = torch.optim.AdamW(model.parameters(), 0.001)
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)
    opt0 = torch.optim.SGD(model.parameters(), 0.0, momentum=0.9, nesterov=True, weight_decay=1e-4)
    #scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2], gamma=0.1)
    scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=epochs, power=1)
    #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 0.004, epochs, cycle_momentum=False)
    criterion = nn.CrossEntropyLoss()#SoftTargetCrossEntropy()
    criterion_val = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler()
    
    total_params = 0
    for n, m in model.named_modules():
        if ("conv" in n or "downsample" in n) and "Conv" in str(type(m)) and m.weight.shape[1] > 3:
            print(n, m.weight.shape)
            total_params += m.weight.numel()
            prune.l1_unstructured(m, name='weight', amount=sparsity)
    print("tot", total_params)
    
    best_acc1 = 0
    acc1 = validate(val_loader, model, criterion_val).item()
    print("start acc no bn", acc1)
    train_loss = train(train_loader, model, criterion, opt0, scaler, -1)
    acc1 = validate(val_loader, model, criterion_val).item()
    total_active = 0
    for n, m in model.named_modules():
        if "conv" in n or "downsample" in n and "Conv" in str(type(m)) and m.weight.shape[1] > 3:
            total_active += (m.weight != 0).sum().item()
    print("start acc bn", acc1, total_active, total_active / total_params)

    for epoch in range(epochs):
        train_loss = train(train_loader, model, criterion, optimizer, scaler, epoch)
        acc1 = validate(val_loader, model, criterion_val).item()
        scheduler.step()
        
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        total_active = 0
        for n, m in model.named_modules():
            if "conv" in n or "downsample" in n and "Conv" in str(type(m)) and m.weight.shape[1] > 3:
                total_active += (m.weight != 0).sum().item()
        print("epoch", epoch, train_loss, acc1, optimizer.param_groups[0]['lr'], total_active, total_active / total_params)
    
    return acc1, copy.deepcopy(model.state_dict())

acc, end = get_res()

print("acc", acc)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

layer1.0.conv1 torch.Size([64, 64, 1, 1])
layer1.0.conv2 torch.Size([64, 64, 3, 3])
layer1.0.conv3 torch.Size([256, 64, 1, 1])
layer1.0.downsample.0 torch.Size([256, 64, 1, 1])
layer1.1.conv1 torch.Size([64, 256, 1, 1])
layer1.1.conv2 torch.Size([64, 64, 3, 3])
layer1.1.conv3 torch.Size([256, 64, 1, 1])
layer1.2.conv1 torch.Size([64, 256, 1, 1])
layer1.2.conv2 torch.Size([64, 64, 3, 3])
layer1.2.conv3 torch.Size([256, 64, 1, 1])
layer2.0.conv1 torch.Size([128, 256, 1, 1])
layer2.0.conv2 torch.Size([128, 128, 3, 3])
layer2.0.conv3 torch.Size([512, 128, 1, 1])
layer2.0.downsample.0 torch.Size([512, 256, 1, 1])
layer2.1.conv1 torch.Size([128, 512, 1, 1])
layer2.1.conv2 torch.Size([128, 128, 3, 3])
layer2.1.conv3 torch.Size([512, 128, 1, 1])
layer2.2.conv1 torch.Size([128, 512, 1, 1])
layer2.2.conv2 torch.Size([128, 128, 3, 3])
layer2.2.conv3 torch.Size([512, 128, 1, 1])
layer2.3.conv1 torch.Size([128, 512, 1, 1])
layer2.3.conv2 torch.Size([128, 128, 3, 3])
layer2.3.conv3 torch.Size([512, 

Epoch: [0][2550/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.0736e+00 (1.1638e+00)
Epoch: [0][2600/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.0988e+00 (1.1632e+00)
Epoch: [0][2650/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 9.0896e-01 (1.1628e+00)
Epoch: [0][2700/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.0670e+00 (1.1624e+00)
Epoch: [0][2750/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.2468e+00 (1.1619e+00)
Epoch: [0][2800/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.0274e+00 (1.1613e+00)
Epoch: [0][2850/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.0289e+00 (1.1611e+00)
Epoch: [0][2900/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.0493e+00 (1.1602e+00)
Epoch: [0][2950/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.0800e+00 (1.1596e+00)
Epoch: [0][3000/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 1.1059e+00 (1.1591e+00)
Epoch: [0][3050/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 9

Epoch: [1][1650/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.1183e+00 (1.0667e+00)
Epoch: [1][1700/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.1739e+00 (1.0671e+00)
Epoch: [1][1750/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.2097e+00 (1.0676e+00)
Epoch: [1][1800/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.2935e+00 (1.0683e+00)
Epoch: [1][1850/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.2205e+00 (1.0682e+00)
Epoch: [1][1900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.1254e+00 (1.0685e+00)
Epoch: [1][1950/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.3243e+00 (1.0686e+00)
Epoch: [1][2000/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1900e+00 (1.0695e+00)
Epoch: [1][2050/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.6963e-01 (1.0697e+00)
Epoch: [1][2100/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0618e+00 (1.0693e+00)
Epoch: [1][2150/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [2][ 750/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.004)	Loss 1.1033e+00 (1.0289e+00)
Epoch: [2][ 800/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.1044e+00 (1.0291e+00)
Epoch: [2][ 850/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.003)	Loss 1.0012e+00 (1.0304e+00)
Epoch: [2][ 900/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.003)	Loss 9.5912e-01 (1.0298e+00)
Epoch: [2][ 950/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.003)	Loss 1.0624e+00 (1.0298e+00)
Epoch: [2][1000/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.003)	Loss 1.0298e+00 (1.0300e+00)
Epoch: [2][1050/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.003)	Loss 1.1333e+00 (1.0315e+00)
Epoch: [2][1100/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.003)	Loss 8.9054e-01 (1.0306e+00)
Epoch: [2][1150/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.1295e+00 (1.0315e+00)
Epoch: [2][1200/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.1133e+00 (1.0315e+00)
Epoch: [2][1250/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1

Test: [100/196]	Time  0.377 ( 0.413)	Loss 1.3742e+00 (9.8111e-01)	Acc@1  66.80 ( 74.44)	Acc@5  87.11 ( 92.66)
Test: [150/196]	Time  0.378 ( 0.401)	Loss 1.1886e+00 (1.1128e+00)	Acc@1  73.44 ( 71.85)	Acc@5  87.89 ( 90.95)
epoch 2 1.0462989768734747 71.06999969482422 0.0085 4698510 0.20040132214688156
Epoch: [3][   0/5005]	Time  3.221 ( 3.221)	Data  2.657 ( 2.657)	Loss 1.2142e+00 (1.2142e+00)
Epoch: [3][  50/5005]	Time  0.558 ( 0.611)	Data  0.000 ( 0.052)	Loss 9.5621e-01 (1.0098e+00)
Epoch: [3][ 100/5005]	Time  0.558 ( 0.585)	Data  0.000 ( 0.027)	Loss 8.8497e-01 (1.0148e+00)
Epoch: [3][ 150/5005]	Time  0.558 ( 0.576)	Data  0.000 ( 0.018)	Loss 1.0436e+00 (1.0206e+00)
Epoch: [3][ 200/5005]	Time  0.558 ( 0.572)	Data  0.000 ( 0.013)	Loss 1.0050e+00 (1.0080e+00)
Epoch: [3][ 250/5005]	Time  0.558 ( 0.569)	Data  0.000 ( 0.011)	Loss 1.0820e+00 (1.0065e+00)
Epoch: [3][ 300/5005]	Time  0.558 ( 0.567)	Data  0.000 ( 0.009)	Loss 1.1982e+00 (1.0057e+00)
Epoch: [3][ 350/5005]	Time  0.558 ( 0.566)	Data  

Epoch: [3][4250/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.2525e-01 (1.0240e+00)
Epoch: [3][4300/5005]	Time  0.562 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0136e+00 (1.0242e+00)
Epoch: [3][4350/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0841e+00 (1.0242e+00)
Epoch: [3][4400/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2150e+00 (1.0241e+00)
Epoch: [3][4450/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1713e+00 (1.0242e+00)
Epoch: [3][4500/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.3674e-01 (1.0241e+00)
Epoch: [3][4550/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.2302e+00 (1.0240e+00)
Epoch: [3][4600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1086e+00 (1.0240e+00)
Epoch: [3][4650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0767e+00 (1.0241e+00)
Epoch: [3][4700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0596e+00 (1.0243e+00)
Epoch: [3][4750/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1

Epoch: [4][3350/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.9542e-01 (9.9807e-01)
Epoch: [4][3400/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.4848e-01 (9.9834e-01)
Epoch: [4][3450/5005]	Time  0.562 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1386e+00 (9.9825e-01)
Epoch: [4][3500/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0120e+00 (9.9817e-01)
Epoch: [4][3550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.7283e-01 (9.9812e-01)
Epoch: [4][3600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.1716e-01 (9.9833e-01)
Epoch: [4][3650/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1992e+00 (9.9865e-01)
Epoch: [4][3700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.7073e-01 (9.9879e-01)
Epoch: [4][3750/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.6913e-01 (9.9877e-01)
Epoch: [4][3800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0821e+00 (9.9912e-01)
Epoch: [4][3850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9

Epoch: [5][2450/5005]	Time  0.561 ( 0.560)	Data  0.001 ( 0.001)	Loss 8.9964e-01 (9.7833e-01)
Epoch: [5][2500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1006e+00 (9.7864e-01)
Epoch: [5][2550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.3472e-01 (9.7883e-01)
Epoch: [5][2600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.7198e-01 (9.7933e-01)
Epoch: [5][2650/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.7519e-01 (9.7966e-01)
Epoch: [5][2700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0780e+00 (9.8021e-01)
Epoch: [5][2750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.2796e-01 (9.8067e-01)
Epoch: [5][2800/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1164e+00 (9.8083e-01)
Epoch: [5][2850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.8341e-01 (9.8102e-01)
Epoch: [5][2900/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.1017e+00 (9.8109e-01)
Epoch: [5][2950/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 9

Epoch: [6][1550/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 1.1130e+00 (9.5739e-01)
Epoch: [6][1600/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 9.6129e-01 (9.5621e-01)
Epoch: [6][1650/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.8287e-01 (9.5589e-01)
Epoch: [6][1700/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 9.0073e-01 (9.5544e-01)
Epoch: [6][1750/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.2898e-01 (9.5547e-01)
Epoch: [6][1800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 9.0552e-01 (9.5542e-01)
Epoch: [6][1850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.4614e-01 (9.5645e-01)
Epoch: [6][1900/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.0975e+00 (9.5680e-01)
Epoch: [6][1950/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.0230e+00 (9.5684e-01)
Epoch: [6][2000/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 1.0198e+00 (9.5662e-01)
Epoch: [6][2050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 1

Epoch: [7][ 650/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.004)	Loss 7.8760e-01 (9.3351e-01)
Epoch: [7][ 700/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.004)	Loss 1.0247e+00 (9.3512e-01)
Epoch: [7][ 750/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.004)	Loss 8.5250e-01 (9.3451e-01)
Epoch: [7][ 800/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.004)	Loss 8.3944e-01 (9.3545e-01)
Epoch: [7][ 850/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.003)	Loss 7.4175e-01 (9.3368e-01)
Epoch: [7][ 900/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.003)	Loss 9.7891e-01 (9.3303e-01)
Epoch: [7][ 950/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.003)	Loss 8.3470e-01 (9.3509e-01)
Epoch: [7][1000/5005]	Time  0.561 ( 0.561)	Data  0.000 ( 0.003)	Loss 8.3214e-01 (9.3703e-01)
Epoch: [7][1050/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.003)	Loss 9.2922e-01 (9.3723e-01)
Epoch: [7][1100/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.003)	Loss 9.7858e-01 (9.3821e-01)
Epoch: [7][1150/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.003)	Loss 1

Test: [ 50/196]	Time  0.377 ( 0.435)	Loss 4.8007e-01 (8.0124e-01)	Acc@1  87.89 ( 78.72)	Acc@5  97.27 ( 94.75)
Test: [100/196]	Time  0.377 ( 0.407)	Loss 1.3399e+00 (9.2762e-01)	Acc@1  61.33 ( 75.72)	Acc@5  89.06 ( 93.37)
Test: [150/196]	Time  0.378 ( 0.397)	Loss 1.2720e+00 (1.0487e+00)	Acc@1  71.48 ( 73.46)	Acc@5  89.06 ( 91.73)
epoch 7 0.9515117987812275 72.51200103759766 0.006000000000000002 4698510 0.20040132214688156
Epoch: [8][   0/5005]	Time  3.208 ( 3.208)	Data  2.648 ( 2.648)	Loss 8.7304e-01 (8.7304e-01)
Epoch: [8][  50/5005]	Time  0.559 ( 0.611)	Data  0.000 ( 0.052)	Loss 9.5669e-01 (9.1540e-01)
Epoch: [8][ 100/5005]	Time  0.560 ( 0.585)	Data  0.000 ( 0.026)	Loss 9.7956e-01 (9.1327e-01)
Epoch: [8][ 150/5005]	Time  0.558 ( 0.577)	Data  0.000 ( 0.018)	Loss 9.0067e-01 (9.1244e-01)
Epoch: [8][ 200/5005]	Time  0.558 ( 0.572)	Data  0.000 ( 0.013)	Loss 8.9880e-01 (9.1479e-01)
Epoch: [8][ 250/5005]	Time  0.560 ( 0.570)	Data  0.000 ( 0.011)	Loss 8.8328e-01 (9.1931e-01)
Epoch: [8][ 300/50

Epoch: [8][4200/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0631e+00 (9.3401e-01)
Epoch: [8][4250/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.3810e-01 (9.3406e-01)
Epoch: [8][4300/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.3702e-01 (9.3434e-01)
Epoch: [8][4350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.8409e-01 (9.3438e-01)
Epoch: [8][4400/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.9458e-01 (9.3452e-01)
Epoch: [8][4450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0388e+00 (9.3466e-01)
Epoch: [8][4500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.6537e-01 (9.3459e-01)
Epoch: [8][4550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.7045e-01 (9.3490e-01)
Epoch: [8][4600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.9674e-01 (9.3506e-01)
Epoch: [8][4650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.6445e-01 (9.3499e-01)
Epoch: [8][4700/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 9

Epoch: [9][3300/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.5172e-01 (9.1453e-01)
Epoch: [9][3350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0276e+00 (9.1477e-01)
Epoch: [9][3400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.9390e-01 (9.1452e-01)
Epoch: [9][3450/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.0690e-01 (9.1446e-01)
Epoch: [9][3500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0410e+00 (9.1502e-01)
Epoch: [9][3550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.2686e-01 (9.1511e-01)
Epoch: [9][3600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.0212e-01 (9.1546e-01)
Epoch: [9][3650/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.1757e-01 (9.1578e-01)
Epoch: [9][3700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.7530e-01 (9.1573e-01)
Epoch: [9][3750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.3165e-01 (9.1557e-01)
Epoch: [9][3800/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8

Epoch: [10][2350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5394e-01 (8.9557e-01)
Epoch: [10][2400/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7244e-01 (8.9584e-01)
Epoch: [10][2450/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 6.9894e-01 (8.9592e-01)
Epoch: [10][2500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.8286e-01 (8.9580e-01)
Epoch: [10][2550/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.9123e-01 (8.9588e-01)
Epoch: [10][2600/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.8129e-01 (8.9661e-01)
Epoch: [10][2650/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7716e-01 (8.9666e-01)
Epoch: [10][2700/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.0245e-01 (8.9696e-01)
Epoch: [10][2750/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5191e-01 (8.9683e-01)
Epoch: [10][2800/5005]	Time  0.562 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0556e+00 (8.9739e-01)
Epoch: [10][2850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [11][1400/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.1933e-01 (8.7380e-01)
Epoch: [11][1450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.6606e-01 (8.7429e-01)
Epoch: [11][1500/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 7.8474e-01 (8.7445e-01)
Epoch: [11][1550/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.0149e-01 (8.7482e-01)
Epoch: [11][1600/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.1726e-01 (8.7488e-01)
Epoch: [11][1650/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 7.4511e-01 (8.7513e-01)
Epoch: [11][1700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.0583e-01 (8.7511e-01)
Epoch: [11][1750/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.6052e-01 (8.7479e-01)
Epoch: [11][1800/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 9.5248e-01 (8.7455e-01)
Epoch: [11][1850/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 9.2905e-01 (8.7504e-01)
Epoch: [11][1900/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.

Epoch: [12][ 450/5005]	Time  0.559 ( 0.565)	Data  0.000 ( 0.006)	Loss 8.4366e-01 (8.5460e-01)
Epoch: [12][ 500/5005]	Time  0.559 ( 0.564)	Data  0.000 ( 0.006)	Loss 8.7721e-01 (8.5414e-01)
Epoch: [12][ 550/5005]	Time  0.559 ( 0.564)	Data  0.000 ( 0.005)	Loss 9.1172e-01 (8.5495e-01)
Epoch: [12][ 600/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.005)	Loss 8.3046e-01 (8.5578e-01)
Epoch: [12][ 650/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.004)	Loss 7.7503e-01 (8.5552e-01)
Epoch: [12][ 700/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.004)	Loss 7.8837e-01 (8.5558e-01)
Epoch: [12][ 750/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.004)	Loss 8.4215e-01 (8.5472e-01)
Epoch: [12][ 800/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.004)	Loss 7.5991e-01 (8.5515e-01)
Epoch: [12][ 850/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.003)	Loss 1.0375e+00 (8.5493e-01)
Epoch: [12][ 900/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.003)	Loss 8.5979e-01 (8.5418e-01)
Epoch: [12][ 950/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.

Epoch: [12][4850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.4728e-01 (8.6608e-01)
Epoch: [12][4900/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.5365e-01 (8.6624e-01)
Epoch: [12][4950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7661e-01 (8.6661e-01)
Epoch: [12][5000/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0463e+00 (8.6693e-01)
Test: [  0/196]	Time  3.311 ( 3.311)	Loss 5.7075e-01 (5.7075e-01)	Acc@1  84.38 ( 84.38)	Acc@5  96.48 ( 96.48)
Test: [ 50/196]	Time  0.377 ( 0.435)	Loss 4.6218e-01 (7.5216e-01)	Acc@1  88.67 ( 79.98)	Acc@5  96.48 ( 95.11)
Test: [100/196]	Time  0.378 ( 0.406)	Loss 1.3548e+00 (8.8647e-01)	Acc@1  65.23 ( 76.88)	Acc@5  88.28 ( 93.78)
Test: [150/196]	Time  0.377 ( 0.397)	Loss 1.1043e+00 (1.0008e+00)	Acc@1  76.56 ( 74.66)	Acc@5  88.67 ( 92.27)
epoch 12 0.8669103983416425 73.64799499511719 0.003500000000000001 4698510 0.20040132214688156
Epoch: [13][   0/5005]	Time  2.779 ( 2.779)	Data  2.217 ( 2.217)	Loss 8.6160e-01 (8.6160e

Epoch: [13][3900/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.4763e-01 (8.4668e-01)
Epoch: [13][3950/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.6633e-01 (8.4652e-01)
Epoch: [13][4000/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.0685e-01 (8.4662e-01)
Epoch: [13][4050/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.4958e-01 (8.4680e-01)
Epoch: [13][4100/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.8019e-01 (8.4705e-01)
Epoch: [13][4150/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.7521e-01 (8.4699e-01)
Epoch: [13][4200/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.0115e-01 (8.4710e-01)
Epoch: [13][4250/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 1.0846e+00 (8.4744e-01)
Epoch: [13][4300/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.8320e-01 (8.4721e-01)
Epoch: [13][4350/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.4898e-01 (8.4722e-01)
Epoch: [13][4400/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.

Epoch: [14][2950/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 5.7969e-01 (8.2155e-01)
Epoch: [14][3000/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.0424e-01 (8.2180e-01)
Epoch: [14][3050/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.4216e-01 (8.2178e-01)
Epoch: [14][3100/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.7088e-01 (8.2192e-01)
Epoch: [14][3150/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7809e-01 (8.2176e-01)
Epoch: [14][3200/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7926e-01 (8.2225e-01)
Epoch: [14][3250/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.4471e-01 (8.2250e-01)
Epoch: [14][3300/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 9.3179e-01 (8.2272e-01)
Epoch: [14][3350/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.8987e-01 (8.2255e-01)
Epoch: [14][3400/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 6.5693e-01 (8.2311e-01)
Epoch: [14][3450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [15][2000/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 7.0394e-01 (8.0192e-01)
Epoch: [15][2050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.002)	Loss 8.5155e-01 (8.0172e-01)
Epoch: [15][2100/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.002)	Loss 6.9524e-01 (8.0148e-01)
Epoch: [15][2150/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7267e-01 (8.0162e-01)
Epoch: [15][2200/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7761e-01 (8.0128e-01)
Epoch: [15][2250/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.0417e-01 (8.0158e-01)
Epoch: [15][2300/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.8435e-01 (8.0170e-01)
Epoch: [15][2350/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.3310e-01 (8.0220e-01)
Epoch: [15][2400/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 6.4703e-01 (8.0203e-01)
Epoch: [15][2450/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.0524e-01 (8.0205e-01)
Epoch: [15][2500/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [16][1050/5005]	Time  0.559 ( 0.562)	Data  0.000 ( 0.003)	Loss 9.6731e-01 (7.7894e-01)
Epoch: [16][1100/5005]	Time  0.558 ( 0.562)	Data  0.000 ( 0.002)	Loss 9.7354e-01 (7.7964e-01)
Epoch: [16][1150/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.5428e-01 (7.7994e-01)
Epoch: [16][1200/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 6.6317e-01 (7.8008e-01)
Epoch: [16][1250/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 5.9824e-01 (7.8026e-01)
Epoch: [16][1300/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.1940e-01 (7.7943e-01)
Epoch: [16][1350/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.6015e-01 (7.7989e-01)
Epoch: [16][1400/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 8.3941e-01 (7.7931e-01)
Epoch: [16][1450/5005]	Time  0.559 ( 0.561)	Data  0.000 ( 0.002)	Loss 5.4901e-01 (7.7970e-01)
Epoch: [16][1500/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.002)	Loss 7.8743e-01 (7.7939e-01)
Epoch: [16][1550/5005]	Time  0.558 ( 0.561)	Data  0.000 ( 0.

Epoch: [17][ 100/5005]	Time  0.558 ( 0.581)	Data  0.000 ( 0.022)	Loss 6.8795e-01 (7.6921e-01)
Epoch: [17][ 150/5005]	Time  0.559 ( 0.573)	Data  0.000 ( 0.015)	Loss 6.5588e-01 (7.7128e-01)
Epoch: [17][ 200/5005]	Time  0.558 ( 0.570)	Data  0.000 ( 0.011)	Loss 8.2722e-01 (7.6364e-01)
Epoch: [17][ 250/5005]	Time  0.559 ( 0.567)	Data  0.000 ( 0.009)	Loss 8.4938e-01 (7.6351e-01)
Epoch: [17][ 300/5005]	Time  0.558 ( 0.566)	Data  0.000 ( 0.008)	Loss 8.8848e-01 (7.6511e-01)
Epoch: [17][ 350/5005]	Time  0.558 ( 0.565)	Data  0.000 ( 0.007)	Loss 8.0716e-01 (7.6455e-01)
Epoch: [17][ 400/5005]	Time  0.559 ( 0.564)	Data  0.000 ( 0.006)	Loss 7.3322e-01 (7.6700e-01)
Epoch: [17][ 450/5005]	Time  0.560 ( 0.563)	Data  0.000 ( 0.005)	Loss 7.9011e-01 (7.6739e-01)
Epoch: [17][ 500/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.005)	Loss 7.3769e-01 (7.6815e-01)
Epoch: [17][ 550/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.004)	Loss 6.8866e-01 (7.6809e-01)
Epoch: [17][ 600/5005]	Time  0.558 ( 0.563)	Data  0.000 ( 0.

Epoch: [17][4500/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 7.9351e-01 (7.6801e-01)
Epoch: [17][4550/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 7.7050e-01 (7.6780e-01)
Epoch: [17][4600/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 6.4788e-01 (7.6793e-01)
Epoch: [17][4650/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 7.8464e-01 (7.6803e-01)
Epoch: [17][4700/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 8.7276e-01 (7.6803e-01)
Epoch: [17][4750/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 7.8922e-01 (7.6820e-01)
Epoch: [17][4800/5005]	Time  0.559 ( 0.559)	Data  0.000 ( 0.001)	Loss 8.5940e-01 (7.6827e-01)
Epoch: [17][4850/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 7.9251e-01 (7.6839e-01)
Epoch: [17][4900/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 6.8060e-01 (7.6821e-01)
Epoch: [17][4950/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.001)	Loss 7.6958e-01 (7.6836e-01)
Epoch: [17][5000/5005]	Time  0.558 ( 0.559)	Data  0.000 ( 0.

Epoch: [18][3550/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.4037e-01 (7.4643e-01)
Epoch: [18][3600/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.2831e-01 (7.4652e-01)
Epoch: [18][3650/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.8507e-01 (7.4655e-01)
Epoch: [18][3700/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.9964e-01 (7.4646e-01)
Epoch: [18][3750/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 6.7636e-01 (7.4624e-01)
Epoch: [18][3800/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.4982e-01 (7.4631e-01)
Epoch: [18][3850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.5131e-01 (7.4610e-01)
Epoch: [18][3900/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.3028e-01 (7.4616e-01)
Epoch: [18][3950/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.6968e-01 (7.4617e-01)
Epoch: [18][4000/5005]	Time  0.562 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.1373e-01 (7.4607e-01)
Epoch: [18][4050/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.

Epoch: [19][2600/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.8668e-01 (7.2699e-01)
Epoch: [19][2650/5005]	Time  0.561 ( 0.560)	Data  0.000 ( 0.001)	Loss 6.3903e-01 (7.2719e-01)
Epoch: [19][2700/5005]	Time  0.560 ( 0.560)	Data  0.000 ( 0.001)	Loss 6.9846e-01 (7.2707e-01)
Epoch: [19][2750/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.9705e-01 (7.2705e-01)
Epoch: [19][2800/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.6591e-01 (7.2694e-01)
Epoch: [19][2850/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 8.2229e-01 (7.2709e-01)
Epoch: [19][2900/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.6299e-01 (7.2736e-01)
Epoch: [19][2950/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.7997e-01 (7.2766e-01)
Epoch: [19][3000/5005]	Time  0.559 ( 0.560)	Data  0.000 ( 0.001)	Loss 6.3463e-01 (7.2728e-01)
Epoch: [19][3050/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.001)	Loss 7.3583e-01 (7.2742e-01)
Epoch: [19][3100/5005]	Time  0.558 ( 0.560)	Data  0.000 ( 0.