In [1]:
# Notebook-level experiment settings.
# NOTE(review): `seed` is not read by any cell visible here -- the seeding cell
# calls random_seed(47) with a literal instead. Confirm which value is intended.
seed = 10
# Target fraction of zero weights; read as a module-level global by run_dsp().
sparsity = 0.8
# NOTE(review): `width` is not referenced by any visible cell -- possibly a leftover.
width = 32

In [2]:
import os
# Expose only GPU index 1 to this process. This cell runs before the cell that
# imports torch, so the restriction is already in the environment at that point.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
import torch
import torch.nn as nn
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import time
import copy
import sys

import random
import numpy as np
import torch
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import scipy.stats as ss
from timm.data import Mixup
from timm.loss import SoftTargetCrossEntropy
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable

import sys
import numpy as np
import torch.nn.utils.prune as prune
from datautils import *

In [4]:
def random_seed(seed=42, rank=0):
    """Seed the torch, NumPy and stdlib ``random`` generators.

    All three generators receive the same value, ``seed + rank``, so calls
    with different ``rank`` values produce different seeds.
    """
    effective_seed = seed + rank
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(effective_seed)

# Seed every RNG once for the whole notebook.
# NOTE(review): the literal 47 is used here, not the `seed` variable from the first cell.
random_seed(47)

In [5]:
# Build the training and validation loaders used as globals by get_res().
# `get_loaders` is not defined in this notebook; presumably it is provided by the
# star-import of `datautils` -- verify.
# NOTE(review): `path` is an empty string here; confirm how `get_loaders` resolves
# the dataset location in that case.
train_loader, val_loader = get_loaders(
    "imagenet", path="",
    batchsize=256, workers=8,
    nsamples=-1, seed=0,
    noaug=False
)

In [6]:
def train(train_loader, model, criterion, optimizer, scaler, epoch):
    """Run one mixed-precision training pass over ``train_loader``.

    Batches are moved to the current CUDA device, the forward pass and loss run
    under ``torch.cuda.amp.autocast`` and the backward pass / optimizer step go
    through ``scaler`` (a gradient scaler). Progress is printed every 50 batches.

    Args:
        train_loader: iterable of ``(images, target)`` batches; must support ``len``.
        model: network to train; it is switched to train mode.
        criterion: loss callable taking ``(output, target)``.
        optimizer: optimizer stepped through ``scaler``.
        scaler: object providing ``scale``, ``step`` and ``update``.
        epoch: epoch index used in the progress prefix. The special value ``-1``
            makes the pass stop after batch index 50.

    Returns:
        The sample-weighted average loss over the processed batches.
    """
    step_timer = AverageMeter('Time', ':6.3f')
    load_timer = AverageMeter('Data', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    reporter = ProgressMeter(
        len(train_loader),
        [step_timer, load_timer, loss_meter],
        prefix="Epoch: [{}]".format(epoch))

    model.train()

    # epoch == -1 requests the truncated pass (batches 0..50 only).
    truncated_pass = epoch == -1

    tick = time.time()
    for step, (images, target) in enumerate(train_loader):
        # Time spent waiting for the loader since the previous step finished.
        load_timer.update(time.time() - tick)

        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        with torch.cuda.amp.autocast(enabled=True):
            output = model(images)
            loss = criterion(output, target)

        # Only the loss is tracked during training (no top-k accuracy).
        loss_meter.update(loss.item(), images.size(0))

        # Scaled backward pass and optimizer step.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        step_timer.update(time.time() - tick)
        tick = time.time()

        if step % 50 == 0:
            reporter.display(step)
        if truncated_pass and step == 50:
            break

    return loss_meter.avg


def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    The model is put in eval mode and the whole loop runs under
    ``torch.no_grad()``. Loss, top-1 and top-5 accuracy are accumulated per
    sample and a progress line is printed every 50 batches.

    Args:
        val_loader: iterable of ``(images, target)`` batches; must support ``len``.
        model: network to evaluate.
        criterion: loss callable taking ``(output, target)``.

    Returns:
        ``top1.avg`` -- the running mean of the top-1 values produced by
        ``accuracy`` (tensors, so callers use ``.item()`` on the result).
    """
    step_timer = AverageMeter('Time', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    reporter = ProgressMeter(
        len(val_loader),
        [step_timer, loss_meter, top1, top5],
        prefix='Test: ')

    model.eval()

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            n_samples = images.size(0)

            output = model(images)
            loss = criterion(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            loss_meter.update(loss.item(), n_samples)
            top1.update(acc1[0], n_samples)
            top5.update(acc5[0], n_samples)

            step_timer.update(time.time() - tick)
            tick = time.time()

            if step % 50 == 0:
                reporter.display(step)

        # TODO: print a final Acc@1 / Acc@5 summary through the ProgressMeter.

    return top1.avg

In [7]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', best_filename='model_best.pth.tar'):
    """Serialize ``state`` to ``filename`` and optionally keep a "best" copy.

    Args:
        state: any object accepted by ``torch.save``.
        is_best: when true, the freshly written file is also copied to
            ``best_filename``.
        filename: destination of the checkpoint.
        best_filename: destination of the copy made when ``is_best`` is true
            (defaults to the previously hard-coded ``'model_best.pth.tar'``).
    """
    # None of the notebook's explicit import lines bring in `shutil`, so import
    # it here to keep the function usable on a fresh kernel.
    import shutil

    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, best_filename)


class AverageMeter(object):
    """Keeps the most recent value and the running weighted mean of a metric."""

    def __init__(self, name, fmt=':f'):
        # `fmt` is a format spec (including the leading colon) applied to both
        # the latest value and the average when the meter is printed.
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the mean of ``n`` new samples."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Expands to e.g. '{name} {val:6.3f} ({avg:6.3f})' before formatting.
        template = f'{{name}} {{val{self.fmt}}} ({{avg{self.fmt}}})'
        return template.format(**vars(self))


class ProgressMeter(object):
    """Prints one tab-separated line: prefix + batch counter, then every meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the progress line for batch index ``batch``."""
        header = self.prefix + self.batch_fmtstr.format(batch)
        print('\t'.join([header, *map(str, self.meters)]))

    def _get_batch_fmtstr(self, num_batches):
        # Pad the running index to the width of the total, e.g. '[{:3d}/196]'.
        width = len(str(num_batches // 1))
        slot = '{:%dd}' % width
        return '[{}/{}]'.format(slot, slot.format(num_batches))


def adjust_learning_rate(optimizer, epoch, base_lr=None, decay_factor=0.1, decay_every=30):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs

    Args:
        optimizer: object exposing ``param_groups`` (a list of dicts); the
            ``'lr'`` entry of every group is overwritten.
        epoch: current epoch index.
        base_lr: initial learning rate. When omitted, the module-level ``LR``
            constant is used, as before.
        decay_factor: multiplier applied once per completed decay period.
        decay_every: length of a decay period in epochs.

    Raises:
        NameError: if ``base_lr`` is omitted and no global ``LR`` exists.
    """
    if base_lr is None:
        # Legacy path: no visible cell of this notebook defines `LR`, so give a
        # precise error instead of a bare "name 'LR' is not defined".
        try:
            base_lr = LR
        except NameError:
            raise NameError(
                "adjust_learning_rate: pass base_lr=... or define a global `LR`"
            ) from None

    lr = base_lr * (decay_factor ** (epoch // decay_every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k

    Returns a list with one single-element tensor per entry of ``topk``,
    holding the percentage of samples whose target is among the ``k``
    highest-scoring classes.
    """
    with torch.no_grad():
        largest_k = max(topk)
        n_samples = target.size(0)

        # Shape (largest_k, batch): row r holds each sample's rank-r prediction.
        ranked = output.topk(largest_k, dim=1, largest=True, sorted=True)[1].t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / n_samples)
            for k in topk
        ]

In [8]:
def find_other2(A, W, nnz, Z, U, print_sc=None, debug=False, reg=0, rho_start=0.03, iters=5, prune_iters=2):
    """Refit a sparse matrix ``Z`` so that ``A @ Z`` approximates ``W`` (ADMM-style).

    The columns of ``A`` are rescaled to unit norm internally; ``Z`` and ``U``
    are converted to that scaled coordinate system on entry and converted back
    on return. Each iteration projects ``B + U`` onto a magnitude-based mask
    (``Z``), updates ``U`` with the residual ``B - Z`` and re-solves the ridge
    system for ``B``.

    Args:
        A: left factor, shape ``(m, k)``.
        W: target, shape ``(m, n)``.
        nnz: non-zero budget for the result; converted to a sparsity level that
            is capped at 0.99.
        Z: warm-start value of the sparse solution, shape ``(k, n)``.
        U: warm-start value of the accumulated residual, shape ``(k, n)``.
        print_sc: callable used only when ``debug`` is true.
        debug: print per-iteration diagnostics.
        reg: ridge term, as a multiple of the mean diagonal of the normalised
            Gram matrix.
        rho_start: penalty weight of the initial solve that ties ``B`` to the
            warm start ``Z - U``.
        iters: number of iterations (the in-loop penalty weight is fixed to 1).
        prune_iters: the mask is recomputed during the first ``prune_iters``
            iterations and kept fixed afterwards.

    Returns:
        ``(Z, U)`` in the original (un-normalised) coordinates.
    """
    col_norm = torch.diag(A.T.matmul(A)).sqrt() + 1e-8
    row_scale = col_norm.unsqueeze(1)
    An = A / col_norm

    gram = An.T.matmul(An)
    gram += torch.diag(torch.ones_like(gram.diag())) * gram.diag().mean() * reg

    rho = 1
    XY = An.T.matmul(W)
    identity = torch.eye(gram.shape[1], device=gram.device)
    loop_inverse = torch.inverse(gram + identity * rho)
    warm_inverse = torch.inverse(gram + identity * rho_start)

    # Move the warm-start state into the column-normalised coordinates.
    U = U * row_scale
    Z = Z * row_scale

    B = warm_inverse.matmul(XY + rho_start * (Z - U))

    bsparsity = min(0.99, 1 - nnz / B.numel())

    for itt in range(iters):
        candidate = B + U
        if itt < prune_iters:
            cur_sparsity = bsparsity
            magnitude = candidate.abs()
            # Strict '>' against the value at this rank keeps the entries above it only.
            cutoff = magnitude.flatten().sort()[0][int(B.numel() * cur_sparsity)]
            mask = magnitude > cutoff

        Z = candidate * mask
        U = U + (B - Z)
        B = loop_inverse.matmul(XY + rho * (Z - U))

        if debug:
            print(itt, cur_sparsity, (Z != 0).sum().item() / Z.numel())
            print_sc(A.matmul(B / row_scale))
            print_sc(A.matmul(Z / row_scale))
            print(((An != 0).sum() + (Z != 0).sum()) / W.numel())
            print("-------")

    if debug:
        print("opt end")

    return Z / row_scale, U / row_scale
    
def mag_prune(W, sp=0.6):
    """Zero the smallest-magnitude entries of ``W`` (global magnitude pruning).

    The threshold is the value at rank ``int(W.numel() * sp)`` of the sorted
    magnitudes and only entries strictly above it survive, so one entry more
    than ``numel * sp`` is removed when magnitudes are distinct.

    Args:
        W: tensor to prune; it is not modified.
        sp: requested fraction of entries to remove. Values outside ``[0, 1)``
            are clamped: ``sp >= 1`` yields an all-zero result (it used to
            raise IndexError) and ``sp < 0`` behaves like ``sp == 0`` (it used
            to wrap around to the largest magnitudes and remove almost
            everything).

    Returns:
        A new tensor shaped like ``W``.
    """
    total = W.numel()
    if total == 0:
        # Nothing to rank; indexing the empty sort result would raise.
        return W.clone()

    rank = min(max(int(total * sp), 0), total - 1)
    magnitude = W.abs()
    thres = magnitude.flatten().sort()[0][rank]
    return W * (magnitude > thres)

def ent(p):
    """Binary entropy ``-(p*log2(p) + (1-p)*log2(1-p))`` in bits.

    Accepts a scalar or an array. At ``p == 0`` and ``p == 1`` the result is
    0 (the limit value) instead of the NaN that ``0 * log2(0)`` produces.
    """
    p = np.asarray(p, dtype=float)
    # The endpoints trigger log2(0); silence the warnings and patch them below.
    with np.errstate(divide="ignore", invalid="ignore"):
        bits = -(p * np.log2(p) + (1 - p) * np.log2(1 - p))
    bits = np.where((p == 0) | (p == 1), 0.0, bits)
    # Hand scalars back as NumPy scalars, arrays as arrays.
    return bits[()] if bits.ndim == 0 else bits

def factorizeT(W, XX, asp=0.16, sp=0.4, iters=40):
    """Sparse two-factor approximation ``W ~= Az @ Bz`` for the transposed case.

    ``factorizef`` calls this with the transposed weight when the weight has at
    least as many rows as columns. ``Az`` (square, ``W.shape[0]`` per side) and
    ``Bz`` (shaped like ``W``) are refit alternately with ``find_other2``.

    Args:
        W: matrix to factorize (already transposed by the caller).
        XX: only its diagonal is read, to size the (currently disabled)
            normalisation vector.
        asp: non-zero budget of ``Az`` as a fraction of ``W.shape[0]**2``.
        sp: total non-zero budget as a fraction of ``W.numel()``; ``Bz`` gets
            what is left after ``Az``.
        iters: number of alternating rounds.

    Returns:
        ``((Az / norm) @ Bz).T, Bz.T, (Az / norm).T`` -- the approximation and
        the two factors, transposed back to the caller's orientation.
    """
    #W = lx.weight.detach().T.float()
    nza = int(W.shape[0]**2 * asp)
    nzb = int(W.numel() * sp - nza)

    Az = torch.eye(W.shape[0], device=W.device)
    Au = torch.zeros_like(Az)
    # The normalisation is switched off: the vector is immediately replaced by
    # ones of the same shape, so `Wn` equals `W` and `/ norm` is a no-op.
    norm = XX.diag().sqrt().unsqueeze(1) + 1e-8
    norm = torch.ones_like(norm)

    Wn = W * norm

    print("nz", nza, nzb, Wn.shape)
    Bz = mag_prune(Wn, (1 - nzb/W.numel()))
    Bu = torch.zeros_like(Bz)

    # The warm-start penalty ramps cubically from 0 to 1 over the first
    # `iters - 3` rounds. Clamp the ramp length so iters == 3 no longer divides
    # by zero and iters < 3 no longer yields a negative penalty.
    ramp_len = max(1, iters - 3)
    for itt in range(iters):
        rho_start = min(1.0, itt / ramp_len)**3
        # Refit Az through the transposed problem Bz.T @ Az.T ~= Wn.T ...
        Az, Au = (x.T for x in find_other2(Bz.T, Wn.T, nza, Az.T, Au.T, reg=1e-2, debug=False, rho_start=rho_start))

        # ... then refit Bz with the new Az.
        Bz, Bu = find_other2(Az, Wn, nzb, Bz, Bu, reg=1e-2, debug=False, rho_start=rho_start)

    return ((Az / norm).matmul(Bz)).T, Bz.T, (Az / norm).T


def factorizef(W, XX, asp=0.16, sp=0.4, iters=200, l_prev=None):
    """Sparse two-factor approximation ``W ~= Az @ Bz``.

    Matrices with at least as many rows as columns are delegated to
    ``factorizeT`` on the transpose. Otherwise ``Az`` (square, ``W.shape[0]``
    per side, starting from the identity) and ``Bz`` (shaped like ``W``,
    starting from a magnitude-pruned copy) are refit alternately with
    ``find_other2``.

    Args:
        W: matrix to factorize.
        XX: only its diagonal is read, to size the (currently disabled)
            normalisation vector.
        asp: non-zero budget of ``Az`` as a fraction of ``W.shape[0]**2``.
        sp: total non-zero budget as a fraction of ``W.numel()``; ``Bz`` gets
            what is left after ``Az``.
        iters: number of alternating rounds.
        l_prev: accepted for interface compatibility; not used.

    Returns:
        ``(Az @ (Bz / norm), Az, Bz / norm)``.
    """
    if W.shape[0] >= W.shape[1]:
        return factorizeT(W.T, XX, sp=sp, asp=asp, iters=iters)

    nza = int(W.shape[0]**2 * asp)
    nzb = int(W.numel() * sp - nza)
    # The normalisation is switched off: the vector is immediately replaced by
    # ones of the same shape, so `Wn` equals `W` and `/ norm` is a no-op.
    norm = XX.diag().sqrt() + 1e-8
    norm = torch.ones_like(norm)

    Wn = W * norm

    Az = torch.eye(W.shape[0], device=W.device)
    Au = torch.zeros_like(Az)

    print("nz", nza, nzb, Wn.shape)
    Bz = mag_prune(Wn, (1 - nzb/W.numel()))
    Bu = torch.zeros_like(Bz)

    # The warm-start penalty ramps cubically from 0 to 1 over the first
    # `iters - 3` rounds. Clamp the ramp length so iters == 3 no longer divides
    # by zero and iters < 3 no longer yields a negative penalty.
    ramp_len = max(1, iters - 3)
    for itt in range(iters):
        rho_start = min(1.0, itt / ramp_len)**3
        # Refit Az through the transposed problem Bz.T @ Az.T ~= Wn.T ...
        Az, Au = (x.T for x in find_other2(Bz.T, Wn.T, nza, Az.T, Au.T, reg=1e-2, debug=False, rho_start=rho_start))

        # ... then refit Bz with the new Az.
        Bz, Bu = find_other2(Az, Wn, nzb, Bz, Bu, reg=1e-2, debug=False, rho_start=rho_start)

    return Az.matmul(Bz / norm), Az, Bz / norm

def factorize(XX, W, sp, l_prev=None):
    """Factorize ``W`` into two sparse matrices and balance their scales.

    ``factorizef`` is run on a detached float copy of ``W`` with a total
    density of ``sp`` and a first-factor density of ``max(0.05, sp / 2)``.
    The factors are then rescaled in place so that column ``i`` of the first
    and row ``i`` of the second end up with matching norms; their product is
    unaffected apart from rounding.

    Args:
        XX: matrix used to weight the reported error (and forwarded to
            ``factorizef``).
        W: weight matrix to approximate.
        sp: fraction of ``W.numel()`` that both factors together may keep.
        l_prev: forwarded to ``factorizef``.

    Returns:
        ``(W2, (Ab, Bb))`` with ``W2 = Ab @ Bb``.
    """
    W = W.detach().float()
    _, Ab, Bb = factorizef(W, XX, sp=sp, asp=max(0.05, sp/2), l_prev=l_prev)

    a_col_norm = Ab.norm(dim=0) + 1e-12
    b_row_norm = Bb.norm(dim=1) + 1e-12
    # Push each shared index towards the geometric mean of the two norms.
    Ab.mul_((b_row_norm / a_col_norm).sqrt())
    Bb.mul_((a_col_norm / b_row_norm).sqrt().unsqueeze(1))

    W2 = Ab.matmul(Bb)
    residual = W2 - W
    weighted_err = residual.matmul(XX).matmul(residual.T).diag().sum().item()
    print("err_prefin", weighted_err, W.abs().amax().item(), Ab.abs().amax().item(), Bb.abs().amax().item())

    kept_fraction = ((Ab != 0).sum() + (Bb != 0).sum()).item() / W2.numel()
    print("sparsity check", kept_fraction)
    return W2, (Ab, Bb)

In [9]:
def hook(m, *args, **kwargs):
    """Forward pre-hook: rebuild ``m.weight`` from the dense parameter and the mask."""
    m.weight = m.wo * m.mask

def add_mask(m):
    """Freeze the current sparsity pattern of ``m.weight``.

    The weight parameter is re-registered as ``wo``, a 0/1 float ``mask`` of
    its currently non-zero entries is stored as a buffer, and ``weight`` is
    removed from the module's parameters so that the pre-hook can recompute it
    as ``wo * mask`` before every forward call.
    """
    m.register_parameter("wo", m.weight)
    # Store the mask as a plain tensor. Wrapping it in nn.Parameter made it
    # require grad, so every backward pass accumulated a weight-sized `.grad`
    # on it that no optimizer ever read or cleared.
    m.register_buffer("mask", (m.weight.data != 0).to(torch.float32))
    del m._parameters["weight"]
    m.register_forward_pre_hook(hook)


def _build_factored_pair(dense_conv):
    """Build the two-convolution replacement for ``dense_conv`` from its stored factors.

    ``dense_conv.weight.facts`` holds ``(left, right)`` with the flattened dense
    weight approximated by ``left @ right``. The first convolution takes
    ``right`` and inherits the dense layer's kernel size, stride, padding and
    dilation; the second is a 1x1 convolution holding ``left``. Both are
    created without bias, moved to the dense weight's device and wrapped with
    ``add_mask`` so their zero pattern stays fixed.
    """
    left, right = dense_conv.weight.facts
    # Width of the intermediate activation, taken from the factor shapes.
    mid_channels = left.shape[1]
    pair = nn.Sequential(
        nn.Conv2d(dense_conv.in_channels, mid_channels, dense_conv.kernel_size,
                  stride=dense_conv.stride, padding=dense_conv.padding,
                  dilation=dense_conv.dilation, bias=False),
        nn.Conv2d(mid_channels, dense_conv.out_channels, 1, bias=False),
    )
    pair[0].weight.data = right.reshape(pair[0].weight.shape)
    pair[1].weight.data = left.reshape(pair[1].weight.shape)
    pair.to(dense_conv.weight.device)
    add_mask(pair[0])
    add_mask(pair[1])
    return pair


def run_dsp(model, layer_sparsity=None):
    """Replace the convolutions of every Bottleneck block by sparse two-layer factorizations.

    Three passes over ``model.named_modules()``:

    1. every ``nn.Conv2d`` with more than 3 input channels is factorized
       (identity input statistics) at density ``1 - layer_sparsity``; a
       magnitude-pruned copy is computed only for the printed comparison;
    2. each such layer's weight is overwritten with the product of its factors
       and the factors are attached as ``weight.facts``;
    3. in every module whose type name contains "Bottleneck", ``conv1``,
       ``conv2``, ``conv3`` and ``downsample[0]`` (when present) are replaced
       by masked two-convolution ``nn.Sequential`` modules. The dense originals
       stay registered as ``conv1b``, ``conv2b``, ``conv3b`` and ``sb``.

    Args:
        model: network to transform in place.
        layer_sparsity: fraction of weights to remove per layer; defaults to
            the notebook-level ``sparsity`` setting.

    Returns:
        The same ``model`` object.
    """
    if layer_sparsity is None:
        layer_sparsity = sparsity
    density = 1 - layer_sparsity

    out_admm = {}
    for n, m in model.named_modules():
        if type(m) == nn.Conv2d and m.weight.shape[1] > 3:
            w_orig = m.weight.flatten(1)
            w_mag = mag_prune(w_orig, layer_sparsity)
            w_admm, facts = factorize(torch.eye(w_orig.shape[1], device=w_orig.device), w_orig, density)
            out_admm[n] = (w_admm.reshape(w_orig.shape), facts)
            # name, factorization error, magnitude-pruning error, squared weight norm
            print(n, (w_admm - w_orig).square().sum().item(), (w_mag - w_orig).square().sum().item(), w_orig.square().sum().item())

    for n, m in model.named_modules():
        if n in out_admm:
            print("change", n)
            w_admm, facts = out_admm[n]
            m.weight.data = w_admm.reshape(m.weight.shape)
            m.weight.facts = facts

    # The blocks are edited while the generator is suspended on them; their new
    # children are only enumerated after the edit, so this is safe.
    for n, m in model.named_modules():
        if "Bottleneck" not in str(type(m)):
            continue

        # (attribute, whether the shapes are printed) -- conv3 was never printed.
        for attr, announce in (("conv1", True), ("conv2", True), ("conv3", False)):
            dense = getattr(m, attr)
            if announce:
                print(dense.weight.shape, dense.weight.facts[0].shape, dense.weight.facts[1].shape)
            setattr(m, attr + "b", dense)
            setattr(m, attr, _build_factored_pair(dense))

        if m.downsample is not None:
            dense = m.downsample[0]
            print(dense.weight.shape, dense.weight.facts[0].shape, dense.weight.facts[1].shape)
            m.sb = dense
            m.downsample[0] = _build_factored_pair(dense)

    return model

In [10]:
def _active_weight_count(model):
    """Count non-zero weights of the live convolutions with more than 3 input channels.

    Modules whose name contains ``conv1b``, ``conv2b``, ``conv3b`` or ``sb``
    (the dense originals kept by ``run_dsp``) are skipped.
    """
    retired_tags = ("conv2b", "conv1b", "sb", "conv3b")
    active = 0
    for n, m in model.named_modules():
        if type(m) == nn.Conv2d and m.weight.shape[1] > 3 and not any(tag in n for tag in retired_tags):
            active += (m.weight != 0).sum().item()
    return active


def get_res(epochs=20):
    """Sparsify a pretrained ResNet-50 with ``run_dsp`` and fine-tune it.

    Uses the notebook-level ``train_loader`` / ``val_loader``. After the
    factorization the model is validated, run through a truncated training
    pass with a zero learning rate (``epoch=-1``; presumably to refresh the
    BatchNorm statistics, cf. the "start acc bn" label), validated again and
    then fine-tuned for ``epochs`` epochs with SGD and a linearly decaying
    learning rate.

    Returns:
        ``(acc1, state_dict)`` -- the last measured top-1 accuracy and a deep
        copy of the final ``state_dict``.
    """
    from torchvision.models import resnet50
    model = resnet50(pretrained=True)
    model.cuda()

    # Dense parameter count of the layers that will be factorized.
    total_params = sum(
        m.weight.numel()
        for _, m in model.named_modules()
        if type(m) == nn.Conv2d and m.weight.shape[1] > 3
    )
    print("tot", total_params)

    model = run_dsp(model)

    sgd_kwargs = dict(momentum=0.9, nesterov=True, weight_decay=1e-4)
    # lr == 0: the truncated pass below leaves the weights untouched.
    opt0 = torch.optim.SGD(model.parameters(), 0.0, **sgd_kwargs)
    optimizer = torch.optim.SGD(model.parameters(), 0.01, **sgd_kwargs)
    scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=epochs, power=1)
    criterion = nn.CrossEntropyLoss()
    criterion_val = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    print(model, file=sys.stderr)

    acc1 = validate(val_loader, model, criterion_val).item()
    print("start acc no bn", acc1)
    train(train_loader, model, criterion, opt0, scaler, -1)
    acc1 = validate(val_loader, model, criterion_val).item()
    print("start acc bn", acc1, _active_weight_count(model))

    for epoch in range(epochs):
        train_loss = train(train_loader, model, criterion, optimizer, scaler, epoch)
        acc1 = validate(val_loader, model, criterion_val).item()
        scheduler.step()

        total_active = _active_weight_count(model)
        print("epoch", epoch, train_loss, acc1, optimizer.param_groups[0]['lr'], total_active, total_active / total_params)

    return acc1, copy.deepcopy(model.state_dict())

# Run the whole pipeline with the default number of epochs.
# `acc` is the last measured top-1 accuracy; `end` is the deep-copied final state_dict.
acc, end = get_res()

print("acc", acc)



tot 23445504
nz 409 410 torch.Size([64, 64])
err_prefin 1.0651941299438477 0.7266281247138977 0.8041678667068481 0.8065056204795837
sparsity check 0.199462890625
layer1.0.conv1 1.0651941299438477 2.076427459716797 20.62295150756836
nz 409 6963 torch.Size([64, 576])
err_prefin 2.040290594100952 0.46786433458328247 1.0558756589889526 0.4248051643371582
sparsity check 0.1999240451388889
layer1.0.conv2 2.0402908325195312 3.9869112968444824 31.344758987426758
nz 409 2867 torch.Size([64, 256])
err_prefin 1.1036280393600464 0.3936349153518677 0.47855135798454285 0.9105990529060364
sparsity check 0.1998291015625
layer1.0.conv3 1.1036280393600464 2.6578125953674316 20.379261016845703
nz 409 2867 torch.Size([64, 256])
err_prefin 2.736938953399658 0.987881064414978 0.7314780354499817 1.3047504425048828
sparsity check 0.1998291015625
layer1.0.downsample.0 2.736938953399658 5.378649711608887 54.36511993408203
nz 409 2867 torch.Size([64, 256])
err_prefin 1.4511997699737549 0.2617597281932831 0.83343

err_prefin 14.32923698425293 0.2721982002258301 0.7154201865196228 0.41710057854652405
sparsity check 0.19998931884765625
layer3.4.conv1 14.329236030578613 24.23316192626953 86.25032043457031
nz 6553 111411 torch.Size([256, 2304])
err_prefin 24.641550064086914 0.19188867509365082 0.8042526245117188 0.22666381299495697
sparsity check 0.19999525282118055
layer3.4.conv2 24.64154815673828 39.144683837890625 130.9825439453125
nz 6553 45875 torch.Size([256, 1024])
err_prefin 12.692995071411133 0.316133052110672 0.34903326630592346 0.7881821393966675
sparsity check 0.19998931884765625
layer3.4.conv3 12.692994117736816 22.032760620117188 87.05435180664062
nz 6553 45875 torch.Size([256, 1024])
err_prefin 18.37877655029297 0.39949774742126465 0.7821371555328369 0.43698835372924805
sparsity check 0.19998931884765625
layer3.5.conv1 18.3787784576416 29.850234985351562 102.01531982421875
nz 6553 111411 torch.Size([256, 2304])
err_prefin 25.215938568115234 0.2235630750656128 0.8415316343307495 0.2765

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Sequential(
        (0): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Sequential(
        (0): Conv2d(64, 64, kernel_size=(1, 1), stride

Test: [  0/196]	Time  5.105 ( 5.105)	Loss 1.0513e+00 (1.0513e+00)	Acc@1  69.92 ( 69.92)	Acc@5  94.14 ( 94.14)
Test: [ 50/196]	Time  0.430 ( 0.528)	Loss 1.5610e+00 (1.3577e+00)	Acc@1  64.45 ( 66.84)	Acc@5  87.50 ( 88.99)
Test: [100/196]	Time  0.431 ( 0.481)	Loss 2.2911e+00 (1.5393e+00)	Acc@1  42.58 ( 63.93)	Acc@5  77.73 ( 86.24)
Test: [150/196]	Time  0.432 ( 0.465)	Loss 2.4797e+00 (1.7323e+00)	Acc@1  48.83 ( 60.39)	Acc@5  69.53 ( 83.22)
start acc no bn 59.27799987792969
Epoch: [-1][   0/5005]	Time  3.350 ( 3.350)	Data  2.453 ( 2.453)	Loss 8.8577e-01 (8.8577e-01)
Epoch: [-1][  50/5005]	Time  0.636 ( 0.692)	Data  0.000 ( 0.048)	Loss 9.4141e-01 (9.3714e-01)
Test: [  0/196]	Time  3.365 ( 3.365)	Loss 6.2836e-01 (6.2836e-01)	Acc@1  81.25 ( 81.25)	Acc@5  97.27 ( 97.27)
Test: [ 50/196]	Time  0.432 ( 0.490)	Loss 6.3255e-01 (8.2611e-01)	Acc@1  85.55 ( 77.88)	Acc@5  95.31 ( 94.55)
Test: [100/196]	Time  0.434 ( 0.462)	Loss 1.3621e+00 (9.5569e-01)	Acc@1  64.84 ( 75.23)	Acc@5  89.06 ( 93.01)
Test: [1

Epoch: [0][3800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0040e+00 (9.7682e-01)
Epoch: [0][3850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6017e-01 (9.7658e-01)
Epoch: [0][3900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.5589e-01 (9.7658e-01)
Epoch: [0][3950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1872e+00 (9.7686e-01)
Epoch: [0][4000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0438e+00 (9.7689e-01)
Epoch: [0][4050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1677e-01 (9.7706e-01)
Epoch: [0][4100/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4533e-01 (9.7712e-01)
Epoch: [0][4150/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0578e+00 (9.7704e-01)
Epoch: [0][4200/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0298e+00 (9.7718e-01)
Epoch: [0][4250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.7108e-01 (9.7736e-01)
Epoch: [0][4300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1

Epoch: [1][2900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.5072e-01 (9.4554e-01)
Epoch: [1][2950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1253e+00 (9.4584e-01)
Epoch: [1][3000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.1520e-01 (9.4580e-01)
Epoch: [1][3050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1434e-01 (9.4604e-01)
Epoch: [1][3100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0132e+00 (9.4633e-01)
Epoch: [1][3150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0273e+00 (9.4666e-01)
Epoch: [1][3200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1164e+00 (9.4720e-01)
Epoch: [1][3250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0070e+00 (9.4699e-01)
Epoch: [1][3300/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.2939e-01 (9.4681e-01)
Epoch: [1][3350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.3629e-01 (9.4676e-01)
Epoch: [1][3400/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8

Epoch: [2][2000/5005]	Time  0.643 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9436e-01 (9.2615e-01)
Epoch: [2][2050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0446e-01 (9.2628e-01)
Epoch: [2][2100/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6251e-01 (9.2641e-01)
Epoch: [2][2150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0792e+00 (9.2630e-01)
Epoch: [2][2200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8186e-01 (9.2608e-01)
Epoch: [2][2250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.5682e-01 (9.2584e-01)
Epoch: [2][2300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4813e-01 (9.2641e-01)
Epoch: [2][2350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6138e-01 (9.2646e-01)
Epoch: [2][2400/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0520e+00 (9.2665e-01)
Epoch: [2][2450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6660e-01 (9.2666e-01)
Epoch: [2][2500/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9

Epoch: [3][1100/5005]	Time  0.636 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.7582e-01 (9.0950e-01)
Epoch: [3][1150/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.1824e-01 (9.1027e-01)
Epoch: [3][1200/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.3834e-01 (9.1020e-01)
Epoch: [3][1250/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.8642e-01 (9.1207e-01)
Epoch: [3][1300/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.4246e-01 (9.1241e-01)
Epoch: [3][1350/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.3500e-01 (9.1280e-01)
Epoch: [3][1400/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.3417e-01 (9.1317e-01)
Epoch: [3][1450/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.1095e-01 (9.1413e-01)
Epoch: [3][1500/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.1552e-01 (9.1503e-01)
Epoch: [3][1550/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.3601e-01 (9.1503e-01)
Epoch: [3][1600/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9

Epoch: [4][ 200/5005]	Time  0.638 ( 0.649)	Data  0.000 ( 0.012)	Loss 8.8369e-01 (8.8876e-01)
Epoch: [4][ 250/5005]	Time  0.638 ( 0.647)	Data  0.000 ( 0.009)	Loss 8.8810e-01 (8.9360e-01)
Epoch: [4][ 300/5005]	Time  0.637 ( 0.645)	Data  0.000 ( 0.008)	Loss 8.0356e-01 (8.8958e-01)
Epoch: [4][ 350/5005]	Time  0.637 ( 0.644)	Data  0.000 ( 0.007)	Loss 9.7291e-01 (8.8962e-01)
Epoch: [4][ 400/5005]	Time  0.638 ( 0.643)	Data  0.000 ( 0.006)	Loss 8.9406e-01 (8.8908e-01)
Epoch: [4][ 450/5005]	Time  0.637 ( 0.643)	Data  0.000 ( 0.005)	Loss 8.3893e-01 (8.8852e-01)
Epoch: [4][ 500/5005]	Time  0.638 ( 0.642)	Data  0.000 ( 0.005)	Loss 8.3253e-01 (8.8844e-01)
Epoch: [4][ 550/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.004)	Loss 9.3823e-01 (8.8976e-01)
Epoch: [4][ 600/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 9.3140e-01 (8.9108e-01)
Epoch: [4][ 650/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 1.1029e+00 (8.9349e-01)
Epoch: [4][ 700/5005]	Time  0.638 ( 0.641)	Data  0.000 ( 0.004)	Loss 9

Epoch: [4][4650/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4641e-01 (9.1590e-01)
Epoch: [4][4700/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6151e-01 (9.1615e-01)
Epoch: [4][4750/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.9285e-01 (9.1625e-01)
Epoch: [4][4800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0336e+00 (9.1669e-01)
Epoch: [4][4850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8146e-01 (9.1679e-01)
Epoch: [4][4900/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.7186e-01 (9.1689e-01)
Epoch: [4][4950/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.2179e-01 (9.1690e-01)
Epoch: [4][5000/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1062e+00 (9.1722e-01)
Test: [  0/196]	Time  3.458 ( 3.458)	Loss 6.1560e-01 (6.1560e-01)	Acc@1  83.20 ( 83.20)	Acc@5  96.48 ( 96.48)
Test: [ 50/196]	Time  0.434 ( 0.493)	Loss 6.0114e-01 (7.9652e-01)	Acc@1  85.94 ( 78.36)	Acc@5  96.09 ( 94.89)
Test: [100/196]	Time  0.434 ( 0.464)

Epoch: [5][3750/5005]	Time  0.639 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1003e-01 (9.0097e-01)
Epoch: [5][3800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4870e-01 (9.0128e-01)
Epoch: [5][3850/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.5548e-01 (9.0146e-01)
Epoch: [5][3900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0690e-01 (9.0146e-01)
Epoch: [5][3950/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8532e-01 (9.0134e-01)
Epoch: [5][4000/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1261e+00 (9.0185e-01)
Epoch: [5][4050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6820e-01 (9.0223e-01)
Epoch: [5][4100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0206e+00 (9.0248e-01)
Epoch: [5][4150/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6634e-01 (9.0240e-01)
Epoch: [5][4200/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8007e-01 (9.0246e-01)
Epoch: [5][4250/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7

Epoch: [6][2850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0037e-01 (8.8789e-01)
Epoch: [6][2900/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6868e-01 (8.8791e-01)
Epoch: [6][2950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0967e+00 (8.8804e-01)
Epoch: [6][3000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4886e-01 (8.8817e-01)
Epoch: [6][3050/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0245e+00 (8.8826e-01)
Epoch: [6][3100/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6061e-01 (8.8856e-01)
Epoch: [6][3150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0847e-01 (8.8885e-01)
Epoch: [6][3200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.5396e-01 (8.8895e-01)
Epoch: [6][3250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1176e-01 (8.8927e-01)
Epoch: [6][3300/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.3716e-01 (8.8962e-01)
Epoch: [6][3350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7

Epoch: [7][1950/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.1964e-01 (8.7265e-01)
Epoch: [7][2000/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.9517e-01 (8.7280e-01)
Epoch: [7][2050/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 6.8487e-01 (8.7214e-01)
Epoch: [7][2100/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 9.3880e-01 (8.7194e-01)
Epoch: [7][2150/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 9.4441e-01 (8.7194e-01)
Epoch: [7][2200/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.0474e+00 (8.7251e-01)
Epoch: [7][2250/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 8.0664e-01 (8.7243e-01)
Epoch: [7][2300/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 8.8646e-01 (8.7333e-01)
Epoch: [7][2350/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 8.7032e-01 (8.7282e-01)
Epoch: [7][2400/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 9.0584e-01 (8.7241e-01)
Epoch: [7][2450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8

Epoch: [8][1050/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 8.6655e-01 (8.5243e-01)
Epoch: [8][1100/5005]	Time  0.638 ( 0.640)	Data  0.000 ( 0.003)	Loss 7.5820e-01 (8.5243e-01)
Epoch: [8][1150/5005]	Time  0.638 ( 0.640)	Data  0.000 ( 0.002)	Loss 7.7337e-01 (8.5413e-01)
Epoch: [8][1200/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 8.6962e-01 (8.5391e-01)
Epoch: [8][1250/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 7.5204e-01 (8.5403e-01)
Epoch: [8][1300/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0930e+00 (8.5568e-01)
Epoch: [8][1350/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.0301e-01 (8.5676e-01)
Epoch: [8][1400/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.3515e-01 (8.5699e-01)
Epoch: [8][1450/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.2366e-01 (8.5693e-01)
Epoch: [8][1500/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.9438e-01 (8.5733e-01)
Epoch: [8][1550/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8

Epoch: [9][ 150/5005]	Time  0.638 ( 0.655)	Data  0.000 ( 0.018)	Loss 7.7388e-01 (8.3128e-01)
Epoch: [9][ 200/5005]	Time  0.637 ( 0.651)	Data  0.000 ( 0.013)	Loss 9.2806e-01 (8.3385e-01)
Epoch: [9][ 250/5005]	Time  0.638 ( 0.648)	Data  0.000 ( 0.011)	Loss 8.4772e-01 (8.3441e-01)
Epoch: [9][ 300/5005]	Time  0.638 ( 0.646)	Data  0.000 ( 0.009)	Loss 7.4619e-01 (8.3615e-01)
Epoch: [9][ 350/5005]	Time  0.637 ( 0.645)	Data  0.000 ( 0.008)	Loss 8.0234e-01 (8.3785e-01)
Epoch: [9][ 400/5005]	Time  0.638 ( 0.644)	Data  0.000 ( 0.007)	Loss 9.1168e-01 (8.3938e-01)
Epoch: [9][ 450/5005]	Time  0.638 ( 0.643)	Data  0.000 ( 0.006)	Loss 8.6244e-01 (8.4161e-01)
Epoch: [9][ 500/5005]	Time  0.638 ( 0.643)	Data  0.000 ( 0.005)	Loss 8.5830e-01 (8.3891e-01)
Epoch: [9][ 550/5005]	Time  0.638 ( 0.642)	Data  0.000 ( 0.005)	Loss 7.5876e-01 (8.3743e-01)
Epoch: [9][ 600/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.005)	Loss 8.4209e-01 (8.3824e-01)
Epoch: [9][ 650/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.004)	Loss 8

Epoch: [9][4600/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.4384e-01 (8.5745e-01)
Epoch: [9][4650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9468e-01 (8.5777e-01)
Epoch: [9][4700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4392e-01 (8.5807e-01)
Epoch: [9][4750/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6562e-01 (8.5841e-01)
Epoch: [9][4800/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6008e-01 (8.5878e-01)
Epoch: [9][4850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.7299e-01 (8.5885e-01)
Epoch: [9][4900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5658e-01 (8.5899e-01)
Epoch: [9][4950/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.7515e-01 (8.5906e-01)
Epoch: [9][5000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0164e+00 (8.5930e-01)
Test: [  0/196]	Time  3.606 ( 3.606)	Loss 6.0277e-01 (6.0277e-01)	Acc@1  83.20 ( 83.20)	Acc@5  96.88 ( 96.88)
Test: [ 50/196]	Time  0.434 ( 0.496)	Loss 5.2775e-01 

Epoch: [10][3650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0794e-01 (8.4203e-01)
Epoch: [10][3700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1449e-01 (8.4243e-01)
Epoch: [10][3750/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.0175e-01 (8.4262e-01)
Epoch: [10][3800/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.4198e-01 (8.4276e-01)
Epoch: [10][3850/5005]	Time  0.639 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.1420e-01 (8.4312e-01)
Epoch: [10][3900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.1307e-01 (8.4333e-01)
Epoch: [10][3950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.3701e-01 (8.4366e-01)
Epoch: [10][4000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6451e-01 (8.4374e-01)
Epoch: [10][4050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6338e-01 (8.4366e-01)
Epoch: [10][4100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.7548e-01 (8.4367e-01)
Epoch: [10][4150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [11][2700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.4252e-01 (8.1982e-01)
Epoch: [11][2750/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0845e-01 (8.1996e-01)
Epoch: [11][2800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9469e-01 (8.2040e-01)
Epoch: [11][2850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.7019e-01 (8.2025e-01)
Epoch: [11][2900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6894e-01 (8.2037e-01)
Epoch: [11][2950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.9686e-01 (8.2046e-01)
Epoch: [11][3000/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.9584e-01 (8.2098e-01)
Epoch: [11][3050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.9485e-01 (8.2116e-01)
Epoch: [11][3100/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.9534e-01 (8.2133e-01)
Epoch: [11][3150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.7423e-01 (8.2128e-01)
Epoch: [11][3200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [12][1750/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.8599e-01 (8.0490e-01)
Epoch: [12][1800/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.2585e-01 (8.0488e-01)
Epoch: [12][1850/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.4108e-01 (8.0500e-01)
Epoch: [12][1900/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.9772e-01 (8.0529e-01)
Epoch: [12][1950/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.3613e-01 (8.0570e-01)
Epoch: [12][2000/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 8.0488e-01 (8.0664e-01)
Epoch: [12][2050/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 6.5995e-01 (8.0719e-01)
Epoch: [12][2100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6367e-01 (8.0708e-01)
Epoch: [12][2150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.3531e-01 (8.0717e-01)
Epoch: [12][2200/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5730e-01 (8.0734e-01)
Epoch: [12][2250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [13][ 800/5005]	Time  0.638 ( 0.640)	Data  0.000 ( 0.003)	Loss 7.7114e-01 (7.8093e-01)
Epoch: [13][ 850/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 8.8406e-01 (7.8171e-01)
Epoch: [13][ 900/5005]	Time  0.638 ( 0.640)	Data  0.000 ( 0.003)	Loss 1.0437e+00 (7.8210e-01)
Epoch: [13][ 950/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 8.4869e-01 (7.8322e-01)
Epoch: [13][1000/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 8.1496e-01 (7.8335e-01)
Epoch: [13][1050/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 7.2343e-01 (7.8272e-01)
Epoch: [13][1100/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.1341e+00 (7.8341e-01)
Epoch: [13][1150/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.8597e-01 (7.8291e-01)
Epoch: [13][1200/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.3110e-01 (7.8167e-01)
Epoch: [13][1250/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.3331e-01 (7.8141e-01)
Epoch: [13][1300/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.

Test: [100/196]	Time  0.434 ( 0.465)	Loss 1.2843e+00 (8.5941e-01)	Acc@1  62.89 ( 77.54)	Acc@5  89.45 ( 93.92)
Test: [150/196]	Time  0.434 ( 0.455)	Loss 1.1253e+00 (9.7315e-01)	Acc@1  75.78 ( 75.36)	Acc@5  90.23 ( 92.50)
epoch 13 0.7956984436878033 74.4020004272461 0.0030000000000000014 4688969 0.19999437845311407
Epoch: [14][   0/5005]	Time  3.107 ( 3.107)	Data  2.465 ( 2.465)	Loss 8.9047e-01 (8.9047e-01)
Epoch: [14][  50/5005]	Time  0.637 ( 0.686)	Data  0.000 ( 0.049)	Loss 8.6609e-01 (7.7221e-01)
Epoch: [14][ 100/5005]	Time  0.637 ( 0.662)	Data  0.000 ( 0.025)	Loss 6.7207e-01 (7.7990e-01)
Epoch: [14][ 150/5005]	Time  0.637 ( 0.654)	Data  0.000 ( 0.017)	Loss 7.7865e-01 (7.7834e-01)
Epoch: [14][ 200/5005]	Time  0.638 ( 0.650)	Data  0.000 ( 0.012)	Loss 8.4829e-01 (7.7596e-01)
Epoch: [14][ 250/5005]	Time  0.638 ( 0.647)	Data  0.000 ( 0.010)	Loss 8.3656e-01 (7.7490e-01)
Epoch: [14][ 300/5005]	Time  0.637 ( 0.646)	Data  0.000 ( 0.008)	Loss 6.8149e-01 (7.7033e-01)
Epoch: [14][ 350/5005]	Time

Epoch: [14][4200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.8685e-01 (7.7438e-01)
Epoch: [14][4250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.4248e-01 (7.7444e-01)
Epoch: [14][4300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6234e-01 (7.7451e-01)
Epoch: [14][4350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.0747e-01 (7.7462e-01)
Epoch: [14][4400/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.5365e-01 (7.7492e-01)
Epoch: [14][4450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.8695e-01 (7.7505e-01)
Epoch: [14][4500/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5963e-01 (7.7521e-01)
Epoch: [14][4550/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.3392e-01 (7.7532e-01)
Epoch: [14][4600/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.4514e-01 (7.7543e-01)
Epoch: [14][4650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.5981e-01 (7.7533e-01)
Epoch: [14][4700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [15][3250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6953e-01 (7.5239e-01)
Epoch: [15][3300/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.4507e-01 (7.5247e-01)
Epoch: [15][3350/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.8329e-01 (7.5277e-01)
Epoch: [15][3400/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.7836e-01 (7.5260e-01)
Epoch: [15][3450/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.7244e-01 (7.5284e-01)
Epoch: [15][3500/5005]	Time  0.637 ( 0.637)	Data  0.000 ( 0.001)	Loss 6.8272e-01 (7.5288e-01)
Epoch: [15][3550/5005]	Time  0.638 ( 0.637)	Data  0.000 ( 0.001)	Loss 9.0002e-01 (7.5286e-01)
Epoch: [15][3600/5005]	Time  0.639 ( 0.637)	Data  0.000 ( 0.001)	Loss 7.2855e-01 (7.5299e-01)
Epoch: [15][3650/5005]	Time  0.638 ( 0.637)	Data  0.000 ( 0.001)	Loss 7.4898e-01 (7.5313e-01)
Epoch: [15][3700/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 7.8688e-01 (7.5317e-01)
Epoch: [15][3750/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.

Epoch: [16][2300/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 5.6840e-01 (7.3538e-01)
Epoch: [16][2350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.1336e-01 (7.3518e-01)
Epoch: [16][2400/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.1633e-01 (7.3516e-01)
Epoch: [16][2450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5199e-01 (7.3542e-01)
Epoch: [16][2500/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.3613e-01 (7.3528e-01)
Epoch: [16][2550/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.3832e-01 (7.3536e-01)
Epoch: [16][2600/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.0721e-01 (7.3539e-01)
Epoch: [16][2650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 5.8951e-01 (7.3548e-01)
Epoch: [16][2700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.2704e-01 (7.3559e-01)
Epoch: [16][2750/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.7731e-01 (7.3574e-01)
Epoch: [16][2800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [17][1350/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.2281e-01 (7.1093e-01)
Epoch: [17][1400/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 6.7459e-01 (7.1043e-01)
Epoch: [17][1450/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.4311e-01 (7.1077e-01)
Epoch: [17][1500/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 6.4292e-01 (7.1078e-01)
Epoch: [17][1550/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.2949e-01 (7.1033e-01)
Epoch: [17][1600/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.1723e-01 (7.1030e-01)
Epoch: [17][1650/5005]	Time  0.639 ( 0.639)	Data  0.000 ( 0.002)	Loss 6.3143e-01 (7.1073e-01)
Epoch: [17][1700/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 6.7572e-01 (7.1092e-01)
Epoch: [17][1750/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.3542e-01 (7.1076e-01)
Epoch: [17][1800/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 5.9880e-01 (7.1117e-01)
Epoch: [17][1850/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.

Epoch: [18][ 400/5005]	Time  0.637 ( 0.643)	Data  0.000 ( 0.006)	Loss 6.8891e-01 (6.9884e-01)
Epoch: [18][ 450/5005]	Time  0.636 ( 0.642)	Data  0.000 ( 0.005)	Loss 7.1983e-01 (6.9980e-01)
Epoch: [18][ 500/5005]	Time  0.636 ( 0.642)	Data  0.000 ( 0.005)	Loss 6.8125e-01 (7.0036e-01)
Epoch: [18][ 550/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.005)	Loss 6.2714e-01 (6.9946e-01)
Epoch: [18][ 600/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 5.8292e-01 (6.9743e-01)
Epoch: [18][ 650/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 6.0997e-01 (6.9823e-01)
Epoch: [18][ 700/5005]	Time  0.636 ( 0.640)	Data  0.000 ( 0.004)	Loss 6.0047e-01 (6.9731e-01)
Epoch: [18][ 750/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 6.9994e-01 (6.9688e-01)
Epoch: [18][ 800/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 7.2709e-01 (6.9649e-01)
Epoch: [18][ 850/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 7.6085e-01 (6.9571e-01)
Epoch: [18][ 900/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.

Epoch: [18][4800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 5.7723e-01 (6.9339e-01)
Epoch: [18][4850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.1676e-01 (6.9354e-01)
Epoch: [18][4900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.7415e-01 (6.9372e-01)
Epoch: [18][4950/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.4615e-01 (6.9368e-01)
Epoch: [18][5000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.8228e-01 (6.9366e-01)
Test: [  0/196]	Time  3.486 ( 3.486)	Loss 5.2796e-01 (5.2796e-01)	Acc@1  83.59 ( 83.59)	Acc@5  98.05 ( 98.05)
Test: [ 50/196]	Time  0.434 ( 0.493)	Loss 4.0284e-01 (6.8708e-01)	Acc@1  90.62 ( 81.84)	Acc@5  98.05 ( 95.83)
Test: [100/196]	Time  0.434 ( 0.464)	Loss 1.1969e+00 (8.1818e-01)	Acc@1  66.80 ( 78.68)	Acc@5  90.62 ( 94.58)
Test: [150/196]	Time  0.434 ( 0.454)	Loss 1.1284e+00 (9.2851e-01)	Acc@1  77.73 ( 76.53)	Acc@5  89.84 ( 93.21)
epoch 18 0.6936557817617771 75.56600189208984 0.0005000000000000008 4688969 0.199994378453

Epoch: [19][3850/5005]	Time  0.638 ( 0.637)	Data  0.000 ( 0.001)	Loss 7.5847e-01 (6.7714e-01)
Epoch: [19][3900/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 6.1058e-01 (6.7694e-01)
Epoch: [19][3950/5005]	Time  0.635 ( 0.637)	Data  0.000 ( 0.001)	Loss 7.7047e-01 (6.7679e-01)
Epoch: [19][4000/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 6.0957e-01 (6.7657e-01)
Epoch: [19][4050/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 8.2115e-01 (6.7661e-01)
Epoch: [19][4100/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 7.4113e-01 (6.7662e-01)
Epoch: [19][4150/5005]	Time  0.637 ( 0.637)	Data  0.000 ( 0.001)	Loss 6.9383e-01 (6.7668e-01)
Epoch: [19][4200/5005]	Time  0.637 ( 0.637)	Data  0.000 ( 0.001)	Loss 5.4988e-01 (6.7670e-01)
Epoch: [19][4250/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 7.0203e-01 (6.7641e-01)
Epoch: [19][4300/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 6.7986e-01 (6.7651e-01)
Epoch: [19][4350/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.