In [1]:
# Notebook-level configuration.
seed = 10  # NOTE(review): not referenced by the visible cells (random_seed is later called with the literal 47) - confirm intent
sparsity = 0.9  # fraction of each eligible conv weight to remove; read as a global inside run_dsp
width = 32  # NOTE(review): not referenced anywhere in the visible part of the notebook

In [2]:
import os
# Restrict this process to the GPU with index 1. This cell runs before the cell
# that imports torch, so the setting is in place before any CUDA initialisation.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
import torch
import torch.nn as nn
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import time
import copy
import sys

import random
import numpy as np
import torch
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import scipy.stats as ss
from timm.data import Mixup
from timm.loss import SoftTargetCrossEntropy
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable

import sys
import numpy as np
import torch.nn.utils.prune as prune
from datautils import *

In [4]:
def random_seed(seed=42, rank=0):
    """Seed the torch, NumPy and stdlib ``random`` generators with ``seed + rank``.

    Args:
        seed: base seed value.
        rank: offset added to ``seed`` so different workers can get distinct streams.
    """
    effective_seed = seed + rank
    # Same order as before: torch first, then NumPy, then the stdlib generator.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(effective_seed)


# NOTE(review): the config cell defines `seed = 10`, but the literal 47 is used here - confirm which is intended.
random_seed(47)

In [5]:
# Build the training and validation loaders that get_res() later reads as globals.
# get_loaders is not defined in this notebook - presumably it comes from the
# `from datautils import *` star import; verify there what `path=""`,
# `nsamples=-1` and `noaug=False` mean.
train_loader, val_loader = get_loaders(
    "imagenet", path="",
    batchsize=256, workers=8,
    nsamples=-1, seed=0,
    noaug=False
)

In [6]:
def train(train_loader, model, criterion, optimizer, scaler, epoch):
    """Run one mixed-precision training pass over ``train_loader``.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to optimise; switched to train mode here.
        criterion: loss callable applied to ``(output, target)``.
        optimizer: optimizer stepped through ``scaler``.
        scaler: gradient scaler used for the scaled backward pass and the step.
        epoch: epoch index used in the progress prefix. The special value ``-1``
            stops the pass right after the batch with index 50.

    Returns:
        The sample-weighted average loss over the processed batches.
    """
    step_timer = AverageMeter('Time', ':6.3f')
    load_timer = AverageMeter('Data', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    reporter = ProgressMeter(
        len(train_loader),
        [step_timer, load_timer, loss_meter],
        prefix="Epoch: [{}]".format(epoch))

    model.train()
    short_pass = (epoch == -1)

    tick = time.time()
    for step, (images, target) in enumerate(train_loader):
        # time spent waiting for the loader
        load_timer.update(time.time() - tick)

        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward pass under autocast
        with torch.cuda.amp.autocast(enabled=True):
            output = model(images)
            loss = criterion(output, target)

        loss_meter.update(loss.item(), images.size(0))

        # scaled backward pass and optimizer step
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # wall-clock time for the whole step
        step_timer.update(time.time() - tick)
        tick = time.time()

        if step % 50 == 0:
            reporter.display(step)
        if short_pass and step == 50:
            break

    return loss_meter.avg


def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and report loss and top-1/top-5 accuracy.

    Args:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss callable applied to ``(output, target)``.

    Returns:
        The running top-1 accuracy (``AverageMeter.avg`` of the Acc@1 meter).
    """
    step_timer = AverageMeter('Time', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    acc1_meter = AverageMeter('Acc@1', ':6.2f')
    acc5_meter = AverageMeter('Acc@5', ':6.2f')
    reporter = ProgressMeter(
        len(val_loader),
        [step_timer, loss_meter, acc1_meter, acc5_meter],
        prefix='Test: ')

    model.eval()

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            batch = images.size(0)

            output = model(images)
            loss = criterion(output, target)

            # accumulate loss and accuracies, weighted by batch size
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            loss_meter.update(loss.item(), batch)
            acc1_meter.update(acc1[0], batch)
            acc5_meter.update(acc5[0], batch)

            step_timer.update(time.time() - tick)
            tick = time.time()

            if step % 50 == 0:
                reporter.display(step)

        # TODO: the final summary should also be reported through the ProgressMeter

    return acc1_meter.avg

In [7]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename`` and optionally keep a copy as the best model.

    Args:
        state: object passed to ``torch.save``.
        is_best: when truthy, the saved file is also copied to
            ``'model_best.pth.tar'`` in the current working directory.
        filename: destination path of the checkpoint.
    """
    # Local import: shutil is not imported explicitly anywhere in the notebook,
    # so the is_best branch would otherwise raise NameError.
    import shutil

    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keeps the latest value of a metric together with its weighted running mean."""

    def __init__(self, name, fmt=':f'):
        # `name` labels the metric; `fmt` is the format spec applied to val and avg.
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Zero the latest value, the mean, the weighted sum and the sample count."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Produces e.g. '{name} {val:6.2f} ({avg:6.2f})', then fills it from the instance attributes.
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**vars(self))


class ProgressMeter(object):
    """Prints one tab-separated status line: a batch counter followed by each meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the prefix, the ``[batch/total]`` counter and ``str()`` of every meter."""
        header = self.prefix + self.batch_fmtstr.format(batch)
        print('\t'.join([header, *map(str, self.meters)]))

    def _get_batch_fmtstr(self, num_batches):
        # Pad the running index to the width of the total, e.g. '[{:3d}/196]'.
        width = len(str(num_batches // 1))
        slot = '{:%dd}' % width
        return '[%s/%s]' % (slot, slot.format(num_batches))


def adjust_learning_rate(optimizer, epoch, base_lr=None):
    """Set every param group's LR to the base LR decayed by 10 every 30 epochs.

    Args:
        optimizer: object exposing ``param_groups`` (a list of dicts with an ``'lr'`` key).
        epoch: current epoch index; the decay exponent is ``epoch // 30``.
        base_lr: undecayed learning rate. When ``None`` the module-level ``LR``
            global is used, as before. That global is not defined in the visible
            notebook, so callers should normally pass ``base_lr``.

    Raises:
        NameError: if ``base_lr`` is ``None`` and no global ``LR`` exists.
    """
    if base_lr is None:
        base_lr = LR  # legacy behaviour: fall back to the global
    lr = base_lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Return top-k accuracy percentages, one single-element tensor per entry of ``topk``.

    Args:
        output: 2-D score tensor; top-k is taken along dimension 1.
        target: ground-truth class indices, one per row of ``output``.
        topk: iterable of k values to evaluate.
    """
    with torch.no_grad():
        deepest = max(topk)
        to_percent = 100.0 / target.size(0)

        # (deepest, batch) matrix of predicted classes, best guess in row 0
        ranked = output.topk(deepest, 1, True, True)[1].t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(to_percent)
            for k in topk
        ]

In [8]:
def find_other2(A, W, nnz, Z, U, print_sc=None, debug=False, reg=0, rho_start=0.03, iters=5, prune_iters=2):
    """Refine a sparse factor Z (and its running residual U) so that ``A @ Z`` approximates ``W``.

    The loop alternates three steps: a ridge-regularised least-squares solve for
    a dense ``B``, a magnitude-threshold projection of ``B + U`` onto a fixed
    sparsity pattern giving ``Z``, and the accumulation ``U += B - Z``. This has
    the shape of an ADMM iteration (the caller stores results under ``admm`` names).

    From the matmuls below: ``A`` is (m, k), ``W`` is (m, n), ``Z`` and ``U`` are (k, n).

    Args:
        A: fixed left factor.
        W: target matrix.
        nnz: number of non-zeros targeted for Z. The enforced sparsity is
            ``min(0.99, 1 - nnz / numel)``, so at most 99% of entries are removed.
        Z, U: warm-start values in the caller's (un-normalised) scale.
        print_sc: callable used only when ``debug`` is true; it must be provided in that case.
        debug: print per-iteration diagnostics.
        reg: multiple of the mean Gram diagonal added to the diagonal of the Gram matrix.
        rho_start: penalty weight used for the initial warm-start solve only.
        iters: number of iterations.
        prune_iters: the sparsity mask is recomputed during the first ``prune_iters``
            iterations and frozen afterwards. With ``prune_iters == 0`` ``mask`` is
            never assigned and the first use raises NameError.

    Returns:
        ``(Z, U)`` rescaled back to the caller's scale.
    """
    XX = A.T.matmul(A)
    # Column norms of A (plus epsilon); A is rescaled so that every column has unit norm.
    norm2 = torch.diag(XX).sqrt() + 1e-8
    An = A / norm2
    XX = An.T.matmul(An)
    # Diagonal (ridge) regularisation proportional to the mean diagonal entry.
    XX += torch.diag(torch.ones_like(XX.diag())) * XX.diag().mean() * reg
    
    #norm2 = torch.ones_like(norm2)
    Wnn = W# * norm2.unsqueeze(1)
    # Penalty weight used inside the loop; rho_start is used only for the warm-start solve.
    rho = 1
    XY = An.T.matmul(Wnn)
    # Both system matrices are inverted once up front and reused by every solve below.
    XXinv = torch.inverse(XX + torch.eye(XX.shape[1], device=XX.device)*rho)
    XXinv2 = torch.inverse(XX + torch.eye(XX.shape[1], device=XX.device)*rho_start)
    # Move the warm-start variables into the column-normalised coordinates
    # (rows of Z/U correspond to columns of A).
    U = U * norm2.unsqueeze(1)
    Z = Z * norm2.unsqueeze(1)
    
    #B = torch.linalg.solve(XX, XY)
    # Warm-start dense solution, pulled towards Z - U with weight rho_start.
    B = XXinv2.matmul(XY + rho_start*(Z-U))
    
    #U = torch.zeros_like(B)
    
    #Z = B
    
    # Fraction of entries to zero out, capped at 0.99.
    bsparsity = min(0.99, 1 - nnz/B.numel())
    #print("bs", bsparsity)


    for itt in range(iters):
        if itt < prune_iters:
            cur_sparsity = bsparsity# - bsparsity * (1 - (itt + 1) / iterative_prune) ** 3
            # Threshold = the |B + U| value at the target quantile; only entries
            # strictly above it are kept.
            thres = (B+U).abs().flatten().sort()[0][int(B.numel() * cur_sparsity)]
            mask = ((B+U).abs() > thres)
            del thres

        # Projection onto the current sparsity pattern.
        Z = (B + U) * mask    

        # Accumulate the gap between the dense and the sparse iterate.
        U = U + (B - Z)    

        # Dense least-squares update with penalty rho pulling B towards Z - U.
        B = XXinv.matmul(XY + rho*(Z-U))
        #B = torch.linalg.solve(XX + torch.eye(XX.shape[1], device=XX.device)*rho, XY + rho*(Z-U))
        if debug:
            print(itt, cur_sparsity, (Z != 0).sum().item() / Z.numel())
            print_sc(A.matmul(B / norm2.unsqueeze(1)))
            print_sc(A.matmul(Z / norm2.unsqueeze(1)))
            print(((An != 0).sum() + (Z != 0).sum()) / W.numel())
            print("-------")
    if debug:
        print("opt end")

    # Undo the column normalisation before handing the variables back.
    return Z / norm2.unsqueeze(1), U / norm2.unsqueeze(1)    
    
def mag_prune(W, sp=0.6):
    """Zero out the smallest-magnitude entries of ``W``.

    The threshold is the value at position ``int(W.numel() * sp)`` of the sorted
    absolute values; only entries strictly greater than it are kept.

    Args:
        W: tensor to prune (not modified).
        sp: target fraction of entries to remove. Values outside ``[0, 1]`` are
            tolerated: the threshold position is clamped to a valid index, so
            ``sp >= 1`` returns an all-zero tensor instead of raising IndexError
            and a negative ``sp`` no longer wraps around to the largest entries.

    Returns:
        A new tensor with the same shape as ``W``.
    """
    total = W.numel()
    if total == 0:
        # Nothing to rank; return an independent empty tensor.
        return W.clone()
    magnitudes = W.abs()
    cut = min(max(int(total * sp), 0), total - 1)
    thres = magnitudes.flatten().sort()[0][cut]
    return W * (magnitudes > thres)

def ent(p):
    """Binary entropy in bits: ``-(p*log2(p) + (1-p)*log2(1-p))``.

    Args:
        p: probability, as a scalar or array-like.

    Returns:
        A NumPy float for scalar input, or an array of the same shape otherwise.
        The end points ``p == 0`` and ``p == 1`` return 0 (the limit value)
        rather than the NaN that ``0 * log2(0)`` would produce.
    """
    p = np.asarray(p, dtype=float)
    # Silence the log-of-zero warnings; those positions are overwritten below.
    with np.errstate(divide="ignore", invalid="ignore"):
        h = -(p * np.log2(p) + (1 - p) * np.log2(1 - p))
    # Indexing with () turns a 0-d result back into a NumPy scalar.
    return np.where((p == 0) | (p == 1), 0.0, h)[()]

def factorizeT(W, XX, asp=0.16, sp=0.4, iters=40):
    """Alternating sparse factorisation ``W ~= Az @ Bz``; the results are returned transposed.

    ``factorizef`` calls this with ``W.T`` when its input has at least as many
    rows as columns, and the transposes in the return value hand back factors
    for that original orientation.

    Args:
        W: 2-D tensor to factorise.
        XX: square matrix; only its diagonal is read, to build ``norm``. ``norm``
            is then overwritten with ones, so XX currently affects only the
            shape, dtype and device of ``norm``.
        asp: fraction of the ``W.shape[0]**2`` entries of ``Az`` allowed to be non-zero.
        sp: total non-zero budget as a fraction of ``W.numel()``; ``Bz`` gets
            whatever remains after ``Az``'s share.
        iters: number of alternating rounds. ``rho_start`` divides by ``iters - 3``,
            so ``iters == 3`` raises ZeroDivisionError.

    Returns:
        ``(W2, left, right)`` with ``W2 = (Az/norm @ Bz).T``, ``left = Bz.T`` and
        ``right = (Az/norm).T``, so that ``W2 == left @ right``.
    """
    #W = lx.weight.detach().T.float()
    # Non-zero budgets for the square factor Az and the rectangular factor Bz.
    nza = int(W.shape[0]**2 * asp)
    nzb = int(W.numel() * sp - nza)
    
    # Az starts as the identity, Bz (below) as the magnitude-pruned target.
    Az = torch.eye(W.shape[0], device=W.device)
    Au = torch.zeros_like(Az)
    norm = XX.diag().sqrt().unsqueeze(1) + 1e-8
    # The scaling is disabled: norm is replaced by ones.
    norm = torch.ones_like(norm)
       
    Wn = W * norm
       
    print("nz", nza, nzb, Wn.shape)
    Bz = mag_prune(Wn, (1 - nzb/W.numel()))
    Bu = torch.zeros_like(Bz)
    
    for itt in range(iters):
        #if itt < 10:
        #    rho_start = 0.0
        #elif itt < 15:
        #    rho_start = 0.00
        #else:
        #    rho_start = 0.1
        # Cubic ramp of the warm-start weight: 0 in the first round, capped at 1.
        rho_start = min(1.0, itt / (iters-3))**3
        # Update Az with Bz fixed by solving the transposed problem Bz.T @ Az.T ~= Wn.T.
        Az, Au = (x.T for x in find_other2(Bz.T, Wn.T, nza, Az.T, Au.T, reg=1e-2, debug=False, rho_start=rho_start))
                
        # Update Bz with Az fixed: Az @ Bz ~= Wn.
        Bz, Bu = find_other2(Az, Wn, nzb, Bz, Bu, reg=1e-2, debug=False, rho_start=rho_start)
    
    #print(((Az != 0).sum() + (Bz != 0).sum()).item() / W.numel(), (Az != 0).sum().item() / Az.numel(),
    #      (Bz != 0).sum().item() / Bz.numel(), Az.shape, Bz.shape,
    #     (Az.numel()*ent((Az != 0).sum().item() / Az.numel()) + Bz.numel()*ent((Bz != 0).sum().item() / Bz.numel())) / W.numel(), 
    #    ent(0.4), ent(0.5))
    return ((Az / norm).matmul(Bz)).T, Bz.T, (Az / norm).T


def factorizef(W, XX, asp=0.16, sp=0.4, iters=200, l_prev=None):
    """Alternating sparse factorisation ``W ~= Az @ Bz`` with a square left factor.

    When ``W`` has at least as many rows as columns the work is delegated to
    ``factorizeT`` on ``W.T``, which returns factors for the original orientation.

    Args:
        W: 2-D tensor to factorise.
        XX: square matrix; only its diagonal is read, to build ``norm``. ``norm``
            is then overwritten with ones, so XX currently affects only the
            shape, dtype and device of ``norm``.
        asp: fraction of the ``W.shape[0]**2`` entries of ``Az`` allowed to be non-zero.
        sp: total non-zero budget as a fraction of ``W.numel()``; ``Bz`` gets
            whatever remains after ``Az``'s share.
        iters: number of alternating rounds. ``rho_start`` divides by ``iters - 3``,
            so ``iters == 3`` raises ZeroDivisionError.
        l_prev: accepted but not used in this function.

    Returns:
        ``(W2, Az, Bz / norm)`` with ``W2 = Az @ (Bz / norm)``.
    """
    # NOTE: s_time is only referenced by the commented-out timing print further down.
    s_time = time.time()
    if W.shape[0] >= W.shape[1]:
        return factorizeT(W.T, XX, sp=sp, asp=asp, iters=iters)
    
    # Non-zero budgets for the square factor Az and the rectangular factor Bz.
    nza = int(W.shape[0]**2 * asp)
    nzb = int(W.numel() * sp - nza)
    norm = XX.diag().sqrt() + 1e-8
    # The scaling is disabled: norm is replaced by ones.
    norm = torch.ones_like(norm)

    Wn = W * norm
    
    # Az starts as the identity, Bz (below) as the magnitude-pruned target.
    Az = torch.eye(W.shape[0], device=W.device)
    Au = torch.zeros_like(Az)

    print("nz", nza, nzb, Wn.shape)
    Bz = mag_prune(Wn, (1 - nzb/W.numel()))
    Bu = torch.zeros_like(Bz)
    
    for itt in range(iters):
        #if itt < 10:
        #    rho_start = 0.0
        #elif itt < 15:
        #    rho_start = 0.00
        #else:
        #    rho_start = 0.1
            
        # Cubic ramp of the warm-start weight: 0 in the first round, capped at 1.
        rho_start = min(1.0, itt / (iters-3))**3
        # Update Az with Bz fixed by solving the transposed problem Bz.T @ Az.T ~= Wn.T.
        Az, Au = (x.T for x in find_other2(Bz.T, Wn.T, nza, Az.T, Au.T, reg=1e-2, debug=False, rho_start=rho_start))
                
        # Update Bz with Az fixed: Az @ Bz ~= Wn.
        Bz, Bu = find_other2(Az, Wn, nzb, Bz, Bu, reg=1e-2, debug=False, rho_start=rho_start)
        
        #print(itt, time.time() - s_time, end =" ") 
        #print_scores(Az.matmul(Bz / norm))
        
        
    #print(((Az != 0).sum() + (Bz != 0).sum()).item() / W.numel(), (Az != 0).sum().item() / Az.numel(),
    #      (Bz != 0).sum().item() / Bz.numel(), Az.shape, Bz.shape,
    #     (Az.numel()*ent((Az != 0).sum().item() / Az.numel()) + Bz.numel()*ent((Bz != 0).sum().item() / Bz.numel())) / W.numel(), 
    #    ent(0.4), ent(0.5))
    return Az.matmul(Bz / norm), Az, Bz / norm

def factorize(XX, W, sp, l_prev=None):
    """Factorise ``W`` into two sparse factors and balance their scales.

    Args:
        XX: square matrix forwarded to ``factorizef`` and used to weight the printed error.
        W: weight tensor; detached and cast to float32 before use.
        sp: total non-zero budget as a fraction of ``W.numel()``. The left
            factor's own budget is ``max(0.05, sp / 2)``.
        l_prev: forwarded unchanged to ``factorizef``.

    Returns:
        ``(W2, (Ab, Bb))`` where ``W2 = Ab @ Bb`` is the reconstruction.
    """
    target = W.detach().float()
    _, left, right = factorizef(target, XX, sp=sp, asp=max(0.05, sp/2), l_prev=l_prev)

    # Per-component norms: columns of the left factor, rows of the right factor.
    left_norm = left.norm(dim=0) + 1e-12
    right_norm = right.norm(dim=1) + 1e-12

    # Shift scale between the two factors in place; the product left @ right is
    # unchanged up to rounding.
    left *= (right_norm/left_norm).sqrt()
    right *= (left_norm/right_norm).sqrt().unsqueeze(1)

    approx = left.matmul(right)
    residual = approx - target
    weighted_err = residual.matmul(XX).matmul(residual.T).diag().sum().item()
    print("err_prefin", weighted_err, target.abs().amax().item(), left.abs().amax().item(), right.abs().amax().item())

    nonzero_total = ((left != 0).sum() + (right != 0).sum()).item()
    print("sparsity check", nonzero_total / approx.numel())
    return approx, (left, right)

In [9]:
def hook(m, *args, **kwargs):
    """Forward pre-hook: rebuild ``m.weight`` as the dense weights times the fixed 0/1 mask."""
    m.weight = m.wo * m.mask


def add_mask(m):
    """Freeze the current sparsity pattern of ``m.weight`` for all later forward passes.

    The trainable tensor is re-registered under the name ``wo``, a float 0/1
    buffer ``mask`` marks the entries of the weight that are currently non-zero,
    the ``weight`` parameter entry is removed, and ``hook`` is installed so that
    ``m.weight`` is recomputed as ``wo * mask`` before every forward call.

    Args:
        m: module with a ``weight`` parameter (modified in place).
    """
    m.register_parameter("wo", m.weight)
    # Register a plain tensor: wrapping the mask in nn.Parameter made the buffer
    # require grad, so every backward pass accumulated a .grad on it that no
    # optimizer ever cleared.
    m.register_buffer("mask", (m.weight.data != 0).to(torch.float32))
    del m._parameters["weight"]
    m.register_forward_pre_hook(hook)


def run_dsp(model):
    """Replace the conv layers inside every Bottleneck block by two masked sparse convs.

    Works in three passes over ``model.named_modules()``:

    1. Every module of exact type ``nn.Conv2d`` with more than 3 input channels
       has its weight flattened to 2-D and factorised by ``factorize`` with an
       identity ``XX`` and a density of ``1 - sparsity`` (``sparsity`` is a
       notebook-level global). A magnitude-pruned copy is computed only for the
       printed error comparison.
    2. Each factorised layer's weight data is overwritten with the reconstructed
       product, and the factor pair is attached as ``m.weight.facts``.
    3. In every module whose type name contains "Bottleneck", ``conv1``,
       ``conv2``, ``conv3`` and ``downsample[0]`` are replaced by an
       ``nn.Sequential`` of two bias-free convs initialised from ``facts[1]``
       (first conv) and ``facts[0]`` (second conv), moved to CUDA and passed to
       ``add_mask``. The original convs stay attached as ``conv1b`` / ``conv2b`` /
       ``conv3b`` / ``sb``, so they remain registered submodules.

    Args:
        model: network to convert; modified in place.

    Returns:
        The same ``model`` object.
    """
    out_admm = {}
    for n, m in model.named_modules():
        # Exact type match; the `> 3` test skips convs with 3 or fewer input channels.
        if type(m) == nn.Conv2d and m.weight.shape[1] > 3:
            density = 1 - sparsity
            w_orig = m.weight.flatten(1)
            # Magnitude-pruning baseline, used only in the print below.
            w_mag = mag_prune(w_orig, sparsity)
            w_admm, facts = factorize(torch.eye(w_orig.shape[1], device=w_orig.device), w_orig, density)
            out_admm[n] = (w_admm.reshape(w_orig.shape), facts)
            # name, squared error of the factorisation, squared error of the baseline, squared norm of the original
            print(n, (w_admm - w_orig).square().sum().item(), (w_mag - w_orig).square().sum().item(), w_orig.square().sum().item())
            #m.XX = None

    for n, m in model.named_modules():
        if n in out_admm:
            print("change", n)
            m.weight.data = out_admm[n][0].reshape(m.weight.shape)
            # Stash the factor pair on the parameter object for the pass below.
            m.weight.facts = out_admm[n][1]
    
    # The module tree is edited while named_modules() is being consumed; each
    # Bottleneck is rewritten at the moment it is yielded.
    for n, m in model.named_modules():
        if "Bottleneck" in str(type(m)):
            print(m.conv1.weight.shape, m.conv1.weight.facts[0].shape, m.conv1.weight.facts[1].shape)
            if True:
                ff = m.conv1.weight.facts
                m.conv1b = m.conv1
                # conv1: in -> out, followed by out -> out.
                m.conv1 = nn.Sequential(
                    nn.Conv2d(m.conv1b.in_channels, m.conv1b.out_channels, 1, bias=False),
                    nn.Conv2d(m.conv1b.out_channels, m.conv1b.out_channels, 1, bias=False)
                )
                m.conv1[0].weight.data = ff[1].reshape(m.conv1[0].weight.shape)
                m.conv1[1].weight.data = ff[0].reshape(m.conv1[1].weight.shape)
                m.conv1.cuda()
                add_mask(m.conv1[0])
                add_mask(m.conv1[1])
                
            print(m.conv2.weight.shape, m.conv2.weight.facts[0].shape, m.conv2.weight.facts[1].shape)
            
            if True:
                ff = m.conv2.weight.facts
                m.conv2b = m.conv2
                # conv2: kernel size 3 and padding 1 are hard-coded; only the stride is
                # copied from the original conv.
                # NOTE(review): assumes the original conv2 is 3x3 with padding 1 and no
                # dilation/groups - confirm before reusing on other architectures.
                m.conv2 = nn.Sequential(
                    nn.Conv2d(m.conv2b.in_channels, m.conv2b.out_channels, 3, padding=1, stride=m.conv2b.stride, bias=False),
                    nn.Conv2d(m.conv2b.out_channels, m.conv2b.out_channels, 1, bias=False)
                )
                #m.conv2[0].register_forward_hook(boo)
                m.conv2[0].weight.data = ff[1].reshape(m.conv2[0].weight.shape)
                m.conv2[1].weight.data = ff[0].reshape(m.conv2[1].weight.shape)
                m.conv2.cuda()
                add_mask(m.conv2[0])
                add_mask(m.conv2[1])
                
            if True:
                ff = m.conv3.weight.facts
                m.conv3b = m.conv3
                # conv3: in -> in, followed by in -> out.
                m.conv3 = nn.Sequential(
                    nn.Conv2d(m.conv3b.in_channels, m.conv3b.in_channels, 1, bias=False),
                    nn.Conv2d(m.conv3b.in_channels, m.conv3b.out_channels, 1, bias=False)
                )
                m.conv3[0].weight.data = ff[1].reshape(m.conv3[0].weight.shape)
                m.conv3[1].weight.data = ff[0].reshape(m.conv3[1].weight.shape)
                m.conv3.cuda()
                add_mask(m.conv3[0])
                add_mask(m.conv3[1])
            
            if m.downsample is not None:
                print(m.downsample[0].weight.shape, m.downsample[0].weight.facts[0].shape, m.downsample[0].weight.facts[1].shape)
                m.sb = m.downsample[0]
                ff = m.sb.weight.facts
                # Shortcut conv: the stride goes on the first (in -> in) conv.
                m.downsample[0] = nn.Sequential(
                    nn.Conv2d(m.sb.in_channels, m.sb.in_channels, 1, stride=m.sb.stride, bias=False),
                    nn.Conv2d(m.sb.in_channels, m.sb.out_channels, 1, bias=False)
                )
                #m.conv2[0].register_forward_hook(boo)
                m.downsample[0][0].weight.data = ff[1].reshape(m.downsample[0][0].weight.shape)
                m.downsample[0][1].weight.data = ff[0].reshape(m.downsample[0][1].weight.shape)
                m.downsample.cuda()
                add_mask(m.downsample[0][0])
                add_mask(m.downsample[0][1])
                
    return model

In [10]:
def _count_active_conv_weights(model):
    """Count non-zero weights in the Conv2d layers that take part in the forward pass.

    Convs with 3 or fewer input channels are skipped, as are the originals that
    run_dsp keeps around under names containing conv1b / conv2b / conv3b / sb.
    """
    kept_originals = ("conv2b", "conv1b", "sb", "conv3b")
    active = 0
    for name, module in model.named_modules():
        if type(module) == nn.Conv2d and module.weight.shape[1] > 3 and not any(tag in name for tag in kept_originals):
            active += (module.weight != 0).sum().item()
    return active


def get_res(epochs=20):
    """Sparsify a pretrained ResNet-50 with run_dsp, then fine-tune and evaluate it.

    Uses the notebook-level ``train_loader`` and ``val_loader``.

    Args:
        epochs: number of fine-tuning epochs; also the horizon of the linear LR decay.

    Returns:
        ``(acc1, state_dict)``: the last measured top-1 validation accuracy and a
        deep copy of the final model state dict.
    """
    from torchvision.models import resnet50
    model = resnet50(pretrained=True)
    model.cuda()
    #acc1 = validate(val_loader, model, criterion_val).item()
    #print("dense acc", acc1)

    # Dense weight count of the layers that will be factorised; used as the
    # denominator of the density printed after each epoch.
    total_params = 0
    for n, m in model.named_modules():
        if type(m) == nn.Conv2d and m.weight.shape[1] > 3:
            total_params += m.weight.numel()
    print("tot", total_params)

    model = run_dsp(model)

    #optimizer = torch.optim.AdamW(model.parameters(), 0.001)
    # opt0 has lr=0.0, so its steps leave the parameters unchanged; it is only
    # used for the short train-mode pass below.
    opt0 = torch.optim.SGD(model.parameters(), 0.0, momentum=0.9, nesterov=True, weight_decay=1e-4)
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9, nesterov=True, weight_decay=1e-4)
    #scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2], gamma=0.1)
    scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=epochs, power=1)
    #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 0.004, epochs, cycle_momentum=False)
    criterion = nn.CrossEntropyLoss()#SoftTargetCrossEntropy()
    criterion_val = nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    best_acc1 = 0

    print(model, file=sys.stderr)

    acc1 = validate(val_loader, model, criterion_val).item()
    print("start acc no bn", acc1)
    # epoch=-1 makes train() stop after the batch with index 50. With a zero
    # learning rate this pass only refreshes state that train mode updates
    # (the prints label it "bn").
    train_loss = train(train_loader, model, criterion, opt0, scaler, -1)
    acc1 = validate(val_loader, model, criterion_val).item()
    total_active = _count_active_conv_weights(model)
    print("start acc bn", acc1, total_active)

    for epoch in range(epochs):
        train_loss = train(train_loader, model, criterion, optimizer, scaler, epoch)
        acc1 = validate(val_loader, model, criterion_val).item()
        scheduler.step()

        # Track the best acc@1. No checkpoint is written here: save_checkpoint is never called.
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        total_active = _count_active_conv_weights(model)

        # The LR printed is the one already stepped for the next epoch.
        print("epoch", epoch, train_loss, acc1, optimizer.param_groups[0]['lr'], total_active, total_active / total_params)

    return acc1, copy.deepcopy(model.state_dict())

acc, end = get_res()

print("acc", acc)



tot 23445504
nz 204 205 torch.Size([64, 64])
err_prefin 3.1943302154541016 0.7266281247138977 0.8554760217666626 0.8422347903251648
sparsity check 0.099365234375
layer1.0.conv1 3.1943302154541016 4.395542144775391 20.62295150756836
nz 204 3482 torch.Size([64, 576])
err_prefin 5.441827774047852 0.46786433458328247 1.0092570781707764 0.45649296045303345
sparsity check 0.09993489583333333
layer1.0.conv2 5.441827774047852 7.894467353820801 31.344758987426758
nz 204 1434 torch.Size([64, 256])
err_prefin 3.452609062194824 0.3936349153518677 0.5334653258323669 0.8795067667961121
sparsity check 0.099853515625
layer1.0.conv3 3.452609062194824 5.793004035949707 20.379261016845703
nz 204 1434 torch.Size([64, 256])
err_prefin 7.820006847381592 0.987881064414978 0.7290602922439575 1.3375771045684814
sparsity check 0.099853515625
layer1.0.downsample.0 7.82000732421875 11.938996315002441 54.36511993408203
nz 204 1434 torch.Size([64, 256])
err_prefin 4.0182929039001465 0.2617597281932831 0.82312500476

err_prefin 29.67755126953125 0.2721982002258301 0.6721134781837463 0.433461457490921
sparsity check 0.0999908447265625
layer3.4.conv1 29.677553176879883 39.984375 86.25032043457031
nz 3276 55706 torch.Size([256, 2304])
err_prefin 48.958839416503906 0.19188867509365082 0.719207227230072 0.2540438175201416
sparsity check 0.09999593098958333
layer3.4.conv2 48.958839416503906 64.2547607421875 130.9825439453125
nz 3276 22938 torch.Size([256, 1024])
err_prefin 27.120649337768555 0.316133052110672 0.37045490741729736 0.7376529574394226
sparsity check 0.0999908447265625
layer3.4.conv3 27.120647430419922 37.8643798828125 87.05435180664062
nz 3276 22938 torch.Size([256, 1024])
err_prefin 37.631561279296875 0.39949774742126465 0.7623628973960876 0.5068808197975159
sparsity check 0.0999908447265625
layer3.5.conv1 37.631561279296875 48.90454864501953 102.01531982421875
nz 3276 55706 torch.Size([256, 2304])
err_prefin 50.715980529785156 0.2235630750656128 0.7789952754974365 0.2990630865097046
sparsi

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Sequential(
        (0): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Sequential(
        (0): Conv2d(64, 64, kernel_size=(1, 1), stride

Test: [  0/196]	Time  7.280 ( 7.280)	Loss 4.9074e+00 (4.9074e+00)	Acc@1  10.94 ( 10.94)	Acc@5  31.25 ( 31.25)
Test: [ 50/196]	Time  0.431 ( 0.564)	Loss 7.6985e+00 (6.6013e+00)	Acc@1   0.00 (  1.72)	Acc@5   0.00 (  5.78)
Test: [100/196]	Time  0.432 ( 0.498)	Loss 6.8495e+00 (6.4894e+00)	Acc@1   1.95 (  1.82)	Acc@5   6.25 (  5.86)
Test: [150/196]	Time  0.432 ( 0.476)	Loss 6.4366e+00 (6.4240e+00)	Acc@1   2.34 (  2.04)	Acc@5   3.52 (  6.67)
start acc no bn 2.2719998359680176
Epoch: [-1][   0/5005]	Time  3.368 ( 3.368)	Data  2.435 ( 2.435)	Loss 1.9493e+00 (1.9493e+00)
Epoch: [-1][  50/5005]	Time  0.637 ( 0.691)	Data  0.000 ( 0.048)	Loss 2.1008e+00 (2.0358e+00)
Test: [  0/196]	Time  3.409 ( 3.409)	Loss 1.6305e+00 (1.6305e+00)	Acc@1  62.89 ( 62.89)	Acc@5  86.72 ( 86.72)
Test: [ 50/196]	Time  0.433 ( 0.491)	Loss 1.2554e+00 (1.6968e+00)	Acc@1  74.61 ( 62.20)	Acc@5  92.58 ( 86.47)
Test: [100/196]	Time  0.433 ( 0.462)	Loss 2.4508e+00 (1.8642e+00)	Acc@1  42.58 ( 59.11)	Acc@5  73.44 ( 83.73)
Test: [

Epoch: [0][3800/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1224e+00 (1.2128e+00)
Epoch: [0][3850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.2176e+00 (1.2117e+00)
Epoch: [0][3900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.2008e+00 (1.2109e+00)
Epoch: [0][3950/5005]	Time  0.638 ( 0.637)	Data  0.000 ( 0.001)	Loss 1.3824e+00 (1.2104e+00)
Epoch: [0][4000/5005]	Time  0.637 ( 0.637)	Data  0.000 ( 0.001)	Loss 1.1442e+00 (1.2096e+00)
Epoch: [0][4050/5005]	Time  0.637 ( 0.637)	Data  0.000 ( 0.001)	Loss 1.1157e+00 (1.2090e+00)
Epoch: [0][4100/5005]	Time  0.637 ( 0.637)	Data  0.000 ( 0.001)	Loss 1.0861e+00 (1.2083e+00)
Epoch: [0][4150/5005]	Time  0.638 ( 0.637)	Data  0.000 ( 0.001)	Loss 1.2223e+00 (1.2074e+00)
Epoch: [0][4200/5005]	Time  0.637 ( 0.637)	Data  0.000 ( 0.001)	Loss 1.1863e+00 (1.2067e+00)
Epoch: [0][4250/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 1.1479e+00 (1.2062e+00)
Epoch: [0][4300/5005]	Time  0.636 ( 0.637)	Data  0.000 ( 0.001)	Loss 1

Epoch: [1][2900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0831e+00 (1.1090e+00)
Epoch: [1][2950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.2613e+00 (1.1093e+00)
Epoch: [1][3000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0088e+00 (1.1090e+00)
Epoch: [1][3050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1010e+00 (1.1092e+00)
Epoch: [1][3100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1196e+00 (1.1093e+00)
Epoch: [1][3150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1383e+00 (1.1095e+00)
Epoch: [1][3200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.2880e+00 (1.1098e+00)
Epoch: [1][3250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0990e+00 (1.1094e+00)
Epoch: [1][3300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1370e+00 (1.1090e+00)
Epoch: [1][3350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9987e-01 (1.1090e+00)
Epoch: [1][3400/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1

Epoch: [2][2000/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.1498e+00 (1.0724e+00)
Epoch: [2][2050/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 9.9019e-01 (1.0723e+00)
Epoch: [2][2100/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.0893e+00 (1.0725e+00)
Epoch: [2][2150/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.2128e+00 (1.0722e+00)
Epoch: [2][2200/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.0592e+00 (1.0720e+00)
Epoch: [2][2250/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.0690e+00 (1.0718e+00)
Epoch: [2][2300/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.1008e+00 (1.0723e+00)
Epoch: [2][2350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1424e+00 (1.0724e+00)
Epoch: [2][2400/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0746e+00 (1.0724e+00)
Epoch: [2][2450/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1650e+00 (1.0725e+00)
Epoch: [2][2500/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1

Epoch: [3][1100/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 1.0415e+00 (1.0471e+00)
Epoch: [3][1150/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 1.0560e+00 (1.0476e+00)
Epoch: [3][1200/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 1.0679e+00 (1.0473e+00)
Epoch: [3][1250/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0458e+00 (1.0490e+00)
Epoch: [3][1300/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.6608e-01 (1.0494e+00)
Epoch: [3][1350/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0949e+00 (1.0497e+00)
Epoch: [3][1400/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.7061e-01 (1.0499e+00)
Epoch: [3][1450/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0023e+00 (1.0509e+00)
Epoch: [3][1500/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.7926e-01 (1.0519e+00)
Epoch: [3][1550/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.6315e-01 (1.0519e+00)
Epoch: [3][1600/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1

Epoch: [4][ 200/5005]	Time  0.638 ( 0.649)	Data  0.000 ( 0.012)	Loss 9.9392e-01 (1.0221e+00)
Epoch: [4][ 250/5005]	Time  0.638 ( 0.647)	Data  0.000 ( 0.009)	Loss 9.9891e-01 (1.0284e+00)
Epoch: [4][ 300/5005]	Time  0.637 ( 0.645)	Data  0.000 ( 0.008)	Loss 9.4518e-01 (1.0243e+00)
Epoch: [4][ 350/5005]	Time  0.637 ( 0.644)	Data  0.000 ( 0.007)	Loss 1.1287e+00 (1.0250e+00)
Epoch: [4][ 400/5005]	Time  0.637 ( 0.643)	Data  0.000 ( 0.006)	Loss 1.0423e+00 (1.0243e+00)
Epoch: [4][ 450/5005]	Time  0.637 ( 0.643)	Data  0.000 ( 0.005)	Loss 1.0418e+00 (1.0240e+00)
Epoch: [4][ 500/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.005)	Loss 9.6181e-01 (1.0240e+00)
Epoch: [4][ 550/5005]	Time  0.638 ( 0.642)	Data  0.000 ( 0.004)	Loss 1.0351e+00 (1.0247e+00)
Epoch: [4][ 600/5005]	Time  0.638 ( 0.641)	Data  0.000 ( 0.004)	Loss 1.0540e+00 (1.0270e+00)
Epoch: [4][ 650/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 1.2585e+00 (1.0290e+00)
Epoch: [4][ 700/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.003)	Loss 1

Epoch: [4][4650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0631e+00 (1.0462e+00)
Epoch: [4][4700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.2646e-01 (1.0463e+00)
Epoch: [4][4750/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.7134e-01 (1.0465e+00)
Epoch: [4][4800/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1815e+00 (1.0469e+00)
Epoch: [4][4850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0351e+00 (1.0470e+00)
Epoch: [4][4900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9591e-01 (1.0470e+00)
Epoch: [4][4950/5005]	Time  0.639 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8369e-01 (1.0469e+00)
Epoch: [4][5000/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.2114e+00 (1.0473e+00)
Test: [  0/196]	Time  3.411 ( 3.411)	Loss 6.9913e-01 (6.9913e-01)	Acc@1  81.25 ( 81.25)	Acc@5  96.09 ( 96.09)
Test: [ 50/196]	Time  0.434 ( 0.491)	Loss 6.1318e-01 (8.9559e-01)	Acc@1  82.42 ( 76.17)	Acc@5  96.48 ( 93.78)
Test: [100/196]	Time  0.434 ( 0.463)

Epoch: [5][3750/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0139e+00 (1.0297e+00)
Epoch: [5][3800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0952e+00 (1.0299e+00)
Epoch: [5][3850/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.3158e-01 (1.0301e+00)
Epoch: [5][3900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0243e+00 (1.0301e+00)
Epoch: [5][3950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0088e+00 (1.0300e+00)
Epoch: [5][4000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.2495e+00 (1.0303e+00)
Epoch: [5][4050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0657e+00 (1.0307e+00)
Epoch: [5][4100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1721e+00 (1.0308e+00)
Epoch: [5][4150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0266e+00 (1.0306e+00)
Epoch: [5][4200/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0445e+00 (1.0307e+00)
Epoch: [5][4250/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9

Epoch: [6][2850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.8013e-01 (1.0161e+00)
Epoch: [6][2900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.2902e-01 (1.0160e+00)
Epoch: [6][2950/5005]	Time  0.638 ( 0.638)	Data  0.001 ( 0.001)	Loss 1.2412e+00 (1.0162e+00)
Epoch: [6][3000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9990e-01 (1.0163e+00)
Epoch: [6][3050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1353e+00 (1.0164e+00)
Epoch: [6][3100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6675e-01 (1.0168e+00)
Epoch: [6][3150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0543e+00 (1.0169e+00)
Epoch: [6][3200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6385e-01 (1.0170e+00)
Epoch: [6][3250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0627e+00 (1.0174e+00)
Epoch: [6][3300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0163e+00 (1.0177e+00)
Epoch: [6][3350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9

Epoch: [7][1950/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.1193e+00 (1.0003e+00)
Epoch: [7][2000/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.0370e+00 (1.0005e+00)
Epoch: [7][2050/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 7.8148e-01 (9.9970e-01)
Epoch: [7][2100/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.1042e+00 (9.9941e-01)
Epoch: [7][2150/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.0743e+00 (9.9948e-01)
Epoch: [7][2200/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.1323e+00 (9.9991e-01)
Epoch: [7][2250/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 9.6945e-01 (9.9957e-01)
Epoch: [7][2300/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0754e+00 (1.0005e+00)
Epoch: [7][2350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0206e+00 (9.9988e-01)
Epoch: [7][2400/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0108e+00 (9.9957e-01)
Epoch: [7][2450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9

Epoch: [8][1050/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 9.6705e-01 (9.7999e-01)
Epoch: [8][1100/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 8.9749e-01 (9.8046e-01)
Epoch: [8][1150/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.1081e-01 (9.8200e-01)
Epoch: [8][1200/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0195e+00 (9.8216e-01)
Epoch: [8][1250/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.1199e-01 (9.8229e-01)
Epoch: [8][1300/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.2310e+00 (9.8390e-01)
Epoch: [8][1350/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.8865e-01 (9.8495e-01)
Epoch: [8][1400/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0421e+00 (9.8544e-01)
Epoch: [8][1450/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0533e+00 (9.8536e-01)
Epoch: [8][1500/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0520e+00 (9.8587e-01)
Epoch: [8][1550/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9

Epoch: [9][ 150/5005]	Time  0.638 ( 0.655)	Data  0.000 ( 0.017)	Loss 9.5540e-01 (9.5639e-01)
Epoch: [9][ 200/5005]	Time  0.638 ( 0.651)	Data  0.000 ( 0.013)	Loss 1.0515e+00 (9.6242e-01)
Epoch: [9][ 250/5005]	Time  0.637 ( 0.648)	Data  0.000 ( 0.011)	Loss 9.6656e-01 (9.6348e-01)
Epoch: [9][ 300/5005]	Time  0.637 ( 0.646)	Data  0.000 ( 0.009)	Loss 8.5081e-01 (9.6400e-01)
Epoch: [9][ 350/5005]	Time  0.637 ( 0.645)	Data  0.000 ( 0.008)	Loss 9.3382e-01 (9.6533e-01)
Epoch: [9][ 400/5005]	Time  0.638 ( 0.644)	Data  0.000 ( 0.007)	Loss 1.0575e+00 (9.6569e-01)
Epoch: [9][ 450/5005]	Time  0.637 ( 0.643)	Data  0.000 ( 0.006)	Loss 1.0468e+00 (9.6848e-01)
Epoch: [9][ 500/5005]	Time  0.638 ( 0.643)	Data  0.000 ( 0.005)	Loss 9.6286e-01 (9.6538e-01)
Epoch: [9][ 550/5005]	Time  0.638 ( 0.642)	Data  0.000 ( 0.005)	Loss 8.7891e-01 (9.6417e-01)
Epoch: [9][ 600/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.005)	Loss 1.0115e+00 (9.6547e-01)
Epoch: [9][ 650/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.004)	Loss 1

Epoch: [9][4600/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.3564e-01 (9.8267e-01)
Epoch: [9][4650/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0953e+00 (9.8295e-01)
Epoch: [9][4700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0452e+00 (9.8319e-01)
Epoch: [9][4750/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6148e-01 (9.8353e-01)
Epoch: [9][4800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.2109e-01 (9.8392e-01)
Epoch: [9][4850/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9109e-01 (9.8402e-01)
Epoch: [9][4900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.8081e-01 (9.8410e-01)
Epoch: [9][4950/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4501e-01 (9.8419e-01)
Epoch: [9][5000/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1544e+00 (9.8443e-01)
Test: [  0/196]	Time  3.412 ( 3.412)	Loss 6.5012e-01 (6.5012e-01)	Acc@1  79.30 ( 79.30)	Acc@5  97.27 ( 97.27)
Test: [ 50/196]	Time  0.434 ( 0.491)	Loss 5.2254e-01 

Epoch: [10][3650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.1239e+00 (9.6919e-01)
Epoch: [10][3700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0026e+00 (9.6962e-01)
Epoch: [10][3750/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5949e-01 (9.6990e-01)
Epoch: [10][3800/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.9647e-01 (9.7005e-01)
Epoch: [10][3850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.4839e-01 (9.7045e-01)
Epoch: [10][3900/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4441e-01 (9.7062e-01)
Epoch: [10][3950/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.5143e-01 (9.7079e-01)
Epoch: [10][4000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0597e+00 (9.7080e-01)
Epoch: [10][4050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0038e+00 (9.7068e-01)
Epoch: [10][4100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1007e-01 (9.7065e-01)
Epoch: [10][4150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [11][2700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5277e-01 (9.4854e-01)
Epoch: [11][2750/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.7549e-01 (9.4856e-01)
Epoch: [11][2800/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0931e+00 (9.4895e-01)
Epoch: [11][2850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.7054e-01 (9.4872e-01)
Epoch: [11][2900/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0971e-01 (9.4896e-01)
Epoch: [11][2950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0648e+00 (9.4903e-01)
Epoch: [11][3000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6670e-01 (9.4960e-01)
Epoch: [11][3050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0056e+00 (9.4965e-01)
Epoch: [11][3100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8227e-01 (9.4981e-01)
Epoch: [11][3150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0061e+00 (9.4974e-01)
Epoch: [11][3200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [12][1750/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.2011e-01 (9.3653e-01)
Epoch: [12][1800/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.0082e-01 (9.3648e-01)
Epoch: [12][1850/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0446e+00 (9.3650e-01)
Epoch: [12][1900/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.7961e-01 (9.3660e-01)
Epoch: [12][1950/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.6763e-01 (9.3682e-01)
Epoch: [12][2000/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 9.4016e-01 (9.3782e-01)
Epoch: [12][2050/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.001)	Loss 8.3387e-01 (9.3849e-01)
Epoch: [12][2100/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 8.0942e-01 (9.3843e-01)
Epoch: [12][2150/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 8.3095e-01 (9.3849e-01)
Epoch: [12][2200/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.001)	Loss 1.0597e+00 (9.3868e-01)
Epoch: [12][2250/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.

Epoch: [13][ 800/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 9.1054e-01 (9.1474e-01)
Epoch: [13][ 850/5005]	Time  0.638 ( 0.640)	Data  0.000 ( 0.003)	Loss 1.1005e+00 (9.1598e-01)
Epoch: [13][ 900/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 1.1775e+00 (9.1622e-01)
Epoch: [13][ 950/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 9.7051e-01 (9.1723e-01)
Epoch: [13][1000/5005]	Time  0.638 ( 0.640)	Data  0.000 ( 0.003)	Loss 1.0169e+00 (9.1677e-01)
Epoch: [13][1050/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 8.3901e-01 (9.1647e-01)
Epoch: [13][1100/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 1.2346e+00 (9.1731e-01)
Epoch: [13][1150/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.002)	Loss 8.9462e-01 (9.1666e-01)
Epoch: [13][1200/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.7796e-01 (9.1538e-01)
Epoch: [13][1250/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.5052e-01 (9.1513e-01)
Epoch: [13][1300/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.

Test: [100/196]	Time  0.434 ( 0.463)	Loss 1.5407e+00 (9.3271e-01)	Acc@1  55.08 ( 75.70)	Acc@5  86.33 ( 93.09)
Test: [150/196]	Time  0.434 ( 0.453)	Loss 1.2021e+00 (1.0523e+00)	Acc@1  74.61 ( 73.48)	Acc@5  86.72 ( 91.53)
epoch 13 0.9283417635067227 72.50799560546875 0.0030000000000000014 2344421 0.09999448081815601
Epoch: [14][   0/5005]	Time  3.075 ( 3.075)	Data  2.437 ( 2.437)	Loss 1.0509e+00 (1.0509e+00)
Epoch: [14][  50/5005]	Time  0.637 ( 0.685)	Data  0.000 ( 0.048)	Loss 1.0970e+00 (9.0840e-01)
Epoch: [14][ 100/5005]	Time  0.637 ( 0.662)	Data  0.000 ( 0.024)	Loss 8.3724e-01 (9.1922e-01)
Epoch: [14][ 150/5005]	Time  0.638 ( 0.654)	Data  0.000 ( 0.016)	Loss 9.7419e-01 (9.1906e-01)
Epoch: [14][ 200/5005]	Time  0.637 ( 0.650)	Data  0.000 ( 0.012)	Loss 9.1281e-01 (9.1520e-01)
Epoch: [14][ 250/5005]	Time  0.637 ( 0.647)	Data  0.000 ( 0.010)	Loss 9.9463e-01 (9.1421e-01)
Epoch: [14][ 300/5005]	Time  0.637 ( 0.646)	Data  0.000 ( 0.008)	Loss 7.9613e-01 (9.0868e-01)
Epoch: [14][ 350/5005]	Tim

Epoch: [14][4200/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.1278e-01 (9.0783e-01)
Epoch: [14][4250/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0947e-01 (9.0786e-01)
Epoch: [14][4300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.9866e-01 (9.0804e-01)
Epoch: [14][4350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1319e-01 (9.0810e-01)
Epoch: [14][4400/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6298e-01 (9.0832e-01)
Epoch: [14][4450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.9371e-01 (9.0847e-01)
Epoch: [14][4500/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.7244e-01 (9.0863e-01)
Epoch: [14][4550/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.9189e-01 (9.0879e-01)
Epoch: [14][4600/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0660e-01 (9.0877e-01)
Epoch: [14][4650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.2774e-01 (9.0868e-01)
Epoch: [14][4700/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [15][3250/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4912e-01 (8.9044e-01)
Epoch: [15][3300/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.1736e-01 (8.9033e-01)
Epoch: [15][3350/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6559e-01 (8.9053e-01)
Epoch: [15][3400/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8077e-01 (8.9035e-01)
Epoch: [15][3450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.0303e-01 (8.9055e-01)
Epoch: [15][3500/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.8000e-01 (8.9052e-01)
Epoch: [15][3550/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 1.0179e+00 (8.9017e-01)
Epoch: [15][3600/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6879e-01 (8.9032e-01)
Epoch: [15][3650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8553e-01 (8.9050e-01)
Epoch: [15][3700/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.3205e-01 (8.9061e-01)
Epoch: [15][3750/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [16][2300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.3662e-01 (8.7477e-01)
Epoch: [16][2350/5005]	Time  0.640 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.0962e-01 (8.7454e-01)
Epoch: [16][2400/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.4015e-01 (8.7467e-01)
Epoch: [16][2450/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.6714e-01 (8.7480e-01)
Epoch: [16][2500/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6649e-01 (8.7462e-01)
Epoch: [16][2550/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1610e-01 (8.7466e-01)
Epoch: [16][2600/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.3651e-01 (8.7457e-01)
Epoch: [16][2650/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.7776e-01 (8.7462e-01)
Epoch: [16][2700/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5116e-01 (8.7482e-01)
Epoch: [16][2750/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.8876e-01 (8.7499e-01)
Epoch: [16][2800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.

Epoch: [17][1350/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.5311e-01 (8.5380e-01)
Epoch: [17][1400/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.2505e-01 (8.5347e-01)
Epoch: [17][1450/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 1.0014e+00 (8.5371e-01)
Epoch: [17][1500/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.6523e-01 (8.5356e-01)
Epoch: [17][1550/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.9259e-01 (8.5327e-01)
Epoch: [17][1600/5005]	Time  0.636 ( 0.639)	Data  0.000 ( 0.002)	Loss 8.8650e-01 (8.5331e-01)
Epoch: [17][1650/5005]	Time  0.636 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.6770e-01 (8.5368e-01)
Epoch: [17][1700/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.9469e-01 (8.5380e-01)
Epoch: [17][1750/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 9.2020e-01 (8.5363e-01)
Epoch: [17][1800/5005]	Time  0.637 ( 0.639)	Data  0.000 ( 0.002)	Loss 7.4787e-01 (8.5405e-01)
Epoch: [17][1850/5005]	Time  0.638 ( 0.639)	Data  0.000 ( 0.

Epoch: [18][ 400/5005]	Time  0.637 ( 0.643)	Data  0.000 ( 0.006)	Loss 8.3117e-01 (8.4349e-01)
Epoch: [18][ 450/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.006)	Loss 8.3534e-01 (8.4465e-01)
Epoch: [18][ 500/5005]	Time  0.637 ( 0.642)	Data  0.000 ( 0.005)	Loss 7.3971e-01 (8.4486e-01)
Epoch: [18][ 550/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.005)	Loss 7.7022e-01 (8.4376e-01)
Epoch: [18][ 600/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 7.0348e-01 (8.4166e-01)
Epoch: [18][ 650/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 7.4656e-01 (8.4266e-01)
Epoch: [18][ 700/5005]	Time  0.637 ( 0.641)	Data  0.000 ( 0.004)	Loss 7.5042e-01 (8.4172e-01)
Epoch: [18][ 750/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 8.9788e-01 (8.4165e-01)
Epoch: [18][ 800/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 8.8304e-01 (8.4169e-01)
Epoch: [18][ 850/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.003)	Loss 9.5761e-01 (8.4075e-01)
Epoch: [18][ 900/5005]	Time  0.637 ( 0.640)	Data  0.000 ( 0.

Epoch: [18][4800/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 6.9921e-01 (8.3841e-01)
Epoch: [18][4850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.5931e-01 (8.3861e-01)
Epoch: [18][4900/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.8259e-01 (8.3879e-01)
Epoch: [18][4950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6768e-01 (8.3875e-01)
Epoch: [18][5000/5005]	Time  0.636 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.8608e-01 (8.3879e-01)
Test: [  0/196]	Time  3.474 ( 3.474)	Loss 5.5536e-01 (5.5536e-01)	Acc@1  85.94 ( 85.94)	Acc@5  97.27 ( 97.27)
Test: [ 50/196]	Time  0.433 ( 0.493)	Loss 4.3713e-01 (7.3204e-01)	Acc@1  89.45 ( 80.53)	Acc@5  97.27 ( 95.53)
Test: [100/196]	Time  0.433 ( 0.463)	Loss 1.3129e+00 (8.6529e-01)	Acc@1  64.84 ( 77.37)	Acc@5  89.06 ( 94.09)
Test: [150/196]	Time  0.434 ( 0.453)	Loss 1.1744e+00 (9.8137e-01)	Acc@1  74.61 ( 75.12)	Acc@5  89.06 ( 92.54)
epoch 18 0.8388007092393749 74.13199615478516 0.0005000000000000008 2344421 0.099994480818

Epoch: [19][3850/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.3550e-01 (8.2505e-01)
Epoch: [19][3900/5005]	Time  0.638 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.1459e-01 (8.2477e-01)
Epoch: [19][3950/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9117e-01 (8.2466e-01)
Epoch: [19][4000/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.6564e-01 (8.2443e-01)
Epoch: [19][4050/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.9756e-01 (8.2444e-01)
Epoch: [19][4100/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 9.1427e-01 (8.2448e-01)
Epoch: [19][4150/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.6116e-01 (8.2450e-01)
Epoch: [19][4200/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 7.1431e-01 (8.2458e-01)
Epoch: [19][4250/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.0857e-01 (8.2427e-01)
Epoch: [19][4300/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.001)	Loss 8.7758e-01 (8.2438e-01)
Epoch: [19][4350/5005]	Time  0.637 ( 0.638)	Data  0.000 ( 0.