In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D

import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

from tqdm import tqdm
from sklearn import datasets
import random, sys, os

In [3]:
import torch.optim as optim
from torch.utils import data
from torchvision import datasets, transforms

In [4]:
device = torch.device("cuda:0")
# device = torch.device("cuda:1")
# device = torch.device("cpu")

## Custom CUDA-Pytorch functions

In [5]:
import bmm2x2_cuda
import bilinear2x2_cuda

## Cuda -bmm2x2

In [6]:
class BMM2x2Function(torch.autograd.Function):
    @staticmethod
#     @torch.jit.ignore
    def forward(ctx, inputs, weights):
        outputs = bmm2x2_cuda.forward(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]
    
    @staticmethod
#     @torch.jit.ignore
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bmm2x2_cuda.backward(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [54]:
class PairLinear(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        assert input_dim%2 == 0, "Input dim must be even number"
        self.weight = torch.eye(2).unsqueeze(0).repeat_interleave(input_dim//2, dim=0)
        self.weight = nn.Parameter(self.weight)
        self.bmmfunc = BMM2x2Function()
        
    def forward(self, x):
        bs, dim = x.shape[0], x.shape[1]
        x = x.view(bs, -1, 2)
        x = BMM2x2Function.apply(x, self.weight)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.weight.shape[0]*2
        S = f'PairLinear: [{t} -> {t}]'
        return S

In [55]:
A = torch.randn(5, 8).to(device)
model = PairLinear(8).to(device)

In [56]:
model

PairLinear: [8 -> 8]

In [57]:
y = model(A)
y.mean().backward()
y

tensor([[-0.0482,  0.0458, -0.1138,  0.4797,  0.7061,  0.9942,  0.4593, -0.9618],
        [ 0.5966, -1.1693, -0.8379, -1.0604, -0.4124,  1.8205,  0.2100, -0.6360],
        [-0.3879,  0.7118,  1.0327, -0.8704,  0.4211,  0.3607,  1.1000, -1.3011],
        [ 0.5919, -0.0903, -0.7032,  0.5343, -0.6516, -0.3320,  0.7255,  0.1594],
        [ 0.7397, -0.5280, -0.4998,  2.4217, -1.1096,  0.7731,  0.7770, -0.2846]],
       device='cuda:0', grad_fn=<ViewBackward>)

In [58]:
model.weight.grad

tensor([[[ 0.0373,  0.0373],
         [-0.0258, -0.0258]],

        [[-0.0280, -0.0280],
         [ 0.0376,  0.0376]],

        [[-0.0262, -0.0262],
         [ 0.0904,  0.0904]],

        [[ 0.0818,  0.0818],
         [-0.0756, -0.0756]]], device='cuda:0')

### BMM 2x1

In [59]:
class BMM2x1Function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weights):
        outputs = bmm2x2_cuda.forward_2x1(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]
    
    @staticmethod
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bmm2x2_cuda.backward_2x1(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [63]:
class PairLinearHalve(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        assert input_dim%2 == 0, "Input dim must be even number"
        self.weight = torch.Tensor([0.5, 0.5]).unsqueeze(0).repeat_interleave(input_dim//2, dim=0)
        self.weight = nn.Parameter(self.weight)
        
    def forward(self, x):
        bs, dim = x.shape[0], x.shape[1]
        x = x.view(bs, -1, 2)
        x = BMM2x1Function.apply(x, self.weight)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.weight.shape[0]
        S = f'PairLinearHalve: [{t*2} -> {t}]'
        return S

In [64]:
A = torch.randn(5, 8).to(device)
model = PairLinearHalve(8).to(device)

In [65]:
model

PairLinearHalve: [8 -> 4]

In [66]:
y = model(A)
y.mean().backward()
y

tensor([[ 0.3508, -0.7172,  0.6843, -0.1334],
        [ 0.0626,  0.1347,  0.2325,  0.3437],
        [-0.1255,  0.0414,  0.4830, -0.4375],
        [-0.0651,  0.0680,  0.3810, -1.0676],
        [ 0.5592,  1.1002,  0.5184, -0.4876]], device='cuda:0',
       grad_fn=<ViewBackward>)

In [67]:
model.weight.grad

tensor([[-0.0161,  0.0943],
        [ 0.0667, -0.0040],
        [ 0.0653,  0.1646],
        [-0.1167, -0.0615]], device='cuda:0')

## Cuda - Bilinear2x2

In [16]:
class BiLinear2x2Function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weights):
        outputs = bilinear2x2_cuda.forward(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]

    @staticmethod
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bilinear2x2_cuda.backward(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [82]:
class PairBilinear(nn.Module):
    def __init__(self, dim, grid_width):
        super().__init__()
        num_pairs = dim // 2
        along_row = torch.linspace(0, 1, grid_width).reshape(1, -1).t()
        along_col = torch.linspace(0, 1, grid_width).reshape(-1, 1).t()
        
        self.Y = torch.stack([along_row+along_col*0, along_row*0+along_col])
        self.Y = torch.repeat_interleave(self.Y.unsqueeze(0), num_pairs, dim=0)
        self.Y = nn.Parameter(self.Y)
        
        self.pairW = torch.eye(2).unsqueeze(0).repeat_interleave(num_pairs, dim=0)
        self.pairW = nn.Parameter(self.pairW)
    
    def forward(self, x):
        bs = x.shape[0]
        
        x = x.view(bs, -1, 2)
        x = BMM2x2Function.apply(x, self.pairW)
        ####################################################
        x = BiLinear2x2Function.apply(x, self.Y)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.pairW.shape[0]*2
        u = self.Y.shape[2]
        S = f'PairLinear: [{t} -> {t}] (grid: {u})'
        return S

In [83]:
A = torch.randn(5, 8).to(device)
model = PairBilinear(8, 3).to(device)

In [84]:
model

PairLinear: [8 -> 8] (grid: 3)

In [85]:
y = model(A)
y.mean().backward()

y

tensor([[ 0.6277,  0.6846, -0.2970,  0.5104,  1.5108,  0.8831, -0.1273, -0.1880],
        [-0.2823,  0.2822, -0.3119,  1.1664,  0.8766,  0.5316,  1.3626,  0.1817],
        [-0.8798, -0.6815, -0.9414, -0.3515, -0.5076, -0.6730, -0.5839,  1.1094],
        [ 0.6834,  0.4560, -1.7385,  2.0084,  0.5141,  1.3177,  1.6020, -0.0783],
        [-0.3808,  0.2510, -0.7354, -0.3153,  0.1359, -0.2706, -0.8862,  0.6448]],
       device='cuda:0', grad_fn=<ViewBackward>)

In [86]:
model.Y.grad

tensor([[[[ 0.2020, -0.0499,  0.0000],
          [-0.1182,  0.0686,  0.0069],
          [ 0.0008,  0.0124,  0.0024]],

         [[ 0.2020, -0.0499,  0.0000],
          [-0.1182,  0.0686,  0.0069],
          [ 0.0008,  0.0124,  0.0024]]],


        [[[ 0.2235, -0.2898,  0.3926],
          [-0.1401,  0.2222, -0.2833],
          [ 0.0000,  0.0000,  0.0000]],

         [[ 0.2235, -0.2898,  0.3926],
          [-0.1401,  0.2222, -0.2833],
          [ 0.0000,  0.0000,  0.0000]]],


        [[[ 0.1462, -0.0777,  0.0000],
          [-0.0491,  0.0148,  0.0206],
          [ 0.0000,  0.0290,  0.0411]],

         [[ 0.1462, -0.0777,  0.0000],
          [-0.0491,  0.0148,  0.0206],
          [ 0.0000,  0.0290,  0.0411]]],


        [[[ 0.0432,  0.0256,  0.0861],
          [-0.0551, -0.0246, -0.0484],
          [ 0.0912,  0.0070,  0.0000]],

         [[ 0.0432,  0.0256,  0.0861],
          [-0.0551, -0.0246, -0.0484],
          [ 0.0912,  0.0070,  0.0000]]]], device='cuda:0')

### Cuda - Bilinear2x1

In [87]:
class BiLinear2x1Function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weights):
        outputs = bilinear2x2_cuda.forward_2x1(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]

    @staticmethod
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bilinear2x2_cuda.backward_2x1(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [91]:
class PairBilinearHalve(nn.Module):
    def __init__(self, dim, grid_width):
        super().__init__()
        num_pairs = dim // 2
        
        self.pairW = torch.eye(2).unsqueeze(0).repeat_interleave(num_pairs, dim=0)
        self.pairW = nn.Parameter(self.pairW)

        along_row = torch.linspace(0, 1, grid_width).reshape(1, -1).t()
        along_col = torch.linspace(0, 1, grid_width).reshape(-1, 1).t()
        
        self.Y = torch.stack([along_row+along_col*0, along_row*0+along_col]).mean(dim=0)
        self.Y = torch.repeat_interleave(self.Y.unsqueeze(0), num_pairs, dim=0)
        self.Y = nn.Parameter(self.Y)
        
    
    def forward(self, x):
        bs = x.shape[0]
        
        x = x.view(bs, -1, 2)
        x = BMM2x2Function.apply(x, self.pairW)
        ####################################################
        x = BiLinear2x1Function.apply(x, self.Y)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.pairW.shape[0]
        u = self.Y.shape[2]
        S = f'PairLinear: [{t*2} -> {t}] (grid: {u})'
        return S

In [92]:
A = torch.randn(5, 8).to(device)
model = PairBilinearHalve(8, 3).to(device)

In [94]:
model

PairLinear: [8 -> 4] (grid: 3)

In [95]:
y = model(A)
y.mean().backward()

y

tensor([[-0.8319, -0.6487, -1.2232, -0.6380],
        [-1.5712,  0.5226,  0.5464, -0.4149],
        [-0.9055, -0.0855, -0.1336,  0.4387],
        [-0.2508,  0.1046, -0.4098,  0.1326],
        [-1.5982,  0.6126,  0.8330,  0.1647]], device='cuda:0',
       grad_fn=<ViewBackward>)

In [96]:
model.Y.grad

tensor([[[ 2.3839e+00, -1.7646e+00,  0.0000e+00],
         [-1.4716e+00,  1.1023e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

        [[ 6.2555e-02,  1.3307e-02,  0.0000e+00],
         [-2.5036e-01,  1.8656e-01,  1.7736e-03],
         [ 7.4990e-01, -5.1488e-01,  1.1444e-03]],

        [[ 2.2106e-01, -2.5346e-01,  4.6508e-01],
         [ 1.1670e-01, -4.5822e-02, -2.8898e-01],
         [ 1.8530e-02,  1.6905e-02,  0.0000e+00]],

        [[ 3.4840e-01, -3.9973e-02,  2.3891e-03],
         [-3.6170e-01,  1.8842e-01,  4.0110e-03],
         [ 3.8070e-01, -2.7224e-01,  0.0000e+00]]], device='cuda:0')

## Modules and Layers

In [125]:
class BiasLayer(nn.Module):
    def __init__(self, dim, init_val=0):
        super().__init__()
        self.bias = nn.Parameter(torch.ones(dim)*init_val)
        
    def forward(self, x):
        return x+self.bias
    
    def __repr__(self):
        S = f'BiasLayer: [{self.bias.shape[0]}]'
        return S

In [256]:
class DimensionSelector(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        assert output_dim > input_dim, "Slection does not select all inputs"
        remain = output_dim-input_dim
        
        scale = int(np.ceil(output_dim/input_dim)-1)
#         self.indices = torch.randperm(input_dim*scale)[:remain]%input_dim

        self.indices = torch.LongTensor([])
        for i in range(scale):
            c = min(input_dim, remain-len(self.indices))
            t = torch.randperm(input_dim)[:c]
            self.indices = torch.cat([self.indices, t])
            
        
    def forward(self, x):
        ## x.shape = [batch_size, input_dim]
        return torch.cat([x, x[:, self.indices]], dim=1)
    
    def __repr__(self):
        S = f'DimensionSelector: [+={self.indices.shape[0]}]'
        return S

In [257]:
ds = DimensionSelector(8, 32)

In [258]:
ds.indices.unique(return_counts=True)

(tensor([0, 1, 2, 3, 4, 5, 6, 7]), tensor([3, 3, 3, 3, 3, 3, 3, 3]))

In [259]:
ds.indices

tensor([3, 1, 4, 2, 7, 6, 5, 0, 0, 3, 6, 5, 4, 1, 7, 2, 5, 4, 7, 6, 0, 2, 3, 1])

In [353]:
class PairBilinear_MixerBlock(nn.Module):
    
    '''
    Handle any input - output size;
    
    Operations -> Select, NxN mix, Halve
    
    -Edge cases:
    1) 8-8 -> NxN mixing for log(N) times
    2) 8-10 -> Select(16) + 16x16 + Select(20) + Halve
    3) 8-6 -> 8x8 + Select(12) + Halve
    4) 8-32 -> Select(32) + 32x32
    5) 8-3 -> 8x8 + Halve + 4-Select(6) + Halve
    
    '''
    
    def __init__(self, input_dim, output_dim, grid_width, bias=True):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.grid_width = grid_width
        
        self.selector = None
        self.pairwise_mixing = []
        self.reducer = []
        
        mix_dim = 2**int(np.ceil(np.log2(max(input_dim, output_dim))))
        
        #########################################################
        ### Find out if first selection is required or Not !
        if self.input_dim != mix_dim:
            ## Input dimension is not power of 2; requires selector to project to mixing dimension
            L = DimensionSelector(input_dim, mix_dim)
            self.selector = L
            if bias:
                self.selector = nn.Sequential(L, BiasLayer(mix_dim, 0.5))
        else:
            self.selector = nn.Identity()
            if bias:
                self.selector = BiasLayer(mix_dim, 0.5)
        
        ### Now perform NxN mixing 
        num_layers = int(np.ceil(np.log2(mix_dim)))
        for i in range(num_layers):
            net = PairBilinear(mix_dim, grid_width)
            self.pairwise_mixing.append(net)
        self.pairwise_mixing = nn.ModuleList(self.pairwise_mixing)
        
        ### Now for reducer if any
        num_halve = int(np.ceil(np.log2(mix_dim/output_dim)))
        final_expand = output_dim*(2**num_halve)
        if final_expand != mix_dim:
            L = DimensionSelector(mix_dim, final_expand)
            self.reducer.append(L)
        for i in range(num_halve):
            L = PairBilinearHalve(final_expand//(2**i), grid_width)
            self.reducer.append(L)
            
        if len(self.reducer) == 0:
            self.reducer = nn.Identity()
        else:
            self.reducer = nn.Sequential(*self.reducer)
        
        pass
    
    def forward(self, x):
        '''
        x: shape-> [batch_size, input_dim]
        '''
        bs = x.shape[0]
        
        x = self.selector(x)
        
        y = x
        for i, fn in enumerate(self.pairwise_mixing):
            y = y.view(-1,2,2**i).permute(0, 2,1).contiguous().view(bs, -1)
            y = fn(y) 
            y = y.view(-1,2**i,2).permute(0, 2,1).contiguous()

#         y = x + y ## this is residual addition... remove if only want feed forward
        y = y.view(bs, -1)
        
        y = self.reducer(y)
        return y

In [354]:
pblm = PairBilinear_MixerBlock(16, 12, grid_width=3).to(device)

In [355]:
pblm

PairBilinear_MixerBlock(
  (selector): BiasLayer: [16]
  (pairwise_mixing): ModuleList(
    (0): PairLinear: [16 -> 16] (grid: 3)
    (1): PairLinear: [16 -> 16] (grid: 3)
    (2): PairLinear: [16 -> 16] (grid: 3)
    (3): PairLinear: [16 -> 16] (grid: 3)
  )
  (reducer): Sequential(
    (0): DimensionSelector: [+=8]
    (1): PairLinear: [24 -> 12] (grid: 3)
  )
)

In [356]:
y = pblm(torch.randn(3, 16).to(device))
y.mean().backward()

y

tensor([[-0.6882,  1.5015,  0.3411, -0.3667,  0.3505,  0.8888,  0.6034,  0.2338,
          0.8310,  0.9054, -0.2032,  0.1390],
        [ 0.9000,  0.8936,  1.3301,  1.4538,  1.5044,  1.3687,  0.0884, -0.1863,
          1.2508,  0.5169,  1.1569,  1.8822],
        [-0.0505, -0.1814, -0.5177,  2.0929,  0.0982,  1.7282,  0.9924, -0.1975,
          0.7790,  0.9535, -0.9159,  2.0932]], device='cuda:0',
       grad_fn=<ViewBackward>)

In [401]:
class FactorNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.la1 = PairBilinear_MixerBlock(784, 512, grid_width=5)
        self.bn1 = nn.BatchNorm1d(512)
#         self.la2 = PairBilinear_MixerBlock(200, 10, grid_width=5)
        self.la2 = nn.Linear(512, 10)
        
    def forward(self, x):
        x = self.la1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.la2(x)
        return x

In [402]:
class OrdinaryNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.la1 = nn.Linear(784, 200, bias=False)
        self.bn1 = nn.BatchNorm1d(200)
        self.la2 = nn.Linear(200, 10)
        
    def forward(self, x):
        x = self.bn1(self.la1(x))
        x = torch.relu(x)
        x = self.la2(x)
        return x

In [403]:
model = FactorNet()
param_count = sum([torch.numel(p) for p in model.parameters()])
param_count

126474

In [404]:
model = OrdinaryNet()
param_count1 = sum([torch.numel(p) for p in model.parameters()])
param_count1, param_count1/param_count

(159210, 1.2588358081502917)

## Dataset

In [405]:
train_transform = transforms.Compose([
            transforms.ToTensor(),
        ])
test_transform = transforms.Compose([
            transforms.ToTensor(),
        ])

train_dataset = datasets.FashionMNIST(root="../../../../_Datasets/FMNIST/", train=True, download=True, transform=train_transform)
test_dataset = datasets.FashionMNIST(root="../../../../_Datasets/FMNIST/", train=False, download=True, transform=test_transform)

In [406]:
LR = 0.0003
BS = 200

In [407]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BS, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BS, shuffle=False, num_workers=2)

In [408]:
## demo of train loader
xx, yy = iter(train_loader).next()
xx.shape

torch.Size([200, 1, 28, 28])

### Model Development

In [409]:
torch.manual_seed(0)
model = FactorNet().to(device)
# model = OrdinaryNet().to(device)
model

FactorNet(
  (la1): PairBilinear_MixerBlock(
    (selector): Sequential(
      (0): DimensionSelector: [+=240]
      (1): BiasLayer: [1024]
    )
    (pairwise_mixing): ModuleList(
      (0): PairLinear: [1024 -> 1024] (grid: 3)
      (1): PairLinear: [1024 -> 1024] (grid: 3)
      (2): PairLinear: [1024 -> 1024] (grid: 3)
      (3): PairLinear: [1024 -> 1024] (grid: 3)
      (4): PairLinear: [1024 -> 1024] (grid: 3)
      (5): PairLinear: [1024 -> 1024] (grid: 3)
      (6): PairLinear: [1024 -> 1024] (grid: 3)
      (7): PairLinear: [1024 -> 1024] (grid: 3)
      (8): PairLinear: [1024 -> 1024] (grid: 3)
      (9): PairLinear: [1024 -> 1024] (grid: 3)
    )
    (reducer): Sequential(
      (0): PairLinear: [1024 -> 512] (grid: 3)
    )
  )
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (la2): Linear(in_features=512, out_features=10, bias=True)
)

In [410]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

In [411]:
losses = []
train_accs = []
test_accs = []
EPOCHS = 20

for epoch in range(EPOCHS):
    
    train_acc = 0
    train_count = 0
    i = -1
    for xx, yy in tqdm(train_loader):
        i += 1 
        xx = xx.view(xx.shape[0], -1)
        xx, yy = xx.to(device), yy.to(device)

        yout = model(xx)
        loss = criterion(yout, yy)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(float(loss))

        outputs = torch.argmax(yout, dim=1).data.cpu().numpy()
        correct = (outputs == yy.data.cpu().numpy()).astype(float).sum()
        train_acc += correct
        train_count += len(outputs)
        

    train_accs.append(float(train_acc)/train_count*100)
    train_acc = 0
    train_count = 0

    print(f'Epoch: {epoch},  Loss:{float(loss)}')
    test_count = 0
    test_acc = 0
    for xx, yy in tqdm(test_loader):
        xx = xx.view(xx.shape[0], -1)
        xx, yy = xx.to(device), yy.to(device)
        with torch.no_grad():
            yout = model(xx)
        outputs = torch.argmax(yout, dim=1).data.cpu().numpy()
        correct = (outputs == yy.data.cpu().numpy()).astype(float).sum()
        test_acc += correct
        test_count += len(xx)
    test_accs.append(float(test_acc)/test_count*100)
    print(f'Train Acc:{train_accs[-1]:.2f}%, Test Acc:{test_accs[-1]:.2f}%')
    print()

### after each class index is finished training
print(f'\t-> Train Acc {max(train_accs)} ; Test Acc {max(test_accs)}')

100%|██████████| 300/300 [00:20<00:00, 15.00it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0,  Loss:0.4560028910636902


100%|██████████| 50/50 [00:01<00:00, 41.48it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:77.67%, Test Acc:83.75%



100%|██████████| 300/300 [00:20<00:00, 14.87it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 1,  Loss:0.3467607796192169


100%|██████████| 50/50 [00:01<00:00, 36.75it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:86.10%, Test Acc:85.95%



100%|██████████| 300/300 [00:20<00:00, 14.98it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 2,  Loss:0.32844313979148865


100%|██████████| 50/50 [00:00<00:00, 50.16it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:87.79%, Test Acc:86.91%



100%|██████████| 300/300 [00:19<00:00, 15.09it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 3,  Loss:0.264177531003952


100%|██████████| 50/50 [00:00<00:00, 56.48it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:88.78%, Test Acc:87.61%



100%|██████████| 300/300 [00:20<00:00, 14.87it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 4,  Loss:0.29283979535102844


100%|██████████| 50/50 [00:01<00:00, 39.50it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:89.50%, Test Acc:87.63%



100%|██████████| 300/300 [00:20<00:00, 14.64it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 5,  Loss:0.21122464537620544


100%|██████████| 50/50 [00:00<00:00, 53.54it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:90.19%, Test Acc:87.86%



100%|██████████| 300/300 [00:20<00:00, 14.95it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 6,  Loss:0.18957166373729706


100%|██████████| 50/50 [00:01<00:00, 47.00it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:90.56%, Test Acc:88.16%



100%|██████████| 300/300 [00:20<00:00, 14.92it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 7,  Loss:0.22201703488826752


100%|██████████| 50/50 [00:00<00:00, 55.51it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:91.09%, Test Acc:88.62%



100%|██████████| 300/300 [00:20<00:00, 14.95it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 8,  Loss:0.3216005563735962


100%|██████████| 50/50 [00:00<00:00, 56.47it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:91.55%, Test Acc:88.27%



100%|██████████| 300/300 [00:19<00:00, 15.07it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 9,  Loss:0.15288347005844116


100%|██████████| 50/50 [00:00<00:00, 57.05it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:91.81%, Test Acc:88.36%



100%|██████████| 300/300 [00:20<00:00, 14.99it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 10,  Loss:0.22441822290420532


100%|██████████| 50/50 [00:01<00:00, 45.51it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

Train Acc:92.16%, Test Acc:88.95%



 79%|███████▉  | 238/300 [00:16<00:04, 14.79it/s]


KeyboardInterrupt: 