In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D

import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

from tqdm import tqdm
from sklearn import datasets
import random, sys, os

In [3]:
import torch.optim as optim
from torch.utils import data
from torchvision import datasets, transforms

In [4]:
# device = torch.device("cuda:0")
device = torch.device("cuda:1")
# device = torch.device("cpu")

## Custom CUDA-Pytorch functions

In [5]:
import bmm2x2_cuda
import bilinear2x2_cuda

## Cuda -bmm2x2

In [6]:
class BMM2x2Function(torch.autograd.Function):
    @staticmethod
#     @torch.jit.ignore
    def forward(ctx, inputs, weights):
        outputs = bmm2x2_cuda.forward(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]
    
    @staticmethod
#     @torch.jit.ignore
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bmm2x2_cuda.backward(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [7]:
class PairLinear(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        assert input_dim%2 == 0, "Input dim must be even number"
        self.weight = torch.eye(2).unsqueeze(0).repeat_interleave(input_dim//2, dim=0)
        self.weight = nn.Parameter(self.weight)
        self.bmmfunc = BMM2x2Function()
        
    def forward(self, x):
        bs, dim = x.shape[0], x.shape[1]
        x = x.view(bs, -1, 2)
        x = BMM2x2Function.apply(x, self.weight)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.weight.shape[0]*2
        S = f'PairLinear: [{t} -> {t}]'
        return S

In [8]:
A = torch.randn(5, 8).to(device)
model = PairLinear(8).to(device)

In [9]:
model

PairLinear: [8 -> 8]

In [10]:
y = model(A)
y.mean().backward()
y

tensor([[-2.8143e-01, -1.7013e+00,  2.7567e-01,  1.2769e+00,  1.1258e-01,
          1.5338e+00,  1.2184e+00,  9.1244e-02],
        [ 8.8820e-02,  1.4520e-03,  6.8265e-01, -7.1067e-01, -1.7152e-01,
          9.6847e-01, -1.6379e+00, -1.1885e+00],
        [-3.6610e-01,  2.6684e-01,  8.7538e-01, -6.9132e-01, -8.3439e-01,
         -3.5767e-01, -3.8731e-01,  3.3278e+00],
        [ 5.8075e-01,  2.8850e-01,  6.6579e-01, -9.0714e-01,  3.4323e-01,
         -1.0848e+00, -4.5794e-01,  6.7424e-01],
        [ 8.4624e-01, -1.9748e+00,  5.4001e-01,  1.3055e+00,  6.7613e-01,
          3.4019e-01, -5.6331e-01, -1.4729e+00]], device='cuda:1',
       grad_fn=<ViewBackward0>)

In [11]:
model.weight.grad

tensor([[[ 0.0217,  0.0217],
         [-0.0780, -0.0780]],

        [[ 0.0760,  0.0760],
         [ 0.0068,  0.0068]],

        [[ 0.0032,  0.0032],
         [ 0.0350,  0.0350]],

        [[-0.0457, -0.0457],
         [ 0.0358,  0.0358]]], device='cuda:1')

### BMM 2x1

In [12]:
class BMM2x1Function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weights):
        outputs = bmm2x2_cuda.forward_2x1(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]
    
    @staticmethod
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bmm2x2_cuda.backward_2x1(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [13]:
class PairLinearHalve(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        assert input_dim%2 == 0, "Input dim must be even number"
        self.weight = torch.Tensor([0.5, 0.5]).unsqueeze(0).repeat_interleave(input_dim//2, dim=0)
        self.weight = nn.Parameter(self.weight)
        
    def forward(self, x):
        bs, dim = x.shape[0], x.shape[1]
        x = x.view(bs, -1, 2)
        x = BMM2x1Function.apply(x, self.weight)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.weight.shape[0]
        S = f'PairLinearHalve: [{t*2} -> {t}]'
        return S

In [14]:
A = torch.randn(5, 8).to(device)
model = PairLinearHalve(8).to(device)

In [15]:
model

PairLinearHalve: [8 -> 4]

In [16]:
y = model(A)
y.mean().backward()
y

tensor([[ 0.0042,  0.3912, -0.4051,  0.8895],
        [ 0.3113,  0.6497, -0.5928,  0.5920],
        [ 0.5781,  0.0147, -0.5225, -1.0799],
        [-1.1975, -0.6876,  0.1668,  0.2346],
        [-0.3488,  1.3876, -1.4109,  0.1269]], device='cuda:1',
       grad_fn=<ViewBackward0>)

In [17]:
model.weight.grad

tensor([[ 0.0894, -0.1547],
        [ 0.1048,  0.0708],
        [-0.2432, -0.0332],
        [ 0.0159,  0.0604]], device='cuda:1')

## Cuda - Bilinear2x2

In [18]:
class BiLinear2x2Function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weights):
        outputs = bilinear2x2_cuda.forward(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]

    @staticmethod
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bilinear2x2_cuda.backward(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [218]:
class PairBilinear(nn.Module):
    def __init__(self, dim, grid_width):
        super().__init__()
        num_pairs = dim // 2
        along_row = torch.linspace(0, 1, grid_width).reshape(1, -1).t()
        along_col = torch.linspace(0, 1, grid_width).reshape(-1, 1).t()
        
        self.pairW = torch.eye(2).unsqueeze(0).repeat_interleave(num_pairs, dim=0)
        self.pairW = nn.Parameter(self.pairW)
    
        self.Y = torch.stack([along_row+along_col*0, along_row*0+along_col])
        self.Y = torch.repeat_interleave(self.Y.unsqueeze(0), num_pairs, dim=0)
        self.Y = nn.Parameter(self.Y)
        
    def forward(self, x):
        bs = x.shape[0]
        
        x = x.view(bs, -1, 2)
#         x = BMM2x2Function.apply(x, self.pairW)
        ####################################################
        x = BiLinear2x2Function.apply(x, self.Y)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.pairW.shape[0]*2
        u = self.Y.shape[2]
        S = f'PairLinear: [{t} -> {t}] (grid: {u})'
        return S

In [219]:
A = torch.randn(5, 8).to(device)
model = PairBilinear(8, 3).to(device)

In [220]:
model

PairLinear: [8 -> 8] (grid: 3)

In [221]:
y = model(A)
y.mean().backward()

y

tensor([[ 2.6319, -0.9008,  2.0708, -0.5420,  1.2779, -1.0038, -1.0385, -0.6684],
        [ 0.7059, -0.2526, -0.5484, -0.9034,  0.3111, -0.6670,  0.5793, -0.8889],
        [-0.2838,  0.3112, -0.0567, -1.2892,  1.4162,  0.8061,  1.0327,  0.3577],
        [-1.8556, -1.5621, -0.2080,  0.2869,  0.2638, -0.6998, -0.6630, -0.6716],
        [-2.0038, -0.5239, -0.0471,  2.1041, -0.3635, -0.3394, -1.5490, -0.2572]],
       device='cuda:1', grad_fn=<ViewBackward0>)

In [222]:
model.Y.grad

tensor([[[[ 0.7569, -0.4747,  0.0000],
          [-0.7996,  0.5256,  0.0000],
          [ 0.3141, -0.1972,  0.0000]],

         [[ 0.7569, -0.4747,  0.0000],
          [-0.7996,  0.5256,  0.0000],
          [ 0.3141, -0.1972,  0.0000]]],


        [[[ 0.2618, -0.2066,  0.0878],
          [-0.2031,  0.1141, -0.0075],
          [ 0.1637, -0.0851,  0.0000]],

         [[ 0.2618, -0.2066,  0.0878],
          [-0.2031,  0.1141, -0.0075],
          [ 0.1637, -0.0851,  0.0000]]],


        [[[ 0.1229, -0.0584,  0.0000],
          [-0.0044, -0.0070, -0.0127],
          [ 0.1170, -0.0603,  0.0280]],

         [[ 0.1229, -0.0584,  0.0000],
          [-0.0044, -0.0070, -0.0127],
          [ 0.1170, -0.0603,  0.0280]]],


        [[[ 0.4712, -0.2336,  0.0000],
          [-0.2583,  0.1152,  0.0000],
          [ 0.0186,  0.0120,  0.0000]],

         [[ 0.4712, -0.2336,  0.0000],
          [-0.2583,  0.1152,  0.0000],
          [ 0.0186,  0.0120,  0.0000]]]], device='cuda:1')

### Cuda - Bilinear2x1

In [223]:
class BiLinear2x1Function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weights):
        outputs = bilinear2x2_cuda.forward_2x1(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]

    @staticmethod
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bilinear2x2_cuda.backward_2x1(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [224]:
class PairBilinearHalve(nn.Module):
    def __init__(self, dim, grid_width):
        super().__init__()
        num_pairs = dim // 2
        
        self.pairW = torch.eye(2).unsqueeze(0).repeat_interleave(num_pairs, dim=0)
        self.pairW = nn.Parameter(self.pairW)

        along_row = torch.linspace(0, 1, grid_width).reshape(1, -1).t()
        along_col = torch.linspace(0, 1, grid_width).reshape(-1, 1).t()
        
        self.Y = torch.stack([along_row+along_col*0, along_row*0+along_col]).mean(dim=0)
        self.Y = torch.repeat_interleave(self.Y.unsqueeze(0), num_pairs, dim=0)
        self.Y = nn.Parameter(self.Y)
        
    
    def forward(self, x):
        bs = x.shape[0]
        
        x = x.view(bs, -1, 2)
#         x = BMM2x2Function.apply(x, self.pairW)
        ####################################################
        x = BiLinear2x1Function.apply(x, self.Y)
        x = x.view(bs, -1)
        return x
    
    def __repr__(self):
        t = self.pairW.shape[0]
        u = self.Y.shape[2]
        S = f'PairLinear: [{t*2} -> {t}] (grid: {u})'
        return S

In [225]:
A = torch.randn(5, 8).to(device)
model = PairBilinearHalve(8, 3).to(device)

In [226]:
model

PairLinear: [8 -> 4] (grid: 3)

In [227]:
y = model(A)
y.mean().backward()

y

tensor([[-0.5034,  0.5507,  0.2926, -0.2328],
        [-0.1249, -0.7540, -0.0688, -0.3511],
        [-0.7080, -0.8221, -0.8722, -0.8480],
        [ 0.0894, -0.0411, -0.7886,  0.7815],
        [-0.7506, -0.8115, -0.1640, -0.9617]], device='cuda:1',
       grad_fn=<ViewBackward0>)

In [228]:
model.Y.grad

tensor([[[ 0.8354, -0.4413,  0.0000],
         [-0.3300,  0.1859,  0.0000],
         [ 0.0000,  0.0000,  0.0000]],

        [[ 0.7969, -0.4051,  0.0000],
         [-1.0159,  0.6316,  0.0000],
         [ 0.9452, -0.7027,  0.0000]],

        [[ 0.5055,  0.0808,  0.0000],
         [-0.2938, -0.0803,  0.0000],
         [ 0.0602, -0.0223,  0.0000]],

        [[ 0.6779, -0.4217,  0.0000],
         [-0.0930,  0.0293,  0.0080],
         [ 0.0542, -0.0201,  0.0154]]], device='cuda:1')

## Modules and Layers

In [229]:
class BiasLayer(nn.Module):
    def __init__(self, dim, init_val=0):
        super().__init__()
        self.bias = nn.Parameter(torch.ones(dim)*init_val)
        
    def forward(self, x):
        return x+self.bias
    
    def __repr__(self):
        S = f'BiasLayer: [{self.bias.shape[0]}]'
        return S

In [230]:
class DimensionSelector(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        assert output_dim > input_dim, "Slection does not select all inputs"
        remain = output_dim-input_dim
        
        scale = int(np.ceil(output_dim/input_dim)-1)
#         self.indices = torch.randperm(input_dim*scale)[:remain]%input_dim

        self.indices = torch.LongTensor([])
        for i in range(scale):
            c = min(input_dim, remain-len(self.indices))
            t = torch.randperm(input_dim)[:c]
            self.indices = torch.cat([self.indices, t])
            
        
    def forward(self, x):
        ## x.shape = [batch_size, input_dim]
        return torch.cat([x, x[:, self.indices]], dim=1)
    
    def __repr__(self):
        S = f'DimensionSelector: [+={self.indices.shape[0]}]'
        return S

In [231]:
ds = DimensionSelector(8, 32)

In [232]:
ds.indices.unique(return_counts=True)

(tensor([0, 1, 2, 3, 4, 5, 6, 7]), tensor([3, 3, 3, 3, 3, 3, 3, 3]))

In [233]:
ds.indices

tensor([0, 4, 3, 5, 1, 6, 7, 2, 3, 2, 4, 0, 5, 1, 7, 6, 4, 1, 6, 5, 7, 3, 0, 2])

In [295]:
class PairBilinear_MixerBlock(nn.Module):
    
    '''
    Handle any input - output size;
    
    Operations -> Select, NxN mix, Halve
    
    -Edge cases:
    1) 8-8 -> NxN mixing for log(N) times
    2) 8-10 -> Select(16) + 16x16 + Select(20) + Halve
    3) 8-6 -> 8x8 + Select(12) + Halve
    4) 8-32 -> Select(32) + 32x32
    5) 8-3 -> 8x8 + Halve + 4-Select(6) + Halve
    
    '''
    
    def __init__(self, input_dim, output_dim, grid_width, bias=True):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.grid_width = grid_width
        
        self.selector = None
        self.pairwise_mixing = []
        self.reducer = []
        
        mix_dim = 2**int(np.ceil(np.log2(max(input_dim, output_dim))))
        
        #########################################################
        ### Find out if first selection is required or Not !
        if self.input_dim != mix_dim:
            ## Input dimension is not power of 2; requires selector to project to mixing dimension
            L = DimensionSelector(input_dim, mix_dim)
            self.selector = L
            if bias:
                self.selector = nn.Sequential(L, BiasLayer(mix_dim, 0.5))
        else:
            self.selector = nn.Identity()
            if bias:
                self.selector = BiasLayer(mix_dim, 0.5)
        
        ### Now perform NxN mixing 
        num_layers = int(np.ceil(np.log2(mix_dim)))
        for i in range(num_layers):
            net = PairBilinear(mix_dim, grid_width)
            self.pairwise_mixing.append(net)
        self.pairwise_mixing = nn.ModuleList(self.pairwise_mixing)
        
        ### Now for reducer if any
        num_halve = int(np.ceil(np.log2(mix_dim/output_dim)))
        final_expand = output_dim*(2**num_halve)
        if final_expand != mix_dim:
            L = DimensionSelector(mix_dim, final_expand)
            self.reducer.append(L)
        for i in range(num_halve):
#             L = PairBilinearHalve(final_expand//(2**i), grid_width)
            L = PairLinearHalve(final_expand//(2**i))
            self.reducer.append(L)
            
        if len(self.reducer) == 0:
            self.reducer = nn.Identity()
        else:
            self.reducer = nn.Sequential(*self.reducer)
        
        pass
    
    def forward(self, x):
        '''
        x: shape-> [batch_size, input_dim]
        '''
        bs = x.shape[0]
        
        x = self.selector(x)
        
        y = x
        for i, fn in enumerate(self.pairwise_mixing):
            y = y.view(-1,2,2**i).permute(0, 2,1).contiguous().view(bs, -1)
            y = fn(y)
            y = y.view(-1,2**i,2).permute(0, 2,1).contiguous()

        y = y.view(bs, -1)
        y = x + y ## this is residual addition... remove if only want feed forward
        y = self.reducer(y)
        return y

In [296]:
pblm = PairBilinear_MixerBlock(16, 12, grid_width=3).to(device)

In [297]:
pblm

PairBilinear_MixerBlock(
  (selector): BiasLayer: [16]
  (pairwise_mixing): ModuleList(
    (0): PairLinear: [16 -> 16] (grid: 3)
    (1): PairLinear: [16 -> 16] (grid: 3)
    (2): PairLinear: [16 -> 16] (grid: 3)
    (3): PairLinear: [16 -> 16] (grid: 3)
  )
  (reducer): Sequential(
    (0): DimensionSelector: [+=8]
    (1): PairLinearHalve: [24 -> 12]
  )
)

In [298]:
y = pblm(torch.randn(3, 16).to(device))
y.mean().backward()

y

tensor([[ 0.7938,  3.2138,  2.3757,  0.8527,  1.9313,  1.9148, -0.2545,  1.1918,
          1.6951,  1.9313,  4.1814,  1.5474],
        [ 0.3515,  0.4075,  0.0384,  3.4861,  0.1212,  3.7080,  1.4738,  0.4823,
          2.6307,  0.1212,  1.0725, -0.1545],
        [-1.2820,  4.1760, -0.3801,  0.4690,  2.0405,  1.9404, -0.3773, -0.6107,
          2.2265,  2.0405,  2.0836,  0.0365]], device='cuda:1',
       grad_fn=<ViewBackward0>)

In [315]:
class FactorNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.la0 = PairBilinear(784, 50)
        self.la1 = PairBilinear_MixerBlock(784, 512, grid_width=10)
        self.bn1 = nn.BatchNorm1d(512)
#         self.bn1 = nn.LayerNorm(512)
#         self.la2 = PairBilinear_MixerBlock(512, 10, grid_width=50)
        self.la2 = nn.Linear(512, 10)
        
    def forward(self, x):
        x = self.la0(x)
        x = self.la1(x)
        x = self.bn1(x)
#         x = torch.relu(x)
        x = self.la2(x)
        return x

In [316]:
class OrdinaryNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.la1 = nn.Linear(784, 200, bias=False)
        self.bn1 = nn.BatchNorm1d(200)
        self.la2 = nn.Linear(200, 10)
        
    def forward(self, x):
        x = self.bn1(self.la1(x))
        x = torch.relu(x)
        x = self.la2(x)
        return x

In [317]:
model = FactorNet()
param_count = sum([torch.numel(p) for p in model.parameters()])
param_count

3014250

In [318]:
model = OrdinaryNet()
param_count1 = sum([torch.numel(p) for p in model.parameters()])
param_count1, param_count1/param_count

(159210, 0.05281910923115203)

## Dataset

In [319]:
train_transform = transforms.Compose([
            transforms.ToTensor(),
        ])
test_transform = transforms.Compose([
            transforms.ToTensor(),
        ])

train_dataset = datasets.FashionMNIST(root="../../../../_Datasets/FMNIST/", train=True, download=True, transform=train_transform)
test_dataset = datasets.FashionMNIST(root="../../../../_Datasets/FMNIST/", train=False, download=True, transform=test_transform)

In [326]:
LR = 0.0001
BS = 50

In [327]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BS, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BS, shuffle=False, num_workers=2)

In [328]:
## demo of train loader
xx, yy = iter(train_loader).next()
xx.shape

torch.Size([50, 1, 28, 28])

### Model Development

In [329]:
torch.manual_seed(0)
model = FactorNet().to(device)
# model = OrdinaryNet().to(device)
model

FactorNet(
  (la0): PairLinear: [784 -> 784] (grid: 50)
  (la1): PairBilinear_MixerBlock(
    (selector): Sequential(
      (0): DimensionSelector: [+=240]
      (1): BiasLayer: [1024]
    )
    (pairwise_mixing): ModuleList(
      (0): PairLinear: [1024 -> 1024] (grid: 10)
      (1): PairLinear: [1024 -> 1024] (grid: 10)
      (2): PairLinear: [1024 -> 1024] (grid: 10)
      (3): PairLinear: [1024 -> 1024] (grid: 10)
      (4): PairLinear: [1024 -> 1024] (grid: 10)
      (5): PairLinear: [1024 -> 1024] (grid: 10)
      (6): PairLinear: [1024 -> 1024] (grid: 10)
      (7): PairLinear: [1024 -> 1024] (grid: 10)
      (8): PairLinear: [1024 -> 1024] (grid: 10)
      (9): PairLinear: [1024 -> 1024] (grid: 10)
    )
    (reducer): Sequential(
      (0): PairLinearHalve: [1024 -> 512]
    )
  )
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (la2): Linear(in_features=512, out_features=10, bias=True)
)

In [330]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [331]:
losses = []
train_accs = []
test_accs = []
EPOCHS = 20

for epoch in range(EPOCHS):
    
    train_acc = 0
    train_count = 0
    i = -1
    for xx, yy in tqdm(train_loader):
        i += 1 
        xx = xx.view(xx.shape[0], -1)
        xx, yy = xx.to(device), yy.to(device)

        yout = model(xx)
        loss = criterion(yout, yy)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(float(loss))

        outputs = torch.argmax(yout, dim=1).data.cpu().numpy()
        correct = (outputs == yy.data.cpu().numpy()).astype(float).sum()
        train_acc += correct
        train_count += len(outputs)
        
        if torch.any(torch.isnan(yout.data)):
            print(f"NAN values found")
        

    train_accs.append(float(train_acc)/train_count*100)
    train_acc = 0
    train_count = 0

    print(f'Epoch: {epoch},  Loss:{float(loss)}')
    test_count = 0
    test_acc = 0
    for xx, yy in tqdm(test_loader):
        xx = xx.view(xx.shape[0], -1)
        xx, yy = xx.to(device), yy.to(device)
        with torch.no_grad():
            yout = model(xx)
        outputs = torch.argmax(yout, dim=1).data.cpu().numpy()
        correct = (outputs == yy.data.cpu().numpy()).astype(float).sum()
        test_acc += correct
        test_count += len(xx)
    test_accs.append(float(test_acc)/test_count*100)
    print(f'Train Acc:{train_accs[-1]:.2f}%, Test Acc:{test_accs[-1]:.2f}%')
    print()

### after each class index is finished training
print(f'\t-> Train Acc {max(train_accs)} ; Test Acc {max(test_accs)}')

100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.78it/s]


Epoch: 0,  Loss:0.43971577286720276


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 354.59it/s]


Train Acc:80.55%, Test Acc:84.55%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.38it/s]


Epoch: 1,  Loss:0.31764519214630127


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 331.88it/s]


Train Acc:86.80%, Test Acc:85.78%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.10it/s]


Epoch: 2,  Loss:0.348450243473053


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 380.90it/s]


Train Acc:88.95%, Test Acc:86.62%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.83it/s]


Epoch: 3,  Loss:0.15511386096477509


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 339.63it/s]


Train Acc:90.67%, Test Acc:87.30%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.89it/s]


Epoch: 4,  Loss:0.37110352516174316


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 371.04it/s]


Train Acc:92.05%, Test Acc:87.15%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.44it/s]


Epoch: 5,  Loss:0.13232402503490448


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 349.97it/s]


Train Acc:93.28%, Test Acc:87.16%



100%|█████████████████████████████████████████████████| 1200/1200 [00:13<00:00, 92.15it/s]


Epoch: 6,  Loss:0.24082571268081665


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 371.56it/s]


Train Acc:94.41%, Test Acc:86.97%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.79it/s]


Epoch: 7,  Loss:0.1541043221950531


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 341.96it/s]


Train Acc:95.03%, Test Acc:87.34%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.53it/s]


Epoch: 8,  Loss:0.16107267141342163


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 362.48it/s]


Train Acc:95.89%, Test Acc:87.21%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.17it/s]


Epoch: 9,  Loss:0.07172103971242905


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 366.95it/s]


Train Acc:96.58%, Test Acc:87.28%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.57it/s]


Epoch: 10,  Loss:0.09318865835666656


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 354.58it/s]


Train Acc:97.03%, Test Acc:86.68%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.35it/s]


Epoch: 11,  Loss:0.15648604929447174


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 376.05it/s]


Train Acc:97.17%, Test Acc:86.91%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.29it/s]


Epoch: 12,  Loss:0.017733529210090637


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 346.68it/s]


Train Acc:97.30%, Test Acc:86.68%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.87it/s]


Epoch: 13,  Loss:0.13603618741035461


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 356.62it/s]


Train Acc:96.89%, Test Acc:86.49%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.91it/s]


Epoch: 14,  Loss:0.1014559343457222


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 353.24it/s]


Train Acc:97.72%, Test Acc:86.94%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.47it/s]


Epoch: 15,  Loss:0.0994148924946785


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 375.85it/s]


Train Acc:98.08%, Test Acc:86.56%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.53it/s]


Epoch: 16,  Loss:0.29445207118988037


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 347.40it/s]


Train Acc:98.18%, Test Acc:86.80%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.98it/s]


Epoch: 17,  Loss:0.11577855050563812


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 374.31it/s]


Train Acc:98.18%, Test Acc:86.69%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 92.65it/s]


Epoch: 18,  Loss:0.050337500870227814


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 343.23it/s]


Train Acc:98.62%, Test Acc:86.65%



100%|█████████████████████████████████████████████████| 1200/1200 [00:12<00:00, 93.22it/s]


Epoch: 19,  Loss:0.016569487750530243


100%|██████████████████████████████████████████████████| 200/200 [00:00<00:00, 375.88it/s]

Train Acc:98.68%, Test Acc:86.65%

	-> Train Acc 98.68166666666667 ; Test Acc 87.33999999999999





In [310]:
yout

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], device='cuda:1',
       grad_fn=<AddmmBackward0>)