In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
# from mpl_toolkits.mplot3d import Axes3D

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
from torchvision import datasets, transforms as T

import random, os, pathlib, time
from tqdm import tqdm
# from sklearn import datasets

In [2]:
# device = torch.device("cuda:0")
device = torch.device("cpu")

In [3]:
from tqdm import tqdm
import os, time, sys
import json

In [4]:
import dtnnlib as dtnn

In [5]:
mnist_transform = T.Compose([
    T.ToTensor(),
    T.Normalize(
        mean=[0.5,],
        std=[0.5,],
    ),
])

# train_dataset = datasets.FashionMNIST(root="../../../_Datasets/", train=True, download=True, transform=mnist_transform)
# test_dataset = datasets.FashionMNIST(root="../../../_Datasets/", train=False, download=True, transform=mnist_transform)
train_dataset = datasets.MNIST(root="../../../_Datasets/", train=True, download=True, transform=mnist_transform)
test_dataset = datasets.MNIST(root="../../../_Datasets/", train=False, download=True, transform=mnist_transform)

In [6]:
batch_size = 50
train_loader = data.DataLoader(dataset=train_dataset, num_workers=4, batch_size=batch_size, shuffle=True)
test_loader = data.DataLoader(dataset=test_dataset, num_workers=4, batch_size=batch_size, shuffle=False)

In [7]:
for xx, yy in train_loader:
    xx, yy = xx.to(device), yy.to(device)
    print(xx.shape, yy.shape)
    break

torch.Size([50, 1, 28, 28]) torch.Size([50])


## 1 Layer epsilon Softmax MLP

In [8]:
class DistanceTransform_Epsilon(dtnn.DistanceTransformBase):
    
    def __init__(self, input_dim, num_centers, p=2, bias=False, epsilon=0.1):
        super().__init__(input_dim, num_centers, p=2)
        
        nc = num_centers
        if epsilon is not None:
            nc += 1
        self.scaler = nn.Parameter(torch.log(torch.ones(1, 1)*1))
        self.bias = nn.Parameter(torch.ones(1, nc)*0) if bias else None
        self.epsilon = epsilon
        
    def forward(self, x):
        dists = super().forward(x)
        
        if self.epsilon is not None:
            dists = torch.cat([dists, torch.ones(len(x), 1, dtype=x.dtype)*self.epsilon], dim=1)
        
        ### normalize similar to UMAP
        dists = dists/torch.sqrt(dists.var(dim=1, keepdim=True)+1e-9)
        
        ## scale the dists
#         dists = torch.exp(-dists + self.scaler)
        dists = 1-dists*torch.exp(self.scaler)
    
        if self.bias is not None: dists = dists+self.bias
        return dists

In [9]:
class DTeSM(DistanceTransform_Epsilon):
    
    def __init__(self, input_dim, output_dim, epsilon=1.0):
        super().__init__(input_dim, output_dim, bias=True, epsilon=epsilon)
        
        self.scale_shift = dtnn.ScaleShift(-1, scaler_init=3, shifter_init=0, scaler_const=True, shifter_const=True)
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self, x):
        xo = super().forward(x)
        xo = self.scale_shift(xo)
        xo = self.softmax(xo)
        return xo

In [10]:
DTeSM(2, 5, 0.1)(torch.randn(1, 2)).shape

torch.Size([1, 6])

In [11]:
class LocalMLP_epsilonsoftmax(nn.Module):
    
    def __init__(self, input_dim, hidden_dim0, hidden_dim1, output_dim, epsilon=1.0):
        super().__init__()
        
        self.layer0 = DTeSM(input_dim, hidden_dim0, epsilon=epsilon)
        
        if epsilon is not None:
            hidden_dim0 += 1
        
        self.layer1 = DTeSM(hidden_dim0, hidden_dim1, epsilon=epsilon)
        
        if epsilon is not None:
            hidden_dim1 += 1
        
#         self.activ = dtnn.OneActiv(hdim, mode='relu', beta_init=np.log(1.2))
#         self.activ = nn.ReLU()

        self.layer2 = nn.Linear(hidden_dim1, output_dim)
        pass
        
    def forward(self, x):
        xo = self.layer0(x)
        xo = self.layer1(xo)
        xo = self.layer2(xo)
        return xo

In [12]:
h0 = 200
h1 = 60
model = LocalMLP_epsilonsoftmax(784, h0, h1, 10, epsilon=None)

In [13]:
model.to(device)

LocalMLP_epsilonsoftmax(
  (layer0): DTeSM(
    (scale_shift): ScaleShift()
    (softmax): Softmax(dim=-1)
  )
  (layer1): DTeSM(
    (scale_shift): ScaleShift()
    (softmax): Softmax(dim=-1)
  )
  (layer2): Linear(in_features=60, out_features=10, bias=True)
)

In [14]:
model(xx.reshape(-1, 28*28)).shape

torch.Size([50, 10])

In [15]:
def get_random_training_samples(N):
    new_center = []
    new_labels = []
    count = 0
    for i, (xx, yy) in enumerate(train_loader):
        xx = xx.reshape(xx.shape[0], -1)
        if count+xx.shape[0] < N:
            new_center.append(xx)
            new_labels.append(yy)
            count += xx.shape[0]
        elif count >= N:
            break
        else:
            new_center.append(xx[:N-count])
            new_labels.append(yy[:N-count])
            count = N
            break

    new_center = torch.cat(new_center, dim=0)
    new_labels = torch.cat(new_labels, dim=0)
    
    weights = torch.zeros(len(new_labels), 10)
    for i in range(len(new_labels)):
        weights[i, new_labels[i]] = 1.
    
    return new_center.to(device), weights.to(device)

In [16]:
get_random_training_samples(2)

(tensor([[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.]]),
 tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]]))

In [17]:
c0, _ = get_random_training_samples(h0)

In [18]:
## first layer
model.layer0.centers.data = c0.to(model.layer0.centers.device)

In [19]:
c1, v1 = get_random_training_samples(h1)

In [20]:
## second layer
model.layer1.centers.data = model.layer0(c1.to(device))
model.layer2.weight.data = v1.t().to(device)

In [21]:
def train(epoch, model, optimizer):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(tqdm(train_loader)):
        inputs, targets = inputs.to(device).view(-1, 28*28), targets.to(device)
        
        ### Train with random image and "10" as class
#         inputs = torch.cat([inputs, torch.rand(batch_size//10, 28*28, dtype=inputs.dtype).to(device)*2-1], dim=0)
#         targets = torch.cat([targets, torch.ones(batch_size//10, dtype=targets.dtype).to(device)*10], dim=0)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    print(f"[Train] {epoch} Loss: {train_loss/(batch_idx+1):.3f} | Acc: {100.*correct/total:.3f} {correct}/{total}")
    return

In [22]:
best_acc = -1
def test(epoch, model):
    global best_acc
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
#         for batch_idx, (inputs, targets) in enumerate(tqdm(test_loader)):
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device).view(-1, 28*28), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
    print(f"[Test] {epoch} Loss: {test_loss/(batch_idx+1):.3f} | Acc: {100.*correct/total:.3f} {correct}/{total}")
    
    acc = 100.*correct/total
    return acc

In [23]:
model.eval()

LocalMLP_epsilonsoftmax(
  (layer0): DTeSM(
    (scale_shift): ScaleShift()
    (softmax): Softmax(dim=-1)
  )
  (layer1): DTeSM(
    (scale_shift): ScaleShift()
    (softmax): Softmax(dim=-1)
  )
  (layer2): Linear(in_features=60, out_features=10, bias=True)
)

In [24]:
criterion = nn.CrossEntropyLoss()

In [25]:
test_acc = test(0, model)
test_acc

[Test] 0 Loss: 2.122 | Acc: 34.080 3408/10000


34.08

### Model Training - evaluation

In [26]:
learning_rate = 0.01
# EPOCHS = 10
EPOCHS = 0

In [27]:
p1, p2 = [], []
for p in model.named_parameters():
    if p[0].endswith(".centers"):
        p1.append(p[1])
    else:
        p2.append(p[1])

params = [
    {"params": p1, "lr": learning_rate*0.03}, ## default - to change little from data point
#     {"params": p1},
    {"params": p2},
]

In [28]:
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

optimizer = torch.optim.Adam(params, lr=learning_rate)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

best_acc = -1
for epoch in range(EPOCHS):
    train(epoch, model, optimizer)
    test(epoch, model)
    scheduler.step()
    
"""
Note: It trains to about 95% on MNIST
"""
print()




## Multilayer Noisy Selection (Type 1)

In [29]:
"""
TYPE 1: DT>eSM>DT>eSM>V
Type 2:  /DT>eSM>S\
        X---------+\>eSM>V
"""
print()




## Add new centers to the model

In [30]:
"""
1) initialize the models with data0, data1
2) to add neurons to dt0, calculate respective activation with data1 and initialize the dt1
    - for dt1, input dim increase and output dim stay same ;
    - the hidden activation changes slightly..
    
3) to add neurons to dt1, same as 2 layer addition..
"""
print()




In [31]:
h0 = 200
h1 = 60
model = LocalMLP_epsilonsoftmax(784, h0, h1, 10, epsilon=None).to(device)

In [32]:
N_search0 = 30
N_search1 = 10

### Initialization

In [33]:
c0, _ = get_random_training_samples(h0)

In [34]:
model.layer0.centers.data = c0.to(model.layer0.centers.device)

In [35]:
c1, v1 = get_random_training_samples(h1)

In [36]:
model.layer1.centers.data = model.layer0(c1.to(device))
model.layer2.weight.data = v1.t().to(device)

In [37]:
test_acc = test(0, model)

[Test] 0 Loss: 2.125 | Acc: 34.100 3410/10000


In [38]:
## Initialize to take the new_activation info of centers in layer 1..
### The activation distribution changes due to entangled(softmax and normalization in DTeSM)
def add_neurons_to_layer0(model, centers0, old_centers1): ## the parameters should not change
    
    ########## LAYER 0 ############
    c0 = torch.cat((model.layer0.centers.data, centers0), dim=0)
    s0 = torch.cat([model.layer0.bias.data, torch.ones(1, len(centers0))*0], dim=1)

    model.layer0.centers.data = c0
    model.layer0.bias.data = s0

#     v = torch.cat((model.layer1.weight.data, values.t()), dim=1)
#     model.layer1.weight.data = v

    ########## LAYER 1 ############
#     c1 = torch.cat((model.layer1.centers.data, model.layer0(centers1)), dim=0) ## initial: add center activation1
#     all_centers = torch.cat([old_centers1, centers1], dim=0)
#     c1 = model.layer0(all_centers) ## initial: add center activation1
    
    model.layer1.centers.data = model.layer0(old_centers1)
    pass

In [39]:
def add_neurons_to_layer1(model, centers1, values1, old_centers1):
    all_centers = torch.cat([old_centers1, centers1], dim=0)
    
#     c1 = torch.cat((model.layer1.centers.data, model.layer0(centers1)), dim=0)
    c1 = model.layer0(all_centers)
    s1 = torch.cat([model.layer1.bias.data, torch.ones(1, len(centers1))*0], dim=1)
    v = torch.cat((model.layer2.weight.data, values1.t()), dim=1)

    model.layer1.centers.data = c1
    model.layer1.bias.data = s1
    model.layer2.weight.data = v
    
    return all_centers

In [40]:
model.layer0.centers.data.shape, model.layer1.centers.data.shape, model.layer2.weight.data.shape

(torch.Size([200, 784]), torch.Size([60, 200]), torch.Size([10, 60]))

In [41]:
_c0 = get_random_training_samples(N_search0)[0]
add_neurons_to_layer0(model, _c0, c1)

In [42]:
model.layer0.centers.data.shape, model.layer1.centers.data.shape

(torch.Size([230, 784]), torch.Size([60, 230]))

In [43]:
test_acc2 = test(0, model)
test_acc2, test_acc ### ?? why does adding new centers to layer0 reduce the accuracy ??

[Test] 0 Loss: 2.142 | Acc: 34.170 3417/10000


(34.17, 34.1)

In [44]:
### Add neurons to second layer
_c1, _v1 = get_random_training_samples(N_search1)
c1 = add_neurons_to_layer1(model, _c1, _v1, c1)

In [45]:
model.layer1.centers.data.shape, model.layer2.weight.data.shape

(torch.Size([70, 230]), torch.Size([10, 70]))

In [46]:
test_acc3 = test(0, model)
test_acc3, test_acc2, test_acc

[Test] 0 Loss: 2.128 | Acc: 37.350 3735/10000


(37.35, 34.17, 34.1)

In [47]:
# asdsadsd

## Calculate Neuron Significance

In [48]:
layer_keys = [model.layer0, model.layer1]
outputs = {k.softmax:None for k in layer_keys}
gradients = {k.softmax:None for k in layer_keys}

def capture_outputs(module, inp, out):
    global outputs
    outputs[module] = out.data.cpu()

def capture_gradients(module, gradi, grado):
    global gradients
    gradients[module] = grado[0].data.cpu()
        
forw_hooks = [k.softmax.register_forward_hook(capture_outputs) for k in layer_keys]
back_hooks = [k.softmax.register_backward_hook(capture_gradients) for k in layer_keys]

def remove_hook():
    for hook in forw_hooks+back_hooks:
        hook.remove()
    pass

In [49]:
significance = {k.softmax:torch.zeros(k.centers.shape[0]) for k in layer_keys}
significance

{Softmax(dim=-1): tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0

In [50]:
for xx, yy in train_loader:
    xx, yy = xx.to(device).view(-1, 28*28), yy.to(device)
    print(xx.shape, yy.shape)
    break

torch.Size([50, 784]) torch.Size([50])


In [51]:
yout = model(xx)
yout.shape

torch.Size([50, 10])

In [52]:
for n, p in model.named_parameters():
    print(n,"\t" ,p.shape)

layer0.centers 	 torch.Size([230, 784])
layer0.scaler 	 torch.Size([1, 1])
layer0.bias 	 torch.Size([1, 230])
layer1.centers 	 torch.Size([70, 230])
layer1.scaler 	 torch.Size([1, 1])
layer1.bias 	 torch.Size([1, 70])
layer2.weight 	 torch.Size([10, 70])
layer2.bias 	 torch.Size([10])


In [53]:
def none_grad():
    for p in model.parameters():
        p.grad = None

In [54]:
none_grad()
yout.register_hook(lambda grad: grad/torch.norm(grad, dim=1, keepdim=True))

# grad = torch.randn_like(yout)
# ### grad = grad/torch.norm(grad, dim=1, keepdim=True)
# yout.backward(gradient=grad, retain_graph=False)

criterion(yout, yy).backward()

In [55]:
remove_hook()

In [56]:
outputs

{Softmax(dim=-1): tensor([[2.4899e-06, 2.3023e-06, 5.2259e-07,  ..., 4.7741e-05, 5.2200e-07,
          4.7811e-05],
         [2.3233e-05, 7.4446e-04, 2.2435e-05,  ..., 3.5531e-06, 3.5619e-05,
          3.5813e-06],
         [1.4479e-06, 1.8525e-03, 6.4492e-04,  ..., 5.3255e-07, 2.3857e-05,
          5.5373e-07],
         ...,
         [6.2687e-08, 1.1864e-05, 2.0976e-05,  ..., 4.2574e-07, 4.6926e-05,
          7.9913e-08],
         [5.5233e-06, 1.8968e-04, 7.4183e-06,  ..., 3.4797e-07, 3.3399e-06,
          1.8670e-06],
         [4.2201e-06, 5.5846e-05, 4.3116e-06,  ..., 8.1915e-07, 1.6886e-05,
          4.7456e-07]]),
 Softmax(dim=-1): tensor([[2.8351e-04, 4.2182e-02, 5.4159e-03,  ..., 3.3260e-02, 3.5072e-03,
          7.1325e-06],
         [1.3547e-04, 2.7466e-02, 2.0285e-03,  ..., 1.6792e-02, 3.7371e-03,
          6.7511e-06],
         [2.6094e-04, 7.5014e-02, 5.0708e-03,  ..., 3.5155e-02, 3.5860e-03,
          7.4689e-06],
         ...,
         [5.4296e-05, 6.3891e-03, 8.8972e-04,

In [57]:
gradients

{Softmax(dim=-1): tensor([[ 0.0024,  0.0278,  0.0051,  ...,  0.0099,  0.0051, -0.0318],
         [ 0.0029,  0.0091,  0.0037,  ...,  0.0072,  0.0112,  0.0094],
         [ 0.0017, -0.0048,  0.0004,  ...,  0.0019,  0.0022,  0.0029],
         ...,
         [-0.0009, -0.0065, -0.0022,  ...,  0.1011, -0.0034, -0.0014],
         [ 0.0048,  0.0249,  0.0032,  ...,  0.0058,  0.0060,  0.0077],
         [ 0.0049, -0.0034,  0.0040,  ...,  0.0083,  0.0080,  0.0097]]),
 Softmax(dim=-1): tensor([[ 0.0957,  0.1036,  0.0915,  ...,  0.1036,  0.1059, -0.9484],
         [ 0.0956,  0.0996,  0.0919,  ...,  0.0996,  0.1213,  0.1111],
         [ 0.0942,  0.1063,  0.0900,  ...,  0.1063,  0.1043,  0.1068],
         ...,
         [ 0.2094,  0.0853,  0.0847,  ...,  0.0853,  0.0962,  0.0880],
         [ 0.0952,  0.1004,  0.0915,  ...,  0.1004, -0.9483,  0.1118],
         [ 0.0954,  0.1034,  0.0913,  ...,  0.1034,  0.1053,  0.1073]])}

In [58]:
with torch.no_grad():
    for k in layer_keys:
        lk = k.softmax
#         print(outputs[lk])
#         print(gradients[lk])
#         print(significance[lk])
        significance[lk] += torch.sum((outputs[lk]*gradients[lk])**2, dim=0)
significance

{Softmax(dim=-1): tensor([5.6532e-11, 3.3266e-06, 4.2823e-09, 2.2322e-09, 2.9051e-04, 1.1509e-04,
         1.4978e-06, 3.7347e-11, 1.1044e-02, 2.1369e-09, 1.6987e-06, 3.3429e-05,
         7.6518e-04, 3.8205e-13, 7.3085e-13, 2.8390e-06, 2.6013e-05, 9.4760e-05,
         6.0915e-07, 1.0419e-06, 3.1605e-07, 2.8982e-04, 2.3022e-06, 7.3204e-03,
         8.0414e-14, 9.5337e-05, 1.5766e-15, 7.0277e-09, 5.5522e-04, 6.0955e-06,
         8.2217e-09, 1.9732e-07, 6.0610e-06, 1.2988e-05, 4.6072e-05, 2.7052e-06,
         1.3531e-05, 2.4826e-04, 3.3082e-05, 4.2930e-06, 1.1780e-03, 1.3351e-04,
         1.4145e-06, 1.4505e-05, 1.1940e-08, 1.7981e-05, 1.5006e-06, 2.8295e-08,
         3.5961e-03, 5.0773e-11, 1.0946e-10, 1.5564e-07, 1.5216e-09, 1.0105e-06,
         4.7658e-05, 1.6055e-10, 1.0869e-06, 2.4086e-04, 8.5829e-08, 4.8694e-08,
         1.5200e-12, 7.0120e-07, 4.0739e-02, 1.2976e-06, 8.8634e-02, 2.6272e-04,
         2.1589e-07, 2.6546e-07, 1.8784e-07, 2.1555e-08, 4.1859e-10, 2.8555e-08,
         1.

In [59]:
h0

200

In [60]:
sig = significance[layer_keys[0].softmax]

In [61]:
# torch.topk(sig, k=h0, sorted=True, largest=True)[0]

In [62]:
topk_idx = torch.topk(sig, k=h0, sorted=True)[1]
topk_idx

tensor([ 93, 122,  92, 117,  64, 159, 201,  62, 139, 177,   8,  86,  23, 206,
         95,  48, 205, 184, 217, 207, 202, 141,  87,  40, 136,  89, 123,  12,
        134,  28,  75,  91, 166, 124, 180,   4,  21,  65, 108,  37,  57, 173,
        149, 111, 175, 198, 102,  41,   5, 161,  25,  17, 200, 157, 126, 174,
         76, 140, 165,  90,  54,  34, 107, 160, 132,  73, 169,  94, 118,  11,
         38, 215,  16, 193, 163, 150, 229, 114,  45, 218, 170,  43,  88,  80,
         36, 211,  33, 219, 121, 138, 188, 172,  82,  81, 214, 112,  29,  32,
        146, 185, 189, 115,  39, 128,   1,  15,  35, 113,  22, 156, 103, 143,
        191, 135, 158,  10, 226,  46,   6, 120, 178,  42, 101,  63, 125,  78,
         56, 199,  19,  53,  85,  61, 116, 119,  18, 109, 110,  20, 225, 142,
         67, 212, 222,  74,  66,  77,  31, 203,  68, 221,  51, 105, 182, 224,
        176,  72, 216, 148,  58, 130, 228, 104, 227, 223, 100, 186,  59, 196,
         97, 220, 187,  71,  47,  69,  79, 164,  44, 153, 183, 1

In [63]:
def remove_neurons_from_layer1(model, importance, num_prune, old_centers1):
    N = model.layer1.centers.shape[0]
    topk_idx = torch.topk(importance, k=N-num_prune, largest=True)[1]
    
    c = model.layer1.centers.data[topk_idx]
    v = model.layer2.weight.data[:,topk_idx]
    s = model.layer1.bias.data[:,topk_idx]
    model.layer1.centers.data = c
    model.layer2.weight.data = v
    model.layer1.bias.data = s
    
    return old_centers1[topk_idx]

In [64]:
def remove_neurons_from_layer0(model, importance, num_prune, old_centers1):
    N = model.layer0.centers.shape[0]
    topk_idx = torch.topk(importance, k=N-num_prune, largest=True)[1]
    
    c = model.layer0.centers.data[topk_idx]
    s = model.layer0.bias.data[:,topk_idx]
    model.layer0.centers.data = c
    model.layer0.bias.data = s
    
    model.layer1.centers.data = model.layer0(old_centers1)
    pass

In [65]:
# asdasd

In [66]:
c1 = remove_neurons_from_layer1(model, significance[layer_keys[1].softmax], N_search1, c1)

In [67]:
c1.shape

torch.Size([60, 784])

In [68]:
model.layer0.centers.data.shape, model.layer1.centers.data.shape, model.layer2.weight.data.shape

(torch.Size([230, 784]), torch.Size([60, 230]), torch.Size([10, 60]))

In [69]:
test_acc4 = test(0, model)

test_acc4, test_acc3, test_acc2, test_acc

[Test] 0 Loss: 2.127 | Acc: 38.590 3859/10000


(38.59, 37.35, 34.17, 34.1)

In [70]:
remove_neurons_from_layer0(model, significance[layer_keys[0].softmax], N_search0, c1)

In [71]:
model.layer0.centers.data.shape, model.layer1.centers.data.shape, model.layer2.weight.data.shape

(torch.Size([200, 784]), torch.Size([60, 200]), torch.Size([10, 60]))

In [72]:
test_acc5 = test(0, model)
test_acc5, test_acc4, test_acc3, test_acc2, test_acc

[Test] 0 Loss: 2.126 | Acc: 39.740 3974/10000


(39.74, 38.59, 37.35, 34.17, 34.1)

In [73]:
asdasd

NameError: name 'asdasd' is not defined

## Do this in Loop

In [81]:
_c0 = get_random_training_samples(N_search0)[0]
add_neurons_to_layer0(model, _c0, c1)

_c1, _v1 = get_random_training_samples(N_search1)
c1 = add_neurons_to_layer1(model, _c1, _v1, c1)

In [82]:
significance = {k.softmax:torch.zeros(k.centers.shape[0]) for k in layer_keys}

forw_hooks = [k.softmax.register_forward_hook(capture_outputs) for k in layer_keys]
back_hooks = [k.softmax.register_backward_hook(capture_gradients) for k in layer_keys]

In [83]:
# optim = torch.optim.Adam(model.parameters())

In [84]:
for xx, yy in train_loader:
    xx = xx.to(device).view(-1, 28*28)
    yout = model(xx)
    
    none_grad()
#     yout.register_hook(lambda grad: grad/torch.norm(grad, dim=1, keepdim=True))
    
#     grad = torch.randn_like(yout)
#     ### grad = grad/torch.norm(grad, dim=1, keepdim=True)
#     yout.backward(gradient=grad)
    
    criterion(yout, yy).backward()
    
    with torch.no_grad():
        for k in layer_keys:
            lk = k.softmax
            significance[lk] += torch.sum((outputs[lk]*gradients[lk])**2, dim=0)
    pass

In [85]:
remove_hook()

In [86]:
c1 = remove_neurons_from_layer1(model, significance[layer_keys[1].softmax], N_search1, c1) ## anything can be done first
remove_neurons_from_layer0(model, significance[layer_keys[0].softmax], N_search0, c1)

In [87]:
test(0, model)

[Test] 0 Loss: 2.092 | Acc: 51.450 5145/10000


51.45

In [None]:
asdasd  ### ^^ expected test_acc2 > test_acc3 > test_acc

## Optimize for multiple steps

In [97]:
h0 = 100
h1 = 100
N_search0 = 30
N_search1 = 30
model = LocalMLP_epsilonsoftmax(784, h0, h1, 10, epsilon=None).to(device)

### Initialization
c0, _ = get_random_training_samples(h0)
model.layer0.centers.data = c0.to(model.layer0.centers.device)

c1, v1 = get_random_training_samples(h1)
model.layer1.centers.data = model.layer0(c1.to(device))
model.layer2.weight.data = v1.t().to(device)

test(0, model)

[Test] 0 Loss: 1.936 | Acc: 54.470 5447/10000


54.47

In [98]:
layer_keys = [model.layer0, model.layer1]
outputs = {k.softmax:None for k in layer_keys}
gradients = {k.softmax:None for k in layer_keys}

In [99]:
## Run multiple times for convergence
STEPS = 10
for s in range(STEPS):
    print(f"Adding and Pruning for STEP: {s}")
    _c0 = get_random_training_samples(N_search0)[0]
    add_neurons_to_layer0(model, _c0, c1)

    _c1, _v1 = get_random_training_samples(N_search1)
    c1 = add_neurons_to_layer1(model, _c1, _v1, c1)
    #############################
    significance = {k.softmax:torch.zeros(k.centers.shape[0]) for k in layer_keys}

    forw_hooks = [k.softmax.register_forward_hook(capture_outputs) for k in layer_keys]
    back_hooks = [k.softmax.register_backward_hook(capture_gradients) for k in layer_keys]
    #############################
    
    for xx, yy in train_loader:
        xx = xx.to(device).view(-1, 28*28)
        yout = model(xx)

        none_grad()
#         yout.register_hook(lambda grad: grad/torch.norm(grad, dim=1, keepdim=True))
        ####################################
#         grad = torch.randn_like(yout)
#         ### grad = grad/torch.norm(grad, dim=1, keepdim=True)
#         yout.backward(gradient=grad)
        ###################################
        criterion(yout, yy).backward()
        with torch.no_grad():
            for k in layer_keys:
                lk = k.softmax
                significance[lk] += torch.sum((outputs[lk]*gradients[lk])**2, dim=0)
    
    remove_hook()
    c1 = remove_neurons_from_layer1(model, significance[layer_keys[1].softmax], N_search1, c1)
    remove_neurons_from_layer0(model, significance[layer_keys[0].softmax], N_search0, c1)
    test_acc = test(0, model)

Adding and Pruning for STEP: 0
[Test] 0 Loss: 1.836 | Acc: 65.830 6583/10000
Adding and Pruning for STEP: 1
[Test] 0 Loss: 1.796 | Acc: 69.800 6980/10000
Adding and Pruning for STEP: 2
[Test] 0 Loss: 1.770 | Acc: 72.940 7294/10000
Adding and Pruning for STEP: 3
[Test] 0 Loss: 1.789 | Acc: 70.890 7089/10000
Adding and Pruning for STEP: 4
[Test] 0 Loss: 1.776 | Acc: 72.750 7275/10000
Adding and Pruning for STEP: 5
[Test] 0 Loss: 1.761 | Acc: 72.630 7263/10000
Adding and Pruning for STEP: 6
[Test] 0 Loss: 1.754 | Acc: 74.610 7461/10000
Adding and Pruning for STEP: 7
[Test] 0 Loss: 1.742 | Acc: 75.040 7504/10000
Adding and Pruning for STEP: 8
[Test] 0 Loss: 1.746 | Acc: 74.750 7475/10000
Adding and Pruning for STEP: 9
[Test] 0 Loss: 1.749 | Acc: 74.690 7469/10000
