In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import sys, os, time
from pathlib import Path
from tqdm import tqdm

import mylibrary.datasets as datasets
import mylibrary.nnlib as tnn

In [2]:
### Device used for the model and for every mini-batch (see the `.to(device)` calls below).
### Swap the two lines to run on the first CUDA device instead of the CPU.
# device = torch.device("cuda:0")
device = torch.device("cpu")

In [3]:
### Load the dataset through the project helper.
### NOTE(review): assumes `load()` returns flat 784-d image arrays and integer class labels -- confirm against mylibrary.datasets.
mnist = datasets.FashionMNIST()
train_data, train_label_, test_data, test_label_ = mnist.load()

### Scale inputs by 1/255 (presumably raw pixel intensities in [0, 255] -- verify).
### This cell rebinds the same names, so re-running it without re-loading divides by 255 again.
train_data = train_data / 255.
test_data = test_data / 255.

In [4]:
### Problem size
input_size = 784
output_size = 10

### Optimisation hyper-parameters (used by the optimizer cell and the training loop)
learning_rate = 0.0001
batch_size = 50

### Number of training samples; the training loop iterates train_size // batch_size batches per epoch.
### `train_label` is built in the next cell as a LongTensor of class indices, which is what
### nn.CrossEntropyLoss consumes, so no logit/one-hot conversion of the labels is done here.
train_size = len(train_label_)

In [5]:
## converting data to pytorch format
## torch.Tensor(...) yields tensors of the default floating dtype for the inputs;
## the labels are kept as integer class indices (LongTensor) for nn.CrossEntropyLoss.
train_data = torch.Tensor(train_data)
test_data = torch.Tensor(test_data)
## test_label_ is intentionally left as-is: the evaluation loop compares it against numpy predictions.
train_label = torch.LongTensor(train_label_)

### Model Development

In [6]:
### Binary-like encoding of the position, made continuous with sin^2 and cos^2
def generate_dimension_encoding(dim, binarize=True):
    """Continuous, binary-like encoding for ``dim`` positions.

    Each position ``p`` in ``0 .. dim-1`` is multiplied by the frequencies
    ``1, 1/2, 1/4, ...`` (``ceil(log2(dim))`` of them). The encoding is the
    squared sine of those phases followed by the squared cosine of the same
    phases, so matching sin/cos columns sum to one.

    Args:
        dim: number of positions (rows of the result).
        binarize: when true the phases are multiplied by ``pi/2`` first, so the
            highest-frequency sin^2 column alternates 0, 1, 0, 1, ... over
            consecutive positions (up to floating-point rounding).

    Returns:
        ``torch.Tensor`` of shape ``(dim, 2 * ceil(log2(dim)))``.
    """
    n_freq = int(np.ceil(np.log2(dim)))

    # Row vector of halving frequencies, column vector of positions.
    freqs = 0.5 ** np.arange(n_freq)[np.newaxis, :]
    positions = np.arange(dim)[:, np.newaxis]

    phase = positions * freqs
    if binarize:
        phase = phase * np.pi / 2

    halves = [np.sin(phase) ** 2, np.cos(phase) ** 2]
    return torch.Tensor(np.concatenate(halves, axis=1))

### Exact binary encoding of the position (the bits followed by their complements)
def generate_dimension_encoding2(dim):
    """Exact binary encoding for ``dim`` positions.

    Position ``p`` is written with ``ceil(log2(dim))`` bits, most significant
    bit first; the bit pattern is followed by its complement (``1 - bit``).

    Args:
        dim: number of positions (rows of the result).

    Returns:
        ``torch.Tensor`` of shape ``(dim, 2 * ceil(log2(dim)))`` holding 0/1 values.
    """
    n_bits = int(np.ceil(np.log2(dim)))

    # Shift amounts from the most significant bit down to bit 0.
    shifts = np.arange(n_bits - 1, -1, -1).reshape(1, -1)
    positions = np.arange(dim).reshape(-1, 1)

    # Broadcasting (dim, 1) against (1, n_bits) extracts every bit of every position at once.
    bits = ((positions >> shifts) & 1).astype(float)

    return torch.Tensor(np.concatenate((bits, 1 - bits), axis=1))


### Sine and cosine of the position (meant to be added to the input)
def generate_dimension_encoding3(dim, binarize=False):
    """Sine/cosine encoding for ``dim`` positions with halving frequencies.

    Position ``p`` is multiplied by ``1, 1/2, 1/4, ...`` (``ceil(log2(dim))``
    frequencies); the result holds the sines of those phases followed by the
    cosines of the same phases.

    Args:
        dim: number of positions (rows of the result).
        binarize: when true the phases are multiplied by ``pi/2`` before the
            sine and cosine are taken.

    Returns:
        ``torch.Tensor`` of shape ``(dim, 2 * ceil(log2(dim)))``.
    """
    n_freq = int(np.ceil(np.log2(dim)))
    freqs = 0.5 ** np.arange(n_freq).reshape(1, -1)
    phase = np.arange(dim).reshape(-1, 1) * freqs
    if binarize:
        phase = phase * np.pi / 2
    return torch.Tensor(np.concatenate([np.sin(phase), np.cos(phase)], axis=1))

#### Sinusoidal encoding in the style of the Transformer positional encoding (meant to be added to the input)
def generate_dimension_encoding4(dim, binarize=False):
    """Sine/cosine encoding for ``dim`` positions with geometrically decaying frequencies.

    Frequency ``i`` (for ``i`` in ``0 .. ceil(log2(dim)) - 1``) is
    ``(1/10000) ** (2 * i / dim)``. The result holds the sines of
    ``position * frequency`` followed by the cosines of the same phases.

    NOTE(review): the exponent is divided by ``dim`` (the number of positions),
    whereas the Transformer formula divides by the encoding width -- confirm
    that this is the intended scaling.

    Args:
        dim: number of positions (rows of the result).
        binarize: when true the phases are multiplied by ``pi/2`` before the
            sine and cosine are taken.

    Returns:
        ``torch.Tensor`` of shape ``(dim, 2 * ceil(log2(dim)))``.
    """
    n_freq = int(np.ceil(np.log2(dim)))

    exponents = np.arange(n_freq)[None, :] * 2 / dim
    freqs = (1/10000) ** exponents

    positions = np.arange(dim)[:, None]
    angles = positions * freqs
    if binarize:
        angles = angles * np.pi / 2

    table = np.concatenate((np.sin(angles), np.cos(angles)), axis=-1)
    return torch.Tensor(table)

In [7]:
def construct_sequential_mlp(layer_dims, activation):
    """Build an MLP as ``nn.Sequential``: Linear -> activation -> ... -> Linear.

    Args:
        layer_dims: sequence of layer widths; consecutive entries give the
            in/out features of each ``nn.Linear``.
        activation: module placed between consecutive linear layers. The same
            instance is reused at every position and none follows the last layer.

    Returns:
        ``nn.Sequential`` with ``len(layer_dims) - 1`` linear layers (empty when
        fewer than two widths are given).
    """
    modules = []
    for fan_in, fan_out in zip(layer_dims[:-1], layer_dims[1:]):
        modules.append(nn.Linear(fan_in, fan_out))
        modules.append(activation)
    # Drop the activation that trails the final linear layer.
    return nn.Sequential(*modules[:-1])

# class DimMix_Layer_add(nn.Module):
    
#     def __init__(self,dim, pair, out_pair=None, hidden_ratio = [2], activation=nn.ReLU()):
#         super().__init__()
#         self.dim = dim
#         self.pair = pair
#         self.out_pair = out_pair
#         if out_pair is None:
#             self.out_pair = pair
#         if dim%pair != 0 :
#             raise ValueError(f"Dim: {dim} should be exactly divisible by Pair: {pair}")

#         self.pos_mat = generate_dimension_encoding4(dim//pair)
#         self.pos_mat = nn.Parameter(self.pos_mat)
        
#         self.linear = nn.Linear(pair, self.pos_mat.shape[1])
        
#         inp_dim = self.pos_mat.shape[1]
#         la_dims = [inp_dim] + [int(hr*inp_dim) for hr in hidden_ratio] + [self.out_pair]
#         self.net = construct_sequential_mlp(la_dims, activation)
        
#     def forward(self, x):
#         xs = x.shape
#         _x = self.linear(x.reshape(-1, self.pair))
#         _x = _x.reshape(-1, *self.pos_mat.shape)+self.pos_mat.unsqueeze(dim=0)
#         _x = self.net(_x.reshape(-1, self.pos_mat.shape[1]))
#         _x = _x.reshape(xs[0], -1)
#         return _x

In [8]:
### Scratch walk-through on a toy problem: 16 features split into groups of `pair`.
dim = 16
pair = 2

### Toy batch of 2 samples; also reused by the radial-kernel walk-through cells below.
X = torch.randn(2,dim)
if dim%pair != 0 :
    raise ValueError(f"Dim: {dim} should be exactly divisible by Pair: {pair}")
### One encoding row per group (dim//pair rows); displayed as the cell output.
pos_mat = generate_dimension_encoding2(dim//pair)
pos_mat

tensor([[0., 0., 0., 1., 1., 1.],
        [0., 0., 1., 1., 1., 0.],
        [0., 1., 0., 1., 0., 1.],
        [0., 1., 1., 1., 0., 0.],
        [1., 0., 0., 0., 1., 1.],
        [1., 0., 1., 0., 1., 0.],
        [1., 1., 0., 0., 0., 1.],
        [1., 1., 1., 0., 0., 0.]])

In [9]:
# DimMix_Layer_add(dim, pair)(X).shape

In [10]:
### Number of radial kernels for the walk-through.
kernel_dim = 4

### Kernel centres drawn uniformly from [-0.5, 1.0), one row per kernel.
centers = torch.rand(kernel_dim, dim)*1.5-0.5
### Per-kernel scale drawn uniformly from [0.25, 1.25); shape (1, kernel_dim) so it broadcasts over the batch.
scale = (torch.rand(1, kernel_dim)+0.25)*1

In [11]:
centers.shape

torch.Size([4, 16])

In [12]:
X.shape

torch.Size([2, 16])

In [13]:
### Pairwise differences between samples and centres via broadcasting:
### (batch, 1, dim) - (1, kernel_dim, dim) -> (batch, kernel_dim, dim).
K = (X.unsqueeze(1)-centers.unsqueeze(0))
K.shape

torch.Size([2, 4, 16])

In [14]:
K

tensor([[[ 0.3907, -0.6059,  1.6753,  0.5986, -0.4268, -1.2034,  0.2296,
          -1.9732,  0.9881,  0.7617,  1.6232,  1.0629,  1.2625, -0.6373,
          -1.7661, -0.4456],
         [ 0.8025, -0.2108,  1.1790,  0.7272,  0.5218, -0.8926, -0.3339,
          -1.9174,  0.1320,  0.6218,  0.4348, -0.0187,  1.2707,  0.1887,
          -1.6097,  0.1498],
         [ 0.6720, -0.4486,  0.4953,  1.0849,  0.3548, -2.2187, -0.7718,
          -1.4467,  0.8851,  1.1420,  0.4641,  0.2796,  1.2614, -0.5348,
          -1.2307, -1.1026],
         [ 0.4467, -0.4439,  0.4500,  1.4391,  0.1766, -0.7839,  0.2730,
          -1.6284,  0.7510,  0.0916,  0.9207,  0.3206,  0.2467, -0.4942,
          -0.4930, -0.8069]],

        [[ 0.2996, -0.4902, -0.3616,  0.4359, -0.9002,  0.9319,  0.5062,
           0.5698,  1.7613,  0.3194,  0.4969,  0.0485,  0.9640, -1.5235,
          -1.1817, -0.8883],
         [ 0.7114, -0.0951, -0.8580,  0.5646,  0.0483,  1.2427, -0.0573,
           0.6256,  0.9052,  0.1795, -0.6916, -1.0

In [15]:
### Sanity check: the squared L2 norm along the feature axis equals the sum of squares.
torch.norm(K, dim=2)**2, torch.sum(K**2, dim=2)

(tensor([[19.9002, 12.3214, 16.8008,  8.7644],
         [11.8000,  8.4770, 15.1133, 13.8831]]),
 tensor([[19.9002, 12.3214, 16.8008,  8.7644],
         [11.8000,  8.4770, 15.1133, 13.8831]]))

In [16]:
### Gaussian radial kernel response: exp(-(scale^2) * squared distance to each centre) -> (batch, kernel_dim).
k = torch.exp(-(scale**2)*torch.sum(K**2, dim=2))
k.shape

torch.Size([2, 4])

In [17]:
k

tensor([[6.9765e-07, 2.3265e-01, 1.3477e-01, 3.8366e-01],
        [2.2363e-04, 3.6669e-01, 1.6483e-01, 2.1925e-01]])

In [18]:
### Append the kernel responses to the raw features: (batch, dim + kernel_dim).
x = torch.cat([X, k], dim=1)
x.shape

torch.Size([2, 20])

In [19]:
### Toy tensors for the batched per-group matrix product:
### W holds one 2x2 matrix for each of 3 groups; X is (batch=4, groups=3, features=2).
### Note: this rebinds `X`, which was the (2, 16) toy batch in the cells above.
W = torch.randn(3, 2,2)
X = torch.randn(4, 3,2)

In [20]:
### Move the group axis first so bmm applies W[g] to every sample of group g, then move it back:
### (3, 4, 2) @ (3, 2, 2) -> (3, 4, 2) -> (4, 3, 2).
torch.bmm(X.transpose(0,1), W).transpose(0,1)

tensor([[[ 1.0461, -2.9304],
         [ 0.4146,  2.1248],
         [ 0.0241, -0.0172]],

        [[-0.3109,  0.8908],
         [-0.7844, -1.5582],
         [ 0.3740, -0.7978]],

        [[ 1.6886, -2.9869],
         [-1.2455, -2.0309],
         [-0.1662, -0.6536]],

        [[-0.3452,  0.2842],
         [-1.2715, -1.6654],
         [-0.0447,  0.1354]]])

In [21]:
class RadialKernelize(nn.Module):
    """Append Gaussian radial-kernel responses to the input features.

    For an input of shape ``(N, dim)`` the output has shape
    ``(N, dim + kernel_dim)``: the untouched input followed by
    ``exp(-(scale_j ** 2) * ||x - center_j||^2)`` for every kernel ``j``.
    Centres and scales are learnable parameters. With ``kernel_dim == 0`` the
    module has no parameters and returns its input unchanged.
    """

    def __init__(self, dim, kernel_dim):
        super().__init__()
        assert dim >= 0 and kernel_dim >= 0
        self.kernel_dim = kernel_dim
        self.dim = dim
        if kernel_dim == 0:
            return

        # Centres uniform in [-0.5, 1.0); scales uniform in [0.25, 1.25).
        init_centers = torch.rand(kernel_dim, dim) * 1.5 - 0.5
        init_scale = (torch.rand(1, kernel_dim) + 0.25) * 1
        self.centers = nn.Parameter(init_centers)
        self.scale = nn.Parameter(init_scale)

    def forward(self, x):
        if self.kernel_dim == 0:
            return x

        # (N, 1, dim) - (1, kernel_dim, dim) -> (N, kernel_dim, dim)
        offsets = x.unsqueeze(1) - self.centers.unsqueeze(0)
        sq_dist = torch.sum(offsets ** 2, dim=2)
        responses = torch.exp(-(self.scale ** 2) * sq_dist)

        return torch.cat([x, responses], dim=1)

In [22]:
class Kernalize_Rotate_BN_DimMix(nn.Module):
    """Mix features group-wise with one small network shared by all groups.

    The ``dim`` input features are split into ``dim // pair`` groups of ``pair``
    consecutive features. Every group is processed as follows:

    1. ``RadialKernelize`` widens the group to ``inp_dim`` features, where
       ``inp_dim`` is the width of the position encoding returned by
       ``generate_dimension_encoding4`` (zero-padded up to ``pair`` when the
       encoding is narrower, in which case no kernel features are added).
    2. A learnable per-group ``inp_dim x inp_dim`` matrix (``rotate``) is applied.
       Its rows are initialised to unit norm; it is not constrained to stay
       orthogonal.
    3. ``BatchNorm1d`` (no affine) normalises all ``num_groups * inp_dim`` values.
    4. The learnable position encoding ``pos_mat`` of the group is added.
    5. The shared MLP ``net`` maps ``inp_dim -> out_pair``.

    Args:
        dim: number of input features per sample; must be divisible by ``pair``.
        pair: number of input features per group.
        out_pair: number of output features per group (defaults to ``pair``).
        hidden_ratio: iterable of multipliers; hidden layer ``i`` of the shared
            MLP has ``int(hidden_ratio[i] * inp_dim)`` units.
        activation: activation module used between the MLP's linear layers.

    Raises:
        ValueError: if ``dim`` is not divisible by ``pair``.
    """

    def __init__(self, dim, pair, out_pair=None, hidden_ratio=(2,), activation=nn.ReLU()):
        super().__init__()
        self.dim = dim
        self.pair = pair
        self.out_pair = out_pair
        if out_pair is None:
            self.out_pair = pair
        if dim%pair != 0 :
            raise ValueError(f"Dim: {dim} should be exactly divisible by Pair: {pair}")

        ### there will be num_pos=dim//pair different inputs
        ### inp_dim will be the dimension of inputs
        pos_mat = generate_dimension_encoding4(dim//pair)
        num_pos, inp_dim = pos_mat.shape
        if inp_dim < pair:
            ### encoding narrower than a group: zero-pad it so it can be added to the raw group
            pos_mat = torch.cat([pos_mat, torch.zeros(num_pos, pair-inp_dim)], dim=1)
            inp_dim = pos_mat.shape[1]

        ### registered as a Parameter, so the encoding is trained with the rest of the layer
        self.pos_mat = nn.Parameter(pos_mat)

        ### adds inp_dim-pair kernel features to every group (none when inp_dim == pair)
        self.kernalizer = RadialKernelize(pair, inp_dim-pair)

        ### one square matrix per group, rows normalised to unit length at initialisation
        rotate = torch.randn(num_pos, inp_dim, inp_dim)
        rotate /= torch.norm(rotate, dim=2, keepdim=True)
        self.rotate = nn.Parameter(rotate)

        self.bn = nn.BatchNorm1d(num_pos*inp_dim, affine=False)

        la_dims = [inp_dim] + [int(hr*inp_dim) for hr in hidden_ratio] + [self.out_pair]
        self.net = construct_sequential_mlp(la_dims, activation)

    def forward(self, x):
        """Transform ``x`` of shape ``(batch, dim)`` into ``(batch, (dim // pair) * out_pair)``.

        Raises:
            ValueError: if ``x`` does not hold exactly ``dim`` features per sample.
                Without this check the reshapes below would silently regroup the
                features (and fold the surplus into the batch axis).
        """
        xs = x.shape
        if x.numel() != xs[0] * self.dim:
            raise ValueError(
                f"Expected {self.dim} features per sample, got input of shape {tuple(xs)}"
            )
        num_pos, inp_dim = self.pos_mat.shape

        ### kernalizer takes batch of pair inputs and adds kernel to make shape=b*num_enc, input_dim
        _x = self.kernalizer(x.reshape(-1, self.pair))

        ### rotator rotates the data per dimension output shape=(b, num_enc, input_dim)
        ### (group axis is moved first so bmm pairs group g with rotate[g], then moved back)
        _x = _x.reshape(-1, num_pos, inp_dim)
        _x = torch.bmm(_x.transpose(0,1), self.rotate).transpose(0,1)

        ### Normalize the batch without encoding, ...can also normalize after encoding...
        _x = self.bn(_x.reshape(-1, num_pos*inp_dim))

        ### similar to shift transform
        _x = _x.reshape(-1, num_pos, inp_dim)+self.pos_mat.unsqueeze(dim=0)
        ### pass encoded input through the shared network
        _x = self.net(_x.reshape(-1, inp_dim))
        _x = _x.reshape(xs[0], -1)
        return _x

In [23]:
### Smoke test: build the layer for 784 features in groups of 4 and show its repr.
### `X` is defined here but not passed through the layer in this cell.
X = torch.randn(2,784)
Kernalize_Rotate_BN_DimMix(784, 4)

Kernalize_Rotate_BN_DimMix(
  (kernalizer): RadialKernelize()
  (bn): BatchNorm1d(3136, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (net): Sequential(
    (0): Linear(in_features=16, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=4, bias=True)
  )
)

In [24]:
196*4

784

In [25]:
# class DimMix_Layer_cat(nn.Module):
    
#     def __init__(self,dim, pair, out_pair=None, hidden_ratio = [2], activation=nn.ReLU()):
#         super().__init__()
#         self.dim = dim
#         self.pair = pair
#         self.out_pair = out_pair
#         if out_pair is None:
#             self.out_pair = pair
#         if dim%pair != 0 :
#             raise ValueError(f"Dim: {dim} should be exactly divisible by Pair: {pair}")

#         self.pos_mat = generate_dimension_encoding4(dim//pair)
#         self.pos_mat = nn.Parameter(self.pos_mat)
        
#         inp_dim = self.pos_mat.shape[1]+pair
#         la_dims = [inp_dim] + [int(hr*inp_dim) for hr in hidden_ratio] + [self.out_pair]
#         self.net = construct_sequential_mlp(la_dims, activation)
        
#     def forward(self, x):
#         xs = x.shape
#         _x = x.reshape(-1, self.dim//self.pair, self.pair)
#         _pm = self.pos_mat.expand(xs[0], *self.pos_mat.shape)
#         _x = torch.cat([_x, _pm], dim=2)
#         _x = _x.reshape(-1, self.pos_mat.shape[1]+self.pair)
#         _x = self.net(_x).reshape(xs[0], -1)
#         return _x

In [26]:
# DimMix_Layer_cat(dim, pair)(X).shape

In [27]:
class MLP_0(nn.Module):
    """Baseline classifier: Linear(784 -> 50), LeakyReLU, Linear(50 -> 10)."""

    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(784, 50)
        self.l2 = nn.LeakyReLU()
        self.l3 = nn.Linear(50, 10)

    def forward(self, x):
        """Map a ``(batch, 784)`` input to ``(batch, 10)`` class scores."""
        hidden = self.l2(self.l1(x))
        return self.l3(hidden)
    
class MLP_1(nn.Module):
    """Classifier whose hidden non-linearity is a ``Kernalize_Rotate_BN_DimMix`` layer.

    Linear(784 -> 300), then the dimension-mixing layer over 300 features in
    groups of 5 (output stays 300 wide), then Linear(300 -> 10).
    """

    def __init__(self):
        super().__init__()
        # Creation order (l1, l3, l2) is kept: it fixes both the random
        # initialisation sequence and the order of model.parameters().
        self.l1 = nn.Linear(784, 300)
        self.l3 = nn.Linear(300, 10)
        self.l2 = Kernalize_Rotate_BN_DimMix(
            300, 5, hidden_ratio=[5,5], activation=nn.ReLU()
        )

    def forward(self, x):
        """Map a ``(batch, 784)`` input to ``(batch, 10)`` class scores."""
        mixed = self.l2(self.l1(x))
        return self.l3(mixed)
    
    
# class MLP_2(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.randindx = torch.randperm(784)
#         hr = [5*2,5*2]
#         self.l1 = DimMix_Layer_add(784, 8, 4, hidden_ratio=hr, activation=nn.ReLU()) #784/2*1 = 392
#         self.l2 = DimMix_Layer_add(392, 8, 2, hidden_ratio=hr, activation=nn.ReLU()) #392/4*1 = 98
#         self.l3 = DimMix_Layer_add(98, 7, 3, hidden_ratio=hr, activation=nn.ReLU()) #98/7*3 = 42
#         self.l4 = nn.Linear(42, 10)
        
#     def forward(self, x):
#         x = x[:, self.randindx]
#         x = self.l1(x)
#         x = self.l2(x)
#         x = self.l3(x)
#         x = self.l4(x)
#         return x
    
# class MLP_3(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.randindx = torch.randperm(784)
#         hr = [5*2,5*2]
#         self.l1 = DimMix_Layer_add(784, 8, 8, hidden_ratio=hr, activation=nn.ReLU()) #784/2*1 = 392
#         self.l2 = DimMix_Layer_add(784, 8, 8, hidden_ratio=hr, activation=nn.ReLU()) #392/4*1 = 98
#         self.l3 = DimMix_Layer_add(784, 8, 8, hidden_ratio=hr, activation=nn.ReLU()) #98/7*3 = 42
#         self.l4 = nn.Linear(784, 10)
        
#     def forward(self, x):
#         x = x[:, self.randindx]
#         x = self.l1(x)
#         x = self.l2(x)
#         x = self.l3(x)
#         x = self.l4(x)
#         return x

In [28]:
X = torch.randn(2,784)

In [29]:
MLP_1()(X).shape

torch.Size([2, 10])

### Model Training

In [30]:
### Fix the torch RNG so the model initialisation is reproducible, then build the model on `device`.
torch.manual_seed(0)
model = MLP_1().to(device)
model

MLP_1(
  (l1): Linear(in_features=784, out_features=300, bias=True)
  (l3): Linear(in_features=300, out_features=10, bias=True)
  (l2): Kernalize_Rotate_BN_DimMix(
    (kernalizer): RadialKernelize()
    (bn): BatchNorm1d(720, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
    (net): Sequential(
      (0): Linear(in_features=12, out_features=60, bias=True)
      (1): ReLU()
      (2): Linear(in_features=60, out_features=60, bias=True)
      (3): ReLU()
      (4): Linear(in_features=60, out_features=5, bias=True)
    )
  )
)

In [31]:
### List the shape of every trainable tensor, in the order model.parameters() yields them.
for p in model.parameters():
    print(p.shape)

torch.Size([300, 784])
torch.Size([300])
torch.Size([10, 300])
torch.Size([10])
torch.Size([60, 12])
torch.Size([60, 12, 12])
torch.Size([7, 5])
torch.Size([1, 7])
torch.Size([60, 12])
torch.Size([60])
torch.Size([60, 60])
torch.Size([60])
torch.Size([5, 60])
torch.Size([5])


In [32]:
### Cross-entropy over raw class scores with integer class targets; Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [33]:
## Parameter counts noted from other model variants (kept for comparison):
## MLP2: number of params:  38390 ; add
## MLP0: number of params:  39760
## MLP2: number of params:  27363 ; add

## Total number of scalar parameters of the current model.
print("number of params: ", sum(p.numel() for p in model.parameters()))

number of params:  252657


In [34]:
### Mini-batch training with a periodic evaluation on the full test set.
### Batches are taken in storage order (no shuffling) and a trailing partial batch is dropped.
EPOCHS = 15*3
steps_ = 0
for epoch in range(EPOCHS):
    ### Enter training mode explicitly. The model can be left in eval mode by the
    ### visualisation cells or by an interrupted evaluation pass of a previous run
    ### of this cell; BatchNorm would then silently train on frozen running statistics.
    model.train()
    train_acc = 0
    train_count = 0
    for index in tqdm(range(train_size // batch_size)):
        steps_ += 1

        train_x = train_data[index * batch_size:(index + 1) * batch_size].to(device)
        train_y = train_label[index * batch_size:(index + 1) * batch_size].to(device)

        yout = model(train_x)
        loss = criterion(yout, train_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### running training accuracy since the last report
        ### (logit_to_index presumably takes the arg-max class per row -- see mylibrary.nnlib)
        outputs = tnn.Logits.logit_to_index(yout.detach().cpu().numpy())
        train_acc += (outputs == train_y.cpu().numpy()).sum()
        train_count += len(outputs)

        if steps_%500==0:
            train_accuracy = train_acc/train_count
            train_acc, train_count = 0, 0

            print(f'\nEpoch: {epoch}, batch: {index}, step: {steps_}, loss: {float(loss)}')
            print(f'\tTrain acc: {train_accuracy*100}%')

            ### evaluate on the test set in eval mode; `finally` guarantees that training
            ### mode is restored even if the pass is interrupted or raises
            test_acc, test_count = 0, 0
            model.eval()
            try:
                with torch.no_grad():
                    for _ti in range(len(test_data) // batch_size):
                        test_x = test_data[_ti * batch_size:(_ti + 1) * batch_size].to(device)
                        test_y_ = test_label_[_ti * batch_size:(_ti + 1) * batch_size]
                        yout = model(test_x)
                        outputs = tnn.Logits.logit_to_index(yout.cpu().numpy())
                        correct = (outputs == test_y_).sum()
                        test_acc += correct
                        test_count += len(test_x)
            finally:
                model.train()

            print(f'\tTest acc: {test_acc/test_count*100}%, correct: {test_acc}/{test_count}')

 42%|████▏     | 499/1200 [10:15<16:48,  1.44s/it]


Epoch: 0, batch: 499, step: 500, loss: 0.627275288105011
	Train acc: 75.444%


 42%|████▏     | 500/1200 [11:27<4:26:57, 22.88s/it]

	Test acc: 82.46%, correct: 8246/10000


 83%|████████▎ | 999/1200 [21:36<04:43,  1.41s/it]  


Epoch: 0, batch: 999, step: 1000, loss: 0.7603819370269775
	Train acc: 84.428%


 83%|████████▎ | 1000/1200 [22:51<1:18:05, 23.43s/it]

	Test acc: 84.47%, correct: 8447/10000


100%|██████████| 1200/1200 [26:53<00:00,  1.34s/it]  
 25%|██▍       | 299/1200 [06:02<18:25,  1.23s/it]


Epoch: 1, batch: 299, step: 1500, loss: 0.4564221501350403
	Train acc: 86.02%


 25%|██▌       | 300/1200 [07:18<5:55:34, 23.70s/it]

	Test acc: 84.86%, correct: 8486/10000


 37%|███▋      | 446/1200 [10:25<17:36,  1.40s/it]  


KeyboardInterrupt: 

In [None]:
len(test_data)//batch_size

In [None]:
## MLP2
# Train acc: 87.96000000000001%
# 	Test acc: 86.27%, correct: 8627/10000
## MLP0
# Train acc: 89.05999999999999%
# 	Test acc: 87.3%, correct: 8730/10000

## Visualize first activations (paired embeddings)

In [None]:
### inputs from -1, 1 for every dimension in a batch
X = torch.linspace(-1, 1, steps=20).expand(300, -1).transpose(0,1)
X.shape

In [None]:
### Switch to eval mode (BatchNorm then uses its running statistics) and push the probe inputs
### through the dimension-mixing layer only; the result is moved to the CPU for plotting.
### Note: the model stays in eval mode after this cell.
model.eval()
ys = model.l2(X.to(device)).data.cpu()
ys.shape

In [None]:
ys[0]

In [None]:
%matplotlib inline
plt.plot(model.l2.pos_mat.cpu().data[:10])

In [None]:
### Plot the layer's response curve (output column i against input column i) for each of the
### 300 dimensions, drawing `num_show` curves per figure.
num_show = 10
for i in range(300):
    plt.plot(X[:,i], ys[:,i])#+X[:,i])
    if (i+1)%num_show == 0:
#         plt.axis("equal")
        plt.show()

In [None]:
# model = MLP_1()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# print("number of params: ", sum(p.numel() for p in model.parameters()))

In [None]:
# EPOCHS = 15
# steps_ = 0
# for epoch in range(EPOCHS):
#     train_acc = 0
#     train_count = 0
#     for index in range(train_size // batch_size):
#         steps_ += 1

#         train_x = train_data[index * batch_size:(index + 1) * batch_size]
#         train_y = train_label[index * batch_size:(index + 1) * batch_size]

#         yout = model(train_x)
#         loss = criterion(yout, train_y)
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         if steps_%500==0:
#             print('\nTRAIN',epoch, steps_, '-> ', float(loss))
# #             yout = model.forward(train_data)
# #             outputs = tnn.Logits.logit_to_index(yout)
# #             correct = (outputs == np.array(train_label_)).sum()

# #             accuracy = correct / len(train_label_) * 100.
# #             print('EPOCH = ','accuracy = ', accuracy)
# #             print(correct, '/', len(train_label_))
            
#             with torch.no_grad():
#                 yout = model(test_data)
#             outputs = tnn.Logits.logit_to_index(yout.cpu().numpy())
#             correct = (outputs == np.array(test_label_)).sum()

#             accuracy = correct / len(test_label_) * 100.
#             print('   TEST  ','accuracy = ', accuracy)
#             print(correct, '/', len(test_label_))       

In [None]:
# model.l2.pos_mat

In [None]:
np.log2(128)