In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D

import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

from tqdm import tqdm
from sklearn import datasets
import random

In [3]:
device = torch.device("cuda:0")
# device = torch.device("cuda:1")
# device = torch.device("cpu")

In [4]:
import bmm2x2_cuda

## Cuda -bmm2x2

In [5]:
class BMM2x2Function(torch.autograd.Function):
    @staticmethod
#     @torch.jit.ignore
    def forward(ctx, inputs, weights):
        outputs = bmm2x2_cuda.forward(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]
    
    @staticmethod
#     @torch.jit.ignore
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
        del_input, del_weights = bmm2x2_cuda.backward(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [6]:
class PairWeight2(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        assert input_dim%2 == 0, "Input dim must be even number"
        self.weight = torch.eye(2).unsqueeze(0).repeat_interleave(input_dim//2, dim=0)
        self.weight = nn.Parameter(self.weight)
        self.bmmfunc = BMM2x2Function()
        
    @torch.jit.ignore
    def bmm(self, x, w):
        return BMM2x2Function.apply(x, w)
        
    def forward(self, x):
        bs, dim = x.shape[0], x.shape[1]
        x = x.view(bs, -1, 2)
        x = self.bmm(x, self.weight)
        x = x.view(bs, -1)
        return x

In [7]:
pw = PairWeight2(784).to(device)
pw(torch.randn(2,784).to(device))

tensor([[-1.0134,  2.1818,  1.9082,  ..., -0.3174,  2.0114,  0.8128],
        [-0.4477, -0.1729,  0.5523,  ...,  0.3847, -1.6661, -1.3617]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [8]:
import bilinear2x2_cuda

## Cuda - Bilinear2x2

In [9]:
class BiLinear2x2Function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weights):
        outputs = bilinear2x2_cuda.forward(inputs, weights)
        ctx.save_for_backward(inputs, weights)
        return outputs[0]

    @staticmethod
    def backward(ctx, grad_output):
        inputs, weights = ctx.saved_tensors
#         del_input, del_weights = bmm2x2_cuda.backward(
#             grad_output.contiguous(), 
#             grad_cell.contiguous(), 
#             grad_output.contiguous())
        del_input, del_weights = bilinear2x2_cuda.backward(
            inputs, 
            weights, 
            grad_output)
    
        return del_input, del_weights

In [10]:
class PairBilinear2(nn.Module):
    def __init__(self, dim, grid_width):
        super().__init__()
        self.dim = dim
        self.grid_width = grid_width
        
        self.num_pairs = self.dim // 2
        along_row = torch.linspace(0, 1, self.grid_width).reshape(1, -1)
        along_col = torch.linspace(0, 1, self.grid_width).reshape(-1, 1)
#         self.Y = torch.stack([along_row+along_col*0, along_row*0+along_col])
        self.Y = torch.stack([along_row*0+along_col, along_row+along_col*0])

        self.Y = torch.repeat_interleave(self.Y.unsqueeze(0), self.num_pairs, dim=0)
        self.Y = nn.Parameter(self.Y)
        
        self.pairW = torch.eye(2).unsqueeze(0).repeat_interleave(self.num_pairs, dim=0)
        self.pairW = nn.Parameter(self.pairW)
    
#     @torch.jit.ignore
#     def pairbl2x2(self, x, w):
#         return BiLinear2x2Function.apply(x, w)
    
#     @torch.jit.ignore
    def forward(self, x):
        bs = x.shape[0]
        
############# This block ########################
        ### this block is significantly faster
    
#         x = x.view(bs, -1, 2).transpose(0,1)
#         x = torch.bmm(x, self.pairW)
#         x = x.transpose(1,0)#.reshape(-1, 2)
        
############# OR This block ########################
        x = x.view(bs, -1, 2)
        x = BMM2x2Function.apply(x, self.pairW)
####################################################
        x = x.view(bs, -1, 2)
        x = BiLinear2x2Function.apply(x, self.Y)
        x = x.view(bs, -1)
        return x

In [11]:
pbl2 = PairBilinear2(8, 3).to(device)

In [12]:
_a = torch.randn(2, 8).to(device)

In [13]:
y = pbl2(_a) 

In [14]:
_a

tensor([[ 0.1486, -0.1944,  1.0231, -0.1754, -0.6524, -0.3859, -0.1675,  0.6723],
        [-0.3980, -0.1785,  0.8948, -0.8194,  0.5285,  0.0020,  1.1192,  1.1377]],
       device='cuda:0')

In [15]:
y

tensor([[ 0.1486, -0.1944,  1.0231, -0.1754, -0.6524, -0.3859, -0.1675,  0.6723],
        [-0.3980, -0.1785,  0.8948, -0.8194,  0.5285,  0.0020,  1.1192,  1.1377]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [16]:
pbl2.Y[0]

tensor([[[0.0000, 0.0000, 0.0000],
         [0.5000, 0.5000, 0.5000],
         [1.0000, 1.0000, 1.0000]],

        [[0.0000, 0.5000, 1.0000],
         [0.0000, 0.5000, 1.0000],
         [0.0000, 0.5000, 1.0000]]], device='cuda:0',
       grad_fn=<SelectBackward0>)

In [17]:
class BiasLayer(nn.Module):
    def __init__(self, dim, init_val=0):
        super().__init__()
        self.bias = nn.Parameter(torch.ones(dim)*init_val)
        
    def forward(self, x):
        return x+self.bias

In [96]:
class FactorizedPairBilinearSpline(nn.Module):
    
    def __init__(self, input_dim, grid_width, num_layers=None):
        super().__init__()
        assert input_dim%2 == 0, "Input dim must be even number"
        self.input_dim = input_dim
        
        if num_layers is None:
            self.num_layers = int(np.ceil(np.log2(input_dim)))
        else:
            self.num_layers = num_layers
            
        self.facto_nets = []
        self.idx_revidx = []
        for i in range(self.num_layers):
            idrid = self.get_pair(self.input_dim, i+1)
            net = PairBilinear2(self.input_dim, grid_width)
            self.facto_nets.append(net)
            self.idx_revidx.append(idrid)
        self.facto_nets = nn.ModuleList(self.facto_nets)
            
#     @torch.jit.ignore
    def get_pair(self, inp_dim, step=1):
        dim = 2**int(np.ceil(np.log2(inp_dim)))
        assert isinstance(step, int), "Step must be integer"

        blocks = (2**step)
        range_ = dim//blocks
        adder_ = torch.arange(0, range_)*blocks

        pairs_ = torch.Tensor([0, blocks//2])
        repeat_ = torch.arange(0, blocks//2).reshape(-1,1)
        block_map = (pairs_+repeat_).reshape(-1)

        reorder_for_pair = (block_map+adder_.reshape(-1,1)).reshape(-1)
        indx = reorder_for_pair.type(torch.long)
        indx = indx[indx<inp_dim]

        rev_indx = torch.argsort(indx)
        return indx, rev_indx
    
    def forward(self, x):
        ## swap first and then forward and reverse-swap
        y = x
#         for i in range(len(self.facto_nets)):
        for i, fn in enumerate(self.facto_nets):
            idx, revidx = self.idx_revidx[i]
            y = y[:, idx]
            y = fn(y) 
            y = y[:, revidx]
#         y = x + y ## this is residual addition... remove if only want feed forward
        return y

In [97]:
pfL = FactorizedPairBilinearSpline(784, 10).to(device)

In [98]:
pfL(torch.randn(100, 784).to(device))

tensor([[-1.1258, -1.1524, -0.2506,  ..., -1.5825, -0.5878, -0.1140],
        [ 0.7014, -0.5556, -0.3817,  ...,  0.3989,  0.2578,  0.1990],
        [-0.1584,  0.7390, -0.2506,  ...,  0.9209, -0.1103, -1.8729],
        ...,
        [ 1.5498, -0.9650, -0.5772,  ...,  1.1618, -0.3113,  0.3023],
        [-0.3681, -0.8609,  1.1708,  ...,  0.2597,  0.0747,  0.3674],
        [ 0.8068, -0.4530, -0.9414,  ..., -1.0015,  0.7598, -0.1231]],
       device='cuda:0', grad_fn=<IndexBackward0>)

In [99]:
pfL

FactorizedPairBilinearSpline(
  (facto_nets): ModuleList(
    (0): PairBilinear2()
    (1): PairBilinear2()
    (2): PairBilinear2()
    (3): PairBilinear2()
    (4): PairBilinear2()
    (5): PairBilinear2()
    (6): PairBilinear2()
    (7): PairBilinear2()
    (8): PairBilinear2()
    (9): PairBilinear2()
  )
)

In [100]:
param_count = sum([torch.numel(p) for p in pfL.parameters()])
param_count

799680

In [101]:
class FactorNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.bias = BiasLayer(784)
        self.la1 = FactorizedPairBilinearSpline(784, grid_width=2)
        self.bn1 = nn.BatchNorm1d(784)
        self.fc = nn.Linear(784, 10)
        
    def forward(self, x):
        x = self.bias(x)
        x = self.la1(x)
#         x = self.bn1(x)
        x = torch.relu(x)
        x = self.fc(x)
        return x

In [102]:
# class FactorNet(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.bias = BiasLayer(784)
#         self.la1 = FactorizedPairBilinearSpline(784, grid_width=5)
#         self.bn1 = nn.BatchNorm1d(784)
#         self.la2 = FactorizedPairBilinearSpline(784, grid_width=5)
#         self.bn2 = nn.BatchNorm1d(784)
#         self.fc = nn.Linear(784, 10)
        
#     def forward(self, x):
#         x = self.bias(x)
#         x = self.bn1(self.la1(x))
#         x = torch.relu(x)
#         x = self.bn2(self.la2(x))
#         x = torch.relu(x)
#         x = self.fc(x)
#         return x

In [55]:
class OrdinaryNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.la1 = nn.Linear(784, 784, bias=False)
        self.bn1 = nn.BatchNorm1d(784)
        self.la2 = nn.Linear(784, 10)
        
    def forward(self, x):
        x = self.bn1(self.la1(x))
        x = torch.relu(x)
        x = self.la2(x)
        return x

### Dataset

In [152]:
from tabben.datasets import OpenTabularDataset

In [153]:
from tabben.datasets import list_datasets
print(list_datasets())

['arcene', 'covertype', 'higgs', 'poker', 'adult', 'parkinsons', 'musk', 'rossman', 'amazon', 'duolingo-original', 'duolingo-categorical', 'cifar10']


In [154]:
# load the arcene dataset (default is train split) and
# save the data to the current directory 
ds_train = OpenTabularDataset('./../../../../../_Datasets/', 'arcene', split='train')
ds_test = OpenTabularDataset('./../../../../../_Datasets/', 'arcene', split='test')

Data already available at `../../../../../_Datasets/arcene.npz`
Data already available at `../../../../../_Datasets/arcene.json`
Data already available at `../../../../../_Datasets/arcene.npz`
Data already available at `../../../../../_Datasets/arcene.json`


In [155]:
ds_train[0][0].shape, len(ds_train)

((10000,), 100)

In [156]:
# train_loader = data.DataLoader(ds_train, batch_size=4)
# test_loader = data.DataLoader(ds_test, batch_size=4)

In [157]:
# xx, yy = iter(train_loader).next()

In [158]:
# xx, yy

In [159]:
# xx.shape, yy.shape, yy.dtype, yy.type(torch.float32)

In [160]:
xx, yy = ds_train[:]

In [161]:
xx = torch.Tensor(xx).type(torch.float32)
yy = torch.Tensor(yy.reshape(-1,1)).type(torch.float32)

In [162]:
(xx.isnan()).type(torch.float32).sum()

tensor(0.)

In [163]:
xx = xx/(xx.max(dim=0, keepdim=True)[0]+1e-7)

In [164]:
xx = xx.to(device)
yy = yy.to(device)

In [165]:
_xx, _yy = ds_test[:]
_xx = torch.Tensor(_xx).type(torch.float32)
_yy = torch.Tensor(_yy.reshape(-1,1)).type(torch.float32)
_xx = _xx/_xx.max(dim=0, keepdim=True)[0]
_xx = _xx.to(device)
_yy = _yy.to(device)

### Model Development

In [167]:
fpbs = FactorizedPairBilinearSpline(10000, 2).to(device)

In [168]:
_a = torch.randn(2, 10000).to(device)
_y = fpbs(_a)

In [169]:
fpbs(xx)

tensor([[0.0000, 0.3257, 0.0000,  ..., 0.0000, 0.0000, 0.9831],
        [0.0000, 0.1881, 0.3475,  ..., 0.0000, 0.7245, 0.7936],
        [0.0000, 0.0000, 0.0042,  ..., 0.0000, 0.0867, 0.9531],
        ...,
        [0.0106, 0.0688, 0.2034,  ..., 0.0000, 0.0000, 0.8499],
        [0.0426, 0.0000, 0.1610,  ..., 0.0000, 0.4821, 0.7561],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0255, 0.6848]],
       device='cuda:0', grad_fn=<IndexBackward0>)

In [170]:
(xx.isnan()).type(torch.float32).sum()

tensor(0., device='cuda:0')

In [221]:
class FactorNet(nn.Module):
    def __init__(self):
        super().__init__()
#         self.bias = BiasLayer(10000)
#         self.la1 = FactorizedPairBilinearSpline(10000, grid_width=2)
#         self.la1 = FactorizedPairBilinearSpline(10000, grid_width=5, num_layers=6)
#         self.bn1 = nn.BatchNorm1d(10000)
        self.fc = nn.Linear(10000, 1)
        
    def forward(self, x):
#         x = self.bias(x)
#         x = self.la1(x.contiguous())
#         x = self.la2(x)
#         x = self.bn1(x)
#         x = torch.relu(x)
        x = self.fc(x)
        return torch.sigmoid(x)
#         return x

In [222]:
torch.manual_seed(0)
model = FactorNet().to(device)
model

FactorNet(
  (fc): Linear(in_features=10000, out_features=1, bias=True)
)

In [223]:
model(xx)[0].isnan().type(torch.float32).sum()

tensor(0., device='cuda:0')

In [224]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

In [225]:
print("number of params: ", sum(p.numel() for p in model.parameters()))

number of params:  10001


In [226]:
10000*10000

100000000

In [227]:
losses = []
train_accs = []
test_accs = []
EPOCHS = 10000

for epoch in range(EPOCHS):
    
    train_acc = 0

    yout = model(xx)
    loss = criterion(yout, yy)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch%100 == 0:
        correct = ((yout>0.5).type(torch.float32) == yy).type(torch.float32)
        train_acc = correct.mean()
        train_accs.append(train_acc*100)
        losses.append(float(loss))
        
        print(f'Epoch: {epoch},  Loss:{float(loss)}')

        yout = model(_xx)
        correct = ((yout>0.5).type(torch.float32) == yy).type(torch.float32)
        train_acc = correct.mean()
        test_accs.append(train_acc*100)
        print(f'Train Acc:{train_accs[-1]:.2f}%, Test Acc:{test_accs[-1]:.2f}%')
        print()

### after each class index is finished training
print(f'\t-> Train Acc {max(train_accs)} ; Test Acc {max(test_accs)}')

Epoch: 0,  Loss:0.6665022373199463
Train Acc:65.00%, Test Acc:56.00%

Epoch: 100,  Loss:0.09022673219442368
Train Acc:100.00%, Test Acc:56.00%

Epoch: 200,  Loss:0.03565641865134239
Train Acc:100.00%, Test Acc:56.00%

Epoch: 300,  Loss:0.019870519638061523
Train Acc:100.00%, Test Acc:56.00%

Epoch: 400,  Loss:0.012948412448167801
Train Acc:100.00%, Test Acc:56.00%

Epoch: 500,  Loss:0.009222248569130898
Train Acc:100.00%, Test Acc:56.00%

Epoch: 600,  Loss:0.006955340970307589
Train Acc:100.00%, Test Acc:56.00%

Epoch: 700,  Loss:0.005459170322865248
Train Acc:100.00%, Test Acc:56.00%

Epoch: 800,  Loss:0.004412537906318903
Train Acc:100.00%, Test Acc:56.00%

Epoch: 900,  Loss:0.0036477106623351574
Train Acc:100.00%, Test Acc:56.00%

Epoch: 1000,  Loss:0.003069523023441434
Train Acc:100.00%, Test Acc:56.00%

Epoch: 1100,  Loss:0.00262039084918797
Train Acc:100.00%, Test Acc:56.00%

Epoch: 1200,  Loss:0.0022636621724814177
Train Acc:100.00%, Test Acc:56.00%

Epoch: 1300,  Loss:0.0019750

In [None]:
# 100%|██████████| 1200/1200 [00:52<00:00, 22.72it/s] using called pairlinear
# 100%|██████████| 1200/1200 [00:10<00:00, 118.42it/s] using Ordinary

In [None]:
## stats: 20 epochs || Fact+BN+Linear ; lr0.0001 ##_with 3 bilinear layers
### for factor-net: 5grid : 73706-> 100%|██████████| 1200/1200 [00:24<00:00, 48.44it/s]
########### -> Train Acc 90.3367 ; Test Acc 88.06

### for factor-net: 50grid : 5894906-> 100%|██████████| 1200/1200 [00:28<00:00, 42.74it/s]
########### -> Train Acc 99.985 ; Test Acc 85.85

### for factor-net: 10grid : 250106-> 100%|██████████| 1200/1200 [00:24<00:00, 48.11it/s]
########### -> Train Acc 92.17167 ; Test Acc 88.36

In [None]:
### for factor-net: fact+bn+relu+linear : 5grid : lr 0.0003
####### -> Train Acc 92.42833333333334 ; Test Acc 88.42

### same : factor-net had default of 3 bilinear layers.. changed to log2(input dim)=10 to properly mix all.
#######  -> 100%|██████████| 1200/1200 [00:50<00:00, 23.88it/s]
### facto-net: fact+bn+relu+linear : 5grid  -> params=221882
######## -> Train Acc 95.165 ; Test Acc 89.45

### ordinary net || linear+BN+Linear : lr=0.0003 : params=624074  -> [579.83it/s]
######## -> Train Acc 95.96166666666667 ; Test Acc 89.33


### Sparse Dataset