In [1]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D

import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
from torchvision import datasets, transforms

from tqdm import tqdm
import random, time, os, sys, json

In [3]:
import sparse_nonlinear_lib as snl

In [4]:
# Pick the compute device. The experiments target the second GPU ("cuda:1");
# fall back to the first GPU, or to the CPU, when that device is not present
# so the notebook still runs end-to-end on other machines.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

In [5]:
# time.sleep(60*60)

## For CIFAR10 dataset

In [6]:
# Per-channel normalisation statistics and dataset location, named once so the
# train and test pipelines cannot drift apart.
CIFAR10_MEAN = [0.4914, 0.4822, 0.4465]  # for cifar100: [0.5071, 0.4865, 0.4409]
CIFAR10_STD = [0.2023, 0.1994, 0.2010]  # for cifar100: [0.2009, 0.1984, 0.2023]
CIFAR10_ROOT = "../../../../../_Datasets/cifar10/"

# Training pipeline: random 32x32 crop after 4-pixel padding and a random
# horizontal flip, followed by tensor conversion and normalisation.
cifar_train = transforms.Compose([
    transforms.RandomCrop(size=32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD),
])

# Test pipeline: no augmentation, same normalisation as training.
cifar_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD),
])

# Both splits live under the same root and are downloaded if missing.
train_dataset = datasets.CIFAR10(root=CIFAR10_ROOT, train=True, download=True, transform=cifar_train)
test_dataset = datasets.CIFAR10(root=CIFAR10_ROOT, train=False, download=True, transform=cifar_test)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
def seed_worker(worker_id):
    """Seed NumPy's and Python's global RNGs from torch's current initial seed.

    Written to be used as a ``DataLoader`` ``worker_init_fn``. ``worker_id`` is
    accepted to match that callback signature but is not used.
    """
    # Reduce the torch seed modulo 2**32 before handing it to the other RNGs.
    seed32 = torch.initial_seed() % (1 << 32)
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(seed32)

# Dedicated, explicitly seeded generator for the training loader so the
# shuffling order does not depend on the global torch RNG state.
g = torch.Generator()
g.manual_seed(123)

# The generator must be handed to the DataLoader explicitly; creating and
# seeding it alone has no effect on how batches are drawn.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2,
    worker_init_fn=seed_worker, generator=g,
)
# Evaluation data is read in a fixed order, so no generator is needed here.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=32, shuffle=False, num_workers=2,
)

In [8]:
## demo of train loader: fetch a single batch and show the image tensor shape
xx, yy = next(iter(train_loader))
xx.shape

torch.Size([32, 3, 32, 32])

# Model Comparison

## Pair Linear Mixing

In [9]:
def img2patch(x, input_dim=(1, 28, 28), patch_size=(7, 4)):
    """Cut a batch of images into non-overlapping patches via ``F.unfold``.

    Args:
        x: image batch accepted by ``F.unfold``, e.g. ``(N, C, H, W)``.
        input_dim: accepted but not used by this function.
        patch_size: ``(height, width)`` of a patch; also used as the stride,
            which is why the patches do not overlap.

    Returns:
        The unfolded tensor: one flattened patch per position along the
        last dimension.
    """
    return F.unfold(x, kernel_size=patch_size, stride=patch_size)

In [10]:
def patch2img(x, patch_size=(7, 4), input_dim=(1, 28, 28)):
    """Reassemble patches into images via ``F.fold`` (counterpart of ``img2patch``).

    Args:
        x: patch tensor in the layout produced by ``F.unfold``.
        patch_size: ``(height, width)`` of a patch; also used as the stride.
        input_dim: image dimensions; only the last two entries (height and
            width) are read and used as the output size.

    Returns:
        The folded image tensor.
    """
    height, width = input_dim[-2], input_dim[-1]
    return F.fold(x, (height, width), kernel_size=patch_size, stride=patch_size)

1. Linearize by expanding the dimension of folded image.

## Final Model

In [11]:
class CIFAR10_BlockMLP(nn.Module):
    """CIFAR-10 classifier: selector -> block-MLP mixer -> LayerNorm -> activation -> linear head.

    Args:
        img_size: input image dimensions; their product is the flattened input
            width handed to the dimension selector.
        select: feature width after the selector and through the mixer.
        block_size: forwarded to ``snl.BlockMLP_MixerBlock``.
        hidden_layers_ratio: forwarded to ``snl.BlockMLP_MixerBlock``
            (presumably hidden width as a multiple of ``block_size`` --
            confirm in ``sparse_nonlinear_lib``).
        actf: activation *class*; passed to the mixer and also instantiated
            once for the activation applied after the norm.
    """

    def __init__(self, img_size=(3, 32, 32), select=4096, block_size=2, hidden_layers_ratio=[4], actf=nn.GELU):
        super().__init__()
        flat_dim = np.prod(img_size)
        self.dim_sel = snl.DimensionRandomSelector(flat_dim, select)
        self.block_mlp = snl.BlockMLP_MixerBlock(
            select,
            block_size,
            hidden_layers_ratio=hidden_layers_ratio,
            actf=actf,
        )
        # Alternative that was tried here: nn.BatchNorm1d(select)
        self.norm = nn.LayerNorm(select)
        self.actf = actf()
        self.fc = nn.Linear(select, 10)

    def forward(self, x):
        """Flatten each sample, run it through the stages in order, return 10 outputs per sample."""
        x = x.reshape(x.shape[0], -1)
        for stage in (self.dim_sel, self.block_mlp, self.norm, self.actf, self.fc):
            x = stage(x)
        return x

In [12]:
# Instantiate the block-MLP variant with block_size=4 and report how many
# parameters it has in total.
model = CIFAR10_BlockMLP(block_size=4)
print("number of params: ", sum(p.numel() for p in model.parameters()))

number of params:  958474


In [13]:
# Display the module tree of the current model to check its layer layout.
model

CIFAR10_BlockMLP(
  (dim_sel): DimensionSelector: [+=4096]
  (block_mlp): BlockMLP_MixerBlock(
    (facto_nets): ModuleList(
      (0-5): 6 x BlockMLP(
        (mlp): Sequential(
          (0): BlockLinear: [1024, 4, 16]
          (1): GELU(approximate='none')
          (2): BlockLinear: [1024, 16, 4]
        )
      )
    )
  )
  (norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (actf): GELU(approximate='none')
  (fc): Linear(in_features=4096, out_features=10, bias=True)
)

In [14]:
# asdfasdf

In [15]:
#### Using DimensionSelector so that the models are comparable

class CIFAR10_OrdMLP(nn.Module):
    """Dense baseline: selector -> full ``nn.Linear`` -> LayerNorm -> GELU -> linear head.

    Args:
        img_size: input image dimensions; their product is stored as
            ``input_dim`` and handed to the dimension selector.
        select: feature width after the selector and of the dense layer.
    """

    def __init__(self, img_size=(3, 32, 32), select=4096):
        super().__init__()
        self.input_dim = np.prod(img_size)
        self.dim_sel = snl.DimensionRandomSelector(self.input_dim, select)

        self.l0 = nn.Linear(select, select)
        self.norm = nn.LayerNorm(select)
        self.actf = nn.GELU()
        self.l1 = nn.Linear(select, 10)

    def forward(self, x):
        """Flatten each sample, run it through the stages in order, return 10 outputs per sample."""
        x = x.reshape(x.shape[0], -1)
        for stage in (self.dim_sel, self.l0, self.norm, self.actf, self.l1):
            x = stage(x)
        return x

In [16]:
# Instantiate the dense baseline and report its total parameter count.
model = CIFAR10_OrdMLP(select=4096)
print("number of params: ", sum(p.numel() for p in model.parameters()))

number of params:  16830474


In [17]:
# Display the module tree of the dense baseline.
model

CIFAR10_OrdMLP(
  (dim_sel): DimensionSelector: [+=4096]
  (l0): Linear(in_features=4096, out_features=4096, bias=True)
  (norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (actf): GELU(approximate='none')
  (l1): Linear(in_features=4096, out_features=10, bias=True)
)

In [18]:
class CIFAR10_SparseMLP(nn.Module):
    """Selector -> ``snl.BlockLinear_MixerBlock`` -> LayerNorm -> GELU -> linear head.

    Args:
        img_size: input image dimensions; their product is the flattened input
            width handed to the dimension selector.
        select: feature width after the selector and through the mixer.
        block_size: forwarded to ``snl.BlockLinear_MixerBlock``.
    """

    def __init__(self, img_size=(3, 32, 32), select=4096, block_size=2):
        super().__init__()
        flat_dim = np.prod(img_size)
        self.dim_sel = snl.DimensionRandomSelector(flat_dim, select)

        self.l0 = snl.BlockLinear_MixerBlock(select, block_size)
        self.norm = nn.LayerNorm(select)
        self.actf = nn.GELU()
        self.l1 = nn.Linear(select, 10)

    def forward(self, x):
        """Flatten each sample, run it through the stages in order, return 10 outputs per sample."""
        x = x.reshape(x.shape[0], -1)
        for stage in (self.dim_sel, self.l0, self.norm, self.actf, self.l1):
            x = stage(x)
        return x

In [19]:
class CIFAR10_SparseMLP_PWLF(nn.Module):
    """Selector -> ``snl.BlockLinear_MixerBlock`` -> LayerNorm -> ``snl.PairBilinear`` -> linear head.

    Same layout as ``CIFAR10_SparseMLP`` except that ``snl.PairBilinear``
    (built with grid argument 5) takes the place of the GELU in the forward
    pass. The GELU module is still created as ``self.actf`` but is not
    applied.

    Args:
        img_size: input image dimensions; their product is the flattened input
            width handed to the dimension selector.
        select: feature width after the selector and through the mixer.
        block_size: forwarded to ``snl.BlockLinear_MixerBlock``.
    """

    def __init__(self, img_size=(3, 32, 32), select=4096, block_size=2):
        super().__init__()
        flat_dim = np.prod(img_size)
        self.dim_sel = snl.DimensionRandomSelector(flat_dim, select)

        self.l0 = snl.BlockLinear_MixerBlock(select, block_size)
        self.norm = nn.LayerNorm(select)
        self.pwlf = snl.PairBilinear(select, 5)
        self.actf = nn.GELU()  # registered but intentionally not used in forward()
        self.l1 = nn.Linear(select, 10)

    def forward(self, x):
        """Flatten each sample, run it through the stages in order, return 10 outputs per sample."""
        x = x.reshape(x.shape[0], -1)
        # self.actf is deliberately absent from this chain.
        for stage in (self.dim_sel, self.l0, self.norm, self.pwlf, self.l1):
            x = stage(x)
        return x

In [20]:
# Build one of the sparse variants directly on the selected device and report
# its total parameter count (the commented line is the plain sparse variant).
# model = CIFAR10_SparseMLP(block_size=32).to(device)
model = CIFAR10_SparseMLP_PWLF(block_size=16).to(device)

print("number of params: ", sum(p.numel() for p in model.parameters()))

number of params:  356362


In [21]:
# Smoke test: push a random batch of two 3x32x32 inputs through the model and
# show the output shape.
model(torch.randn(2, 3, 32, 32).to(device)).shape

torch.Size([2, 10])

In [22]:
# Display the module tree of the sparse PWLF model.
model

CIFAR10_SparseMLP_PWLF(
  (dim_sel): DimensionSelector: [+=4096]
  (l0): BlockLinear_MixerBlock(
    (facto_nets): ModuleList(
      (0-2): 3 x BlockWeight: [256, 16, 16]
    )
  )
  (norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (pwlf): PairBilinear: [4096 -> 4096] (grid: 5)
  (actf): GELU(approximate='none')
  (l1): Linear(in_features=4096, out_features=10, bias=True)
)

In [23]:
class CIFAR10_PairBilinear(nn.Module):
    """Selector -> ``snl.PairBilinear_MixerBlock`` -> LayerNorm -> GELU -> linear head.

    Args:
        img_size: input image dimensions; their product is the flattened input
            width handed to the dimension selector.
        select: feature width after the selector; passed twice to the mixer
            block (presumably as its input and output width -- confirm in
            ``sparse_nonlinear_lib``).
        grid_width: forwarded to ``snl.PairBilinear_MixerBlock``.
    """

    def __init__(self, img_size=(3, 32, 32), select=4096, grid_width=5):
        super().__init__()
        flat_dim = np.prod(img_size)
        self.dim_sel = snl.DimensionRandomSelector(flat_dim, select)
        self.block_func = snl.PairBilinear_MixerBlock(select, select, grid_width=grid_width)
        self.norm = nn.LayerNorm(select)
        self.actf = nn.GELU()  # alternative that was tried: nn.ELU()
        self.fc = nn.Linear(select, 10)

    def forward(self, x):
        """Flatten each sample, run it through the stages in order, return 10 outputs per sample."""
        x = x.reshape(x.shape[0], -1)
        for stage in (self.dim_sel, self.block_func, self.norm, self.actf, self.fc):
            x = stage(x)
        return x

In [24]:
# Instantiate the pair-bilinear variant with grid_width=3 and report its total
# parameter count.
model = CIFAR10_PairBilinear(grid_width=3)
print("number of params: ", sum(p.numel() for p in model.parameters()))

number of params:  593930


In [25]:
# Display the module tree of the pair-bilinear model.
model

CIFAR10_PairBilinear(
  (dim_sel): DimensionSelector: [+=4096]
  (block_func): PairBilinear_MixerBlock(
    (selector): BiasLayer: [4096]
    (pairwise_mixing): ModuleList(
      (0-11): 12 x PairBilinear: [4096 -> 4096] (grid: 3)
    )
    (reducer): Identity()
  )
  (norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (actf): GELU(approximate='none')
  (fc): Linear(in_features=4096, out_features=10, bias=True)
)

## Create Models

In [26]:
# Move the most recently built model onto the selected device and display it.
model = model.to(device)
model

CIFAR10_PairBilinear(
  (dim_sel): DimensionSelector: [+=4096]
  (block_func): PairBilinear_MixerBlock(
    (selector): BiasLayer: [4096]
    (pairwise_mixing): ModuleList(
      (0-11): 12 x PairBilinear: [4096 -> 4096] (grid: 3)
    )
    (reducer): Identity()
  )
  (norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (actf): GELU(approximate='none')
  (fc): Linear(in_features=4096, out_features=10, bias=True)
)

In [27]:
# Smoke test: push a random batch of two 3x32x32 inputs through the model and
# show the output shape.
model(torch.randn(2, 3, 32, 32).to(device)).shape

torch.Size([2, 10])

In [28]:
# Total number of parameters of the model that is about to be trained.
print("number of params: ", sum(p.numel() for p in model.parameters()))

number of params:  593930


## Training

In [29]:
 ## debugging to find the good classifier/output distribution.
# Name used for this run; `test()` builds its checkpoint path from the
# `model_name` it is given. Keep the line matching the model built above.
# model_name = 'block_mlp_mixer_CIFAR10_v0'
# model_name = 'ord_mlp_mixer_CIFAR10_v0'
model_name = 'pair_bilinear_mixer_CIFAR10_v0'

In [30]:
# Training objects for the single-model run on the global `model`.
EPOCHS = 50
criterion = nn.CrossEntropyLoss()

# Adam at lr=0.001 (also tried: Adam at 0.00001 / 0.00003 and SGD at 0.001).
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Cosine learning-rate schedule whose period spans all EPOCHS.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

In [31]:
## Following is copied from 
### https://github.com/kuangliu/pytorch-cifar/blob/master/main.py

# Training
def train(epoch, model, optimizer):
    """Run one training epoch over the module-level ``train_loader``.

    Reads the globals ``train_loader``, ``criterion`` and ``device``.

    Args:
        epoch: epoch index; accepted but not used in the body.
        model: network to optimise; it is switched to training mode.
        optimizer: optimiser that updates ``model``'s parameters.

    Returns:
        ``(loss, acc)``: the mean of the per-batch losses and the accuracy
        in percent over all samples seen.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    model.train()
    loss_sum = 0.0
    n_batches = 0
    correct = 0
    total = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        n_batches += 1
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    # Count batches explicitly instead of relying on a leaked loop index,
    # which is undefined when the loader is empty.
    if n_batches == 0:
        raise ValueError("train_loader produced no batches")

    return loss_sum / n_batches, 100. * correct / total

In [32]:
# best_acc = -1
def test(epoch, model, optimizer, best_acc, model_name):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    latency = []
    with torch.no_grad():
#         for batch_idx, (inputs, targets) in enumerate(tqdm(test_loader)):
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            
            start = time.time()
            outputs = model(inputs)
            ttaken = time.time()-start
                
            loss = criterion(outputs, targets)
            
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            latency.append(ttaken)
    
    loss = test_loss/(batch_idx+1)
    acc = 100.*correct/total
#     print(f"[Test] {epoch} Loss: {test_loss/(batch_idx+1):.3f} | Acc: {100.*correct/total:.3f} {correct}/{total}")
    
    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
#         print(f'Saving.. Acc: {100.*correct/total:.3f}')
        state = {
            'model': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('models'):
            os.mkdir('models')
        torch.save(state, f'./models/{model_name}.pth')
        best_acc = acc
        
    return loss, acc, best_acc, latency

In [33]:
# start_epoch = 0  # start from epoch 0 or last checkpoint epoch
# resume = False

# if resume:
#     # Load checkpoint.
#     print('==> Resuming from checkpoint..')
#     assert os.path.isdir('./models'), 'Error: no checkpoint directory found!'
#     checkpoint = torch.load(f'./models/{model_name}.pth')
#     model.load_state_dict(checkpoint['model'])
#     best_acc = checkpoint['acc']
#     start_epoch = checkpoint['epoch']

In [34]:
# # ### Train the whole damn thing

# best_acc = -1
# for epoch in range(start_epoch, start_epoch+EPOCHS): ## for 200 epochs
#     trloss, tracc = train(epoch, model, optimizer)
#     teloss, teacc, best_acc, latency = test(epoch, model, optimizer, best_acc, model_name)
#     scheduler.step()

In [35]:
# best_acc ## 90.42 for ordinary, 89.59 for sparse, 89.82 for 32bMLP

### Run all experiments repeatedly

In [36]:
def train_model(model, lr, model_name, epochs=200, seed=0):
    """Train ``model`` for ``epochs`` epochs and dump its statistics to JSON.

    Rebinds the module-level ``criterion``, ``train_loader`` and
    ``test_loader`` because ``train()`` and ``test()`` read them as globals.

    Args:
        model: network to train; it is moved to the global ``device``.
        lr: learning rate for the Adam optimiser.
        model_name: used for the checkpoint written by ``test()`` and for the
            statistics file ``./models/stats/{model_name}_stats.json``.
        epochs: number of epochs; also the period of the cosine LR schedule.
        seed: passed to ``torch.manual_seed`` before the loaders are built.

    Returns:
        ``(stats, best_acc)``: ``stats`` holds the parameter count, per-epoch
        train/test loss and accuracy, and the mean/std of the per-batch test
        latency; ``best_acc`` is the best test accuracy reached.
    """
    global criterion, train_loader, test_loader

    # Create the output directory up front: finding out that it is missing
    # only after the last epoch would lose the statistics of the whole run.
    os.makedirs('./models/stats', exist_ok=True)

    torch.manual_seed(seed)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, num_workers=2)

    best_acc = -1
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    n_params = sum(p.numel() for p in model.parameters())
    stats = {'num_param': n_params, 'latency': [],
             'train_acc': [], 'train_loss': [],
             'test_acc': [], 'test_loss': []
            }

    print(f"Begin Training for {model_name}")
    print(f"Num Parameters: {n_params}")

    for epoch in tqdm(range(epochs)):
        trloss, tracc = train(epoch, model, optimizer)
        teloss, teacc, best_acc, laten = test(epoch, model, optimizer, best_acc, model_name)
        scheduler.step()

        stats['latency'] += laten
        stats['train_acc'].append(tracc)
        stats['test_acc'].append(teacc)
        stats['train_loss'].append(trloss)
        stats['test_loss'].append(teloss)

    print()

    # Collapse the raw per-batch latencies into summary statistics; cast to
    # plain floats so the JSON encoder never sees NumPy scalar types.
    latency = np.array(stats['latency'])
    stats['latency'] = {'mean': float(np.mean(latency)), 'std': float(np.std(latency))}

    ### Save stats of the model
    with open(f'./models/stats/{model_name}_stats.json', 'w') as f:
        json.dump(stats, f)

    return stats, best_acc

In [37]:
# mlp_dims_scale = {
#     64: [2, 4, 8],
#     16: [2, 4, 8],
#     8: [2, 4, 8],
#     4: [4, 8, 16],
#     2: [4, 8, 16],
# }
# pwlf_grid_size = [3, 5, 9]

# SEEDS = [147, 258, 369]

# def benchmark_cifar10():
#     for seed in [147]:
#         ### First test MLP with allowed dimension mixing
#         for dim, hid_dim in mlp_dims_scale.items(): ## For 1024 these are the factors
#             for hr in hid_dim:
#                 torch.manual_seed(seed)
#                 model = CIFAR10_BlockMLP(block_size=dim, hidden_layers_ratio=[hr])
#                 n_params = sum(p.numel() for p in model.parameters())
#                 print(f"{dim}\t{hr}\t{n_params}\tBlockMLP")
            
#             model = CIFAR10_SparseMLP(block_size=dim)
#             n_params = sum(p.numel() for p in model.parameters())
#             print(f"{dim}\t\t{n_params}\tSparseMLP")
            
#             model = CIFAR10_SparseMLP_PWLF(block_size=dim)
#             n_params = sum(p.numel() for p in model.parameters())
#             print(f"{dim}\t\t{n_params}\tSparseMLP_PWLF")
#             print()
        
#         for gsz in pwlf_grid_size:
#             model = CIFAR10_PairBilinear(grid_width=gsz)
#             n_params = sum(p.numel() for p in model.parameters())
#             print(f"{2}\t{gsz}\t{n_params}\tPairPWLF")
        
#         print()
#         model = CIFAR10_OrdMLP()
#         n_params = sum(p.numel() for p in model.parameters())
#         print(f"\t\t{n_params}\tOrdMLP")


# benchmark_cifar10()

In [38]:
# model = CIFAR10_OrdMLP()
# sum(p.numel() for p in model.parameters())

## Configuring training and saving functionality

In [41]:
# Benchmark grid. Keys are the block sizes passed as `block_size=` to the
# block-based models; values are the `hidden_layers_ratio` candidates used by
# the (currently commented-out) BlockMLP sweep in `benchmark_cifar10`.
mlp_dims_scale = {
    64: [2, 4, 8],
    16: [2, 4, 8],
    8: [2, 4, 8],
    4: [4, 8, 16],
    2: [4, 8, 16],
}

# `grid_width` values swept for CIFAR10_PairBilinear.
pwlf_grid_size = [3, 5, 9]

# Seeds for repeated runs, and the shared training budget / learning rate.
# NOTE: this rebinds EPOCHS, which an earlier cell set to 50.
SEEDS = [147, 258, 369]
EPOCHS = 200
LR = 0.001

def benchmark_cifar10():
    """Train the configured model variants and save a checkpoint and stats for each.

    For the current seed, trains ``CIFAR10_SparseMLP_PWLF`` for every block
    size in ``mlp_dims_scale`` and ``CIFAR10_PairBilinear`` for every grid
    width in ``pwlf_grid_size``. The seed is applied before model construction
    and is also forwarded to ``train_model`` so that the training run itself
    is seeded with it (previously every run silently used ``train_model``'s
    default ``seed=0``).

    NOTE: the ``break`` at the end of the seed loop stops after the first
    entry of ``SEEDS``.
    """
#     for seed in [147]:
    for seed in SEEDS:
        ## First test MLP with allowed dimension mixing
        for dim, hid_dim in mlp_dims_scale.items(): ## For 1024 these are the factors
#             for hr in hid_dim:
#                 torch.manual_seed(seed)
#                 model = CIFAR10_BlockMLP(block_size=dim, hidden_layers_ratio=[hr])
#                 model_name = f"Rcifar10_BlockMLP_b{dim}_h{hr}_s{seed}"
#                 train_model(model, LR, model_name, EPOCHS, seed=seed)

#             torch.manual_seed(seed)
#             model = CIFAR10_SparseMLP(block_size=dim)
#             model_name = f"Rcifar10_SparseMLP_b{dim}_s{seed}"
#             train_model(model, LR, model_name, EPOCHS, seed=seed)

            torch.manual_seed(seed)
            model = CIFAR10_SparseMLP_PWLF(block_size=dim)
            model_name = f"Rcifar10_SparseMLP_PWLF_b{dim}_s{seed}"
            train_model(model, LR, model_name, EPOCHS, seed=seed)

        for gsz in pwlf_grid_size:
            torch.manual_seed(seed)
            model = CIFAR10_PairBilinear(grid_width=gsz)
            model_name = f"Rcifar10_PairPWLF_g{gsz}_s{seed}"
            train_model(model, LR, model_name, EPOCHS, seed=seed)
        break

#         torch.manual_seed(seed)
#         model = CIFAR10_OrdMLP()
#         model_name = f"Rcifar10_OrdinaryMLP_s{seed}"
#         train_model(model, LR, model_name, EPOCHS, seed=seed)

In [None]:
# Launch the benchmark; this trains every configured model for EPOCHS epochs.
benchmark_cifar10()

Begin Training for Rcifar10_SparseMLP_PWLF_b64_s147
Num Parameters: 684042


  8%|████▏                                               | 16/200 [01:49<21:05,  6.88s/it]