In [1]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

import os, sys, pathlib, random, time, pickle, copy, json
# from tqdm.autonotebook import tqdm
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch.optim as optim
from torch.utils import data

In [3]:
from transformers_lib import TransformerBlock, \
        Mixer_TransformerBlock_Encoder, \
        PositionalEncoding

# Model

In [4]:
### add randomize patches for clear benefit
class Mixer_ViT_Classifier(nn.Module):
    """Patch-based image classifier built from `Mixer_TransformerBlock_Encoder` blocks.

    The image is cut into non-overlapping patches with `nn.Unfold`, every
    flattened patch is projected to `hidden_channel` features, an optional
    positional encoding is applied, the patch sequence goes through
    `num_blocks` encoder blocks and the flattened result is classified by a
    single linear layer.

    Args:
        image_dim: image size; the last two entries are (H, W) and entry 0 is
            used as the channel count.
            NOTE(review): the original comment allows (H, W), but entry 0
            would then be H, not a channel count -- confirm before relying on
            2-element sizes.
        patch_size: (patch_h, patch_w); must divide H and W exactly.
        hidden_channel: number of features per patch after the projection.
        num_blocks: number of encoder blocks stacked in sequence.
        num_classes: size of the output layer.
        block_seq_size: block size forwarded to the encoder for the sequence
            axis. If None or < 2, the smallest power of two >= sqrt(number of
            patches) is used.
        block_mlp_size: forwarded unchanged to the encoder blocks.
        forward_expansion: forwarded unchanged to the encoder blocks.
        pos_emb: when falsy, the positional encoding is replaced by identity.
        dropout: forwarded to the encoder blocks (the positional encoding is
            always built with dropout=0).
        randomize_patch: when truthy, a fixed random permutation of the patch
            order is drawn at construction time and applied in `forward`.
    """

    def __init__(self, image_dim:tuple, patch_size:tuple, hidden_channel:int, num_blocks:int, num_classes:int, block_seq_size:int, block_mlp_size:int, forward_expansion:float=2.0, pos_emb=True, dropout:float=0.0, randomize_patch:bool=False):
        super().__init__()

        self.img_dim = image_dim ### must contain (C, H, W) or (H, W)

        ### number of patches along each spatial axis
        d0 = image_dim[-2] // patch_size[0]
        d1 = image_dim[-1] // patch_size[1]
        assert d0*patch_size[0]==image_dim[-2], "Image must be divisible into patch size"
        assert d1*patch_size[1]==image_dim[-1], "Image must be divisible into patch size"

        ### number of raw values in one flattened patch
        init_dim = patch_size[0]*patch_size[1]*image_dim[0]
        final_dim = hidden_channel

        self.unfold = nn.Unfold(kernel_size=patch_size, stride=patch_size)
        #### rescale the patches (patch wise image non preserving transform, unlike bilinear interpolation)
        self.channel_change = nn.Linear(init_dim, final_dim)
        print(f"ViT Mixer : Channels per patch -> Initial:{init_dim} Final:{final_dim}")

        self.channel_dim = final_dim
        self.patch_dim = d0*d1 ## number of patches == sequence length

        ### choose the divisor of channel_dim closest to sqrt(channel_dim) as head count
        ### (channel_dim == 1 has no divisor >= 2, fall back to a single head)
        f = self.get_factors(self.channel_dim) or [1]
        print(f)
        fi = np.abs(np.array(f) - np.sqrt(self.channel_dim)).argmin()
        _n_heads = f[fi]

        ### resolve the default block size BEFORE reporting it
        if block_seq_size is None or block_seq_size<2:
            block_seq_size = int(2**np.ceil(np.log2(np.sqrt(self.patch_dim))))

        print(f'Sequence len: {self.patch_dim} ; Block size: {block_seq_size}')
        print('Channel dim:', self.channel_dim, 'num heads:',_n_heads)
        print(f'MLP dim: {self.channel_dim} ; Block size: {block_mlp_size}')

        blocks = [
            Mixer_TransformerBlock_Encoder(self.patch_dim, block_seq_size, self.channel_dim, _n_heads, dropout, forward_expansion, nn.GELU, block_mlp_size)
            for _ in range(num_blocks)
        ]
        self.transformer_blocks = nn.Sequential(*blocks)

        self.linear = nn.Linear(self.patch_dim*self.channel_dim, num_classes)

        if pos_emb:
            self.positional_encoding = PositionalEncoding(self.channel_dim, dropout=0)
        else:
            self.positional_encoding = nn.Identity()

        ### Only shuffle when asked to. The previous `is not None` test was always
        ### true for a bool, so the default (False) still shuffled the patches.
        ### A non-persistent buffer follows .to(device) without touching state_dict.
        permutation = torch.randperm(self.patch_dim) if randomize_patch else None
        self.register_buffer('randomize', permutation, persistent=False)

    def get_factors(self, n):
        """Return every divisor of `n` in [2, n], in ascending order."""
        return [i for i in range(2, n+1) if n%i == 0]

    def forward(self, x):
        """Classify a batch of images shaped (N, C, H, W); returns (N, num_classes)."""
        bs = x.shape[0]
        ### (N, C*ph*pw, num_patches) -> (N, num_patches, C*ph*pw)
        x = self.unfold(x).swapaxes(-1, -2)
        x = self.channel_change(x)
        x = self.positional_encoding(x)
        ## swap position of patches here
        if self.randomize is not None:
            x = x[..., self.randomize, :]
        x = self.transformer_blocks(x)
        ### reshape (not view) so a non-contiguous block output cannot raise here
        x = self.linear(x.reshape(bs, -1))
        return x

In [5]:
# Notebook-level default device. benchmark_memory() does not read this global:
# it builds its own device from its `cuda` argument.
device = torch.device('cuda:1')
# device = torch.device('cpu')

In [6]:
# torch.cuda.device_count()

In [7]:
# torch.cuda.get_device_name(0)

In [8]:
# torch.cuda.memory_allocated()

In [9]:
# model = Mixer_ViT_Classifier([3, 32, 32], [2, 2], 64, num_blocks=2, num_classes=10, 
#                             block_seq_size=16, block_mlp_size=None, pos_emb=False).to(device)

In [10]:
# model

In [11]:
# model = torch.compile(model)

In [12]:
# model = Mixer_ViT_Classifier([3, 32, 32], [1, 1], 64, num_blocks=2, num_classes=10, 
#                             block_seq_size=32, block_mlp_size=None, pos_emb=False).to(device)

In [13]:
# # print("number of params: ", sum(p.numel() for p in model.parameters()))
# for name, m in model.named_children():
# #     print(name)
#     print(f"{name}: {sum(p.numel() for p in m.parameters())}")

In [14]:
# model

In [15]:
# print("number of params: ", sum(p.numel() for p in model.parameters()))

## Benchmark Memory and Time CIFAR

In [16]:
# import nvidia_smi
# ### pip install nvidia-ml-py3


# MB = 1024*1024
# def get_memory_used(cuda_idx):
#     nvidia_smi.nvmlInit()
#     handle = nvidia_smi.nvmlDeviceGetHandleByIndex(cuda_idx)
#     info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
#     val = info.used/MB
#     nvidia_smi.nvmlShutdown()
#     return val

In [17]:
# get_memory_used(0)

In [18]:
import subprocess as sp
import os

def get_memory_used(cuda_idx):
    """Return the `memory.used` value nvidia-smi reports for one GPU.

    `cuda_idx` is the PyTorch CUDA index. On this server the PyTorch and
    nvidia-smi orderings are flipped, so index 0 reads nvidia-smi row 1 and
    every other index reads row 0.

    The value is the leading integer of the reported field (e.g. 2 for
    '2 MiB').
    """
    ## server 1 has pytorch-cuda index and nvidia-smi index flipped
    smi_row = 1 if cuda_idx == 0 else 0

    query = ["nvidia-smi", "--query-gpu=memory.used", "--format=csv"]
    report = sp.check_output(query).decode('ascii')

    # First line is the CSV header, last entry is the empty string after the
    # trailing newline; everything in between is one row per GPU.
    rows = report.split('\n')[1:-1]
    used_per_gpu = [int(row.split()[0]) for row in rows]
    return used_per_gpu[smi_row]

In [19]:
# Quick check of the helper; the argument is a PyTorch CUDA index.
get_memory_used(1)

2

In [20]:
# Raw per-GPU rows from nvidia-smi, in nvidia-smi order (no index flip applied).
command = "nvidia-smi --query-gpu=memory.used --format=csv"
# [1:-1] drops the CSV header line and the empty entry after the final newline.
memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[1:-1]
memory_free_info

['2 MiB', '1 MiB']

In [21]:
# asdfsadf

In [22]:
expansion_dict = {16:1024, 8:256, 4:128, 2:64, 1:64}

def _summarize_times(samples):
    """Return mean/std/min/max of a list of durations (seconds) as plain floats."""
    return {"mean":float(np.mean(samples)), "std":float(np.std(samples)),
            "min":float(np.min(samples)), "max":float(np.max(samples))}


def benchmark_memory(dataset:str, patch_size:int, num_layers:int, SEED:int, sparse_att:bool=False, sparse_mlp:bool=False, pos_emb:bool=False, cuda:int=0):
    """Benchmark one Mixer_ViT_Classifier configuration on random CIFAR-sized data.

    Builds the model on `cuda:{cuda}`, wraps it in `torch.compile`, runs 50
    training steps and 50 no-grad forward passes on one fixed random batch of
    32 images, and stores GPU memory growth, timing statistics and parameter
    count in the results JSON under a key derived from the configuration.

    Args:
        dataset: 'cifar10' or 'cifar100'; only selects the number of classes.
        patch_size: side of the square patches; also the key into the global
            `expansion_dict` that gives the hidden width.
        num_layers: number of layers; halved when `sparse_att` is set.
        SEED: seed for torch and numpy.
        sparse_att: use a block size of about sqrt(sequence length).
        sparse_mlp: use an MLP block size of about sqrt(hidden width).
        pos_emb: forwarded to the model.
            NOTE(review): not part of the result key, so runs with and
            without positional embedding share one entry.
        cuda: PyTorch CUDA device index.

    Returns:
        0 if the result key already exists in the JSON file (nothing is run),
        1 after a completed benchmark.

    Raises:
        ValueError: if `dataset` is not one of the supported names.

    Side effects:
        Sets the globals `filename` and `model_name`, which the driver cell
        reads when it records a failed run.
    """
    global expansion_dict, filename, model_name
    device = torch.device(f"cuda:{cuda}")

    if sparse_att:
        assert num_layers%2 == 0, 'number of blocks on sparse transformer is (x2)/2 hence it must be even'
        num_layers_ = num_layers//2
    else:
        num_layers_ = num_layers

    BS = 32
    N_ITERS = 50
    imsize = (3, 32, 32)
    expansion = expansion_dict[patch_size]

    torch.manual_seed(SEED)
    np.random.seed(SEED)

    ### Block sizes handed to the model
    seq_len = (imsize[-1]*imsize[-2])//(patch_size*patch_size)
    mlp_dim = expansion
    print(seq_len, mlp_dim)

    if sparse_att:
        seq_len = int(2**np.ceil(np.log2(np.sqrt(seq_len))))
    if sparse_mlp:
        mlp_dim = int(2**np.ceil(np.log2(np.sqrt(expansion))))

    _a = 'sAtt' if sparse_att else 'att'
    _b = 'sMlp' if sparse_mlp else 'mlp'

    filename = f"./output/bench_mem_retest_data_3090_v3(1by1)_compile.json"

    if not os.path.exists(filename):
        ### make sure ./output exists before the first write
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'w') as f:
            json.dump({}, f, indent=0)

    ### Training
    model_name = f'01.3_ViT_train_{dataset}_patch{patch_size}_l{num_layers}_exp{expansion}_{_a}_{_b}_s{SEED}'
    ### Inference
#     model_name = f'01.3_ViT_eval_{dataset}_patch{patch_size}_l{num_layers}_exp{expansion}_{_a}_{_b}_s{SEED}'

    ### Validated only after `filename`/`model_name` are set, because the driver
    ### cell reads those globals when a run fails.
    num_classes_by_dataset = {'cifar10': 10, 'cifar100': 100}
    if dataset not in num_classes_by_dataset:
        raise ValueError(f"Unknown dataset {dataset!r}; expected one of {sorted(num_classes_by_dataset)}")
    NC = num_classes_by_dataset[dataset]

    with open(filename, 'r') as f:
        file_data = json.load(f)
        if model_name in file_data.keys():
            print(f"{model_name} already found.. !!")
            return 0

    mem_begin = get_memory_used(cuda)

    torch.manual_seed(SEED)
    model = Mixer_ViT_Classifier(imsize,
                                 patch_size=[patch_size]*2,
                                 hidden_channel=expansion,
                                 num_blocks=num_layers_,
                                 num_classes=NC,
                                 block_seq_size=seq_len,
                                 block_mlp_size=mlp_dim,
                                 pos_emb=pos_emb).to(device)
    model = torch.compile(model)

    _x = torch.randn(BS, *imsize)
    ### labels span all NC classes (was hard-coded to 10 even for cifar100)
    _y = torch.randint(NC, (BS,))
    num_params = sum(p.numel() for p in model.parameters())
    print("number of params: ", num_params)

    print(f"Model Name: {model_name}")

    criterion = nn.CrossEntropyLoss()
    ### Training
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    inputs, targets = _x.to(device), _y.to(device)

    ### CUDA kernels are launched asynchronously: synchronize before starting and
    ### before stopping the clock so each sample covers the actual GPU work.
    ### NOTE(review): the first iterations still include torch.compile's
    ### compilation time; consider a warm-up if steady-state numbers are wanted.
    time_taken = []
    for _ in tqdm(range(N_ITERS)):
        torch.cuda.synchronize(device)
        start = time.perf_counter()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        torch.cuda.synchronize(device)
        time_taken.append(time.perf_counter()-start)
    train_time = _summarize_times(time_taken)

    mem_end = get_memory_used(cuda)
    print(f"mem begin: {mem_begin}  end: {mem_end}")

    ### Inference
    model.eval()
    time_taken = []
    with torch.no_grad():
        for _ in range(N_ITERS):
            torch.cuda.synchronize(device)
            start = time.perf_counter()
            outputs = model(inputs)
            torch.cuda.synchronize(device)
            time_taken.append(time.perf_counter()-start)
    test_time = _summarize_times(time_taken)

    with open(filename, 'r+') as f:
        file_data = json.load(f)
        file_data[f"{model_name}"] = {'memory':mem_end-mem_begin,
                                      'time_train':train_time,
                                      'time_test':test_time,
                                      'param':num_params}
        f.seek(0)
        json.dump(file_data, f, indent = 0)
        ### drop leftover bytes if the new document is shorter than the old one
        f.truncate()

    del model, optimizer
    return 1

In [23]:
# benchmark_memory(dataset='cifar100', 
#                   patch_size=2, 
#                   num_layers=4, 
#                   SEED=147, 
#                   sparse_att=True, 
#                   sparse_mlp=False, 
#                   pos_emb=False,
#                   cuda=0
#                  )

In [24]:
# ### Automate the benchmark
# ###### for c10

# cuda_idx = 0
# seed = 147
# PE = False
# nlayers=2
# dataset='cifar10'
# nlayers = 2
# sparse_mlp = False
# for patch_size in [4, 2, 1]:
#     for _expansion in [64, 256, 1024]:
#         for sparse_attention in [False, True]:
#             expansion_dict[patch_size] = _expansion
#             print(f'''
#                 Experimenting on {dataset} Dataset 
#                 patch:{patch_size},
#                 sparse_att: {sparse_attention},
#                 sparse_mlp: {sparse_mlp},
#                 num_layers: {nlayers},
#                 pos_embed: {PE},
#                 seed: {seed}
#             ''')

#             try:
#                 benchmark_memory(dataset=dataset, 
#                           patch_size=patch_size, 
#                           num_layers=nlayers, 
#                           SEED=seed, 
#                           sparse_att=sparse_attention, sparse_mlp=sparse_mlp, 
#                           pos_emb=PE,
#                           cuda=cuda_idx
#                          )
#             except Exception as e:
#                 print("Cuda out of memory \n !!!!!!!!!!!!!!!!!!! \n")
#                 print(e)

#             torch.cuda.empty_cache()

## Test 1 by 1

- Skip to the next experiment if its result is already in the output file.
- Stop after the first experiment that actually runs.

In [25]:
### Automate the benchmark
###### for c10
### Runs at most ONE new experiment per execution of this cell: configurations
### whose result is already stored are skipped, the first one that actually
### runs (or runs out of GPU memory) ends the loop.

cuda_idx = 0
seed = 147
PE = False
nlayers = 2
dataset='cifar10'
sparse_mlp = False

### Same visiting order as three nested loops: patch size, then hidden width,
### then dense/sparse attention.
configs = [(p, e, s) for p in [4, 2, 1] for e in [64, 256, 1024] for s in [False, True]]

for patch_size, _expansion, sparse_attention in configs:
    ### benchmark_memory reads the hidden width from this global dict
    expansion_dict[patch_size] = _expansion
    print(f'''
                Experimenting on {dataset} Dataset 
                patch:{patch_size},
                sparse_att: {sparse_attention},
                sparse_mlp: {sparse_mlp},
                num_layers: {nlayers},
                pos_embed: {PE},
                seed: {seed}
            ''')
    out = 1 ## dont break, if out is 0(already done) then do next exp, till existing one is found
    try:
        out = benchmark_memory(dataset=dataset,
                  patch_size=patch_size,
                  num_layers=nlayers,
                  SEED=seed,
                  sparse_att=sparse_attention, sparse_mlp=sparse_mlp,
                  pos_emb=PE,
                  cuda=cuda_idx
                 )
    except Exception as e:
        is_oom = isinstance(e, torch.cuda.OutOfMemoryError) or 'out of memory' in str(e).lower()
        if not is_oom:
            ### Not a memory failure: do not store -1 (that would mark the
            ### experiment as done forever); surface the real error instead.
            raise
        print("Cuda out of memory \n !!!!!!!!!!!!!!!!!!! \n")
        print(e)
        out = 1 ## exit by saving

        ### `filename` / `model_name` are globals set by benchmark_memory
        with open(filename, 'r+') as f:
            file_data = json.load(f)
            file_data[f"{model_name}"] = -1
            f.seek(0)
            json.dump(file_data, f, indent = 0)
            f.truncate()
    finally:
        torch.cuda.empty_cache()

    if out == 1:
        break


                Experimenting on cifar10 Dataset 
                patch:4,
                sparse_att: False,
                sparse_mlp: False,
                num_layers: 2,
                pos_embed: False,
                seed: 147
            
64 64
01.3_ViT_train_cifar10_patch4_l2_exp64_att_mlp_s147 already found.. !!

                Experimenting on cifar10 Dataset 
                patch:4,
                sparse_att: True,
                sparse_mlp: False,
                num_layers: 2,
                pos_embed: False,
                seed: 147
            
64 64
01.3_ViT_train_cifar10_patch4_l2_exp64_sAtt_mlp_s147 already found.. !!

                Experimenting on cifar10 Dataset 
                patch:4,
                sparse_att: False,
                sparse_mlp: False,
                num_layers: 2,
                pos_embed: False,
                seed: 147
            
64 256
01.3_ViT_train_cifar10_patch4_l2_exp256_att_mlp_s147 already found.. !!

                



number of params:  27289610
Model Name: 01.3_ViT_train_cifar10_patch1_l2_exp1024_sAtt_mlp_s147


100%|█████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.35it/s]


mem begin: 1  end: 5569


