In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from models import ConvMixer, MlpMixer
from torchvision.datasets import CIFAR10, ImageFolder
from torch.utils.data import DataLoader 
from torchvision import transforms as T

In [None]:
# Report whether a CUDA device is usable and, if so, which one and how much
# memory it currently holds.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    # empty_cache() returns None, so call it for its effect instead of printing it.
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(0))
else:
    print("No CUDA device found; the profiling cells below call .cuda() and will fail.")

In [None]:
# --- Model / batch shape ---
batch_size = 2
hdim = 1024        # embedding width passed to MlpMixer(embed_dim=...)
depth = 32         # number of mixer blocks passed to MlpMixer(num_blocks=...)

# --- Schedule ---
epochs = 1

# --- Augmentation ---
scale = 0.75       # lower bound of the RandomResizedCrop scale range
reprob = 0.25      # RandomErasing probability (only used by the commented-out CIFAR pipeline)
ra_m = 8           # RandAugment magnitude (commented-out CIFAR pipeline)
ra_n = 1           # RandAugment num_ops (commented-out CIFAR pipeline)
jitter = 0.1       # ColorJitter strength (commented-out CIFAR pipeline)

# --- Architecture knobs not referenced by the visible cells ---
psize = 2          # presumably patch size -- TODO confirm against models.py
conv_ks = 5        # presumably ConvMixer kernel size -- TODO confirm against models.py

# --- Optimisation ---
wd = 0.01          # AdamW weight decay
clip_norm = True   # clip the gradient norm to 1.0 before each optimizer step
lr_max = 0.01      # peak learning rate
workers = 2        # DataLoader worker processes

In [None]:
from pathlib import Path
import os
import subprocess

DATA_DIR = 'data/tiny-imagenet-200' # Original images come in shapes of [3,64,64]

# Download and unpack Tiny ImageNet on first use. check=True makes a failed
# download or extraction raise here instead of surfacing later as a confusing
# ImageFolder error (os.system silently ignored non-zero exit codes).
if not Path(DATA_DIR).exists():
    subprocess.run(['wget', 'http://cs231n.stanford.edu/tiny-imagenet-200.zip', '-P', 'data'], check=True)
    subprocess.run(['unzip', '-qq', 'data/tiny-imagenet-200.zip', '-d', 'data'], check=True)

# Define training and validation data paths
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
VALID_DIR = os.path.join(DATA_DIR, 'val')

# Channel-wise normalisation statistics shared by both pipelines.
norm_mean = (0.485, 0.456, 0.406)
norm_std = (0.229, 0.224, 0.225)

# Training pipeline: random scale-jittered crop back to 64x64, then normalise.
traindata = ImageFolder(TRAIN_DIR, transform=T.Compose([
    T.RandomResizedCrop(64, scale=(scale, 1.0)),
    T.ToTensor(),
    T.Normalize(norm_mean, norm_std)
]))

# Validation pipeline: deterministic resize + centre crop, same normalisation.
# NOTE(review): ImageFolder derives labels from sub-directory names; confirm that
# the extracted val/ folder is organised one directory per class, otherwise the
# validation labels will not line up with the training labels.
valdata = ImageFolder(VALID_DIR, transform=T.Compose([
    T.Resize(64),
    T.CenterCrop(64),
    T.ToTensor(),
    T.Normalize(norm_mean, norm_std)
]))

trainloader = DataLoader(traindata, batch_size=batch_size, shuffle=True, num_workers=workers)
valloader = DataLoader(valdata, batch_size=batch_size, shuffle=False, num_workers=workers)

print(len(traindata))
print(len(valdata))

In [None]:
# cifar10_mean = (0.4914, 0.4822, 0.4465)
# cifar10_std = (0.2471, 0.2435, 0.2616)
# train_transform = T.Compose([
#     T.RandomResizedCrop(32, scale=(scale, 1.0), ratio=(1.0, 1.0)),
#     T.RandomHorizontalFlip(p=0.5),
#     T.RandAugment(num_ops=ra_n, magnitude=ra_m),
#     T.ColorJitter(jitter, jitter, jitter),
#     T.ToTensor(),
#     T.Normalize(cifar10_mean, cifar10_std),
#     T.RandomErasing(p=reprob)
# ])

# test_transform = T.Compose([
#     T.ToTensor(),
#     T.Normalize(cifar10_mean, cifar10_std)
# ])
# traindata = CIFAR10(root="data", train=True, download=True, transform=train_transform)
# testdata = CIFAR10(root="data", train=False, download=True, transform=test_transform)
# trainloader = DataLoader(traindata, batch_size=batch_size, shuffle=True, num_workers=workers)
# testloader = DataLoader(testdata, batch_size=batch_size, shuffle=False, num_workers=workers)

In [None]:
def get_stats(model, get_time=True, record_time_len=100, verbose=False):
    """Profile GPU memory and per-step wall time over a few real training steps.

    Runs AdamW training steps on batches drawn from the notebook-level
    ``trainloader`` (using the global ``lr_max``, ``wd`` and ``clip_norm``).
    At step 3 it records the allocated GPU memory before the batch is moved to
    the device, after it is moved, and after the forward pass. From step 4 on
    it records the wall time of ``record_time_len`` consecutive steps.

    Args:
        model: module to profile; it is trained in place and must already live
            on the GPU.
        get_time: when False, stop right after the memory-recording step.
        record_time_len: number of consecutive steps to time.
        verbose: print the parameter count, the memory checkpoints and the
            average step time.

    Returns:
        ``(preload_mem, load_mem, forward_mem, transfered, step_time)`` where
        the three memory figures are in GiB, ``transfered`` is the list of byte
        sizes ``[X, y, loss]`` seen at the recording step, and ``step_time`` is
        the list of per-step durations in seconds.
    """
    opt = optim.AdamW(model.parameters(), lr=lr_max, weight_decay=wd)
    criterion = nn.CrossEntropyLoss()
    preload_mem = 0.
    load_mem = 0.
    forward_mem = 0.
    transfered = []
    step_time = []
    record_mem = 3
    # A range gives O(1) membership tests and copes with record_time_len == 0.
    record_time = range(record_mem + 1, record_mem + 1 + record_time_len)
    # Stop on the last step that still has something to record.
    end_step = max(record_time.stop - 1, record_mem) if get_time else record_mem
    if verbose:
        print(f"batch_size: {batch_size}, hdim: {hdim}, depth: {depth}")
        print(f"Total params: {sum(p.numel() for p in model.parameters()):,}")
        print(f"Total params size in gb: {sum(p.element_size()*p.nelement() for p in model.parameters())/1024**3:.4f}GB")
    model.train()  # nothing in the loop leaves train mode, so set it once
    for i, (X, y) in enumerate(trainloader):
        timed = i in record_time
        if timed:
            # CUDA kernels run asynchronously; drain pending work so the timer
            # covers exactly this step.
            torch.cuda.synchronize()
            start_step = time.perf_counter()
        if i == record_mem: preload_mem = torch.cuda.memory_allocated(0)/1024**3
        X, y = X.cuda(), y.cuda()
        if i == record_mem:
            load_mem = torch.cuda.memory_allocated(0)/1024**3
            transfered.append(X.element_size() * X.nelement())
            transfered.append(y.element_size() * y.nelement())

        opt.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        if i == record_mem:
            forward_mem = torch.cuda.memory_allocated(0)/1024**3
            transfered.append(loss.element_size() * loss.nelement())

        loss.backward()
        if clip_norm:
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        if timed:
            # Wait for the step's kernels to finish before reading the clock.
            torch.cuda.synchronize()
            step_time.append(time.perf_counter() - start_step)
        if verbose and i == record_mem:
            # Report the total bytes involved (sum, as the training loop does),
            # not the mean of three unrelated tensor sizes.
            print(f'preload_mem: {preload_mem:.4f}GB, load_mem: {load_mem:.4f}GB, forward_mem: {forward_mem:.4f}GB, transfered: {np.array(transfered).sum()/1024:.4f}kB')
        if i == end_step: break
    # Printed after the loop so it also appears when the loader runs out early,
    # and is skipped cleanly when no step was timed.
    if verbose and step_time:
        print(f'avg step time: {np.mean(step_time):.4f} +- {np.std(step_time)}s')
    return preload_mem, load_mem, forward_mem, transfered, step_time

# Profile the configured MlpMixer once; verbose=True prints the parameter count,
# the memory checkpoints and the average step time. The result is bound to `_`
# so the returned tuple is not echoed as cell output.
_ = get_stats(MlpMixer(num_blocks=depth, embed_dim=hdim).cuda(), verbose=True)

In [None]:
def get_model_size(mem_limit, hdim=1024):
    """Find the deepest MlpMixer whose forward-pass memory fits in ``mem_limit``.

    The search runs in three phases, measuring each candidate with
    ``get_stats(..., get_time=False)``:

    1. Halve ``hdim`` until a depth-1 model fits.
    2. Double the depth until a model no longer fits.
    3. Bisect between the last fitting and the first non-fitting depth.

    A configuration "fits" when its ``forward_mem`` is not greater than
    ``mem_limit``. The returned depth is always one that was measured and fit.

    Args:
        mem_limit: memory budget, in the same unit ``get_stats`` reports
            ``forward_mem`` in (GiB).
        hdim: starting embedding width; reduced only if depth 1 does not fit.

    Returns:
        ``(depth, hdim)`` of the largest fitting configuration found.

    Raises:
        ValueError: if not even a depth-1 model with ``hdim == 1`` fits.
    """
    def _forward_mem(depth, hdim):
        # The model is local to this helper, so it is released before the next
        # candidate is built.
        model = MlpMixer(num_blocks=depth, embed_dim=hdim).cuda()
        _, _, forward_mem, _, _ = get_stats(model, get_time=False)
        print(f"depth: {depth}, hdim: {hdim}, forward_mem: {forward_mem:.4f}GB")
        return forward_mem

    # Phase 1: shrink the width until the shallowest model fits. Depth stays
    # at 1 here; it must be re-tested after every change of hdim.
    while _forward_mem(1, hdim) > mem_limit:
        if hdim <= 1:
            raise ValueError(f"no MlpMixer configuration fits in mem_limit={mem_limit}")
        hdim //= 2

    # Phase 2: exponential growth. Invariant: `fits` is known to fit.
    fits, too_big = 1, 2
    while _forward_mem(too_big, hdim) <= mem_limit:
        fits, too_big = too_big, too_big * 2

    # Phase 3: bisection. Invariant: `fits` fits and `too_big` does not.
    while too_big - fits > 1:
        mid = (fits + too_big) // 2
        if _forward_mem(mid, hdim) > mem_limit:
            too_big = mid
        else:
            fits = mid
    return fits, hdim

# Search for an MlpMixer (depth, hdim) whose forward-pass memory stays within a
# budget of 1, in the GiB unit that get_stats reports.
get_model_size(1)

In [None]:
# Alternative architecture:
# model = ConvMixer(hdim, depth, kernel_size=9, patch_size=7, n_classes=1000)
model = MlpMixer(num_blocks=depth, embed_dim=hdim).cuda()


def lr_schedule(t):
    """Piecewise-linear learning rate at fractional epoch ``t``.

    Ramps 0 -> lr_max over the first 2/5 of training, decays to lr_max/20 at
    4/5, then to 0 at the end. The breakpoints use true division: with floor
    division they collapse onto 0 for small ``epochs`` (e.g. ``epochs=1``),
    which hands np.interp non-increasing x-coordinates and loses the
    warm-up/peak part of the schedule.
    """
    return np.interp([t], [0, epochs*2/5, epochs*4/5, epochs],
                     [0, lr_max, lr_max/20.0, 0])[0]


opt = optim.AdamW(model.parameters(), lr=lr_max, weight_decay=wd)
criterion = nn.CrossEntropyLoss()
# Only needed if the mixed-precision path in the training loop is re-enabled.
scaler = torch.cuda.amp.GradScaler()

In [None]:
# Train for `epochs` epochs while recording memory checkpoints (at step 3) and
# per-step wall time (steps 4..103) in each epoch, then evaluate on the
# validation split after every epoch.
print(f"batch_size: {batch_size}, hdim: {hdim}, depth: {depth}")
print(f"Total params: {sum(p.numel() for p in model.parameters()):,}")
print(f"Total params size in gb: {sum(p.element_size()*p.nelement() for p in model.parameters())/1024**3:.4f}GB")
preload_mem = 0.
load_mem = 0.
forward_mem = 0.
transfered = 0
step_time = []
record_mem = 3                   # step at which the memory checkpoints are taken
record_time = range(4, 4+100)    # steps whose wall time is recorded
for epoch in range(epochs):
    start = time.time()
    train_loss, train_acc, n = 0, 0, 0
    model.train()  # the evaluation pass below leaves the model in eval mode
    for i, (X, y) in enumerate(trainloader):
        timed = i in record_time
        if timed: start_step = time.time()
        if i == record_mem: preload_mem = torch.cuda.memory_allocated(0)/1024**3
        X, y = X.cuda(), y.cuda()
        if i == record_mem:
            load_mem = torch.cuda.memory_allocated(0)/1024**3
            # Assign rather than accumulate so the figure does not grow with
            # every epoch.
            transfered = X.element_size() * X.nelement()
            transfered += y.element_size() * y.nelement()

        lr = lr_schedule(epoch + (i + 1)/len(trainloader))
        opt.param_groups[0].update(lr=lr)

        opt.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        if i == record_mem:
            forward_mem = torch.cuda.memory_allocated(0)/1024**3
            transfered += loss.element_size() * loss.nelement()

        loss.backward()
        if clip_norm:
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        train_loss += loss.item() * y.size(0)
        train_acc += (output.max(1)[1] == y).sum().item()
        n += y.size(0)
        if timed:
            # CUDA work is asynchronous; make sure the step has finished before
            # reading the clock.
            torch.cuda.synchronize()
            step_time.append(time.time() - start_step)
        if i == record_time[-1]: print(f'avg step time: {np.mean(step_time):.4f} +- {np.std(step_time)}s')

        if i == record_mem:
            print(f'preload_mem: {preload_mem:.4f}GB, load_mem: {load_mem:.4f}GB, forward_mem: {forward_mem:.4f}GB, transfered: {transfered/1024**2:.4f}MB')

    # Evaluate on the validation loader built in the data cell; `testloader`
    # only exists in the commented-out CIFAR cell.
    model.eval()
    test_acc, m = 0, 0
    with torch.no_grad():
        for X, y in valloader:
            X, y = X.cuda(), y.cuda()
            with torch.cuda.amp.autocast():
                output = model(X)
            test_acc += (output.max(1)[1] == y).sum().item()
            m += y.size(0)

    # Label the log line with the class of the model that was actually trained.
    print(f'[{type(model).__name__}] Epoch: {epoch} | Train Loss: {train_loss/n:.4f}, Train Acc: {train_acc/n:.4f}, Test Acc: {test_acc/m:.4f}, Time: {time.time() - start:.1f}, lr: {lr:.6f}')


In [None]:
import timeit
# Estimate the per-byte cost of copying a tensor from CPU to GPU.
# `baseline` times a reduction on data already on the GPU plus the copy of the
# scalar result back to the host; `real` does the same but first uploads the
# tensor. Their difference isolates the host-to-device copy.
N_CALLS = 10000   # calls per timing sample
N_REPEATS = 7     # timing samples per data size
print("Measuring data transfer latency...")
latency = []
for i in range(5,21):
    data_amt = 1 << i
    data = torch.randn(data_amt)
    n_bytes = data_amt * data.element_size()
    # Sizes here are far below 1 GB, so report KiB to keep the figure readable.
    print(f"Data amount: {data_amt} elements = {n_bytes/1024:.3f}KiB = 2^{i}")
    cudata = data.cuda()
    baseline = np.array(timeit.repeat(lambda: cudata.mean().cpu().numpy(), number=N_CALLS, repeat=N_REPEATS))
    real = np.array(timeit.repeat(lambda: data.cuda().mean().cpu().numpy(), number=N_CALLS, repeat=N_REPEATS))
    # Normalise by bytes and by calls: one entry per repeat, in seconds per byte.
    latency.append((real-baseline)/(n_bytes*N_CALLS))
    print(f"Latency: {np.mean(latency[-1]):.4e} ± {np.std(latency[-1]):.4e}s/byte")
    print()

In [None]:
import matplotlib.pyplot as plt
# Plot the mean per-byte transfer latency with a +-1 std band.
nplat = np.array(latency) * 1e9  # seconds/byte -> nanoseconds/byte
# One row per measured size; exponents start at 5 as in the measurement loop.
x = np.arange(5, 5 + len(nplat))
lat_mean = nplat.mean(1)
lat_std = nplat.std(1)
fig, ax = plt.subplots()
ax.plot(x, lat_mean, label="mean")
ax.fill_between(x, lat_mean-lat_std, lat_mean+lat_std, alpha=0.5, label="std")
ax.set_xlabel("Data amount (2^i elements)")
ax.set_ylabel("Latency (ns / byte)")
ax.set_title("CPU to GPU transfer latency")
ax.legend()  # without this the "mean"/"std" labels are never shown
plt.show()