<a href="https://colab.research.google.com/github/tanhao1998/AutoDL-Projects/blob/main/slowfast_stage1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, we demonstrate stage 1 of our imagined slowfast system:
- Different models have different view spaces, and combining these models leads to a larger view space.
- A routing model helps find the most suitable model for processing each image.

### Setup

Needs to be executed once in every VM.

The cell below downloads the code from GitHub and installs the necessary dependencies.

In [1]:
# from google.colab import drive
# drive.mount('/gdrive')
# root = '/gdrive/My Drive/slowfast_system/stage1'
# root = '/gdrive/My Drive/vision_transformer_colab'
# import os
# if not os.path.isdir(root):
#   os.mkdir(root)
# os.chdir(root)
# print(f'\nChanged CWD to "{root}"')

# Directory (relative to the current working directory) that train() joins
# checkpoint file names onto.
# NOTE(review): the Drive-mount code above is commented out, so this path is
# presumably only valid once Google Drive is mounted under ./drive - confirm.
root = 'drive/MyDrive/slowfast_system/stage1'

In [2]:
# Clone repository and pull latest changes.
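# NOTE: never embed a personal access token in a notebook; the token that was
# previously written here must be treated as leaked and revoked.
# ![ -d slowfast_system ] || git clone https://<GITHUB_TOKEN>@github.com/tanhao1998/slowfast_system.git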
# !cd slowfast_system && git pull

In [3]:
!pip install ptflops
!pip install timm



### Imports

In [4]:
import timm
import torch
import torchvision as tv
import numpy as np

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Load dataset

In [6]:
# Dataset and input-pipeline settings.
dataset, num_classes = 'cifar10', 10
batch_size, precrop = 128, 224

# The train and validation pipelines are the same: resize to
# precrop x precrop, convert to a tensor, then normalise each channel with
# mean 0.5 and std 0.5. Two independent Compose objects are built.
train_tx, val_tx = (
    tv.transforms.Compose([
        tv.transforms.Resize((precrop, precrop)),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])
    for _ in range(2)
)

In [7]:
# CIFAR-10 train/test splits stored under ./data (downloaded when missing),
# each wrapped with the corresponding transform pipeline.
trainset = tv.datasets.CIFAR10('./data', transform=train_tx, train=True, download=True)
testset = tv.datasets.CIFAR10('./data', transform=val_tx, train=False, download=True)

# Batched loader over the training split, consumed by train().
# NOTE(review): shuffle=False makes every epoch visit the samples in the same
# order, which is unusual for SGD fine-tuning - confirm this is intentional.
loader_train = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=False,
    num_workers=2, pin_memory=True, drop_last=False)

# loader_valid = torch.utils.data.DataLoader(
#     valid_set, batch_size=batch_size, shuffle=False,
#     num_workers=16, pin_memory=True, drop_last=False)

# order = sorted(range(len(train_set.targets)), key=lambda k: train_set.targets[k])
# order_val = sorted(range(len(valid_set.targets)), key=lambda k: valid_set.targets[k])


Files already downloaded and verified
Files already downloaded and verified


## Boilerplate

In [8]:
from IPython.display import HTML, display

def progress(value, max=100):
    """Build an HTML <progress> bar for display in the notebook.

    Args:
        value: Current progress value; also used as the fallback text inside
            the element.
        max: Value that corresponds to a full bar.

    Returns:
        An IPython ``HTML`` object wrapping the rendered markup.
    """
    # The attributes are separated by whitespace only: a comma after the
    # `max` attribute would be invalid HTML.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

In [9]:
def stairs(s, v, *svs):
    """Piecewise-constant ("staircase") schedule lookup.

    `svs` is a flat sequence of (threshold, value) pairs. Starting from `v`,
    the result switches to a pair's value once `s` has reached that pair's
    threshold; scanning stops at the first threshold `s` has not reached.

    Example: stairs(s, 0.1, 10, 0.01, 20, 0.001) gives 0.1 for s < 10,
    0.01 for 10 <= s < 20 and 0.001 for s >= 20.
    """
    thresholds = svs[::2]
    values = svs[1::2]
    current = v
    # An unpaired trailing threshold is ignored, exactly as zip() would do.
    for idx in range(min(len(thresholds), len(values))):
        if s < thresholds[idx]:
            return current
        current = values[idx]
    return current

def rampup(s, peak_s, peak_lr):
  """Linear warm-up: scale `peak_lr` by s/peak_s while s < peak_s.

  From step `peak_s` onwards `peak_lr` is returned unchanged (whatever its
  value, including None).
  """
  still_warming_up = s < peak_s
  return s/peak_s * peak_lr if still_warming_up else peak_lr

# def schedule(s):
#   step_lr = stairs(s, 3e-3, 200, 3e-4, 300, 3e-5, 400, 3e-6, 500, None)
#   return rampup(s, 100, step_lr)

## Fine-tune

In [10]:
# Turn on cuDNN's benchmark mode.
torch.backends.cudnn.benchmark = True
# Cross-entropy loss shared by the training code, placed on the selected device.
crit = torch.nn.CrossEntropyLoss().to(device)


def init_model(name='vit_tiny_patch16_224'):
  """Create a pretrained timm model and an SGD optimizer for fine-tuning.

  Args:
    name: timm model name passed to `timm.create_model`.

  Returns:
    (model, optim): the model wrapped in `DataParallel`, moved to `device`
    and switched to train mode, plus an SGD optimizer (lr=0.003,
    momentum=0.9) over its parameters with gradients zeroed.
  """
  net = torch.nn.DataParallel(
      timm.create_model(name, pretrained=True, num_classes=num_classes))

  # Note: no weight-decay!
  optimizer = torch.optim.SGD(net.parameters(), lr=0.003, momentum=0.9)
  net = net.to(device)
  optimizer.zero_grad()

  net.train()
  return net, optimizer

In [11]:
def eval_cifar10(model, dataset, bs=100, progressbar=True):
  """Evaluate `model` on `dataset` and report top-1 correctness.

  The model is switched to eval mode and left in that mode; callers that keep
  training must call `model.train()` again.

  Args:
    model: Module mapping a batch of images to per-class logits.
    dataset: Dataset yielding (image, label) pairs.
    bs: Evaluation batch size.
    progressbar: True to create a new progress bar, an existing display
      handle (anything with an `update` method) to reuse it, or False/None
      to run without a progress bar.

  Returns:
    (accuracy, preds): the mean of `preds`, and `preds` itself - one boolean
    per sample telling whether the arg-max prediction matched the label.
  """
  loader = torch.utils.data.DataLoader(dataset, batch_size=bs, shuffle=False, num_workers=2)

  model.eval()

  if progressbar is True:
    progressbar = display(progress(0, len(loader)), display_id=True)
  # Only objects that can actually be updated count as a progress bar, so
  # False/None no longer crash on `.update`.
  has_bar = hasattr(progressbar, 'update')

  preds = []
  with torch.no_grad():
    for i, (x, t) in enumerate(loader):
      x, t = x.to(device), t.numpy()
      logits = model(x)
      _, y = torch.max(logits, 1)
      preds.extend(y.cpu().numpy() == t)
      if has_bar:
        progressbar.update(progress(i+1, len(loader)))

  return np.mean(preds), preds

In [12]:
from os.path import join as pjoin  # pylint: disable=g-importing-member

# Step index at which the schedule yields None instead of a learning rate.
S = 500
def schedule(s):
  """Learning rate for step `s`.

  A staircase (3e-3, then 3e-4 from step 200, 3e-5 from 300, 3e-6 from 400,
  None from step S) combined with a linear warm-up over the first 100 steps.
  """
  return rampup(s, 100, stairs(s, 3e-3, 200, 3e-4, 300, 3e-5, 400, 3e-6, S, None))


def train(model, optim, name="fast"):
  """Fine-tune `model` on `loader_train` with gradient accumulation.

  Gradients are accumulated over `steps_per_iter` mini-batches (an effective
  batch of about 512 samples) before each optimizer step. The learning rate
  is set from `schedule(step)` before every step, and training stops as soon
  as the schedule returns None. Whenever `len(losses)` is a multiple of 25
  the model is evaluated on `testset` and a checkpoint is written to
  `root/<name><step>.pth.tar`.

  Args:
    model: Model to train (left in train mode).
    optim: Optimizer over `model`'s parameters.
    name: Prefix of the checkpoint file names.

  Returns:
    (losses, accus_train, accus_test):
      losses      - one summed loss per accumulation step, followed by a
                    trailing empty list (the accumulator that was still open).
      accus_train - mean training accuracy per accumulation step, followed by
                    a trailing empty list.
      accus_test  - test accuracy of every periodic evaluation.
  """
  import os  # local import: only needed for creating the checkpoint directory

  pb_train = display(progress(0, S), display_id=True)
  pb_test = display(progress(0, 100), display_id=True)
  losses = [[]]
  accus_train = [[]]
  accus_test = []

  # max(1, ...) keeps the divisor valid for batch sizes above 512.
  steps_per_iter = max(1, 512 // loader_train.batch_size)

  finished = False
  while not finished and len(losses) < S:
    for x, t in loader_train:
      x, t = x.to(device), t.to(device)
      logits = model(x)
      # Scale so the accumulated gradient is the mean over the whole step.
      loss = crit(logits, t) / steps_per_iter
      loss.backward()
      losses[-1].append(loss.item())

      with torch.no_grad():
        accus_train[-1].extend(torch.max(logits, dim=1)[1].cpu().numpy() == t.cpu().numpy())

      if len(losses[-1]) == steps_per_iter:
        # Close the current accumulator and open a new one.
        losses[-1] = sum(losses[-1])
        losses.append([])
        accus_train[-1] = np.mean(accus_train[-1])
        accus_train.append([])

        # Update learning-rate according to schedule, and stop if necessary.
        step = len(losses) - 1
        lr = schedule(step)
        if lr is None:
          # End of schedule: discard the gradients that will not be applied
          # and leave both loops instead of passing lr=None to the optimizer.
          optim.zero_grad()
          finished = True
          break
        for param_group in optim.param_groups:
          param_group['lr'] = lr

        optim.step()
        optim.zero_grad()

        pb_train.update(progress(step, S))
        print(f'\r[Step {step}] loss={losses[-2]:.2e} '
              f'train accu={accus_train[-2]:.2%} '
              f'test accu={accus_test[-1] if accus_test else 0:.2%} '
              f'(lr={lr:g})', end='', flush=True)

        if len(losses) % 25 == 0:
          accus_test.append(eval_cifar10(model, testset, progressbar=pb_test)[0])
          model.train()  # eval_cifar10 leaves the model in eval mode

          # Make sure the checkpoint directory exists before saving.
          os.makedirs(root, exist_ok=True)
          savename = pjoin(root, name + str(step) + ".pth.tar")
          torch.save({
                          "step": step,
                          "losses": losses,
                          "model": model.state_dict(),
                          "optim": optim.state_dict(),
                      }, savename)

  return losses, accus_train, accus_test


In [13]:
from ptflops import get_model_complexity_info

# Build the "slow" model (ViT-tiny) together with its optimizer.
s_model, s_optim = init_model('vit_tiny_patch16_224')
# Ask ptflops for the model's compute cost and parameter count, as strings,
# for a (3, 224, 224) input.
macs, params = get_model_complexity_info(s_model, (3, 224, 224), as_strings=True,
                                        print_per_layer_stat=False, verbose=True)
print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
print('{:<30}  {:<8}'.format('Number of parameters: ', params))


# Fine-tune; checkpoint files get the prefix "slow".
# NOTE(review): this unpacking requires train() to return a 3-tuple
# (losses, accus_train, accus_test) - confirm that it does.
losses, accus_train, accus_test = train(s_model, s_optim, name="slow")

# fmodel = timm.create_model('tf_mobilenetv3_small_100', pretrained=True, num_classes=num_classes)
# fmodel = torch.nn.DataParallel(fmodel)


# from pprint import pprint
# model_names = timm.list_models('*vit*')
# pprint(model_names)


Computational complexity:       1.07 GMac
Number of parameters:           5.53 M  


[Step 362] loss=3.15e-02 train accu=99.22% test accu=97.11% (lr=3e-05)

Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: ignored

In [None]:
# Debug cell: push a single training batch through s_model and print the loss.
# NOTE(review): this rebinds the notebook-level names losses / accus_train /
# accus_test, replacing whatever the training cell assigned to them.
losses = [[]]
accus_train = [[]]
accus_test = []

steps_per_iter = 512 // loader_train.batch_size
print('steps_per_iter:', steps_per_iter)

for x, t in loader_train:
  print(x.shape, t.shape)
  x, t = x.to(device), t.to(device)

  logits = s_model(x)
  loss = crit(logits, t) / steps_per_iter
  print(loss)
  # del x, t
  # Stop after the first batch; the two statements below are unreachable.
  break
  loss.backward()
  losses[-1].append(loss.item())

steps_per_iter: 4
torch.Size([128, 3, 224, 224]) torch.Size([128])
tensor(0.6829, device='cuda:0', grad_fn=<DivBackward0>)


In [None]:
# print(loss.item())
# crit(logits, t)
# Debug: record the loss of the single batch processed in the previous cell.
losses[-1].append(loss.item())

In [None]:
# Debug: number of losses collected in the currently open accumulator.
print(len(losses[-1]))

1


In [None]:
def plot_training(losses, accus_train, accus_test, eval_every=25):
  """Plot loss, training accuracy and test accuracy side by side.

  Args:
    losses: Per-step losses; the last entry is dropped before plotting
      (train() leaves an unfinished accumulator there).
    accus_train: Per-step training accuracies; last entry dropped likewise.
    accus_test: Test accuracies, one per periodic evaluation.
    eval_every: Number of training steps between two evaluations; used to
      place the test-accuracy points on the x-axis.
  """
  # Imported here because the notebook's import cell does not provide `plt`.
  import matplotlib.pyplot as plt

  fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4))
  ax1.plot(losses[:-1])
  ax1.set_yscale('log')
  ax1.set_title('loss')
  ax2.plot(accus_train[:-1])
  ax2.set_title('training accuracy')
  # Derive the x positions from the number of evaluations actually recorded,
  # so shorter or interrupted runs can be plotted too.
  eval_steps = np.arange(1, len(accus_test) + 1) * eval_every
  ax3.plot(eval_steps, accus_test)
  ax3.set_title('test accuracy');

In [None]:
# Visualise the training curves held in losses / accus_train / accus_test.
plot_training(losses, accus_train, accus_test)

## Find indices to create a sorted CIFAR10 variant


In [None]:
# Import
import pandas as pd
import plotly.express as px

# Figure: side lengths of the square grid used by the scatter plots below.
x_scale = 50
y_scale = 50

# Sample the dataset
# Every image is centre-cropped to 2x2 so that the whole training split fits
# into a single batch.
preprocess_tiny = tv.transforms.Compose([tv.transforms.CenterCrop((2, 2)), tv.transforms.ToTensor()])
trainset_tiny = tv.datasets.CIFAR10(root='./data', train=True, download=False, transform=preprocess_tiny)
loader = torch.utils.data.DataLoader(trainset_tiny, batch_size=50000, shuffle=False, num_workers=2)
# Use the built-in next(): the DataLoader iterator's `.next()` alias is not
# available in recent PyTorch releases.
images, labels = next(iter(loader))

In [None]:
# For each of the 10 classes, draw 250 distinct sample positions whose label
# equals that class. A seeded generator keeps the selection identical across
# kernel restarts.
rng = np.random.default_rng(0)
indices = {cls: rng.choice(np.where(labels.numpy() == cls)[0], 250, replace=False) for cls in range(10)}

In [None]:
# One [grid row, grid column, class name] record per sampled index, laid out
# class after class on a grid that is y_scale cells tall.
cnt = 0
data = []
for label, picked in indices.items():
  for image in picked:
    grid_i, grid_j = divmod(cnt, y_scale)
    data.append([grid_i, grid_j, trainset_tiny.classes[label]])
    cnt += 1

In [None]:
# Tabulate the grid records and draw them as square markers coloured by class.
df = pd.DataFrame(data, columns=['index i','index j', 'classes'])

fig = px.scatter(
    df,
    x='index i',
    y='index j',
    color='classes',
    range_x=[-0.5, x_scale-0.5],
    range_y=[-0.5, y_scale-0.5],
    title='subtraining set on CIFAR10',
)
fig.update_traces(marker={'size': 6, 'symbol': 'square'})

fig.show()

In [None]:
# Subset of the (full-resolution) training set holding the sampled indices,
# concatenated in the iteration order of `indices`.
train_sort = torch.utils.data.Subset(trainset, indices=[i for v in indices.values() for i in v])
len(train_sort)

## Plot the model

In [None]:
# Per-sample correctness of the fine-tuned model on the class-sorted subset.
# `s_model` is the model created and trained earlier in this notebook; no
# `model` variable exists at this point in a top-to-bottom run.
preds = eval_cifar10(s_model, train_sort)[1]

# One [grid row, grid column, correct?] record per sample. The position comes
# from the sample's own index, so each sample gets its own grid cell.
reg_preds = [[i // y_scale, i % y_scale, pred] for i, pred in enumerate(preds)]


In [None]:
# Tabulate the per-sample prediction records (not the class records in
# `data`) and colour the markers by the column that actually exists.
df = pd.DataFrame(reg_preds, columns=['index i','index j', 'prediction'])


fig =  px.scatter(df, x = 'index i', y = 'index j', color='prediction', range_x=[-0.5, x_scale-0.5],range_y=[-0.5, y_scale-0.5], title='prediction on subtraining set of CIFAR10')
fig.update_traces(marker=dict(size=6, symbol='square'))

fig.show()

In [15]:
!nvidia-smi

Mon Dec 13 23:20:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    62W / 149W |   5976MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Same checkpoint directory value that is assigned near the top of the notebook.
root = 'drive/MyDrive/slowfast_system/stage1'

In [None]:
!ls drive/MyDrive/slowfast_system/

stage1


In [None]:
# Throughput benchmark: time `repetitions` forward passes on the GPU with
# CUDA events and report processed samples per second.
# NOTE(review): `EfficientNet` and `optimal_batch_size` are not defined in any
# visible cell of this notebook; they must be provided before this cell runs.
# NOTE(review): this cell rebinds the notebook-level names `model` and `device`.
model = EfficientNet.from_pretrained('efficientnet-b0')
device = torch.device("cuda")
model.to(device)
dummy_input = torch.randn(optimal_batch_size, 3, 224, 224, dtype=torch.float).to(device)
repetitions = 100
total_time = 0
with torch.no_grad():
  for rep in range(repetitions):
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    starter.record()
    _ = model(dummy_input)
    ender.record()
    # Wait for the GPU to finish so both events have been recorded.
    torch.cuda.synchronize()
    # elapsed_time() reports milliseconds; convert to seconds.
    curr_time = starter.elapsed_time(ender) / 1000
    total_time += curr_time
Throughput = (repetitions * optimal_batch_size) / total_time
print('Final Throughput:', Throughput)