In [1]:
import timm
import torch
import torch.nn as nn

In [2]:
from timm.models.convmixer import ConvMixer

# Small ConvMixer for single-channel inputs and 10 output classes.
model = ConvMixer(
    dim=512,
    depth=6,
    patch_size=1,
    in_chans=1,
    num_classes=10,
)

In [27]:
# Display the module tree of the ConvMixer bound to `model`.
model

ConvMixer(
  (stem): Sequential(
    (0): Conv2d(1, 512, kernel_size=(1, 1), stride=(1, 1))
    (1): GELU(approximate=none)
    (2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): Residual(
        (fn): Sequential(
          (0): Conv2d(512, 512, kernel_size=(9, 9), stride=(1, 1), padding=same, groups=512)
          (1): GELU(approximate=none)
          (2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
      (2): GELU(approximate=none)
      (3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Sequential(
      (0): Residual(
        (fn): Sequential(
          (0): Conv2d(512, 512, kernel_size=(9, 9), stride=(1, 1), padding=same, groups=512)
          (1): GELU(approximate=none)
          (2): BatchNorm2d(512, eps=1e-05, momentum

In [28]:
# Run only the stem on a random batch shaped (8, 1, 28, 28), then check the output shape.
y = model.stem(torch.rand(8,1,28,28))
y.shape

torch.Size([8, 512, 28, 28])

In [29]:
# Pass the stem output through the first mixer block; the saved output shows the shape is unchanged.
model.blocks[0](y).shape

torch.Size([8, 512, 28, 28])

In [79]:
# Replace the classifier head: dropout -> flatten -> bias-free linear (512 -> 10) -> batch norm on the 10 outputs.
model.head = nn.Sequential(
    nn.Dropout(p=0.1),
    nn.Flatten(),
    nn.Linear(in_features=512, out_features=10, bias=False),
    nn.BatchNorm1d(num_features=10),
)

In [80]:
# Display the model again after the head replacement.
# NOTE(review): the saved output shows a 2x2 / stride-2 stem conv, which does not match the
# patch_size=1 model built at the top of the notebook; presumably `model` was rebuilt with
# patch_size=2 in a cell that is no longer present — confirm before relying on this output.
model

ConvMixer(
  (stem): Sequential(
    (0): Conv2d(1, 512, kernel_size=(2, 2), stride=(2, 2))
    (1): GELU(approximate=none)
    (2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): Residual(
        (fn): Sequential(
          (0): Conv2d(512, 512, kernel_size=(9, 9), stride=(1, 1), padding=same, groups=512)
          (1): GELU(approximate=none)
          (2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
      (2): GELU(approximate=none)
      (3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Sequential(
      (0): Residual(
        (fn): Sequential(
          (0): Conv2d(512, 512, kernel_size=(9, 9), stride=(1, 1), padding=same, groups=512)
          (1): GELU(approximate=none)
          (2): BatchNorm2d(512, eps=1e-05, momentum

In [81]:
from timm.models.mlp_mixer import MlpMixer

In [66]:
# MLP-Mixer for 28x28 single-channel inputs, 2x2 patches, 10 output classes.
model = MlpMixer(
    img_size=28,
    num_blocks=6,
    embed_dim=512,
    patch_size=2,
    in_chans=1,
    num_classes=10,
)

In [67]:
# Smoke test: forward a random batch shaped (8, 1, 28, 28).
# NOTE(review): the saved output is all zeros — presumably because the freshly built
# classifier head is zero-initialised; confirm against the timm implementation.
model(torch.rand(8,1,28,28))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], grad_fn=<AddmmBackward0>)

## LSUV

In [88]:
from lsuv import LSUVinit

In [None]:
# Run LSUV initialisation on `model`, driven by one random batch shaped (8, 1, 28, 28).
# NOTE(review): the saved output echoes a ConvMixer, although the nearest preceding cell
# binds `model` to an MlpMixer — the notebook was presumably executed out of order;
# confirm which model this cell is meant to initialise.
LSUVinit(model, torch.rand(8,1,28,28))

Starting LSUV
Total layers to process: 14
(512, 1, 2, 2) (512, 4)
(512, 1, 9, 9) (512, 81)
(512, 512, 1, 1) (512, 512)
(512, 1, 9, 9) (512, 81)
(512, 512, 1, 1) (512, 512)
(512, 1, 9, 9) (512, 81)
(512, 512, 1, 1) (512, 512)
(512, 1, 9, 9) (512, 81)
(512, 512, 1, 1) (512, 512)
(512, 1, 9, 9) (512, 81)
(512, 512, 1, 1) (512, 512)
(512, 1, 9, 9) (512, 81)
(512, 512, 1, 1) (512, 512)
(10, 512) (10, 512)
Orthonorm done
0
std at layer  0  =  0.023811925
std at layer  0  =  0.9999996 mean =  1.9766846e-09
finish at layer 0
1
std at layer  1  =  0.6850876
std at layer  1  =  0.99999994 mean =  -1.140395e-09
finish at layer 1
2
std at layer  2  =  0.25060916
std at layer  2  =  0.9999999 mean =  -5.169791e-09
finish at layer 2
3
std at layer  3  =  1.1467748
std at layer  3  =  1.0000002 mean =  -1.7486057e-09
finish at layer 3
4
std at layer  4  =  0.24164225
std at layer  4  =  1.0 mean =  -3.421185e-09
finish at layer 4
5
std at layer  5  =  1.1498828
std at layer  5  =  0.9999999 mean =  1

ConvMixer(
  (stem): Sequential(
    (0): Conv2d(1, 512, kernel_size=(2, 2), stride=(2, 2))
    (1): GELU(approximate=none)
    (2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): Residual(
        (fn): Sequential(
          (0): Conv2d(512, 512, kernel_size=(9, 9), stride=(1, 1), padding=same, groups=512)
          (1): GELU(approximate=none)
          (2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))
      (2): GELU(approximate=none)
      (3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Sequential(
      (0): Residual(
        (fn): Sequential(
          (0): Conv2d(512, 512, kernel_size=(9, 9), stride=(1, 1), padding=same, groups=512)
          (1): GELU(approximate=none)
          (2): BatchNorm2d(512, eps=1e-05, momentum

In [8]:
# Pixel count of one 28x28 input (the image size used throughout this notebook).
28*28

784

In [9]:
# 784 split four ways gives 196 — presumably how the embed_dim=196 used for the ViT below was chosen (TODO confirm).
784/4

196.0

In [10]:
from timm.models.vision_transformer import _create_vision_transformer

In [18]:
# 196 / 4 = 49 — presumably the per-head width for embed_dim=196 with num_heads=4 (TODO confirm).
196/4

49.0

In [22]:
import torch
# NOTE(review): the saved cell had a second, truncated statement here (`impo`), which raises
# NameError when executed. Restore the intended import if a module turns out to be missing.

In [55]:
# Tiny ViT for 28x28 single-channel inputs: 4x4 patches, width 196, 6 layers, 4 heads.
# NOTE(review): `_create_vision_transformer` is a private timm helper; presumably the variant
# name only selects a default config while `model_kwargs` defines the architecture — confirm.
model_kwargs = {
    'patch_size': 4,
    'embed_dim': 196,
    'depth': 6,
    'num_heads': 4,
    'mlp_ratio': 2,
}
model = _create_vision_transformer(
    'vit_tiny_patch16_224',
    pretrained=False,
    img_size=(28, 28),
    in_chans=1,
    num_classes=10,
    **model_kwargs,
)

In [56]:
# Smoke test: forward a random (8, 1, 28, 28) batch; the saved output is an (8, 10) tensor of logits.
model(torch.rand(8,1,28,28))

tensor([[-0.4554, -0.9780,  0.3091, -0.0304, -0.1631, -0.1910, -0.3284,  0.3185,
         -0.6398,  0.2459],
        [-0.4108, -0.9881,  0.2846, -0.0276, -0.1513, -0.2406, -0.3656,  0.2758,
         -0.5802,  0.2881],
        [-0.4505, -0.9508,  0.2980, -0.0093, -0.1395, -0.1933, -0.3460,  0.2509,
         -0.6060,  0.2479],
        [-0.4234, -0.9500,  0.2942, -0.0135, -0.1061, -0.2226, -0.3869,  0.2177,
         -0.5944,  0.2690],
        [-0.4265, -0.9537,  0.2643, -0.0190, -0.1551, -0.2123, -0.3399,  0.2686,
         -0.5882,  0.2720],
        [-0.4033, -0.9565,  0.3059, -0.0146, -0.1096, -0.2289, -0.3612,  0.2706,
         -0.5907,  0.2718],
        [-0.4280, -0.9611,  0.2979, -0.0407, -0.1418, -0.2003, -0.3567,  0.2698,
         -0.6025,  0.2526],
        [-0.4447, -0.9428,  0.2932, -0.0353, -0.1133, -0.2081, -0.3517,  0.2351,
         -0.5904,  0.2545]], grad_fn=<AddmmBackward0>)

In [30]:
import fastcore.all as fc
from functools import partial

import torch
import torch.nn as nn

def conv(ni, nf, ks=3, stride=2, act=nn.ReLU, norm=None, bias=None):
    """Build a `Conv2d -> [norm] -> [activation]` stack as an `nn.Sequential`.

    Args:
        ni: number of input channels.
        nf: number of output channels.
        ks: square kernel size; padding is `ks//2`.
        stride: convolution stride.
        act: zero-argument factory for the activation module, or None for no activation.
        norm: factory called as `norm(nf)` to build the normalisation layer
            (a class or a `partial`), or None for no normalisation.
        bias: whether the convolution has a bias. When None, the bias is
            dropped only if the normalisation layer is a batch norm.

    Returns:
        `nn.Sequential` holding the conv and the optional norm / activation layers.
    """
    norm_layer = norm(nf) if norm else None
    if bias is None:
        # `norm` is a factory, not a layer, so the batch-norm check has to be made on
        # the layer it builds; testing the factory itself never matches.
        bias = not isinstance(norm_layer, (nn.BatchNorm1d,nn.BatchNorm2d,nn.BatchNorm3d))
    layers = [nn.Conv2d(ni, nf, stride=stride, kernel_size=ks, padding=ks//2, bias=bias)]
    if norm_layer is not None: layers.append(norm_layer)
    if act: layers.append(act())
    return nn.Sequential(*layers)

class GeneralRelu(nn.Module):
    """ReLU variant with an optional leak, downward shift and upper clamp.

    Args:
        leak: negative slope; when None a plain ReLU is used instead of a leaky ReLU.
        sub: value subtracted from the activation output, or None to skip.
        maxv: upper bound applied to the result, or None to skip.
    """
    def __init__(self, leak=None, sub=None, maxv=None):
        super().__init__()
        self.leak,self.sub,self.maxv = leak,sub,maxv

    def forward(self, x):
        """Apply (leaky) ReLU, then the optional shift and clamp; returns a new tensor."""
        # Reach the functional API through `nn`: this notebook never imports an `F` alias.
        if self.leak is not None: x = nn.functional.leaky_relu(x, self.leak)
        else: x = nn.functional.relu(x)
        # Out-of-place ops: editing an activation's output in place can invalidate
        # tensors that autograd saved for the backward pass.
        if self.sub is not None: x = x - self.sub
        if self.maxv is not None: x = x.clamp_max(self.maxv)
        return x

# Default activation factory: GeneralRelu with negative slope 0.1, shifted down by 0.4.
act_gr = partial(GeneralRelu, sub=0.4, leak=0.1)

# %% ../nbs/13_resnet.ipynb 14
def _conv_block(ni, nf, stride, act=act_gr, norm=None, ks=3):
    """Two stacked `conv` layers forming the residual branch of a `ResBlock`.

    The first conv maps `ni -> nf` at stride 1 with activation `act`; the second
    maps `nf -> nf` at `stride` with no activation (the block activates after the sum).
    """
    conv2 = conv(nf, nf, stride=stride, act=None, norm=norm, ks=ks)
    # Zero the scale of the last norm layer so the residual branch starts out contributing
    # (close to) nothing. `nn.init` is used because no bare `init` name is imported here,
    # and norm layers without a learnable weight are skipped.
    if norm and getattr(conv2[1], 'weight', None) is not None:
        nn.init.constant_(conv2[1].weight, 0.)
    return nn.Sequential(conv(ni, nf, stride=1, act=act, norm=norm, ks=ks), conv2)

class ResBlock(nn.Module):
    """Residual block: `act(convs(x) + idconv(pool(x)))`.

    Args:
        ni: input channels.
        nf: output channels.
        stride: stride of the residual branch; the shortcut is average-pooled when it is not 1.
        ks: kernel size of the residual-branch convolutions.
        act: zero-argument factory for the activation module.
        norm: normalisation-layer factory passed to the residual-branch convolutions, or None.
    """
    def __init__(self, ni, nf, stride=1, ks=3, act=act_gr, norm=None):
        super().__init__()
        # `norm` must be forwarded here; otherwise the argument is silently ignored and
        # the block is built without any normalisation.
        self.convs = _conv_block(ni, nf, stride, act=act, norm=norm, ks=ks)
        # Shortcut path: a 1x1 conv only when the channel count changes.
        self.idconv = fc.noop if ni==nf else conv(ni, nf, ks=1, stride=1, act=None)
        # Downsample the shortcut whenever the residual branch is strided.
        self.pool = fc.noop if stride==1 else nn.AvgPool2d(2, ceil_mode=True)
        self.act = act()

    def forward(self, x): return self.act(self.convs(x) + self.idconv(self.pool(x)))


def get_model(act=nn.ReLU, nfs=(16,32,64,128,256,512), norm=nn.BatchNorm2d):
    """Build a small ResNet classifier for single-channel images with 10 output classes.

    Args:
        act: zero-argument factory for the activation used in every `ResBlock`.
        nfs: channel widths; the stem outputs `nfs[0]` and each later pair
            `(nfs[i], nfs[i+1])` becomes one stride-2 `ResBlock`.
        norm: normalisation-layer factory passed to every `ResBlock`.

    Returns:
        `nn.Sequential` of the blocks followed by flatten, a bias-free linear layer and batch norm.
    """
    # Stem width follows `nfs[0]` so that non-default `nfs` still line up with the next block.
    layers = [ResBlock(1, nfs[0], ks=5, stride=1, act=act, norm=norm)]
    layers += [ResBlock(nfs[i], nfs[i+1], act=act, norm=norm, stride=2) for i in range(len(nfs)-1)]
    # The linear layer takes `nfs[-1]` features, so the feature map must be 1x1 by this point
    # (this holds for 28x28 inputs with the default five stride-2 stages).
    layers += [nn.Flatten(), nn.Linear(nfs[-1], 10, bias=False), nn.BatchNorm1d(10)]
    return nn.Sequential(*layers)

get_model()

## DataLoader

In [1]:
import torch

In [12]:
# Synthetic stand-in for a 1-minute video sampled at 1 frame/s:
# 60 random frames shaped (3, 256, 256) and one random scalar label per frame.
video = torch.rand(60, 3, 256, 256)
labels = torch.rand(60)

If you want to feed 5 seconds of video (5 frames at 1 frame/s) to the model at a time, you need a dataset that slices the video into windows like this:

In [16]:
class VideoDataset:
    """Sliding-window dataset over a sequence of frames.

    Item `i` is the pair `(video[i:i+k], labels[i+k])` with `k = frames_to_slice`:
    a clip of `k` consecutive frames and the label stored at the index right after the clip.

    Args:
        video: indexable, sliceable sequence of frames (e.g. a tensor with frames on dim 0).
        labels: indexable sequence of per-frame labels.
        frames_to_slice: number of frames in each clip.
    """
    def __init__(self, video, labels, frames_to_slice=5):
        self.video = video
        self.labels = labels
        self.frames_to_slice = frames_to_slice

    def __getitem__(self, idx):
        """Return `(clip, label)` for window `idx`; negative indices count from the end."""
        n = len(self)
        # Normalise negative indices explicitly: slicing with a raw negative start
        # would silently yield an empty or misaligned clip.
        if idx < 0: idx += n
        if not 0 <= idx < n:
            raise IndexError(f"index out of range for VideoDataset of length {n}")
        x = self.video[idx:idx+self.frames_to_slice]
        y = self.labels[idx+self.frames_to_slice]
        return x, y

    def __len__(self):
        # A window needs `frames_to_slice` frames plus the following label index, so the last
        # `frames_to_slice` start positions are unusable. Clamp at 0 so that a video shorter
        # than one window reports an empty dataset instead of making `len()` raise.
        return max(0, len(self.video) - self.frames_to_slice)

In [17]:
# Wrap the synthetic video and labels using the default window of 5 frames.
ds = VideoDataset(video, labels)

In [19]:
# First sample: a 5-frame clip and its scalar label (shapes shown in the saved output).
x,y = ds[0]
x.shape, y.shape

(torch.Size([5, 3, 256, 256]), torch.Size([]))

In [22]:
# Batch the clips three at a time; all other DataLoader options are left at their defaults.
dl = torch.utils.data.DataLoader(ds, batch_size=3)

In [23]:
# Pull the first batch from the loader.
x,y = next(iter(dl))

In [25]:
# Batching adds a leading dimension of 3: clips are (3, 5, 3, 256, 256) and labels (3,) in the saved output.
x.shape, y.shape

(torch.Size([3, 5, 3, 256, 256]), torch.Size([3]))