In [1]:
from fastai.vision import *

In [2]:
# Fetch the Imagenette (160px) dataset through fastai's `untar_data`; DATA is the
# dataset root folder that the image list is built from in the next cell.
DATA = untar_data(URLs.IMAGENETTE_160)

In [3]:
# Data pipeline: read images under DATA, randomly subsample them with
# `filter_by_rand(0.3, seed=42)` (fixed seed so the subset is repeatable), use the
# 'val' folder as the validation split and take labels from folder names.
# The transform tuple gives a random horizontal flip (p=0.5) in the first list and
# nothing in the second — presumably (train, valid) transforms; confirm against fastai docs.
# All images are sized to 160.
src = (ImageList.from_folder(DATA).filter_by_rand(0.3, seed=42)
                .split_by_folder(valid='val')
                .label_from_folder()
                .transform(([flip_lr(p=0.5)], []), size=160))
# Batch size 64, 6 loader workers, normalised with `imagenet_stats`.
data = (src.databunch(bs=64, num_workers=6)
           .normalize(imagenet_stats))
# Display the databunch summary (item counts, image sizes, classes).
data

ImageDataBunch;

Train: LabelList (3798 items)
x: ImageList
Image (3, 160, 160),Image (3, 160, 160),Image (3, 160, 160),Image (3, 160, 160),Image (3, 160, 160)
y: CategoryList
n03028079,n03028079,n03028079,n03028079,n03028079
Path: /home/user/.fastai/data/imagenette-160;

Valid: LabelList (161 items)
x: ImageList
Image (3, 160, 160),Image (3, 160, 160),Image (3, 160, 160),Image (3, 160, 160),Image (3, 160, 160)
y: CategoryList
n03028079,n03028079,n03028079,n03028079,n03028079
Path: /home/user/.fastai/data/imagenette-160;

Test: None

In [4]:
from fastai import layers

In [5]:
def conv_layer(ni:int, nf:int, ks:int=3, stride:int=1, padding:int=None, bias:bool=None, is_1d:bool=False,
               norm_type:Optional[NormType]=NormType.Batch,  use_activ:bool=True, activ_fn:Callable=None, leaky:float=None,
               transpose:bool=False, init:Callable=nn.init.kaiming_normal_, self_attention:bool=False):
    """Create a convolution block (`ni` to `nf` channels) as an `nn.Sequential`.

    The block is assembled in this order: convolution, activation (if `use_activ`),
    batchnorm (if `norm_type` is `NormType.Batch` or `NormType.BatchZero`) and
    `SelfAttention` (if `self_attention`).

    `activ_fn` is a zero-argument factory that is called to build the activation module;
    when it is None a `relu(inplace=True, leaky=leaky)` factory is used, so `leaky` only
    matters for that default. `padding` defaults to `(ks-1)//2` (0 for transposed convs)
    and `bias` defaults to True only when no batchnorm layer follows.
    """
    make_activ = ifnone(activ_fn, partial(relu, inplace=True, leaky=leaky))

    if padding is None:
        padding = 0 if transpose else (ks-1)//2

    uses_bn = norm_type in (NormType.Batch, NormType.BatchZero)
    if bias is None:
        bias = not uses_bn

    # Transposed convs take priority over the 1d switch.
    if transpose: conv_cls = nn.ConvTranspose2d
    elif is_1d:   conv_cls = nn.Conv1d
    else:         conv_cls = nn.Conv2d

    conv = conv_cls(ni, nf, kernel_size=ks, bias=bias, stride=stride, padding=padding)
    conv = init_default(conv, init)

    # Weight/spectral normalisation wrap the conv itself rather than adding a layer.
    if norm_type == NormType.Weight:
        conv = weight_norm(conv)
    elif norm_type == NormType.Spectral:
        conv = spectral_norm(conv)

    # Modules are created in the same order they appear in the block.
    modules = [conv]
    if use_activ:
        modules.append(make_activ())
    if uses_bn:
        bn_cls = nn.BatchNorm1d if is_1d else nn.BatchNorm2d
        modules.append(bn_cls(nf))
    if self_attention:
        modules.append(SelfAttention(nf))
    return nn.Sequential(*modules)

In [6]:
def simple_cnn(data, actns:Collection[int], kernel_szs:Collection[int]=None,
               strides:Collection[int]=None, bn=False, activ_fn=None,
               lin_ftrs:Optional[Collection[int]]=None, ps:Floats=0.5,
               concat_pool:bool=True, bn_final:bool=False) -> nn.Sequential:
    """Stack of `conv_layer`s followed by a `create_head` classifier with `data.c` outputs.

    Consecutive entries of `actns` give the in/out channels of each conv layer;
    `kernel_szs` defaults to 3 and `strides` to 2 for every layer. With `bn`, batchnorm
    is used on every conv layer except the last. `activ_fn` is forwarded to `conv_layer`;
    `lin_ftrs`, `ps`, `concat_pool` and `bn_final` are forwarded to `create_head`.
    """
    n_convs = len(actns) - 1
    kernel_szs = ifnone(kernel_szs, [3]*n_convs)
    strides = ifnone(strides, [2]*n_convs)

    final_idx = len(strides) - 1
    convs = []
    for i in range_of(strides):
        # The last conv layer never gets batchnorm, even when `bn` is set.
        norm = NormType.Batch if bn and i<final_idx else None
        convs.append(conv_layer(actns[i], actns[i+1], kernel_szs[i], stride=strides[i],
                                norm_type=norm, activ_fn=activ_fn))

    # The head's input width is doubled when concat pooling is requested.
    pool_mult = 2 if concat_pool else 1
    nf_head = actns[-1] * pool_mult
    head = create_head(nf_head, data.c, lin_ftrs=lin_ftrs, ps=ps, concat_pool=concat_pool, bn_final=bn_final)
    return nn.Sequential(*convs, head)

In [7]:
# Channel counts for the conv stack: each consecutive pair (actns[i], actns[i+1])
# defines one conv layer, so there are len(actns)-1 layers in total.
actns = [3,64,64,128,128,256,256,512,512]
# Alternate stride 1 / stride 2 with exactly one stride per conv layer. Slicing to
# len(actns)-1 keeps the two lists in sync for any `actns`; the previous
# `[1,2]*(len(actns)//2)` only had the right length when len(actns) was odd.
strides = ([1,2]*len(actns))[:len(actns)-1]

# ReLU

In [8]:
# Baseline model: no `activ_fn` is passed, so the conv layers use the default
# activation (shown as ReLU(inplace=True) in the printed model).
mdl_relu = simple_cnn(data, actns=actns, strides=strides)
mdl_relu

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (1): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (3): Sequential(
    (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (4): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (5): Sequential(
    (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (6): Sequential(
    (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
  )
  (7): Sequential(
    (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  

In [9]:
# Wrap the data and the ReLU model in a Learner, reporting accuracy and top-k accuracy.
lrn = Learner(data, mdl_relu, metrics=[accuracy,top_k_accuracy])

In [10]:
# Train the ReLU baseline for 5 epochs with `fit_one_cycle`, passing 1e-3 as the learning rate.
lrn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.129265,2.851966,0.217391,0.614907,00:13
1,1.975083,2.081112,0.267081,0.73913,00:13
2,1.831574,1.718789,0.403727,0.84472,00:13
3,1.639187,1.452828,0.496894,0.888199,00:13
4,1.453105,1.301081,0.546584,0.919255,00:13


In [11]:
# Tear down the learner before the next experiment; it is unusable afterwards
# (see the printed message). Presumably done to release memory — confirm.
lrn.destroy()

this Learner object self-destroyed - it still exists, but no longer usable


# Mish

In [12]:
class Mish(nn.Module):
    "Mish activation implemented with stock PyTorch ops: `x * tanh(softplus(x))`."
    def forward(self, x):
        # softplus(x) is squashed through tanh and used as a gate on the input itself.
        soft = F.softplus(x)
        gate = torch.tanh(soft)
        return x * gate

In [13]:
# Same architecture as the ReLU baseline, but every conv layer's activation is the
# pure-PyTorch `Mish` module defined above.
mdl_mish = simple_cnn(data, actns=actns, strides=strides, activ_fn=Mish)
mdl_mish

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Mish()
  )
  (1): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): Mish()
  )
  (2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Mish()
  )
  (3): Sequential(
    (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): Mish()
  )
  (4): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Mish()
  )
  (5): Sequential(
    (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): Mish()
  )
  (6): Sequential(
    (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): Mish()
  )
  (7): Sequential(
    (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): Mish()
  )
  (8): Sequential(
    (0): AdaptiveConcatPool2d(
      (ap): Adap

In [14]:
# Fresh Learner for the Mish model, with the same metrics as the ReLU run.
lrn = Learner(data, mdl_mish, metrics=[accuracy,top_k_accuracy])

In [15]:
# Same schedule as the ReLU run (5 epochs, 1e-3) so the results are comparable.
lrn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.084685,2.054669,0.291925,0.782609,00:15
1,1.830139,1.782138,0.397516,0.838509,00:15
2,1.590294,1.337767,0.540373,0.89441,00:15
3,1.334887,1.116282,0.639752,0.913043,00:15
4,1.105988,0.97204,0.695652,0.944099,00:15


In [16]:
# Tear down the Mish learner before the MishCuda experiment; it is unusable afterwards
# (see the printed message). Presumably done to release memory — confirm.
lrn.destroy()

this Learner object self-destroyed - it still exists, but no longer usable


## Mish CUDA

In [17]:
from mish_cuda import MishCuda

In [18]:
# Same architecture again, now with `MishCuda` from the `mish_cuda` package as activation.
# NOTE(review): this rebinds `mdl_mish`, the name already used for the pure-PyTorch
# Mish model; a distinct name would make the two experiments easier to tell apart.
mdl_mish = simple_cnn(data, actns=actns, strides=strides, activ_fn=MishCuda)
mdl_mish

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): MishCuda()
  )
  (1): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): MishCuda()
  )
  (2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): MishCuda()
  )
  (3): Sequential(
    (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): MishCuda()
  )
  (4): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): MishCuda()
  )
  (5): Sequential(
    (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): MishCuda()
  )
  (6): Sequential(
    (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): MishCuda()
  )
  (7): Sequential(
    (0): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): MishCuda()
  )
  (8): Sequential(
    (0): Adapti

In [19]:
# Fresh Learner for the MishCuda model (`mdl_mish` was rebound to it in the previous cell).
lrn = Learner(data, mdl_mish, metrics=[accuracy,top_k_accuracy])

In [20]:
# Same schedule as the two previous runs (5 epochs, 1e-3) so the results are comparable.
lrn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.086314,2.35623,0.21118,0.73913,00:13
1,1.844088,2.806577,0.267081,0.677019,00:13
2,1.59321,1.359999,0.552795,0.919255,00:13
3,1.32984,1.198766,0.608696,0.944099,00:13
4,1.118359,0.987282,0.670807,0.950311,00:13


In [21]:
# Tear down the MishCuda learner; it is unusable afterwards (see the printed message).
# Presumably done to release memory — confirm.
lrn.destroy()

this Learner object self-destroyed - it still exists, but no longer usable


## Stability

In [22]:
from mish_cuda import mish_backward
def stress_bwd():
    """Stress-test `mish_backward` on random inputs and look for non-finite gradients.

    Runs 1000 rounds. Each round draws a (1000, 100) input of standard-normal noise
    shifted by a per-row integer offset in [-1000, 1000), calls `mish_backward` with an
    all-ones upstream gradient, and counts inf/NaN entries in the result.

    Returns `(inp, grad_out, y)` for the first round that produced a non-finite value,
    or None if every round was clean.
    """
    grad_out = torch.ones(1000,100)
    for _ in progress_bar(range(1000)):
        inp = torch.randn(1000,100) + torch.randint(-1000, 1000, (1000,1)).float()
        y = mish_backward(inp, grad_out)
        # NaN never compares equal to anything (itself included), so an `==` test can
        # never find it; use the dedicated predicates. `isinf` also catches -inf.
        n_inf = int(torch.isinf(y).sum())
        n_nan = int(torch.isnan(y).sum())
        if n_inf > 0 or n_nan > 0:
            print(F"Found non-finite: {n_inf} inf; {n_nan} nan")
            return (inp,grad_out,y)
    print("No non-finites found")
    return None

In [23]:
# Run the stress test; `bad` is None when nothing was flagged, otherwise the
# (inp, grad_out, y) tuple of the offending round for inspection.
bad = stress_bwd()

No non-finites found
