In [2]:
# Upgrade fastcore and fastai inside the *kernel's* environment. `%pip` targets
# the interpreter running this notebook, whereas `!pip` runs whichever pip is
# first on the shell PATH.
# NOTE(review): versions are unpinned, so the timings below are not tied to a
# specific fastai/fastcore release -- pin them if reproducibility matters.
%pip install fastcore --upgrade -qq
%pip install fastai --upgrade -qq

[K     |████████████████████████████████| 61kB 6.9MB/s 
[K     |████████████████████████████████| 194kB 13.9MB/s 
[?25h

In [1]:
from fastai.vision.all import *
import fastai
from sys import exit
from operator import itemgetter
import re
import torch
from torch.nn import functional as F
import numpy as np
from time import process_time_ns, process_time
import gc

In [2]:
def scale(val, spec="#0.4G"):
    """Format `val` with an SI prefix (e.g. 0.02 -> "20.00m", 1500 -> "1.500k").

    Args:
        val: Real scalar (Python or NumPy) to format.
        spec: Format spec applied to the rescaled mantissa.

    Returns:
        The mantissa formatted with `spec`, immediately followed by a one
        character SI prefix. Values in [1, 1000) get a single space as prefix.

    Zero and non-finite values are formatted unscaled. Magnitudes outside the
    y..Y range are clamped to the nearest available prefix instead of raising
    or silently wrapping around the prefix table.
    """
    PREFIXES = u"yzafpnµm kMGTPEZY"
    mid = len(PREFIXES) // 2  # index of the "no prefix" entry (the space)
    val = float(val)  # work on a copy; never rescale the caller's object in place
    if val == 0 or not np.isfinite(val):
        # log10 is undefined/infinite here, so leave the value unscaled.
        exp = 0
    else:
        # The exponent depends on the magnitude only: the sign of `val` must
        # not flip it (-1500 is -1.5k, not -1.5e6 m).
        exp = int(np.floor(np.log10(abs(val)) / 3)) * 3
        exp = max(-3 * mid, min(3 * mid, exp))
    scaled = val / 10.**exp
    prefix = PREFIXES[exp // 3 + mid]
    return f"{scaled:{spec}}{prefix}"

def display_times(times):
    """Summarize an array of timings as "mean ± std, min, max".

    Each statistic is rendered with `scale` and suffixed with "s".
    """
    stats = (times.mean(), times.std(), times.min(), times.max())
    mean_txt, std_txt, min_txt, max_txt = [scale(stat) for stat in stats]
    return f"{mean_txt}s ± {std_txt}s, {min_txt}s, {max_txt}s"

def profile_cpu(func, inp, n_repeat=100, warmup=10):
    """Time the forward and backward pass of `func` using process CPU time.

    Args:
        func: Callable taking a tensor and returning a tensor.
        inp: Input tensor. It is never modified; every iteration runs on a
            fresh detached copy that tracks gradients.
        n_repeat: Number of timed iterations.
        warmup: Number of untimed iterations run first.

    Returns:
        Tuple `(fwd_times, bwd_times)` of NumPy arrays with `n_repeat` entries
        each, in seconds as reported by `time.process_time` (CPU time of the
        process, not wall-clock time).
    """
    fwd_times, bwd_times = [], []
    for i in range(n_repeat + warmup):
        # Detach before cloning so each iteration gets an independent leaf.
        # Cloning a tensor that already requires grad would chain every new
        # input to all previous clones and grow the autograd graph.
        x = inp.detach().clone().requires_grad_()
        start = process_time()
        y = func(x)
        end = process_time()
        if i >= warmup: fwd_times.append(end - start)
        # Reduce to a scalar outside the timed region; backward reuses the
        # forward result that was just timed.
        loss = y.mean()
        start = process_time()
        _ = torch.autograd.grad(loss, x)
        end = process_time()
        if i >= warmup: bwd_times.append(end - start)
    return (np.array(fwd_times),  # Elapsed time is in seconds
            np.array(bwd_times))
    
def profile_cuda(func, inp, n_repeat=100, warmup=10):
    """Time the forward and backward pass of `func` with CUDA events.

    Args:
        func: Callable taking a tensor and returning a tensor.
        inp: Input tensor. It is never modified; every iteration runs on a
            fresh detached copy that tracks gradients.
        n_repeat: Number of timed iterations.
        warmup: Number of untimed iterations run first.

    Returns:
        Tuple `(fwd_times, bwd_times)` of NumPy arrays with `n_repeat` entries
        each, converted from the milliseconds reported by
        `Event.elapsed_time` to seconds.
    """
    fwd_times, bwd_times = [], []
    for i in range(n_repeat + warmup):
        # Detach before cloning so each iteration gets an independent leaf.
        # Cloning a tensor that already requires grad would chain every new
        # input to all previous clones and grow the autograd graph.
        x = inp.detach().clone().requires_grad_()
        start, end = (torch.cuda.Event(enable_timing=True) for _ in range(2))
        start.record()
        y = func(x)
        end.record()
        # Wait for the recorded events to complete before reading the timer.
        torch.cuda.synchronize()
        if i >= warmup: fwd_times.append(start.elapsed_time(end))
        # Reduce to a scalar outside the timed region; backward reuses the
        # forward result that was just timed.
        loss = y.mean()
        start, end = (torch.cuda.Event(enable_timing=True) for _ in range(2))
        start.record()
        _ = torch.autograd.grad(loss, x)
        end.record()
        torch.cuda.synchronize()
        if i >= warmup: bwd_times.append(start.elapsed_time(end))
    return (np.array(fwd_times)/1000,  # Elapsed time is in ms
            np.array(bwd_times)/1000)

def mish_pt(x):
    """Naive Mish built from stock PyTorch ops: x * tanh(softplus(x))."""
    gate = torch.tanh(F.softplus(x))
    return x.mul(gate)

def profile(device='cuda', n_repeat=100, warmup=10, size='(16,10,256,256)', baseline=True, types='all'):
    """Benchmark a fixed set of activation functions and print their timings.

    Args:
        device: 'cuda' or 'cpu'; selects `profile_cuda` or `profile_cpu`.
        n_repeat: Number of timed iterations per function.
        warmup: Number of untimed iterations per function.
        size: Input shape written as a string, e.g. "(16,10,256,256)" or
            "[1,2,3]". Spaces are ignored.
        baseline: Unused; kept so existing calls keep working.
        types: 'all' (float16, bfloat16, float32, float64) or the name of a
            single torch dtype such as 'float32'.

    Raises:
        SystemExit: If `types` does not name a torch dtype, `device` is not
            'cuda' or 'cpu', or `size` is malformed.

    For every dtype and function, one "<name>_fwd:" and one "<name>_bwd:" line
    is printed in the format produced by `display_times`.
    """
    if types == 'all':
        dtypes = [torch.float16, torch.bfloat16, torch.float32, torch.float64]
    else:
        # hasattr() alone would accept any torch attribute (e.g. 'nn').
        dtype = getattr(torch, types, None)
        if not isinstance(dtype, torch.dtype):
            exit(f"Invalid data type, expected torch type or 'all', got {types}")
        dtypes = [dtype]
    dev = torch.device(type=device)
    if device not in ('cuda', 'cpu'):
        # Only these two have a profiler; anything else would otherwise reach
        # the print statements with no timings defined.
        exit(f"Unsupported device, expected 'cuda' or 'cpu', got {device}")
    sz_str = size.replace(' ', '')
    # fullmatch: trailing junk such as "(1,2)x" must be rejected here rather
    # than blow up in int() below.
    if not re.fullmatch(r"[\(\[]\d+(,\d+)*[\)\]]", sz_str):
        exit("Badly formatted size, should be a list or tuple such as \"(1,2,3)\".")
    sz = list(map(int, sz_str[1:-1].split(',')))
    print(f"Profiling over {n_repeat} runs after {warmup} warmup runs.")

    # The function table does not depend on the dtype, so build it once.
    funcs = dict(relu = torch.nn.functional.relu,
                 leaky_relu = torch.nn.functional.leaky_relu,
                 softplus = torch.nn.functional.softplus,
                 silu_jit = fastai.layers.swish,
                 silu_native = torch.nn.functional.silu,
                 mish_naive = mish_pt,
                 mish_jit = fastai.layers.mish,
                 mish_native = torch.nn.functional.mish)
    if device == 'cuda':
        # MishCudaFunction is not defined or imported in this notebook; use it
        # only if the user brought it into the global namespace.
        mish_cuda_fn = globals().get('MishCudaFunction')
        if mish_cuda_fn is not None:
            funcs['mish_cuda'] = mish_cuda_fn.apply
        else:
            print("MishCudaFunction is not defined, skipping mish_cuda.")
    max_name = max(map(len, funcs.keys())) + 6

    ind = ' ' if len(dtypes) > 1 else ''
    for dtype in dtypes:
        if len(dtypes) > 1:
            print(f"Testing on {dtype}:")
        inp = torch.randn(*sz, dtype=dtype, device=dev)
        for (name, func) in funcs.items():
            if name == 'mish_cuda' and dtype == torch.bfloat16:
                # This combination is not benchmarked. Skip it entirely so the
                # previous function's timings are not re-printed under this name.
                continue
            if device == 'cuda':
                fwd_times, bwd_times = profile_cuda(func, inp, n_repeat, warmup)
                torch.cuda.empty_cache()
            else:
                fwd_times, bwd_times = profile_cpu(func, inp, n_repeat, warmup)
                gc.collect()
            print(ind+(name+'_fwd:').ljust(max_name) + display_times(fwd_times))
            print(ind+(name+'_bwd:').ljust(max_name) + display_times(bwd_times))

# Haswell Benchmark

In [None]:
# Print the host's CPU description so the timings that follow can be attributed to it.
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2299.998
cache size	: 46080 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs
bogomips	: 4599.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 bits virtual
power management:

processor	:

In [None]:
# float32 on CPU; `size` is left at profile()'s default of (16,10,256,256).
profile(device='cpu', types='float32')

Profiling over 100 runs after 10 warmup runs.
relu_fwd:        19.95ms ± 618.7µs, 18.61ms, 23.66ms
relu_bwd:        44.59ms ± 1.026ms, 42.70ms, 49.14ms
leaky_relu_fwd:  20.60ms ± 850.2µs, 17.80ms, 25.07ms
leaky_relu_bwd:  45.02ms ± 1.071ms, 42.83ms, 48.56ms
softplus_fwd:    61.69ms ± 1.328ms, 59.94ms, 65.84ms
softplus_bwd:    49.82ms ± 1.341ms, 48.06ms, 56.02ms
silu_jit_fwd:    47.55ms ± 2.001ms, 44.50ms, 53.54ms
silu_jit_bwd:    168.6ms ± 2.993ms, 162.4ms, 181.5ms
silu_native_fwd: 24.50ms ± 1.224ms, 22.62ms, 30.06ms
silu_native_bwd: 50.89ms ± 1.382ms, 48.49ms, 58.31ms
mish_naive_fwd:  123.6ms ± 2.575ms, 120.1ms, 132.6ms
mish_naive_bwd:  143.0ms ± 2.669ms, 138.1ms, 154.2ms
mish_jit_fwd:    123.0ms ± 2.387ms, 120.1ms, 132.0ms
mish_jit_bwd:    292.3ms ± 3.622ms, 284.6ms, 301.6ms
mish_native_fwd: 144.5ms ± 1.294ms, 142.3ms, 149.8ms
mish_native_bwd: 183.0ms ± 2.418ms, 178.8ms, 192.1ms


In [None]:
# float32 on CPU with a batch dimension of 64 instead of the default 16.
profile(device='cpu', types='float32', size='(64,10,256,256)')

Profiling over 100 runs after 10 warmup runs.
relu_fwd:        78.93ms ± 1.870ms, 75.26ms, 88.00ms
relu_bwd:        175.0ms ± 2.626ms, 170.0ms, 184.1ms
leaky_relu_fwd:  81.81ms ± 2.367ms, 75.91ms, 90.04ms
leaky_relu_bwd:  176.3ms ± 3.326ms, 166.3ms, 186.4ms
softplus_fwd:    243.1ms ± 3.984ms, 234.5ms, 253.9ms
softplus_bwd:    196.2ms ± 3.845ms, 184.3ms, 207.4ms
silu_jit_fwd:    190.5ms ± 4.441ms, 181.1ms, 201.5ms
silu_jit_bwd:    681.8ms ± 15.51ms, 637.7ms, 791.3ms
silu_native_fwd: 100.7ms ± 3.996ms, 92.33ms, 109.7ms
silu_native_bwd: 200.6ms ± 4.758ms, 187.2ms, 213.4ms
mish_naive_fwd:  495.6ms ± 7.702ms, 476.5ms, 517.5ms
mish_naive_bwd:  576.7ms ± 8.659ms, 555.7ms, 600.0ms
mish_jit_fwd:    495.2ms ± 8.542ms, 477.5ms, 519.2ms
mish_jit_bwd:    1.175 s ± 12.98ms, 1.145 s, 1.215 s
mish_native_fwd: 581.8ms ± 8.295ms, 566.9ms, 600.7ms
mish_native_bwd: 738.3ms ± 10.09ms, 720.1ms, 764.4ms


# Broadwell Benchmark

In [1]:
# Print the host's CPU description so the timings that follow can be attributed to it.
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 b

In [3]:
# float32 on CPU; `size` is left at profile()'s default of (16,10,256,256).
profile(device='cpu', types='float32')

Profiling over 100 runs after 10 warmup runs.
relu_fwd:        15.49ms ± 768.0µs, 14.12ms, 17.71ms
relu_bwd:        35.51ms ± 1.919ms, 32.72ms, 43.52ms
leaky_relu_fwd:  16.01ms ± 868.3µs, 14.39ms, 20.81ms
leaky_relu_bwd:  35.29ms ± 1.139ms, 33.73ms, 40.99ms
softplus_fwd:    66.98ms ± 1.975ms, 62.24ms, 75.23ms
softplus_bwd:    41.12ms ± 1.893ms, 37.99ms, 49.38ms
silu_jit_fwd:    43.30ms ± 1.571ms, 39.81ms, 47.84ms
silu_jit_bwd:    142.4ms ± 3.976ms, 135.5ms, 156.1ms
silu_native_fwd: 23.05ms ± 1.368ms, 20.84ms, 28.65ms
silu_native_bwd: 41.87ms ± 1.587ms, 39.88ms, 52.11ms
mish_naive_fwd:  124.8ms ± 3.692ms, 118.8ms, 137.8ms
mish_naive_bwd:  118.7ms ± 3.661ms, 112.9ms, 130.1ms
mish_jit_fwd:    139.6ms ± 2.405ms, 133.1ms, 150.1ms
mish_jit_bwd:    270.1ms ± 5.565ms, 258.9ms, 286.4ms
mish_native_fwd: 139.6ms ± 2.304ms, 135.7ms, 150.3ms
mish_native_bwd: 173.2ms ± 2.784ms, 166.2ms, 182.4ms


In [4]:
# float32 on CPU with a batch dimension of 64 instead of the default 16.
profile(device='cpu', types='float32', size='(64,10,256,256)')

Profiling over 100 runs after 10 warmup runs.
relu_fwd:        59.45ms ± 2.378ms, 54.24ms, 66.46ms
relu_bwd:        136.5ms ± 3.777ms, 128.7ms, 146.9ms
leaky_relu_fwd:  63.26ms ± 1.723ms, 59.52ms, 69.18ms
leaky_relu_bwd:  139.9ms ± 3.657ms, 131.3ms, 151.8ms
softplus_fwd:    267.5ms ± 5.315ms, 256.5ms, 279.5ms
softplus_bwd:    162.7ms ± 4.240ms, 153.0ms, 174.3ms
silu_jit_fwd:    168.0ms ± 4.140ms, 158.5ms, 187.2ms
silu_jit_bwd:    560.7ms ± 11.17ms, 533.8ms, 585.3ms
silu_native_fwd: 91.60ms ± 3.132ms, 83.69ms, 101.3ms
silu_native_bwd: 166.7ms ± 4.551ms, 158.1ms, 178.5ms
mish_naive_fwd:  499.9ms ± 9.079ms, 482.2ms, 523.9ms
mish_naive_bwd:  471.1ms ± 10.61ms, 443.7ms, 496.3ms
mish_jit_fwd:    509.2ms ± 10.31ms, 487.0ms, 532.1ms
mish_jit_bwd:    1.064 s ± 17.73ms, 1.024 s, 1.114 s
mish_native_fwd: 573.2ms ± 7.804ms, 554.6ms, 589.9ms
mish_native_bwd: 691.8ms ± 8.222ms, 671.7ms, 707.4ms


# Skylake Benchmark

In [3]:
# Print the host's CPU description so the timings that follow can be attributed to it.
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 85
model name	: Intel(R) Xeon(R) CPU @ 2.00GHz
stepping	: 3
microcode	: 0x1
cpu MHz		: 2000.186
cache size	: 39424 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	:

In [4]:
# float32 on CPU; `size` is left at profile()'s default of (16,10,256,256).
profile(device='cpu', types='float32')

Profiling over 100 runs after 10 warmup runs.
relu_fwd:        9.791ms ± 541.3µs, 9.225ms, 13.19ms
relu_bwd:        26.25ms ± 808.5µs, 25.39ms, 29.88ms
leaky_relu_fwd:  10.69ms ± 664.5µs, 9.579ms, 13.68ms
leaky_relu_bwd:  28.27ms ± 1.210ms, 26.49ms, 31.89ms
softplus_fwd:    48.18ms ± 2.954ms, 43.05ms, 53.93ms
softplus_bwd:    32.99ms ± 2.090ms, 30.05ms, 40.69ms
silu_jit_fwd:    27.55ms ± 1.636ms, 25.81ms, 34.08ms
silu_jit_bwd:    95.21ms ± 2.974ms, 91.30ms, 105.0ms
silu_native_fwd: 13.83ms ± 687.5µs, 13.23ms, 17.85ms
silu_native_bwd: 30.66ms ± 1.096ms, 29.63ms, 35.38ms
mish_naive_fwd:  74.72ms ± 5.135ms, 68.01ms, 85.55ms
mish_naive_bwd:  84.82ms ± 3.569ms, 78.94ms, 94.52ms
mish_jit_fwd:    73.17ms ± 4.608ms, 67.65ms, 85.18ms
mish_jit_bwd:    170.3ms ± 7.535ms, 160.6ms, 189.9ms
mish_native_fwd: 113.9ms ± 7.488ms, 103.4ms, 129.7ms
mish_native_bwd: 142.7ms ± 9.413ms, 130.2ms, 164.6ms


In [5]:
# float32 on CPU with a batch dimension of 64 instead of the default 16.
profile(device='cpu', types='float32', size='(64,10,256,256)')

Profiling over 100 runs after 10 warmup runs.
relu_fwd:        41.32ms ± 1.600ms, 38.73ms, 45.67ms
relu_bwd:        109.2ms ± 2.995ms, 102.6ms, 117.3ms
leaky_relu_fwd:  41.42ms ± 1.746ms, 38.96ms, 47.14ms
leaky_relu_bwd:  106.5ms ± 3.433ms, 102.9ms, 118.4ms
softplus_fwd:    192.2ms ± 13.23ms, 176.6ms, 232.9ms
softplus_bwd:    126.0ms ± 6.323ms, 117.7ms, 145.4ms
silu_jit_fwd:    117.0ms ± 5.936ms, 108.8ms, 133.7ms
silu_jit_bwd:    388.1ms ± 12.02ms, 370.5ms, 419.9ms
silu_native_fwd: 61.13ms ± 4.576ms, 55.08ms, 74.42ms
silu_native_bwd: 128.3ms ± 6.544ms, 120.1ms, 150.0ms
mish_naive_fwd:  289.8ms ± 17.85ms, 272.0ms, 348.9ms
mish_naive_bwd:  331.7ms ± 11.10ms, 317.6ms, 361.1ms
mish_jit_fwd:    299.1ms ± 17.01ms, 275.1ms, 343.5ms
mish_jit_bwd:    691.3ms ± 29.15ms, 647.8ms, 759.4ms
mish_native_fwd: 445.9ms ± 33.82ms, 414.7ms, 543.5ms
mish_native_bwd: 553.9ms ± 36.91ms, 514.4ms, 659.7ms
