In [1]:
from sparsemm_kernels.autotune import autotune
from sparsemm_kernels.up_dejavu import bench_sparsemm_up_dejavu
from sparsemm_kernels.up_neo import bench_sparsemm_up_neo
from sparsemm_kernels.up_dense import bench_sparsemm_up_dense
from sparsemm_kernels.up_cats import bench_sparsemm_up_cats
from sparsemm_kernels.down_dense import bench_sparsemm_down_dense

import torch
import triton

# Problem dimensions that the benchmark cells in this notebook pass to the
# bench_sparsemm_* helpers.
BATCH_SIZE, EMBED_DIM, HIDDEN_DIM = 512, 5120, 13824
# P and Q are forwarded positionally to the benchmarks after the dimensions.
# NOTE(review): their meaning is defined inside sparsemm_kernels, not in this
# notebook -- confirm against the bench_* signatures.
P, Q = 1, 10000

In [33]:
from sparsemm_kernels.gemm_bcsr import sparsemm_gemm_bcsr

# Correctness check: compare sparsemm_gemm_bcsr against a dense torch.matmul
# on a (128, 5120) x (5120, 13824) float16 product.
A = torch.empty(128, 5120, dtype=torch.float16, device='cuda')
B = torch.empty(5120, 13824, dtype=torch.float16, device='cuda')

# Xavier-initialize BEFORE masking. xavier_uniform_ writes every element in
# place, so running it after the mask (as this cell used to) discarded the
# mask entirely and left A fully dense. It also meant the mask was being
# multiplied into uninitialized torch.empty memory.
torch.nn.init.xavier_uniform_(A)
torch.nn.init.xavier_uniform_(B)

# Randomly zero out ~99% of A: an element survives only when its uniform
# draw falls below 0.01. (The previous `> 0.01` kept ~99% instead.)
mask = torch.rand_like(A) < 0.01
A = A * mask

C = sparsemm_gemm_bcsr(A, B)
C_ref = torch.matmul(A, B)
Zeros = torch.zeros_like(C)

# First line: the kernel output should match the dense reference (True).
# Second line: sanity check that the reference is not trivially all zeros,
# so a True above is meaningful (expected to print False).
print(torch.allclose(C, C_ref, atol=1e-3))
print(torch.allclose(Zeros, C_ref, atol=1e-3))

True
False


In [34]:
from sparsemm_kernels.autotune import autotune
from sparsemm_kernels.gemm_bcsr import bench_sparsemm_gemm_bcsr

# Problem size for the BCSR GEMM benchmark, plus the sparsity argument that is
# forwarded to bench_sparsemm_gemm_bcsr as its fourth positional value.
BATCH_SIZE, EMBED_DIM, HIDDEN_DIM = 512, 5120, 13824
SPARSITY = 0.1

# Search the launch configuration of bench_sparsemm_gemm_bcsr. BLOCK_SIZE_M is
# pinned to a single candidate; every other knob is explored. The call is left
# as the cell's last expression so its return value is displayed.
autotune(
    bench_sparsemm_gemm_bcsr,
    (BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, SPARSITY),
    dict(
        BLOCK_SIZE_M=[16],
        BLOCK_SIZE_N=[16, 32, 64, 128, 256],
        BLOCK_SIZE_K=[16, 32, 64, 128, 256],
        GROUP_SIZE_N=[1, 2, 4, 8],
        num_stages=list(range(2, 6)),
        num_warps=[4, 8],
    ),
)

[I 2024-11-14 01:11:12,433] A new study created in memory with name: no-name-04e589d6-d8e9-4c54-b34a-565fdfeddbcd
[I 2024-11-14 01:11:13,348] Trial 0 finished with value: 3.2147912979125977 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_N': 4, 'num_stages': 3, 'num_warps': 4}. Best is trial 0 with value: 3.2147912979125977.
[I 2024-11-14 01:11:14,235] Trial 1 finished with value: 2.9501442909240723 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 1, 'num_stages': 5, 'num_warps': 4}. Best is trial 1 with value: 2.9501442909240723.
[I 2024-11-14 01:11:15,033] Trial 2 finished with value: 1.311458706855774 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 2 with value: 1.311458706855774.
loc("/home/tianhaodong/repo/sp-gated-mlp-kernels/sparsemm_kernels/gemm_bcsr.py":67:27): error: operation scheduled before it

out of resource: shared memory, Required: 152064, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:19,310] Trial 8 finished with value: 4.029067516326904 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_N': 1, 'num_stages': 5, 'num_warps': 8}. Best is trial 2 with value: 1.311458706855774.
[I 2024-11-14 01:11:19,803] Trial 9 finished with value: 1.4850794076919556 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_N': 8, 'num_stages': 4, 'num_warps': 4}. Best is trial 2 with value: 1.311458706855774.
[I 2024-11-14 01:11:19,941] Trial 10 finished with value: 1.3935598134994507 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 2, 'num_stages': 2, 'num_warps': 4}. Best is trial 2 with value: 1.311458706855774.
[I 2024-11-14 01:11:20,079] Trial 11 finished with value: 1.3689515590667725 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_N': 8, 'num_stages': 4, 'num_warps': 8}. Best is trial 2 with value: 1.311

out of resource: shared memory, Required: 141824, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:25,185] Trial 15 finished with value: inf and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_N': 2, 'num_stages': 5, 'num_warps': 8}. Best is trial 2 with value: 1.311458706855774.


out of resource: shared memory, Required: 152576, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:26,160] Trial 16 finished with value: 1.9870343208312988 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_N': 4, 'num_stages': 5, 'num_warps': 4}. Best is trial 2 with value: 1.311458706855774.
[I 2024-11-14 01:11:26,954] Trial 17 finished with value: 1.664522409439087 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 4, 'num_stages': 5, 'num_warps': 8}. Best is trial 2 with value: 1.311458706855774.
loc("/home/tianhaodong/repo/sp-gated-mlp-kernels/sparsemm_kernels/gemm_bcsr.py":67:27): error: operation scheduled before its operands
[I 2024-11-14 01:11:27,546] Trial 18 finished with value: 2.423454523086548 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 2, 'num_stages': 2, 'num_warps': 8}. Best is trial 2 with value: 1.311458706855774.
[I 2024-11-14 01:11:28,236] Trial 19 finished with value: 2.394087553024292 and parameters: {'BLOCK_SIZ

out of resource: shared memory, Required: 283136, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


loc("/home/tianhaodong/repo/sp-gated-mlp-kernels/sparsemm_kernels/gemm_bcsr.py":67:27): error: operation scheduled before its operands
[I 2024-11-14 01:11:32,075] Trial 37 finished with value: 7.933469772338867 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_N': 1, 'num_stages': 2, 'num_warps': 4}. Best is trial 24 with value: 1.2248382568359375.
[I 2024-11-14 01:11:32,923] Trial 38 finished with value: 4.4668869972229 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 24 with value: 1.2248382568359375.
[I 2024-11-14 01:11:33,716] Trial 39 finished with value: 2.4029479026794434 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_N': 8, 'num_stages': 4, 'num_warps': 4}. Best is trial 24 with value: 1.2248382568359375.
[I 2024-11-14 01:11:33,750] Trial 40 finished with value: inf and parameters: {'BLOCK_SIZE_M': 16, '

out of resource: shared memory, Required: 141824, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:34,028] Trial 42 finished with value: 1.2569048404693604 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 24 with value: 1.2248382568359375.
[I 2024-11-14 01:11:35,424] Trial 43 finished with value: 2.5583717823028564 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 24 with value: 1.2248382568359375.
[I 2024-11-14 01:11:35,561] Trial 44 finished with value: 2.6114561557769775 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 24 with value: 1.2248382568359375.
[I 2024-11-14 01:11:35,739] Trial 45 finished with value: 4.720239162445068 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_N': 1, 'num_stages': 5, 'num_warps': 4}. Best is trial 24 with v

out of resource: shared memory, Required: 141824, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:43,808] Trial 76 finished with value: 1.8748321533203125 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:45,665] Trial 77 finished with value: 3.487936019897461 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_N': 8, 'num_stages': 3, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:46,527] Trial 78 finished with value: 2.1140310764312744 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 8}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:47,043] Trial 79 finished with value: 2.9022140502929688 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with 

out of resource: shared memory, Required: 141824, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:48,801] Trial 84 finished with value: 1.2285362482070923 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:48,815] Trial 85 finished with value: inf and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_N': 8, 'num_stages': 2, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:48,963] Trial 86 finished with value: 1.6535592079162598 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.


out of resource: shared memory, Required: 141824, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:50,069] Trial 87 finished with value: 2.2570347785949707 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 4, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:50,839] Trial 88 finished with value: 1.8126057386398315 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:50,996] Trial 89 finished with value: 1.6318742036819458 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 4, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:51,026] Trial 90 finished with value: inf and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.21686

out of resource: shared memory, Required: 141824, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


[I 2024-11-14 01:11:51,279] Trial 92 finished with value: 1.2568103075027466 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:51,512] Trial 93 finished with value: 2.4039108753204346 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:51,658] Trial 94 finished with value: 2.600677490234375 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 54 with value: 1.2168688774108887.
[I 2024-11-14 01:11:52,937] Trial 95 finished with value: 2.260634422302246 and parameters: {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_N': 8, 'num_stages': 3, 'num_warps': 4}. Best is trial 54 with v

out of resource: shared memory, Required: 152064, Hardware limit: 101376. Reducing block sizes or `num_stages` may help.


({'BLOCK_SIZE_M': 16,
  'BLOCK_SIZE_N': 256,
  'BLOCK_SIZE_K': 32,
  'GROUP_SIZE_N': 8,
  'num_stages': 5,
  'num_warps': 4},
 1.2168688774108887)

In [13]:
# Dense baseline: time a plain float16 matmul at the benchmark problem size so
# the tuned sparse kernels have a reference number to compare against.
A = torch.randn((BATCH_SIZE, EMBED_DIM), dtype=torch.float16, device="cuda")
B = torch.randn((EMBED_DIM, HIDDEN_DIM), dtype=torch.float16, device="cuda")

# The timing returned by do_bench is reported below in milliseconds.
ms = triton.testing.do_bench(lambda: torch.matmul(A, B))
print(f"cuBLAS: {ms:.8f}ms")

cuBLAS: 1.47721159ms


In [None]:
# Search the launch configuration of bench_sparsemm_up_dejavu: 100 trials over
# the candidate tile sizes, pipeline stages and warp counts listed below.
autotune(
    bench_sparsemm_up_dejavu,
    (BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, P, Q),
    dict(
        BLOCK_SIZE_M=[16, 32, 64, 128, 256],
        BLOCK_SIZE_K=[16, 32, 64, 128, 256],
        BLOCK_SIZE_Q=[16, 32, 64, 128, 256],
        num_stages=list(range(2, 6)),
        num_warps=[4, 8],
    ),
    n_trials=100,
)

In [None]:
# Search the launch configuration of bench_sparsemm_up_neo. Compared with the
# dejavu search in this notebook, this one also explores GROUP_SIZE_Q.
autotune(
    bench_sparsemm_up_neo,
    (BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, P, Q),
    dict(
        BLOCK_SIZE_M=[16, 32, 64, 128, 256],
        BLOCK_SIZE_K=[16, 32, 64, 128, 256],
        BLOCK_SIZE_Q=[16, 32, 64, 128, 256],
        GROUP_SIZE_Q=[1, 2, 4, 8, 16],
        num_stages=list(range(2, 6)),
        num_warps=[4, 8],
    ),
    n_trials=100,
)

In [None]:
# Search the launch configuration of bench_sparsemm_up_cats: 100 trials over
# M/K/N tile sizes, the N grouping factor, pipeline stages and warp counts.
autotune(
    bench_sparsemm_up_cats,
    (BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, P, Q),
    dict(
        BLOCK_SIZE_M=[16, 32, 64, 128, 256],
        BLOCK_SIZE_K=[16, 32, 64, 128, 256],
        BLOCK_SIZE_N=[16, 32, 64, 128, 256],
        GROUP_SIZE_N=[1, 2, 4, 8, 16],
        num_stages=list(range(2, 6)),
        num_warps=[4, 8],
    ),
    n_trials=100,
)

In [None]:
# Run the dense down-projection benchmark directly (no autotune search is
# performed in this cell); the call's return value is the cell output.
bench_sparsemm_down_dense(
    BATCH_SIZE,
    EMBED_DIM,
    HIDDEN_DIM,
    P,
    Q,
)

In [None]:
from sparsemm_kernels.down_dejavu import bench_sparsemm_down_dejavu

# Search the launch configuration of bench_sparsemm_down_dejavu: 100 trials
# over M/Q/N tile sizes, pipeline stages and warp counts.
autotune(
    bench_sparsemm_down_dejavu,
    (BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, P, Q),
    dict(
        BLOCK_SIZE_M=[16, 32, 64, 128, 256],
        BLOCK_SIZE_Q=[16, 32, 64, 128, 256],
        BLOCK_SIZE_N=[16, 32, 64, 128, 256],
        num_stages=list(range(2, 6)),
        num_warps=[4, 8],
    ),
    n_trials=100,
)

In [7]:
from sparsemm_kernels.down_neo import bench_sparsemm_down_neo

# Search the launch configuration of bench_sparsemm_down_neo: 100 trials over
# M/Q/N tile sizes, the N grouping factor, pipeline stages and warp counts.
# The displayed result is a (best parameters, best value) pair.
autotune(
    bench_sparsemm_down_neo,
    (BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, P, Q),
    dict(
        BLOCK_SIZE_M=[16, 32, 64, 128, 256],
        BLOCK_SIZE_Q=[16, 32, 64, 128, 256],
        BLOCK_SIZE_N=[16, 32, 64, 128, 256],
        GROUP_SIZE_N=[1, 2, 4, 8, 16],
        num_stages=list(range(2, 6)),
        num_warps=[4, 8],
    ),
    n_trials=100,
)

[I 2024-11-01 20:48:40,532] A new study created in memory with name: no-name-d4f724a3-e39a-433f-9bb9-f892713cb416
loc("/home/tianhaodong/repo/sp-gated-mlp-kernels/sparsemm_kernels/down_neo.py":41:49): error: operation scheduled before its operands
[I 2024-11-01 20:48:41,392] Trial 0 finished with value: 1.9837497472763062 and parameters: {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_Q': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 0 with value: 1.9837497472763062.
loc("/home/tianhaodong/repo/sp-gated-mlp-kernels/sparsemm_kernels/down_neo.py":41:49): error: operation scheduled before its operands
[I 2024-11-01 20:48:44,455] Trial 1 finished with value: 3.5551705360412598 and parameters: {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_Q': 64, 'BLOCK_SIZE_N': 256, 'GROUP_SIZE_N': 8, 'num_stages': 5, 'num_warps': 4}. Best is trial 0 with value: 1.9837497472763062.
loc("/home/tianhaodong/repo/sp-gated-mlp-kernels/sparsemm_kernels/down_neo.py":39:25): error: operation sc

({'BLOCK_SIZE_M': 128,
  'BLOCK_SIZE_Q': 32,
  'BLOCK_SIZE_N': 128,
  'GROUP_SIZE_N': 2,
  'num_stages': 4,
  'num_warps': 8},
 0.7734186053276062)

In [6]:
from sparsemm_kernels.down_splitk import bench_sparsemm_down_splitk

# Search the launch configuration of bench_sparsemm_down_splitk: 100 trials
# over M/Q/N tile sizes, the Q grouping factor, pipeline stages and warp counts.
autotune(
    bench_sparsemm_down_splitk,
    (BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, P, Q),
    dict(
        BLOCK_SIZE_M=[16, 32, 64, 128, 256],
        BLOCK_SIZE_Q=[16, 32, 64, 128, 256],
        BLOCK_SIZE_N=[16, 32, 64, 128, 256],
        GROUP_SIZE_Q=[1, 2, 4, 8, 16],
        num_stages=list(range(2, 6)),
        num_warps=[4, 8],
    ),
    n_trials=100,
)

[I 2024-11-01 20:47:52,705] A new study created in memory with name: no-name-583d6bf5-372f-4175-8ace-fefb4cb67e9b
[I 2024-11-01 20:47:53,445] Trial 0 finished with value: 3.661696195602417 and parameters: {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_Q': 64, 'BLOCK_SIZE_N': 256, 'GROUP_SIZE_Q': 8, 'num_stages': 5, 'num_warps': 8}. Best is trial 0 with value: 3.661696195602417.
[I 2024-11-01 20:47:53,847] Trial 1 finished with value: 5.247544765472412 and parameters: {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_Q': 64, 'BLOCK_SIZE_N': 16, 'GROUP_SIZE_Q': 4, 'num_stages': 5, 'num_warps': 4}. Best is trial 0 with value: 3.661696195602417.
[I 2024-11-01 20:47:54,595] Trial 2 finished with value: 8.762088775634766 and parameters: {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_Q': 32, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_Q': 4, 'num_stages': 3, 'num_warps': 4}. Best is trial 0 with value: 3.661696195602417.
[I 2024-11-01 20:47:55,920] Trial 3 finished with value: inf and parameters: {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_Q': 256, 'BLOCK_S

KeyboardInterrupt: 