In [1]:
import torch, os, math, gzip, pickle
import matplotlib.pyplot as plt
from urllib.request import urlretrieve
from pathlib import Path

from torch import tensor
import torchvision as tv
import torchvision.transforms.functional as tvf
from torchvision import io
from torch.utils.cpp_extension import load_inline

## Matrix Multiplication

2d Matrix multiplication

In [2]:
N, M = 50, 75

In [3]:
A = torch.randn(N, M)
B = torch.randn(M, N)
C = torch.zeros(N, N, dtype=torch.float32)

In [None]:
for i in range(N):
  for j in range(N):
    for k in range(M):
      C[i,j] += A[i,k] * B[k,j]

In [None]:
def mat_mul(A, B):
  C = torch.zeros(N, N)
  for i in range(N):
    for j in range(N):
      for k in range(M):
        C[i,j] += A[i,k] * B[k,j]
  return C

In [None]:
%%timeit
C = mat_mul(A, B)

30.3 s ¬± 343 ms per loop (mean ¬± std. dev. of 7 runs, 1 loop each)


In [None]:
C.shape

torch.Size([100, 100])

## CUDA setup

In [4]:
os.environ['CUDA_LAUNCH_BLOCKING']='1'
# Get the CUDA capability of the current device
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    # Set the environment variable with the detected architecture
    os.environ['TORCH_CUDA_ARCH_LIST'] = f"{major}.{minor}"
    print(f"Setting TORCH_CUDA_ARCH_LIST to: {os.environ['TORCH_CUDA_ARCH_LIST']}")

Setting TORCH_CUDA_ARCH_LIST to: 7.5


In [5]:
%pip install -q wurlitzer ninja

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/422.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m422.8/422.8 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
%load_ext wurlitzer

In [7]:
def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False):
    return load_inline(cuda_sources=[cuda_src], cpp_sources=[cpp_src], functions=funcs,
                       extra_cuda_cflags=["-O2"] if opt else [], verbose=verbose, name="inline_ext")

In [47]:
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

'''

## row major matrix mul


### üöÄ CUDA Matrix Multiplication (Row-Major) Kernel

This CUDA kernel performs matrix multiplication for **row-major** matrices.

#### Matrix Shapes:
- `A`: shape (m √ó n)
- `B`: shape (n √ó o)
- `C = A √ó B`: shape (m √ó o)

---

#### üß† Kernel Logic
```cpp
extern "C" __global__ void mat_mul_kernel(
  float *A, float *B, float *C,
  int m, int n, int o)
{
    int r = blockIdx.y * blockDim.y + threadIdx.y;

    if (r < m) {
        for (int c = 0; c < o; c++) {
            float val = 0.0f;
            for (int i = 0; i < n; i++) {
                val += A[r * n + i] * B[i * o + c];
            }
            C[r * o + c] = val;
        }
    }
}


In [19]:
cuda_src = cuda_begin + r'''

extern "C" __global__ void mat_mul_kernel(
    const float* A, const float* B, float* C,
    int m, int n, int o) {

    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;

    if (r < m && c < o) {
        float val = 0.0f;
        for (int i = 0; i < n; ++i) {
            val += A[r * n + i] * B[i * o + c];
        }
        C[r * o + c] = val;
    }
}


torch::Tensor mat_mul_row(torch::Tensor A, torch::Tensor B) {
    CHECK_INPUT(A);
    CHECK_INPUT(B);

    int m = A.size(0);
    int n = A.size(1);
    int o = B.size(1);

    auto C = torch::zeros({m, o}, A.options());

    dim3 threads(32, 32);
    dim3 blocks((o + 31) / 32, (m + 31) / 32);

    mat_mul_kernel<<<blocks, threads>>>(
        A.data_ptr<float>(),
        B.data_ptr<float>(),
        C.data_ptr<float>(),
        m, n, o);

    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return C;
}


'''

In [20]:
cpp_src = "torch::Tensor mat_mul_row(torch::Tensor A, torch::Tensor B);"
module = load_inline(
    name="inline_ext_v2",  # üîÅ Force rebuild by changing name
    cuda_sources=[cuda_src],
    cpp_sources=[cpp_src],
    functions=['mat_mul_row'],
    extra_cuda_cflags=["-O2"],
    verbose=True
)


Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
The input conditions for extension module inline_ext_v2 have changed. Bumping to version 1 and re-building as inline_ext_v2_v1...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/inline_ext_v2/build.ninja...
Building extension module inline_ext_v2_v1...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext_v2_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-packages/torch/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.11/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /root/.cache/torch_extensions/py311_cu124/inline_ext_v2/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext_v2_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-package

Loading extension module inline_ext_v2_v1...


In [21]:
dir(module)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'mat_mul_row']

In [22]:
Ac = A.contiguous().cuda()
Bc = B.contiguous().cuda()

In [23]:
Ac.shape, Bc.shape

(torch.Size([50, 75]), torch.Size([75, 50]))

In [24]:
%%time
C = module.mat_mul_row(Ac, Bc)

CPU times: user 1.14 ms, sys: 9.03 ms, total: 10.2 ms
Wall time: 23.1 ms


In [25]:
N, M = 1150, 750

In [26]:
A = torch.randn(N, M)
B = torch.randn(M, N)
C = torch.zeros(N, N, dtype=torch.float32)
Ac = A.contiguous().cuda()
Bc = B.contiguous().cuda()

In [28]:
%%time
C = module.mat_mul_row(Ac, Bc)

CPU times: user 6.3 ms, sys: 56 ¬µs, total: 6.35 ms
Wall time: 6.48 ms




### üöÄ CUDA Matrix Multiplication (Col-Major) Kernel

This CUDA kernel performs matrix multiplication for **col-major** matrices.

#### Matrix Shapes:
- `A`: shape (m √ó n)
- `B`: shape (n √ó o)
- `C = A √ó B`: shape (m √ó o)

In [50]:
cuda_begin

'\n#include <torch/extension.h>\n#include <stdio.h>\n#include <c10/cuda/CUDAException.h>\n\n#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")\n#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")\n#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)\n\n'

In [48]:
cuda_src_col = cuda_begin + r'''

extern "C" __global__ void mat_mulC_kernel(
  float *A, float *B, float *C,
  int m, int n, int o)
{

    int c = blockIdx.x * blockDim.x + threadIdx.x;

    if (c < o) {
        for (int r = 0; r < m; r++) {
            float val = 0.0f;
            for (int i = 0; i < n; i++) {
                val += A[r * n + i] * B[i * o + c];
            }
            C[r * o + c] = val;
        }
    }
}

int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}


torch::Tensor mat_mul_col(torch::Tensor A, torch::Tensor B) {
    CHECK_INPUT(A);
    CHECK_INPUT(B);

    int m = A.size(0);
    int n = A.size(1);
    int o = B.size(1);

    auto C = torch::zeros({m, o}, A.options());

    dim3 threads(32, 32);
    dim3 blocks(cdiv(m, threads.x), cdiv(o, threads.y));

    mat_mulC_kernel<<<blocks, threads>>>(
        A.data_ptr<float>(),
        B.data_ptr<float>(),
        C.data_ptr<float>(),
        m, n, o);

    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return C;
}
'''

In [51]:
cpp_src = "torch::Tensor mat_mul_col(torch::Tensor A, torch::Tensor B);"
module = load_inline(
    name="inline_ext_v4",  # üîÅ Force rebuild by changing name
    cuda_sources=[cuda_src_col],
    cpp_sources=[cpp_src], # Corrected cpp_sources
    functions=['mat_mul_col'],
    extra_cuda_cflags=["-O2"],
    verbose=True
)

Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
The input conditions for extension module inline_ext_v4 have changed. Bumping to version 7 and re-building as inline_ext_v4_v7...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/inline_ext_v4/build.ninja...
Building extension module inline_ext_v4_v7...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext_v4_v7 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-packages/torch/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.11/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /root/.cache/torch_extensions/py311_cu124/inline_ext_v4/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext_v4_v7 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-package

Loading extension module inline_ext_v4_v7...


In [53]:
%%time
C = module.mat_mul_col(Ac, Bc)

CPU times: user 5.43 s, sys: 0 ns, total: 5.43 s
Wall time: 5.42 s


## Conclusion
Yes, based on the execution times you observed (5.8ms for the row-major kernel and 5.4s for the column-major kernel), it appears that the row-major operation was significantly faster in this case.