Aim: To implement vector addition using CUDA.

In [1]:
import torch, os, math, gzip, pickle
from pathlib import Path

from torch import tensor
from torch.utils.cpp_extension import load_inline

In [43]:
# Problem size and the two random input vectors used by every benchmark below.
N = 10_000
# Seed the RNG so A and B are the same on every Restart & Run All.
torch.manual_seed(0)
A = torch.randn(N)
B = torch.randn(N)

A.shape, B.shape

(torch.Size([10000]), torch.Size([10000]))

### Basic way

In [44]:
def basic():
  """Add A and B one element at a time in pure Python (slow baseline).

  Reads the notebook-level globals N, A and B, and returns a new tensor
  built from the N element-wise sums.
  """
  sums = [A[idx] + B[idx] for idx in range(N)]
  return tensor(sums)

In [45]:
%%timeit
# Time the element-by-element Python loop baseline.
c = basic()

74.1 ms ± 781 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Tensor way

In [46]:
def tensor_way():
  """Add the global tensors A and B with a single vectorized PyTorch call."""
  return torch.add(A, B)

In [47]:
%%timeit
# Time the vectorized tensor addition.
c = tensor_way()

4.25 µs ± 201 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### CUDA kernel

In [11]:
# Debug aid: CUDA_LAUNCH_BLOCKING=1 should make kernel launches synchronous,
# so a failing kernel is reported at its launch site.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

if not torch.cuda.is_available():
    print("CUDA is not available. The warning about TORCH_CUDA_ARCH_LIST is not relevant in this case.")
else:
    # Publish the current device's compute capability (e.g. "7.5") as
    # TORCH_CUDA_ARCH_LIST for the extension build further down.
    major, minor = torch.cuda.get_device_capability()
    os.environ['TORCH_CUDA_ARCH_LIST'] = f"{major}.{minor}"
    print(f"Setting TORCH_CUDA_ARCH_LIST to: {os.environ['TORCH_CUDA_ARCH_LIST']}")

Setting TORCH_CUDA_ARCH_LIST to: 7.5


In [12]:
%pip install -q wurlitzer ninja

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/422.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.8/422.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
%load_ext wurlitzer

In [14]:
def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False, name="inline_ext"):
    """Compile CUDA/C++ source strings into a loadable extension module.

    Thin wrapper around ``torch.utils.cpp_extension.load_inline``.

    Args:
        cuda_src: CUDA source code; passed as the single entry of ``cuda_sources``.
        cpp_src: C++ source (declarations of the exported functions); passed as
            the single entry of ``cpp_sources``.
        funcs: names of the functions to expose, forwarded as ``functions``.
        opt: when True, pass ``-O2`` to the CUDA compiler; otherwise no extra flags.
        verbose: forwarded to ``load_inline``.
        name: extension name forwarded to ``load_inline``. Defaults to
            ``"inline_ext"`` (the value that used to be hard-coded); pass a
            distinct name per kernel so unrelated sources do not share one
            build name.

    Returns:
        The value returned by ``load_inline`` (the loaded extension module).
    """
    cuda_flags = ["-O2"] if opt else []
    return load_inline(name=name, cuda_sources=[cuda_src], cpp_sources=[cpp_src],
                       functions=funcs, extra_cuda_cflags=cuda_flags, verbose=verbose)

In [15]:
# Shared C++/CUDA prelude that is prepended to the kernel source string:
# headers, input-validation macros and a ceiling-division helper.
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// do/while(0) keeps both checks together as one statement, e.g. under an unbraced `if`.
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)

// Ceiling division: number of size-b chunks needed to cover a items (b must be non-zero).
inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

In [32]:
# Kernel plus host wrapper for element-wise float32 vector addition.
cuda_src = cuda_begin + r'''
// One thread per element: thread i writes C[i] = A[i] + B[i]; threads with i >= n do nothing.
__global__ void add_kernel(const float* A, const float* B, float* C, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i<n){
       C[i] = A[i] + B[i];
    }
}

// Host wrapper: validates the inputs, allocates the output and launches add_kernel
// with `threads` threads per block and enough blocks to cover N elements.
torch::Tensor add(torch::Tensor A, torch::Tensor B, int N, int threads = 256) {
    CHECK_INPUT(A);
    CHECK_INPUT(B);

    // size(0) alone would accept a contiguous (N, k) tensor and only add its first N values.
    TORCH_CHECK(A.dim() == 1 && B.dim() == 1, "A and B must be 1-D tensors");
    TORCH_CHECK(A.size(0) == N && B.size(0)  == N, "Input tensor shapes must match N");
    TORCH_CHECK(A.scalar_type() == torch::kFloat32 && B.scalar_type() == torch::kFloat32,
                "A and B must be float32 tensors");
    TORCH_CHECK(A.device() == B.device(), "A and B must be on the same CUDA device");
    // threads feeds cdiv() as the divisor and the launch as the block size.
    TORCH_CHECK(threads > 0, "threads must be positive");

    auto C = torch::empty({N}, A.options());
    if (N == 0) {
        return C;                               // nothing to add; a zero-block grid is not a valid launch
    }

    add_kernel<<<cdiv(N, threads), threads>>>(
      A.data_ptr<float>(),                      // data_ptr() returns a pointer to the underlying data
      B.data_ptr<float>(),
      C.data_ptr<float>(),
      N);
    C10_CUDA_KERNEL_LAUNCH_CHECK();             // surfaces launch errors (e.g. an oversized block) as exceptions
    return C;
}'''

In [34]:
# C++ declaration of the function defined in cuda_src; `add` is the name exported to Python.
cpp_src = "torch::Tensor add(torch::Tensor A, torch::Tensor B, int N, int threads);"

# Build and load the extension (verbose=True prints the build log below).
module = load_cuda(cuda_src=cuda_src, cpp_src=cpp_src, funcs=['add'], verbose=True)

Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
The input conditions for extension module inline_ext have changed. Bumping to version 6 and re-building as inline_ext_v6...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/inline_ext/build.ninja...
Building extension module inline_ext_v6...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext_v6 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-packages/torch/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.11/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.11/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /root/.cache/torch_extensions/py311_cu124/inline_ext/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext_v6 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.11/dist-packages/torch/i

Loading extension module inline_ext_v6...


In [35]:
dir(module)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'add']

In [49]:
# Rebind the global A and B to contiguous tensors on the GPU; the `add` extension
# rejects inputs that are not CUDA tensors or not contiguous.
# Note: basic() and tensor_way() read these same globals, so re-running them
# after this cell operates on the GPU tensors.
A = A.contiguous().cuda()
B = B.contiguous().cuda()

In [37]:
%%timeit
# Time the custom CUDA kernel with 256 threads per block.
# NOTE(review): there is no explicit torch.cuda.synchronize() here; the measurement
# presumably relies on CUDA_LAUNCH_BLOCKING='1' (set earlier) to wait for the kernel — confirm.
C = module.add(A, B, N, 256)
C.shape

28.8 µs ± 2.16 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [38]:
2 ** 10

1024

In [50]:
%%timeit
# Time the custom CUDA kernel with 1024 threads per block.
# NOTE(review): there is no explicit torch.cuda.synchronize() here; the measurement
# presumably relies on CUDA_LAUNCH_BLOCKING='1' (set earlier) to wait for the kernel — confirm.
C = module.add(A, B, N, 1024)
C.shape

28.2 µs ± 1.51 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
