In [1]:
import torch, os, math
from pathlib import Path
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
# from torch.utils.cpp_extension import load_inline
from wurlitzer import sys_pipes

from util import load_cuda, load_cuda_inline

In [2]:
# Compact display for both libraries: two decimals, wide lines;
# torch additionally never switches to scientific notation.
print_opts = dict(precision=2, linewidth=140)
np.set_printoptions(**print_opts)
torch.set_printoptions(sci_mode=False, **print_opts)

## CUDA Setup

In [3]:
# Sanity check: confirm PyTorch can see a CUDA device before any kernels are built.
torch.cuda.is_available()

True

In [4]:
# Set before the device is queried below: with CUDA_LAUNCH_BLOCKING=1 kernel
# launches run synchronously, so errors are reported at the call that caused them.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

# Build extensions only for the compute capability of device 0
# (exported through TORCH_CUDA_ARCH_LIST as "<major>.<minor>").
device_props = torch.cuda.get_device_properties(0)
arch = f'{device_props.major}.{device_props.minor}'
os.environ['TORCH_CUDA_ARCH_LIST'] = arch

print(os.environ.get('TORCH_CUDA_ARCH_LIST'))

7.5


In [6]:
%load_ext wurlitzer

## Vector sum kernel

In [6]:
# NOTE(review): disabled alternative build that passes a list of source files to
# util.load_cuda; the next cell builds the same kernel with load_cuda_inline instead.
# Remove this cell if the file-based path is no longer needed.
# sources = ["vector_sum.cu"]
# module = load_cuda(sources, build_directory="./build", verbose=True)

In [47]:
# Read the vector-sum CUDA source and its C++ counterpart from csrc/ ...
src_dir = Path("csrc")
cu_path = src_dir.joinpath("vector_sum.cu")
cpp_path = src_dir.joinpath("vector_sum.cpp")
cuda_src, cpp_src = cu_path.read_text(), cpp_path.read_text()

# ... and build them into an extension module that exposes `vector_sum`.
module = load_cuda_inline(
    cuda_src,
    cpp_src,
    ['vector_sum'],
    verbose=True,
    build_directory='./build',
)

The input conditions for extension module inline_ext have changed. Bumping to version 3 and re-building as inline_ext_v3...
Detected CUDA files, patching ldflags
Emitting ninja build file ./build/build.ninja...
Building extension module inline_ext_v3...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


Loading extension module inline_ext_v3...


In [None]:
# NOTE(review): this cell repeats the vector_sum build from the previous cell
# (same sources, same exported function, same build directory); one of the two
# can probably be removed.
src_dir = Path("csrc")
cu_path, cpp_path = src_dir/"vector_sum.cu", src_dir/"vector_sum.cpp"

cuda_src = cu_path.read_text()
cpp_src = cpp_path.read_text()

module = load_cuda_inline(
    cuda_src, cpp_src, ['vector_sum'],
    build_directory='./build', verbose=True,
)

The input conditions for extension module inline_ext have changed. Bumping to version 2 and re-building as inline_ext_v2...
Detected CUDA files, patching ldflags
Emitting ninja build file ./build/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module inline_ext_v2...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


Loading extension module inline_ext_v2...


In [11]:
# Public (non-underscore) names exposed by the freshly built extension module.
[name for name in dir(module) if not name.startswith('_')]

['vector_sum']

In [12]:
# Define test case
# Flip to False to use the small hand-checkable input instead of the random one.
use_random_input = True

if use_random_input:
  # Test case 1: n seeded standard-normal samples.
  # The size comes from n, so changing n actually changes the input.
  n = 200
  torch.manual_seed(1)
  inp = torch.randn(n)
else:
  # Test case 2: six small values whose sum (21) is easy to verify by hand.
  inp = torch.tensor([1., 2., 3., 4., 5., 6.])
  n = inp.shape[0]

# Both cases hand the kernel a contiguous tensor that lives on the GPU.
inp = inp.contiguous().cuda()

In [13]:
%%time
# Run the extension's vector_sum on the GPU input and copy the result back to the host.
sum_gpu = module.vector_sum(inp).cpu()
# sum_gpu

CPU times: user 613 μs, sys: 39 μs, total: 652 μs
Wall time: 661 μs


In [14]:
%%time
# Reference value: PyTorch's own reduction over the same GPU tensor, copied to the host.
# NOTE(review): the %%time figure for this cell may include one-time overhead of the
# first torch reduction on the device - use %timeit for a fair comparison.
sum_true = torch.sum(inp).cpu()

CPU times: user 8.53 ms, sys: 3.6 ms, total: 12.1 ms
Wall time: 32.9 ms


In [15]:
# Compare the kernel's result with the PyTorch reference sum.
print(sum_true, sum_gpu)
is_match = torch.isclose(sum_true, sum_gpu)
is_match.item()

tensor(-4.93) tensor([-4.93])


True

In [None]:
# Check correctness
# NOTE(review): this cell sits before the Matmul section, so on a fresh top-to-bottom
# run m1, m2, m1c and m2c are not defined yet and `module` is still the vector_sum
# build (it exposes no `matmul`). It also repeats the matmul check further down with
# a tighter tolerance (atol=1e-4 here vs 1e-3 there). Move it after the matmul build
# or drop it.
torch.isclose(torch.matmul(m1, m2), module.matmul(m1c, m2c).cpu(), atol=1e-4).all()

tensor(True)

## Matmul

In [5]:
# Fixed seed so both operands are reproducible across runs.
torch.manual_seed(1)
# Operands for a (1000 x 2000) @ (2000 x 1000) product, drawn in this order.
m1, m2 = torch.randn(1000, 2000), torch.randn(2000, 1000)

In [8]:
# Build the matmul extension from csrc/attention.cu and csrc/attention.cpp.
# sys_pipes is imported with the other dependencies at the top of the notebook
# instead of in the middle of it.
src_dir = Path("csrc")
cu_path =  src_dir/"attention.cu"
cpp_path = src_dir/"attention.cpp"
funcs = ["matmul"]

cuda_src = cu_path.read_text()
cpp_src = cpp_path.read_text()

# Only the build itself runs inside sys_pipes(), which forwards C-level
# stdout/stderr (where the compiler output goes) into the cell output.
with sys_pipes():
    module = load_cuda_inline(cuda_src, cpp_src, funcs, verbose=True, build_directory='./build')

RuntimeError: Error building extension 'inline_ext_v1': [1/3] /opt/conda/envs/flash-attention/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/TH -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/THC -isystem /opt/conda/envs/flash-attention/include -isystem /opt/conda/envs/flash-attention/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 --compiler-options '-fPIC' -std=c++17 -c /mnt/tobias/flash_attention/build/cuda.cu -o cuda.cuda.o 
[31mFAILED: [0mcuda.cuda.o 
/opt/conda/envs/flash-attention/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/TH -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/THC -isystem /opt/conda/envs/flash-attention/include -isystem /opt/conda/envs/flash-attention/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 --compiler-options '-fPIC' -std=c++17 -c /mnt/tobias/flash_attention/build/cuda.cu -o cuda.cuda.o 
/mnt/tobias/flash_attention/build/cuda.cu(84): error: expected a ";"
     return out;
     ^

/mnt/tobias/flash_attention/build/cuda.cu(85): warning #940-D: missing return statement at end of non-void function "matmul"
  }
  ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

1 error detected in the compilation of "/mnt/tobias/flash_attention/build/cuda.cu".
[2/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/TH -isystem /opt/conda/envs/flash-attention/lib/python3.11/site-packages/torch/include/THC -isystem /opt/conda/envs/flash-attention/include -isystem /opt/conda/envs/flash-attention/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /mnt/tobias/flash_attention/build/main.cpp -o main.o 
ninja: build stopped: subcommand failed.


In [41]:
# Contiguous GPU copies of both operands; m1 and m2 themselves stay on the host.
m1c, m2c = (m.contiguous().cuda() for m in (m1, m2))

In [42]:
# Run the extension's matmul on the GPU copies and bring the product back to the host.
res = module.matmul(m1c, m2c).cpu()

In [43]:
# Check correctness: every element must match the host-side PyTorch product
# within an absolute tolerance of 1e-3.
torch.all(torch.isclose(res, m1 @ m2, atol=1e-3))

tensor(True)

In [44]:
# Time the extension's matmul; the measurement includes the .cpu() copy of the result.
%timeit -n 10 _= module.matmul(m1c, m2c).cpu()

2.37 s ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [45]:
# Baseline: torch.matmul on the same GPU operands, timed the same way (including .cpu()).
%timeit -n 10 _= torch.matmul(m1c, m2c).cpu()

583 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
