From f9fa815dfd6a6e4d90d726095ca6cb0df9b0220b Mon Sep 17 00:00:00 2001
From: ardfork <134447697+ardfork@users.noreply.github.com>
Date: Tue, 6 Jun 2023 22:45:25 +0000
Subject: [PATCH 1/2] Add ROCm support

Co-authored-by: [ ]
---
 cuda_ext.py                           |  4 ++-
 exllama_ext/cuda_compat.cuh           |  4 +--
 exllama_ext/cuda_func/column_remap.cu |  7 +++++
 exllama_ext/cuda_func/half_matmul.cu  |  3 ++
 exllama_ext/cuda_func/half_matmul.cuh |  6 ++++
 exllama_ext/cuda_func/q4_matmul.cu    |  3 ++
 exllama_ext/cuda_func/q4_matmul.cuh   |  6 ++++
 exllama_ext/cuda_func/q4_mlp.cu       |  3 ++
 exllama_ext/hip_compat.cuh            | 45 +++++++++++++++++++++++++++
 exllama_ext/util.cuh                  |  4 +++
 10 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 exllama_ext/hip_compat.cuh

diff --git a/cuda_ext.py b/cuda_ext.py
index b8bd1679..c28fab48 100644
--- a/cuda_ext.py
+++ b/cuda_ext.py
@@ -53,8 +53,10 @@ def find_msvc():
         os.path.join(library_dir, "exllama_ext/cuda_func/q4_mlp.cu"),
         os.path.join(library_dir, "exllama_ext/cpu_func/rep_penalty.cpp")
     ],
+    extra_include_paths = [os.path.join(library_dir, "exllama_ext")],
     verbose = verbose,
-    extra_ldflags = ["cublas.lib"] if windows else []
+    extra_ldflags = ["cublas.lib"] if windows else [],
+    extra_cuda_cflags = ["-U__HIP_NO_HALF_CONVERSIONS__"] if torch.version.hip else []
     # extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"]
 )
 
diff --git a/exllama_ext/cuda_compat.cuh b/exllama_ext/cuda_compat.cuh
index c880997d..fbc5582f 100644
--- a/exllama_ext/cuda_compat.cuh
+++ b/exllama_ext/cuda_compat.cuh
@@ -41,8 +41,8 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 
 //
 
-#ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ < 700
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
 
 __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
diff --git a/exllama_ext/cuda_func/column_remap.cu b/exllama_ext/cuda_func/column_remap.cu
index bc3cb887..875f5896 100644
--- a/exllama_ext/cuda_func/column_remap.cu
+++ b/exllama_ext/cuda_func/column_remap.cu
@@ -1,7 +1,14 @@
 #include "column_remap.cuh"
 #include "../util.cuh"
 
+// Using 1024 makes it crash with "Memory access fault by GPU node-1 (Agent
+// handle: 0x012345678912) on address 0x012345678912. Reason: Page not present
+// or supervisor privilege."
+#if defined(USE_ROCM)
+const int SHUF_BLOCKSIZE_X = 256;
+#else
 const int SHUF_BLOCKSIZE_X = 1024;
+#endif
 const int SHUF_BLOCKSIZE_Y = 16;
 
 __global__ void column_remap_kernel
diff --git a/exllama_ext/cuda_func/half_matmul.cu b/exllama_ext/cuda_func/half_matmul.cu
index 6f8a892f..2385b9a3 100644
--- a/exllama_ext/cuda_func/half_matmul.cu
+++ b/exllama_ext/cuda_func/half_matmul.cu
@@ -2,6 +2,9 @@
 #include "../util.cuh"
 #include "../matrix.cuh"
 #include "../cuda_compat.cuh"
+#if defined(USE_ROCM)
+#include "../hip_compat.cuh"
+#endif
 
 // Block size
 
diff --git a/exllama_ext/cuda_func/half_matmul.cuh b/exllama_ext/cuda_func/half_matmul.cuh
index 0138dfad..88331253 100644
--- a/exllama_ext/cuda_func/half_matmul.cuh
+++ b/exllama_ext/cuda_func/half_matmul.cuh
@@ -6,6 +6,12 @@
 #include
 #include
 
+// Workaround for hipify_python using rocblas instead of hipblas.
+#if defined(USE_ROCM)
+#include <hipblas/hipblas.h>
+#define rocblas_handle hipblasHandle_t
+#endif
+
 void half_matmul_cuda
 (
     const half* x,
diff --git a/exllama_ext/cuda_func/q4_matmul.cu b/exllama_ext/cuda_func/q4_matmul.cu
index b791c571..ae0e3096 100644
--- a/exllama_ext/cuda_func/q4_matmul.cu
+++ b/exllama_ext/cuda_func/q4_matmul.cu
@@ -4,6 +4,9 @@
 #include "../matrix.cuh"
 #include "../cuda_compat.cuh"
 #include "../cuda_buffers.cuh"
+#if defined(USE_ROCM)
+#include "../hip_compat.cuh"
+#endif
 
 const int THREADS_X = 32;       // Block size and thread count along columns in w and out
 const int THREADS_Y = 1;        // Block size and thread count along rows in x and out
diff --git a/exllama_ext/cuda_func/q4_matmul.cuh b/exllama_ext/cuda_func/q4_matmul.cuh
index 9c12dc03..348ee92a 100644
--- a/exllama_ext/cuda_func/q4_matmul.cuh
+++ b/exllama_ext/cuda_func/q4_matmul.cuh
@@ -10,6 +10,12 @@
 #include "q4_matrix.cuh"
 #include "../tuning.h"
 
+// Workaround for hipify_python using rocblas instead of hipblas.
+#if defined(USE_ROCM)
+#include <hipblas/hipblas.h>
+#define rocblas_handle hipblasHandle_t
+#endif
+
 void q4_matmul_cuda
 (
     ExLlamaTuning* tuningParams,
diff --git a/exllama_ext/cuda_func/q4_mlp.cu b/exllama_ext/cuda_func/q4_mlp.cu
index d436c572..c16b464f 100644
--- a/exllama_ext/cuda_func/q4_mlp.cu
+++ b/exllama_ext/cuda_func/q4_mlp.cu
@@ -4,6 +4,9 @@
 #include "../cuda_buffers.cuh"
 #include "../util.cuh"
 #include "../matrix.cuh"
+#if defined(USE_ROCM)
+#include "../hip_compat.cuh"
+#endif
 
 const int THREADS_X = 32;
 const int THREADS_Y = 4;
diff --git a/exllama_ext/hip_compat.cuh b/exllama_ext/hip_compat.cuh
new file mode 100644
index 00000000..2a4fe1d0
--- /dev/null
+++ b/exllama_ext/hip_compat.cuh
@@ -0,0 +1,45 @@
+#ifndef _hip_compat_cuh
+#define _hip_compat_cuh
+
+// Workaround for a bug in hipamd, backported from upstream.
+__device__ __forceinline__ __half __compat_hrcp(__half x) {
+    return __half_raw{
+        static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
+}
+
+__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
+    return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
+                      static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+}
+
+#define hrcp __compat_hrcp
+#define h2rcp __compat_h2rcp
+
+// Workaround for hipify_python using rocblas instead of hipblas.
+__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
+                                                               hipblasOperation_t transA,
+                                                               hipblasOperation_t transB,
+                                                               int m,
+                                                               int n,
+                                                               int k,
+                                                               const half* alpha,
+                                                               const half* AP,
+                                                               int lda,
+                                                               const half* BP,
+                                                               int ldb,
+                                                               const half* beta,
+                                                               half* CP,
+                                                               int ldc) {
+    return hipblasHgemm(handle, transA, transB, m, n, k,
+                        reinterpret_cast<const hipblasHalf*>(alpha),
+                        reinterpret_cast<const hipblasHalf*>(AP), lda,
+                        reinterpret_cast<const hipblasHalf*>(BP), ldb,
+                        reinterpret_cast<const hipblasHalf*>(beta),
+                        reinterpret_cast<hipblasHalf*>(CP), ldc);
+}
+
+#define rocblas_handle hipblasHandle_t
+#define rocblas_operation_none HIPBLAS_OP_N
+#define rocblas_hgemm __compat_hipblasHgemm
+
+#endif
diff --git a/exllama_ext/util.cuh b/exllama_ext/util.cuh
index 57b06d3d..7903f0eb 100644
--- a/exllama_ext/util.cuh
+++ b/exllama_ext/util.cuh
@@ -6,7 +6,11 @@
 #include
 #include
 
+#if defined(USE_ROCM)
+#define cudaUnspecified hipErrorUnknown
+#else
 #define cudaUnspecified cudaErrorApiFailureBase
+#endif
 
 // React to failure on return code != cudaSuccess
 

From afc8b7cd4f7a5871717da51f4b9fc04fb4d78741 Mon Sep 17 00:00:00 2001
From: ardfork <134447697+ardfork@users.noreply.github.com>
Date: Tue, 6 Jun 2023 22:50:06 +0000
Subject: [PATCH 2/2] Disable half2 by default when using HIP

---
 model_init.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/model_init.py b/model_init.py
index b469d4f2..bad2075f 100644
--- a/model_init.py
+++ b/model_init.py
@@ -1,6 +1,7 @@
 from model import ExLlama, ExLlamaCache, ExLlamaConfig
 from tokenizer import ExLlamaTokenizer
 import argparse, sys, os, glob
+from torch import version as torch_version
 
 
 def add_args(parser):
@@ -23,11 +24,12 @@ def add_args(parser):
     parser.add_argument("-mmnh2", "--matmul_no_half2", action = "store_true", help = "Don't use half2 in Q4 matmul kernel")
     parser.add_argument("-snh2", "--silu_no_half2", action = "store_true", help = "Don't use half2 in SiLU kernel")
     parser.add_argument("-nh2", "--no_half2", action = "store_true", help = "(All of the above) disable half2 in all kernela")
+    parser.add_argument("-fh2", "--force_half2", action = "store_true", help = "Force enable half2 even if unsupported")
 
 
 def post_parse(args):
 
-    if args.no_half2:
+    if args.no_half2 or torch_version.hip and not args.force_half2:
         args.rmsnorm_no_half2 = True
         args.rope_no_half2 = True
         args.matmul_no_half2 = True