diff --git a/operators/cuda/cuda_ops.cc b/operators/cuda/cuda_ops.cc index 25913b8b..135afb3f 100644 --- a/operators/cuda/cuda_ops.cc +++ b/operators/cuda/cuda_ops.cc @@ -6,6 +6,7 @@ #ifdef USE_CUDA #include "cuda/add_mul.h" #include "cuda/fast_gelu.h" +#include "cuda/mul_sigmoid.h" #include "cuda/negxplus1.h" #include "cuda/scatter_nd_of_shape.h" #include "cuda/transpose_cast.h" @@ -32,6 +33,8 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MaskedScatterNDOfShape", contrib::MaskedScatterNDOfShape), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type), + CustomCudaStructV2("MulMulSigmoid", contrib::MulMulSigmoid), + CustomCudaStructV2("MulSigmoid", contrib::MulSigmoid), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), CustomCudaStructV2("ScatterNDOfShape", contrib::ScatterNDOfShape), #if ORT_API_VERSION >= 16 @@ -41,6 +44,8 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MaskedScatterNDOfShape", contrib::MaskedScatterNDOfShape), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type), + CustomCudaStructV2("MulMulSigmoid", contrib::MulMulSigmoid), + CustomCudaStructV2("MulSigmoid", contrib::MulSigmoid), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), CustomCudaStructV2("ScatterNDOfShape", contrib::ScatterNDOfShape), CustomCudaStructV2("Transpose2DCastFP16", Transpose2DCastFloat32ToFloat16Type), diff --git a/operators/cuda/mul_sigmoid.h b/operators/cuda/mul_sigmoid.h new file mode 100644 index 00000000..a6ddcb74 --- /dev/null +++ b/operators/cuda/mul_sigmoid.h @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "ocos.h" +#include "mul_sigmoid_impl.cuh" +#include "ortx_common.h" + +namespace contrib { + +/** +* MulSigmoid(X) = X * Sigmoid(X) + +No shape broadcasting supported. +*/ +template +struct MulSigmoid { + template + OrtxStatus OnModelAttach(const TDict& /*dict*/) { + return {}; + } + OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx, + const ortc::Tensor& input, + ortc::Tensor& output) const { + const T* input_data = input.Data(); + T* output_data = output.Allocate(input.Shape()); + auto input_length = input.NumberOfElement(); + if (0 == input_length) { + return {}; + } + LaunchMulSigmoidKernel(reinterpret_cast(ctx->GetCudaStream()), + input_length, + input_data, + output_data); + return {}; + } +}; + +/** +* MulSigmoid(X, Y) = X * Y * Sigmoid(Y) + +No shape broadcasting supported. +*/ +template +struct MulMulSigmoid { + template + OrtxStatus OnModelAttach(const TDict& /*dict*/) { + return {}; + } + OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx, + const ortc::Tensor& input_x, + const ortc::Tensor& input_y, + ortc::Tensor& output) const { + const T* input_data_x = input_x.Data(); + const T* input_data_y = input_y.Data(); + auto input_length_x = input_x.NumberOfElement(); + auto input_length_y = input_y.NumberOfElement(); + if (0 == input_length_x || 0 == input_data_y) { + return {}; + } + T* output_data = output.Allocate(input_length_x > input_length_y ? input_x.Shape() : input_y.Shape()); + LaunchMulMulSigmoidKernel(reinterpret_cast(ctx->GetCudaStream()), + input_length_x, + input_length_y, + input_data_x, + input_data_y, + output_data); + return {}; + } +}; + +} // namespace contrib diff --git a/operators/cuda/mul_sigmoid_impl.cu b/operators/cuda/mul_sigmoid_impl.cu new file mode 100644 index 00000000..c2c14fe0 --- /dev/null +++ b/operators/cuda/mul_sigmoid_impl.cu @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "device_prop.cuh" +#include "utils.cuh" +#include "mul_sigmoid_impl.cuh" +#include "cuda_type.h" + +#ifndef CUDA_LONG +#define CUDA_LONG int32_t +#endif + +using namespace Ort::Custom; + +template __device__ __inline__ T _exp_typed(const T x); + +template <> __device__ __inline__ float _exp_typed(const float x) { return expf(x); } + +#if __CUDA_ARCH__ < 700 +template <> __device__ __inline__ half _exp_typed(const half x) { + return __float2half(expf(__half2float(x))); +} +#else +template <> __device__ __inline__ half _exp_typed(const half x) { return hexp(x); } +#endif + +template __device__ __inline__ T sigmoid(const T a) { + return a > T(0) ? (T)1 / ((T)1. + _exp_typed(-a)) + : (T)1 - (T)1 / ((T)1 + _exp_typed(a)); +} + +#if __CUDA_ARCH__ < 700 +template <> __device__ __inline__ half sigmoid(const half a) { + return __float2half(sigmoid(__half2float(a))); +} +#endif + +template __device__ __inline__ T mul_sigmoid(const T a) { return a * sigmoid(a); } + +#if __CUDA_ARCH__ < 700 +template <> __device__ __inline__ half mul_sigmoid(const half a) { + float x = __half2float(a); + return __float2half(x * sigmoid(x)); +} +#endif + +template __device__ __inline__ T mul_mul_sigmoid(const T x, const T y) { + return x * y * sigmoid(y); +} + +#if __CUDA_ARCH__ < 700 +template <> __device__ __inline__ half mul_mul_sigmoid(const half x, const half y) { + float hy = __half2float(y); + return __float2half(__half2float(x) * hy * sigmoid(hy)); +} +#endif + +template +__global__ void MulSigmoidKernel(T *output_data, const T *input_data, CUDA_LONG N) { + CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= N) + return; + output_data[id] = mul_sigmoid(input_data[id]); +} + +template +__global__ void MulMulSigmoidKernel(T *output_data, const T *px, const T *py, CUDA_LONG N, + CUDA_LONG Nx, CUDA_LONG Ny) { + CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= N) + return; + output_data[id] = mul_mul_sigmoid(px[id % Nx], py[id % Ny]); +} + +template +cudaError_t _LaunchMulSigmoidKernel(cudaStream_t stream, int input_length, const T* input, T* output) { + constexpr int blockSize = 256; + const int gridSize = (input_length + blockSize - 1) / blockSize; + using TT = typename contrib::CudaT::MappedType; + MulSigmoidKernel<<>>(reinterpret_cast(output), reinterpret_cast(input), input_length); + return cudaGetLastError(); +} + +template <> +cudaError_t LaunchMulSigmoidKernel(cudaStream_t stream, int input_length, const float* input, float* output) { + return _LaunchMulSigmoidKernel(stream, input_length, input, output); +} + +template <> +cudaError_t LaunchMulSigmoidKernel(cudaStream_t stream, int input_length, const ortc::MFloat16* input, ortc::MFloat16* output) { + return _LaunchMulSigmoidKernel(stream, input_length, input, output); +} + +template +cudaError_t _LaunchMulMulSigmoidKernel(cudaStream_t stream, int input_length_x, int input_length_y, + const T* input_data_x, const T* input_data_y, T* output) { + int input_length = std::max(input_length_x, input_length_y); + constexpr int blockSize = 256; + const int gridSize = (input_length + blockSize - 1) / blockSize; + using TT = typename contrib::CudaT::MappedType; + MulMulSigmoidKernel<<>>(reinterpret_cast(output), + reinterpret_cast(input_data_x), + reinterpret_cast(input_data_y), + input_length, input_length_x, input_length_y); + return cudaGetLastError(); +} + +template <> +cudaError_t LaunchMulMulSigmoidKernel(cudaStream_t stream, int input_length_x, int input_length_y, + const float* input_data_x, const float* input_data_y, float* output) { + return _LaunchMulMulSigmoidKernel(stream, input_length_x, input_length_y, input_data_x, input_data_y, output); +} + +template <> +cudaError_t LaunchMulMulSigmoidKernel(cudaStream_t stream, int input_length_x, int input_length_y, + const ortc::MFloat16* input_data_x, const ortc::MFloat16* input_data_y, + ortc::MFloat16* output) { + return _LaunchMulMulSigmoidKernel(stream, input_length_x, input_length_y, input_data_x, input_data_y, output); +} diff --git a/operators/cuda/mul_sigmoid_impl.cuh b/operators/cuda/mul_sigmoid_impl.cuh new file mode 100644 index 00000000..661c5a04 --- /dev/null +++ b/operators/cuda/mul_sigmoid_impl.cuh @@ -0,0 +1,13 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include + +template +cudaError_t LaunchMulSigmoidKernel(cudaStream_t stream, int input_length, const T* input, T* output); + +template +cudaError_t LaunchMulMulSigmoidKernel(cudaStream_t stream, int input_length_x, int input_length_y, + const T* input_data_x, const T* input_data_y, T* output); diff --git a/test/cuda/test_cudaops.py b/test/cuda/test_cudaops.py index c5c8dc6e..db40f612 100644 --- a/test/cuda/test_cudaops.py +++ b/test/cuda/test_cudaops.py @@ -1,6 +1,6 @@ import unittest import numpy as np -from numpy.testing import assert_almost_equal +from numpy.testing import assert_almost_equal, assert_allclose from onnx import helper, numpy_helper, onnx_pb as onnx_proto, TensorProto from onnx.reference import ReferenceEvaluator from onnx.reference.op_run import OpRun @@ -128,6 +128,131 @@ def test_cuda_fastgelu_f16(self): else: print("CUDAExecutionProvider not available, test_cuda_fastgelu_f16 skipped.") + def _mulmulsigmoid_cuda(self, itype, broad=False, atol=1e-5, rtol=1e-3): + model1 = helper.make_model( + helper.make_graph( + [ + helper.make_node("Mul", ["X", "Y"], ["xy"]), + helper.make_node("Sigmoid", ["Y"], ["sy"]), + helper.make_node("Mul", ["xy", "sy"], ["final"]), + ], + "nd", + [ + helper.make_tensor_value_info("X", itype, [None, None, None]), + helper.make_tensor_value_info("Y", itype, [None, None, None]), + ], + [helper.make_tensor_value_info("final", itype, [None, None, None])], + ), + opset_imports=[helper.make_opsetid("", 18)], + ir_version=9, + ) + + model2 = helper.make_model( + helper.make_graph( + [ + helper.make_node( + "MulMulSigmoid", + ["X", "Y"], + ["final"], + domain="ai.onnx.contrib", + ) + ], + "nd", + [ + helper.make_tensor_value_info("X", itype, [None, None, None]), + helper.make_tensor_value_info("Y", itype, [None, None, None]), + ], + [helper.make_tensor_value_info("final", itype, [None, None, None])], + ), + opset_imports=[ + helper.make_opsetid("", 18), + helper.make_opsetid("ai.onnx.contrib", 1), + ], + ir_version=9, + ) + + dtype = np.float32 if itype == TensorProto.FLOAT else np.float16 + shapex = (1, 2, 3) if broad else (3, 2, 3) + shapey = (3, 2, 3) + x = (np.arange(np.prod(shapex)) + 1).reshape(shapex).astype(dtype) + y = (np.arange(np.prod(shapey)) + 2).reshape(shapey).astype(dtype) + x /= x.size + y /= y.size + + feeds1 = dict(X=x, Y=y) + ref = ReferenceEvaluator(model1) + expected = ref.run(None, feeds1)[0] + + opts = _ort.SessionOptions() + opts.register_custom_ops_library(_get_library_path()) + sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"]) + got = sess.run(None, feeds1)[0] + assert_allclose(expected, got, atol=atol, rtol=rtol) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_mulmulsigmoid_cuda(self): + self._mulmulsigmoid_cuda(TensorProto.FLOAT) + self._mulmulsigmoid_cuda(TensorProto.FLOAT16) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_mulmulsigmoid_cuda_broadcast(self): + self._mulmulsigmoid_cuda(TensorProto.FLOAT, True) + self._mulmulsigmoid_cuda(TensorProto.FLOAT16, True) + + def _mul_sigmoid_cuda(self, itype): + model1 = helper.make_model( + helper.make_graph( + [ + helper.make_node("Sigmoid", ["X"], ["sx"]), + helper.make_node("Mul", ["X", "sx"], ["Y"]), + ], + "nd", + [helper.make_tensor_value_info("X", itype, [None, None, None])], + [helper.make_tensor_value_info("Y", itype, [None, None, None])], + ), + opset_imports=[helper.make_opsetid("", 18)], + ir_version=9, + ) + + model2 = helper.make_model( + helper.make_graph( + [ + helper.make_node( + "MulSigmoid", + ["X"], + ["Y"], + domain="ai.onnx.contrib", + ) + ], + "nd", + [helper.make_tensor_value_info("X", itype, [None, None, None])], + [helper.make_tensor_value_info("Y", itype, [None, None, None])], + ), + opset_imports=[ + helper.make_opsetid("", 18), + helper.make_opsetid("ai.onnx.contrib", 1), + ], + ir_version=9, + ) + + dtype = np.float32 if itype == TensorProto.FLOAT else np.float16 + x = (np.arange(18) + 1).reshape((3, 2, 3)).astype(dtype) + + feeds1 = dict(X=x) + ref = ReferenceEvaluator(model1) + expected = ref.run(None, feeds1)[0] + + opts = _ort.SessionOptions() + opts.register_custom_ops_library(_get_library_path()) + sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"]) + got = sess.run(None, feeds1)[0] + assert_allclose(expected, got, atol=1e-5 if itype == TensorProto.FLOAT else 1e-2) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_mul_sigmoid_cuda(self): + self._mul_sigmoid_cuda(TensorProto.FLOAT) + self._mul_sigmoid_cuda(TensorProto.FLOAT16) + def _negxplus1_cuda(self, itype): dtype = np.float32 if itype == TensorProto.FLOAT else np.float16 model1 = helper.make_model(