diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h
index a23e07b8..4995642e 100644
--- a/operators/cuda/add_mul.h
+++ b/operators/cuda/add_mul.h
@@ -8,6 +8,27 @@ namespace contrib {
+inline void _FillOutputShape3Op(std::vector<int64_t>& dimsA,
+                                std::vector<int64_t>& dimsB,
+                                std::vector<int64_t>& dimsC,
+                                std::vector<int64_t>& output_dims) {
+  auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size()));
+  while (dimsA.size() < max_rank)
+    dimsA.insert(dimsA.begin(), 1);
+  while (dimsB.size() < max_rank)
+    dimsB.insert(dimsB.begin(), 1);
+  while (dimsC.size() < max_rank)
+    dimsC.insert(dimsC.begin(), 1);
+
+  output_dims.resize(dimsA.size());
+  for (size_t i = 0; i < dimsA.size(); ++i) {
+    output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]);
+    if (output_dims[i] == 0) {
+      ORTX_CXX_API_THROW("One of the input dimensions is null.", ORT_RUNTIME_EXCEPTION);
+    }
+  }
+}
+
 template <typename T>
 struct AddOrMulSharedInput {
   template <typename TDict>
@@ -20,22 +41,19 @@ struct AddOrMulSharedInput {
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output_ab,
                      ortc::Tensor<T>& output_ac) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
-    T* output_data_ab = output_ab.Allocate(length_a <= length_b ? tensor_b.Shape() : tensor_a.Shape());
-    T* output_data_ac = output_ab.Allocate(length_a <= length_c ? tensor_c.Shape() : tensor_a.Shape());
-
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
+
+    T* output_data_ab = output_ab.Allocate(length_a <= length_b ? tensor_b.Shape() : tensor_a.Shape());
+    T* output_data_ac = output_ac.Allocate(length_a <= length_c ? tensor_c.Shape() : tensor_a.Shape());
+
     LaunchAddOrMulSharedInputKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                                       input_data_a, input_data_b, input_data_c,
+                                       tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                        output_data_ab, output_data_ac,
                                        length_a, length_b, length_c,
                                        addition);
@@ -54,25 +72,24 @@ struct AddOrMulTwice {
                      const ortc::Tensor<T>& tensor_b,
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
-    T* output_data_ab = output_ab.Allocate(
-        length_a <= length_b
-            ? lenght_c <= length_b ? tensor_b.Shape() : tensor_c.Shape()
-            : lenght_a <= length_b ? tensor_b.Shape()
-                                   : tensor_a.Shape());
-
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
+
+    std::vector<int64_t> dimsA = tensor_a.Shape();
+    std::vector<int64_t> dimsB = tensor_b.Shape();
+    std::vector<int64_t> dimsC = tensor_c.Shape();
+    std::vector<int64_t> output_dims;
+    _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims);
+
+    T* output_data = output.Allocate(output_dims);
+
     LaunchAddOrMulTwiceKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                                 input_data_a, input_data_b, input_data_c,
+                                 tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                  output_data,
                                  length_a, length_b, length_c,
                                  addition);
@@ -84,42 +101,27 @@ template <typename T>
 struct AddAndMul {
   template <typename TDict>
   OrtxStatus OnModelAttach(const TDict& dict) {
-    return {};
+    int64_t default_value = 0;
+    switchMiddelAxis_ = dict.TryToGetAttributeWithDefault("switchMiddleAxis", default_value) == 1;
+    return {};
   }
 
   OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx,
                      const ortc::Tensor<T>& tensor_a,
                      const ortc::Tensor<T>& tensor_b,
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
 
     std::vector<int64_t> dimsA = tensor_a.Shape();
     std::vector<int64_t> dimsB = tensor_b.Shape();
     std::vector<int64_t> dimsC = tensor_c.Shape();
-
-    auto max_length = std::max(length_a, std::max(length_b, length_c));
-
-    auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size()));
-    while (dimsA.size() < max_rank)
-      dimsA.insert(dimsA.begin(), 1);
-    while (dimsB.size() < max_rank)
-      dimsB.insert(dimsB.begin(), 1);
-    while (dimsC.size() < max_rank)
-      dimsC.insert(dimsC.begin(), 1);
-
-    std::vector<int64_t> output_dims(dimsA.size());
-    for (size_t i = 0; i < dimsA.size(); ++i) {
-      output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]);
-    }
+    std::vector<int64_t> output_dims;
+    _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims);
 
     if (switchMiddelAxis_) {
       if (output_dims.size() != 4) {
@@ -130,15 +132,16 @@ struct AddAndMul {
       int64_t d2 = output_dims[output_dims.size() - 3];
       output_dims[1] = d3;
       output_dims[2] = d2;
+      T* output_data = output.Allocate(output_dims);
       LaunchAddAndMulSwitchMiddleAxesKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                                               input_data_a, input_data_b, input_data_c,
+                                               tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                                output_data,
                                                length_a, length_b, length_c,
                                                addition_first, d2, d3, d4);
     } else {
-      T* output_data_ab = output_ab.Allocate(output_dims);
+      T* output_data = output.Allocate(output_dims);
       LaunchAddAndMulKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                               input_data_a, input_data_b, input_data_c,
+                               tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                output_data,
                                length_a, length_b, length_c,
                                addition_first);
@@ -154,46 +157,31 @@ template <typename T>
 struct SubAndMul {
   template <typename TDict>
   OrtxStatus OnModelAttach(const TDict& dict) {
-    return {};
+    //int64_t default_value = 0;
+    //negative_ = dict.TryToGetAttributeWithDefault("negative", default_value) == 1;
+    negative_ = false;
+    return {};
   }
 
   OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx,
                      const ortc::Tensor<T>& tensor_a,
                      const ortc::Tensor<T>& tensor_b,
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
 
     std::vector<int64_t> dimsA = tensor_a.Shape();
     std::vector<int64_t> dimsB = tensor_b.Shape();
     std::vector<int64_t> dimsC = tensor_c.Shape();
+    std::vector<int64_t> output_dims;
+    _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims);
+    T* output_data = output.Allocate(output_dims);
 
-    auto max_length = std::max(length_a, std::max(length_b, length_c));
-
-    auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size()));
-    while (dimsA.size() < max_rank)
-      dimsA.insert(dimsA.begin(), 1);
-    while (dimsB.size() < max_rank)
-      dimsB.insert(dimsB.begin(), 1);
-    while (dimsC.size() < max_rank)
-      dimsC.insert(dimsC.begin(), 1);
-
-    std::vector<int64_t> output_dims(dimsA.size());
-    for (size_t i = 0; i < dimsA.size(); ++i) {
-      output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]);
-    }
-
-    T* output_data_ab = output_ab.Allocate(output_dims);
     LaunchSubAndMulKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                             input_data_a, input_data_b, input_data_c,
+                             tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                              output_data,
                              length_a, length_b, length_c,
                              subtract_first, negative_);
diff --git a/operators/cuda/add_mul_impl.cu b/operators/cuda/add_mul_impl.cu
index 14bf73b6..20919c46 100644
--- a/operators/cuda/add_mul_impl.cu
+++ b/operators/cuda/add_mul_impl.cu
@@ -169,15 +169,15 @@ struct Add3Op {
 };
 
 template <typename T, typename TFunc, int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void AddMulTwiceKernel(T* output, const T* pA, const T* pB,
-                                  const T* pC, CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC,
-                                  CUDA_LONG N, const TFunc func) {
+__global__ void AddMulTwiceKernel(T* output_data, const T* pA, const T* pB, const T* pC,
+                                  CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N,
+                                  const TFunc func) {
   CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
   CUDA_LONG id = start;
 #pragma unroll
   for (int i = 0; i < NumElementsPerThread; i++) {
     if (id < N) {
-      func(output_ab, pA[id % nA], pB[id % nB], pC[id % nC]);
+      func(output_data + id, pA[id % nA], pB[id % nB], pC[id % nC]);
       id += NumThreadsPerBlock;
     }
   }
@@ -206,36 +206,39 @@ cudaError_t _LaunchAddOrMulTwiceKernel(cudaStream_t stream,
             reinterpret_cast<T*>(output), reinterpret_cast<const T*>(pA),
             reinterpret_cast<const T*>(pB), reinterpret_cast<const T*>(pC),
             static_cast<CUDA_LONG>(countA), static_cast<CUDA_LONG>(countB), static_cast<CUDA_LONG>(countC),
-            static_cast<CUDA_LONG>(max_count), Add3SharedOp<T>());
+            static_cast<CUDA_LONG>(max_count), Add3Op<T>());
   } else {
     AddMulTwiceKernel<T, Mul3Op<T>, num_threads_per_block, num_elements_per_thread>
         <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
             reinterpret_cast<T*>(output), reinterpret_cast<const T*>(pA),
             reinterpret_cast<const T*>(pB), reinterpret_cast<const T*>(pC),
             static_cast<CUDA_LONG>(countA), static_cast<CUDA_LONG>(countB), static_cast<CUDA_LONG>(countC),
-            static_cast<CUDA_LONG>(max_count), Mul3SharedOp<T>());
+            static_cast<CUDA_LONG>(max_count), Mul3Op<T>());
   }
   return cudaGetLastError();
 }
 
 template <>
-cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream,
-                                            const float* input_a, const float* input_b, const float* input_c,
-                                            float* output,
-                                            int64_t length_a, int64_t length_b, int64_t length_c, bool addition) {
-  return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c,
-                                          output,
-                                          length_a, length_b, length_c, addition);
+cudaError_t LaunchAddOrMulTwiceKernel(cudaStream_t stream,
+                                      const float* input_a, const float* input_b, const float* input_c,
+                                      float* output,
+                                      int64_t length_a, int64_t length_b, int64_t length_c,
+                                      bool addition) {
+  return _LaunchAddOrMulTwiceKernel(stream, input_a, input_b, input_c,
+                                    output,
+                                    length_a, length_b, length_c, addition);
 }
 
 template <>
-cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream,
-                                            const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c,
-                                            ortc::MFloat16* output,
-                                            int64_t length_a, int64_t length_b, int64_t length_c, bool addition) {
-  return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c,
-                                          output,
-                                          length_a, length_b, length_c, addition);
+cudaError_t LaunchAddOrMulTwiceKernel(cudaStream_t stream,
+                                      const ortc::MFloat16* input_a, const ortc::MFloat16* input_b,
+                                      const ortc::MFloat16* input_c,
+                                      ortc::MFloat16* output,
+                                      int64_t length_a, int64_t length_b, int64_t length_c,
+                                      bool addition) {
+  return _LaunchAddOrMulTwiceKernel(stream, input_a, input_b, input_c,
+                                    output,
+                                    length_a, length_b, length_c, addition);
 }
 
 __device__ __forceinline__ void _addmul_op(float* address, const float a, const float b,
@@ -281,7 +284,7 @@ struct MulAdd {
 };
 
 template <typename T, typename TFunc, int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void _AddAndMulKernel(T* output_data, const T* pA, const T* pB, const T* pC,
+__global__ void AddAndMulKernel(T* output_data, const T* pA, const T* pB, const T* pC,
                                  CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N,
                                  const TFunc func) {
   CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
@@ -296,7 +299,7 @@ __global__ void _AddAndMulKernel(T* output_data, const T* pA, const T* pB, const
 }
 
 template <typename T, typename TFunc, int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void _AddAndMulSwitchMiddleAxesKernel(T* output_data, const T* pA, const T* pB,
+__global__ void AddAndMulSwitchMiddleAxesKernel(T* output_data, const T* pA, const T* pB,
                                                  const T* pC, CUDA_LONG nA, CUDA_LONG nB,
                                                  CUDA_LONG nC, CUDA_LONG N, const TFunc func,
                                                  CUDA_LONG d2,
@@ -337,23 +340,27 @@ cudaError_t _LaunchAddAndMulKernel(cudaStream_t stream,
   if (addition_first) {
     AddAndMulKernel<T, AddMul<T>, num_threads_per_block, num_elements_per_thread>
         <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-            cuda_stream,
             reinterpret_cast<T*>(output),
             reinterpret_cast<const T*>(pA),
             reinterpret_cast<const T*>(pB),
             reinterpret_cast<const T*>(pC),
-            countA, countB, countC,
-            max_size, AddMul<T>());
+            static_cast<CUDA_LONG>(countA),
+            static_cast<CUDA_LONG>(countB),
+            static_cast<CUDA_LONG>(countC),
+            static_cast<CUDA_LONG>(max_count),
+            AddMul<T>());
   } else {
     AddAndMulKernel<T, MulAdd<T>, num_threads_per_block, num_elements_per_thread>
         <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-            cuda_stream,
             reinterpret_cast<T*>(output),
             reinterpret_cast<const T*>(pA),
             reinterpret_cast<const T*>(pB),
             reinterpret_cast<const T*>(pC),
-            countA, countB, countC,
-            max_size, MulAdd<T>());
+            static_cast<CUDA_LONG>(countA),
+            static_cast<CUDA_LONG>(countB),
+            static_cast<CUDA_LONG>(countC),
+            static_cast<CUDA_LONG>(max_count),
+            MulAdd<T>());
   }
   return cudaGetLastError();
 }
@@ -361,8 +368,8 @@ cudaError_t _LaunchAddAndMulKernel(cudaStream_t stream,
 template <>
 cudaError_t LaunchAddAndMulKernel(cudaStream_t stream,
                                   const float* input_a, const float* input_b, const float* input_c,
                                   float* output, int64_t length_a, int64_t length_b, int64_t length_c,
-                                  bool addition) {
-  return _LaunchAddAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, addition_first);
+                                  bool addition_first) {
+  return _LaunchAddAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, addition_first);
 }
 
 template <>
@@ -370,8 +377,8 @@ cudaError_t LaunchAddAndMulKernel(cudaStream_t stream,
                                   const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c,
                                   ortc::MFloat16* output, int64_t length_a, int64_t length_b, int64_t length_c,
-                                  bool addition) {
-  return _LaunchAddAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, addition_first);
+                                  bool addition_first) {
+  return _LaunchAddAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, addition_first);
 }
 
 template <typename T>
@@ -395,23 +402,27 @@ cudaError_t _LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
   if (addition_first) {
     AddAndMulSwitchMiddleAxesKernel<T, AddMul<T>, num_threads_per_block, num_elements_per_thread>
         <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-            cuda_stream,
             reinterpret_cast<T*>(output),
             reinterpret_cast<const T*>(pA),
             reinterpret_cast<const T*>(pB),
             reinterpret_cast<const T*>(pC),
-            countA, countB, countC,
-            max_size, AddMul<T>());
+            static_cast<CUDA_LONG>(countA),
+            static_cast<CUDA_LONG>(countB),
+            static_cast<CUDA_LONG>(countC),
+            static_cast<CUDA_LONG>(max_count),
+            AddMul<T>(), d2, d3, d4);
   } else {
     AddAndMulSwitchMiddleAxesKernel<T, MulAdd<T>, num_threads_per_block, num_elements_per_thread>
         <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-            cuda_stream,
             reinterpret_cast<T*>(output),
             reinterpret_cast<const T*>(pA),
             reinterpret_cast<const T*>(pB),
             reinterpret_cast<const T*>(pC),
-            countA, countB, countC,
-            max_size, MulAdd<T>());
+            static_cast<CUDA_LONG>(countA),
+            static_cast<CUDA_LONG>(countB),
+            static_cast<CUDA_LONG>(countC),
+            static_cast<CUDA_LONG>(max_count),
+            MulAdd<T>(), d2, d3, d4);
   }
   return cudaGetLastError();
 }
@@ -419,9 +430,9 @@ cudaError_t _LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
 template <>
 cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
                                                   const float* input_a, const float* input_b, const float* input_c,
                                                   float* output, int64_t length_a, int64_t length_b, int64_t length_c,
-                                                  bool addition,
+                                                  bool addition_first,
                                                   int64_t d2, int64_t d3, int64_t d4) {
-  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, pA, pB, pC, output, countA, countB, countC,
+  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c,
                                                 addition_first, d2, d3, d4);
 }
 
 template <>
@@ -429,9 +440,9 @@ cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
                                                   const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c,
                                                   ortc::MFloat16* output, int64_t length_a, int64_t length_b, int64_t length_c,
-                                                  bool addition,
+                                                  bool addition_first,
                                                   int64_t d2, int64_t d3, int64_t d4) {
-  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, pA, pB, pC, output, countA, countB, countC,
+  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c,
                                                 addition_first, d2, d3, d4);
 }
@@ -519,27 +530,12 @@ struct MulSubNeg {
   }
 };
 
-template <typename T, typename TFunc, int NumThreadsPerBlock, int NumElementsPerThread>
-__global__ void _MulSubKernel(T* output_data, const T* pA, const T* pB, const T* pC,
-                              CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N,
-                              const TFunc func) {
-  CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-  CUDA_LONG id = start;
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      func(output_data + id, pA[id % nA], pB[id % nB], pC[id % nC]);
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
 template <typename T>
 cudaError_t _LaunchSubAndMulKernel(cudaStream_t stream,
                                    const T* pA, const T* pB, const T* pC,
                                    T* output,
                                    int64_t countA, int64_t countB, int64_t countC,
-                                   bool addition_first) {
+                                   bool addition_first, bool negative) {
   int64_t max_count = std::max(std::max(countA, countB), countC);
   if (max_count == 0)  // special case where there's a dim value of 0 in the output shape
     return cudaGetLastError();
@@ -554,47 +550,55 @@ cudaError_t _LaunchSubAndMulKernel(cudaStream_t stream,
 
   if (addition_first) {
     if (negative) {
-      SubAndMulKernel<T, SubMulNeg<T>, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel<T, SubMulNeg<T>, num_threads_per_block, num_elements_per_thread>
          <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-              cuda_stream,
              reinterpret_cast<T*>(output),
              reinterpret_cast<const T*>(pA),
              reinterpret_cast<const T*>(pB),
              reinterpret_cast<const T*>(pC),
-              countA, countB, countC,
-              max_size, SubMulNEg<T>());
+              static_cast<CUDA_LONG>(countA),
+              static_cast<CUDA_LONG>(countB),
+              static_cast<CUDA_LONG>(countC),
+              static_cast<CUDA_LONG>(max_count),
+              SubMulNeg<T>());
     } else {
-      SubAndMulKernel<T, SubMul<T>, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel<T, SubMul<T>, num_threads_per_block, num_elements_per_thread>
          <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-              cuda_stream,
              reinterpret_cast<T*>(output),
              reinterpret_cast<const T*>(pA),
              reinterpret_cast<const T*>(pB),
              reinterpret_cast<const T*>(pC),
-              countA, countB, countC,
-              max_size, SubMul<T>());
+              static_cast<CUDA_LONG>(countA),
+              static_cast<CUDA_LONG>(countB),
+              static_cast<CUDA_LONG>(countC),
+              static_cast<CUDA_LONG>(max_count),
+              SubMul<T>());
    }
  } else {
    if (negative) {
-      SubAndMulKernel<T, MulSubNeg<T>, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel<T, MulSubNeg<T>, num_threads_per_block, num_elements_per_thread>
          <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-              cuda_stream,
              reinterpret_cast<T*>(output),
              reinterpret_cast<const T*>(pA),
              reinterpret_cast<const T*>(pB),
              reinterpret_cast<const T*>(pC),
-              countA, countB, countC,
-              max_size, MulSubNeg<T>());
+              static_cast<CUDA_LONG>(countA),
+              static_cast<CUDA_LONG>(countB),
+              static_cast<CUDA_LONG>(countC),
+              static_cast<CUDA_LONG>(max_count),
+              MulSubNeg<T>());
    } else {
-      SubAndMulKernel<T, MulSub<T>, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel<T, MulSub<T>, num_threads_per_block, num_elements_per_thread>
          <<<blocks_per_grid, num_threads_per_block, 0, stream>>>(
-              cuda_stream,
              reinterpret_cast<T*>(output),
              reinterpret_cast<const T*>(pA),
              reinterpret_cast<const T*>(pB),
              reinterpret_cast<const T*>(pC),
-              countA, countB, countC,
-              max_size, MulSub<T>());
+              static_cast<CUDA_LONG>(countA),
+              static_cast<CUDA_LONG>(countB),
+              static_cast<CUDA_LONG>(countC),
+              static_cast<CUDA_LONG>(max_count),
+              MulSub<T>());
    }
  }
  return cudaGetLastError();
@@ -604,7 +608,7 @@ template <>
 cudaError_t LaunchSubAndMulKernel(cudaStream_t stream,
                                   const float* input_a, const float* input_b, const float* input_c,
                                   float* output, int64_t length_a, int64_t length_b, int64_t length_c,
                                   bool subtract_first, bool negative) {
-  return _LaunchSubAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, subtract_first, negative);
+  return _LaunchSubAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, subtract_first, negative);
 }
 
 template <>
@@ -612,6 +616,6 @@ cudaError_t LaunchSubAndMulKernel(cudaStream_t stream,
                                   const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c,
                                   ortc::MFloat16* output, int64_t length_a, int64_t length_b, int64_t length_c,
-                                  bool subtract_first, negative) {
-  return _LaunchSubAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, subtract_first, negative);
+                                  bool subtract_first, bool negative) {
+  return _LaunchSubAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, subtract_first, negative);
 }
diff --git a/operators/cuda/cuda_ops.cc b/operators/cuda/cuda_ops.cc
index 21feec2e..0587fc6f 100644
--- a/operators/cuda/cuda_ops.cc
+++ b/operators/cuda/cuda_ops.cc
@@ -29,38 +29,38 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& {
   using AddTwiceFloat16Type = typename contrib::AddOrMulTwice<ortc::MFloat16, true>;
   using MulTwiceFloat16Type = typename contrib::AddOrMulTwice<ortc::MFloat16, false>;
 
-  using AddAndMulFloat32Type = typename contrib::AddAndMul<float, true>;
-  using MulAndAddFloat32Type = typename contrib::AddAndMul<float, false>;
+  using AddAndMulFloat16Type = typename contrib::AddAndMul<ortc::MFloat16, true>;
+  using MulAndAddFloat16Type = typename contrib::AddAndMul<ortc::MFloat16, false>;
 
-  using SubAndMulFloat32Type = typename contrib::SubAndMul<float, true>;
-  using MulAndSubFloat32Type = typename contrib::SubAndMul<float, false>;
+  using SubAndMulFloat16Type = typename contrib::SubAndMul<ortc::MFloat16, true>;
+  using MulAndSubFloat16Type = typename contrib::SubAndMul<ortc::MFloat16, false>;
 #endif
 
   static OrtOpLoader op_loader(
       []() { return nullptr; }
 #ifdef USE_CUDA
       ,
+      CustomCudaStructV2("AddAdd", AddTwiceFloat32Type),
CustomCudaStructV2("AddMul", AddAndMulFloat32Type), CustomCudaStructV2("AddSharedInput", AddSharedInputFloat32Type), - CustomCudaStructV2("AddTwice", AddTwiceFloat32Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulAdd", MulAndAddFloat32Type), + CustomCudaStructV2("MulMul", MulTwiceFloat32Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type), CustomCudaStructV2("MulSub", MulAndSubFloat32Type), - CustomCudaStructV2("MulTwice", MulTwiceFloat32Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), CustomCudaStructV2("SubMul", SubAndMulFloat32Type), #if ORT_API_VERSION >= 16 + CustomCudaStructV2("AddAdd", AddTwiceFloat16Type), CustomCudaStructV2("AddMul", AddAndMulFloat16Type), CustomCudaStructV2("AddSharedInput", AddSharedInputFloat16Type), - CustomCudaStructV2("AddTwice", AddTwiceFloat16Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulAdd", MulAndAddFloat16Type), + CustomCudaStructV2("MulMul", MulTwiceFloat16Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type), CustomCudaStructV2("MulSub", MulAndSubFloat16Type), - CustomCudaStructV2("MulTwice", MulTwiceFloat16Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), CustomCudaStructV2("SubMul", SubAndMulFloat16Type) #endif diff --git a/test/cuda/test_cudaops.py b/test/cuda/test_cudaops.py index ca94646f..ab7b1f4a 100644 --- a/test/cuda/test_cudaops.py +++ b/test/cuda/test_cudaops.py @@ -71,8 +71,8 @@ def _addaddmulmul_cuda(self, itype, op_type, broad=False): shapey = (3, 2, 3) shapez = (1, 2, 3) if broad else (3, 2, 3) x = (np.arange(np.prod(shapex)) + 1).reshape(shapex).astype(dtype) - y = (np.arange(np.prod(shapey)) + 1).reshape(shapey).astype(dtype) - z = (np.arange(np.prod(shapez)) + 1).reshape(shapez).astype(dtype) + y = (np.arange(np.prod(shapey)) + 10).reshape(shapey).astype(dtype) + z = (np.arange(np.prod(shapez)) + 100).reshape(shapez).astype(dtype) feeds1 = dict(X=x, Y=y, Z=z) ref = ReferenceEvaluator(model1) @@ -228,8 +228,6 @@ def test_cuda_negxplus1(self): self._negxplus1_cuda(TensorProto.FLOAT16) def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, 2, 3), shapec=(3, 2, 3)): - from ai.onnx.contrib import get_ort_ext_libs - model1 = helper.make_model( helper.make_graph( [ @@ -289,7 +287,7 @@ def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, expected = ref.run(None, feeds1) opts = _ort.SessionOptions() - opts.register_custom_ops_library(get_ort_ext_libs()[0]) + opts.register_custom_ops_library(_get_library_path()) sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"]) got = sess.run(None, feeds1) for i in range(2): @@ -445,4 +443,4 @@ def test_mulsub_cuda_negative(self): if __name__ == "__main__": - unittest.main() + unittest.main(verbosity=2)