fix compilation
xadupre committed Jun 10, 2024
1 parent cd66c02 commit cf1e47f
Showing 4 changed files with 149 additions and 159 deletions.
124 changes: 56 additions & 68 deletions operators/cuda/add_mul.h
@@ -8,6 +8,27 @@
 
 namespace contrib {
 
+inline void _FillOutputShape3Op(std::vector<int64_t>& dimsA,
+                                std::vector<int64_t>& dimsB,
+                                std::vector<int64_t>& dimsC,
+                                std::vector<int64_t>& output_dims) {
+  auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size()));
+  while (dimsA.size() < max_rank)
+    dimsA.insert(dimsA.begin(), 1);
+  while (dimsB.size() < max_rank)
+    dimsB.insert(dimsB.begin(), 1);
+  while (dimsC.size() < max_rank)
+    dimsC.insert(dimsC.begin(), 1);
+
+  output_dims.resize(dimsA.size());
+  for (size_t i = 0; i < dimsA.size(); ++i) {
+    output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]);
+    if (output_dims[i] == 0) {
+      ORTX_CXX_API_THROW("One of the input dimensions is null.", ORT_RUNTIME_EXCEPTION);
+    }
+  }
+}
+
 template <typename T, bool addition>
 struct AddOrMulSharedInput {
   template <typename TDict>
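
Note: the new _FillOutputShape3Op helper, shared by the operators below, applies standard multidirectional broadcasting to three shapes: each shape is left-padded with 1s to a common rank, then every output dimension is the maximum of the three aligned dimensions, throwing if any dimension is zero. A standalone sketch of the same rule (the test shapes are hypothetical, not from this commit):

// Minimal re-statement of the broadcast rule used by _FillOutputShape3Op.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> BroadcastShape3(std::vector<int64_t> a,
                                     std::vector<int64_t> b,
                                     std::vector<int64_t> c) {
  size_t max_rank = std::max(a.size(), std::max(b.size(), c.size()));
  // Left-pad each shape with leading 1s up to the common rank.
  a.insert(a.begin(), max_rank - a.size(), 1);
  b.insert(b.begin(), max_rank - b.size(), 1);
  c.insert(c.begin(), max_rank - c.size(), 1);
  std::vector<int64_t> out(max_rank);
  for (size_t i = 0; i < max_rank; ++i)
    out[i] = std::max(std::max(a[i], b[i]), c[i]);
  return out;
}

int main() {
  // {2, 3, 4}, {3, 1}, {1}  ->  {2, 3, 4}
  for (int64_t d : BroadcastShape3({2, 3, 4}, {3, 1}, {1}))
    std::cout << d << ' ';
  std::cout << '\n';
  return 0;
}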
@@ -20,22 +41,19 @@ struct AddOrMulSharedInput {
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output_ab,
                      ortc::Tensor<T>& output_ac) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
 
-    T* output_data_ab = output_ab.Allocate(length_a <= length_b ? tensor_b.Shape() : tensor_a.Shape());
-    T* output_data_ac = output_ab.Allocate(length_a <= length_c ? tensor_c.Shape() : tensor_a.Shape());
-
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
+
+    T* output_data_ab = output_ab.Allocate(length_a <= length_b ? tensor_b.Shape() : tensor_a.Shape());
+    T* output_data_ac = output_ac.Allocate(length_a <= length_c ? tensor_c.Shape() : tensor_a.Shape());
+
     LaunchAddOrMulSharedInputKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                                       input_data_a, input_data_b, input_data_c,
+                                       tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                        output_data_ab, output_data_ac,
                                        length_a, length_b, length_c,
                                        addition);
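
Judging from the signature alone (the CUDA kernel body is not part of this diff), AddOrMulSharedInput fuses two element-wise operations that share input A, producing A op B and A op C in a single launch. A CPU reference of that reading, with broadcasting reduced to flat modulo indexing since the launcher only receives element counts; this is an assumption, not the actual kernel:

#include <algorithm>
#include <cstdint>

// Hypothetical CPU equivalent of LaunchAddOrMulSharedInputKernel.
template <typename T>
void AddOrMulSharedInputRef(const T* a, const T* b, const T* c,
                            T* out_ab, T* out_ac,
                            int64_t la, int64_t lb, int64_t lc,
                            bool addition) {
  // out_ab holds max(la, lb) elements and out_ac holds max(la, lc),
  // matching the Allocate calls in the host code above.
  for (int64_t i = 0, n = std::max(la, lb); i < n; ++i)
    out_ab[i] = addition ? a[i % la] + b[i % lb] : a[i % la] * b[i % lb];
  for (int64_t i = 0, n = std::max(la, lc); i < n; ++i)
    out_ac[i] = addition ? a[i % la] + c[i % lc] : a[i % la] * c[i % lc];
}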
@@ -54,25 +72,24 @@ struct AddOrMulTwice {
                      const ortc::Tensor<T>& tensor_b,
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
 
-    T* output_data_ab = output_ab.Allocate(
-        length_a <= length_b
-            ? lenght_c <= length_b ? tensor_b.Shape() : tensor_c.Shape()
-            : lenght_a <= length_b ? tensor_b.Shape()
-                                   : tensor_a.Shape());
-
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
+
+    std::vector<int64_t> dimsA = tensor_a.Shape();
+    std::vector<int64_t> dimsB = tensor_b.Shape();
+    std::vector<int64_t> dimsC = tensor_c.Shape();
+    std::vector<int64_t> output_dims;
+    _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims);
+
+    T* output_data = output.Allocate(output_dims);
+
     LaunchAddOrMulTwiceKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                                 input_data_a, input_data_b, input_data_c,
+                                 tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                  output_data,
                                  length_a, length_b, length_c,
                                  addition);
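
AddOrMulTwice now derives its output shape from _FillOutputShape3Op instead of the removed nested ternary whose misspelled lenght_c and lenght_a broke compilation. The op reads as out = (A op B) op C over the broadcast shape; a CPU sketch under the same flat-indexing assumption as the previous sketch:

#include <algorithm>
#include <cstdint>

// Hypothetical CPU equivalent of LaunchAddOrMulTwiceKernel.
template <typename T>
void AddOrMulTwiceRef(const T* a, const T* b, const T* c, T* out,
                      int64_t la, int64_t lb, int64_t lc, bool addition) {
  for (int64_t i = 0, n = std::max(la, std::max(lb, lc)); i < n; ++i) {
    T ab = addition ? a[i % la] + b[i % lb] : a[i % la] * b[i % lb];
    out[i] = addition ? ab + c[i % lc] : ab * c[i % lc];
  }
}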
@@ -84,42 +101,27 @@ template <typename T, bool addition_first>
 struct AddAndMul {
   template <typename TDict>
   OrtxStatus OnModelAttach(const TDict& dict) {
-    return {};
+    int64_t default_value = 0;
+    switchMiddelAxis_ = dict.TryToGetAttributeWithDefault("switchMiddleAxis", default_value) == 1;
   }
   OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx,
                      const ortc::Tensor<T>& tensor_a,
                      const ortc::Tensor<T>& tensor_b,
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
 
     std::vector<int64_t> dimsA = tensor_a.Shape();
     std::vector<int64_t> dimsB = tensor_b.Shape();
     std::vector<int64_t> dimsC = tensor_c.Shape();
-
-    auto max_length = std::max(length_a, std::max(length_b, length_c));
-
-    auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size()));
-    while (dimsA.size() < max_rank)
-      dimsA.insert(dimsA.begin(), 1);
-    while (dimsB.size() < max_rank)
-      dimsB.insert(dimsB.begin(), 1);
-    while (dimsC.size() < max_rank)
-      dimsC.insert(dimsC.begin(), 1);
-
-    std::vector<int64_t> output_dims(dimsA.size());
-    for (size_t i = 0; i < dimsA.size(); ++i) {
-      output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]);
-    }
+    std::vector<int64_t> output_dims;
+    _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims);
 
     if (switchMiddelAxis_) {
       if (output_dims.size() != 4) {
@@ -130,15 +132,16 @@ struct AddAndMul {
       int64_t d2 = output_dims[output_dims.size() - 3];
       output_dims[1] = d3;
       output_dims[2] = d2;
+      T* output_data = output.Allocate(output_dims);
       LaunchAddAndMulSwitchMiddleAxesKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                                               input_data_a, input_data_b, input_data_c,
+                                               tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                                output_data,
                                                length_a, length_b, length_c,
                                                addition_first, d2, d3, d4);
     } else {
-      T* output_data_ab = output_ab.Allocate(output_dims);
+      T* output_data = output.Allocate(output_dims);
       LaunchAddAndMulKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                               input_data_a, input_data_b, input_data_c,
+                               tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                                output_data,
                                length_a, length_b, length_c,
                                addition_first);
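
AddAndMul combines the two element-wise operations: addition_first selects between (A + B) * C and A * B + C. When the switchMiddleAxis attribute is set and the broadcast output is 4-D, dimensions 1 and 2 of the output shape are swapped before allocation, and the kernel receives d2/d3/d4 to write the transposed layout. A sketch of the plain, non-transposed path, under the same flat-indexing assumption as above:

#include <algorithm>
#include <cstdint>

// Hypothetical CPU equivalent of LaunchAddAndMulKernel.
template <typename T>
void AddAndMulRef(const T* a, const T* b, const T* c, T* out,
                  int64_t la, int64_t lb, int64_t lc, bool addition_first) {
  for (int64_t i = 0, n = std::max(la, std::max(lb, lc)); i < n; ++i) {
    T va = a[i % la], vb = b[i % lb], vc = c[i % lc];
    out[i] = addition_first ? (va + vb) * vc : va * vb + vc;
  }
}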
@@ -154,46 +157,31 @@ template <typename T, bool subtract_first>
 struct SubAndMul {
   template <typename TDict>
   OrtxStatus OnModelAttach(const TDict& dict) {
-    return {};
+    // int64_t default_value = 0;
+    // negative_ = dict.TryToGetAttributeWithDefault("negative", default_value) == 1;
+    negative_ = false;
   }
   OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx,
                      const ortc::Tensor<T>& tensor_a,
                      const ortc::Tensor<T>& tensor_b,
                      const ortc::Tensor<T>& tensor_c,
                      ortc::Tensor<T>& output) const {
-    const T* input_data_a = tensor_a.Data();
-    const T* input_data_b = tensor_b.Data();
-    const T* input_data_c = tensor_c.Data();
-
     auto length_a = tensor_a.NumberOfElement();
     auto length_b = tensor_b.NumberOfElement();
     auto length_c = tensor_c.NumberOfElement();
-    if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) {
+    if (0 == length_a || 0 == length_b || 0 == length_c) {
       return {};
     }
 
     std::vector<int64_t> dimsA = tensor_a.Shape();
     std::vector<int64_t> dimsB = tensor_b.Shape();
     std::vector<int64_t> dimsC = tensor_c.Shape();
+    std::vector<int64_t> output_dims;
+    _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims);
+    T* output_data = output.Allocate(output_dims);
 
-    auto max_length = std::max(length_a, std::max(length_b, length_c));
-
-    auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size()));
-    while (dimsA.size() < max_rank)
-      dimsA.insert(dimsA.begin(), 1);
-    while (dimsB.size() < max_rank)
-      dimsB.insert(dimsB.begin(), 1);
-    while (dimsC.size() < max_rank)
-      dimsC.insert(dimsC.begin(), 1);
-
-    std::vector<int64_t> output_dims(dimsA.size());
-    for (size_t i = 0; i < dimsA.size(); ++i) {
-      output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]);
-    }
-
-    T* output_data_ab = output_ab.Allocate(output_dims);
     LaunchSubAndMulKernel<T>(reinterpret_cast<cudaStream_t>(ctx->GetCudaStream()),
-                             input_data_a, input_data_b, input_data_c,
+                             tensor_a.Data(), tensor_b.Data(), tensor_c.Data(),
                              output_data,
                              length_a, length_b, length_c,
                              subtract_first, negative_);
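
SubAndMul mirrors AddAndMul with subtraction: subtract_first selects between (A - B) * C and A * B - C. The negative attribute is disabled in this commit (its parsing is commented out and negative_ is hard-wired to false); presumably it negates the result when enabled, though that is a guess from the name alone. A sketch under the same assumptions:

#include <algorithm>
#include <cstdint>

// Hypothetical CPU equivalent of LaunchSubAndMulKernel; the meaning of
// 'negative' is inferred from its name only.
template <typename T>
void SubAndMulRef(const T* a, const T* b, const T* c, T* out,
                  int64_t la, int64_t lb, int64_t lc,
                  bool subtract_first, bool negative) {
  for (int64_t i = 0, n = std::max(la, std::max(lb, lc)); i < n; ++i) {
    T va = a[i % la], vb = b[i % lb], vc = c[i % lc];
    T r = subtract_first ? (va - vb) * vc : va * vb - vc;
    out[i] = negative ? -r : r;
  }
}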