Addn mklml kernel #12691

Merged: 9 commits, Sep 8, 2017
2 changes: 2 additions & 0 deletions tensorflow/core/BUILD
@@ -798,6 +798,7 @@ cc_library(
"//tensorflow/core/kernels:mkl_relu_op",
"//tensorflow/core/kernels:mkl_reshape_op",
"//tensorflow/core/kernels:mkl_tfconv_op",
"//tensorflow/core/kernels:mkl_aggregate_ops",
]),
)

@@ -2479,6 +2480,7 @@ tf_cc_test_mkl(
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:scope",
"//tensorflow/cc:sendrecv_ops",
"//tensorflow/core/kernels:mkl_aggregate_ops",
"//tensorflow/core/kernels:mkl_concat_op",
"//tensorflow/core/kernels:mkl_conv_op",
"//tensorflow/core/kernels:mkl_cwise_ops_common",
32 changes: 32 additions & 0 deletions tensorflow/core/graph/mkl_layout_pass.cc
@@ -256,6 +256,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
public:
MklLayoutRewritePass() {
// NOTE: names are alphabetically sorted.
csinfo_.addn = "AddN";
csinfo_.avg_pool = "AvgPool";
csinfo_.avg_pool_grad = "AvgPoolGrad";
csinfo_.bias_add = "BiasAdd";
@@ -294,6 +295,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
// End - element-wise ops. See note above.

// NOTE: names are alphabetically sorted.
rinfo_.push_back({csinfo_.addn,
mkl_op_registry::GetMklOpName(csinfo_.addn),
CopyAttrsAddN, AddNRewrite, nullptr});
rinfo_.push_back({csinfo_.add,
mkl_op_registry::GetMklOpName(csinfo_.add),
CopyAttrsDataType, AlwaysRewrite, nullptr});
@@ -453,6 +456,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
/// Structure to store all constant strings
/// NOTE: names are alphabetically sorted.
typedef struct {
string addn;
string add;
string avg_pool;
string avg_pool_grad;
@@ -624,6 +628,19 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
return false;
}

static bool AddNRewrite(const Node* n, const ContextInfo* c) {
CHECK_NOTNULL(n);

int num;
CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true);

// Rewrite only two-input AddN nodes; the MKL sum primitive in the
// _MklAddN kernel is created for exactly two operands.
if (num == 2) {
return true;
}

return false;
}

// Checks whether the BiasAddGrad node 'n' is associated with the
// Conv2DWithBias node specified in contextinfo 'ci'. Updates fwd_node to
// point to the Conv2DWithBias node if 'n' is associated with one.
@@ -927,6 +944,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
// We need operator-specific function to copy attributes because the framework
// does not provide any generic function for it.
// NOTE: names are alphabetically sorted.
static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
@@ -1475,6 +1493,20 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
}

void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
NodeBuilder* nb) {
DataType T;
int N;

// Get all attributes from old node.
TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));

// Add attributes to new node.
nb->Attr("T", T);
nb->Attr("N", N);
}

void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
NodeBuilder* nb) {
DataType T;
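Taken together, the layout-pass changes above register AddN for rewriting into _MklAddN, gated on the node's N attribute. The standalone sketch below illustrates that gating; it is a hedged illustration rather than code from the PR, with the "_Mkl" prefix following mkl_op_registry::GetMklOpName and the two-input check mirroring AddNRewrite.

#include <cassert>
#include <string>

// Stand-in for mkl_op_registry::GetMklOpName, which prefixes "_Mkl".
static std::string GetMklOpName(const std::string& name) {
  return "_Mkl" + name;
}

// Mirrors AddNRewrite: only two-input AddN nodes are rewritten, because the
// MKL sum primitive in mkl_aggregate_ops.cc is created for two operands.
static bool ShouldRewriteAddN(int n_attr) { return n_attr == 2; }

int main() {
  assert(GetMklOpName("AddN") == "_MklAddN");
  assert(ShouldRewriteAddN(2));   // AddN(a, b)    -> _MklAddN
  assert(!ShouldRewriteAddN(3));  // AddN(a, b, c) -> left on the default kernel
  return 0;
}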
8 changes: 8 additions & 0 deletions tensorflow/core/kernels/BUILD
@@ -5559,6 +5559,14 @@ tf_mkl_kernel_library(
],
)

tf_mkl_kernel_library(
name = "mkl_aggregate_ops",
prefix = "mkl_aggregate_ops",
deps = MATH_DEPS + [
"//third_party/mkl:intel_binary_blob",
],
)

tf_mkl_kernel_library(
name = "mkl_concat_op",
prefix = "mkl_concat_op",
273 changes: 273 additions & 0 deletions tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -0,0 +1,273 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#ifdef INTEL_MKL
#define EIGEN_USE_THREADS

#include <numeric>

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

#include "mkl_dnn.h"
#include "mkl_dnn_types.h"
#include "tensorflow/core/util/mkl_util.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;

template <typename Device, typename T>
class MklAddNOp : public OpKernel {
public:
explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}

void Compute(OpKernelContext* ctx) override {
const int num = ctx->num_inputs();
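// Each data tensor is paired with a second input carrying its MKL shape
// metadata, so num_inputs() counts twice the number of logical inputs.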
OP_REQUIRES(ctx, num / 2 == 2,
errors::InvalidArgument("Only addition of two arguments is "
"supported by MKL. Num inputs: ",
num));

MklAddNOpContext mkl_context;
const Tensor& input0 = MklGetInput(ctx, 0);
GetMklShape(ctx, 0, &(mkl_context.input1_shape));
bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor();

const Tensor& input1 = MklGetInput(ctx, 1);
GetMklShape(ctx, 1, &(mkl_context.input2_shape));
bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();

// Inputs to AddN must have identical shapes, so the dimension count can be
// taken from either input; prefer the MKL shape metadata when the second
// input carries it.
mkl_context.in_dims = input2_in_mkl_format
? mkl_context.input2_shape.GetDimension()
: input1.dims();
// Generate size, stride for input if input is in MKL format.
ExtractMklOpParams(&mkl_context.in1_sizes,
&mkl_context.in1_strides, input0, &mkl_context.input1_shape);
ExtractMklOpParams(&mkl_context.in2_sizes,
&mkl_context.in2_strides, input1, &mkl_context.input2_shape);

std::vector<float> coeff(2, 1.0);
mkl_context.MklCreateInputLayouts(ctx);
// Create the MKL sum primitive: out = 1.0 * input0 + 1.0 * input1.
CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2,
mkl_context.lt_input1, &coeff[0]),
E_SUCCESS);

Tensor mkl_tmp_input1_buf_tensor, mkl_tmp_input2_buf_tensor;
mkl_context.MklPrepareAddNInputs(ctx, &mkl_tmp_input1_buf_tensor,
&mkl_tmp_input2_buf_tensor);
Tensor* output = nullptr;
if (input1_in_mkl_format || input2_in_mkl_format) {
TensorShape tf_shape;
mkl_context.output_shape.SetMklTensor(true);
mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);

mkl_context.output_shape.SetTfLayout(
mkl_context.in_dims, mkl_context.in1_sizes, mkl_context.in1_strides);
if (input1_in_mkl_format) {
mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
mkl_context.input1_shape.GetTfToMklDimMap());
} else {
mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
mkl_context.input2_shape.GetTfToMklDimMap());
}
tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
mkl_context.output_shape.GetMklLayout())) /
sizeof(T));

AllocateOutputSetMklShape(ctx, 0, &output, tf_shape,
mkl_context.output_shape);
} else {
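// Neither input carries an MKL layout, so allocate a plain TF-format output.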
const TensorShape& o_shape = input1.shape();
mkl_context.output_shape.SetMklTensor(false);
AllocateOutputSetMklShape(ctx, 0, &output, o_shape,
mkl_context.output_shape);
}

mkl_context.Eltwise_res[dnnResourceDst] =
static_cast<void*>(output->flat<T>().data());

// Execute the MKL sum primitive.
CHECK_EQ(dnnExecute_F32(mkl_context.Eltwise, mkl_context.Eltwise_res),
E_SUCCESS);

mkl_context.MklCleanup();
}

void ExtractMklOpParams(size_t** out_sizes, size_t** out_strides,
const Tensor& input, const MklShape* input_shape) {
bool input_in_mkl_format = input_shape->IsMklTensor();
int in_dims = input_in_mkl_format
? input_shape->GetDimension()
: input.dims();
size_t* in_sizes = new size_t[in_dims];
size_t* in_strides = new size_t[in_dims];

if (input_in_mkl_format) {
for (int i = 0; i < in_dims; i++) {
in_sizes[i] = input_shape->GetSizes()[i];
in_strides[i] = input_shape->GetStrides()[i];
}
} else {
// MKL-DNN dimension 0 is the innermost TF dimension, so the TF shape is
// reversed; e.g. a TF shape [2, 3, 4] yields sizes {4, 3, 2} and
// strides {1, 4, 12}.
for (int i = 0; i < in_dims; i++) {
in_sizes[i] = input.dim_size((in_dims - 1) - i);
}
in_strides[0] = 1;
for (int i = 1; i < in_dims; i++) {
in_strides[i] = in_strides[i - 1] * in_sizes[i - 1];
}
}
*out_sizes = in_sizes;
*out_strides = in_strides;
}

private:
typedef struct {
int in_dims;
size_t* in1_sizes;
size_t* in1_strides;

size_t* in2_sizes;
size_t* in2_strides;
dnnPrimitive_t Eltwise = nullptr;
dnnPrimitiveAttributes_t attributes = nullptr;
void* Eltwise_res[dnnResourceNumber];
dnnLayout_t lt_input1 = nullptr, lt_input2 = nullptr;
MklShape input1_shape, input2_shape, output_shape;

void MklCreateInputLayouts(OpKernelContext* context) {
bool input1_in_mkl_format = input1_shape.IsMklTensor();
if (!input1_in_mkl_format) {
CHECK_EQ(
dnnLayoutCreate_F32(&lt_input1, in_dims, in1_sizes, in1_strides),
E_SUCCESS);
} else {
lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
}

bool input2_in_mkl_format = input2_shape.IsMklTensor();
if (!input2_in_mkl_format) {
CHECK_EQ(
dnnLayoutCreate_F32(&lt_input2, in_dims, in2_sizes, in2_strides),
E_SUCCESS);
} else {
lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
}
}

void MklPrepareAddNInputs(OpKernelContext* context,
Tensor* mkl_tmp_input1_buf_tensor,
Tensor* mkl_tmp_input2_buf_tensor) {
bool mkl_convert_input1, mkl_convert_input2;
dnnPrimitive_t mkl_prim_convert_input1 = nullptr,
mkl_prim_convert_input2 = nullptr;
dnnLayout_t mkl_lt_internal_input1 = nullptr,
mkl_lt_internal_input2 = nullptr;
void *mkl_buf_convert_input1 = nullptr, *mkl_buf_convert_input2 = nullptr;
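// The sum primitive consumes its operands at consecutive resource slots
// starting at dnnResourceMultipleSrc, so the second operand lives at
// dnnResourceMultipleSrc + 1.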
dnnResourceType_t dnnResourceMultipleSrc2 =
(dnnResourceType_t)(dnnResourceMultipleSrc + 1);
// Compare with internal layouts and convert if needed
const Tensor& input1 = MklGetInput(context, 0);

void* mkl_buf_input1 =
const_cast<void*>(static_cast<const void*>(input1.flat<T>().data()));

CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&mkl_lt_internal_input1, Eltwise, dnnResourceMultipleSrc),
E_SUCCESS);
mkl_convert_input1 =
!dnnLayoutCompare_F32(mkl_lt_internal_input1, lt_input1);
if (mkl_convert_input1) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input1, lt_input1,
mkl_lt_internal_input1),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input1_buf_tensor,
mkl_lt_internal_input1, &mkl_buf_convert_input1);
CHECK_EQ(
dnnConversionExecute_F32(mkl_prim_convert_input1, mkl_buf_input1,
mkl_buf_convert_input1),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_input1);
}
dnnLayoutDelete_F32(mkl_lt_internal_input1);

Eltwise_res[dnnResourceMultipleSrc] =
(mkl_convert_input1) ? mkl_buf_convert_input1 : mkl_buf_input1;

const Tensor& input2 = MklGetInput(context, 1);
void* mkl_buf_input2 =
const_cast<void*>(static_cast<const void*>(input2.flat<T>().data()));
CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
&mkl_lt_internal_input2, Eltwise, dnnResourceMultipleSrc2),
E_SUCCESS);
mkl_convert_input2 =
!dnnLayoutCompare_F32(mkl_lt_internal_input2, lt_input2);
if (mkl_convert_input2) {
CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input2, lt_input2,
mkl_lt_internal_input2),
E_SUCCESS);
AllocTmpBuffer(context, mkl_tmp_input2_buf_tensor,
mkl_lt_internal_input2, &mkl_buf_convert_input2);
CHECK_EQ(
dnnConversionExecute_F32(mkl_prim_convert_input2, mkl_buf_input2,
mkl_buf_convert_input2),
E_SUCCESS);
dnnDelete_F32(mkl_prim_convert_input2);
}
dnnLayoutDelete_F32(mkl_lt_internal_input2);

Eltwise_res[dnnResourceMultipleSrc2] =
(mkl_convert_input2) ? mkl_buf_convert_input2 : mkl_buf_input2;
}

void MklCleanup() {
bool input1_in_mkl_format = input1_shape.IsMklTensor();
bool input2_in_mkl_format = input2_shape.IsMklTensor();
dnnDelete_F32(Eltwise);
if (!input1_in_mkl_format) {
dnnLayoutDelete_F32(lt_input1);
}
if (!input2_in_mkl_format) {
dnnLayoutDelete_F32(lt_input2);
}
// ExtractMklOpParams heap-allocates the size/stride arrays for both MKL
// and TF layouts, so they are freed unconditionally.
delete[] in1_sizes;
delete[] in1_strides;
delete[] in2_sizes;
delete[] in2_strides;
}
} MklAddNOpContext;
};

// Register _MklAddN for float on CPU; the kMklOpLabel label restricts this
// kernel to nodes rewritten by the MKL layout pass.
#define REGISTER_MKL_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("_MklAddN") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklOpLabel), \
MklAddNOp<CPUDevice, T>);

TF_CALL_float(REGISTER_MKL_CPU);
#undef REGISTER_MKL_CPU
} // namespace tensorflow
#endif // INTEL_MKL
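For context, a two-input AddN graph can be built with the TensorFlow C++ ops API as sketched below; in a build with INTEL_MKL defined, the layout pass rewrites that node to _MklAddN and the kernel above executes it. This is a minimal sketch under those assumptions, not code from the PR.

#include <vector>

#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"

int main() {
  using namespace tensorflow;

  Scope root = Scope::NewRootScope();
  auto a = ops::Const(root, {{1.0f, 2.0f}, {3.0f, 4.0f}});
  auto b = ops::Const(root, {{10.0f, 20.0f}, {30.0f, 40.0f}});
  // N == 2 is the only arity AddNRewrite accepts for the MKL rewrite.
  auto sum = ops::AddN(root, std::initializer_list<Input>{a, b});

  ClientSession session(root);
  std::vector<Tensor> outputs;
  TF_CHECK_OK(session.Run({sum}, &outputs));
  // outputs[0] holds [[11, 22], [33, 44]].
  return 0;
}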