Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[Intel Mkl] Parallel BiasAddGrad op with eigen intra thread pool #26426

Merged
36 changes: 14 additions & 22 deletions tensorflow/core/kernels/bias_op.cc
Expand Up @@ -18,14 +18,14 @@ limitations under the License.
#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/bias_op.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/redux_functor.h"
#include "tensorflow/core/util/tensor_format.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/bias_op_gpu.h"
Expand Down Expand Up @@ -140,10 +140,10 @@ class BiasOp : public BinaryOp<T> {
Eigen::DSizes<int32, 3> three_dims(1, channel, 1);
Eigen::DSizes<int32, 3> broad_cast_dims(batch, 1, height);
const Device& d = context->eigen_device<Device>();
output->tensor<T, 3>().device(d) =
input.tensor<T, 3>() + bias.tensor<T, 1>()
.reshape(three_dims)
.broadcast(broad_cast_dims);
output->tensor<T, 3>().device(d) = input.tensor<T, 3>() +
bias.tensor<T, 1>()
.reshape(three_dims)
.broadcast(broad_cast_dims);
} break;
case 4: {
Eigen::DSizes<int32, 4> four_dims(1, channel, 1, 1);
Expand Down Expand Up @@ -251,9 +251,8 @@ class BiasGradOp : public OpKernel {
output_backprop.shape().DebugString()));

OP_REQUIRES(
context,
FastBoundsCheck(output_backprop.NumElements(),
std::numeric_limits<int32>::max()),
context, FastBoundsCheck(output_backprop.NumElements(),
std::numeric_limits<int32>::max()),
errors::InvalidArgument("BiasGrad requires tensor size <= int32 max"));

int32 batch, height, width, depth, channel;
Expand All @@ -270,24 +269,17 @@ class BiasGradOp : public OpKernel {
output->template flat<T>().setZero();
} else {
// Added by intel_tf to support NCHW on CPU regardless of MKL used or not.
using AccumT = typename AccumulatorType<T>::type;
if (data_format_ == FORMAT_NCHW) {
const functor::ReduceMiddleDimensions<
T, AccumT, Eigen::internal::scalar_sum_op<AccumT>,
Eigen::internal::SumReducer<T>>
redux;
Eigen::DSizes<Eigen::Index, 3> three_dims(batch, channel,
height * width * depth);
#ifdef EIGEN_HAS_INDEX_LIST
using idx0 = Eigen::type2index<0>;
using idx2 = Eigen::type2index<2>;
Eigen::IndexList<idx0, idx2> reduction_axes;
#else
Eigen::array<Eigen::Index, 2> reduction_axes = {0, 2};
#endif
output->template flat<T>().device(context->eigen_device<Device>()) =
output_backprop.flat<T>()
.template cast<typename AccumulatorType<T>::type>()
.reshape(three_dims)
.sum(reduction_axes)
.template cast<T>(); // End of code by intel_tf.
redux(context->eigen_device<Device>(), three_dims, output_backprop,
output, 1);
} else {
using AccumT = typename AccumulatorType<T>::type;
const functor::ReduceOuterDimensions<
T, AccumT, Eigen::internal::scalar_sum_op<AccumT>>
redux;
Expand Down