[Intel MKL] support MKL Quantized Matmul With Bias and Requantize Op #33926
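
This change adds a fused QuantizedMatMulWithBiasAndRequantize op (plus its _Mkl variant): a quint8 matrix is multiplied by a qint8 matrix, a float or qint32 bias is added, and the int32 accumulator is requantized back to quint8 using the frozen output range. For reference only, here is a minimal sketch of that arithmetic on the qint32-bias path, mirroring the hand calculations in the new unit test; the function name and layout assumptions (row-major, no transpose) are illustrative and not part of the PR:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Illustrative reference (not part of the PR): quantized matmul with a
// qint32 bias, followed by requantization of the int32 accumulator to quint8.
std::vector<uint8_t> QuantizedMatMulBiasRequantizeRef(
    const std::vector<uint8_t>& a,     // m x k, quint8
    const std::vector<int8_t>& b,      // k x n, qint8
    const std::vector<int32_t>& bias,  // n,     qint32
    int m, int k, int n, float requant_scale) {
  std::vector<uint8_t> out(m * n);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      int32_t acc = bias[j];
      for (int p = 0; p < k; ++p) {
        acc += static_cast<int32_t>(a[i * k + p]) *
               static_cast<int32_t>(b[p * n + j]);
      }
      // Requantize; for the test below, requant_scale = 2^31 / 255 / 2^23 ~= 1.00392.
      float scaled = std::round(static_cast<float>(acc) * requant_scale);
      out[i * n + j] =
          static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, scaled)));
    }
  }
  return out;
}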

8 changes: 8 additions & 0 deletions tensorflow/core/graph/mkl_layout_pass.cc
@@ -328,6 +328,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
"QuantizedMatMulWithBiasAndRelu";
csinfo_.quantized_matmul_with_bias_and_relu_and_requantize =
"QuantizedMatMulWithBiasAndReluAndRequantize";
csinfo_.quantized_matmul_with_bias_and_requantize =
"QuantizedMatMulWithBiasAndRequantize";
csinfo_.quantized_depthwise_conv2d = "QuantizedDepthwiseConv2D";
csinfo_.quantized_depthwise_conv2d_with_bias =
"QuantizedDepthwiseConv2DWithBias";
@@ -621,6 +623,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
mkl_op_registry::GetMklOpName(
csinfo_.quantized_matmul_with_bias_and_relu_and_requantize),
CopyAttrsQuantizedMatMulWithBias, AlwaysRewrite});
rinfo_.push_back({csinfo_.quantized_matmul_with_bias_and_requantize,
mkl_op_registry::GetMklOpName(
csinfo_.quantized_matmul_with_bias_and_requantize),
CopyAttrsQuantizedMatMulWithBias, AlwaysRewrite});
rinfo_.push_back(
{csinfo_.quantized_depthwise_conv2d,
mkl_op_registry::GetMklOpName(csinfo_.quantized_depthwise_conv2d),
@@ -955,6 +961,7 @@ rinfo_.push_back({csinfo_.tanh_grad,
string quantized_matmul_with_bias;
string quantized_matmul_with_bias_and_relu;
string quantized_matmul_with_bias_and_relu_and_requantize;
string quantized_matmul_with_bias_and_requantize;
string quantized_depthwise_conv2d;
string quantized_depthwise_conv2d_with_bias;
string quantized_depthwise_conv2d_with_bias_and_relu;
@@ -2218,6 +2225,7 @@ Status MklLayoutRewritePass::SetUpInputs(
"QuantizedConv2DWithBiasSumAndReluAndRequantize",
"QuantizedConv2DWithBiasSignedSumAndReluAndRequantize",
"QuantizedMatMulWithBias",
"QuantizedMatMulWithBiasAndRequantize",
"QuantizedMatMulWithBiasAndRelu",
"QuantizedMatMulWithBiasAndReluAndRequantize",
"QuantizedDepthwiseConv2D",
32 changes: 32 additions & 0 deletions tensorflow/core/kernels/mkl_qmatmul_op.cc
@@ -614,6 +614,17 @@ REGISTER_KERNEL_BUILDER(Name("QuantizedMatMulWithBiasAndReluAndRequantize")
.TypeConstraint<quint8>("Toutput"),
NoOp);

// Register NoOp kernel for QuantizedMatMulWithBiasAndRequantize
// to get a python interface. This kernel will be replaced by an MKL kernel
// during graph-optimization pass.
REGISTER_KERNEL_BUILDER(Name("QuantizedMatMulWithBiasAndRequantize")
.Device(DEVICE_CPU)
.TypeConstraint<quint8>("T1")
.TypeConstraint<qint8>("T2")
.TypeConstraint("Tbias", {DT_QINT32, DT_FLOAT})
.TypeConstraint<quint8>("Toutput"),
NoOp);

// Register a templatized implementation of _MklQuantizedMatMulWithBiasAndRelu.
REGISTER_KERNEL_BUILDER(
Name("_MklQuantizedMatMulWithBiasAndRelu")
@@ -644,6 +655,27 @@ REGISTER_KERNEL_BUILDER(
.Label(mkl_op_registry::kMklQuantizedOpLabel),
MklDnnQuantizedMatMulReluOp<CPUDevice, quint8, qint8, float, quint8>);

// Register a templatized implementation of
// _MklQuantizedMatMulWithBiasAndRequantize.
REGISTER_KERNEL_BUILDER(
Name("_MklQuantizedMatMulWithBiasAndRequantize")
.Device(DEVICE_CPU)
.TypeConstraint<quint8>("T1")
.TypeConstraint<qint8>("T2")
.TypeConstraint<qint32>("Tbias")
.TypeConstraint<quint8>("Toutput")
.Label(mkl_op_registry::kMklQuantizedOpLabel),
MklDnnQuantizedMatMulOp<CPUDevice, quint8, qint8, qint32, quint8>);
REGISTER_KERNEL_BUILDER(
Name("_MklQuantizedMatMulWithBiasAndRequantize")
.Device(DEVICE_CPU)
.TypeConstraint<quint8>("T1")
.TypeConstraint<qint8>("T2")
.TypeConstraint<float>("Tbias")
.TypeConstraint<quint8>("Toutput")
.Label(mkl_op_registry::kMklQuantizedOpLabel),
MklDnnQuantizedMatMulOp<CPUDevice, quint8, qint8, float, quint8>);

} // namespace tensorflow

#endif // INTEL_MKL
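
For orientation only: the NoOp registration above exists so the public op has a CPU kernel (and hence a Python interface), while the MKL layout pass rewrites the node to the MKL-labeled kernel registered in this file. A minimal sketch of the assumed name mapping, assuming mkl_op_registry::GetMklOpName simply prepends the "_Mkl" prefix as it is used in mkl_layout_pass.cc above:

#include <iostream>
#include <string>

int main() {
  const std::string public_op = "QuantizedMatMulWithBiasAndRequantize";
  // Assumed behavior of mkl_op_registry::GetMklOpName(): prepend "_Mkl".
  const std::string mkl_op = "_Mkl" + public_op;
  // The rewritten node also carries kMklQuantizedOpLabel ("QuantizedMklOp"),
  // so kernel lookup selects the MKL registrations above.
  std::cout << public_op << " -> " << mkl_op << std::endl;
  return 0;
}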
93 changes: 93 additions & 0 deletions tensorflow/core/kernels/mkl_qmatmul_op_test.cc
@@ -300,6 +300,99 @@ TEST_F(QuantizedMatMulTest, Small_WithNegInp) {
test::ExpectTensorEqual<qint32>(expected, output_quantized);
}

// Two small matrices A of type uint8 and B of type int8 are multiplied,
// an int32 bias is added to the result, and the output is requantized.
TEST_F(QuantizedMatMulTest, Small_withBiasAndReq) {
TF_ASSERT_OK(NodeDefBuilder("quantized_mat_mul_op",
"_MklQuantizedMatMulWithBiasAndRequantize")
.Input(FakeInput(DT_QUINT8))
.Input(FakeInput(DT_QINT8))
.Input(FakeInput(DT_QINT32))
.Input(FakeInput(DT_FLOAT))
.Input(FakeInput(DT_FLOAT))
.Input(FakeInput(DT_FLOAT))
.Input(FakeInput(DT_FLOAT))
.Input(FakeInput(DT_FLOAT))
.Input(FakeInput(DT_FLOAT))
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Input(FakeInput(DT_UINT8)) // MKL second tensor
.Attr("Toutput", DataTypeToEnum<quint8>::v())
.Attr("T", DataTypeToEnum<quint8>::v())
.Attr("_kernel", "QuantizedMklOp")
.Finalize(node_def()));
TF_ASSERT_OK(InitOp());
// A matrix is:
// | 1 | 2 | 3 |
// | 4 | 5 | 6 |
AddInputFromArray<quint8>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
// B matrix is:
// | 7 | 8 | 9 | 10 |
// | 11 | 12 | 13 | 14 |
// | 15 | 16 | 17 | 18 |
AddInputFromArray<qint8>(TensorShape({3, 4}),
{7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
AddInputFromArray<qint32>(TensorShape({4}), {10, -20, 30, -40});
AddInputFromArray<float>(TensorShape({1}), {0});
AddInputFromArray<float>(TensorShape({1}), {255.0f});
AddInputFromArray<float>(TensorShape({1}), {-127.0f});
AddInputFromArray<float>(TensorShape({1}), {127.0f});
AddInputFromArray<float>(TensorShape({1}), {0});
AddInputFromArray<float>(TensorShape({1}), {255.0f});
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);
AddInputFromArray<uint8>(kDummyShape, kDummyTensor);

TF_ASSERT_OK(RunOpKernel());
// Here are the results we expect, from hand calculations:
// (1 * 7) + (2 * 11) + (3 * 15) = 74
// (1 * 8) + (2 * 12) + (3 * 16) = 80
// (1 * 9) + (2 * 13) + (3 * 17) = 86
// (1 * 10) + (2 * 14) + (3 * 18) = 92
// (4 * 7) + (5 * 11) + (6 * 15) = 173
// (4 * 8) + (5 * 12) + (6 * 16) = 188
// (4 * 9) + (5 * 13) + (6 * 17) = 203
// (4 * 10) + (5 * 14) + (6 * 18) = 218
// After Bias addition
// 74+10=84, 80-20=60, 86+30=116, 92-40=52,
// 173+10=183, 188-20=168, 203+30=233, 218-40=178
// After Requantize
// requantscale = scale_int32 / scale_eightbit / static_cast<float>(1 << 23)
// requantscale = 2^31/255/2^23 ~= 1.00392
// 84 * 1.00392 ~= 84.329 ~= 84
// 60 * 1.00392 ~= 60.235 ~= 60
// 116 * 1.00392 ~= 116.454 ~= 116
// 52 * 1.00392 ~= 52.203 ~= 52
// 183 * 1.00392 ~= 183.717 ~= 184
// 168 * 1.00392 ~= 168.658 ~= 169
// 233 * 1.00392 ~= 233.913 ~= 234
// 178 * 1.00392 ~= 178.698 ~= 179

Tensor expected(allocator(), DT_QUINT8, TensorShape({2, 4}));
test::FillValues<quint8>(&expected, {84, 60, 116, 52, 184, 169, 234, 179});

const Tensor& output = *GetOutput(0);
const Tensor& mkl_shape_tensor = *GetOutput(3);
ConvMklToTF conv_comp;
Tensor output_quantized;
conv_comp.ConvertMKL2TF<quint8>(DT_QUINT8, output, mkl_shape_tensor,
output_quantized);

test::ExpectTensorEqual<quint8>(expected, output_quantized);
}

// Two small matrices A of type uint8 and B of type int8 are multiplied,
// a float bias is added to the result, and then ReLU is applied to the result.
TEST_F(QuantizedMatMulTest, Small_withBiasAndRelu) {
50 changes: 50 additions & 0 deletions tensorflow/core/ops/mkl_nn_ops.cc
@@ -974,6 +974,56 @@ REGISTER_OP("_MklQuantizedMatMulWithBiasAndReluAndRequantize")
return Status::OK();
});

REGISTER_OP("_MklQuantizedMatMulWithBiasAndRequantize")
.Input("a: T1")
.Input("b: T2")
.Input("bias: Tbias")
.Input("min_a: float")
.Input("max_a: float")
.Input("min_b: float")
.Input("max_b: float")
.Input("min_freezed_output: float")
.Input("max_freezed_output: float")
.Input("mkl_a: uint8") // MKL second tensor
.Input("mkl_b: uint8") // MKL second tensor
.Input("mkl_bias: uint8") // MKL second tensor
.Input("mkl_min_a: uint8") // MKL second tensor
.Input("mkl_max_a: uint8") // MKL second tensor
.Input("mkl_min_b: uint8") // MKL second tensor
.Input("mkl_max_b: uint8") // MKL second tensor
.Input("mkl_min_freezed_output: uint8") // MKL second tensor
.Input("mkl_max_freezed_output: uint8") // MKL second tensor
.Output("out: Toutput")
.Output("min_out: float")
.Output("max_out: float")
.Output("mkl_out: uint8") // MKL second tensor
.Output("mkl_min_out: uint8") // MKL second tensor
.Output("mkl_max_out: uint8") // MKL second tensor
.Attr("T1: quantizedtype")
.Attr("T2: quantizedtype")
.Attr("Tbias: {float, qint32}")
.Attr("T: quantizedtype") // Additional attr "T" for MklToTf conversion
.Attr("Toutput: quantizedtype = DT_QUINT8")
.Attr("transpose_a: bool = false")
.Attr("transpose_b: bool = false")
.Attr("input_quant_mode: {'MIN_FIRST', 'SCALED'} = 'MIN_FIRST'")
.Attr("is_weight_const: bool = true")
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::MatMulShape(c));
ShapeHandle unused;
TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));

c->set_output(1, c->Scalar());
c->set_output(2, c->Scalar());
return Status::OK();
});

REGISTER_OP("_MklQuantizedDepthwiseConv2D")
.Input("input: Tinput")
.Input("filter: Tfilter")
47 changes: 41 additions & 6 deletions tensorflow/core/ops/nn_ops.cc
@@ -1281,9 +1281,9 @@ Status TopKShapeFn(InferenceContext* c) {
DimensionHandle last_dim = c->Dim(input, -1);
if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
c->Value(last_dim) < c->Value(k_dim)) {
return errors::InvalidArgument(
"input must have last dimension >= k = ", c->Value(k_dim), " but is ",
c->Value(last_dim));
return errors::InvalidArgument("input must have last dimension >= k = ",
c->Value(k_dim), " but is ",
c->Value(last_dim));
}

// Replace last_dim with k_dim.
@@ -1337,9 +1337,9 @@ REGISTER_OP("NthElement")
DimensionHandle last_dim = c->Dim(input, -1);
if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
c->Value(last_dim) <= c->Value(n_dim)) {
return errors::InvalidArgument(
"Input must have last dimension > n = ", c->Value(n_dim),
" but is ", c->Value(last_dim));
return errors::InvalidArgument("Input must have last dimension > n = ",
c->Value(n_dim), " but is ",
c->Value(last_dim));
}

// Reduce last_dim for output tensor
@@ -3184,6 +3184,41 @@ REGISTER_OP("QuantizedMatMulWithBiasAndReluAndRequantize")
return Status::OK();
});

REGISTER_OP("QuantizedMatMulWithBiasAndRequantize")
.Input("a: T1")
.Input("b: T2")
.Input("bias: Tbias")
.Input("min_a: float")
.Input("max_a: float")
.Input("min_b: float")
.Input("max_b: float")
.Input("min_freezed_output: float")
.Input("max_freezed_output: float")
.Output("out: Toutput")
.Output("min_out: float")
.Output("max_out: float")
.Attr("T1: quantizedtype")
.Attr("T2: quantizedtype")
.Attr("Tbias: {float, qint32}")
.Attr("Toutput: quantizedtype = DT_QUINT8")
.Attr("transpose_a: bool = false")
.Attr("transpose_b: bool = false")
.Attr("input_quant_mode: {'MIN_FIRST', 'SCALED'} = 'MIN_FIRST'")
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::MatMulShape(c));
ShapeHandle unused;
TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
c->set_output(1, c->Scalar());
c->set_output(2, c->Scalar());
return Status::OK();
});

REGISTER_OP("QuantizedConv2DPerChannel")
.Input("input: Tinput")
.Input("filter: Tfilter")