From cedb308b430b48f16a925f851b978c18321db702 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Sun, 7 Feb 2021 18:37:09 +0900 Subject: [PATCH 01/21] Add inner_product_adj_grad and its test --- tensorflow_quantum/core/ops/math_ops/BUILD | 3 +- .../core/ops/math_ops/inner_product_op.py | 91 ++++- .../ops/math_ops/inner_product_op_test.py | 297 +++++++++++++++- .../math_ops/tfq_inner_product_adj_grad.cc | 316 ++++++++++++++++++ 4 files changed, 701 insertions(+), 6 deletions(-) create mode 100644 tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc diff --git a/tensorflow_quantum/core/ops/math_ops/BUILD b/tensorflow_quantum/core/ops/math_ops/BUILD index 5db4c7d0c..31d690423 100644 --- a/tensorflow_quantum/core/ops/math_ops/BUILD +++ b/tensorflow_quantum/core/ops/math_ops/BUILD @@ -14,6 +14,7 @@ cc_binary( name = "_tfq_math_ops.so", srcs = [ "tfq_inner_product.cc", + "tfq_inner_product_adj_grad.cc", ], copts = select({ ":windows": [ @@ -58,8 +59,8 @@ cc_binary( deps = [ "//tensorflow_quantum/core/ops:parse_context", "//tensorflow_quantum/core/ops:tfq_simulate_utils", - "//tensorflow_quantum/core/src:util_qsim", "//tensorflow_quantum/core/src:circuit_parser_qsim", + "//tensorflow_quantum/core/src:util_qsim", "@qsim//lib:qsim_lib", ], ) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index ed2fb5c43..e10a8006b 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -20,6 +20,75 @@ MATH_OP_MODULE = load_module(os.path.join("math_ops", "_tfq_math_ops.so")) +def inner_product_adj_grad(programs, symbol_names, symbol_values, + other_programs): + """Calculate the adjoint gradients of the inner product between circuits. + + Compute the gradients of the (potentially many) inner products between + the given circuits and the symbol free comparison circuits. + + Calculates out[i][j][k] = $ \frac{\langle \psi_{\text{programs[i]}} \\ + (\text{symbol_values[i]})}{\partial \text{symbol_names[k]}} | \\ + \psi_{\text{other_programs[j]}} \rangle $ + + + >>> symbols = sympy.symbols('alpha beta') + >>> qubits = cirq.GridQubit.rect(1, 2) + >>> reference_circuits = [ + ... cirq.Circuit((cirq.H**symbols[0]).on_each(qubits)), + ... cirq.Circuit( + ... cirq.X(qubits[0]) ** symbols[0], + ... cirq.Y(qubits[1]) ** symbols[1]) + ... ] + >>> other_circuits = [ + ... cirq.Circuit(cirq.X.on_each(qubits)), + ... cirq.Circuit((cirq.Y**0.125).on_each(qubits)), + ... cirq.Circuit((cirq.X**0.5).on_each(qubits)) + ... ] + >>> reference_tensor = tfq.convert_to_tensor(reference_circuits) + >>> symbol_tensor = tf.convert_to_tensor([s.name for s in symbols]) + >>> values_tensor = tf.convert_to_tensor(np.arange(4).reshape(2, 2)) + >>> other_tensor = tfq.convert_to_tensor([other_circuits, other_circuits]) + >>> grad_ip = tfq.math.inner_product_adj_grad( + ... reference_tensor, symbol_tensor, values_tensor, + ... other_tensor) + >>> grad_ip + + + Note: `other_programs` must not contain any free symbols. These can + be resolved beforehand with `tfq.resolve_parameters`. + + Note: Currently this op is not differentiable. + + Note: len(symbol_names) (=n_params) should be a positive integer. 
+ + Args: + programs: `tf.Tensor` of strings with shape [batch_size] containing + the string representations of the circuits + symbol_names: `tf.Tensor` of strings with shape [n_params], which + is used to specify the order in which the values in + `symbol_values` should be placed inside of the circuits in + `programs`. + symbol_values: `tf.Tensor` of real numbers with shape + [batch_size, n_params] specifying parameter values to resolve + into the circuits specificed by programs, following the ordering + dictated by `symbol_names`. + other_programs: `tf.Tensor` of strings with shape [batch_size, n_others] + containing the string representations of the circuits with which to + compute the overlap on `programs` with. Must not contain any free + symbols. + Returns: + `tf.Tensor` with shape [batch_size, n_others, n_symbols] where + `out[i][j][k]` is equal to the gradient w.r.t. `symbol_names[k]` of the + inner product between `programs[i]` with `symbol_values[i]` resolved in + and `other_programs[j]`. + """ + return MATH_OP_MODULE.tfq_inner_product_adj_grad( + programs, symbol_names, tf.cast(symbol_values, tf.float32), + other_programs) + + +@tf.custom_gradient def inner_product(programs, symbol_names, symbol_values, other_programs): """Calculate the inner product between circuits. @@ -61,7 +130,7 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): Note: `other_programs` must not contain any free symbols. These can be resolved beforehand with `tfq.resolve_parameters`. - Note: Currently this op is not differentiable. + Note: Currently this op is differentiable via adjoint differentiation. Args: programs: `tf.Tensor` of strings with shape [batch_size] containing @@ -82,8 +151,24 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): `tf.Tensor` with shape [batch_size, n_others] where `out[i][j]` is equal to the inner product of `programs[i]` with `symbol_values[i]` resolved in and `other_programs[i][j]`. - """ + def grad(dy): + """Calculate the gradients of this inner_product op. + + Args: + dy: `tf.Tensor` of gradients coming from the next computational op + with the shape [batch_size, n_others] + + Returns: + `tf.Tensor` with shape [batch_size, n_others, n_symbols] where + `out[i][j][k]` is equal to the gradient of the above inner product + w.r.t. `symbol_names[k]` merged with the gradient `dy` from the + next computational op. + """ + inner_prod_grad = inner_product_adj_grad( + programs, symbol_names, tf.cast(symbol_values, tf.float32), + other_programs) + return tf.einsum("bos,bo->bos", inner_prod_grad, dy) return MATH_OP_MODULE.tfq_inner_product(programs, symbol_names, tf.cast(symbol_values, tf.float32), - other_programs) + other_programs), grad diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index e63b08354..831da9737 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests that specifically target tfq_simulate_ops.""" +"""Tests that specifically target tfq_inner_product*.""" +import copy import numpy as np from absl.testing import parameterized import tensorflow as tf @@ -271,7 +272,7 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, ]) def test_correctness_without_symbols(self, n_qubits, batch_size, inner_dim_size): - """Test that inner_product works with symbols.""" + """Test that inner_product works without symbols.""" qubits = cirq.GridQubit.rect(1, n_qubits) circuit_batch, _ = \ util.random_circuit_resolver_batch( @@ -313,5 +314,297 @@ def test_correctness_empty(self): self.assertAllClose(out, expected) +class InnerProductAdjGradTest(tf.test.TestCase, parameterized.TestCase): + """Tests tfq_inner_product_adj_grad.""" + + def test_inner_product_adj_grad_inputs(self): + """Makes sure that inner_product_adj_grad fails on bad inputs.""" + n_qubits = 5 + batch_size = 5 + symbol_names = ['alpha'] + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, 3)[0] + for i in range(batch_size) + ] + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'programs must be rank 1'): + # Circuit tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor([circuit_batch]), symbol_names, + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_names must be rank 1.'): + # symbol_names tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), np.array([symbol_names]), + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + np.array([symbol_values_array]), + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too few dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[0], util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'other_programs must be rank 2.'): + # other_programs tensor has too few dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, util.convert_to_tensor(circuit_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'other_programs must be rank 2.'): + # pauli_sums tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in other_batch])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Unparseable proto'): + # circuit tensor has the right type but invalid values. 
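            # ('junk' is not a serialized cirq Program proto, so parsing
            # fails.)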
+ inner_product_op.inner_product_adj_grad(['junk'] * batch_size, symbol_names, + symbol_values_array, + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Could not find symbol in parameter map'): + # symbol_names tensor has the right type but invalid values. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), ['junk'], + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'not found in reference circuit'): + # other_programs tensor has the right type but operates on + # qubits that the reference ciruit doesn't have. + new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] + new_circuits, _ = util.random_circuit_resolver_batch( + new_qubits, batch_size) + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in new_circuits])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'not found in paired circuit'): + # other_programs tensor has the right type but operates on + # qubits that the reference ciruit doesn't have. + new_qubits = cirq.GridQubit.rect(1, n_qubits - 1) + new_circuits, _ = util.random_circuit_resolver_batch( + new_qubits, batch_size) + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in new_circuits])) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # circuits tensor has the wrong type. + inner_product_op.inner_product_adj_grad([1.0] * batch_size, symbol_names, + symbol_values_array, + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # symbol_names tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), [0.1234], + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): + # symbol_values tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + [['junk']] * batch_size, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # other_programs tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, [[1.0]] * batch_size) + + with self.assertRaisesRegex(TypeError, 'missing'): + # we are missing an argument. + # pylint: disable=no-value-for-parameter + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array) + # pylint: enable=no-value-for-parameter + + with self.assertRaisesRegex(TypeError, 'positional arguments'): + # pylint: disable=too-many-function-args + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, util.convert_to_tensor(other_batch), []) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # batch programs has wrong batch size. 
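            # (other_batch is sliced to half the batch size in the call
            # below.)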
+ inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor(other_batch[:int(batch_size * 0.5)])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # batch programs has wrong batch size. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[::int(batch_size * 0.5)], + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, + expected_regex='Found symbols in other_programs'): + # other_programs has symbols. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in circuit_batch])) + + res = inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array.astype(np.float64), + util.convert_to_tensor(other_batch)) + self.assertDTypeEqual(res, np.complex64) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 5 + }, + ]) + def test_correctness_with_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Test that inner_product works with symbols.""" + symbol_names = ['alpha', 'beta', 'gamma'] + n_params = len(symbol_names) + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names = tf.convert_to_tensor(symbol_names, + dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor(symbol_values_array) + + out_arr = np.empty((batch_size, inner_dim_size, n_params), + dtype=np.complex64) + dx = 1e-4 + for i in range(batch_size): + for k, name in enumerate(symbol_names): + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] += dx + final_circuit_p = cirq.resolve_parameters(circuit_batch[i], + new_resolver) + new_resolver.param_dict[name] -= 2*dx + final_circuit_m = cirq.resolve_parameters(circuit_batch[i], + new_resolver) + final_wf_p = cirq.final_state_vector(final_circuit_p) + final_wf_m = cirq.final_state_vector(final_circuit_p) + final_wf = (final_wf_p - final_wf_m) / 2.0 / dx + for j in range(inner_dim_size): + internal_wf = cirq.final_state_vector(other_batch[i][j]) + out_arr[i][j][k] = np.vdot(final_wf, internal_wf) + + self.assertAllClose(out, out_arr, atol=1e-5) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 5 + }, + ]) + def test_correctness_without_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Test that inner_product_adj_grad works without symbols.""" + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, _ = \ + util.random_circuit_resolver_batch( + qubits, batch_size) + + other_batch = [ + 
util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) + + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'params must be a positive integer.'): + out = inner_product_op.inner_product_adj_grad( + programs, symbol_names, symbol_values, other_programs) + + def test_correctness_empty(self): + """Test the inner product adj grad between two empty circuits.""" + + empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) + empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) + empty_values = tf.convert_to_tensor([[]]) + other_program = util.convert_to_tensor([[cirq.Circuit()]]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'params must be a positive integer.'): + out = inner_product_op.inner_product_adj_grad( + empty_cicuit, empty_symbols, empty_values, other_program) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc new file mode 100644 index 000000000..356a71f29 --- /dev/null +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -0,0 +1,316 @@ +/* Copyright 2021 The TensorFlow Quantum Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "../qsim/lib/circuit.h" +#include "../qsim/lib/gate_appl.h" +#include "../qsim/lib/gates_cirq.h" +#include "../qsim/lib/seqfor.h" +#include "../qsim/lib/simmux.h" +#include "cirq/google/api/v2/program.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/error_codes.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow_quantum/core/ops/parse_context.h" +#include "tensorflow_quantum/core/src/util_qsim.h" + +namespace tfq { + +using ::cirq::google::api::v2::Program; +using ::tensorflow::Status; +using ::tfq::proto::PauliSum; + +typedef qsim::Cirq::GateCirq QsimGate; +typedef qsim::Circuit QsimCircuit; +typedef std::vector> QsimFusedCircuit; + +class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { + public: + explicit TfqInnerProductAdjGradOp(tensorflow::OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(tensorflow::OpKernelContext* context) override { + // TODO (mbbrough): add more dimension checks for other inputs here. 
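    // Inputs arrive in the order programs, symbol_names, symbol_values,
    // other_programs; see the REGISTER_OP block at the end of this file.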
+ const int num_inputs = context->num_inputs(); + OP_REQUIRES(context, num_inputs == 4, + tensorflow::errors::InvalidArgument(absl::StrCat( + "Expected 4 inputs, got ", num_inputs, " inputs."))); + + // Create the output Tensor. + const int output_dim_batch_size = context->input(0).dim_size(0); + const int output_dim_internal_size = context->input(3).dim_size(1); + const int output_dim_symbols_size = context->input(1).dim_size(0); + tensorflow::TensorShape output_shape; + output_shape.AddDim(output_dim_batch_size); + output_shape.AddDim(output_dim_internal_size); + output_shape.AddDim(output_dim_symbols_size); + + tensorflow::Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + auto output_tensor = output->tensor, 3>(); + + // Parse program protos. + std::vector programs; + std::vector num_qubits; + std::vector> other_programs; + OP_REQUIRES_OK(context, + GetProgramsAndNumQubits(context, &programs, &num_qubits, + &other_programs)); + + std::vector maps; + OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps)); + + OP_REQUIRES(context, programs.size() == maps.size(), + tensorflow::errors::InvalidArgument(absl::StrCat( + "Number of circuits and symbol_values do not match. Got ", + programs.size(), " circuits and ", maps.size(), + " symbol values."))); + + // Construct qsim circuits for programs. + std::vector qsim_circuits(programs.size(), QsimCircuit()); + std::vector fused_circuits(programs.size(), + QsimFusedCircuit({})); + + auto construct_f = [&](int start, int end) { + for (int i = start; i < end; i++) { + OP_REQUIRES_OK(context, QsimCircuitFromProgram( + programs[i], maps[i], num_qubits[i], + &qsim_circuits[i], &fused_circuits[i])); + } + }; + + const int num_cycles = 1000; + context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( + output_dim_batch_size, num_cycles, construct_f); + + // Construct qsim circuits for other_programs. + std::vector> other_qsim_circuits( + output_dim_batch_size, + std::vector(output_dim_internal_size, QsimCircuit())); + std::vector> other_fused_circuits( + output_dim_batch_size, + std::vector(output_dim_internal_size, + QsimFusedCircuit({}))); + + auto construct_f2 = [&](int start, int end) { + for (int i = start; i < end; i++) { + int ii = i / output_dim_internal_size; + int jj = i % output_dim_internal_size; + Status status = QsimCircuitFromProgram( + other_programs[ii][jj], {}, num_qubits[ii], + &other_qsim_circuits[ii][jj], &other_fused_circuits[ii][jj]); + OP_REQUIRES(context, status.ok(), + tensorflow::errors::InvalidArgument(absl::StrCat( + "Found symbols in other_programs.", + "No symbols are allowed in these circuits."))); + } + }; + + context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( + output_dim_batch_size * output_dim_internal_size, num_cycles, + construct_f2); + + int max_num_qubits = 0; + for (const int num : num_qubits) { + max_num_qubits = std::max(max_num_qubits, num); + } + + // Cross reference with standard google cloud compute instances + // Memory ~= 2 * num_threads * (2 * 64 * 2 ** num_qubits in circuits) + // e2s2 = 2 CPU, 8GB -> Can safely do 25 since Memory = 4GB + // e2s4 = 4 CPU, 16GB -> Can safely do 25 since Memory = 8GB + // ... 
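    // Rough arithmetic behind the cutoff: one complex64 state vector at
    // 25 qubits is 2^25 amplitudes * 8 bytes = 256MB, and each simulation
    // needs an sv/scratch pair. ComputeLarge keeps a single pair and lets
    // the simulator parallelize over amplitudes; ComputeSmall keeps a pair
    // per worker and parallelizes across (program, other_program) pairs.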
+ if (max_num_qubits >= 26 || output_dim_batch_size == 1) { + ComputeLarge(num_qubits, fused_circuits, other_fused_circuits, context, + &output_tensor); + } else { + ComputeSmall(num_qubits, max_num_qubits, fused_circuits, + other_fused_circuits, context, &output_tensor); + } + } + + private: + void ComputeLarge( + const std::vector& num_qubits, + const std::vector& fused_circuits, + const std::vector>& other_fused_circuits, + tensorflow::OpKernelContext* context, + tensorflow::TTypes, 3>::Tensor* output_tensor) { + // Instantiate qsim objects. + const auto tfq_for = tfq::QsimFor(context); + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + // Begin simulation. + int largest_nq = 1; + Simulator sim = Simulator(tfq_for); + StateSpace ss = StateSpace(tfq_for); + auto sv = ss.Create(largest_nq); + auto scratch = ss.Create(largest_nq); + + // Simulate programs one by one. Parallelizing over state vectors + // we no longer parallelize over circuits. Each time we encounter a + // a larger circuit we will grow the Statevector as necessary. + for (std::vector>>::size_type i = 0; i < fused_circuits.size(); i++) { + int nq = num_qubits[i]; + if (nq > largest_nq) { + // need to switch to larger statespace. + largest_nq = nq; + sv = ss.Create(largest_nq); + scratch = ss.Create(largest_nq); + } + // TODO: add heuristic here so that we do not always recompute + // the state if there is a possibility that circuit[i] and + // circuit[i + 1] produce the same state. + ss.SetStateZero(sv); + for (std::vector>::size_type j = 0; j < fused_circuits[i].size(); j++) { + qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); + } + for (std::vector>>::size_type j = 0; j < other_fused_circuits[i].size(); j++) { + // (#679) Just ignore empty program + int k = 0; + if (fused_circuits[i].size() == 0) { + (*output_tensor)(i, j, k) = std::complex(0, 0); + continue; + } + + ss.SetStateZero(scratch); + for (std::vector>::size_type k = 0; k < other_fused_circuits[i][j].size(); k++) { + qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch); + } + + std::complex result = ss.InnerProduct(sv, scratch); + (*output_tensor)(i, j, k) = + std::complex(static_cast(result.real()), + static_cast(result.imag())); + } + } + } + + void ComputeSmall( + const std::vector& num_qubits, const int max_num_qubits, + const std::vector& fused_circuits, + const std::vector>& other_fused_circuits, + tensorflow::OpKernelContext* context, + tensorflow::TTypes, 3>::Tensor* output_tensor) { + const auto tfq_for = qsim::SequentialFor(1); + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + + const int output_dim_internal_size = output_tensor->dimension(1); + + auto DoWork = [&](int start, int end) { + int old_batch_index = -2; + int cur_batch_index = -1; + int largest_nq = 1; + int cur_internal_index; + + Simulator sim = Simulator(tfq_for); + StateSpace ss = StateSpace(tfq_for); + auto sv = ss.Create(largest_nq); + auto scratch = ss.Create(largest_nq); + for (int i = start; i < end; i++) { + cur_batch_index = i / output_dim_internal_size; + cur_internal_index = i % output_dim_internal_size; + + const int nq = num_qubits[cur_batch_index]; + + // (#679) Just ignore empty program + int k = 0; + if (fused_circuits[cur_batch_index].size() == 0) { + (*output_tensor)(cur_batch_index, cur_internal_index, k) = + std::complex(0, 0); + continue; + } + + if (cur_batch_index != old_batch_index) { + // We've run into a new state vector we must compute. 
+ // Only compute a new state vector when we have to. + if (nq > largest_nq) { + largest_nq = nq; + sv = ss.Create(largest_nq); + scratch = ss.Create(largest_nq); + } + // no need to update scratch_state since ComputeExpectation + // will take care of things for us. + ss.SetStateZero(sv); + for (std::vector>::size_type j = 0; j < fused_circuits[cur_batch_index].size(); j++) { + qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv); + } + } + + ss.SetStateZero(scratch); + for (std::vector>::size_type k = 0; + k < + other_fused_circuits[cur_batch_index][cur_internal_index].size(); + k++) { + qsim::ApplyFusedGate( + sim, other_fused_circuits[cur_batch_index][cur_internal_index][k], + scratch); + } + + std::complex result = ss.InnerProduct(sv, scratch); + (*output_tensor)(cur_batch_index, cur_internal_index, k) = + std::complex(static_cast(result.real()), + static_cast(result.imag())); + + old_batch_index = cur_batch_index; + } + }; + + const int64_t num_cycles = + 200 * (int64_t(1) << static_cast(max_num_qubits)); + context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( + fused_circuits.size() * output_dim_internal_size, num_cycles, DoWork); + } +}; + +REGISTER_KERNEL_BUILDER(Name("TfqInnerProduct").Device(tensorflow::DEVICE_CPU), + TfqInnerProductAdjGradOp); + +REGISTER_OP("TfqInnerProduct") + .Input("programs: string") + .Input("symbol_names: string") + .Input("symbol_values: float") + .Input("other_programs: string") + .Output("inner_products: complex64") + .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) { + tensorflow::shape_inference::ShapeHandle programs_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape)); + + tensorflow::shape_inference::ShapeHandle symbol_names_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape)); + + tensorflow::shape_inference::ShapeHandle symbol_values_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape)); + + tensorflow::shape_inference::ShapeHandle other_programs_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &other_programs_shape)); + + tensorflow::shape_inference::DimensionHandle output_rows = + c->Dim(programs_shape, 0); + tensorflow::shape_inference::DimensionHandle output_cols = + c->Dim(other_programs_shape, 1); + c->set_output(0, c->Matrix(output_rows, output_cols)); + + return tensorflow::Status::OK(); + }); + +} // namespace tfq From 66537dde403bd007bc6ec1903934a90456c90b14 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Sun, 7 Feb 2021 19:12:31 +0900 Subject: [PATCH 02/21] Fix TfqInnerProductAdjGradTest and add len(symbol_names) > 0 assertion. 
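
Patch 01's test computed both finite-difference wavefunctions from
final_circuit_p, so the expected gradient was identically zero, and it
never invoked the op to produce `out`. This patch adds the missing call,
fixes the central difference, renames the shadowed symbol_names tensor,
and updates the asserted error message for empty symbol_names. For
reference, the ground-truth value the test compares against is the
central difference of the overlap; a minimal sketch of that check (the
helper name and dx are illustrative, not part of the change):

    import numpy as np
    import cirq

    def central_diff_overlap(circuit, resolver, name, other, dx=1e-4):
        """d<psi(params)|phi> / d(name), estimated by central difference."""
        plus = dict(resolver.param_dict)
        minus = dict(resolver.param_dict)
        plus[name] += dx
        minus[name] -= dx
        wf_p = cirq.final_state_vector(
            cirq.resolve_parameters(circuit, cirq.ParamResolver(plus)))
        wf_m = cirq.final_state_vector(
            cirq.resolve_parameters(circuit, cirq.ParamResolver(minus)))
        grad_wf = (wf_p - wf_m) / (2.0 * dx)
        # np.vdot conjugates its first argument, matching <d psi | phi>.
        return np.vdot(grad_wf, cirq.final_state_vector(other))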
--- .../core/ops/math_ops/inner_product_op_test.py | 17 ++++++++++------- .../ops/math_ops/tfq_inner_product_adj_grad.cc | 16 ++++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index 831da9737..39af4964b 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -527,10 +527,13 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, programs = util.convert_to_tensor(circuit_batch) other_programs = util.convert_to_tensor(other_batch) - symbol_names = tf.convert_to_tensor(symbol_names, - dtype=tf.dtypes.string) + symbol_names_tensor = tf.convert_to_tensor(symbol_names, + dtype=tf.dtypes.string) symbol_values = tf.convert_to_tensor(symbol_values_array) + out = inner_product_op.inner_product_adj_grad( + programs, symbol_names_tensor, symbol_values, other_programs) + out_arr = np.empty((batch_size, inner_dim_size, n_params), dtype=np.complex64) dx = 1e-4 @@ -544,11 +547,11 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, final_circuit_m = cirq.resolve_parameters(circuit_batch[i], new_resolver) final_wf_p = cirq.final_state_vector(final_circuit_p) - final_wf_m = cirq.final_state_vector(final_circuit_p) - final_wf = (final_wf_p - final_wf_m) / 2.0 / dx + final_wf_m = cirq.final_state_vector(final_circuit_m) + final_wf_grad = (final_wf_p - final_wf_m) / 2.0 / dx for j in range(inner_dim_size): internal_wf = cirq.final_state_vector(other_batch[i][j]) - out_arr[i][j][k] = np.vdot(final_wf, internal_wf) + out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) self.assertAllClose(out, out_arr, atol=1e-5) @@ -589,7 +592,7 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'params must be a positive integer.'): + 'symbols must be a positive integer'): out = inner_product_op.inner_product_adj_grad( programs, symbol_names, symbol_values, other_programs) @@ -602,7 +605,7 @@ def test_correctness_empty(self): other_program = util.convert_to_tensor([[cirq.Circuit()]]) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'params must be a positive integer.'): + 'symbols must be a positive integer'): out = inner_product_op.inner_product_adj_grad( empty_cicuit, empty_symbols, empty_values, other_program) diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index 356a71f29..0adcbe899 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -57,6 +57,10 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { const int output_dim_batch_size = context->input(0).dim_size(0); const int output_dim_internal_size = context->input(3).dim_size(1); const int output_dim_symbols_size = context->input(1).dim_size(0); + OP_REQUIRES(context, output_dim_symbols_size > 0, + tensorflow::errors::InvalidArgument(absl::StrCat( + "The number of symbols must be a positive integer, got ", + output_dim_symbols_size, " symbols."))); tensorflow::TensorShape output_shape; output_shape.AddDim(output_dim_batch_size); output_shape.AddDim(output_dim_internal_size); @@ -282,10 +286,10 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { } }; 
-REGISTER_KERNEL_BUILDER(Name("TfqInnerProduct").Device(tensorflow::DEVICE_CPU), - TfqInnerProductAdjGradOp); +REGISTER_KERNEL_BUILDER(Name("TfqInnerProductAdjGrad") + .Device(tensorflow::DEVICE_CPU), TfqInnerProductAdjGradOp); -REGISTER_OP("TfqInnerProduct") +REGISTER_OP("TfqInnerProductAdjGrad") .Input("programs: string") .Input("symbol_names: string") .Input("symbol_values: float") @@ -308,7 +312,11 @@ REGISTER_OP("TfqInnerProduct") c->Dim(programs_shape, 0); tensorflow::shape_inference::DimensionHandle output_cols = c->Dim(other_programs_shape, 1); - c->set_output(0, c->Matrix(output_rows, output_cols)); + tensorflow::shape_inference::DimensionHandle n_symbols = + c->Dim(symbol_names_shape, 0); + std::vector dims = + {output_rows, output_cols, n_symbols}; + c->set_output(0, c->MakeShape(dims)); return tensorflow::Status::OK(); }); From 2406516c0578109b6abc264650114e1529cdcfac Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 13:12:40 +0900 Subject: [PATCH 03/21] Add ComputeSmall at inner_product_adj_grad --- tensorflow_quantum/core/ops/math_ops/BUILD | 1 + .../math_ops/tfq_inner_product_adj_grad.cc | 193 ++++++++++++++---- .../core/ops/tfq_adj_grad_op.cc | 2 +- 3 files changed, 158 insertions(+), 38 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/BUILD b/tensorflow_quantum/core/ops/math_ops/BUILD index 31d690423..d18bed67b 100644 --- a/tensorflow_quantum/core/ops/math_ops/BUILD +++ b/tensorflow_quantum/core/ops/math_ops/BUILD @@ -59,6 +59,7 @@ cc_binary( deps = [ "//tensorflow_quantum/core/ops:parse_context", "//tensorflow_quantum/core/ops:tfq_simulate_utils", + "//tensorflow_quantum/core/src:adj_util", "//tensorflow_quantum/core/src:circuit_parser_qsim", "//tensorflow_quantum/core/src:util_qsim", "@qsim//lib:qsim_lib", diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index 0adcbe899..6996adb30 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow_quantum/core/ops/parse_context.h" +#include "tensorflow_quantum/core/src/adj_util.h" #include "tensorflow_quantum/core/src/util_qsim.h" namespace tfq { @@ -92,11 +93,29 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { std::vector fused_circuits(programs.size(), QsimFusedCircuit({})); + // track metadata. + std::vector> gate_meta( + programs.size(), std::vector({})); + + // Construct qsim circuits. + std::vector>>> + partial_fused_circuits( + programs.size(), + std::vector>>({})); + + // track gradients + std::vector> gradient_gates( + programs.size(), std::vector({})); + auto construct_f = [&](int start, int end) { for (int i = start; i < end; i++) { OP_REQUIRES_OK(context, QsimCircuitFromProgram( programs[i], maps[i], num_qubits[i], - &qsim_circuits[i], &fused_circuits[i])); + &qsim_circuits[i], &fused_circuits[i], + &gate_meta[i])); + + CreateGradientCircuit(qsim_circuits[i], gate_meta[i], + &partial_fused_circuits[i], &gradient_gates[i]); } }; @@ -142,18 +161,26 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // e2s4 = 4 CPU, 16GB -> Can safely do 25 since Memory = 8GB // ... 
if (max_num_qubits >= 26 || output_dim_batch_size == 1) { - ComputeLarge(num_qubits, fused_circuits, other_fused_circuits, context, + ComputeLarge(num_qubits, maps, qsim_circuits, fused_circuits, + partial_fused_circuits, + gradient_gates, other_fused_circuits, context, &output_tensor); } else { - ComputeSmall(num_qubits, max_num_qubits, fused_circuits, - other_fused_circuits, context, &output_tensor); + ComputeSmall(num_qubits, max_num_qubits, maps, qsim_circuits, fused_circuits, + partial_fused_circuits, gradient_gates, other_fused_circuits, + context, &output_tensor); } } private: void ComputeLarge( const std::vector& num_qubits, + const std::vector& maps, + const std::vector& qsim_circuits, const std::vector& fused_circuits, + const std::vector>>>& + partial_fused_circuits, + const std::vector>& gradient_gates, const std::vector>& other_fused_circuits, tensorflow::OpKernelContext* context, tensorflow::TTypes, 3>::Tensor* output_tensor) { @@ -168,6 +195,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { StateSpace ss = StateSpace(tfq_for); auto sv = ss.Create(largest_nq); auto scratch = ss.Create(largest_nq); + auto scratch2 = ss.Create(largest_nq); // Simulate programs one by one. Parallelizing over state vectors // we no longer parallelize over circuits. Each time we encounter a @@ -180,37 +208,86 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { sv = ss.Create(largest_nq); scratch = ss.Create(largest_nq); } - // TODO: add heuristic here so that we do not always recompute - // the state if there is a possibility that circuit[i] and - // circuit[i + 1] produce the same state. ss.SetStateZero(sv); for (std::vector>::size_type j = 0; j < fused_circuits[i].size(); j++) { qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); } for (std::vector>>::size_type j = 0; j < other_fused_circuits[i].size(); j++) { - // (#679) Just ignore empty program - int k = 0; - if (fused_circuits[i].size() == 0) { - (*output_tensor)(i, j, k) = std::complex(0, 0); - continue; - } - ss.SetStateZero(scratch); for (std::vector>::size_type k = 0; k < other_fused_circuits[i][j].size(); k++) { qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch); } - std::complex result = ss.InnerProduct(sv, scratch); - (*output_tensor)(i, j, k) = - std::complex(static_cast(result.real()), - static_cast(result.imag())); + // now sv is |psi>, scratch is |phi> + // initialize gradients for given |psi> and |phi>. + for (std::vector::size_type k = 0; k < maps.size(); k++) { + (*output_tensor)(i, j, k) = std::complex(0, 0); + } + // Start adjoint differentiation. + for (int l = partial_fused_circuits[i].size() - 1; l >= 0; l--) { + for (int k = partial_fused_circuits[i][l].size() - 1; k >= 0; k--) { + ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], sv); + ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], scratch); + } + if (l == 0) { + // last layer will have no parametrized gates so can break. + break; + } + + // Hit a parameterized gate. + // todo fix this copy. + auto cur_gate = qsim_circuits[i].gates[gradient_gates[i][l - 1].index]; + ApplyGateDagger(sim, cur_gate, sv); + + // if applicable compute control qubit mask and control value bits. 
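+          // Example: a gate controlled by qubit 2 on control value 1 yields
+          // mask = 0b100 and cbits = 0b100; BulkSetAmpl below then zeroes
+          // every amplitude outside that control subspace.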
+ uint64_t mask = 0; + uint64_t cbits = 0; + for (int k = 0; k < cur_gate.controlled_by.size(); k++) { + uint64_t control_loc = cur_gate.controlled_by[k]; + mask |= uint64_t{1} << control_loc; + cbits |= ((cur_gate.cmask >> k) & 1) << control_loc; + } + + for (int k = 0; k < gradient_gates[i][l - 1].grad_gates.size(); k++) { + // Copy sv onto scratch2 in anticipation of non-unitary "gradient + // gate". + ss.Copy(sv, scratch2); + if (!cur_gate.controlled_by.empty()) { + // Gradient of controlled gattes puts zeros on diagonal which is + // the same as collapsing the state and then applying the + // non-controlled version of the gradient gate. + ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); + } + qsim::ApplyGate(sim, gradient_gates[i][l - 1].grad_gates[k], + scratch2); + + // don't need not-found check since this is done upstream already. + const auto it = maps[i].find(gradient_gates[i][l - 1].params[k]); + const int loc = it->second.first; + // Apply finite differencing for adjoint gradients. + // Finite differencing enables applying multiple `gradient_gate` + // of a symbol at the same circuit. For analytic methods like + // parameter-shift we need to apply a single `gradient_gate` + // per a symbol. + std::complex result = ss.InnerProduct(scratch2, scratch); + (*output_tensor)(i, j, loc) += + std::complex(static_cast(result.real()), + static_cast(result.imag())); + } + ApplyGateDagger(sim, cur_gate, scratch); + } } } } void ComputeSmall( const std::vector& num_qubits, const int max_num_qubits, + const std::vector& maps, + const std::vector& qsim_circuits, const std::vector& fused_circuits, + const std::vector>>>& + partial_fused_circuits, + const std::vector>& gradient_gates, const std::vector>& other_fused_circuits, tensorflow::OpKernelContext* context, tensorflow::TTypes, 3>::Tensor* output_tensor) { @@ -230,20 +307,13 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { StateSpace ss = StateSpace(tfq_for); auto sv = ss.Create(largest_nq); auto scratch = ss.Create(largest_nq); + auto scratch2 = ss.Create(largest_nq); for (int i = start; i < end; i++) { cur_batch_index = i / output_dim_internal_size; cur_internal_index = i % output_dim_internal_size; const int nq = num_qubits[cur_batch_index]; - // (#679) Just ignore empty program - int k = 0; - if (fused_circuits[cur_batch_index].size() == 0) { - (*output_tensor)(cur_batch_index, cur_internal_index, k) = - std::complex(0, 0); - continue; - } - if (cur_batch_index != old_batch_index) { // We've run into a new state vector we must compute. // Only compute a new state vector when we have to. @@ -261,19 +331,68 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { } ss.SetStateZero(scratch); - for (std::vector>::size_type k = 0; - k < - other_fused_circuits[cur_batch_index][cur_internal_index].size(); - k++) { - qsim::ApplyFusedGate( - sim, other_fused_circuits[cur_batch_index][cur_internal_index][k], - scratch); + for (std::vector>::size_type k = 0; k < other_fused_circuits[cur_batch_index][cur_internal_index].size(); k++) { + qsim::ApplyFusedGate(sim, other_fused_circuits[cur_batch_index][cur_internal_index][k], scratch); } - std::complex result = ss.InnerProduct(sv, scratch); - (*output_tensor)(cur_batch_index, cur_internal_index, k) = - std::complex(static_cast(result.real()), - static_cast(result.imag())); + // now sv is |psi>, scratch is |phi> + // initialize gradients for given |psi> and |phi>. 
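+          // Zero-initialize first: a symbol may parametrize several gates,
+          // and the adjoint loop below accumulates one inner-product term
+          // per grad_gate into the same output slot with +=.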
+ for (std::vector::size_type k = 0; k < maps.size(); k++) { + (*output_tensor)(cur_batch_index, cur_internal_index, k) = std::complex(0, 0); + } + // Start adjoint differentiation. + for (int l = partial_fused_circuits[cur_batch_index].size() - 1; l >= 0; l--) { + for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1; k >= 0; k--) { + ApplyFusedGateDagger(sim, partial_fused_circuits[cur_batch_index][l][k], sv); + ApplyFusedGateDagger(sim, partial_fused_circuits[cur_batch_index][l][k], scratch); + } + if (l == 0) { + // last layer will have no parametrized gates so can break. + break; + } + + // Hit a parameterized gate. + // todo fix this copy. + auto cur_gate = qsim_circuits[cur_batch_index].gates[gradient_gates[cur_batch_index][l - 1].index]; + ApplyGateDagger(sim, cur_gate, sv); + + // if applicable compute control qubit mask and control value bits. + uint64_t mask = 0; + uint64_t cbits = 0; + for (int k = 0; k < cur_gate.controlled_by.size(); k++) { + uint64_t control_loc = cur_gate.controlled_by[k]; + mask |= uint64_t{1} << control_loc; + cbits |= ((cur_gate.cmask >> k) & 1) << control_loc; + } + + for (int k = 0; k < gradient_gates[cur_batch_index][l - 1].grad_gates.size(); k++) { + // Copy sv onto scratch2 in anticipation of non-unitary "gradient + // gate". + ss.Copy(sv, scratch2); + if (!cur_gate.controlled_by.empty()) { + // Gradient of controlled gattes puts zeros on diagonal which is + // the same as collapsing the state and then applying the + // non-controlled version of the gradient gate. + ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); + } + qsim::ApplyGate(sim, gradient_gates[cur_batch_index][l - 1].grad_gates[k], + scratch2); + + // don't need not-found check since this is done upstream already. + const auto it = maps[cur_batch_index].find(gradient_gates[cur_batch_index][l - 1].params[k]); + const int loc = it->second.first; + // Apply finite differencing for adjoint gradients. + // Finite differencing enables applying multiple `gradient_gate` + // of a symbol at the same circuit. For analytic methods like + // parameter-shift we need to apply a single `gradient_gate` + // per a symbol. + std::complex result = ss.InnerProduct(scratch2, scratch); + (*output_tensor)(cur_batch_index, cur_internal_index, loc) += + std::complex(static_cast(result.real()), + static_cast(result.imag())); + } + ApplyGateDagger(sim, cur_gate, scratch); + } old_batch_index = cur_batch_index; } diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc b/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc index d2c5782f6..d810c171b 100644 --- a/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc +++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc @@ -238,7 +238,7 @@ class TfqAdjointGradientOp : public tensorflow::OpKernel { // gate". ss.Copy(sv, scratch2); if (!cur_gate.controlled_by.empty()) { - // Gradient of controlled gattes puts zeros on diagonal which is + // Gradient of controlled gates puts zeros on diagonal which is // the same as collapsing the state and then applying the // non-controlled version of the gradient gate. 
ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); From dd8ed18a4b13abfc082d109daea2f6371d37b537 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 13:15:37 +0900 Subject: [PATCH 04/21] Fix format --- .../core/ops/math_ops/inner_product_op.py | 4 +- .../ops/math_ops/inner_product_op_test.py | 38 ++++---- .../math_ops/tfq_inner_product_adj_grad.cc | 87 ++++++++++++------- 3 files changed, 77 insertions(+), 52 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index e10a8006b..0efa22039 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -21,7 +21,7 @@ def inner_product_adj_grad(programs, symbol_names, symbol_values, - other_programs): + other_programs): """Calculate the adjoint gradients of the inner product between circuits. Compute the gradients of the (potentially many) inner products between @@ -152,6 +152,7 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): to the inner product of `programs[i]` with `symbol_values[i]` resolved in and `other_programs[i][j]`. """ + def grad(dy): """Calculate the gradients of this inner_product op. @@ -169,6 +170,7 @@ def grad(dy): programs, symbol_names, tf.cast(symbol_values, tf.float32), other_programs) return tf.einsum("bos,bo->bos", inner_prod_grad, dy) + return MATH_OP_MODULE.tfq_inner_product(programs, symbol_names, tf.cast(symbol_values, tf.float32), other_programs), grad diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index 39af4964b..2308a0a7b 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -384,9 +384,9 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Unparseable proto'): # circuit tensor has the right type but invalid values. - inner_product_op.inner_product_adj_grad(['junk'] * batch_size, symbol_names, - symbol_values_array, - util.convert_to_tensor(other_batch)) + inner_product_op.inner_product_adj_grad( + ['junk'] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch)) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Could not find symbol in parameter map'): @@ -421,9 +421,9 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(TypeError, 'Cannot convert'): # circuits tensor has the wrong type. - inner_product_op.inner_product_adj_grad([1.0] * batch_size, symbol_names, - symbol_values_array, - util.convert_to_tensor(other_batch)) + inner_product_op.inner_product_adj_grad( + [1.0] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch)) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # symbol_names tensor has the wrong type. @@ -474,8 +474,8 @@ def test_inner_product_adj_grad_inputs(self): util.convert_to_tensor(other_batch)) with self.assertRaisesRegex( - tf.errors.InvalidArgumentError, - expected_regex='Found symbols in other_programs'): + tf.errors.InvalidArgumentError, + expected_regex='Found symbols in other_programs'): # other_programs has symbols. 
inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, @@ -506,7 +506,7 @@ def test_inner_product_adj_grad_inputs(self): }, ]) def test_correctness_with_symbols(self, n_qubits, batch_size, - inner_dim_size): + inner_dim_size): """Test that inner_product works with symbols.""" symbol_names = ['alpha', 'beta', 'gamma'] n_params = len(symbol_names) @@ -531,8 +531,10 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, dtype=tf.dtypes.string) symbol_values = tf.convert_to_tensor(symbol_values_array) - out = inner_product_op.inner_product_adj_grad( - programs, symbol_names_tensor, symbol_values, other_programs) + out = inner_product_op.inner_product_adj_grad(programs, + symbol_names_tensor, + symbol_values, + other_programs) out_arr = np.empty((batch_size, inner_dim_size, n_params), dtype=np.complex64) @@ -541,11 +543,11 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, for k, name in enumerate(symbol_names): new_resolver = copy.deepcopy(resolver_batch[i]) new_resolver.param_dict[name] += dx - final_circuit_p = cirq.resolve_parameters(circuit_batch[i], - new_resolver) - new_resolver.param_dict[name] -= 2*dx - final_circuit_m = cirq.resolve_parameters(circuit_batch[i], - new_resolver) + final_circuit_p = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + new_resolver.param_dict[name] -= 2 * dx + final_circuit_m = cirq.resolve_parameters( + circuit_batch[i], new_resolver) final_wf_p = cirq.final_state_vector(final_circuit_p) final_wf_m = cirq.final_state_vector(final_circuit_m) final_wf_grad = (final_wf_p - final_wf_m) / 2.0 / dx @@ -573,7 +575,7 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, }, ]) def test_correctness_without_symbols(self, n_qubits, batch_size, - inner_dim_size): + inner_dim_size): """Test that inner_product_adj_grad works without symbols.""" qubits = cirq.GridQubit.rect(1, n_qubits) circuit_batch, _ = \ @@ -590,7 +592,6 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string) symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): out = inner_product_op.inner_product_adj_grad( @@ -609,5 +610,6 @@ def test_correctness_empty(self): out = inner_product_op.inner_product_adj_grad( empty_cicuit, empty_symbols, empty_values, other_program) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index 6996adb30..cc04b8217 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -109,10 +109,10 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { auto construct_f = [&](int start, int end) { for (int i = start; i < end; i++) { - OP_REQUIRES_OK(context, QsimCircuitFromProgram( - programs[i], maps[i], num_qubits[i], - &qsim_circuits[i], &fused_circuits[i], - &gate_meta[i])); + OP_REQUIRES_OK( + context, QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i], + &qsim_circuits[i], + &fused_circuits[i], &gate_meta[i])); CreateGradientCircuit(qsim_circuits[i], gate_meta[i], &partial_fused_circuits[i], &gradient_gates[i]); @@ -162,20 +162,18 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // ... 
if (max_num_qubits >= 26 || output_dim_batch_size == 1) { ComputeLarge(num_qubits, maps, qsim_circuits, fused_circuits, - partial_fused_circuits, - gradient_gates, other_fused_circuits, context, - &output_tensor); - } else { - ComputeSmall(num_qubits, max_num_qubits, maps, qsim_circuits, fused_circuits, partial_fused_circuits, gradient_gates, other_fused_circuits, context, &output_tensor); + } else { + ComputeSmall(num_qubits, max_num_qubits, maps, qsim_circuits, + fused_circuits, partial_fused_circuits, gradient_gates, + other_fused_circuits, context, &output_tensor); } } private: void ComputeLarge( - const std::vector& num_qubits, - const std::vector& maps, + const std::vector& num_qubits, const std::vector& maps, const std::vector& qsim_circuits, const std::vector& fused_circuits, const std::vector>>>& @@ -200,7 +198,8 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // Simulate programs one by one. Parallelizing over state vectors // we no longer parallelize over circuits. Each time we encounter a // a larger circuit we will grow the Statevector as necessary. - for (std::vector>>::size_type i = 0; i < fused_circuits.size(); i++) { + for (std::vector>>::size_type i = 0; + i < fused_circuits.size(); i++) { int nq = num_qubits[i]; if (nq > largest_nq) { // need to switch to larger statespace. @@ -209,12 +208,15 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { scratch = ss.Create(largest_nq); } ss.SetStateZero(sv); - for (std::vector>::size_type j = 0; j < fused_circuits[i].size(); j++) { + for (std::vector>::size_type j = 0; + j < fused_circuits[i].size(); j++) { qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); } - for (std::vector>>::size_type j = 0; j < other_fused_circuits[i].size(); j++) { + for (std::vector>>::size_type j = 0; + j < other_fused_circuits[i].size(); j++) { ss.SetStateZero(scratch); - for (std::vector>::size_type k = 0; k < other_fused_circuits[i][j].size(); k++) { + for (std::vector>::size_type k = 0; + k < other_fused_circuits[i][j].size(); k++) { qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch); } @@ -236,7 +238,8 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // Hit a parameterized gate. // todo fix this copy. - auto cur_gate = qsim_circuits[i].gates[gradient_gates[i][l - 1].index]; + auto cur_gate = + qsim_circuits[i].gates[gradient_gates[i][l - 1].index]; ApplyGateDagger(sim, cur_gate, sv); // if applicable compute control qubit mask and control value bits. @@ -325,26 +328,37 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // no need to update scratch_state since ComputeExpectation // will take care of things for us. ss.SetStateZero(sv); - for (std::vector>::size_type j = 0; j < fused_circuits[cur_batch_index].size(); j++) { + for (std::vector>::size_type j = 0; + j < fused_circuits[cur_batch_index].size(); j++) { qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv); } } ss.SetStateZero(scratch); - for (std::vector>::size_type k = 0; k < other_fused_circuits[cur_batch_index][cur_internal_index].size(); k++) { - qsim::ApplyFusedGate(sim, other_fused_circuits[cur_batch_index][cur_internal_index][k], scratch); + for (std::vector>::size_type k = 0; + k < + other_fused_circuits[cur_batch_index][cur_internal_index].size(); + k++) { + qsim::ApplyFusedGate( + sim, other_fused_circuits[cur_batch_index][cur_internal_index][k], + scratch); } // now sv is |psi>, scratch is |phi> // initialize gradients for given |psi> and |phi>. 
for (std::vector::size_type k = 0; k < maps.size(); k++) { - (*output_tensor)(cur_batch_index, cur_internal_index, k) = std::complex(0, 0); + (*output_tensor)(cur_batch_index, cur_internal_index, k) = + std::complex(0, 0); } // Start adjoint differentiation. - for (int l = partial_fused_circuits[cur_batch_index].size() - 1; l >= 0; l--) { - for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1; k >= 0; k--) { - ApplyFusedGateDagger(sim, partial_fused_circuits[cur_batch_index][l][k], sv); - ApplyFusedGateDagger(sim, partial_fused_circuits[cur_batch_index][l][k], scratch); + for (int l = partial_fused_circuits[cur_batch_index].size() - 1; l >= 0; + l--) { + for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1; + k >= 0; k--) { + ApplyFusedGateDagger( + sim, partial_fused_circuits[cur_batch_index][l][k], sv); + ApplyFusedGateDagger( + sim, partial_fused_circuits[cur_batch_index][l][k], scratch); } if (l == 0) { // last layer will have no parametrized gates so can break. @@ -353,7 +367,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // Hit a parameterized gate. // todo fix this copy. - auto cur_gate = qsim_circuits[cur_batch_index].gates[gradient_gates[cur_batch_index][l - 1].index]; + auto cur_gate = + qsim_circuits[cur_batch_index] + .gates[gradient_gates[cur_batch_index][l - 1].index]; ApplyGateDagger(sim, cur_gate, sv); // if applicable compute control qubit mask and control value bits. @@ -365,7 +381,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { cbits |= ((cur_gate.cmask >> k) & 1) << control_loc; } - for (int k = 0; k < gradient_gates[cur_batch_index][l - 1].grad_gates.size(); k++) { + for (int k = 0; + k < gradient_gates[cur_batch_index][l - 1].grad_gates.size(); + k++) { // Copy sv onto scratch2 in anticipation of non-unitary "gradient // gate". ss.Copy(sv, scratch2); @@ -375,11 +393,13 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // non-controlled version of the gradient gate. ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); } - qsim::ApplyGate(sim, gradient_gates[cur_batch_index][l - 1].grad_gates[k], - scratch2); + qsim::ApplyGate( + sim, gradient_gates[cur_batch_index][l - 1].grad_gates[k], + scratch2); // don't need not-found check since this is done upstream already. - const auto it = maps[cur_batch_index].find(gradient_gates[cur_batch_index][l - 1].params[k]); + const auto it = maps[cur_batch_index].find( + gradient_gates[cur_batch_index][l - 1].params[k]); const int loc = it->second.first; // Apply finite differencing for adjoint gradients. 
          // Finite differencing enables applying multiple `gradient_gate`
@@ -405,8 +425,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel {
   }
 };

-REGISTER_KERNEL_BUILDER(Name("TfqInnerProductAdjGrad")
-                            .Device(tensorflow::DEVICE_CPU), TfqInnerProductAdjGradOp);
+REGISTER_KERNEL_BUILDER(
+    Name("TfqInnerProductAdjGrad").Device(tensorflow::DEVICE_CPU),
+    TfqInnerProductAdjGradOp);

 REGISTER_OP("TfqInnerProductAdjGrad")
     .Input("programs: string")
@@ -433,8 +454,8 @@ REGISTER_OP("TfqInnerProductAdjGrad")
           c->Dim(other_programs_shape, 1);
       tensorflow::shape_inference::DimensionHandle n_symbols =
           c->Dim(symbol_names_shape, 0);
-      std::vector<tensorflow::shape_inference::DimensionHandle> dims =
-          {output_rows, output_cols, n_symbols};
+      std::vector<tensorflow::shape_inference::DimensionHandle> dims = {
+          output_rows, output_cols, n_symbols};
       c->set_output(0, c->MakeShape(dims));

       return tensorflow::Status::OK();

From 8012e37d9076fcf0c1110591c3414ffb42100bff Mon Sep 17 00:00:00 2001
From: Jae Yoo
Date: Mon, 8 Feb 2021 14:00:51 +0900
Subject: [PATCH 05/21] Split inner_product_adj_grad_op_test from inner_product_op_test

---
 tensorflow_quantum/core/ops/math_ops/BUILD    |  10 +
 .../inner_product_adj_grad_op_test.py         | 323 ++++++++++++++++++
 .../ops/math_ops/inner_product_op_test.py     | 297 ----------------
 .../math_ops/tfq_inner_product_adj_grad.cc    |  23 +-
 4 files changed, 347 insertions(+), 306 deletions(-)
 create mode 100644 tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py

diff --git a/tensorflow_quantum/core/ops/math_ops/BUILD b/tensorflow_quantum/core/ops/math_ops/BUILD
index d18bed67b..05645398c 100644
--- a/tensorflow_quantum/core/ops/math_ops/BUILD
+++ b/tensorflow_quantum/core/ops/math_ops/BUILD
@@ -84,3 +84,13 @@ py_test(
         "//tensorflow_quantum/python:util",
     ],
 )
+
+py_test(
+    name = "inner_product_adj_grad_op_test",
+    srcs = ["inner_product_adj_grad_op_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":inner_product_op_py",
+        "//tensorflow_quantum/python:util",
+    ],
+)
diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py
new file mode 100644
index 000000000..2122b0ec6
--- /dev/null
+++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py
@@ -0,0 +1,323 @@
+# Copyright 2021 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests that specifically target tfq_inner_product*.""" +import copy +import numpy as np +from absl.testing import parameterized +import tensorflow as tf +import cirq + +from tensorflow_quantum.core.ops.math_ops import inner_product_op +from tensorflow_quantum.python import util + +class InnerProductAdjGradTest(tf.test.TestCase, parameterized.TestCase): + """Tests tfq_inner_product_adj_grad.""" + + def test_inner_product_adj_grad_inputs(self): + """Makes sure that inner_product_adj_grad fails on bad inputs.""" + n_qubits = 5 + batch_size = 5 + symbol_names = ['alpha'] + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, 3)[0] + for i in range(batch_size) + ] + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'programs must be rank 1'): + # Circuit tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor([circuit_batch]), symbol_names, + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_names must be rank 1.'): + # symbol_names tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), np.array([symbol_names]), + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + np.array([symbol_values_array]), + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too few dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[0], util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'other_programs must be rank 2.'): + # other_programs tensor has too few dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, util.convert_to_tensor(circuit_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'other_programs must be rank 2.'): + # pauli_sums tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in other_batch])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Unparseable proto'): + # circuit tensor has the right type but invalid values. + inner_product_op.inner_product_adj_grad( + ['junk'] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Could not find symbol in parameter map'): + # symbol_names tensor has the right type but invalid values. 
+ inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), ['junk'], + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'not found in reference circuit'): + # other_programs tensor has the right type but operates on + # qubits that the reference ciruit doesn't have. + new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] + new_circuits, _ = util.random_circuit_resolver_batch( + new_qubits, batch_size) + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in new_circuits])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'not found in paired circuit'): + # other_programs tensor has the right type but operates on + # qubits that the reference ciruit doesn't have. + new_qubits = cirq.GridQubit.rect(1, n_qubits - 1) + new_circuits, _ = util.random_circuit_resolver_batch( + new_qubits, batch_size) + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in new_circuits])) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # circuits tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + [1.0] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # symbol_names tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), [0.1234], + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): + # symbol_values tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + [['junk']] * batch_size, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # other_programs tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, [[1.0]] * batch_size) + + with self.assertRaisesRegex(TypeError, 'missing'): + # we are missing an argument. + # pylint: disable=no-value-for-parameter + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array) + # pylint: enable=no-value-for-parameter + + with self.assertRaisesRegex(TypeError, 'positional arguments'): + # pylint: disable=too-many-function-args + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, util.convert_to_tensor(other_batch), []) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # batch programs has wrong batch size. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor(other_batch[:int(batch_size * 0.5)])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # batch programs has wrong batch size. 
+ inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[::int(batch_size * 0.5)], + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, + expected_regex='Found symbols in other_programs'): + # other_programs has symbols. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in circuit_batch])) + + res = inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array.astype(np.float64), + util.convert_to_tensor(other_batch)) + self.assertDTypeEqual(res, np.complex64) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 5 + }, + ]) + def test_correctness_with_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Test that inner_product works with symbols.""" + symbol_names = ['alpha', 'beta', 'gamma'] + n_params = len(symbol_names) + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names_tensor = tf.convert_to_tensor(symbol_names, + dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor(symbol_values_array) + + out = inner_product_op.inner_product_adj_grad(programs, + symbol_names_tensor, + symbol_values, + other_programs) + + out_arr = np.empty((batch_size, inner_dim_size, n_params), + dtype=np.complex64) + dx = 1e-6 + for i in range(batch_size): + for k, name in enumerate(symbol_names): + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] += dx + final_circuit_p = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + new_resolver.param_dict[name] -= dx + final_circuit_m = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + final_wf_p = cirq.final_state_vector(final_circuit_p) + final_wf_m = cirq.final_state_vector(final_circuit_m) + final_wf_grad = (final_wf_p - final_wf_m) / dx + for j in range(inner_dim_size): + internal_wf = cirq.final_state_vector(other_batch[i][j]) + out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) + + self.assertAllClose(out, out_arr, atol=1e-5) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 5 + }, + ]) + def test_correctness_without_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Test that inner_product_adj_grad works without symbols.""" + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, _ = \ + util.random_circuit_resolver_batch( + qubits, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names = 
tf.convert_to_tensor([], dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbols must be a positive integer'): + out = inner_product_op.inner_product_adj_grad( + programs, symbol_names, symbol_values, other_programs) + + def test_correctness_empty(self): + """Test the inner product adj grad between two empty circuits.""" + + empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) + empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) + empty_values = tf.convert_to_tensor([[]]) + other_program = util.convert_to_tensor([[cirq.Circuit()]]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbols must be a positive integer'): + out = inner_product_op.inner_product_adj_grad( + empty_cicuit, empty_symbols, empty_values, other_program) + + +if __name__ == "__main__": + tf.test.main() \ No newline at end of file diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index 2308a0a7b..8999db5ea 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -314,302 +314,5 @@ def test_correctness_empty(self): self.assertAllClose(out, expected) -class InnerProductAdjGradTest(tf.test.TestCase, parameterized.TestCase): - """Tests tfq_inner_product_adj_grad.""" - - def test_inner_product_adj_grad_inputs(self): - """Makes sure that inner_product_adj_grad fails on bad inputs.""" - n_qubits = 5 - batch_size = 5 - symbol_names = ['alpha'] - qubits = cirq.GridQubit.rect(1, n_qubits) - circuit_batch, resolver_batch = \ - util.random_symbol_circuit_resolver_batch( - qubits, symbol_names, batch_size) - - symbol_values_array = np.array( - [[resolver[symbol] - for symbol in symbol_names] - for resolver in resolver_batch]) - - other_batch = [ - util.random_circuit_resolver_batch(qubits, 3)[0] - for i in range(batch_size) - ] - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'programs must be rank 1'): - # Circuit tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor([circuit_batch]), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbol_names must be rank 1.'): - # symbol_names tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), np.array([symbol_names]), - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbol_values must be rank 2.'): - # symbol_values_array tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - np.array([symbol_values_array]), - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbol_values must be rank 2.'): - # symbol_values_array tensor has too few dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array[0], util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'other_programs must be rank 2.'): - # other_programs tensor has too few dimensions. 
- inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(circuit_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'other_programs must be rank 2.'): - # pauli_sums tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in other_batch])) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'Unparseable proto'): - # circuit tensor has the right type but invalid values. - inner_product_op.inner_product_adj_grad( - ['junk'] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'Could not find symbol in parameter map'): - # symbol_names tensor has the right type but invalid values. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), ['junk'], - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'not found in reference circuit'): - # other_programs tensor has the right type but operates on - # qubits that the reference ciruit doesn't have. - new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] - new_circuits, _ = util.random_circuit_resolver_batch( - new_qubits, batch_size) - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits])) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'not found in paired circuit'): - # other_programs tensor has the right type but operates on - # qubits that the reference ciruit doesn't have. - new_qubits = cirq.GridQubit.rect(1, n_qubits - 1) - new_circuits, _ = util.random_circuit_resolver_batch( - new_qubits, batch_size) - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits])) - - with self.assertRaisesRegex(TypeError, 'Cannot convert'): - # circuits tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - [1.0] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(TypeError, 'Cannot convert'): - # symbol_names tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), [0.1234], - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): - # symbol_values tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - [['junk']] * batch_size, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(TypeError, 'Cannot convert'): - # other_programs tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, [[1.0]] * batch_size) - - with self.assertRaisesRegex(TypeError, 'missing'): - # we are missing an argument. 
- # pylint: disable=no-value-for-parameter - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array) - # pylint: enable=no-value-for-parameter - - with self.assertRaisesRegex(TypeError, 'positional arguments'): - # pylint: disable=too-many-function-args - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch), []) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - expected_regex='do not match'): - # batch programs has wrong batch size. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor(other_batch[:int(batch_size * 0.5)])) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - expected_regex='do not match'): - # batch programs has wrong batch size. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array[::int(batch_size * 0.5)], - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex( - tf.errors.InvalidArgumentError, - expected_regex='Found symbols in other_programs'): - # other_programs has symbols. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in circuit_batch])) - - res = inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array.astype(np.float64), - util.convert_to_tensor(other_batch)) - self.assertDTypeEqual(res, np.complex64) - - @parameterized.parameters([ - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 1 - }, - { - 'n_qubits': 10, - 'batch_size': 10, - 'inner_dim_size': 2 - }, - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 5 - }, - ]) - def test_correctness_with_symbols(self, n_qubits, batch_size, - inner_dim_size): - """Test that inner_product works with symbols.""" - symbol_names = ['alpha', 'beta', 'gamma'] - n_params = len(symbol_names) - qubits = cirq.GridQubit.rect(1, n_qubits) - circuit_batch, resolver_batch = \ - util.random_symbol_circuit_resolver_batch( - qubits, symbol_names, batch_size) - - other_batch = [ - util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] - for i in range(batch_size) - ] - - symbol_values_array = np.array( - [[resolver[symbol] - for symbol in symbol_names] - for resolver in resolver_batch]) - - programs = util.convert_to_tensor(circuit_batch) - other_programs = util.convert_to_tensor(other_batch) - symbol_names_tensor = tf.convert_to_tensor(symbol_names, - dtype=tf.dtypes.string) - symbol_values = tf.convert_to_tensor(symbol_values_array) - - out = inner_product_op.inner_product_adj_grad(programs, - symbol_names_tensor, - symbol_values, - other_programs) - - out_arr = np.empty((batch_size, inner_dim_size, n_params), - dtype=np.complex64) - dx = 1e-4 - for i in range(batch_size): - for k, name in enumerate(symbol_names): - new_resolver = copy.deepcopy(resolver_batch[i]) - new_resolver.param_dict[name] += dx - final_circuit_p = cirq.resolve_parameters( - circuit_batch[i], new_resolver) - new_resolver.param_dict[name] -= 2 * dx - final_circuit_m = cirq.resolve_parameters( - circuit_batch[i], new_resolver) - final_wf_p = cirq.final_state_vector(final_circuit_p) - final_wf_m = cirq.final_state_vector(final_circuit_m) - final_wf_grad = (final_wf_p - final_wf_m) / 2.0 / dx - for j in 
range(inner_dim_size): - internal_wf = cirq.final_state_vector(other_batch[i][j]) - out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) - - self.assertAllClose(out, out_arr, atol=1e-5) - - @parameterized.parameters([ - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 1 - }, - { - 'n_qubits': 10, - 'batch_size': 10, - 'inner_dim_size': 2 - }, - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 5 - }, - ]) - def test_correctness_without_symbols(self, n_qubits, batch_size, - inner_dim_size): - """Test that inner_product_adj_grad works without symbols.""" - qubits = cirq.GridQubit.rect(1, n_qubits) - circuit_batch, _ = \ - util.random_circuit_resolver_batch( - qubits, batch_size) - - other_batch = [ - util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] - for i in range(batch_size) - ] - - programs = util.convert_to_tensor(circuit_batch) - other_programs = util.convert_to_tensor(other_batch) - symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string) - symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbols must be a positive integer'): - out = inner_product_op.inner_product_adj_grad( - programs, symbol_names, symbol_values, other_programs) - - def test_correctness_empty(self): - """Test the inner product adj grad between two empty circuits.""" - - empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) - empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) - empty_values = tf.convert_to_tensor([[]]) - other_program = util.convert_to_tensor([[cirq.Circuit()]]) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbols must be a positive integer'): - out = inner_product_op.inner_product_adj_grad( - empty_cicuit, empty_symbols, empty_values, other_program) - - if __name__ == "__main__": tf.test.main() diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index cc04b8217..0df5e3316 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -57,15 +57,15 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // Create the output Tensor. const int output_dim_batch_size = context->input(0).dim_size(0); const int output_dim_internal_size = context->input(3).dim_size(1); - const int output_dim_symbols_size = context->input(1).dim_size(0); - OP_REQUIRES(context, output_dim_symbols_size > 0, + const int output_dim_symbol_size = context->input(1).dim_size(0); + OP_REQUIRES(context, output_dim_symbol_size > 0, tensorflow::errors::InvalidArgument(absl::StrCat( "The number of symbols must be a positive integer, got ", - output_dim_symbols_size, " symbols."))); + output_dim_symbol_size, " symbols."))); tensorflow::TensorShape output_shape; output_shape.AddDim(output_dim_batch_size); output_shape.AddDim(output_dim_internal_size); - output_shape.AddDim(output_dim_symbols_size); + output_shape.AddDim(output_dim_symbol_size); tensorflow::Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); @@ -87,6 +87,11 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { "Number of circuits and symbol_values do not match. 
Got ", programs.size(), " circuits and ", maps.size(), " symbol values."))); + OP_REQUIRES(context, output_dim_symbol_size == maps[0].size(), + tensorflow::errors::InvalidArgument(absl::StrCat( + "Number of symbols and symbol maps do not match. Got ", + output_dim_symbol_size, " symbols and ", maps[0].size(), + " symbol values."))); // Construct qsim circuits for programs. std::vector qsim_circuits(programs.size(), QsimCircuit()); @@ -222,7 +227,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // now sv is |psi>, scratch is |phi> // initialize gradients for given |psi> and |phi>. - for (std::vector::size_type k = 0; k < maps.size(); k++) { + for (int k = 0; k < maps[i].size(); k++) { (*output_tensor)(i, j, k) = std::complex(0, 0); } // Start adjoint differentiation. @@ -245,13 +250,13 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // if applicable compute control qubit mask and control value bits. uint64_t mask = 0; uint64_t cbits = 0; - for (int k = 0; k < cur_gate.controlled_by.size(); k++) { + for (std::vector::size_type k = 0; k < cur_gate.controlled_by.size(); k++) { uint64_t control_loc = cur_gate.controlled_by[k]; mask |= uint64_t{1} << control_loc; cbits |= ((cur_gate.cmask >> k) & 1) << control_loc; } - for (int k = 0; k < gradient_gates[i][l - 1].grad_gates.size(); k++) { + for (std::vector::size_type k = 0; k < gradient_gates[i][l - 1].grad_gates.size(); k++) { // Copy sv onto scratch2 in anticipation of non-unitary "gradient // gate". ss.Copy(sv, scratch2); @@ -324,6 +329,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { largest_nq = nq; sv = ss.Create(largest_nq); scratch = ss.Create(largest_nq); + scratch2 = ss.Create(largest_nq); } // no need to update scratch_state since ComputeExpectation // will take care of things for us. @@ -346,7 +352,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // now sv is |psi>, scratch is |phi> // initialize gradients for given |psi> and |phi>. - for (std::vector::size_type k = 0; k < maps.size(); k++) { + for (int k = 0; k < maps[cur_batch_index].size(); k++) { (*output_tensor)(cur_batch_index, cur_internal_index, k) = std::complex(0, 0); } @@ -413,7 +419,6 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { } ApplyGateDagger(sim, cur_gate, scratch); } - old_batch_index = cur_batch_index; } }; From c9245e5bb3e81f0c3093e25ed4d6f14fc11e98f8 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 15:38:42 +0900 Subject: [PATCH 06/21] Fix the inner-loop sv intialization error in ComputeSmall --- .../inner_product_adj_grad_op_test.py | 597 +++++++++--------- .../ops/math_ops/inner_product_op_test.py | 2 +- .../math_ops/tfq_inner_product_adj_grad.cc | 23 +- 3 files changed, 316 insertions(+), 306 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index 2122b0ec6..ad6854072 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests that specifically target tfq_inner_product*.""" +"""Tests that specifically target tfq_inner_product_adj_grad.""" import copy import numpy as np from absl.testing import parameterized @@ -22,302 +22,307 @@ from tensorflow_quantum.core.ops.math_ops import inner_product_op from tensorflow_quantum.python import util + class InnerProductAdjGradTest(tf.test.TestCase, parameterized.TestCase): - """Tests tfq_inner_product_adj_grad.""" - - def test_inner_product_adj_grad_inputs(self): - """Makes sure that inner_product_adj_grad fails on bad inputs.""" - n_qubits = 5 - batch_size = 5 - symbol_names = ['alpha'] - qubits = cirq.GridQubit.rect(1, n_qubits) - circuit_batch, resolver_batch = \ - util.random_symbol_circuit_resolver_batch( - qubits, symbol_names, batch_size) - - symbol_values_array = np.array( - [[resolver[symbol] - for symbol in symbol_names] - for resolver in resolver_batch]) - - other_batch = [ - util.random_circuit_resolver_batch(qubits, 3)[0] - for i in range(batch_size) - ] - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'programs must be rank 1'): - # Circuit tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor([circuit_batch]), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbol_names must be rank 1.'): - # symbol_names tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), np.array([symbol_names]), - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbol_values must be rank 2.'): - # symbol_values_array tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - np.array([symbol_values_array]), - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbol_values must be rank 2.'): - # symbol_values_array tensor has too few dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array[0], util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'other_programs must be rank 2.'): - # other_programs tensor has too few dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(circuit_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'other_programs must be rank 2.'): - # pauli_sums tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in other_batch])) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'Unparseable proto'): - # circuit tensor has the right type but invalid values. - inner_product_op.inner_product_adj_grad( - ['junk'] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'Could not find symbol in parameter map'): - # symbol_names tensor has the right type but invalid values. 
- inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), ['junk'], - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'not found in reference circuit'): - # other_programs tensor has the right type but operates on - # qubits that the reference ciruit doesn't have. - new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] - new_circuits, _ = util.random_circuit_resolver_batch( - new_qubits, batch_size) - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits])) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'not found in paired circuit'): - # other_programs tensor has the right type but operates on - # qubits that the reference ciruit doesn't have. - new_qubits = cirq.GridQubit.rect(1, n_qubits - 1) - new_circuits, _ = util.random_circuit_resolver_batch( - new_qubits, batch_size) - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits])) - - with self.assertRaisesRegex(TypeError, 'Cannot convert'): - # circuits tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - [1.0] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(TypeError, 'Cannot convert'): - # symbol_names tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), [0.1234], - symbol_values_array, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): - # symbol_values tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - [['junk']] * batch_size, util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex(TypeError, 'Cannot convert'): - # other_programs tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, [[1.0]] * batch_size) - - with self.assertRaisesRegex(TypeError, 'missing'): - # we are missing an argument. - # pylint: disable=no-value-for-parameter - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array) - # pylint: enable=no-value-for-parameter - - with self.assertRaisesRegex(TypeError, 'positional arguments'): - # pylint: disable=too-many-function-args - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch), []) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - expected_regex='do not match'): - # batch programs has wrong batch size. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor(other_batch[:int(batch_size * 0.5)])) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - expected_regex='do not match'): - # batch programs has wrong batch size. 
- inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array[::int(batch_size * 0.5)], - util.convert_to_tensor(other_batch)) - - with self.assertRaisesRegex( - tf.errors.InvalidArgumentError, - expected_regex='Found symbols in other_programs'): - # other_programs has symbols. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - util.convert_to_tensor([[x] for x in circuit_batch])) - - res = inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array.astype(np.float64), - util.convert_to_tensor(other_batch)) - self.assertDTypeEqual(res, np.complex64) - - @parameterized.parameters([ - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 1 - }, - { - 'n_qubits': 10, - 'batch_size': 10, - 'inner_dim_size': 2 - }, - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 5 - }, - ]) - def test_correctness_with_symbols(self, n_qubits, batch_size, - inner_dim_size): - """Test that inner_product works with symbols.""" - symbol_names = ['alpha', 'beta', 'gamma'] - n_params = len(symbol_names) - qubits = cirq.GridQubit.rect(1, n_qubits) - circuit_batch, resolver_batch = \ - util.random_symbol_circuit_resolver_batch( - qubits, symbol_names, batch_size) - - other_batch = [ - util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] - for i in range(batch_size) - ] - - symbol_values_array = np.array( - [[resolver[symbol] - for symbol in symbol_names] - for resolver in resolver_batch]) - - programs = util.convert_to_tensor(circuit_batch) - other_programs = util.convert_to_tensor(other_batch) - symbol_names_tensor = tf.convert_to_tensor(symbol_names, - dtype=tf.dtypes.string) - symbol_values = tf.convert_to_tensor(symbol_values_array) - - out = inner_product_op.inner_product_adj_grad(programs, - symbol_names_tensor, - symbol_values, - other_programs) - - out_arr = np.empty((batch_size, inner_dim_size, n_params), - dtype=np.complex64) - dx = 1e-6 - for i in range(batch_size): - for k, name in enumerate(symbol_names): - new_resolver = copy.deepcopy(resolver_batch[i]) - new_resolver.param_dict[name] += dx - final_circuit_p = cirq.resolve_parameters( - circuit_batch[i], new_resolver) - new_resolver.param_dict[name] -= dx - final_circuit_m = cirq.resolve_parameters( - circuit_batch[i], new_resolver) - final_wf_p = cirq.final_state_vector(final_circuit_p) - final_wf_m = cirq.final_state_vector(final_circuit_m) - final_wf_grad = (final_wf_p - final_wf_m) / dx - for j in range(inner_dim_size): - internal_wf = cirq.final_state_vector(other_batch[i][j]) - out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) - - self.assertAllClose(out, out_arr, atol=1e-5) - - @parameterized.parameters([ - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 1 - }, - { - 'n_qubits': 10, - 'batch_size': 10, - 'inner_dim_size': 2 - }, - { - 'n_qubits': 5, - 'batch_size': 10, - 'inner_dim_size': 5 - }, - ]) - def test_correctness_without_symbols(self, n_qubits, batch_size, - inner_dim_size): - """Test that inner_product_adj_grad works without symbols.""" - qubits = cirq.GridQubit.rect(1, n_qubits) - circuit_batch, _ = \ - util.random_circuit_resolver_batch( - qubits, batch_size) - - other_batch = [ - util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] - for i in range(batch_size) - ] - - programs = util.convert_to_tensor(circuit_batch) - other_programs = util.convert_to_tensor(other_batch) - symbol_names = 
tf.convert_to_tensor([], dtype=tf.dtypes.string) - symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbols must be a positive integer'): - out = inner_product_op.inner_product_adj_grad( - programs, symbol_names, symbol_values, other_programs) - - def test_correctness_empty(self): - """Test the inner product adj grad between two empty circuits.""" - - empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) - empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) - empty_values = tf.convert_to_tensor([[]]) - other_program = util.convert_to_tensor([[cirq.Circuit()]]) - - with self.assertRaisesRegex(tf.errors.InvalidArgumentError, - 'symbols must be a positive integer'): - out = inner_product_op.inner_product_adj_grad( - empty_cicuit, empty_symbols, empty_values, other_program) + """Tests tfq_inner_product_adj_grad.""" + + def test_inner_product_adj_grad_inputs(self): + """Makes sure that inner_product_adj_grad fails on bad inputs.""" + n_qubits = 5 + batch_size = 5 + symbol_names = ['alpha'] + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, 3)[0] + for i in range(batch_size) + ] + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'programs must be rank 1'): + # Circuit tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor([circuit_batch]), symbol_names, + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_names must be rank 1.'): + # symbol_names tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), np.array([symbol_names]), + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + np.array([symbol_values_array]), + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too few dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[0], util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'other_programs must be rank 2.'): + # other_programs tensor has too few dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, util.convert_to_tensor(circuit_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'other_programs must be rank 2.'): + # pauli_sums tensor has too many dimensions. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in other_batch])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Unparseable proto'): + # circuit tensor has the right type but invalid values. 
+ inner_product_op.inner_product_adj_grad( + ['junk'] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Could not find symbol in parameter map'): + # symbol_names tensor has the right type but invalid values. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), ['junk'], + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'not found in reference circuit'): + # other_programs tensor has the right type but operates on + # qubits that the reference ciruit doesn't have. + new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] + new_circuits, _ = util.random_circuit_resolver_batch( + new_qubits, batch_size) + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in new_circuits])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'not found in paired circuit'): + # other_programs tensor has the right type but operates on + # qubits that the reference ciruit doesn't have. + new_qubits = cirq.GridQubit.rect(1, n_qubits - 1) + new_circuits, _ = util.random_circuit_resolver_batch( + new_qubits, batch_size) + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in new_circuits])) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # circuits tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + [1.0] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # symbol_names tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), [0.1234], + symbol_values_array, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): + # symbol_values tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + [['junk']] * batch_size, util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # other_programs tensor has the wrong type. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, [[1.0]] * batch_size) + + with self.assertRaisesRegex(TypeError, 'missing'): + # we are missing an argument. + # pylint: disable=no-value-for-parameter + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array) + # pylint: enable=no-value-for-parameter + + with self.assertRaisesRegex(TypeError, 'positional arguments'): + # pylint: disable=too-many-function-args + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, util.convert_to_tensor(other_batch), []) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # batch programs has wrong batch size. 
+ inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor(other_batch[:int(batch_size * 0.5)])) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # batch programs has wrong batch size. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[::int(batch_size * 0.5)], + util.convert_to_tensor(other_batch)) + + with self.assertRaisesRegex( + tf.errors.InvalidArgumentError, + expected_regex='Found symbols in other_programs'): + # other_programs has symbols. + inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in circuit_batch])) + + res = inner_product_op.inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array.astype(np.float64), + util.convert_to_tensor(other_batch)) + self.assertDTypeEqual(res, np.complex64) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 5 + }, + ]) + def test_correctness_with_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Test that inner_product works with symbols.""" + symbol_names = ['alpha', 'beta', 'gamma'] + n_params = len(symbol_names) + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names_tensor = tf.convert_to_tensor(symbol_names, + dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor(symbol_values_array) + + out = inner_product_op.inner_product_adj_grad(programs, + symbol_names_tensor, + symbol_values, + other_programs) + + out_arr = np.empty((batch_size, inner_dim_size, n_params), + dtype=np.complex64) + # dx came from _GRAD_EPS of core/src/adj_util.cc + dx = 5e-3 + for i in range(batch_size): + for k, name in enumerate(symbol_names): + if name in resolver_batch[i].param_dict: + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] += dx + final_circuit_p = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] -= dx + final_circuit_m = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + final_wf_p = cirq.final_state_vector(final_circuit_p) + final_wf_m = cirq.final_state_vector(final_circuit_m) + # Performs central finite difference. 
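+                    # Sketch of the identity used here, with dx = 5e-3
+                    # matching _GRAD_EPS: the two-sided estimate
+                    # (f(x + dx) - f(x - dx)) / (2 * dx) carries O(dx^2)
+                    # error, versus O(dx) for the one-sided difference this
+                    # test used before, so atol=1e-3 below is attainable.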
+ final_wf_grad = 0.5 * (final_wf_p - final_wf_m) / dx + for j in range(inner_dim_size): + internal_wf = cirq.final_state_vector(other_batch[i][j]) + out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) + + self.assertAllClose(out, out_arr, atol=1e-3) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, + 'inner_dim_size': 5 + }, + ]) + def test_correctness_without_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Test that inner_product_adj_grad works without symbols.""" + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, _ = \ + util.random_circuit_resolver_batch( + qubits, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbols must be a positive integer'): + out = inner_product_op.inner_product_adj_grad( + programs, symbol_names, symbol_values, other_programs) + + def test_correctness_empty(self): + """Test the inner product adj grad between two empty circuits.""" + + empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) + empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) + empty_values = tf.convert_to_tensor([[]]) + other_program = util.convert_to_tensor([[cirq.Circuit()]]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbols must be a positive integer'): + out = inner_product_op.inner_product_adj_grad( + empty_cicuit, empty_symbols, empty_values, other_program) if __name__ == "__main__": - tf.test.main() \ No newline at end of file + tf.test.main() diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index 8999db5ea..3c9ba7856 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests that specifically target tfq_inner_product*.""" +"""Tests that specifically target tfq_inner_product.""" import copy import numpy as np from absl.testing import parameterized diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index 0df5e3316..187ecbb0e 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -88,10 +88,10 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { programs.size(), " circuits and ", maps.size(), " symbol values."))); OP_REQUIRES(context, output_dim_symbol_size == maps[0].size(), - tensorflow::errors::InvalidArgument(absl::StrCat( - "Number of symbols and symbol maps do not match. Got ", - output_dim_symbol_size, " symbols and ", maps[0].size(), - " symbol values."))); + tensorflow::errors::InvalidArgument(absl::StrCat( + "Number of symbols and symbol maps do not match. 
Got ", + output_dim_symbol_size, " symbols and ", maps[0].size(), + " symbol values."))); // Construct qsim circuits for programs. std::vector qsim_circuits(programs.size(), QsimCircuit()); @@ -250,13 +250,15 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // if applicable compute control qubit mask and control value bits. uint64_t mask = 0; uint64_t cbits = 0; - for (std::vector::size_type k = 0; k < cur_gate.controlled_by.size(); k++) { + for (std::vector::size_type k = 0; + k < cur_gate.controlled_by.size(); k++) { uint64_t control_loc = cur_gate.controlled_by[k]; mask |= uint64_t{1} << control_loc; cbits |= ((cur_gate.cmask >> k) & 1) << control_loc; } - for (std::vector::size_type k = 0; k < gradient_gates[i][l - 1].grad_gates.size(); k++) { + for (std::vector::size_type k = 0; + k < gradient_gates[i][l - 1].grad_gates.size(); k++) { // Copy sv onto scratch2 in anticipation of non-unitary "gradient // gate". ss.Copy(sv, scratch2); @@ -314,6 +316,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { Simulator sim = Simulator(tfq_for); StateSpace ss = StateSpace(tfq_for); auto sv = ss.Create(largest_nq); + auto sv_adj = ss.Create(largest_nq); auto scratch = ss.Create(largest_nq); auto scratch2 = ss.Create(largest_nq); for (int i = start; i < end; i++) { @@ -328,6 +331,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { if (nq > largest_nq) { largest_nq = nq; sv = ss.Create(largest_nq); + sv_adj = ss.Create(largest_nq); scratch = ss.Create(largest_nq); scratch2 = ss.Create(largest_nq); } @@ -340,6 +344,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { } } + ss.Copy(sv, sv_adj); ss.SetStateZero(scratch); for (std::vector>::size_type k = 0; k < @@ -362,7 +367,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1; k >= 0; k--) { ApplyFusedGateDagger( - sim, partial_fused_circuits[cur_batch_index][l][k], sv); + sim, partial_fused_circuits[cur_batch_index][l][k], sv_adj); ApplyFusedGateDagger( sim, partial_fused_circuits[cur_batch_index][l][k], scratch); } @@ -376,7 +381,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { auto cur_gate = qsim_circuits[cur_batch_index] .gates[gradient_gates[cur_batch_index][l - 1].index]; - ApplyGateDagger(sim, cur_gate, sv); + ApplyGateDagger(sim, cur_gate, sv_adj); // if applicable compute control qubit mask and control value bits. uint64_t mask = 0; @@ -392,7 +397,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { k++) { // Copy sv onto scratch2 in anticipation of non-unitary "gradient // gate". 
- ss.Copy(sv, scratch2); + ss.Copy(sv_adj, scratch2); if (!cur_gate.controlled_by.empty()) { // Gradient of controlled gattes puts zeros on diagonal which is // the same as collapsing the state and then applying the From 94117b5b5bc00f497ab6c176eb35b0c5fdaf0ddd Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 15:53:47 +0900 Subject: [PATCH 07/21] Fix ComputeLarge in inner_product_adj_grad and add ComputeLarge tests --- .../ops/math_ops/inner_product_adj_grad_op_test.py | 11 ++++++++--- .../core/ops/math_ops/inner_product_op_test.py | 11 ++++++++--- .../ops/math_ops/tfq_inner_product_adj_grad.cc | 14 +++++++++----- tensorflow_quantum/core/ops/tfq_adj_grad_op.cc | 2 +- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index ad6854072..38ae138c7 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -200,17 +200,22 @@ def test_inner_product_adj_grad_inputs(self): @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 1, # ComputeLarge + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 5 }, ]) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index 3c9ba7856..798424877 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -200,17 +200,22 @@ def test_inner_product_inputs(self): @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 1, # ComputeLarge + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 5 }, ]) diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index 187ecbb0e..d7b54806b 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -197,6 +197,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { Simulator sim = Simulator(tfq_for); StateSpace ss = StateSpace(tfq_for); auto sv = ss.Create(largest_nq); + auto sv_adj = ss.Create(largest_nq); auto scratch = ss.Create(largest_nq); auto scratch2 = ss.Create(largest_nq); @@ -210,7 +211,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // need to switch to larger statespace. 
largest_nq = nq; sv = ss.Create(largest_nq); + sv_adj = ss.Create(largest_nq); scratch = ss.Create(largest_nq); + scratch2 = ss.Create(largest_nq); } ss.SetStateZero(sv); for (std::vector>::size_type j = 0; @@ -219,6 +222,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { } for (std::vector>>::size_type j = 0; j < other_fused_circuits[i].size(); j++) { + ss.Copy(sv, sv_adj); ss.SetStateZero(scratch); for (std::vector>::size_type k = 0; k < other_fused_circuits[i][j].size(); k++) { @@ -233,7 +237,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // Start adjoint differentiation. for (int l = partial_fused_circuits[i].size() - 1; l >= 0; l--) { for (int k = partial_fused_circuits[i][l].size() - 1; k >= 0; k--) { - ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], sv); + ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], sv_adj); ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], scratch); } if (l == 0) { @@ -245,7 +249,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // todo fix this copy. auto cur_gate = qsim_circuits[i].gates[gradient_gates[i][l - 1].index]; - ApplyGateDagger(sim, cur_gate, sv); + ApplyGateDagger(sim, cur_gate, sv_adj); // if applicable compute control qubit mask and control value bits. uint64_t mask = 0; @@ -261,9 +265,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { k < gradient_gates[i][l - 1].grad_gates.size(); k++) { // Copy sv onto scratch2 in anticipation of non-unitary "gradient // gate". - ss.Copy(sv, scratch2); + ss.Copy(sv_adj, scratch2); if (!cur_gate.controlled_by.empty()) { - // Gradient of controlled gattes puts zeros on diagonal which is + // Gradient of controlled gates puts zeros on diagonal which is // the same as collapsing the state and then applying the // non-controlled version of the gradient gate. ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); @@ -399,7 +403,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // gate". ss.Copy(sv_adj, scratch2); if (!cur_gate.controlled_by.empty()) { - // Gradient of controlled gattes puts zeros on diagonal which is + // Gradient of controlled gates puts zeros on diagonal which is // the same as collapsing the state and then applying the // non-controlled version of the gradient gate. ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc b/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc index d810c171b..4091a839b 100644 --- a/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc +++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc @@ -349,7 +349,7 @@ class TfqAdjointGradientOp : public tensorflow::OpKernel { // gate". ss.Copy(sv, scratch2); if (!cur_gate.controlled_by.empty()) { - // Gradient of controlled gattes puts zeros on diagonal which is + // Gradient of controlled gates puts zeros on diagonal which is // the same as collapsing the state and then applying the // non-controlled version of the gradient gate. 
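[Editor's note: the adjoint pass that patch 07 untangles here (a dedicated sv_adj, so the forward state sv can be reused across other_programs) is easier to see with dense matrices. A rough numpy sketch of the same recursion, not the qsim kernel — qsim fuses gates and streams over state vectors, and sums contributions when one symbol parameterizes several gates, whereas this sketch keys gradients by gate index:

    import numpy as np

    def inner_product_adj_grad_dense(gates, dgates, phi):
        """Returns d<psi(theta)|/dtheta_l |phi> keyed by gate index l,
        where |psi> = U_N ... U_1 |0...0>. Hypothetical helper, not qsim."""
        psi = np.zeros(phi.shape[0], dtype=complex)
        psi[0] = 1.0
        for u in gates:                    # forward pass: the role of `sv`
            psi = u @ psi
        bra = phi.astype(complex)          # the role of `scratch`
        grads = {}
        for l in reversed(range(len(gates))):
            psi = gates[l].conj().T @ psi  # undo gate l (ApplyFusedGateDagger)
            if dgates[l] is not None:      # hit a parameterized gate
                # InnerProduct(scratch2, scratch) in the op:
                # <dU_l psi | bra> = conj(<phi| U_N..U_{l+1} dU_l |psi_before_l>)
                grads[l] = np.vdot(dgates[l] @ psi, bra)
            bra = gates[l].conj().T @ bra  # ApplyGateDagger on `scratch`
        return grads

    theta = 0.3
    rx = lambda t: np.array([[np.cos(t / 2), -1j * np.sin(t / 2)],
                             [-1j * np.sin(t / 2), np.cos(t / 2)]])
    drx = lambda t: 0.5 * np.array([[-np.sin(t / 2), -1j * np.cos(t / 2)],
                                    [-1j * np.cos(t / 2), -np.sin(t / 2)]])
    g = inner_product_adj_grad_dense([rx(theta)], [drx(theta)],
                                     np.array([1, 0], dtype=complex))
    assert np.allclose(g[0], -0.5 * np.sin(theta / 2))  # d cos(theta/2) / dtheta
]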
ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); From 76d4ba2251dd4c93af827a84295c2267e523961c Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 16:38:20 +0900 Subject: [PATCH 08/21] Add edge test cases - empty symbols in tf.gradient() of inner_product --- .../inner_product_adj_grad_op_test.py | 11 +- .../core/ops/math_ops/inner_product_op.py | 33 +++- .../ops/math_ops/inner_product_op_test.py | 145 +++++++++++++++++- 3 files changed, 178 insertions(+), 11 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index 38ae138c7..a8ed07877 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -278,17 +278,22 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 1, # ComputeLarge + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 5 }, ]) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index 0efa22039..02482a8f8 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -156,6 +156,35 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): def grad(dy): """Calculate the gradients of this inner_product op. + For empty symbols, this function will just output `None` as the default + behavior of tensorflow gradient. For example, no matter what shape is, + gradients of a given tensor `x` w.r.t empty `symbol` is None. + + >>> qubits = cirq.GridQubit.rect(1, 2) + >>> programs = [ + ... cirq.Circuit(cirq.H.on_each(qubits)), + ... cirq.Circuit( + ... cirq.X(qubits[0]), + ... cirq.Y(qubits[1])) + ... ] + >>> other_programs = [ + ... cirq.Circuit(cirq.X.on_each(qubits)), + ... cirq.Circuit((cirq.Y**0.125).on_each(qubits)), + ... cirq.Circuit((cirq.X**0.5).on_each(qubits)) + ... ] + >>> programs = tfq.constant([]) + >>> symbol_names = tf.constant([]) + >>> symbol_values = tf.constant([]) + >>> with tf.GradientTape() as t: + >>> t.watch(symbol) + >>> x = tf.constant(tf.ones((3,4))) + >>> y = tfq.math_ops.inner_product(programs, symbol_names, + ... symbol_values, other_programs) + >>> t.gradient(x, s) # TensorFlow default behavior. + None + >>> t.gradient(y, s) + None + Args: dy: `tf.Tensor` of gradients coming from the next computational op with the shape [batch_size, n_others] @@ -166,10 +195,12 @@ def grad(dy): w.r.t. `symbol_names[k]` merged with the gradient `dy` from the next computational op. 
""" + if symbol_names.shape[0] == 0: + return [None, None, None, None] inner_prod_grad = inner_product_adj_grad( programs, symbol_names, tf.cast(symbol_values, tf.float32), other_programs) - return tf.einsum("bos,bo->bos", inner_prod_grad, dy) + return [None, None, tf.einsum("bos,bo->bos", inner_prod_grad, dy), None] return MATH_OP_MODULE.tfq_inner_product(programs, symbol_names, tf.cast(symbol_values, tf.float32), diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index 798424877..4cadb7476 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -27,7 +27,7 @@ class InnerProductTest(tf.test.TestCase, parameterized.TestCase): """Tests tfq_inner_product.""" def test_inner_product_inputs(self): - """Make sure that inner_product fails gracefully on bad inputs.""" + """Makes sure that inner_product fails gracefully on bad inputs.""" n_qubits = 5 batch_size = 5 symbol_names = ['alpha'] @@ -221,7 +221,7 @@ def test_inner_product_inputs(self): ]) def test_correctness_with_symbols(self, n_qubits, batch_size, inner_dim_size): - """Test that inner_product works with symbols.""" + """Tests that inner_product works with symbols.""" symbol_names = ['alpha', 'beta', 'gamma'] qubits = cirq.GridQubit.rect(1, n_qubits) circuit_batch, resolver_batch = \ @@ -261,23 +261,28 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 1, # ComputeLarge + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 5 }, ]) def test_correctness_without_symbols(self, n_qubits, batch_size, inner_dim_size): - """Test that inner_product works without symbols.""" + """Tests that inner_product works without symbols.""" qubits = cirq.GridQubit.rect(1, n_qubits) circuit_batch, _ = \ util.random_circuit_resolver_batch( @@ -306,7 +311,7 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, self.assertAllClose(out, out_arr, atol=1e-5) def test_correctness_empty(self): - """Test the inner product between two empty circuits.""" + """Tests the inner product between two empty circuits.""" empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) @@ -318,6 +323,132 @@ def test_correctness_empty(self): expected = np.array([[1.0]], dtype=np.complex64) self.assertAllClose(out, expected) + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 1, # ComputeLarge + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, # ComputeSmall + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall + 'inner_dim_size': 5 + }, + ]) + def test_tf_gradient_correctness_with_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Tests that tf.gradient of inner_product works with symbols.""" + symbol_names = ['alpha', 'beta', 'gamma'] + n_params = len(symbol_names) + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + other_batch = [ + 
util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names_tensor = tf.convert_to_tensor(symbol_names, + dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor(symbol_values_array) + + with tf.GradientTape() as tape: + tape.watch(symbol_values) + ip = inner_product_op.inner_product(programs, symbol_names_tensor, + symbol_values, other_programs) + out = tape.gradient(ip, symbol_values) + + out_arr = np.empty((batch_size, inner_dim_size, n_params), + dtype=np.complex64) + # dx came from _GRAD_EPS of core/src/adj_util.cc + dx = 5e-3 + for i in range(batch_size): + for k, name in enumerate(symbol_names): + if name in resolver_batch[i].param_dict: + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] += dx + final_circuit_p = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] -= dx + final_circuit_m = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + final_wf_p = cirq.final_state_vector(final_circuit_p) + final_wf_m = cirq.final_state_vector(final_circuit_m) + # Performs central finite difference. + final_wf_grad = 0.5 * (final_wf_p - final_wf_m) / dx + for j in range(inner_dim_size): + internal_wf = cirq.final_state_vector(other_batch[i][j]) + out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) + + self.assertAllClose(out, out_arr, atol=1e-3) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 1, # ComputeLarge + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall + 'inner_dim_size': 1 + }, + { + 'n_qubits': 10, + 'batch_size': 10, # ComputeSmall + 'inner_dim_size': 2 + }, + { + 'n_qubits': 5, + 'batch_size': 10, # ComputeSmall + 'inner_dim_size': 5 + }, + ]) + def test_tf_gradient_correctness_without_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Tests that tf.gradient of inner_product works without symbols.""" + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, _ = \ + util.random_circuit_resolver_batch( + qubits, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) + + with tf.GradientTape() as tape: + tape.watch(symbol_values) + ip = inner_product_op.inner_product(programs, symbol_names, + symbol_values, other_programs) + out = tape.gradient(ip, symbol_values) + self.assertIsNone(out) + if __name__ == "__main__": tf.test.main() From 8bf380152063f55924ebf30fc577ed180eb56c0a Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 17:04:17 +0900 Subject: [PATCH 09/21] Fix empty circuit case --- .../ops/math_ops/inner_product_adj_grad_op_test.py | 13 ++++++++++++- .../core/ops/math_ops/tfq_inner_product.cc | 12 ------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index a8ed07877..9e9c1b80b 100644 --- 
a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -322,7 +322,7 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, def test_correctness_empty(self): """Test the inner product adj grad between two empty circuits.""" - + symbol_names = ['alpha', 'beta'] empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) empty_values = tf.convert_to_tensor([[]]) @@ -333,6 +333,17 @@ def test_correctness_empty(self): out = inner_product_op.inner_product_adj_grad( empty_cicuit, empty_symbols, empty_values, other_program) + empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) + symbol_names = tf.convert_to_tensor(symbol_names, + dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor([[0.0 for _ in range(2)]]) + other_program = util.convert_to_tensor([[cirq.Circuit()]]) + + out = inner_product_op.inner_product_adj_grad( + empty_cicuit, symbol_names, symbol_values, other_program) + expected = np.zeros((1, 1, len(symbol_names)), dtype=np.complex64) + self.assertAllClose(out, expected) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc index cf6fe9c32..8275cc8ca 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc @@ -182,11 +182,6 @@ class TfqInnerProductOp : public tensorflow::OpKernel { qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); } for (int j = 0; j < other_fused_circuits[i].size(); j++) { - // (#679) Just ignore empty program - if (fused_circuits[i].size() == 0) { - (*output_tensor)(i, j) = std::complex(1, 0); - continue; - } ss.SetStateZero(scratch); for (int k = 0; k < other_fused_circuits[i][j].size(); k++) { @@ -229,13 +224,6 @@ class TfqInnerProductOp : public tensorflow::OpKernel { const int nq = num_qubits[cur_batch_index]; - // (#679) Just ignore empty program - if (fused_circuits[cur_batch_index].size() == 0) { - (*output_tensor)(cur_batch_index, cur_internal_index) = - std::complex(1, 0); - continue; - } - if (cur_batch_index != old_batch_index) { // We've run into a new state vector we must compute. // Only compute a new state vector when we have to. 
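[Editor's note: the gradient tests added in patches 08 and 09 all check against the same central-difference baseline. A standalone sketch of that check for one circuit and one symbol — values here are hypothetical, and dx mirrors _GRAD_EPS in core/src/adj_util.cc:

    import cirq
    import numpy as np
    import sympy

    dx = 5e-3                    # matches _GRAD_EPS in core/src/adj_util.cc
    alpha = sympy.Symbol('alpha')
    q = cirq.GridQubit.rect(1, 2)
    circuit = cirq.Circuit(cirq.X(q[0])**alpha, cirq.H(q[1]))
    other = cirq.Circuit(cirq.X.on_each(*q))
    value = 0.25                 # assumed resolver value for alpha

    wf_p = cirq.final_state_vector(
        cirq.resolve_parameters(circuit, {alpha: value + dx}))
    wf_m = cirq.final_state_vector(
        cirq.resolve_parameters(circuit, {alpha: value - dx}))
    wf_grad = 0.5 * (wf_p - wf_m) / dx   # central difference of |psi(alpha)>
    # d<psi|/dalpha |phi>, the quantity the op should reproduce:
    expected = np.vdot(wf_grad, cirq.final_state_vector(other))
]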
From 8a88107f7d80b1c9ee173bbb1255cecaa0f1124d Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 17:05:51 +0900 Subject: [PATCH 10/21] Fix format --- .../core/ops/math_ops/inner_product_adj_grad_op_test.py | 6 ++++-- tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index 9e9c1b80b..e7b66f43e 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -339,8 +339,10 @@ def test_correctness_empty(self): symbol_values = tf.convert_to_tensor([[0.0 for _ in range(2)]]) other_program = util.convert_to_tensor([[cirq.Circuit()]]) - out = inner_product_op.inner_product_adj_grad( - empty_cicuit, symbol_names, symbol_values, other_program) + out = inner_product_op.inner_product_adj_grad(empty_cicuit, + symbol_names, + symbol_values, + other_program) expected = np.zeros((1, 1, len(symbol_names)), dtype=np.complex64) self.assertAllClose(out, expected) diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc index 8275cc8ca..a66a1d076 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc @@ -182,7 +182,6 @@ class TfqInnerProductOp : public tensorflow::OpKernel { qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); } for (int j = 0; j < other_fused_circuits[i].size(); j++) { - ss.SetStateZero(scratch); for (int k = 0; k < other_fused_circuits[i][j].size(); k++) { qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch); From 144ef4fcc3f442710564a2cd8c8e1461f10011ae Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 17:07:20 +0900 Subject: [PATCH 11/21] Fix test one-line comments --- .../core/ops/math_ops/inner_product_adj_grad_op_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index e7b66f43e..dfcae112c 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -221,7 +221,7 @@ def test_inner_product_adj_grad_inputs(self): ]) def test_correctness_with_symbols(self, n_qubits, batch_size, inner_dim_size): - """Test that inner_product works with symbols.""" + """Tests that inner_product works with symbols.""" symbol_names = ['alpha', 'beta', 'gamma'] n_params = len(symbol_names) qubits = cirq.GridQubit.rect(1, n_qubits) @@ -299,7 +299,7 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, ]) def test_correctness_without_symbols(self, n_qubits, batch_size, inner_dim_size): - """Test that inner_product_adj_grad works without symbols.""" + """Tests that inner_product_adj_grad works without symbols.""" qubits = cirq.GridQubit.rect(1, n_qubits) circuit_batch, _ = \ util.random_circuit_resolver_batch( @@ -321,7 +321,7 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, programs, symbol_names, symbol_values, other_programs) def test_correctness_empty(self): - """Test the inner product adj grad between two empty circuits.""" + """Tests the inner product adj grad between two empty circuits.""" symbol_names = 
['alpha', 'beta'] empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) From 8e8b1312361a52e527fb565a521225bf5da6ca4c Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 17:12:56 +0900 Subject: [PATCH 12/21] Fix lint --- .../core/ops/math_ops/inner_product_adj_grad_op_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index dfcae112c..3c5882be1 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -317,7 +317,7 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): - out = inner_product_op.inner_product_adj_grad( + inner_product_op.inner_product_adj_grad( programs, symbol_names, symbol_values, other_programs) def test_correctness_empty(self): From 80adeb1cece35f2e74bf8d6836cda7aa525b26a9 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 17:28:48 +0900 Subject: [PATCH 13/21] Add missing docstring code result of `grad_ip` --- .../ops/math_ops/inner_product_adj_grad_op_test.py | 5 +++-- .../core/ops/math_ops/inner_product_op.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index 3c5882be1..ac403f7c9 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -317,8 +317,9 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): - inner_product_op.inner_product_adj_grad( - programs, symbol_names, symbol_values, other_programs) + inner_product_op.inner_product_adj_grad(programs, symbol_names, + symbol_values, + other_programs) def test_correctness_empty(self): """Tests the inner product adj grad between two empty circuits.""" diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index 02482a8f8..562b118f7 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -53,6 +53,16 @@ def inner_product_adj_grad(programs, symbol_names, symbol_values, ... reference_tensor, symbol_tensor, values_tensor, ... other_tensor) >>> grad_ip + tf.Tensor( + [[[ 0+0.j 0+0.j] + [ 0.17605604-0.42503685j 0+0.j] + [ 0.46005663+1.1106750j 0+0.j]] + + [[ 1.5707562-0.j 0-0.j] + [-0.22244042-0.09213787j -0.8559104+1.2809625j] + [-0.78537834-0.78537798j -1.5707799+0.j]]], shape=(2, 3, 2), + dtype=complex64) + Note: `other_programs` must not contain any free symbols. These can @@ -160,6 +170,7 @@ def grad(dy): behavior of tensorflow gradient. For example, no matter what shape is, gradients of a given tensor `x` w.r.t empty `symbol` is None. + >>> qubits = cirq.GridQubit.rect(1, 2) >>> programs = [ ... 
cirq.Circuit(cirq.H.on_each(qubits)), @@ -185,6 +196,8 @@ def grad(dy): >>> t.gradient(y, s) None + + Args: dy: `tf.Tensor` of gradients coming from the next computational op with the shape [batch_size, n_others] From 231341d363e35fb8e6872334d1d33e28400a3eb6 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 8 Feb 2021 17:31:00 +0900 Subject: [PATCH 14/21] Fix nits --- tensorflow_quantum/core/ops/math_ops/inner_product_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index 562b118f7..cdfba65d2 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -189,7 +189,7 @@ def grad(dy): >>> with tf.GradientTape() as t: >>> t.watch(symbol) >>> x = tf.constant(tf.ones((3,4))) - >>> y = tfq.math_ops.inner_product(programs, symbol_names, + >>> y = tfq.math.inner_product(programs, symbol_names, ... symbol_values, other_programs) >>> t.gradient(x, s) # TensorFlow default behavior. None From e3516943a7ddf0a9c8660fd55e75f632e93376ce Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 22 Feb 2021 00:39:21 +0900 Subject: [PATCH 15/21] Add Mike's feedback --- .../inner_product_adj_grad_op_test.py | 104 +++++---- .../core/ops/math_ops/inner_product_op.py | 77 ++----- .../ops/math_ops/inner_product_op_test.py | 18 +- .../core/ops/math_ops/tfq_inner_product.cc | 13 ++ .../math_ops/tfq_inner_product_adj_grad.cc | 198 ++++++++++-------- 5 files changed, 217 insertions(+), 193 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py index ac403f7c9..5664762b4 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py @@ -30,8 +30,10 @@ def test_inner_product_adj_grad_inputs(self): """Makes sure that inner_product_adj_grad fails on bad inputs.""" n_qubits = 5 batch_size = 5 + n_other_programs = 3 symbol_names = ['alpha'] qubits = cirq.GridQubit.rect(1, n_qubits) + prev_grad = np.ones((batch_size, n_other_programs)) circuit_batch, resolver_batch = \ util.random_symbol_circuit_resolver_batch( qubits, symbol_names, batch_size) @@ -42,7 +44,7 @@ def test_inner_product_adj_grad_inputs(self): for resolver in resolver_batch]) other_batch = [ - util.random_circuit_resolver_batch(qubits, 3)[0] + util.random_circuit_resolver_batch(qubits, n_other_programs)[0] for i in range(batch_size) ] @@ -51,14 +53,16 @@ def test_inner_product_adj_grad_inputs(self): # Circuit tensor has too many dimensions. inner_product_op.inner_product_adj_grad( util.convert_to_tensor([circuit_batch]), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch)) + symbol_values_array, util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_names must be rank 1.'): # symbol_names tensor has too many dimensions. 
inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), np.array([symbol_names]), - symbol_values_array, util.convert_to_tensor(other_batch)) + symbol_values_array, util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_values must be rank 2.'): @@ -66,21 +70,24 @@ def test_inner_product_adj_grad_inputs(self): inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, np.array([symbol_values_array]), - util.convert_to_tensor(other_batch)) + util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_values must be rank 2.'): # symbol_values_array tensor has too few dimensions. inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array[0], util.convert_to_tensor(other_batch)) + symbol_values_array[0], util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'other_programs must be rank 2.'): # other_programs tensor has too few dimensions. inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(circuit_batch)) + symbol_values_array, util.convert_to_tensor(circuit_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'other_programs must be rank 2.'): @@ -88,21 +95,24 @@ def test_inner_product_adj_grad_inputs(self): inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in other_batch])) + util.convert_to_tensor([[x] for x in other_batch]), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Unparseable proto'): # circuit tensor has the right type but invalid values. inner_product_op.inner_product_adj_grad( ['junk'] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch)) + util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Could not find symbol in parameter map'): # symbol_names tensor has the right type but invalid values. inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), ['junk'], - symbol_values_array, util.convert_to_tensor(other_batch)) + symbol_values_array, util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'not found in reference circuit'): @@ -114,7 +124,8 @@ def test_inner_product_adj_grad_inputs(self): inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits])) + util.convert_to_tensor([[x] for x in new_circuits]), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'not found in paired circuit'): @@ -126,45 +137,52 @@ def test_inner_product_adj_grad_inputs(self): inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits])) + util.convert_to_tensor([[x] for x in new_circuits]), + prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # circuits tensor has the wrong type. 
inner_product_op.inner_product_adj_grad( [1.0] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch)) + util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # symbol_names tensor has the wrong type. inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), [0.1234], - symbol_values_array, util.convert_to_tensor(other_batch)) + symbol_values_array, util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): # symbol_values tensor has the wrong type. inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - [['junk']] * batch_size, util.convert_to_tensor(other_batch)) + [['junk']] * batch_size, util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # other_programs tensor has the wrong type. inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, [[1.0]] * batch_size) + symbol_values_array, [[1.0]] * batch_size, + prev_grad) with self.assertRaisesRegex(TypeError, 'missing'): # we are missing an argument. # pylint: disable=no-value-for-parameter inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array) + symbol_values_array, + prev_grad) # pylint: enable=no-value-for-parameter with self.assertRaisesRegex(TypeError, 'positional arguments'): # pylint: disable=too-many-function-args inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch), []) + symbol_values_array, util.convert_to_tensor(other_batch), + prev_grad, []) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, expected_regex='do not match'): @@ -172,7 +190,8 @@ def test_inner_product_adj_grad_inputs(self): inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch[:int(batch_size * 0.5)])) + util.convert_to_tensor(other_batch[:int(batch_size * 0.5)]), + prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, expected_regex='do not match'): @@ -180,7 +199,8 @@ def test_inner_product_adj_grad_inputs(self): inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array[::int(batch_size * 0.5)], - util.convert_to_tensor(other_batch)) + util.convert_to_tensor(other_batch), + prev_grad) with self.assertRaisesRegex( tf.errors.InvalidArgumentError, @@ -189,12 +209,14 @@ def test_inner_product_adj_grad_inputs(self): inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in circuit_batch])) + util.convert_to_tensor([[x] for x in circuit_batch]), + prev_grad) res = inner_product_op.inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array.astype(np.float64), - util.convert_to_tensor(other_batch)) + util.convert_to_tensor(other_batch), + prev_grad) self.assertDTypeEqual(res, np.complex64) @parameterized.parameters([ @@ -210,7 +232,7 @@ def test_inner_product_adj_grad_inputs(self): }, { 'n_qubits': 10, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, # ComputeSmall 'inner_dim_size': 2 }, { @@ -244,24 +266,26 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, 
symbol_names_tensor = tf.convert_to_tensor(symbol_names, dtype=tf.dtypes.string) symbol_values = tf.convert_to_tensor(symbol_values_array) + prev_grad = np.random.normal(size=(batch_size, inner_dim_size)).astype( + np.float32) out = inner_product_op.inner_product_adj_grad(programs, symbol_names_tensor, symbol_values, - other_programs) + other_programs, + prev_grad) - out_arr = np.empty((batch_size, inner_dim_size, n_params), - dtype=np.complex64) + out_arr = np.zeros((batch_size, n_params), dtype=np.complex64) # dx came from _GRAD_EPS of core/src/adj_util.cc dx = 5e-3 - for i in range(batch_size): + for i, resolver in enumerate(resolver_batch): for k, name in enumerate(symbol_names): - if name in resolver_batch[i].param_dict: - new_resolver = copy.deepcopy(resolver_batch[i]) + if name in resolver.param_dict: + new_resolver = copy.deepcopy(resolver) new_resolver.param_dict[name] += dx final_circuit_p = cirq.resolve_parameters( circuit_batch[i], new_resolver) - new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver = copy.deepcopy(resolver) new_resolver.param_dict[name] -= dx final_circuit_m = cirq.resolve_parameters( circuit_batch[i], new_resolver) @@ -269,9 +293,10 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, final_wf_m = cirq.final_state_vector(final_circuit_m) # Performs central finite difference. final_wf_grad = 0.5 * (final_wf_p - final_wf_m) / dx - for j in range(inner_dim_size): - internal_wf = cirq.final_state_vector(other_batch[i][j]) - out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) + for j, other in enumerate(other_batch[i]): + internal_wf = cirq.final_state_vector(other) + out_arr[i][k] += prev_grad[i][j] * np.vdot( + final_wf_grad, internal_wf) self.assertAllClose(out, out_arr, atol=1e-3) @@ -314,12 +339,14 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, other_programs = util.convert_to_tensor(other_batch) symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string) symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) + prev_grad = np.ones((batch_size, inner_dim_size)) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): inner_product_op.inner_product_adj_grad(programs, symbol_names, symbol_values, - other_programs) + other_programs, + prev_grad) def test_correctness_empty(self): """Tests the inner product adj grad between two empty circuits.""" @@ -328,11 +355,13 @@ def test_correctness_empty(self): empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) empty_values = tf.convert_to_tensor([[]]) other_program = util.convert_to_tensor([[cirq.Circuit()]]) + prev_grad = np.ones((1, 1)) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): - out = inner_product_op.inner_product_adj_grad( - empty_cicuit, empty_symbols, empty_values, other_program) + inner_product_op.inner_product_adj_grad( + empty_cicuit, empty_symbols, empty_values, other_program, + prev_grad) empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) symbol_names = tf.convert_to_tensor(symbol_names, @@ -343,8 +372,9 @@ def test_correctness_empty(self): out = inner_product_op.inner_product_adj_grad(empty_cicuit, symbol_names, symbol_values, - other_program) - expected = np.zeros((1, 1, len(symbol_names)), dtype=np.complex64) + other_program, + prev_grad) + expected = np.zeros((1, len(symbol_names)), dtype=np.complex64) self.assertAllClose(out, expected) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py 
b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index cdfba65d2..35d82eb05 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -21,7 +21,7 @@ def inner_product_adj_grad(programs, symbol_names, symbol_values, - other_programs): + other_programs, prev_grad): """Calculate the adjoint gradients of the inner product between circuits. Compute the gradients of the (potentially many) inner products between @@ -49,18 +49,14 @@ def inner_product_adj_grad(programs, symbol_names, symbol_values, >>> symbol_tensor = tf.convert_to_tensor([s.name for s in symbols]) >>> values_tensor = tf.convert_to_tensor(np.arange(4).reshape(2, 2)) >>> other_tensor = tfq.convert_to_tensor([other_circuits, other_circuits]) + >>> prev_grad = tf.convert_to_tensor(tf.ones((2, 2))) >>> grad_ip = tfq.math.inner_product_adj_grad( ... reference_tensor, symbol_tensor, values_tensor, - ... other_tensor) + ... other_tensor, prev_grad) >>> grad_ip tf.Tensor( - [[[ 0+0.j 0+0.j] - [ 0.17605604-0.42503685j 0+0.j] - [ 0.46005663+1.1106750j 0+0.j]] - - [[ 1.5707562-0.j 0-0.j] - [-0.22244042-0.09213787j -0.8559104+1.2809625j] - [-0.78537834-0.78537798j -1.5707799+0.j]]], shape=(2, 3, 2), + [[ 0.6361127+0.6856381j , 0. +0.j ], + [ 0.5629374-0.87751585j, -2.4266903+1.2809625j ]], shape=(2, 2), dtype=complex64) @@ -87,16 +83,18 @@ def inner_product_adj_grad(programs, symbol_names, symbol_values, containing the string representations of the circuits with which to compute the overlap on `programs` with. Must not contain any free symbols. + prev_grad: `tf.Tensor` of real numbers with shape [batch_size, n_ops] + backprop of values from downstream in the compute graph. + Returns: - `tf.Tensor` with shape [batch_size, n_others, n_symbols] where - `out[i][j][k]` is equal to the gradient w.r.t. `symbol_names[k]` of the - inner product between `programs[i]` with `symbol_values[i]` resolved in - and `other_programs[j]`. + tf.Tensor` with shape [batch_size, n_symbols] where `out[i][j]` is equal + to the gradient of the inner product between programs[i] and all + other_programs[i] w.r.t. `symbol_names[j]` and `programs[i]` is resolved + with `symbol_values[i]`. """ return MATH_OP_MODULE.tfq_inner_product_adj_grad( programs, symbol_names, tf.cast(symbol_values, tf.float32), - other_programs) - + other_programs, tf.cast(prev_grad, tf.float32)) @tf.custom_gradient def inner_product(programs, symbol_names, symbol_values, other_programs): @@ -164,56 +162,11 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): """ def grad(dy): - """Calculate the gradients of this inner_product op. - - For empty symbols, this function will just output `None` as the default - behavior of tensorflow gradient. For example, no matter what shape is, - gradients of a given tensor `x` w.r.t empty `symbol` is None. - - - >>> qubits = cirq.GridQubit.rect(1, 2) - >>> programs = [ - ... cirq.Circuit(cirq.H.on_each(qubits)), - ... cirq.Circuit( - ... cirq.X(qubits[0]), - ... cirq.Y(qubits[1])) - ... ] - >>> other_programs = [ - ... cirq.Circuit(cirq.X.on_each(qubits)), - ... cirq.Circuit((cirq.Y**0.125).on_each(qubits)), - ... cirq.Circuit((cirq.X**0.5).on_each(qubits)) - ... ] - >>> programs = tfq.constant([]) - >>> symbol_names = tf.constant([]) - >>> symbol_values = tf.constant([]) - >>> with tf.GradientTape() as t: - >>> t.watch(symbol) - >>> x = tf.constant(tf.ones((3,4))) - >>> y = tfq.math.inner_product(programs, symbol_names, - ... 
symbol_values, other_programs) - >>> t.gradient(x, s) # TensorFlow default behavior. - None - >>> t.gradient(y, s) - None - - - - Args: - dy: `tf.Tensor` of gradients coming from the next computational op - with the shape [batch_size, n_others] - - Returns: - `tf.Tensor` with shape [batch_size, n_others, n_symbols] where - `out[i][j][k]` is equal to the gradient of the above inner product - w.r.t. `symbol_names[k]` merged with the gradient `dy` from the - next computational op. - """ if symbol_names.shape[0] == 0: return [None, None, None, None] inner_prod_grad = inner_product_adj_grad( - programs, symbol_names, tf.cast(symbol_values, tf.float32), - other_programs) - return [None, None, tf.einsum("bos,bo->bos", inner_prod_grad, dy), None] + programs, symbol_names, symbol_values, other_programs, dy) + return [None, None, inner_prod_grad, None] return MATH_OP_MODULE.tfq_inner_product(programs, symbol_names, tf.cast(symbol_values, tf.float32), diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index 4cadb7476..9e88ccadd 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -311,7 +311,7 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, self.assertAllClose(out, out_arr, atol=1e-5) def test_correctness_empty(self): - """Tests the inner product between two empty circuits.""" + """Tests the inner product with empty circuits.""" empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) @@ -323,6 +323,17 @@ def test_correctness_empty(self): expected = np.array([[1.0]], dtype=np.complex64) self.assertAllClose(out, expected) + qubit = cirq.GridQubit(0, 0) + non_empty_cicuit = util.convert_to_tensor([cirq.Circuit(cirq.X(qubit))]) + empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) + empty_values = tf.convert_to_tensor([[]]) + other_program = util.convert_to_tensor([[cirq.Circuit()]]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'qubits not found'): + inner_product_op.inner_product(non_empty_cicuit, empty_symbols, + empty_values, other_program) + @parameterized.parameters([ { 'n_qubits': 5, @@ -377,8 +388,7 @@ def test_tf_gradient_correctness_with_symbols(self, n_qubits, batch_size, symbol_values, other_programs) out = tape.gradient(ip, symbol_values) - out_arr = np.empty((batch_size, inner_dim_size, n_params), - dtype=np.complex64) + out_arr = np.zeros((batch_size, n_params), dtype=np.complex64) # dx came from _GRAD_EPS of core/src/adj_util.cc dx = 5e-3 for i in range(batch_size): @@ -398,7 +408,7 @@ def test_tf_gradient_correctness_with_symbols(self, n_qubits, batch_size, final_wf_grad = 0.5 * (final_wf_p - final_wf_m) / dx for j in range(inner_dim_size): internal_wf = cirq.final_state_vector(other_batch[i][j]) - out_arr[i][j][k] = np.vdot(final_wf_grad, internal_wf) + out_arr[i][k] += np.vdot(final_wf_grad, internal_wf) self.assertAllClose(out, out_arr, atol=1e-3) diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc index a66a1d076..cf6fe9c32 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product.cc @@ -182,6 +182,12 @@ class TfqInnerProductOp : public tensorflow::OpKernel { qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); } for (int j = 0; j < 
other_fused_circuits[i].size(); j++) { + // (#679) Just ignore empty program + if (fused_circuits[i].size() == 0) { + (*output_tensor)(i, j) = std::complex(1, 0); + continue; + } + ss.SetStateZero(scratch); for (int k = 0; k < other_fused_circuits[i][j].size(); k++) { qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch); @@ -223,6 +229,13 @@ class TfqInnerProductOp : public tensorflow::OpKernel { const int nq = num_qubits[cur_batch_index]; + // (#679) Just ignore empty program + if (fused_circuits[cur_batch_index].size() == 0) { + (*output_tensor)(cur_batch_index, cur_internal_index) = + std::complex(1, 0); + continue; + } + if (cur_batch_index != old_batch_index) { // We've run into a new state vector we must compute. // Only compute a new state vector when we have to. diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index d7b54806b..91d486b0e 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -50,9 +50,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { void Compute(tensorflow::OpKernelContext* context) override { // TODO (mbbrough): add more dimension checks for other inputs here. const int num_inputs = context->num_inputs(); - OP_REQUIRES(context, num_inputs == 4, + OP_REQUIRES(context, num_inputs == 5, tensorflow::errors::InvalidArgument(absl::StrCat( - "Expected 4 inputs, got ", num_inputs, " inputs."))); + "Expected 5 inputs, got ", num_inputs, " inputs."))); // Create the output Tensor. const int output_dim_batch_size = context->input(0).dim_size(0); @@ -64,12 +64,11 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { output_dim_symbol_size, " symbols."))); tensorflow::TensorShape output_shape; output_shape.AddDim(output_dim_batch_size); - output_shape.AddDim(output_dim_internal_size); output_shape.AddDim(output_dim_symbol_size); tensorflow::Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); - auto output_tensor = output->tensor, 3>(); + auto output_tensor = output->matrix>(); // Parse program protos. std::vector programs; @@ -160,19 +159,39 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { max_num_qubits = std::max(max_num_qubits, num); } + // Get downstream gradients. + std::vector> downstream_grads; + OP_REQUIRES_OK(context, GetPrevGrads(context, &downstream_grads)); + + OP_REQUIRES(context, downstream_grads.size() == programs.size(), + tensorflow::errors::InvalidArgument(absl::StrCat( + "Number of gradients and circuits do not match. Got ", + downstream_grads.size(), " gradients and ", programs.size(), + " circuits."))); + + OP_REQUIRES( + context, downstream_grads[0].size() == output_dim_internal_size, + tensorflow::errors::InvalidArgument(absl::StrCat( + "Number of gradients and other_programs do not match. Got ", + downstream_grads[0].size(), " gradient entries and ", + output_dim_internal_size, " other programs."))); + + output_tensor.setZero(); + // Cross reference with standard google cloud compute instances // Memory ~= 2 * num_threads * (2 * 64 * 2 ** num_qubits in circuits) - // e2s2 = 2 CPU, 8GB -> Can safely do 25 since Memory = 4GB - // e2s4 = 4 CPU, 16GB -> Can safely do 25 since Memory = 8GB + // e2s2 = 2 CPU, 8GB -> Can safely do 23 since Memory = 4GB + // e2s4 = 4 CPU, 16GB -> Can safely do 23 since Memory = 8GB // ... 
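[Editor's note on the revised memory comment and the 26 -> 24 cutoff in the next hunk, under one hedged reading: a qsim state of nq qubits stores 2**nq complex64 amplitudes, i.e. 8 * 2**nq bytes, and this kernel now keeps several of them live per thread (sv, sv_adj, scratch, scratch2). Rough arithmetic:

    def per_thread_bytes(nq, n_states=4):
        # assumed: n_states live state vectors of 8 * 2**nq bytes each
        return n_states * 8 * 2**nq

    for nq in (23, 24, 26):
        print(nq, per_thread_bytes(nq) / 2**30, "GiB per thread")
    # 23 -> 0.25 GiB, 24 -> 0.5 GiB, 26 -> 2.0 GiB per thread, so the
    # ComputeSmall cutoff moves down once the extra scratch states are needed.
]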
- if (max_num_qubits >= 26 || output_dim_batch_size == 1) { + if (max_num_qubits >= 24 || output_dim_batch_size == 1) { ComputeLarge(num_qubits, maps, qsim_circuits, fused_circuits, partial_fused_circuits, gradient_gates, other_fused_circuits, - context, &output_tensor); + downstream_grads, context, &output_tensor); } else { ComputeSmall(num_qubits, max_num_qubits, maps, qsim_circuits, fused_circuits, partial_fused_circuits, gradient_gates, - other_fused_circuits, context, &output_tensor); + other_fused_circuits, downstream_grads, context, + &output_tensor); } } @@ -185,8 +204,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { partial_fused_circuits, const std::vector>& gradient_gates, const std::vector>& other_fused_circuits, + const std::vector>& downstream_grads, tensorflow::OpKernelContext* context, - tensorflow::TTypes, 3>::Tensor* output_tensor) { + tensorflow::TTypes>::Matrix* output_tensor) { // Instantiate qsim objects. const auto tfq_for = tfq::QsimFor(context); using Simulator = qsim::Simulator; @@ -197,7 +217,6 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { Simulator sim = Simulator(tfq_for); StateSpace ss = StateSpace(tfq_for); auto sv = ss.Create(largest_nq); - auto sv_adj = ss.Create(largest_nq); auto scratch = ss.Create(largest_nq); auto scratch2 = ss.Create(largest_nq); @@ -211,7 +230,6 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // need to switch to larger statespace. largest_nq = nq; sv = ss.Create(largest_nq); - sv_adj = ss.Create(largest_nq); scratch = ss.Create(largest_nq); scratch2 = ss.Create(largest_nq); } @@ -220,76 +238,80 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { j < fused_circuits[i].size(); j++) { qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); } + // Accumulate all other_programs. + // |phi> = sum_j downstream_grads[i][j]*|phi[i][j]> for (std::vector>>::size_type j = 0; j < other_fused_circuits[i].size(); j++) { - ss.Copy(sv, sv_adj); - ss.SetStateZero(scratch); + ss.SetStateZero(scratch2); for (std::vector>::size_type k = 0; k < other_fused_circuits[i][j].size(); k++) { - qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch); + qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch2); } - - // now sv is |psi>, scratch is |phi> - // initialize gradients for given |psi> and |phi>. - for (int k = 0; k < maps[i].size(); k++) { - (*output_tensor)(i, j, k) = std::complex(0, 0); + ss.Multiply(downstream_grads[i][j], scratch2); + if (j == 0) { + ss.Copy(scratch2, scratch); + } else { + ss.Add(scratch2, scratch); } - // Start adjoint differentiation. - for (int l = partial_fused_circuits[i].size() - 1; l >= 0; l--) { - for (int k = partial_fused_circuits[i][l].size() - 1; k >= 0; k--) { - ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], sv_adj); - ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], scratch); - } - if (l == 0) { - // last layer will have no parametrized gates so can break. - break; - } - - // Hit a parameterized gate. - // todo fix this copy. - auto cur_gate = - qsim_circuits[i].gates[gradient_gates[i][l - 1].index]; - ApplyGateDagger(sim, cur_gate, sv_adj); + } - // if applicable compute control qubit mask and control value bits. 
- uint64_t mask = 0; - uint64_t cbits = 0; - for (std::vector::size_type k = 0; - k < cur_gate.controlled_by.size(); k++) { - uint64_t control_loc = cur_gate.controlled_by[k]; - mask |= uint64_t{1} << control_loc; - cbits |= ((cur_gate.cmask >> k) & 1) << control_loc; - } + // now sv is |psi> + // scratch contains sum_j downstream_grads[i][j]*|phi[i][j]> + // Start adjoint differentiation. + for (int l = partial_fused_circuits[i].size() - 1; l >= 0; l--) { + for (int k = partial_fused_circuits[i][l].size() - 1; k >= 0; k--) { + ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], sv); + ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], scratch); + } + if (l == 0) { + // last layer will have no parametrized gates so can break. + break; + } - for (std::vector::size_type k = 0; - k < gradient_gates[i][l - 1].grad_gates.size(); k++) { - // Copy sv onto scratch2 in anticipation of non-unitary "gradient - // gate". - ss.Copy(sv_adj, scratch2); - if (!cur_gate.controlled_by.empty()) { - // Gradient of controlled gates puts zeros on diagonal which is - // the same as collapsing the state and then applying the - // non-controlled version of the gradient gate. - ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); - } - qsim::ApplyGate(sim, gradient_gates[i][l - 1].grad_gates[k], - scratch2); + // Hit a parameterized gate. + // todo fix this copy. + auto cur_gate = + qsim_circuits[i].gates[gradient_gates[i][l - 1].index]; + ApplyGateDagger(sim, cur_gate, sv); + + // if applicable compute control qubit mask and control value bits. + uint64_t mask = 0; + uint64_t cbits = 0; + for (std::vector::size_type k = 0; + k < cur_gate.controlled_by.size(); k++) { + uint64_t control_loc = cur_gate.controlled_by[k]; + mask |= uint64_t{1} << control_loc; + cbits |= ((cur_gate.cmask >> k) & 1) << control_loc; + } - // don't need not-found check since this is done upstream already. - const auto it = maps[i].find(gradient_gates[i][l - 1].params[k]); - const int loc = it->second.first; - // Apply finite differencing for adjoint gradients. - // Finite differencing enables applying multiple `gradient_gate` - // of a symbol at the same circuit. For analytic methods like - // parameter-shift we need to apply a single `gradient_gate` - // per a symbol. - std::complex result = ss.InnerProduct(scratch2, scratch); - (*output_tensor)(i, j, loc) += - std::complex(static_cast(result.real()), - static_cast(result.imag())); + for (std::vector::size_type k = 0; + k < gradient_gates[i][l - 1].grad_gates.size(); k++) { + // Copy sv onto scratch2 in anticipation of non-unitary "gradient + // gate". + ss.Copy(sv, scratch2); + if (!cur_gate.controlled_by.empty()) { + // Gradient of controlled gates puts zeros on diagonal which is + // the same as collapsing the state and then applying the + // non-controlled version of the gradient gate. + ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true); } - ApplyGateDagger(sim, cur_gate, scratch); + qsim::ApplyGate(sim, gradient_gates[i][l - 1].grad_gates[k], + scratch2); + + // don't need not-found check since this is done upstream already. + const auto it = maps[i].find(gradient_gates[i][l - 1].params[k]); + const int loc = it->second.first; + // Apply finite differencing for adjoint gradients. + // Finite differencing enables applying multiple `gradient_gate` + // of a symbol at the same circuit. For analytic methods like + // parameter-shift we need to apply a single `gradient_gate` + // per a symbol. 
+ std::complex result = ss.InnerProduct(scratch2, scratch); + (*output_tensor)(i, loc) += + std::complex(static_cast(result.real()), + static_cast(result.imag())); } + ApplyGateDagger(sim, cur_gate, scratch); } } } @@ -303,13 +325,14 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { partial_fused_circuits, const std::vector>& gradient_gates, const std::vector>& other_fused_circuits, + const std::vector>& downstream_grads, tensorflow::OpKernelContext* context, - tensorflow::TTypes, 3>::Tensor* output_tensor) { + tensorflow::TTypes>::Matrix* output_tensor) { const auto tfq_for = qsim::SequentialFor(1); using Simulator = qsim::Simulator; using StateSpace = Simulator::StateSpace; - const int output_dim_internal_size = output_tensor->dimension(1); + const int output_dim_internal_size = other_fused_circuits[0].size(); auto DoWork = [&](int start, int end) { int old_batch_index = -2; @@ -339,8 +362,6 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { scratch = ss.Create(largest_nq); scratch2 = ss.Create(largest_nq); } - // no need to update scratch_state since ComputeExpectation - // will take care of things for us. ss.SetStateZero(sv); for (std::vector>::size_type j = 0; j < fused_circuits[cur_batch_index].size(); j++) { @@ -348,7 +369,6 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { } } - ss.Copy(sv, sv_adj); ss.SetStateZero(scratch); for (std::vector>::size_type k = 0; k < @@ -358,14 +378,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { sim, other_fused_circuits[cur_batch_index][cur_internal_index][k], scratch); } - // now sv is |psi>, scratch is |phi> - // initialize gradients for given |psi> and |phi>. - for (int k = 0; k < maps[cur_batch_index].size(); k++) { - (*output_tensor)(cur_batch_index, cur_internal_index, k) = - std::complex(0, 0); - } // Start adjoint differentiation. + ss.Copy(sv, sv_adj); for (int l = partial_fused_circuits[cur_batch_index].size() - 1; l >= 0; l--) { for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1; @@ -399,8 +414,8 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { for (int k = 0; k < gradient_gates[cur_batch_index][l - 1].grad_gates.size(); k++) { - // Copy sv onto scratch2 in anticipation of non-unitary "gradient - // gate". + // Copy sv_adj onto scratch2 in anticipation of non-unitary + // "gradient gate". ss.Copy(sv_adj, scratch2); if (!cur_gate.controlled_by.empty()) { // Gradient of controlled gates puts zeros on diagonal which is @@ -422,9 +437,10 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { // parameter-shift we need to apply a single `gradient_gate` // per a symbol. 
std::complex result = ss.InnerProduct(scratch2, scratch); - (*output_tensor)(cur_batch_index, cur_internal_index, loc) += + (*output_tensor)(cur_batch_index, loc) += + (downstream_grads[cur_batch_index][cur_internal_index] * std::complex(static_cast(result.real()), - static_cast(result.imag())); + static_cast(result.imag()))); } ApplyGateDagger(sim, cur_gate, scratch); } @@ -448,7 +464,8 @@ REGISTER_OP("TfqInnerProductAdjGrad") .Input("symbol_names: string") .Input("symbol_values: float") .Input("other_programs: string") - .Output("inner_products: complex64") + .Input("downstream_grads: float") + .Output("inner_products_adj_grad: complex64") .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) { tensorflow::shape_inference::ShapeHandle programs_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape)); @@ -462,14 +479,15 @@ REGISTER_OP("TfqInnerProductAdjGrad") tensorflow::shape_inference::ShapeHandle other_programs_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &other_programs_shape)); + tensorflow::shape_inference::ShapeHandle downstream_grads_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &downstream_grads_shape)); + tensorflow::shape_inference::DimensionHandle output_rows = c->Dim(programs_shape, 0); tensorflow::shape_inference::DimensionHandle output_cols = - c->Dim(other_programs_shape, 1); - tensorflow::shape_inference::DimensionHandle n_symbols = c->Dim(symbol_names_shape, 0); std::vector dims = { - output_rows, output_cols, n_symbols}; + output_rows, output_cols}; c->set_output(0, c->MakeShape(dims)); return tensorflow::Status::OK(); From dbc80d6f896138ee31cb696192c22e8d5e516137 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Wed, 10 Mar 2021 19:46:25 +0900 Subject: [PATCH 16/21] Mike's feedback --- ...ner_product_adj_grad_op_test.py => inner_product_grad_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tensorflow_quantum/core/ops/math_ops/{inner_product_adj_grad_op_test.py => inner_product_grad_test.py} (100%) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py similarity index 100% rename from tensorflow_quantum/core/ops/math_ops/inner_product_adj_grad_op_test.py rename to tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py From 58937209e3f5ee6c9e80ba93796938e0bd74a02b Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Wed, 10 Mar 2021 19:47:27 +0900 Subject: [PATCH 17/21] Mike's feedback --- tensorflow_quantum/core/ops/cirq_ops_test.py | 8 +- tensorflow_quantum/core/ops/math_ops/BUILD | 4 +- .../ops/math_ops/inner_product_grad_test.py | 180 ++++++++---------- .../core/ops/math_ops/inner_product_op.py | 58 ++---- .../ops/math_ops/inner_product_op_test.py | 57 ++---- .../math_ops/tfq_inner_product_adj_grad.cc | 37 ++-- .../core/ops/tfq_simulate_ops_test.py | 4 +- tensorflow_quantum/core/src/util_qsim.h | 34 ++++ 8 files changed, 172 insertions(+), 210 deletions(-) diff --git a/tensorflow_quantum/core/ops/cirq_ops_test.py b/tensorflow_quantum/core/ops/cirq_ops_test.py index 957a7939f..94f945640 100644 --- a/tensorflow_quantum/core/ops/cirq_ops_test.py +++ b/tensorflow_quantum/core/ops/cirq_ops_test.py @@ -414,8 +414,8 @@ def test_sampling_output_padding(self, op, all_n_qubits, n_samples): this_expected_output[:, :max(all_n_qubits) - n_qubits] = -2 expected_outputs.append(this_expected_output) circuits.append( - cirq.Circuit( - *cirq.X.on_each(*cirq.GridQubit.rect(1, n_qubits)))) + cirq.Circuit(*cirq.X.on_each( + 
*cirq.GridQubit.rect(1, n_qubits)))) results = op(util.convert_to_tensor(circuits), [], [[]] * len(circuits), [n_samples]).numpy() self.assertAllClose(expected_outputs, results) @@ -461,8 +461,8 @@ def run_sweep(self, program, params, repetitions): circuits = [] for n_qubits in all_n_qubits: circuits.append( - cirq.Circuit( - *cirq.X.on_each(*cirq.GridQubit.rect(1, n_qubits)))) + cirq.Circuit(*cirq.X.on_each( + *cirq.GridQubit.rect(1, n_qubits)))) test_results = this_op(util.convert_to_tensor(circuits), [], [[]] * len(circuits), [n_samples]).numpy() diff --git a/tensorflow_quantum/core/ops/math_ops/BUILD b/tensorflow_quantum/core/ops/math_ops/BUILD index 05645398c..af765903e 100644 --- a/tensorflow_quantum/core/ops/math_ops/BUILD +++ b/tensorflow_quantum/core/ops/math_ops/BUILD @@ -86,8 +86,8 @@ py_test( ) py_test( - name = "inner_product_adj_grad_op_test", - srcs = ["inner_product_adj_grad_op_test.py"], + name = "inner_product_grad_test", + srcs = ["inner_product_grad_test.py"], python_version = "PY3", deps = [ ":inner_product_op_py", diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py index 5664762b4..444974a38 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py @@ -51,68 +51,65 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'programs must be rank 1'): # Circuit tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor([circuit_batch]), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch), - prev_grad) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor([circuit_batch]), + symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_names must be rank 1.'): # symbol_names tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), np.array([symbol_names]), - symbol_values_array, util.convert_to_tensor(other_batch), - prev_grad) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), + np.array([symbol_names]), symbol_values_array, + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_values must be rank 2.'): # symbol_values_array tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, np.array([symbol_values_array]), - util.convert_to_tensor(other_batch), - prev_grad) + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_values must be rank 2.'): # symbol_values_array tensor has too few dimensions. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array[0], util.convert_to_tensor(other_batch), - prev_grad) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), + symbol_names, symbol_values_array[0], + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'other_programs must be rank 2.'): # other_programs tensor has too few dimensions. 
- inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(circuit_batch), - prev_grad) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), + symbol_names, symbol_values_array, + util.convert_to_tensor(circuit_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'other_programs must be rank 2.'): # pauli_sums tensor has too many dimensions. - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in other_batch]), - prev_grad) + util.convert_to_tensor([[x] for x in other_batch]), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Unparseable proto'): # circuit tensor has the right type but invalid values. - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( ['junk'] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch), - prev_grad) + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Could not find symbol in parameter map'): # symbol_names tensor has the right type but invalid values. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), ['junk'], - symbol_values_array, util.convert_to_tensor(other_batch), - prev_grad) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), + ['junk'], symbol_values_array, + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'not found in reference circuit'): @@ -121,11 +118,10 @@ def test_inner_product_adj_grad_inputs(self): new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] new_circuits, _ = util.random_circuit_resolver_batch( new_qubits, batch_size) - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits]), - prev_grad) + util.convert_to_tensor([[x] for x in new_circuits]), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'not found in paired circuit'): @@ -134,60 +130,56 @@ def test_inner_product_adj_grad_inputs(self): new_qubits = cirq.GridQubit.rect(1, n_qubits - 1) new_circuits, _ = util.random_circuit_resolver_batch( new_qubits, batch_size) - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in new_circuits]), - prev_grad) + util.convert_to_tensor([[x] for x in new_circuits]), prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # circuits tensor has the wrong type. - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( [1.0] * batch_size, symbol_names, symbol_values_array, - util.convert_to_tensor(other_batch), - prev_grad) + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # symbol_names tensor has the wrong type. 
- inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), [0.1234], - symbol_values_array, util.convert_to_tensor(other_batch), - prev_grad) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), + [0.1234], symbol_values_array, + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): # symbol_values tensor has the wrong type. - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - [['junk']] * batch_size, util.convert_to_tensor(other_batch), - prev_grad) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), + symbol_names, [['junk']] * batch_size, + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # other_programs tensor has the wrong type. - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, [[1.0]] * batch_size, - prev_grad) + symbol_values_array, [[1.0]] * batch_size, prev_grad) with self.assertRaisesRegex(TypeError, 'missing'): # we are missing an argument. # pylint: disable=no-value-for-parameter - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, - prev_grad) + symbol_values_array, prev_grad) # pylint: enable=no-value-for-parameter with self.assertRaisesRegex(TypeError, 'positional arguments'): # pylint: disable=too-many-function-args - inner_product_op.inner_product_adj_grad( - util.convert_to_tensor(circuit_batch), symbol_names, - symbol_values_array, util.convert_to_tensor(other_batch), - prev_grad, []) + inner_product_op._inner_product_adj_grad( + util.convert_to_tensor(circuit_batch), + symbol_names, symbol_values_array, + util.convert_to_tensor(other_batch), prev_grad, []) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, expected_regex='do not match'): # batch programs has wrong batch size. - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor(other_batch[:int(batch_size * 0.5)]), @@ -196,48 +188,45 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, expected_regex='do not match'): # batch programs has wrong batch size. - inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array[::int(batch_size * 0.5)], - util.convert_to_tensor(other_batch), - prev_grad) + util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex( tf.errors.InvalidArgumentError, expected_regex='Found symbols in other_programs'): # other_programs has symbols. 
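            # (circuit_batch doubles as other_programs here, so it still
            #  carries the free symbols listed in symbol_names)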
- inner_product_op.inner_product_adj_grad( + inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, - util.convert_to_tensor([[x] for x in circuit_batch]), - prev_grad) + util.convert_to_tensor([[x] for x in circuit_batch]), prev_grad) - res = inner_product_op.inner_product_adj_grad( + res = inner_product_op._inner_product_adj_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array.astype(np.float64), - util.convert_to_tensor(other_batch), - prev_grad) + util.convert_to_tensor(other_batch), prev_grad) self.assertDTypeEqual(res, np.complex64) @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 1, # ComputeLarge + 'batch_size': 1, 'inner_dim_size': 5 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 5 }, ]) @@ -266,14 +255,14 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, symbol_names_tensor = tf.convert_to_tensor(symbol_names, dtype=tf.dtypes.string) symbol_values = tf.convert_to_tensor(symbol_values_array) - prev_grad = np.random.normal(size=(batch_size, inner_dim_size)).astype( - np.float32) + prev_grad = tf.cast(tf.random.normal((batch_size, inner_dim_size)), + dtype=tf.complex64) - out = inner_product_op.inner_product_adj_grad(programs, - symbol_names_tensor, - symbol_values, - other_programs, - prev_grad) + out = inner_product_op._inner_product_adj_grad(programs, + symbol_names_tensor, + symbol_values, + other_programs, + prev_grad) out_arr = np.zeros((batch_size, n_params), dtype=np.complex64) # dx came from _GRAD_EPS of core/src/adj_util.cc @@ -295,30 +284,30 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, final_wf_grad = 0.5 * (final_wf_p - final_wf_m) / dx for j, other in enumerate(other_batch[i]): internal_wf = cirq.final_state_vector(other) - out_arr[i][k] += prev_grad[i][j] * np.vdot( - final_wf_grad, internal_wf) + out_arr[i][k] += (prev_grad[i][j] * + np.vdot(final_wf_grad, internal_wf)) - self.assertAllClose(out, out_arr, atol=1e-3) + self.assertAllClose(out, np.conj(out_arr), atol=1e-3) @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 1, # ComputeLarge + 'batch_size': 1, 'inner_dim_size': 5 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 5 }, ]) @@ -343,10 +332,9 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): - inner_product_op.inner_product_adj_grad(programs, symbol_names, - symbol_values, - other_programs, - prev_grad) + inner_product_op._inner_product_adj_grad(programs, symbol_names, + symbol_values, + other_programs, prev_grad) def test_correctness_empty(self): """Tests the inner product adj grad between two empty circuits.""" @@ -359,9 +347,10 @@ def test_correctness_empty(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): - inner_product_op.inner_product_adj_grad( - empty_cicuit, empty_symbols, empty_values, other_program, - prev_grad) + inner_product_op._inner_product_adj_grad(empty_cicuit, + 
empty_symbols, + empty_values, + other_program, prev_grad) empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) symbol_names = tf.convert_to_tensor(symbol_names, @@ -369,11 +358,10 @@ def test_correctness_empty(self): symbol_values = tf.convert_to_tensor([[0.0 for _ in range(2)]]) other_program = util.convert_to_tensor([[cirq.Circuit()]]) - out = inner_product_op.inner_product_adj_grad(empty_cicuit, - symbol_names, - symbol_values, - other_program, - prev_grad) + out = inner_product_op._inner_product_adj_grad(empty_cicuit, + symbol_names, + symbol_values, + other_program, prev_grad) expected = np.zeros((1, len(symbol_names)), dtype=np.complex64) self.assertAllClose(out, expected) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index 35d82eb05..dd371dd33 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -20,8 +20,8 @@ MATH_OP_MODULE = load_module(os.path.join("math_ops", "_tfq_math_ops.so")) -def inner_product_adj_grad(programs, symbol_names, symbol_values, - other_programs, prev_grad): +def _inner_product_adj_grad(programs, symbol_names, symbol_values, + other_programs, prev_grad): """Calculate the adjoint gradients of the inner product between circuits. Compute the gradients of the (potentially many) inner products between @@ -32,40 +32,9 @@ def inner_product_adj_grad(programs, symbol_names, symbol_values, \psi_{\text{other_programs[j]}} \rangle $ - >>> symbols = sympy.symbols('alpha beta') - >>> qubits = cirq.GridQubit.rect(1, 2) - >>> reference_circuits = [ - ... cirq.Circuit((cirq.H**symbols[0]).on_each(qubits)), - ... cirq.Circuit( - ... cirq.X(qubits[0]) ** symbols[0], - ... cirq.Y(qubits[1]) ** symbols[1]) - ... ] - >>> other_circuits = [ - ... cirq.Circuit(cirq.X.on_each(qubits)), - ... cirq.Circuit((cirq.Y**0.125).on_each(qubits)), - ... cirq.Circuit((cirq.X**0.5).on_each(qubits)) - ... ] - >>> reference_tensor = tfq.convert_to_tensor(reference_circuits) - >>> symbol_tensor = tf.convert_to_tensor([s.name for s in symbols]) - >>> values_tensor = tf.convert_to_tensor(np.arange(4).reshape(2, 2)) - >>> other_tensor = tfq.convert_to_tensor([other_circuits, other_circuits]) - >>> prev_grad = tf.convert_to_tensor(tf.ones((2, 2))) - >>> grad_ip = tfq.math.inner_product_adj_grad( - ... reference_tensor, symbol_tensor, values_tensor, - ... other_tensor, prev_grad) - >>> grad_ip - tf.Tensor( - [[ 0.6361127+0.6856381j , 0. +0.j ], - [ 0.5629374-0.87751585j, -2.4266903+1.2809625j ]], shape=(2, 2), - dtype=complex64) - - - Note: `other_programs` must not contain any free symbols. These can be resolved beforehand with `tfq.resolve_parameters`. - Note: Currently this op is not differentiable. - Note: len(symbol_names) (=n_params) should be a positive integer. Args: @@ -92,9 +61,12 @@ def inner_product_adj_grad(programs, symbol_names, symbol_values, other_programs[i] w.r.t. `symbol_names[j]` and `programs[i]` is resolved with `symbol_values[i]`. """ - return MATH_OP_MODULE.tfq_inner_product_adj_grad( - programs, symbol_names, tf.cast(symbol_values, tf.float32), - other_programs, tf.cast(prev_grad, tf.float32)) + # Due to TF gradient scheme, we return complex conjugate derivative. 
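+    # (TensorFlow's convention for complex-valued outputs is that a custom
+    # gradient return conj(d out / d in) merged with `prev_grad`; the kernel
+    # produces the analytic derivative <d psi / d symbol | phi>, so taking
+    # the conjugate is what makes the np.conj(...) checks in the tests pass.)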
+ return tf.math.conj( + MATH_OP_MODULE.tfq_inner_product_adj_grad( + programs, symbol_names, tf.cast(symbol_values, tf.float32), + other_programs, tf.cast(prev_grad, tf.float32))) + @tf.custom_gradient def inner_product(programs, symbol_names, symbol_values, other_programs): @@ -138,8 +110,6 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): Note: `other_programs` must not contain any free symbols. These can be resolved beforehand with `tfq.resolve_parameters`. - Note: Currently this op is differentiable via adjoint differentiation. - Args: programs: `tf.Tensor` of strings with shape [batch_size] containing the string representations of the circuits @@ -162,10 +132,14 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): """ def grad(dy): - if symbol_names.shape[0] == 0: - return [None, None, None, None] - inner_prod_grad = inner_product_adj_grad( - programs, symbol_names, symbol_values, other_programs, dy) + + def _true_grad(): + return _inner_product_adj_grad(programs, symbol_names, + symbol_values, other_programs, dy) + + inner_prod_grad = tf.cond(tf.math.equal(symbol_names.shape[0], 0), + lambda: tf.zeros_like(symbol_values), + _true_grad) return [None, None, inner_prod_grad, None] return MATH_OP_MODULE.tfq_inner_product(programs, symbol_names, diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py index fcf74da5b..7d39cc2ec 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py @@ -200,22 +200,22 @@ def test_inner_product_inputs(self): @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 1, # ComputeLarge + 'batch_size': 1, 'inner_dim_size': 5 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 5 }, ]) @@ -261,22 +261,22 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 1, # ComputeLarge + 'batch_size': 1, 'inner_dim_size': 5 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 2, 'inner_dim_size': 1 }, { 'n_qubits': 10, - 'batch_size': 10, # ComputeSmall + 'batch_size': 3, 'inner_dim_size': 2 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall + 'batch_size': 10, 'inner_dim_size': 5 }, ]) @@ -313,48 +313,39 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, def test_correctness_empty(self): """Tests the inner product with empty circuits.""" - empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) + empty_circuit = util.convert_to_tensor([cirq.Circuit()]) empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) empty_values = tf.convert_to_tensor([[]]) other_program = util.convert_to_tensor([[cirq.Circuit()]]) - out = inner_product_op.inner_product(empty_cicuit, empty_symbols, + out = inner_product_op.inner_product(empty_circuit, empty_symbols, empty_values, other_program) expected = np.array([[1.0]], dtype=np.complex64) self.assertAllClose(out, expected) qubit = cirq.GridQubit(0, 0) - non_empty_cicuit = util.convert_to_tensor([cirq.Circuit(cirq.X(qubit))]) + non_empty_circuit = util.convert_to_tensor( + [cirq.Circuit(cirq.X(qubit))]) empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) empty_values 
= tf.convert_to_tensor([[]]) other_program = util.convert_to_tensor([[cirq.Circuit()]]) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'qubits not found'): - inner_product_op.inner_product(non_empty_cicuit, empty_symbols, + inner_product_op.inner_product(non_empty_circuit, empty_symbols, empty_values, other_program) @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 1, # ComputeLarge + 'batch_size': 1, 'inner_dim_size': 5 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall - 'inner_dim_size': 1 - }, - { - 'n_qubits': 10, - 'batch_size': 10, # ComputeSmall + 'batch_size': 3, 'inner_dim_size': 2 }, - { - 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall - 'inner_dim_size': 5 - }, ]) def test_tf_gradient_correctness_with_symbols(self, n_qubits, batch_size, inner_dim_size): @@ -410,29 +401,19 @@ def test_tf_gradient_correctness_with_symbols(self, n_qubits, batch_size, internal_wf = cirq.final_state_vector(other_batch[i][j]) out_arr[i][k] += np.vdot(final_wf_grad, internal_wf) - self.assertAllClose(out, out_arr, atol=1e-3) + self.assertAllClose(out, np.conj(out_arr), atol=1e-3) @parameterized.parameters([ { 'n_qubits': 5, - 'batch_size': 1, # ComputeLarge + 'batch_size': 1, 'inner_dim_size': 5 }, { 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall - 'inner_dim_size': 1 - }, - { - 'n_qubits': 10, - 'batch_size': 10, # ComputeSmall + 'batch_size': 3, 'inner_dim_size': 2 }, - { - 'n_qubits': 5, - 'batch_size': 10, # ComputeSmall - 'inner_dim_size': 5 - }, ]) def test_tf_gradient_correctness_without_symbols(self, n_qubits, batch_size, inner_dim_size): @@ -457,7 +438,7 @@ def test_tf_gradient_correctness_without_symbols(self, n_qubits, batch_size, ip = inner_product_op.inner_product(programs, symbol_names, symbol_values, other_programs) out = tape.gradient(ip, symbol_values) - self.assertIsNone(out) + self.assertAllClose(out, tf.zeros_like(symbol_values), atol=1e-3) def test_correctness_no_circuit(self): """Test the inner product between no circuits.""" diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc index 91d486b0e..ee7ec9042 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc @@ -169,12 +169,11 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { downstream_grads.size(), " gradients and ", programs.size(), " circuits."))); - OP_REQUIRES( - context, downstream_grads[0].size() == output_dim_internal_size, - tensorflow::errors::InvalidArgument(absl::StrCat( - "Number of gradients and other_programs do not match. Got ", - downstream_grads[0].size(), " gradient entries and ", - output_dim_internal_size, " other programs."))); + OP_REQUIRES(context, downstream_grads[0].size() == output_dim_internal_size, + tensorflow::errors::InvalidArgument(absl::StrCat( + "Number of gradients and other_programs do not match. Got ", + downstream_grads[0].size(), " gradient entries and ", + output_dim_internal_size, " other programs."))); output_tensor.setZero(); @@ -238,22 +237,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { j < fused_circuits[i].size(); j++) { qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv); } - // Accumulate all other_programs. 
-          // |phi> = sum_j downstream_grads[i][j]*|phi[i][j]>
-          for (std::vector<std::vector<qsim::GateFused<QsimGate>>>::size_type j = 0;
-               j < other_fused_circuits[i].size(); j++) {
-            ss.SetStateZero(scratch2);
-            for (std::vector<qsim::GateFused<QsimGate>>::size_type k = 0;
-                 k < other_fused_circuits[i][j].size(); k++) {
-              qsim::ApplyFusedGate(sim, other_fused_circuits[i][j][k], scratch2);
-            }
-            ss.Multiply(downstream_grads[i][j], scratch2);
-            if (j == 0) {
-              ss.Copy(scratch2, scratch);
-            } else {
-              ss.Add(scratch2, scratch);
-            }
-          }
+
+          auto status = AccumulateFusedCircuits(
+              downstream_grads, other_fused_circuits, sim, ss, scratch2, scratch);

           // now sv is |psi>
           // scratch contains sum_j downstream_grads[i][j]*|phi[i][j]>
@@ -270,8 +256,7 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel {

             // Hit a parameterized gate.
             // todo fix this copy.
-            auto cur_gate =
-                qsim_circuits[i].gates[gradient_gates[i][l - 1].index];
+            auto cur_gate = qsim_circuits[i].gates[gradient_gates[i][l - 1].index];
             ApplyGateDagger(sim, cur_gate, sv);

             // if applicable compute control qubit mask and control value bits.
@@ -439,8 +424,8 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel {
           std::complex<double> result = ss.InnerProduct(scratch2, scratch);
           (*output_tensor)(cur_batch_index, loc) +=
               (downstream_grads[cur_batch_index][cur_internal_index] *
-               std::complex<float>(static_cast<float>(result.real()),
-                                   static_cast<float>(result.imag())));
+                std::complex<float>(static_cast<float>(result.real()),
+                                    static_cast<float>(result.imag())));
           }
           ApplyGateDagger(sim, cur_gate, scratch);
         }
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_ops_test.py b/tensorflow_quantum/core/ops/tfq_simulate_ops_test.py
index ae4addb58..e81604624 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_ops_test.py
+++ b/tensorflow_quantum/core/ops/tfq_simulate_ops_test.py
@@ -453,8 +453,8 @@ def test_sampling_output_padding(self, all_n_qubits, n_samples):
         this_expected_output[:, :max(all_n_qubits) - n_qubits] = -2
         expected_outputs.append(this_expected_output)
         circuits.append(
-            cirq.Circuit(
-                *cirq.X.on_each(*cirq.GridQubit.rect(1, n_qubits))))
+            cirq.Circuit(*cirq.X.on_each(
+                *cirq.GridQubit.rect(1, n_qubits))))
         results = op(util.convert_to_tensor(circuits), [],
                      [[]] * len(circuits), [n_samples]).numpy()
         self.assertAllClose(expected_outputs, results)
diff --git a/tensorflow_quantum/core/src/util_qsim.h b/tensorflow_quantum/core/src/util_qsim.h
index 5024d47bf..1db22506b 100644
--- a/tensorflow_quantum/core/src/util_qsim.h
+++ b/tensorflow_quantum/core/src/util_qsim.h
@@ -35,6 +35,7 @@ namespace tfq {

 typedef qsim::Cirq::GateCirq<float> QsimGate;
 typedef qsim::Circuit<QsimGate> QsimCircuit;
+typedef std::vector<qsim::GateFused<QsimGate>> QsimFusedCircuit;

 // Custom FOR loop struct to use TF threadpool instead of native
 // qsim OpenMP or serial FOR implementations.
@@ -315,6 +316,39 @@ tensorflow::Status AccumulateOperators(
   return status;
 }

+// Assumes coefficients.size() == fused_circuits.size() and
+// coefficients[0].size() == fused_circuits[0].size().
+// scratch has been created, but does not require initialization.
+// dest has been created, but does not require initialization.
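+// For example, a row coefficients[i] = {0.5, 2.0} over fused circuits A and
+// B accumulates dest = 0.5*|A> + 2.0*|B>; in general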
+// |phi> = sum_j coefficients[i][j]*|phi[i][j]>
+template <typename SimT, typename StateSpaceT, typename StateT>
+tensorflow::Status AccumulateFusedCircuits(
+    const std::vector<std::vector<float>>& coefficients,
+    const std::vector<std::vector<QsimFusedCircuit>>& fused_circuits,
+    const SimT& sim, const StateSpaceT& ss, StateT& scratch, StateT& dest) {
+  tensorflow::Status status = tensorflow::Status::OK();
+  ss.SetAllZeros(dest);
+
+  DCHECK_EQ(coefficients.size(), fused_circuits.size());
+  DCHECK_EQ(coefficients[0].size(), fused_circuits[0].size());
+
+  for (std::vector<std::vector<QsimFusedCircuit>>::size_type i = 0;
+       i < fused_circuits.size(); i++) {
+    for (std::vector<std::vector<qsim::GateFused<QsimGate>>>::size_type j = 0;
+         j < fused_circuits[i].size(); j++) {
+      ss.SetStateZero(scratch);
+      for (std::vector<qsim::GateFused<QsimGate>>::size_type k = 0;
+           k < fused_circuits[i][j].size(); k++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j][k], scratch);
+      }
+      ss.Multiply(coefficients[i][j], scratch);
+      ss.Add(scratch, dest);
+    }
+  }
+
+  return status;
+}
+
 }  // namespace tfq

 #endif  // UTIL_QSIM_H_

From 7e5cd811982c2f5bde802aad0a6ac82e952c4f0c Mon Sep 17 00:00:00 2001
From: Jae Yoo
Date: Wed, 10 Mar 2021 22:44:42 +0900
Subject: [PATCH 18/21] Add Mike's last feedback

---
 .../ops/math_ops/inner_product_grad_test.py   | 15 ++++
 .../ops/math_ops/inner_product_op_test.py     | 15 ++++
 .../math_ops/tfq_inner_product_adj_grad.cc    |  5 +-
 tensorflow_quantum/core/src/util_qsim.h       | 29 +++----
 tensorflow_quantum/core/src/util_qsim_test.cc | 75 +++++++++++++++++++
 5 files changed, 120 insertions(+), 19 deletions(-)

diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py
index 444974a38..3bbefb17d 100644
--- a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py
+++ b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py
@@ -365,6 +365,21 @@ def test_correctness_empty(self):
         expected = np.zeros((1, len(symbol_names)), dtype=np.complex64)
         self.assertAllClose(out, expected)

+    def test_correctness_no_circuit(self):
+        """Test the inner product grad between no circuits."""
+
+        empty_circuit = tf.raw_ops.Empty(shape=(0,), dtype=tf.string)
+        empty_symbols = tf.raw_ops.Empty(shape=(0,), dtype=tf.string)
+        empty_values = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32)
+        other_program = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.string)
+        empty_pred_grad = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'number of symbols must be a positive'):
+            out = inner_product_op._inner_product_adj_grad(
+                empty_circuit, empty_symbols, empty_values, other_program,
+                empty_pred_grad)
+

 if __name__ == "__main__":
     tf.test.main()
diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py
index 7ed414b31..ee9fd08b4 100644
--- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py
+++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py
@@ -461,6 +461,21 @@ def test_correctness_no_circuit(self):
                                              empty_values, other_program)
         self.assertShapeEqual(np.zeros((0, 0)), out)

+    def test_tf_gradient_correctness_no_circuit(self):
+        """Test the inner product grad between no circuits."""
+
+        empty_circuit = tf.raw_ops.Empty(shape=(0,), dtype=tf.string)
+        empty_symbols = tf.raw_ops.Empty(shape=(0,), dtype=tf.string)
+        empty_values = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32)
+        other_program = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.string)
+
+        with tf.GradientTape() as tape:
+            tape.watch(empty_values)
+            out = inner_product_op.inner_product(empty_circuit,
+                                                 empty_symbols,
+                                                 empty_values, other_program)
+
+        self.assertShapeEqual(np.zeros((0, 0)), out)
+

 if __name__ == "__main__":
     tf.test.main()
diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc
index ee7ec9042..04605f4ee 100644
--- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc
+++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc
@@ -238,8 +238,9 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel {
           qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
         }

-        auto status = AccumulateFusedCircuits(
-            downstream_grads, other_fused_circuits, sim, ss, scratch2, scratch);
+        auto status =
+            AccumulateFusedCircuits(downstream_grads[i], other_fused_circuits[i],
+                                    sim, ss, scratch2, scratch);

         // now sv is |psi>
         // scratch contains sum_j downstream_grads[i][j]*|phi[i][j]>
diff --git a/tensorflow_quantum/core/src/util_qsim.h b/tensorflow_quantum/core/src/util_qsim.h
index 1db22506b..955085159 100644
--- a/tensorflow_quantum/core/src/util_qsim.h
+++ b/tensorflow_quantum/core/src/util_qsim.h
@@ -316,34 +316,29 @@ tensorflow::Status AccumulateOperators(
   return status;
 }

-// Assumes coefficients.size() == fused_circuits.size() and
-// coefficients[0].size() == fused_circuits[0].size().
+// Assumes coefficients.size() == fused_circuits.size().
+// These are checked at the upstream.
 // scratch has been created, but does not require initialization.
 // dest has been created, but does not require initialization.
-// |phi> = sum_j coefficients[i][j]*|phi[i][j]>
+// scratch has garbage value.
+// |psi> = sum_i coefficients[i]*|phi[i]>
 template <typename SimT, typename StateSpaceT, typename StateT>
 tensorflow::Status AccumulateFusedCircuits(
-    const std::vector<std::vector<float>>& coefficients,
-    const std::vector<std::vector<QsimFusedCircuit>>& fused_circuits,
-    const SimT& sim, const StateSpaceT& ss, StateT& scratch, StateT& dest) {
+    const std::vector<float>& coefficients,
+    const std::vector<QsimFusedCircuit>& fused_circuits, const SimT& sim,
+    const StateSpaceT& ss, StateT& scratch, StateT& dest) {
   tensorflow::Status status = tensorflow::Status::OK();
   ss.SetAllZeros(dest);

-  DCHECK_EQ(coefficients.size(), fused_circuits.size());
-  DCHECK_EQ(coefficients[0].size(), fused_circuits[0].size());
-
   for (std::vector<std::vector<QsimFusedCircuit>>::size_type i = 0;
        i < fused_circuits.size(); i++) {
-    for (std::vector<std::vector<qsim::GateFused<QsimGate>>>::size_type j = 0;
+    ss.SetStateZero(scratch);
+    for (std::vector<qsim::GateFused<QsimGate>>::size_type j = 0;
          j < fused_circuits[i].size(); j++) {
-      ss.SetStateZero(scratch);
-      for (std::vector<qsim::GateFused<QsimGate>>::size_type k = 0;
-           k < fused_circuits[i][j].size(); k++) {
-        qsim::ApplyFusedGate(sim, fused_circuits[i][j][k], scratch);
-      }
-      ss.Multiply(coefficients[i][j], scratch);
-      ss.Add(scratch, dest);
+      qsim::ApplyFusedGate(sim, fused_circuits[i][j], scratch);
     }
+    ss.Multiply(coefficients[i], scratch);
+    ss.Add(scratch, dest);
   }

   return status;
diff --git a/tensorflow_quantum/core/src/util_qsim_test.cc b/tensorflow_quantum/core/src/util_qsim_test.cc
index 324d547bd..252621a68 100644
--- a/tensorflow_quantum/core/src/util_qsim_test.cc
+++ b/tensorflow_quantum/core/src/util_qsim_test.cc
@@ -40,6 +40,7 @@ using ::tfq::proto::PauliTerm;
 typedef absl::flat_hash_map<std::string, std::pair<int, float>> SymbolMap;
 typedef qsim::Cirq::GateCirq<float> QsimGate;
 typedef qsim::Circuit<QsimGate> QsimCircuit;
+typedef std::vector<qsim::GateFused<QsimGate>> QsimFusedCircuit;

 class TwoTermSampledExpectationFixture
     : public ::testing::TestWithParam<std::tuple<std::string, float>> {};

@@ -551,5 +552,79 @@ TEST(UtilQsimTest, AccumulateOperatorsEmpty) {
   EXPECT_NEAR(ss.GetAmpl(scratch, 3).imag(), 0.0, 1e-5);
 }

+TEST(UtilQsimTest, AccumulateFusedCircuitsBasic) {
+  // Create circuit
to prepare initial state. + std::vector simple_circuits(2, QsimCircuit()); + simple_circuits[0].num_qubits = 2; + simple_circuits[0].gates.push_back( + qsim::Cirq::XPowGate::Create(0, 1, 0.25, 0.0)); + simple_circuits[1].num_qubits = 2; + simple_circuits[1].gates.push_back( + qsim::Cirq::CXPowGate::Create(1, 1, 0, 1.0, 0.0)); + simple_circuits[1].gates.push_back( + qsim::Cirq::YPowGate::Create(2, 0, 0.5, 0.0)); + + // Initialize fused circuits. + std::vector fused_circuits; + for (int i = 0; i < 2; i++) { + fused_circuits.push_back( + qsim::BasicGateFuser().FuseGates( + qsim::BasicGateFuser::Parameter(), + simple_circuits[i].num_qubits, simple_circuits[i].gates)); + } + + // Instantiate qsim objects. + qsim::Simulator sim(1); + qsim::Simulator::StateSpace ss(1); + auto sv = ss.Create(2); + auto scratch = ss.Create(2); + auto dest = ss.Create(2); + + // Initialize coeffs. + std::vector coeffs = {1.23, 4.56}; + + AccumulateFusedCircuits(coeffs, fused_circuits, sim, ss, scratch, dest); + + // Scratch has coeffs[r][c] * fused circuits[r][c] where r, c = last indices. + // Check that dest got accumulated onto. + double accumulated_real[4] = {0.0, 0.0, 0.0, 0.0}; + double accumulated_imag[4] = {0.0, 0.0, 0.0, 0.0}; + for (unsigned int i = 0; i < 2; i++) { + ss.SetStateZero(sv); + for (const qsim::GateFused& fused_gate : fused_circuits[i]) { + qsim::ApplyFusedGate(sim, fused_gate, sv); + } + for (unsigned int k = 0; k < 4; k++) { + accumulated_real[k] += coeffs[i] * ss.GetAmpl(sv, k).real(); + accumulated_imag[k] += coeffs[i] * ss.GetAmpl(sv, k).imag(); + } + } + for (unsigned int k = 0; k < 4; k++) { + EXPECT_NEAR(ss.GetAmpl(dest, k).real(), accumulated_real[k], 1e-5); + EXPECT_NEAR(ss.GetAmpl(dest, k).imag(), accumulated_imag[k], 1e-5); + } +} + +TEST(UtilQsimTest, AccumulateFusedCircuitsEmpty) { + // Instantiate qsim objects. + qsim::Simulator sim(1); + qsim::Simulator::StateSpace ss(1); + auto scratch = ss.Create(2); + auto dest = ss.Create(2); + + AccumulateFusedCircuits({}, {}, sim, ss, scratch, dest); + + // scratch has garbage value. + // Check that dest contains all zeros. 
+ EXPECT_NEAR(ss.GetAmpl(dest, 0).real(), 0.0, 1e-5); + EXPECT_NEAR(ss.GetAmpl(dest, 0).imag(), 0.0, 1e-5); + EXPECT_NEAR(ss.GetAmpl(dest, 1).real(), 0.0, 1e-5); + EXPECT_NEAR(ss.GetAmpl(dest, 1).imag(), 0.0, 1e-5); + EXPECT_NEAR(ss.GetAmpl(dest, 2).real(), 0.0, 1e-5); + EXPECT_NEAR(ss.GetAmpl(dest, 2).imag(), 0.0, 1e-5); + EXPECT_NEAR(ss.GetAmpl(dest, 3).real(), 0.0, 1e-5); + EXPECT_NEAR(ss.GetAmpl(scratch, 3).imag(), 0.0, 1e-5); +} + } // namespace } // namespace tfq From e8ad8a3a47b7e0ae8f8bd1487140e9662500a3d7 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Wed, 10 Mar 2021 22:59:21 +0900 Subject: [PATCH 19/21] fix lint --- .../core/ops/math_ops/inner_product_grad_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py index 3bbefb17d..0488b0d42 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py @@ -256,7 +256,7 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, dtype=tf.dtypes.string) symbol_values = tf.convert_to_tensor(symbol_values_array) prev_grad = tf.cast(tf.random.normal((batch_size, inner_dim_size)), - dtype=tf.complex64) + tf.complex64) out = inner_product_op._inner_product_adj_grad(programs, symbol_names_tensor, @@ -376,7 +376,7 @@ def test_correctness_no_circuit(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'number of symbols must be a positive'): - out = inner_product_op._inner_product_adj_grad( + _ = inner_product_op._inner_product_adj_grad( empty_circuit, empty_symbols, empty_values, other_program, empty_pred_grad) From bdf9125a6b432ed80785932eccb762fc929fb91b Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Wed, 10 Mar 2021 23:43:03 +0900 Subject: [PATCH 20/21] Fix test error --- tensorflow_quantum/core/src/util_qsim_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow_quantum/core/src/util_qsim_test.cc b/tensorflow_quantum/core/src/util_qsim_test.cc index 252621a68..0740afa18 100644 --- a/tensorflow_quantum/core/src/util_qsim_test.cc +++ b/tensorflow_quantum/core/src/util_qsim_test.cc @@ -623,7 +623,6 @@ TEST(UtilQsimTest, AccumulateFusedCircuitsEmpty) { EXPECT_NEAR(ss.GetAmpl(dest, 2).real(), 0.0, 1e-5); EXPECT_NEAR(ss.GetAmpl(dest, 2).imag(), 0.0, 1e-5); EXPECT_NEAR(ss.GetAmpl(dest, 3).real(), 0.0, 1e-5); - EXPECT_NEAR(ss.GetAmpl(scratch, 3).imag(), 0.0, 1e-5); } } // namespace From ecfca7f8b1165f5f8659721e3b64a4dc9d7e2a29 Mon Sep 17 00:00:00 2001 From: Jae Yoo Date: Mon, 15 Mar 2021 22:09:37 +0900 Subject: [PATCH 21/21] Mike's feedback --- tensorflow_quantum/core/ops/math_ops/BUILD | 2 +- .../ops/math_ops/inner_product_grad_test.py | 85 ++++++++++--------- .../core/ops/math_ops/inner_product_op.py | 13 +-- ..._adj_grad.cc => tfq_inner_product_grad.cc} | 12 +-- 4 files changed, 57 insertions(+), 55 deletions(-) rename tensorflow_quantum/core/ops/math_ops/{tfq_inner_product_adj_grad.cc => tfq_inner_product_grad.cc} (98%) diff --git a/tensorflow_quantum/core/ops/math_ops/BUILD b/tensorflow_quantum/core/ops/math_ops/BUILD index af765903e..aa13ccadf 100644 --- a/tensorflow_quantum/core/ops/math_ops/BUILD +++ b/tensorflow_quantum/core/ops/math_ops/BUILD @@ -14,7 +14,7 @@ cc_binary( name = "_tfq_math_ops.so", srcs = [ "tfq_inner_product.cc", - "tfq_inner_product_adj_grad.cc", + "tfq_inner_product_grad.cc", ], copts = select({ ":windows": [ diff --git 
a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py index 0488b0d42..6a4e8423e 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests that specifically target tfq_inner_product_adj_grad.""" +"""Tests that specifically target tfq_inner_product_grad.""" import copy import numpy as np from absl.testing import parameterized @@ -24,9 +24,9 @@ class InnerProductAdjGradTest(tf.test.TestCase, parameterized.TestCase): - """Tests tfq_inner_product_adj_grad.""" + """Tests tfq_inner_product_grad.""" - def test_inner_product_adj_grad_inputs(self): + def test_inner_product_grad_inputs(self): """Makes sure that inner_product_adj_grad fails on bad inputs.""" n_qubits = 5 batch_size = 5 @@ -51,7 +51,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'programs must be rank 1'): # Circuit tensor has too many dimensions. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor([circuit_batch]), symbol_names, symbol_values_array, util.convert_to_tensor(other_batch), prev_grad) @@ -59,7 +59,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_names must be rank 1.'): # symbol_names tensor has too many dimensions. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), np.array([symbol_names]), symbol_values_array, util.convert_to_tensor(other_batch), prev_grad) @@ -67,7 +67,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_values must be rank 2.'): # symbol_values_array tensor has too many dimensions. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, np.array([symbol_values_array]), util.convert_to_tensor(other_batch), prev_grad) @@ -75,7 +75,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbol_values must be rank 2.'): # symbol_values_array tensor has too few dimensions. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array[0], util.convert_to_tensor(other_batch), prev_grad) @@ -83,7 +83,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'other_programs must be rank 2.'): # other_programs tensor has too few dimensions. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor(circuit_batch), prev_grad) @@ -91,7 +91,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'other_programs must be rank 2.'): # pauli_sums tensor has too many dimensions. 
- inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor([[x] for x in other_batch]), prev_grad) @@ -99,14 +99,14 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Unparseable proto'): # circuit tensor has the right type but invalid values. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( ['junk'] * batch_size, symbol_names, symbol_values_array, util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'Could not find symbol in parameter map'): # symbol_names tensor has the right type but invalid values. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), ['junk'], symbol_values_array, util.convert_to_tensor(other_batch), prev_grad) @@ -118,7 +118,7 @@ def test_inner_product_adj_grad_inputs(self): new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] new_circuits, _ = util.random_circuit_resolver_batch( new_qubits, batch_size) - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor([[x] for x in new_circuits]), prev_grad) @@ -130,48 +130,48 @@ def test_inner_product_adj_grad_inputs(self): new_qubits = cirq.GridQubit.rect(1, n_qubits - 1) new_circuits, _ = util.random_circuit_resolver_batch( new_qubits, batch_size) - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor([[x] for x in new_circuits]), prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # circuits tensor has the wrong type. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( [1.0] * batch_size, symbol_names, symbol_values_array, util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # symbol_names tensor has the wrong type. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), [0.1234], symbol_values_array, util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): # symbol_values tensor has the wrong type. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, [['junk']] * batch_size, util.convert_to_tensor(other_batch), prev_grad) with self.assertRaisesRegex(TypeError, 'Cannot convert'): # other_programs tensor has the wrong type. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, [[1.0]] * batch_size, prev_grad) with self.assertRaisesRegex(TypeError, 'missing'): # we are missing an argument. 
# pylint: disable=no-value-for-parameter - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, prev_grad) # pylint: enable=no-value-for-parameter with self.assertRaisesRegex(TypeError, 'positional arguments'): # pylint: disable=too-many-function-args - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor(other_batch), prev_grad, []) @@ -179,7 +179,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, expected_regex='do not match'): # batch programs has wrong batch size. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor(other_batch[:int(batch_size * 0.5)]), @@ -188,7 +188,7 @@ def test_inner_product_adj_grad_inputs(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, expected_regex='do not match'): # batch programs has wrong batch size. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array[::int(batch_size * 0.5)], util.convert_to_tensor(other_batch), prev_grad) @@ -197,12 +197,12 @@ def test_inner_product_adj_grad_inputs(self): tf.errors.InvalidArgumentError, expected_regex='Found symbols in other_programs'): # other_programs has symbols. - inner_product_op._inner_product_adj_grad( + inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array, util.convert_to_tensor([[x] for x in circuit_batch]), prev_grad) - res = inner_product_op._inner_product_adj_grad( + res = inner_product_op._inner_product_grad( util.convert_to_tensor(circuit_batch), symbol_names, symbol_values_array.astype(np.float64), util.convert_to_tensor(other_batch), prev_grad) @@ -258,11 +258,10 @@ def test_correctness_with_symbols(self, n_qubits, batch_size, prev_grad = tf.cast(tf.random.normal((batch_size, inner_dim_size)), tf.complex64) - out = inner_product_op._inner_product_adj_grad(programs, - symbol_names_tensor, - symbol_values, - other_programs, - prev_grad) + out = inner_product_op._inner_product_grad(programs, + symbol_names_tensor, + symbol_values, + other_programs, prev_grad) out_arr = np.zeros((batch_size, n_params), dtype=np.complex64) # dx came from _GRAD_EPS of core/src/adj_util.cc @@ -332,9 +331,9 @@ def test_correctness_without_symbols(self, n_qubits, batch_size, with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): - inner_product_op._inner_product_adj_grad(programs, symbol_names, - symbol_values, - other_programs, prev_grad) + inner_product_op._inner_product_grad(programs, symbol_names, + symbol_values, other_programs, + prev_grad) def test_correctness_empty(self): """Tests the inner product adj grad between two empty circuits.""" @@ -347,10 +346,9 @@ def test_correctness_empty(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'symbols must be a positive integer'): - inner_product_op._inner_product_adj_grad(empty_cicuit, - empty_symbols, - empty_values, - other_program, prev_grad) + inner_product_op._inner_product_grad(empty_cicuit, empty_symbols, + empty_values, other_program, + prev_grad) empty_cicuit = util.convert_to_tensor([cirq.Circuit()]) symbol_names = 
tf.convert_to_tensor(symbol_names, @@ -358,10 +356,9 @@ def test_correctness_empty(self): symbol_values = tf.convert_to_tensor([[0.0 for _ in range(2)]]) other_program = util.convert_to_tensor([[cirq.Circuit()]]) - out = inner_product_op._inner_product_adj_grad(empty_cicuit, - symbol_names, - symbol_values, - other_program, prev_grad) + out = inner_product_op._inner_product_grad(empty_cicuit, symbol_names, + symbol_values, other_program, + prev_grad) expected = np.zeros((1, len(symbol_names)), dtype=np.complex64) self.assertAllClose(out, expected) @@ -376,9 +373,13 @@ def test_correctness_no_circuit(self): with self.assertRaisesRegex(tf.errors.InvalidArgumentError, 'number of symbols must be a positive'): - _ = inner_product_op._inner_product_adj_grad( - empty_circuit, empty_symbols, empty_values, other_program, - empty_pred_grad) + # When using `tf.gradients`, a user will never encounter this error + # thanks to the `tf.cond` inside of the custom gradient. + _ = inner_product_op._inner_product_grad(empty_circuit, + empty_symbols, + empty_values, + other_program, + empty_pred_grad) if __name__ == "__main__": diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py index dd371dd33..b510132c1 100644 --- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py @@ -20,8 +20,8 @@ MATH_OP_MODULE = load_module(os.path.join("math_ops", "_tfq_math_ops.so")) -def _inner_product_adj_grad(programs, symbol_names, symbol_values, - other_programs, prev_grad): +def _inner_product_grad(programs, symbol_names, symbol_values, other_programs, + prev_grad): """Calculate the adjoint gradients of the inner product between circuits. Compute the gradients of the (potentially many) inner products between @@ -63,7 +63,7 @@ def _inner_product_adj_grad(programs, symbol_names, symbol_values, """ # Due to TF gradient scheme, we return complex conjugate derivative. 
return tf.math.conj( - MATH_OP_MODULE.tfq_inner_product_adj_grad( + MATH_OP_MODULE.tfq_inner_product_grad( programs, symbol_names, tf.cast(symbol_values, tf.float32), other_programs, tf.cast(prev_grad, tf.float32))) @@ -134,10 +134,11 @@ def inner_product(programs, symbol_names, symbol_values, other_programs): def grad(dy): def _true_grad(): - return _inner_product_adj_grad(programs, symbol_names, - symbol_values, other_programs, dy) + return _inner_product_grad(programs, symbol_names, symbol_values, + other_programs, dy) - inner_prod_grad = tf.cond(tf.math.equal(symbol_names.shape[0], 0), + ret_zero = tf.equal(tf.size(symbol_names), 0) + inner_prod_grad = tf.cond(ret_zero, lambda: tf.zeros_like(symbol_values), _true_grad) return [None, None, inner_prod_grad, None] diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc similarity index 98% rename from tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc rename to tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc index 04605f4ee..adc96e029 100644 --- a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_adj_grad.cc +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc @@ -42,9 +42,9 @@ typedef qsim::Cirq::GateCirq QsimGate; typedef qsim::Circuit QsimCircuit; typedef std::vector> QsimFusedCircuit; -class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { +class TfqInnerProductGradOp : public tensorflow::OpKernel { public: - explicit TfqInnerProductAdjGradOp(tensorflow::OpKernelConstruction* context) + explicit TfqInnerProductGradOp(tensorflow::OpKernelConstruction* context) : OpKernel(context) {} void Compute(tensorflow::OpKernelContext* context) override { @@ -442,16 +442,16 @@ class TfqInnerProductAdjGradOp : public tensorflow::OpKernel { }; REGISTER_KERNEL_BUILDER( - Name("TfqInnerProductAdjGrad").Device(tensorflow::DEVICE_CPU), - TfqInnerProductAdjGradOp); + Name("TfqInnerProductGrad").Device(tensorflow::DEVICE_CPU), + TfqInnerProductGradOp); -REGISTER_OP("TfqInnerProductAdjGrad") +REGISTER_OP("TfqInnerProductGrad") .Input("programs: string") .Input("symbol_names: string") .Input("symbol_values: float") .Input("other_programs: string") .Input("downstream_grads: float") - .Output("inner_products_adj_grad: complex64") + .Output("inner_products_grad: complex64") .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) { tensorflow::shape_inference::ShapeHandle programs_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
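
For orientation, here is a minimal end-to-end sketch of how the op assembled
in this series is meant to be driven once the final patch is applied. This is
a sketch under the assumption of a TFQ build containing `tfq.math.inner_product`
with the custom gradient above; the qubit count, circuits, and parameter values
are illustrative only:

    import cirq
    import sympy
    import tensorflow as tf
    import tensorflow_quantum as tfq

    qubits = cirq.GridQubit.rect(1, 2)
    alpha = sympy.Symbol('alpha')

    # One parameterized program and one symbol-free comparison circuit.
    programs = tfq.convert_to_tensor(
        [cirq.Circuit((cirq.X**alpha).on_each(qubits))])
    other_programs = tfq.convert_to_tensor(
        [[cirq.Circuit(cirq.H.on_each(qubits))]])
    symbol_names = tf.convert_to_tensor(['alpha'])
    symbol_values = tf.convert_to_tensor([[0.25]])

    with tf.GradientTape() as tape:
        tape.watch(symbol_values)
        # shape [batch_size, n_others], dtype complex64.
        ip = tfq.math.inner_product(programs, symbol_names, symbol_values,
                                    other_programs)
        loss = tf.reduce_sum(tf.math.real(ip))

    # Backprop routes through grad() above into the TfqInnerProductGrad
    # kernel; the result has the shape of symbol_values.
    grads = tape.gradient(loss, symbol_values)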