diff --git a/tensorflow_quantum/core/ops/cirq_ops_test.py b/tensorflow_quantum/core/ops/cirq_ops_test.py index 957a7939f..94f945640 100644 --- a/tensorflow_quantum/core/ops/cirq_ops_test.py +++ b/tensorflow_quantum/core/ops/cirq_ops_test.py @@ -414,8 +414,8 @@ def test_sampling_output_padding(self, op, all_n_qubits, n_samples): this_expected_output[:, :max(all_n_qubits) - n_qubits] = -2 expected_outputs.append(this_expected_output) circuits.append( - cirq.Circuit( - *cirq.X.on_each(*cirq.GridQubit.rect(1, n_qubits)))) + cirq.Circuit(*cirq.X.on_each( + *cirq.GridQubit.rect(1, n_qubits)))) results = op(util.convert_to_tensor(circuits), [], [[]] * len(circuits), [n_samples]).numpy() self.assertAllClose(expected_outputs, results) @@ -461,8 +461,8 @@ def run_sweep(self, program, params, repetitions): circuits = [] for n_qubits in all_n_qubits: circuits.append( - cirq.Circuit( - *cirq.X.on_each(*cirq.GridQubit.rect(1, n_qubits)))) + cirq.Circuit(*cirq.X.on_each( + *cirq.GridQubit.rect(1, n_qubits)))) test_results = this_op(util.convert_to_tensor(circuits), [], [[]] * len(circuits), [n_samples]).numpy() diff --git a/tensorflow_quantum/core/ops/math_ops/BUILD b/tensorflow_quantum/core/ops/math_ops/BUILD index 5db4c7d0c..aa13ccadf 100644 --- a/tensorflow_quantum/core/ops/math_ops/BUILD +++ b/tensorflow_quantum/core/ops/math_ops/BUILD @@ -14,6 +14,7 @@ cc_binary( name = "_tfq_math_ops.so", srcs = [ "tfq_inner_product.cc", + "tfq_inner_product_grad.cc", ], copts = select({ ":windows": [ @@ -58,8 +59,9 @@ cc_binary( deps = [ "//tensorflow_quantum/core/ops:parse_context", "//tensorflow_quantum/core/ops:tfq_simulate_utils", - "//tensorflow_quantum/core/src:util_qsim", + "//tensorflow_quantum/core/src:adj_util", "//tensorflow_quantum/core/src:circuit_parser_qsim", + "//tensorflow_quantum/core/src:util_qsim", "@qsim//lib:qsim_lib", ], ) @@ -82,3 +84,13 @@ py_test( "//tensorflow_quantum/python:util", ], ) + +py_test( + name = "inner_product_grad_test", + srcs = ["inner_product_grad_test.py"], + python_version = "PY3", + deps = [ + ":inner_product_op_py", + "//tensorflow_quantum/python:util", + ], +) diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py new file mode 100644 index 000000000..6a4e8423e --- /dev/null +++ b/tensorflow_quantum/core/ops/math_ops/inner_product_grad_test.py @@ -0,0 +1,386 @@ +# Copyright 2021 The TensorFlow Quantum Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Tests that specifically target tfq_inner_product_grad."""
+import copy
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+
+from tensorflow_quantum.core.ops.math_ops import inner_product_op
+from tensorflow_quantum.python import util
+
+
+class InnerProductAdjGradTest(tf.test.TestCase, parameterized.TestCase):
+    """Tests tfq_inner_product_grad."""
+
+    def test_inner_product_grad_inputs(self):
+        """Makes sure that _inner_product_grad fails on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        n_other_programs = 3
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        prev_grad = np.ones((batch_size, n_other_programs))
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        other_batch = [
+            util.random_circuit_resolver_batch(qubits, n_other_programs)[0]
+            for i in range(batch_size)
+        ]
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'programs must be rank 1'):
+            # Circuit tensor has too many dimensions.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor([circuit_batch]),
+                symbol_names, symbol_values_array,
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_names must be rank 1.'):
+            # symbol_names tensor has too many dimensions.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch),
+                np.array([symbol_names]), symbol_values_array,
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too many dimensions.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                np.array([symbol_values_array]),
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too few dimensions.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch),
+                symbol_names, symbol_values_array[0],
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'other_programs must be rank 2.'):
+            # other_programs tensor has too few dimensions.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch),
+                symbol_names, symbol_values_array,
+                util.convert_to_tensor(circuit_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'other_programs must be rank 2.'):
+            # other_programs tensor has too many dimensions.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in other_batch]), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # circuit tensor has the right type but invalid values.
+            inner_product_op._inner_product_grad(
+                ['junk'] * batch_size, symbol_names, symbol_values_array,
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type but invalid values.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch),
+                ['junk'], symbol_values_array,
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'not found in reference circuit'):
+            # other_programs tensor has the right type but operates on
+            # qubits that the reference circuit doesn't have.
+            new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)]
+            new_circuits, _ = util.random_circuit_resolver_batch(
+                new_qubits, batch_size)
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in new_circuits]), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'not found in paired circuit'):
+            # other_programs tensor has the right type but operates on
+            # fewer qubits than the reference circuit does.
+            new_qubits = cirq.GridQubit.rect(1, n_qubits - 1)
+            new_circuits, _ = util.random_circuit_resolver_batch(
+                new_qubits, batch_size)
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in new_circuits]), prev_grad)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # circuits tensor has the wrong type.
+            inner_product_op._inner_product_grad(
+                [1.0] * batch_size, symbol_names, symbol_values_array,
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch),
+                [0.1234], symbol_values_array,
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            # symbol_values tensor has the wrong type.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch),
+                symbol_names, [['junk']] * batch_size,
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # other_programs tensor has the wrong type.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [[1.0]] * batch_size, prev_grad)
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # we are missing an argument.
+            # pylint: disable=no-value-for-parameter
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, prev_grad)
+            # pylint: enable=no-value-for-parameter
+
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch),
+                symbol_names, symbol_values_array,
+                util.convert_to_tensor(other_batch), prev_grad, [])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # batch programs has wrong batch size.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor(other_batch[:int(batch_size * 0.5)]),
+                prev_grad)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # symbol_values has wrong batch size.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[::int(batch_size * 0.5)],
+                util.convert_to_tensor(other_batch), prev_grad)
+
+        with self.assertRaisesRegex(
+                tf.errors.InvalidArgumentError,
+                expected_regex='Found symbols in other_programs'):
+            # other_programs has symbols.
+            inner_product_op._inner_product_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in circuit_batch]), prev_grad)
+
+        res = inner_product_op._inner_product_grad(
+            util.convert_to_tensor(circuit_batch), symbol_names,
+            symbol_values_array.astype(np.float64),
+            util.convert_to_tensor(other_batch), prev_grad)
+        self.assertDTypeEqual(res, np.complex64)
+
+    @parameterized.parameters([
+        {
+            'n_qubits': 5,
+            'batch_size': 1,
+            'inner_dim_size': 5
+        },
+        {
+            'n_qubits': 5,
+            'batch_size': 10,
+            'inner_dim_size': 1
+        },
+        {
+            'n_qubits': 10,
+            'batch_size': 10,
+            'inner_dim_size': 2
+        },
+        {
+            'n_qubits': 5,
+            'batch_size': 10,
+            'inner_dim_size': 5
+        },
+    ])
+    def test_correctness_with_symbols(self, n_qubits, batch_size,
+                                      inner_dim_size):
+        """Tests that _inner_product_grad works with symbols."""
+        symbol_names = ['alpha', 'beta', 'gamma']
+        n_params = len(symbol_names)
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        other_batch = [
+            util.random_circuit_resolver_batch(qubits, inner_dim_size)[0]
+            for i in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        programs = util.convert_to_tensor(circuit_batch)
+        other_programs = util.convert_to_tensor(other_batch)
+        symbol_names_tensor = tf.convert_to_tensor(symbol_names,
+                                                   dtype=tf.dtypes.string)
+        symbol_values = tf.convert_to_tensor(symbol_values_array)
+        prev_grad = tf.cast(tf.random.normal((batch_size, inner_dim_size)),
+                            tf.complex64)
+
+        out = inner_product_op._inner_product_grad(programs,
+                                                   symbol_names_tensor,
+                                                   symbol_values,
+                                                   other_programs, prev_grad)
+
+        out_arr = np.zeros((batch_size, n_params), dtype=np.complex64)
+        # dx came from _GRAD_EPS of core/src/adj_util.cc
+        dx = 5e-3
+        for i, resolver in enumerate(resolver_batch):
+            for k, name in enumerate(symbol_names):
+                if name in resolver.param_dict:
+                    new_resolver = copy.deepcopy(resolver)
+                    new_resolver.param_dict[name] += dx
+                    final_circuit_p = cirq.resolve_parameters(
+                        circuit_batch[i], new_resolver)
+                    new_resolver = copy.deepcopy(resolver)
+                    new_resolver.param_dict[name] -= dx
+                    final_circuit_m = cirq.resolve_parameters(
+                        circuit_batch[i], new_resolver)
+                    final_wf_p = cirq.final_state_vector(final_circuit_p)
+                    final_wf_m = cirq.final_state_vector(final_circuit_m)
+                    # Performs central finite difference.
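+                    # d|psi>/ds is approximated as
+                    # (|psi(s + dx)> - |psi(s - dx)>) / (2 * dx), mirroring
+                    # the epsilon used by the adjoint C++ kernel.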
+                    final_wf_grad = 0.5 * (final_wf_p - final_wf_m) / dx
+                    for j, other in enumerate(other_batch[i]):
+                        internal_wf = cirq.final_state_vector(other)
+                        out_arr[i][k] += (prev_grad[i][j] *
+                                          np.vdot(final_wf_grad, internal_wf))
+
+        self.assertAllClose(out, np.conj(out_arr), atol=1e-3)
+
+    @parameterized.parameters([
+        {
+            'n_qubits': 5,
+            'batch_size': 1,
+            'inner_dim_size': 5
+        },
+        {
+            'n_qubits': 5,
+            'batch_size': 10,
+            'inner_dim_size': 1
+        },
+        {
+            'n_qubits': 10,
+            'batch_size': 10,
+            'inner_dim_size': 2
+        },
+        {
+            'n_qubits': 5,
+            'batch_size': 10,
+            'inner_dim_size': 5
+        },
+    ])
+    def test_correctness_without_symbols(self, n_qubits, batch_size,
+                                         inner_dim_size):
+        """Tests that _inner_product_grad works without symbols."""
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, _ = \
+            util.random_circuit_resolver_batch(
+                qubits, batch_size)
+
+        other_batch = [
+            util.random_circuit_resolver_batch(qubits, inner_dim_size)[0]
+            for i in range(batch_size)
+        ]
+
+        programs = util.convert_to_tensor(circuit_batch)
+        other_programs = util.convert_to_tensor(other_batch)
+        symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string)
+        symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)])
+        prev_grad = np.ones((batch_size, inner_dim_size))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbols must be a positive integer'):
+            inner_product_op._inner_product_grad(programs, symbol_names,
+                                                 symbol_values, other_programs,
+                                                 prev_grad)
+
+    def test_correctness_empty(self):
+        """Tests the inner product gradient between two empty circuits."""
+        symbol_names = ['alpha', 'beta']
+        empty_circuit = util.convert_to_tensor([cirq.Circuit()])
+        empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string)
+        empty_values = tf.convert_to_tensor([[]])
+        other_program = util.convert_to_tensor([[cirq.Circuit()]])
+        prev_grad = np.ones((1, 1))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbols must be a positive integer'):
+            inner_product_op._inner_product_grad(empty_circuit, empty_symbols,
+                                                 empty_values, other_program,
+                                                 prev_grad)
+
+        empty_circuit = util.convert_to_tensor([cirq.Circuit()])
+        symbol_names = tf.convert_to_tensor(symbol_names,
+                                            dtype=tf.dtypes.string)
+        symbol_values = tf.convert_to_tensor([[0.0 for _ in range(2)]])
+        other_program = util.convert_to_tensor([[cirq.Circuit()]])
+
+        out = inner_product_op._inner_product_grad(empty_circuit, symbol_names,
+                                                   symbol_values, other_program,
+                                                   prev_grad)
+        expected = np.zeros((1, len(symbol_names)), dtype=np.complex64)
+        self.assertAllClose(out, expected)
+
+    def test_correctness_no_circuit(self):
+        """Tests the inner product gradient between no circuits."""
+
+        empty_circuit = tf.raw_ops.Empty(shape=(0,), dtype=tf.string)
+        empty_symbols = tf.raw_ops.Empty(shape=(0,), dtype=tf.string)
+        empty_values = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32)
+        other_program = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.string)
+        empty_pred_grad = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'number of symbols must be a positive'):
+            # When using `tf.gradients`, a user will never encounter this error
+            # thanks to the `tf.cond` inside of the custom gradient.
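+            # Calling the raw op helper directly, as done here, bypasses
+            # that guard on purpose.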
+            _ = inner_product_op._inner_product_grad(empty_circuit,
+                                                     empty_symbols,
+                                                     empty_values,
+                                                     other_program,
+                                                     empty_pred_grad)
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py
index ed2fb5c43..b510132c1 100644
--- a/tensorflow_quantum/core/ops/math_ops/inner_product_op.py
+++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op.py
@@ -20,6 +20,55 @@
 MATH_OP_MODULE = load_module(os.path.join("math_ops", "_tfq_math_ops.so"))
 
 
+def _inner_product_grad(programs, symbol_names, symbol_values, other_programs,
+                        prev_grad):
+    """Calculate the adjoint gradients of the inner product between circuits.
+
+    Compute the gradients of the (potentially many) inner products between
+    the given circuits and the symbol-free comparison circuits.
+
+    Calculates out[i][j][k] = $ \langle \frac{\partial
+        \psi_{\text{programs[i]}}(\text{symbol_values[i]})}{\partial
+        \text{symbol_names[k]}} | \psi_{\text{other_programs[j]}} \rangle $,
+    which the op then contracts against `prev_grad` over the index j.
+
+    Note: `other_programs` must not contain any free symbols. These can
+    be resolved beforehand with `tfq.resolve_parameters`.
+
+    Note: len(symbol_names) (=n_params) should be a positive integer.
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers with shape
+            [batch_size, n_params] specifying parameter values to resolve
+            into the circuits specified by `programs`, following the ordering
+            dictated by `symbol_names`.
+        other_programs: `tf.Tensor` of strings with shape
+            [batch_size, n_others] containing the string representations of
+            the circuits with which to compute the overlap of `programs`.
+            Must not contain any free symbols.
+        prev_grad: `tf.Tensor` of real numbers with shape
+            [batch_size, n_others] backprop of values from downstream in the
+            compute graph.
+
+    Returns:
+        `tf.Tensor` with shape [batch_size, n_params] where `out[i][j]` is
+        equal to the gradient of the inner product between programs[i] and
+        all other_programs[i] w.r.t. `symbol_names[j]`, with `programs[i]`
+        resolved using `symbol_values[i]`.
+    """
+    # Due to TF gradient scheme, we return complex conjugate derivative.
+    return tf.math.conj(
+        MATH_OP_MODULE.tfq_inner_product_grad(
+            programs, symbol_names, tf.cast(symbol_values, tf.float32),
+            other_programs, tf.cast(prev_grad, tf.float32)))
+
+
+@tf.custom_gradient
 def inner_product(programs, symbol_names, symbol_values, other_programs):
     """Calculate the inner product between circuits.
 
@@ -61,8 +110,6 @@ def inner_product(programs, symbol_names, symbol_values, other_programs):
     Note: `other_programs` must not contain any free symbols. These can
     be resolved beforehand with `tfq.resolve_parameters`.
 
-    Note: Currently this op is not differentiable.
-
     Args:
         programs: `tf.Tensor` of strings with shape [batch_size] containing
             the string representations of the circuits
@@ -82,8 +129,20 @@ def inner_product(programs, symbol_names, symbol_values, other_programs):
     `tf.Tensor` with shape [batch_size, n_others] where `out[i][j]` is equal
     to the inner product of `programs[i]` with `symbol_values[i]` resolved in
     and `other_programs[i][j]`.
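+
+    Example, assuming `programs`, `symbol_names`, `symbol_values` and
+    `other_programs` are tensors shaped as described above:
+
+    >>> with tf.GradientTape() as tape:
+    ...     tape.watch(symbol_values)
+    ...     ip = inner_product(programs, symbol_names, symbol_values,
+    ...                        other_programs)
+    >>> grads = tape.gradient(ip, symbol_values)
+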
     """

+
+    def grad(dy):
+
+        def _true_grad():
+            return _inner_product_grad(programs, symbol_names, symbol_values,
+                                       other_programs, dy)
+
+        ret_zero = tf.equal(tf.size(symbol_names), 0)
+        inner_prod_grad = tf.cond(ret_zero,
+                                  lambda: tf.zeros_like(symbol_values),
+                                  _true_grad)
+        return [None, None, inner_prod_grad, None]
+
     return MATH_OP_MODULE.tfq_inner_product(programs, symbol_names,
                                             tf.cast(symbol_values, tf.float32),
-                                            other_programs)
+                                            other_programs), grad
diff --git a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py
index e05152d70..ee9fd08b4 100644
--- a/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py
+++ b/tensorflow_quantum/core/ops/math_ops/inner_product_op_test.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests that specifically target tfq_simulate_ops."""
+"""Tests that specifically target tfq_inner_product."""
+import copy
 import numpy as np
 from absl.testing import parameterized
 import tensorflow as tf
@@ -26,7 +27,7 @@ class InnerProductTest(tf.test.TestCase, parameterized.TestCase):
     """Tests tfq_inner_product."""
 
     def test_inner_product_inputs(self):
-        """Make sure that inner_product fails gracefully on bad inputs."""
+        """Makes sure that inner_product fails gracefully on bad inputs."""
         n_qubits = 5
         batch_size = 5
         symbol_names = ['alpha']
@@ -206,6 +207,11 @@ def test_inner_product_inputs(self):
         self.assertDTypeEqual(res, np.complex64)
 
     @parameterized.parameters([
+        {
+            'n_qubits': 5,
+            'batch_size': 1,
+            'inner_dim_size': 5
+        },
         {
             'n_qubits': 5,
             'batch_size': 10,
@@ -224,7 +230,7 @@ def test_inner_product_inputs(self):
     ])
     def test_correctness_with_symbols(self, n_qubits, batch_size,
                                       inner_dim_size):
-        """Test that inner_product works with symbols."""
+        """Tests that inner_product works with symbols."""
         symbol_names = ['alpha', 'beta', 'gamma']
         qubits = cirq.GridQubit.rect(1, n_qubits)
         circuit_batch, resolver_batch = \
@@ -264,12 +270,17 @@ def test_correctness_with_symbols(self, n_qubits, batch_size,
     @parameterized.parameters([
         {
             'n_qubits': 5,
-            'batch_size': 10,
+            'batch_size': 1,
+            'inner_dim_size': 5
+        },
+        {
+            'n_qubits': 5,
+            'batch_size': 2,
             'inner_dim_size': 1
         },
         {
             'n_qubits': 10,
-            'batch_size': 10,
+            'batch_size': 3,
             'inner_dim_size': 2
         },
         {
@@ -280,7 +291,7 @@ def test_correctness_with_symbols(self, n_qubits, batch_size,
     ])
     def test_correctness_without_symbols(self, n_qubits, batch_size,
                                          inner_dim_size):
-        """Test that inner_product works with symbols."""
+        """Tests that inner_product works without symbols."""
         qubits = cirq.GridQubit.rect(1, n_qubits)
         circuit_batch, _ = \
             util.random_circuit_resolver_batch(
@@ -309,18 +320,135 @@ def test_correctness_without_symbols(self, n_qubits, batch_size,
         self.assertAllClose(out, out_arr, atol=1e-5)
 
     def test_correctness_empty(self):
-        """Test the inner product between two empty circuits."""
+        """Tests the inner product with empty circuits."""
 
-        empty_cicuit = util.convert_to_tensor([cirq.Circuit()])
+        empty_circuit = util.convert_to_tensor([cirq.Circuit()])
         empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string)
         empty_values = tf.convert_to_tensor([[]])
         other_program = util.convert_to_tensor([[cirq.Circuit()]])
 
-        out = inner_product_op.inner_product(empty_cicuit, empty_symbols,
+        out = inner_product_op.inner_product(empty_circuit, empty_symbols,
                                              empty_values,
other_program) expected = np.array([[1.0]], dtype=np.complex64) self.assertAllClose(out, expected) + qubit = cirq.GridQubit(0, 0) + non_empty_circuit = util.convert_to_tensor( + [cirq.Circuit(cirq.X(qubit))]) + empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) + empty_values = tf.convert_to_tensor([[]]) + other_program = util.convert_to_tensor([[cirq.Circuit()]]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'qubits not found'): + inner_product_op.inner_product(non_empty_circuit, empty_symbols, + empty_values, other_program) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 1, + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 3, + 'inner_dim_size': 2 + }, + ]) + def test_tf_gradient_correctness_with_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Tests that tf.gradient of inner_product works with symbols.""" + symbol_names = ['alpha', 'beta', 'gamma'] + n_params = len(symbol_names) + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names_tensor = tf.convert_to_tensor(symbol_names, + dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor(symbol_values_array) + + with tf.GradientTape() as tape: + tape.watch(symbol_values) + ip = inner_product_op.inner_product(programs, symbol_names_tensor, + symbol_values, other_programs) + out = tape.gradient(ip, symbol_values) + + out_arr = np.zeros((batch_size, n_params), dtype=np.complex64) + # dx came from _GRAD_EPS of core/src/adj_util.cc + dx = 5e-3 + for i in range(batch_size): + for k, name in enumerate(symbol_names): + if name in resolver_batch[i].param_dict: + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] += dx + final_circuit_p = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + new_resolver = copy.deepcopy(resolver_batch[i]) + new_resolver.param_dict[name] -= dx + final_circuit_m = cirq.resolve_parameters( + circuit_batch[i], new_resolver) + final_wf_p = cirq.final_state_vector(final_circuit_p) + final_wf_m = cirq.final_state_vector(final_circuit_m) + # Performs central finite difference. 
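+                    # The two shifted state vectors below form the central
+                    # difference estimate of d|psi>/ds.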
+ final_wf_grad = 0.5 * (final_wf_p - final_wf_m) / dx + for j in range(inner_dim_size): + internal_wf = cirq.final_state_vector(other_batch[i][j]) + out_arr[i][k] += np.vdot(final_wf_grad, internal_wf) + + self.assertAllClose(out, np.conj(out_arr), atol=1e-3) + + @parameterized.parameters([ + { + 'n_qubits': 5, + 'batch_size': 1, + 'inner_dim_size': 5 + }, + { + 'n_qubits': 5, + 'batch_size': 3, + 'inner_dim_size': 2 + }, + ]) + def test_tf_gradient_correctness_without_symbols(self, n_qubits, batch_size, + inner_dim_size): + """Tests that tf.gradient of inner_product works without symbols.""" + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, _ = \ + util.random_circuit_resolver_batch( + qubits, batch_size) + + other_batch = [ + util.random_circuit_resolver_batch(qubits, inner_dim_size)[0] + for i in range(batch_size) + ] + + programs = util.convert_to_tensor(circuit_batch) + other_programs = util.convert_to_tensor(other_batch) + symbol_names = tf.convert_to_tensor([], dtype=tf.dtypes.string) + symbol_values = tf.convert_to_tensor([[] for _ in range(batch_size)]) + + with tf.GradientTape() as tape: + tape.watch(symbol_values) + ip = inner_product_op.inner_product(programs, symbol_names, + symbol_values, other_programs) + out = tape.gradient(ip, symbol_values) + self.assertAllClose(out, tf.zeros_like(symbol_values), atol=1e-3) + def test_correctness_no_circuit(self): """Test the inner product between no circuits.""" @@ -333,6 +461,21 @@ def test_correctness_no_circuit(self): empty_values, other_program) self.assertShapeEqual(np.zeros((0, 0)), out) + def test_tf_gradient_correctness_no_circuit(self): + """Test the inner product grad between no circuits.""" + + empty_circuit = tf.raw_ops.Empty(shape=(0,), dtype=tf.string) + empty_symbols = tf.raw_ops.Empty(shape=(0,), dtype=tf.string) + empty_values = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32) + other_program = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.string) + + with tf.GradientTape() as tape: + tape.watch(empty_values) + out = inner_product_op.inner_product(empty_circuit, empty_symbols, + empty_values, other_program) + + self.assertShapeEqual(np.zeros((0, 0)), out) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc new file mode 100644 index 000000000..adc96e029 --- /dev/null +++ b/tensorflow_quantum/core/ops/math_ops/tfq_inner_product_grad.cc @@ -0,0 +1,482 @@ +/* Copyright 2021 The TensorFlow Quantum Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/seqfor.h"
+#include "../qsim/lib/simmux.h"
+#include "cirq/google/api/v2/program.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/src/adj_util.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::cirq::google::api::v2::Program;
+using ::tensorflow::Status;
+using ::tfq::proto::PauliSum;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+typedef std::vector<qsim::GateFused<QsimGate>> QsimFusedCircuit;
+
+class TfqInnerProductGradOp : public tensorflow::OpKernel {
+ public:
+  explicit TfqInnerProductGradOp(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    const int num_inputs = context->num_inputs();
+    OP_REQUIRES(context, num_inputs == 5,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Expected 5 inputs, got ", num_inputs, " inputs.")));
+
+    // Create the output Tensor.
+    const int output_dim_batch_size = context->input(0).dim_size(0);
+    const int output_dim_internal_size = context->input(3).dim_size(1);
+    const int output_dim_symbol_size = context->input(1).dim_size(0);
+    OP_REQUIRES(context, output_dim_symbol_size > 0,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "The number of symbols must be a positive integer, got ",
+                    output_dim_symbol_size, " symbols.")));
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_batch_size);
+    output_shape.AddDim(output_dim_symbol_size);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output));
+    auto output_tensor = output->matrix<std::complex<float>>();
+
+    // Parse program protos.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    std::vector<std::vector<Program>> other_programs;
+    OP_REQUIRES_OK(context,
+                   GetProgramsAndNumQubits(context, &programs, &num_qubits,
+                                           &other_programs));
+
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+
+    OP_REQUIRES(context, programs.size() == maps.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of circuits and symbol_values do not match. Got ",
+                    programs.size(), " circuits and ", maps.size(),
+                    " symbol values.")));
+    OP_REQUIRES(context, output_dim_symbol_size == maps[0].size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of symbols and symbol maps do not match. Got ",
+                    output_dim_symbol_size, " symbols and ", maps[0].size(),
+                    " symbol values.")));
+
+    // Construct qsim circuits for programs.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<QsimFusedCircuit> fused_circuits(programs.size(),
+                                                 QsimFusedCircuit({}));
+
+    // track metadata.
+    std::vector<std::vector<tfq::GateMetaData>> gate_meta(
+        programs.size(), std::vector<tfq::GateMetaData>({}));
+
+    // Construct qsim circuits.
+    std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>
+        partial_fused_circuits(
+            programs.size(),
+            std::vector<std::vector<qsim::GateFused<QsimGate>>>({}));
+
+    // track gradients
+    std::vector<std::vector<tfq::GradientOfGate>> gradient_gates(
+        programs.size(), std::vector<tfq::GradientOfGate>({}));
+
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        OP_REQUIRES_OK(
+            context, QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                            &qsim_circuits[i],
+                                            &fused_circuits[i], &gate_meta[i]));
+
+        CreateGradientCircuit(qsim_circuits[i], gate_meta[i],
+                              &partial_fused_circuits[i], &gradient_gates[i]);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        output_dim_batch_size, num_cycles, construct_f);
+
+    // Construct qsim circuits for other_programs.
+    std::vector<std::vector<QsimCircuit>> other_qsim_circuits(
+        output_dim_batch_size,
+        std::vector<QsimCircuit>(output_dim_internal_size, QsimCircuit()));
+    std::vector<std::vector<QsimFusedCircuit>> other_fused_circuits(
+        output_dim_batch_size,
+        std::vector<QsimFusedCircuit>(output_dim_internal_size,
+                                      QsimFusedCircuit({})));
+
+    auto construct_f2 = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        int ii = i / output_dim_internal_size;
+        int jj = i % output_dim_internal_size;
+        Status status = QsimCircuitFromProgram(
+            other_programs[ii][jj], {}, num_qubits[ii],
+            &other_qsim_circuits[ii][jj], &other_fused_circuits[ii][jj]);
+        OP_REQUIRES(context, status.ok(),
+                    tensorflow::errors::InvalidArgument(absl::StrCat(
+                        "Found symbols in other_programs.",
+                        " No symbols are allowed in these circuits.")));
+      }
+    };
+
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        output_dim_batch_size * output_dim_internal_size, num_cycles,
+        construct_f2);
+
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    // Get downstream gradients.
+    std::vector<std::vector<float>> downstream_grads;
+    OP_REQUIRES_OK(context, GetPrevGrads(context, &downstream_grads));
+
+    OP_REQUIRES(context, downstream_grads.size() == programs.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of gradients and circuits do not match. Got ",
+                    downstream_grads.size(), " gradients and ",
+                    programs.size(), " circuits.")));
+
+    OP_REQUIRES(
+        context, downstream_grads[0].size() == output_dim_internal_size,
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Number of gradients and other_programs do not match. Got ",
+            downstream_grads[0].size(), " gradient entries and ",
+            output_dim_internal_size, " other programs.")));
+
+    output_tensor.setZero();
+
+    // Cross reference with standard google cloud compute instances
+    // Memory ~= 2 * num_threads * (2 * 64 * 2 ** num_qubits in circuits)
+    // e2s2 = 2 CPU, 8GB -> Can safely do 23 since Memory = 4GB
+    // e2s4 = 4 CPU, 16GB -> Can safely do 23 since Memory = 8GB
+    // ...
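+    // Heuristic dispatch: wide circuits (or a batch of one) simulate a single
+    // state vector at a time using the TF thread pool (ComputeLarge); small
+    // circuits run serial simulations in parallel across the batch and
+    // other_programs entries (ComputeSmall).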
+    if (max_num_qubits >= 24 || output_dim_batch_size == 1) {
+      ComputeLarge(num_qubits, maps, qsim_circuits, fused_circuits,
+                   partial_fused_circuits, gradient_gates,
+                   other_fused_circuits, downstream_grads, context,
+                   &output_tensor);
+    } else {
+      ComputeSmall(num_qubits, max_num_qubits, maps, qsim_circuits,
+                   fused_circuits, partial_fused_circuits, gradient_gates,
+                   other_fused_circuits, downstream_grads, context,
+                   &output_tensor);
+    }
+  }
+
+ private:
+  void ComputeLarge(
+      const std::vector<int>& num_qubits, const std::vector<SymbolMap>& maps,
+      const std::vector<QsimCircuit>& qsim_circuits,
+      const std::vector<QsimFusedCircuit>& fused_circuits,
+      const std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>&
+          partial_fused_circuits,
+      const std::vector<std::vector<tfq::GradientOfGate>>& gradient_gates,
+      const std::vector<std::vector<QsimFusedCircuit>>& other_fused_circuits,
+      const std::vector<std::vector<float>>& downstream_grads,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<std::complex<float>>::Matrix* output_tensor) {
+    // Instantiate qsim objects.
+    const auto tfq_for = tfq::QsimFor(context);
+    using Simulator = qsim::Simulator<const tfq::QsimFor&>;
+    using StateSpace = Simulator::StateSpace;
+
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(tfq_for);
+    StateSpace ss = StateSpace(tfq_for);
+    auto sv = ss.Create(largest_nq);
+    auto scratch = ss.Create(largest_nq);
+    auto scratch2 = ss.Create(largest_nq);
+
+    // Simulate programs one by one. Parallelizing over state vectors
+    // we no longer parallelize over circuits. Each time we encounter a
+    // larger circuit we will grow the Statevector as necessary.
+    for (std::vector<QsimFusedCircuit>::size_type i = 0;
+         i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        scratch = ss.Create(largest_nq);
+        scratch2 = ss.Create(largest_nq);
+      }
+      ss.SetStateZero(sv);
+      for (std::vector<qsim::GateFused<QsimGate>>::size_type j = 0;
+           j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+
+      auto status =
+          AccumulateFusedCircuits(downstream_grads[i], other_fused_circuits[i],
+                                  sim, ss, scratch2, scratch);
+
+      // now sv is |psi>
+      // scratch contains sum_j downstream_grads[i][j]*|phi[i][j]>
+      // Start adjoint differentiation.
+      for (int l = partial_fused_circuits[i].size() - 1; l >= 0; l--) {
+        for (int k = partial_fused_circuits[i][l].size() - 1; k >= 0; k--) {
+          ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], sv);
+          ApplyFusedGateDagger(sim, partial_fused_circuits[i][l][k], scratch);
+        }
+        if (l == 0) {
+          // last layer will have no parametrized gates so can break.
+          break;
+        }
+
+        // Hit a parameterized gate.
+        // todo fix this copy.
+        auto cur_gate = qsim_circuits[i].gates[gradient_gates[i][l - 1].index];
+        ApplyGateDagger(sim, cur_gate, sv);
+
+        // if applicable compute control qubit mask and control value bits.
+        uint64_t mask = 0;
+        uint64_t cbits = 0;
+        for (std::vector<unsigned int>::size_type k = 0;
+             k < cur_gate.controlled_by.size(); k++) {
+          uint64_t control_loc = cur_gate.controlled_by[k];
+          mask |= uint64_t{1} << control_loc;
+          cbits |= ((cur_gate.cmask >> k) & 1) << control_loc;
+        }
+
+        for (std::vector<QsimGate>::size_type k = 0;
+             k < gradient_gates[i][l - 1].grad_gates.size(); k++) {
+          // Copy sv onto scratch2 in anticipation of non-unitary "gradient
+          // gate".
+          ss.Copy(sv, scratch2);
+          if (!cur_gate.controlled_by.empty()) {
+            // Gradient of controlled gates puts zeros on diagonal which is
+            // the same as collapsing the state and then applying the
+            // non-controlled version of the gradient gate.
+            ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true);
+          }
+          qsim::ApplyGate(sim, gradient_gates[i][l - 1].grad_gates[k],
+                          scratch2);
+
+          // don't need not-found check since this is done upstream already.
+          const auto it = maps[i].find(gradient_gates[i][l - 1].params[k]);
+          const int loc = it->second.first;
+          // Apply finite differencing for adjoint gradients.
+          // Finite differencing enables applying multiple `gradient_gate`
+          // of a symbol at the same circuit. For analytic methods like
+          // parameter-shift we need to apply a single `gradient_gate`
+          // per a symbol.
+          std::complex<double> result = ss.InnerProduct(scratch2, scratch);
+          (*output_tensor)(i, loc) +=
+              std::complex<float>(static_cast<float>(result.real()),
+                                  static_cast<float>(result.imag()));
+        }
+        ApplyGateDagger(sim, cur_gate, scratch);
+      }
+    }
+  }
+
+  void ComputeSmall(
+      const std::vector<int>& num_qubits, const int max_num_qubits,
+      const std::vector<SymbolMap>& maps,
+      const std::vector<QsimCircuit>& qsim_circuits,
+      const std::vector<QsimFusedCircuit>& fused_circuits,
+      const std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>&
+          partial_fused_circuits,
+      const std::vector<std::vector<tfq::GradientOfGate>>& gradient_gates,
+      const std::vector<std::vector<QsimFusedCircuit>>& other_fused_circuits,
+      const std::vector<std::vector<float>>& downstream_grads,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<std::complex<float>>::Matrix* output_tensor) {
+    const auto tfq_for = qsim::SequentialFor(1);
+    using Simulator = qsim::Simulator<const qsim::SequentialFor&>;
+    using StateSpace = Simulator::StateSpace;
+
+    const int output_dim_internal_size = other_fused_circuits[0].size();
+
+    auto DoWork = [&](int start, int end) {
+      int old_batch_index = -2;
+      int cur_batch_index = -1;
+      int largest_nq = 1;
+      int cur_internal_index;
+
+      Simulator sim = Simulator(tfq_for);
+      StateSpace ss = StateSpace(tfq_for);
+      auto sv = ss.Create(largest_nq);
+      auto sv_adj = ss.Create(largest_nq);
+      auto scratch = ss.Create(largest_nq);
+      auto scratch2 = ss.Create(largest_nq);
+      for (int i = start; i < end; i++) {
+        cur_batch_index = i / output_dim_internal_size;
+        cur_internal_index = i % output_dim_internal_size;
+
+        const int nq = num_qubits[cur_batch_index];
+
+        if (cur_batch_index != old_batch_index) {
+          // We've run into a new state vector we must compute.
+          // Only compute a new state vector when we have to.
+          if (nq > largest_nq) {
+            largest_nq = nq;
+            sv = ss.Create(largest_nq);
+            sv_adj = ss.Create(largest_nq);
+            scratch = ss.Create(largest_nq);
+            scratch2 = ss.Create(largest_nq);
+          }
+          ss.SetStateZero(sv);
+          for (std::vector<qsim::GateFused<QsimGate>>::size_type j = 0;
+               j < fused_circuits[cur_batch_index].size(); j++) {
+            qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
+          }
+        }
+
+        ss.SetStateZero(scratch);
+        for (std::vector<qsim::GateFused<QsimGate>>::size_type k = 0;
+             k <
+             other_fused_circuits[cur_batch_index][cur_internal_index].size();
+             k++) {
+          qsim::ApplyFusedGate(
+              sim,
+              other_fused_circuits[cur_batch_index][cur_internal_index][k],
+              scratch);
+        }
+        // now sv is |psi>, scratch is |phi>
+        // Start adjoint differentiation.
+        ss.Copy(sv, sv_adj);
+        for (int l = partial_fused_circuits[cur_batch_index].size() - 1;
+             l >= 0; l--) {
+          for (int k = partial_fused_circuits[cur_batch_index][l].size() - 1;
+               k >= 0; k--) {
+            ApplyFusedGateDagger(
+                sim, partial_fused_circuits[cur_batch_index][l][k], sv_adj);
+            ApplyFusedGateDagger(
+                sim, partial_fused_circuits[cur_batch_index][l][k], scratch);
+          }
+          if (l == 0) {
+            // last layer will have no parametrized gates so can break.
+            break;
+          }
+
+          // Hit a parameterized gate.
+          // todo fix this copy.
+          auto cur_gate =
+              qsim_circuits[cur_batch_index]
+                  .gates[gradient_gates[cur_batch_index][l - 1].index];
+          ApplyGateDagger(sim, cur_gate, sv_adj);
+
+          // if applicable compute control qubit mask and control value bits.
+          uint64_t mask = 0;
+          uint64_t cbits = 0;
+          for (int k = 0; k < cur_gate.controlled_by.size(); k++) {
+            uint64_t control_loc = cur_gate.controlled_by[k];
+            mask |= uint64_t{1} << control_loc;
+            cbits |= ((cur_gate.cmask >> k) & 1) << control_loc;
+          }
+
+          for (int k = 0;
+               k < gradient_gates[cur_batch_index][l - 1].grad_gates.size();
+               k++) {
+            // Copy sv_adj onto scratch2 in anticipation of non-unitary
+            // "gradient gate".
+            ss.Copy(sv_adj, scratch2);
+            if (!cur_gate.controlled_by.empty()) {
+              // Gradient of controlled gates puts zeros on diagonal which is
+              // the same as collapsing the state and then applying the
+              // non-controlled version of the gradient gate.
+              ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true);
+            }
+            qsim::ApplyGate(
+                sim, gradient_gates[cur_batch_index][l - 1].grad_gates[k],
+                scratch2);
+
+            // don't need not-found check since this is done upstream already.
+            const auto it = maps[cur_batch_index].find(
+                gradient_gates[cur_batch_index][l - 1].params[k]);
+            const int loc = it->second.first;
+            // Apply finite differencing for adjoint gradients.
+            // Finite differencing enables applying multiple `gradient_gate`
+            // of a symbol at the same circuit. For analytic methods like
+            // parameter-shift we need to apply a single `gradient_gate`
+            // per a symbol.
+            std::complex<double> result = ss.InnerProduct(scratch2, scratch);
+            (*output_tensor)(cur_batch_index, loc) +=
+                (downstream_grads[cur_batch_index][cur_internal_index] *
+                 std::complex<float>(static_cast<float>(result.real()),
+                                     static_cast<float>(result.imag())));
+          }
+          ApplyGateDagger(sim, cur_gate, scratch);
+        }
+        old_batch_index = cur_batch_index;
+      }
+    };
+
+    const int64_t num_cycles =
+        200 * (int64_t(1) << static_cast<int64_t>(max_num_qubits));
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        fused_circuits.size() * output_dim_internal_size, num_cycles, DoWork);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqInnerProductGrad").Device(tensorflow::DEVICE_CPU),
+    TfqInnerProductGradOp);
+
+REGISTER_OP("TfqInnerProductGrad")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("other_programs: string")
+    .Input("downstream_grads: float")
+    .Output("inner_products_grad: complex64")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle other_programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &other_programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle downstream_grads_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &downstream_grads_shape));
+
+      tensorflow::shape_inference::DimensionHandle output_rows =
+          c->Dim(programs_shape, 0);
+      tensorflow::shape_inference::DimensionHandle output_cols =
+          c->Dim(symbol_names_shape, 0);
+      std::vector<tensorflow::shape_inference::DimensionHandle> dims = {
+          output_rows, output_cols};
+      c->set_output(0, c->MakeShape(dims));
+
+      return tensorflow::Status::OK();
+    });
+
+}  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc b/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc
index f3b8faf83..a07dded19 100644
--- a/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc
+++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op.cc
@@ -238,7 +238,7 @@ class TfqAdjointGradientOp : public tensorflow::OpKernel {
           // gate".
           ss.Copy(sv, scratch2);
           if (!cur_gate.controlled_by.empty()) {
-            // Gradient of controlled gattes puts zeros on diagonal which is
+            // Gradient of controlled gates puts zeros on diagonal which is
             // the same as collapsing the state and then applying the
             // non-controlled version of the gradient gate.
             ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true);
@@ -349,7 +349,7 @@ class TfqAdjointGradientOp : public tensorflow::OpKernel {
           // gate".
           ss.Copy(sv, scratch2);
           if (!cur_gate.controlled_by.empty()) {
-            // Gradient of controlled gattes puts zeros on diagonal which is
+            // Gradient of controlled gates puts zeros on diagonal which is
             // the same as collapsing the state and then applying the
             // non-controlled version of the gradient gate.
             ss.BulkSetAmpl(scratch2, mask, cbits, 0, 0, true);
diff --git a/tensorflow_quantum/core/src/util_qsim.h b/tensorflow_quantum/core/src/util_qsim.h
index 5024d47bf..955085159 100644
--- a/tensorflow_quantum/core/src/util_qsim.h
+++ b/tensorflow_quantum/core/src/util_qsim.h
@@ -35,6 +35,7 @@ namespace tfq {
 
 typedef qsim::Cirq::GateCirq<float> QsimGate;
 typedef qsim::Circuit<QsimGate> QsimCircuit;
+typedef std::vector<qsim::GateFused<QsimGate>> QsimFusedCircuit;
 
 // Custom FOR loop struct to use TF threadpool instead of native
 // qsim OpenMP or serial FOR implementations.
@@ -315,6 +316,34 @@ tensorflow::Status AccumulateOperators(
   return status;
 }
 
+// Assumes coefficients.size() == fused_circuits.size().
+// These are checked upstream.
+// scratch and dest have been created, but do not require initialization.
+// After the call, scratch holds garbage values.
+// |dest> = sum_i coefficients[i]*|phi[i]>
+template <typename SimT, typename StateSpaceT, typename StateT>
+tensorflow::Status AccumulateFusedCircuits(
+    const std::vector<float>& coefficients,
+    const std::vector<QsimFusedCircuit>& fused_circuits, const SimT& sim,
+    const StateSpaceT& ss, StateT& scratch, StateT& dest) {
+  tensorflow::Status status = tensorflow::Status::OK();
+  ss.SetAllZeros(dest);
+
+  for (std::vector<QsimFusedCircuit>::size_type i = 0;
+       i < fused_circuits.size(); i++) {
+    ss.SetStateZero(scratch);
+    for (std::vector<qsim::GateFused<QsimGate>>::size_type j = 0;
+         j < fused_circuits[i].size(); j++) {
+      qsim::ApplyFusedGate(sim, fused_circuits[i][j], scratch);
+    }
+    ss.Multiply(coefficients[i], scratch);
+    ss.Add(scratch, dest);
+  }
+
+  return status;
+}
+
 }  // namespace tfq
 
 #endif  // UTIL_QSIM_H_
diff --git a/tensorflow_quantum/core/src/util_qsim_test.cc b/tensorflow_quantum/core/src/util_qsim_test.cc
index 324d547bd..0740afa18 100644
--- a/tensorflow_quantum/core/src/util_qsim_test.cc
+++ b/tensorflow_quantum/core/src/util_qsim_test.cc
@@ -40,6 +40,7 @@ using ::tfq::proto::PauliTerm;
 typedef absl::flat_hash_map<std::string, std::pair<int, float>> SymbolMap;
 typedef qsim::Cirq::GateCirq<float> QsimGate;
 typedef qsim::Circuit<QsimGate> QsimCircuit;
+typedef std::vector<qsim::GateFused<QsimGate>> QsimFusedCircuit;
 
 class TwoTermSampledExpectationFixture
     : public ::testing::TestWithParam<std::tuple<std::string, float>> {};
@@ -551,5 +552,78 @@ TEST(UtilQsimTest, AccumulateOperatorsEmpty) {
   EXPECT_NEAR(ss.GetAmpl(scratch, 3).imag(), 0.0, 1e-5);
 }
 
+TEST(UtilQsimTest, AccumulateFusedCircuitsBasic) {
+  // Create circuit to prepare initial state.
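+  // Builds two small two-qubit circuits, then checks by hand that dest
+  // accumulates coeffs[i] * |state of circuit i>.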
+  std::vector<QsimCircuit> simple_circuits(2, QsimCircuit());
+  simple_circuits[0].num_qubits = 2;
+  simple_circuits[0].gates.push_back(
+      qsim::Cirq::XPowGate<float>::Create(0, 1, 0.25, 0.0));
+  simple_circuits[1].num_qubits = 2;
+  simple_circuits[1].gates.push_back(
+      qsim::Cirq::CXPowGate<float>::Create(1, 1, 0, 1.0, 0.0));
+  simple_circuits[1].gates.push_back(
+      qsim::Cirq::YPowGate<float>::Create(2, 0, 0.5, 0.0));
+
+  // Initialize fused circuits.
+  std::vector<QsimFusedCircuit> fused_circuits;
+  for (int i = 0; i < 2; i++) {
+    fused_circuits.push_back(
+        qsim::BasicGateFuser<qsim::IO, QsimGate>().FuseGates(
+            qsim::BasicGateFuser<qsim::IO, QsimGate>::Parameter(),
+            simple_circuits[i].num_qubits, simple_circuits[i].gates));
+  }
+
+  // Instantiate qsim objects.
+  qsim::Simulator<qsim::SequentialFor> sim(1);
+  qsim::Simulator<qsim::SequentialFor>::StateSpace ss(1);
+  auto sv = ss.Create(2);
+  auto scratch = ss.Create(2);
+  auto dest = ss.Create(2);
+
+  // Initialize coeffs.
+  std::vector<float> coeffs = {1.23, 4.56};
+
+  AccumulateFusedCircuits(coeffs, fused_circuits, sim, ss, scratch, dest);
+
+  // After the call, scratch holds only the last circuit's scaled state.
+  // Check that dest got accumulated onto.
+  double accumulated_real[4] = {0.0, 0.0, 0.0, 0.0};
+  double accumulated_imag[4] = {0.0, 0.0, 0.0, 0.0};
+  for (unsigned int i = 0; i < 2; i++) {
+    ss.SetStateZero(sv);
+    for (const qsim::GateFused<QsimGate>& fused_gate : fused_circuits[i]) {
+      qsim::ApplyFusedGate(sim, fused_gate, sv);
+    }
+    for (unsigned int k = 0; k < 4; k++) {
+      accumulated_real[k] += coeffs[i] * ss.GetAmpl(sv, k).real();
+      accumulated_imag[k] += coeffs[i] * ss.GetAmpl(sv, k).imag();
+    }
+  }
+  for (unsigned int k = 0; k < 4; k++) {
+    EXPECT_NEAR(ss.GetAmpl(dest, k).real(), accumulated_real[k], 1e-5);
+    EXPECT_NEAR(ss.GetAmpl(dest, k).imag(), accumulated_imag[k], 1e-5);
+  }
+}
+
+TEST(UtilQsimTest, AccumulateFusedCircuitsEmpty) {
+  // Instantiate qsim objects.
+  qsim::Simulator<qsim::SequentialFor> sim(1);
+  qsim::Simulator<qsim::SequentialFor>::StateSpace ss(1);
+  auto scratch = ss.Create(2);
+  auto dest = ss.Create(2);
+
+  AccumulateFusedCircuits({}, {}, sim, ss, scratch, dest);
+
+  // scratch has garbage value.
+  // Check that dest contains all zeros.
+  EXPECT_NEAR(ss.GetAmpl(dest, 0).real(), 0.0, 1e-5);
+  EXPECT_NEAR(ss.GetAmpl(dest, 0).imag(), 0.0, 1e-5);
+  EXPECT_NEAR(ss.GetAmpl(dest, 1).real(), 0.0, 1e-5);
+  EXPECT_NEAR(ss.GetAmpl(dest, 1).imag(), 0.0, 1e-5);
+  EXPECT_NEAR(ss.GetAmpl(dest, 2).real(), 0.0, 1e-5);
+  EXPECT_NEAR(ss.GetAmpl(dest, 2).imag(), 0.0, 1e-5);
+  EXPECT_NEAR(ss.GetAmpl(dest, 3).real(), 0.0, 1e-5);
+  EXPECT_NEAR(ss.GetAmpl(dest, 3).imag(), 0.0, 1e-5);
+}
+
 }  // namespace
 }  // namespace tfq