diff --git a/WORKSPACE b/WORKSPACE index 6ef7ddad9..f877432a5 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -67,9 +67,9 @@ http_archive( http_archive( name = "qsim", - sha256 = "06c330960edf95d495c3686be9006fa11a5f87c8294d6ef4a2ad0a01660f2e49", - strip_prefix = "qsim-0.9.1", - urls = ["https://github.com/quantumlib/qsim/archive/v0.9.1.zip"], + sha256 = "d39b9c48866ce4d6a095093ae8059444d649e851219497af99e937a74f1e9a45", + strip_prefix = "qsim-0.9.2-dev-20210317", + urls = ["https://github.com/quantumlib/qsim/archive/v0.9.2-dev+20210317.zip"], ) # Added for crosstool in tensorflow. diff --git a/release/BUILD b/release/BUILD index dff6e9b7d..55c938713 100644 --- a/release/BUILD +++ b/release/BUILD @@ -13,6 +13,7 @@ sh_binary( "//tensorflow_quantum/core:__init__.py", "//tensorflow_quantum/core/ops:__init__.py", "//tensorflow_quantum/core/ops/math_ops:__init__.py", + "//tensorflow_quantum/core/ops/noise:__init__.py", "//tensorflow_quantum/core/proto:__init__.py", "//tensorflow_quantum/core/serialize:__init__.py", @@ -39,6 +40,7 @@ sh_binary( "//tensorflow_quantum/core/ops:tfq_utility_ops_py", "//tensorflow_quantum/core/ops:tfq_simulate_ops_py", "//tensorflow_quantum/core/ops/math_ops:inner_product_op_py", + "//tensorflow_quantum/core/ops/noise:noisy_expectation_op_py", "//tensorflow_quantum/core/serialize:serializer", "//tensorflow_quantum/datasets:cluster_state", "//tensorflow_quantum/datasets:spin_system", diff --git a/scripts/import_test.py b/scripts/import_test.py index 0db1fd60b..b5bd1888c 100644 --- a/scripts/import_test.py +++ b/scripts/import_test.py @@ -37,6 +37,9 @@ def test_imports(): # Math ops. _ = tfq.math.inner_product + # Noisy simulation ops. + _ = tfq.noise.expectation + # Util functions. _ = tfq.convert_to_tensor _ = tfq.get_quantum_concurrent_op_mode diff --git a/tensorflow_quantum/__init__.py b/tensorflow_quantum/__init__.py index e4f62ea17..4cee264a6 100644 --- a/tensorflow_quantum/__init__.py +++ b/tensorflow_quantum/__init__.py @@ -24,6 +24,9 @@ # Import math ops. from tensorflow_quantum.core import math_ops as math +# Import noise ops. +from tensorflow_quantum.core import noise + # Re-label python module as layers module. import tensorflow_quantum.python.layers as layers diff --git a/tensorflow_quantum/core/__init__.py b/tensorflow_quantum/core/__init__.py index 94dd5fc2c..fb24ad4a8 100644 --- a/tensorflow_quantum/core/__init__.py +++ b/tensorflow_quantum/core/__init__.py @@ -23,3 +23,6 @@ padded_to_ragged2d, resolve_parameters) # Import math ops. from tensorflow_quantum.core.ops import math_ops + +# Import noise ops. +from tensorflow_quantum.core.ops import noise diff --git a/tensorflow_quantum/core/ops/__init__.py b/tensorflow_quantum/core/ops/__init__.py index 8c553a74c..062ea7499 100644 --- a/tensorflow_quantum/core/ops/__init__.py +++ b/tensorflow_quantum/core/ops/__init__.py @@ -27,3 +27,6 @@ # Import math_ops. from tensorflow_quantum.core.ops import math_ops + +# Import noise ops. +from tensorflow_quantum.core.ops import noise diff --git a/tensorflow_quantum/core/ops/noise/BUILD b/tensorflow_quantum/core/ops/noise/BUILD new file mode 100644 index 000000000..9f31bd666 --- /dev/null +++ b/tensorflow_quantum/core/ops/noise/BUILD @@ -0,0 +1,85 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +# Export for the PIP package. 
+exports_files(["__init__.py"]) + +config_setting( + name = "windows", + constraint_values = ["@bazel_tools//platforms:windows"], +) + +cc_binary( + name = "_tfq_noise_ops.so", + srcs = [ + "tfq_noisy_expectation.cc", + ], + copts = select({ + ":windows": [ + "/D__CLANG_SUPPORT_DYN_ANNOTATION__", + "/D_USE_MATH_DEFINES", + "/DEIGEN_MPL2_ONLY", + "/DEIGEN_MAX_ALIGN_BYTES=64", + "/DEIGEN_HAS_TYPE_TRAITS=0", + "/DTF_USE_SNAPPY", + "/showIncludes", + "/MD", + "/O2", + "/DNDEBUG", + "/w", + "-DWIN32_LEAN_AND_MEAN", + "-DNOGDI", + "/d2ReducedOptimizeHugeFunctions", + "/arch:AVX", + "/std:c++14", + "-DTENSORFLOW_MONOLITHIC_BUILD", + "/DPLATFORM_WINDOWS", + "/DEIGEN_HAS_C99_MATH", + "/DTENSORFLOW_USE_EIGEN_THREADPOOL", + "/DEIGEN_AVOID_STL_ARRAY", + "/Iexternal/gemmlowp", + "/wd4018", + "/wd4577", + "/DNOGDI", + "/UTF_COMPILE_LIBRARY", + ], + "//conditions:default": [ + "-pthread", + "-std=c++11", + "-D_GLIBCXX_USE_CXX11_ABI=0", + ], + }), + features = select({ + ":windows": ["windows_export_all_symbols"], + "//conditions:default": [], + }), + linkshared = 1, + deps = [ + "//tensorflow_quantum/core/ops:parse_context", + "//tensorflow_quantum/core/ops:tfq_simulate_utils", + "//tensorflow_quantum/core/src:util_qsim", + "//tensorflow_quantum/core/src:circuit_parser_qsim", + "@qsim//lib:qsim_lib", + ], +) + +py_library( + name = "noisy_expectation_op_py", + srcs = ["noisy_expectation_op.py"], + data = [":_tfq_noise_ops.so"], + deps = [ + "//tensorflow_quantum/core/ops:load_module", + ], +) + +py_test( + name = "noisy_expectation_op_test", + srcs = ["noisy_expectation_op_test.py"], + python_version = "PY3", + deps = [ + ":noisy_expectation_op_py", + "//tensorflow_quantum/core/ops:batch_util", + "//tensorflow_quantum/python:util", + ], +) diff --git a/tensorflow_quantum/core/ops/noise/__init__.py b/tensorflow_quantum/core/ops/noise/__init__.py new file mode 100644 index 000000000..23243dd8d --- /dev/null +++ b/tensorflow_quantum/core/ops/noise/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Module for tfq.core.ops.noise.*""" + +from tensorflow_quantum.core.ops.noise.noisy_expectation_op import expectation diff --git a/tensorflow_quantum/core/ops/noise/noisy_expectation_op.py b/tensorflow_quantum/core/ops/noise/noisy_expectation_op.py new file mode 100644 index 000000000..c40c69882 --- /dev/null +++ b/tensorflow_quantum/core/ops/noise/noisy_expectation_op.py @@ -0,0 +1,65 @@ +# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module for high performance noisy circuit simulation ops."""
+import os
+import tensorflow as tf
+from tensorflow_quantum.core.ops.load_module import load_module
+
+NOISY_OP_MODULE = load_module(os.path.join("noise", "_tfq_noise_ops.so"))
+
+
+def expectation(programs, symbol_names, symbol_values, pauli_sums, num_samples):
+    """Calculate analytic expectation values using Monte Carlo trajectories.
+
+    Simulate the final state of each circuit in `programs`, with the values
+    in `symbol_values` substituted for the symbols named in `symbol_names`.
+    During each simulation, every channel in a circuit is "tossed" to one of
+    its possible realizations. This simulation is repeated `num_samples`
+    times, and after each run the analytic expectation values of the given
+    `pauli_sums` are computed. Once all runs are finished, these quantities
+    are averaged together. This process can be thought of as analytical
+    expectation calculation done with Monte Carlo state vector simulation to
+    account for noisy operations in the given circuits.
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits to be executed.
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers with shape
+            [batch_size, n_params] specifying parameter values to resolve
+            into the circuits specified by `programs`, following the ordering
+            dictated by `symbol_names`.
+        pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
+            containing the string representation of the operators that will
+            be used on all of the circuits in the expectation calculations.
+        num_samples: `tf.Tensor` where `num_samples[i][j]` is the number of
+            times `programs[i]` will be simulated to estimate
+            `pauli_sums[i][j]`. Therefore, `num_samples` must have the same
+            shape as `pauli_sums`. Note: internally this quantity may be
+            rounded up to the nearest multiple of the number of threads
+            available to TensorFlow. For best performance, ensure the
+            quantities in `num_samples` are multiples of the number of
+            available threads.
+    Returns:
+        `tf.Tensor` with shape [batch_size, n_ops] that holds the
+            expectation value for each circuit with each op applied to it
+            (after resolving the corresponding parameters).
+    """
+    return NOISY_OP_MODULE.tfq_noisy_expectation(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums,
+        tf.cast(num_samples, dtype=tf.int32))
diff --git a/tensorflow_quantum/core/ops/noise/noisy_expectation_op_test.py b/tensorflow_quantum/core/ops/noise/noisy_expectation_op_test.py
new file mode 100644
index 000000000..3b2b7501b
--- /dev/null
+++ b/tensorflow_quantum/core/ops/noise/noisy_expectation_op_test.py
@@ -0,0 +1,337 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
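For orientation, here is a minimal usage sketch of the op exposed above as `tfq.noise.expectation`. It is illustrative only and not part of the diff; the circuit, symbol name, and sample counts are arbitrary, and it assumes a TensorFlow Quantum build that includes this change.

```python
# Minimal sketch: estimating a noisy expectation value with the new
# tfq.noise.expectation op (the circuit and sample counts are arbitrary).
import cirq
import sympy
import tensorflow as tf
import tensorflow_quantum as tfq

qubit = cirq.GridQubit(0, 0)
alpha = sympy.Symbol('alpha')

# Parameterized circuit containing a noise channel.
circuit = cirq.Circuit(
    cirq.X(qubit)**alpha,
    cirq.depolarize(0.01).on(qubit),
)

programs = tfq.convert_to_tensor([circuit])
symbol_names = tf.constant(['alpha'])
symbol_values = tf.constant([[0.5]], dtype=tf.float32)
pauli_sums = tfq.convert_to_tensor([[cirq.Z(qubit)]])
# One trajectory count per (circuit, operator) pair; same shape as pauli_sums.
num_samples = tf.constant([[1000]], dtype=tf.int32)

# Monte Carlo trajectory estimate of <Z>, shape [batch_size, n_ops] = [1, 1].
noisy_exp = tfq.noise.expectation(programs, symbol_names, symbol_values,
                                  pauli_sums, num_samples)
print(noisy_exp)
```

As the docstring notes, each entry of `num_samples` may be rounded up internally to a multiple of the number of threads available to TensorFlow.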
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests that specifically target noisy expectation calculation.""" +import numpy as np +from absl.testing import parameterized +import tensorflow as tf +import cirq + +from tensorflow_quantum.core.ops import batch_util +from tensorflow_quantum.core.ops.noise import noisy_expectation_op +from tensorflow_quantum.python import util + + +class NoisyExpectationCalculationTest(tf.test.TestCase, parameterized.TestCase): + """Tests tfq.noise.expectation.""" + + def test_noisy_expectation_inputs(self): + """Make sure noisy expectation op fails gracefully on bad inputs.""" + n_qubits = 5 + batch_size = 5 + symbol_names = ['alpha'] + qubits = cirq.GridQubit.rect(1, n_qubits) + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size, include_channels=True) + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + pauli_sums = util.random_pauli_sums(qubits, 3, batch_size) + num_samples = [[10]] * batch_size + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'programs must be rank 1'): + # Circuit tensor has too many dimensions. + noisy_expectation_op.expectation( + util.convert_to_tensor([circuit_batch]), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_names must be rank 1.'): + # symbol_names tensor has too many dimensions. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), np.array([symbol_names]), + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too many dimensions. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + np.array([symbol_values_array]), + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'symbol_values must be rank 2.'): + # symbol_values_array tensor has too few dimensions. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[0], + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'pauli_sums must be rank 2.'): + # pauli_sums tensor has too few dimensions. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([x for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'pauli_sums must be rank 2.'): + # pauli_sums tensor has too many dimensions. 
+ noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + [util.convert_to_tensor([[x] for x in pauli_sums])], + num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'num_samples must be rank 2'): + # num_samples tensor has the wrong shape. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), + [num_samples]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'num_samples must be rank 2'): + # num_samples tensor has the wrong shape. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), + num_samples[0]) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Unparseable proto'): + # circuit tensor has the right type but invalid values. + noisy_expectation_op.expectation( + ['junk'] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Could not find symbol in parameter map'): + # symbol_names tensor has the right type but invalid values. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), ['junk'], + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'qubits not found in circuit'): + # pauli_sums tensor has the right type but invalid values. + new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)] + new_pauli_sums = util.random_pauli_sums(new_qubits, 2, batch_size) + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in new_pauli_sums]), + num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'Unparseable proto'): + # pauli_sums tensor has the right type but invalid values 2. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, [['junk']] * batch_size, num_samples) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # circuits tensor has the wrong type. + noisy_expectation_op.expectation( + [1.0] * batch_size, symbol_names, symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # symbol_names tensor has the wrong type. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), [0.1234], + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.UnimplementedError, ''): + # symbol_values tensor has the wrong type. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + [['junk']] * batch_size, + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(TypeError, 'Cannot convert'): + # pauli_sums tensor has the wrong type. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, [[1.0]] * batch_size, num_samples) + + with self.assertRaisesRegex(TypeError, 'missing'): + # we are missing an argument. 
+ # pylint: disable=no-value-for-parameter + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, num_samples) + # pylint: enable=no-value-for-parameter + + with self.assertRaisesRegex(TypeError, 'positional arguments'): + # pylint: disable=too-many-function-args + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), [], + num_samples) + # pylint: enable=too-many-function-args + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # wrong op size. + noisy_expectation_op.expectation( + util.convert_to_tensor([cirq.Circuit()]), symbol_names, + symbol_values_array.astype(np.float64), + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + 'greater than 0'): + # pylint: disable=too-many-function-args + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array, + util.convert_to_tensor([[x] for x in pauli_sums]), + [[-1]] * batch_size) + # pylint: enable=too-many-function-args + + with self.assertRaisesRegex(tf.errors.InvalidArgumentError, + expected_regex='do not match'): + # wrong symbol_values size. + noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), symbol_names, + symbol_values_array[:int(batch_size * 0.5)], + util.convert_to_tensor([[x] for x in pauli_sums]), num_samples) + + @parameterized.parameters([ + { + 'n_qubits': 13, + 'batch_size': 1, + 'noisy': False + }, # ComputeLarge. + { + 'n_qubits': 6, + 'batch_size': 25, + 'noisy': False + }, # ComputeSmall. + { + 'n_qubits': 6, + 'batch_size': 10, + 'noisy': True + }, # ComputeSmall. + { + 'n_qubits': 8, + 'batch_size': 1, + 'noisy': True + } # ComputeLarge. 
+ ]) + def test_simulate_consistency(self, batch_size, n_qubits, noisy): + """Test consistency with batch_util.py simulation.""" + symbol_names = ['alpha', 'beta'] + qubits = cirq.GridQubit.rect(1, n_qubits) + + circuit_batch, resolver_batch = \ + util.random_symbol_circuit_resolver_batch( + qubits, symbol_names, batch_size, include_channels=noisy) + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + pauli_sums1 = util.random_pauli_sums(qubits, 3, batch_size) + pauli_sums2 = util.random_pauli_sums(qubits, 3, batch_size) + batch_pauli_sums = [[x, y] for x, y in zip(pauli_sums1, pauli_sums2)] + num_samples = [[10000 if noisy else 3] * 2] * batch_size + + op_exps = noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), + symbol_names, symbol_values_array, + util.convert_to_tensor(batch_pauli_sums), num_samples) + + cirq_exps = batch_util.batch_calculate_expectation( + circuit_batch, resolver_batch, batch_pauli_sums, + cirq.DensityMatrixSimulator() if noisy else cirq.Simulator()) + tol = 5e-2 if noisy else 5e-4 + self.assertAllClose(cirq_exps, op_exps, atol=tol, rtol=tol) + + @parameterized.parameters([{ + 'channel': x + } for x in util.get_supported_channels()]) + def test_single_channel(self, channel): + """Individually test adding just a single channel type to circuits.""" + symbol_names = [] + batch_size = 5 + n_qubits = 6 + qubits = cirq.GridQubit.rect(1, n_qubits) + + circuit_batch, resolver_batch = \ + util.random_circuit_resolver_batch( + qubits, batch_size, include_channels=False) + + for i in range(batch_size): + circuit_batch[i] = circuit_batch[i] + channel.on_each(*qubits) + + symbol_values_array = np.array( + [[resolver[symbol] + for symbol in symbol_names] + for resolver in resolver_batch]) + + pauli_sums1 = util.random_pauli_sums(qubits, 3, batch_size) + pauli_sums2 = util.random_pauli_sums(qubits, 3, batch_size) + batch_pauli_sums = [[x, y] for x, y in zip(pauli_sums1, pauli_sums2)] + num_samples = [[10000] * 2] * batch_size + + op_exps = noisy_expectation_op.expectation( + util.convert_to_tensor(circuit_batch), + symbol_names, symbol_values_array, + util.convert_to_tensor(batch_pauli_sums), num_samples) + + cirq_exps = batch_util.batch_calculate_expectation( + circuit_batch, resolver_batch, batch_pauli_sums, + cirq.DensityMatrixSimulator()) + + self.assertAllClose(cirq_exps, op_exps, atol=5e-2, rtol=5e-2) + + def test_correctness_empty(self): + """Test the expectation for empty circuits.""" + empty_circuit = util.convert_to_tensor([cirq.Circuit()]) + empty_symbols = tf.convert_to_tensor([], dtype=tf.dtypes.string) + empty_values = tf.convert_to_tensor([[]]) + empty_paulis = tf.convert_to_tensor([[]], dtype=tf.dtypes.string) + empty_n_samples = tf.convert_to_tensor([[]], dtype=tf.int32) + + out = noisy_expectation_op.expectation(empty_circuit, empty_symbols, + empty_values, empty_paulis, + empty_n_samples) + + expected = np.array([[]], dtype=np.complex64) + self.assertAllClose(out, expected) + + def test_correctness_no_circuit(self): + """Test the correctness with the empty tensor.""" + empty_circuit = tf.raw_ops.Empty(shape=(0,), dtype=tf.string) + empty_symbols = tf.raw_ops.Empty(shape=(0,), dtype=tf.string) + empty_values = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32) + empty_paulis = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.string) + empty_n_samples = tf.raw_ops.Empty(shape=(0, 0), dtype=tf.int32) + + out = noisy_expectation_op.expectation(empty_circuit, empty_symbols, + 
empty_values, empty_paulis, + empty_n_samples) + + self.assertShapeEqual(np.zeros((0, 0)), out) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow_quantum/core/ops/noise/tfq_noisy_expectation.cc b/tensorflow_quantum/core/ops/noise/tfq_noisy_expectation.cc new file mode 100644 index 000000000..8c82474c3 --- /dev/null +++ b/tensorflow_quantum/core/ops/noise/tfq_noisy_expectation.cc @@ -0,0 +1,393 @@ +/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "../qsim/lib/channel.h" +#include "../qsim/lib/channels_cirq.h" +#include "../qsim/lib/circuit.h" +#include "../qsim/lib/circuit_noisy.h" +#include "../qsim/lib/fuser_mqubit.h" +#include "../qsim/lib/gate_appl.h" +#include "../qsim/lib/gates_cirq.h" +#include "../qsim/lib/io.h" +#include "../qsim/lib/qtrajectory.h" +#include "../qsim/lib/seqfor.h" +#include "../qsim/lib/simmux.h" +#include "cirq/google/api/v2/program.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/error_codes.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow_quantum/core/ops/parse_context.h" +#include "tensorflow_quantum/core/proto/pauli_sum.pb.h" +#include "tensorflow_quantum/core/src/util_qsim.h" + +namespace tfq { + +using ::cirq::google::api::v2::Program; +using ::tensorflow::Status; +using ::tfq::proto::PauliSum; + +typedef qsim::Cirq::GateCirq QsimGate; +typedef qsim::Circuit QsimCircuit; +typedef qsim::NoisyCircuit NoisyQsimCircuit; + +class TfqNoisyExpectationOp : public tensorflow::OpKernel { + public: + explicit TfqNoisyExpectationOp(tensorflow::OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(tensorflow::OpKernelContext* context) override { + // TODO (mbbrough): add more dimension checks for other inputs here. + const int num_inputs = context->num_inputs(); + OP_REQUIRES(context, num_inputs == 5, + tensorflow::errors::InvalidArgument(absl::StrCat( + "Expected 5 inputs, got ", num_inputs, " inputs."))); + + // Create the output Tensor. 
+ const int output_dim_batch_size = context->input(0).dim_size(0); + const int output_dim_op_size = context->input(3).dim_size(1); + tensorflow::TensorShape output_shape; + output_shape.AddDim(output_dim_batch_size); + output_shape.AddDim(output_dim_op_size); + + tensorflow::Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + auto output_tensor = output->matrix(); + + std::vector programs; + std::vector num_qubits; + std::vector> pauli_sums; + OP_REQUIRES_OK(context, GetProgramsAndNumQubits(context, &programs, + &num_qubits, &pauli_sums)); + + std::vector maps; + OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps)); + + OP_REQUIRES(context, programs.size() == maps.size(), + tensorflow::errors::InvalidArgument(absl::StrCat( + "Number of circuits and symbol_values do not match. Got ", + programs.size(), " circuits and ", maps.size(), + " symbol values."))); + + std::vector> num_samples; + OP_REQUIRES_OK(context, GetNumSamples(context, &num_samples)); + + OP_REQUIRES(context, num_samples.size() == pauli_sums.size(), + tensorflow::errors::InvalidArgument(absl::StrCat( + "Dimension 0 of num_samples and pauli_sums do not match.", + "Got ", num_samples.size(), " lists of sample sizes and ", + pauli_sums.size(), " lists of pauli sums."))); + + OP_REQUIRES( + context, context->input(4).dim_size(1) == context->input(3).dim_size(1), + tensorflow::errors::InvalidArgument(absl::StrCat( + "Dimension 1 of num_samples and pauli_sums do not match.", "Got ", + context->input(4).dim_size(1), " lists of sample sizes and ", + context->input(3).dim_size(1), " lists of pauli sums."))); + + // Construct qsim circuits. + std::vector qsim_circuits(programs.size(), + NoisyQsimCircuit()); + + auto construct_f = [&](int start, int end) { + for (int i = start; i < end; i++) { + OP_REQUIRES_OK(context, NoisyQsimCircuitFromProgram( + programs[i], maps[i], num_qubits[i], + &qsim_circuits[i])); + } + }; + + const int num_cycles = 1000; + context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( + programs.size(), num_cycles, construct_f); + + int max_num_qubits = 0; + for (const int num : num_qubits) { + max_num_qubits = std::max(max_num_qubits, num); + } + + // Cross reference with standard google cloud compute instances + // Memory ~= 2 * num_threads * (2 * 64 * 2 ** num_qubits in circuits) + // e2s2 = 2 CPU, 8GB -> Can safely do 25 since Memory = 4GB + // e2s4 = 4 CPU, 16GB -> Can safely do 25 since Memory = 8GB + // ... + if (max_num_qubits >= 26) { + // If the number of qubits is lager than 24, we switch to an + // alternate parallelization scheme with runtime: + // O(n_circuits * max_j(num_samples[i])) with parallelization being + // multiple threads per wavefunction. + ComputeLarge(num_qubits, qsim_circuits, pauli_sums, num_samples, context, + &output_tensor); + } else { + // Runtime: O(n_circuits * max_j(num_samples[i])) with parallelization + // being done over number of trials. + ComputeSmall(num_qubits, max_num_qubits, qsim_circuits, pauli_sums, + num_samples, context, &output_tensor); + } + } + + private: + void ComputeLarge(const std::vector& num_qubits, + const std::vector& ncircuits, + const std::vector>& pauli_sums, + const std::vector>& num_samples, + tensorflow::OpKernelContext* context, + tensorflow::TTypes::Matrix* output_tensor) { + // Instantiate qsim objects. 
+ const auto tfq_for = tfq::QsimFor(context); + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + using QTSimulator = + qsim::QuantumTrajectorySimulator; + + // Begin simulation. + int largest_nq = 1; + Simulator sim = Simulator(tfq_for); + StateSpace ss = StateSpace(tfq_for); + auto sv = ss.Create(largest_nq); + auto scratch = ss.Create(largest_nq); + + // Simulate programs one by one. Parallelizing over state vectors + // we no longer parallelize over circuits. Each time we encounter a + // a larger circuit we will grow the Statevector as necessary. + for (int i = 0; i < ncircuits.size(); i++) { + int nq = num_qubits[i]; + + // (#679) Just ignore empty program + if (ncircuits[i].channels.size() == 0) { + for (int j = 0; j < pauli_sums[i].size(); j++) { + (*output_tensor)(i, j) = -2.0; + } + continue; + } + + if (nq > largest_nq) { + largest_nq = nq; + sv = ss.Create(largest_nq); + scratch = ss.Create(largest_nq); + } + QTSimulator::Parameter param; + param.collect_kop_stat = false; + param.collect_mea_stat = false; + param.normalize_before_mea_gates = true; + std::vector unused_stats; + // Track op-wise stats. + std::vector run_samples(num_samples[i].size(), 0); + std::vector rolling_sums(num_samples[i].size(), 0.0); + + while (1) { + ss.SetStateZero(sv); + // time since epoch seeds random generator. + unsigned long r_seed = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + QTSimulator::RunOnce(param, ncircuits[i], r_seed, ss, sim, scratch, sv, + unused_stats); + + // Use this trajectory as a source for all expectation calculations. + for (int j = 0; j < pauli_sums[i].size(); j++) { + if (run_samples[j] >= num_samples[i][j]) { + continue; + } + float exp_v = 0.0; + OP_REQUIRES_OK(context, + ComputeExpectationQsim(pauli_sums[i][j], sim, ss, sv, + scratch, &exp_v)); + rolling_sums[j] += static_cast(exp_v); + run_samples[j]++; + } + bool break_loop = true; + for (int j = 0; j < num_samples[i].size(); j++) { + if (run_samples[j] < num_samples[i][j]) { + break_loop = false; + break; + } + } + if (break_loop) { + for (int j = 0; j < num_samples[i].size(); j++) { + rolling_sums[j] /= num_samples[i][j]; + (*output_tensor)(i, j) = static_cast(rolling_sums[j]); + } + break; + } + } + } + } + + void ComputeSmall(const std::vector& num_qubits, + const int max_num_qubits, + const std::vector& ncircuits, + const std::vector>& pauli_sums, + const std::vector>& num_samples, + tensorflow::OpKernelContext* context, + tensorflow::TTypes::Matrix* output_tensor) { + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + using QTSimulator = + qsim::QuantumTrajectorySimulator; + + const int output_dim_batch_size = output_tensor->dimension(0); + std::vector batch_locks(output_dim_batch_size, + tensorflow::mutex()); + + const int num_threads = context->device() + ->tensorflow_cpu_worker_threads() + ->workers->NumThreads(); + + // [num_threads, batch_size]. + std::vector> rep_offsets( + num_threads, std::vector(output_dim_batch_size, 0)); + + BalanceTrajectory(num_samples, num_threads, &rep_offsets); + + output_tensor->setZero(); + auto DoWork = [&](int start, int end) { + // Begin simulation. 
+ const auto tfq_for = qsim::SequentialFor(1); + int largest_nq = 1; + Simulator sim = Simulator(tfq_for); + StateSpace ss = StateSpace(tfq_for); + auto sv = ss.Create(largest_nq); + auto scratch = ss.Create(largest_nq); + + for (int i = 0; i < ncircuits.size(); i++) { + int nq = num_qubits[i]; + int rep_offset = rep_offsets[start][i]; + + // (#679) Just ignore empty program + if (ncircuits[i].channels.size() == 0) { + for (int j = 0; j < pauli_sums[i].size(); j++) { + (*output_tensor)(i, j) = -2.0; + } + continue; + } + + if (nq > largest_nq) { + largest_nq = nq; + sv = ss.Create(largest_nq); + scratch = ss.Create(largest_nq); + } + QTSimulator::Parameter param; + param.collect_kop_stat = false; + param.collect_mea_stat = false; + param.normalize_before_mea_gates = true; + std::vector unused_stats; + // Track op-wise stats. + std::vector run_samples(num_samples[i].size(), 0); + std::vector rolling_sums(num_samples[i].size(), 0.0); + + while (1) { + ss.SetStateZero(sv); + // time since epoch seeds random generator. + unsigned long r_seed = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + QTSimulator::RunOnce(param, ncircuits[i], r_seed, ss, sim, scratch, + sv, unused_stats); + + // Compute expectations across all ops using this trajectory. + for (int j = 0; j < pauli_sums[i].size(); j++) { + int p_reps = (num_samples[i][j] + num_threads - 1) / num_threads; + if (run_samples[j] >= p_reps + rep_offset) { + continue; + } + float exp_v = 0.0; + OP_REQUIRES_OK(context, + ComputeExpectationQsim(pauli_sums[i][j], sim, ss, sv, + scratch, &exp_v)); + rolling_sums[j] += static_cast(exp_v); + run_samples[j]++; + } + + // Check if we have run enough trajectories for all ops. + bool break_loop = true; + for (int j = 0; j < num_samples[i].size(); j++) { + int p_reps = (num_samples[i][j] + num_threads - 1) / num_threads; + if (run_samples[j] < p_reps + rep_offset) { + break_loop = false; + break; + } + } + if (break_loop) { + // Lock writing to this batch index in output_tensor. + batch_locks[i].lock(); + for (int j = 0; j < num_samples[i].size(); j++) { + rolling_sums[j] /= num_samples[i][j]; + (*output_tensor)(i, j) += static_cast(rolling_sums[j]); + } + batch_locks[i].unlock(); + break; + } + } + } + }; + + // block_size = 1. 
+ tensorflow::thread::ThreadPool::SchedulingParams scheduling_params( + tensorflow::thread::ThreadPool::SchedulingStrategy::kFixedBlockSize, + absl::nullopt, 1); + context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( + num_threads, scheduling_params, DoWork); + } +}; + +REGISTER_KERNEL_BUILDER( + Name("TfqNoisyExpectation").Device(tensorflow::DEVICE_CPU), + TfqNoisyExpectationOp); + +REGISTER_OP("TfqNoisyExpectation") + .Input("programs: string") + .Input("symbol_names: string") + .Input("symbol_values: float") + .Input("pauli_sums: string") + .Input("num_samples: int32") + .Output("expectations: float") + .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) { + tensorflow::shape_inference::ShapeHandle programs_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape)); + + tensorflow::shape_inference::ShapeHandle symbol_names_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape)); + + tensorflow::shape_inference::ShapeHandle symbol_values_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape)); + + tensorflow::shape_inference::ShapeHandle pauli_sums_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &pauli_sums_shape)); + + tensorflow::shape_inference::ShapeHandle num_samples_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &num_samples_shape)); + + tensorflow::shape_inference::DimensionHandle output_rows = + c->Dim(programs_shape, 0); + tensorflow::shape_inference::DimensionHandle output_cols = + c->Dim(pauli_sums_shape, 1); + c->set_output(0, c->Matrix(output_rows, output_cols)); + + return tensorflow::Status::OK(); + }); + +} // namespace tfq diff --git a/tensorflow_quantum/core/src/circuit_parser_qsim.cc b/tensorflow_quantum/core/src/circuit_parser_qsim.cc index 2f942f2b9..b54b9623f 100644 --- a/tensorflow_quantum/core/src/circuit_parser_qsim.cc +++ b/tensorflow_quantum/core/src/circuit_parser_qsim.cc @@ -570,7 +570,8 @@ tensorflow::Status ParseAppendGate(const Operation& op, const unsigned int num_qubits, const unsigned int time, QsimCircuit* circuit, - std::vector* metadata) { + std::vector* metadata, + bool* lookup_succeeded) { // map gate name -> callable to build that qsim gate from operation proto. static const absl::flat_hash_map< std::string, @@ -588,11 +589,13 @@ tensorflow::Status ParseAppendGate(const Operation& op, auto build_f = func_map.find(op.gate().id()); if (build_f == func_map.end()) { + *lookup_succeeded = false; return Status(tensorflow::error::INVALID_ARGUMENT, absl::StrCat("Could not parse gate id: ", op.gate().id(), ". This is likely because a cirq.Channel was " "used in an op that does not support them.")); } + *lookup_succeeded = true; return build_f->second(op, param_map, num_qubits, time, circuit, metadata); } @@ -614,7 +617,7 @@ inline Status AsymmetricDepolarizingChannel(const Operation& op, } auto chan = qsim::Cirq::AsymmetricDepolarizingChannel::Create( time, num_qubits - q - 1, p_x, p_y, p_z); - ncircuit->push_back(chan); + ncircuit->channels.push_back(chan); return Status::OK(); } @@ -634,7 +637,7 @@ inline Status DepolarizingChannel(const Operation& op, } auto chan = qsim::Cirq::DepolarizingChannel::Create( time, num_qubits - q - 1, p); - ncircuit->push_back(chan); + ncircuit->channels.push_back(chan); return Status::OK(); } @@ -664,25 +667,32 @@ tensorflow::Status NoisyQsimCircuitFromProgram(const Program& program, const int num_qubits, NoisyQsimCircuit* ncircuit) { // Special case empty. 
+ ncircuit->num_qubits = num_qubits; if (num_qubits <= 0) { return Status::OK(); } + int time = 0; + bool gate_found; QsimCircuit placeholder; placeholder.gates.reserve(2); for (const Moment& moment : program.circuit().moments()) { for (const Operation& op : moment.operations()) { placeholder.gates.clear(); + gate_found = false; Status status = ParseAppendGate(op, param_map, num_qubits, time, - &placeholder, nullptr); - if (!status.ok()) { - // if failed to append gate, try appending channel. - status = ParseAppendChannel(op, num_qubits, time, ncircuit); - } else { - // succeeded in appending gate, convert to channel. - ncircuit->push_back( + &placeholder, nullptr, &gate_found); + if (gate_found && !status.ok()) { + // gate found, failed when parsing proto. + return status; + } else if (status.ok()) { + // gate found. succeeded in parsing. + ncircuit->channels.push_back( std::move(qsim::MakeChannelFromGate(time, placeholder.gates[0]))); + } else { + // got not found. Attempt to find and append channel. + status = ParseAppendChannel(op, num_qubits, time, ncircuit); } if (!status.ok()) { @@ -702,6 +712,7 @@ tensorflow::Status QsimCircuitFromProgram( // Convert proto to qsim internal representation. circuit->num_qubits = num_qubits; int time = 0; + bool unused; // Special case empty. if (num_qubits <= 0) { return Status::OK(); @@ -713,8 +724,8 @@ tensorflow::Status QsimCircuitFromProgram( } for (const Moment& moment : program.circuit().moments()) { for (const Operation& op : moment.operations()) { - Status status = - ParseAppendGate(op, param_map, num_qubits, time, circuit, metadata); + Status status = ParseAppendGate(op, param_map, num_qubits, time, circuit, + metadata, &unused); if (!status.ok()) { return status; } diff --git a/tensorflow_quantum/core/src/circuit_parser_qsim_test.cc b/tensorflow_quantum/core/src/circuit_parser_qsim_test.cc index cd4718a22..6ea256af6 100644 --- a/tensorflow_quantum/core/src/circuit_parser_qsim_test.cc +++ b/tensorflow_quantum/core/src/circuit_parser_qsim_test.cc @@ -1227,9 +1227,10 @@ TEST(QsimCircuitParserTest, CompoundCircuit) { ASSERT_EQ(NoisyQsimCircuitFromProgram(program_proto, {}, 2, &test_circuit), tensorflow::Status::OK()); - AssertChannelEqual(test_circuit[0], ref_chan); - AssertOneQubitEqual(test_circuit[1][0].ops[0], ref_gate); - ASSERT_EQ(test_circuit.size(), 2); + AssertChannelEqual(test_circuit.channels[0], ref_chan); + AssertOneQubitEqual(test_circuit.channels[1][0].ops[0], ref_gate); + ASSERT_EQ(test_circuit.channels.size(), 2); + ASSERT_EQ(test_circuit.num_qubits, 2); } TEST(QsimCircuitParserTest, AsymmetricDepolarizing) { @@ -1267,8 +1268,9 @@ TEST(QsimCircuitParserTest, AsymmetricDepolarizing) { ASSERT_EQ(NoisyQsimCircuitFromProgram(program_proto, {}, 1, &test_circuit), tensorflow::Status::OK()); - AssertChannelEqual(test_circuit[0], reference); - ASSERT_EQ(test_circuit.size(), 1); + AssertChannelEqual(test_circuit.channels[0], reference); + ASSERT_EQ(test_circuit.channels.size(), 1); + ASSERT_EQ(test_circuit.num_qubits, 1); } TEST(QsimCircuitParserTest, Depolarizing) { @@ -1301,8 +1303,9 @@ TEST(QsimCircuitParserTest, Depolarizing) { ASSERT_EQ(NoisyQsimCircuitFromProgram(program_proto, {}, 1, &test_circuit), tensorflow::Status::OK()); - AssertChannelEqual(test_circuit[0], reference); - ASSERT_EQ(test_circuit.size(), 1); + AssertChannelEqual(test_circuit.channels[0], reference); + ASSERT_EQ(test_circuit.channels.size(), 1); + ASSERT_EQ(test_circuit.num_qubits, 1); } TEST(QsimCircuitParserTest, NoisyEmpty) { @@ -1314,7 +1317,8 @@ 
TEST(QsimCircuitParserTest, NoisyEmpty) {
   NoisyQsimCircuit test_circuit;
   ASSERT_EQ(NoisyQsimCircuitFromProgram(program_proto, {}, 0, &test_circuit),
             tensorflow::Status::OK());
-  ASSERT_EQ(test_circuit.size(), 0);
+  ASSERT_EQ(test_circuit.channels.size(), 0);
+  ASSERT_EQ(test_circuit.num_qubits, 0);
 }
 
 TEST(QsimCircuitParserTest, NoisyBadProto) {
diff --git a/tensorflow_quantum/core/src/util_qsim.h b/tensorflow_quantum/core/src/util_qsim.h
index 955085159..7e9e39f6d 100644
--- a/tensorflow_quantum/core/src/util_qsim.h
+++ b/tensorflow_quantum/core/src/util_qsim.h
@@ -344,6 +344,56 @@ tensorflow::Status AccumulateFusedCircuits(
   return status;
 }
 
+// Balance the number of trajectory computations done between threads.
+// num_samples is a 2d vector containing the number of reps requested
+// for each pauli_sum[i,j]. After running, thread_offsets contains 0/-1
+// values that offset the work assigned to each thread, making the load
+// as close to uniform as possible. **Assumes circuits have roughly
+// equal simulation cost.**
+static void BalanceTrajectory(const std::vector<std::vector<int>>& num_samples,
+                              const int& num_threads,
+                              std::vector<std::vector<int>>* thread_offsets) {
+  std::vector<int> rep_limits(num_samples.size(), -1);
+  std::vector<int> height(num_threads, 0);
+
+  for (int i = 0; i < num_samples.size(); i++) {
+    for (int j = 0; j < num_samples[i].size(); j++) {
+      rep_limits[i] = std::max(rep_limits[i], num_samples[i][j]);
+    }
+  }
+  int prev_max_height = -1;
+  for (int j = 0; j < num_samples.size(); j++) {
+    int run_ceiling = ((rep_limits[j] + num_threads - 1) / num_threads);
+    int num_lo = num_threads * run_ceiling - rep_limits[j];
+    int num_hi = num_threads - num_lo;
+    int cur_max = prev_max_height;
+    for (int i = 0; i < num_threads; i++) {
+      if (height[i] == cur_max && num_lo) {
+        // previously had extra work on this thread and
+        // have remaining low budget to give.
+        height[i]++;
+        (*thread_offsets)[i][j] = -1;
+        num_lo--;
+      } else if (height[i] == cur_max - 1 && num_hi) {
+        // previously had less work on this thread and
+        // have remaining high budget to give.
+ height[i] += 2; + (*thread_offsets)[i][j] = 0; + num_hi--; + } else if (num_hi) { + height[i] += 2; + (*thread_offsets)[i][j] = 0; + num_hi--; + } else { + height[i]++; + (*thread_offsets)[i][j] = -1; + num_lo--; + } + prev_max_height = std::max(height[i], prev_max_height); + } + } +} + } // namespace tfq #endif // UTIL_QSIM_H_ diff --git a/tensorflow_quantum/core/src/util_qsim_test.cc b/tensorflow_quantum/core/src/util_qsim_test.cc index 0740afa18..82160c5f9 100644 --- a/tensorflow_quantum/core/src/util_qsim_test.cc +++ b/tensorflow_quantum/core/src/util_qsim_test.cc @@ -625,5 +625,94 @@ TEST(UtilQsimTest, AccumulateFusedCircuitsEmpty) { EXPECT_NEAR(ss.GetAmpl(dest, 3).real(), 0.0, 1e-5); } +static void AssertWellBalanced(const std::vector>& n_reps, + const int& num_threads, + const std::vector>& offsets) { + auto max_work = std::vector(n_reps.size(), -1); + for (int i = 0; i < n_reps.size(); i++) { + for (int j = 0; j < n_reps[0].size(); j++) { + max_work[i] = std::max(max_work[i], n_reps[i][j]); + } + } + + for (int i = 0; i < n_reps.size(); i++) { + int sum = 0; + int prev_local_work = 0; + for (int k = 0; k < num_threads; k++) { + int local_work = (max_work[i] + num_threads - 1) / num_threads; + local_work += offsets[k][i]; + sum += local_work; + if (k > 0) { + EXPECT_LT(abs(local_work - prev_local_work), 2); + } + prev_local_work = local_work; + } + EXPECT_EQ(sum, max_work[i]); + } +} + +TEST(UtilQsimTest, BalanceTrajectorySimple) { + std::vector> n_reps = {{1, 3, 5, 10, 15}, + {1, 10, 20, 30, 40}, + {50, 70, 100, 100, 100}, + {100, 200, 200, 200, 200}}; + const int num_threads = 3; + // [num_threads, n_reps.size()] + std::vector> offsets = { + {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}; + + BalanceTrajectory(n_reps, num_threads, &offsets); + AssertWellBalanced(n_reps, num_threads, offsets); +} + +TEST(UtilQsimTest, BalanceTrajectoryPreventIdle) { + std::vector> n_reps = {{1, 1, 1, 1, 11}, + {1, 1, 1, 11, 1}, + {1, 1, 11, 1, 1}, + {1, 11, 1, 1, 1}, + {11, 1, 1, 1, 1}}; + const int num_threads = 10; + // [num_threads, n_reps.size()] + std::vector> offsets = { + {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}}; + + BalanceTrajectory(n_reps, num_threads, &offsets); + AssertWellBalanced(n_reps, num_threads, offsets); +} + +TEST(UtilQsimTest, BalanceTrajectoryLowRep) { + std::vector> n_reps = { + {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}; + const int num_threads = 5; + // [num_threads, n_reps.size()] + std::vector> offsets = {{0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}}; + + BalanceTrajectory(n_reps, num_threads, &offsets); + AssertWellBalanced(n_reps, num_threads, offsets); +} + +TEST(UtilQsimTest, BalanceTrajectoryFewHigh) { + std::vector> n_reps = { + {1, 100, 1, 1, 1}, {1, 1, 1, 1, 1000}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1}, {1, 10, 1, 1, 1}, {1, 1, 1, 1, 1000}}; + const int num_threads = 5; + // [num_threads, n_reps.size()] + std::vector> offsets = {{0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0}}; + + BalanceTrajectory(n_reps, num_threads, &offsets); + AssertWellBalanced(n_reps, num_threads, offsets); +} + } // namespace } // namespace tfq
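To make the trajectory-balancing scheme above easier to follow, here is a small Python sketch that directly ports `BalanceTrajectory` and its 0/-1 offset convention. The helper name `balance_trajectory` is illustrative, and the sample counts are borrowed from the `BalanceTrajectorySimple` test. Each thread runs roughly `ceil(max_reps / num_threads)` trajectories per circuit, adjusted by its offset, which is exactly the property `AssertWellBalanced` checks in the tests.

```python
# Python sketch of the C++ BalanceTrajectory scheme: give each thread
# ceil(max_reps / num_threads) trajectories per circuit, then use 0/-1
# offsets so the per-circuit totals land exactly on max_reps.
import math


def balance_trajectory(num_samples, num_threads):
    """Port of BalanceTrajectory; returns [num_threads][num_circuits] offsets."""
    rep_limits = [max(row) for row in num_samples]
    height = [0] * num_threads
    offsets = [[0] * len(num_samples) for _ in range(num_threads)]
    prev_max_height = -1
    for j in range(len(num_samples)):
        run_ceiling = (rep_limits[j] + num_threads - 1) // num_threads
        num_lo = num_threads * run_ceiling - rep_limits[j]
        num_hi = num_threads - num_lo
        cur_max = prev_max_height
        for i in range(num_threads):
            if height[i] == cur_max and num_lo:
                # Thread previously did extra work; give it the lighter load.
                height[i] += 1
                offsets[i][j] = -1
                num_lo -= 1
            elif height[i] == cur_max - 1 and num_hi:
                # Thread previously did less work; give it the heavier load.
                height[i] += 2
                offsets[i][j] = 0
                num_hi -= 1
            elif num_hi:
                height[i] += 2
                offsets[i][j] = 0
                num_hi -= 1
            else:
                height[i] += 1
                offsets[i][j] = -1
                num_lo -= 1
            prev_max_height = max(prev_max_height, height[i])
    return offsets


# Mirror AssertWellBalanced: per-thread trajectory counts for each circuit
# sum to max_j num_samples[i][j].
num_samples = [[1, 3, 5, 10, 15], [50, 70, 100, 100, 100]]
num_threads = 3
offsets = balance_trajectory(num_samples, num_threads)
for i, row in enumerate(num_samples):
    per_thread = [math.ceil(max(row) / num_threads) + offsets[k][i]
                  for k in range(num_threads)]
    assert sum(per_thread) == max(row)
    print(f"circuit {i}: per-thread trajectories {per_thread}")
```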