From 2d1445cf4d730a3f29320943bf0364763b528893 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 2 Dec 2019 12:15:46 -0800 Subject: [PATCH 01/67] fixed no of outputs --- ngraph_bridge/ngraph_encapsulate_op.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index a67f2084e..0bf21451d 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -413,6 +413,7 @@ void NGraphEncapsulateOp::Compute(OpKernelContext* ctx) { // ComputeUsingParallelExecutor //--------------------------------------------------------------------------- void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { + cout << "using parallel exec " << endl; // TF input tensors std::vector tf_input_tensors; @@ -484,7 +485,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { // create inputs, outputs, pipelineId int num_of_inputs = tensor_manager->GetNumberOfInputs(); - int num_of_outputs = tensor_manager->GetNumberOfInputs(); + int num_of_outputs = tensor_manager->GetNumberOfOutputs(); int current_iter_pipeline_depth = get<0>(io_tensors); vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); @@ -497,6 +498,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { bool skip_tf2ng_copy = false; if (std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) != nullptr) { + cout << "using prefetch env flag " << endl; NGraphPrefetchSharedResouce::InputTensorBundle prefetch_input_tensor_bundle{ current_iter_pipeline_depth, ng_inputs}; // Set the prefetch shared obj if applicable @@ -542,6 +544,8 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " "signal prefetching"; } else { + cout << "using prefetch inputs " << endl; + int prefetch_buffer_depth = 
shared_data->GetBufferDepth(); int skip_count = shared_data->GetSkipCount(); NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth From 36a0b6191343c260e37fdbaa1da3f772cc130e87 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 2 Dec 2019 14:54:21 -0800 Subject: [PATCH 02/67] Some minor changes --- ngraph_bridge/ngraph_encapsulate_op.cc | 37 +++++++++++--------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 0bf21451d..2f261377a 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -458,34 +458,29 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { // Get Tensor Manager and some error checking auto tensor_manager = m_parallel_executor->GetTensorManager(); - OP_REQUIRES(ctx, tensor_manager->GetNumberOfInputs() == ctx->num_inputs(), - errors::Internal("Num of inputs from TensorManager ", - tensor_manager->GetNumberOfInputs(), - " and Ctx->num_inputs() ", ctx->num_inputs(), - " do not match")); - OP_REQUIRES(ctx, - tensor_manager->GetNumberOfInputs() == tf_input_tensors.size(), + int num_of_inputs = tensor_manager->GetNumberOfInputs(); + int num_of_outputs = tensor_manager->GetNumberOfOutputs(); + OP_REQUIRES(ctx, num_of_inputs == ctx->num_inputs(), errors::Internal("Num of inputs from TensorManager ", - tensor_manager->GetNumberOfInputs(), - " and num of " - "input tensors from ctxt ", - tf_input_tensors.size(), " do not match")); + num_of_inputs, " and Ctx->num_inputs() ", + ctx->num_inputs(), " do not match")); + OP_REQUIRES( + ctx, num_of_inputs == tf_input_tensors.size(), + errors::Internal("Num of inputs from TensorManager ", num_of_inputs, + " and num of " + "input tensors from ctxt ", + tf_input_tensors.size(), " do not match")); - OP_REQUIRES(ctx, tensor_manager->GetNumberOfOutputs() == ctx->num_outputs(), + OP_REQUIRES(ctx, num_of_outputs == 
ctx->num_outputs(), errors::Internal("Num of outputs from TensorManager ", - tensor_manager->GetNumberOfOutputs(), - " and Ctx->num_outputs()", ctx->num_outputs(), - " do not match")); - OP_REQUIRES(ctx, tensor_manager->GetNumberOfOutputs() == - ng_exec->get_results().size(), + num_of_outputs, " and Ctx->num_outputs()", + ctx->num_outputs(), " do not match")); + OP_REQUIRES(ctx, num_of_outputs == ng_exec->get_results().size(), errors::Internal("Num of outputs from TensorManager ", - tensor_manager->GetNumberOfOutputs(), - "and number of exec outputs ", + num_of_outputs, "and number of exec outputs ", ng_exec->get_results().size(), " do not match")); // create inputs, outputs, pipelineId - int num_of_inputs = tensor_manager->GetNumberOfInputs(); - int num_of_outputs = tensor_manager->GetNumberOfOutputs(); int current_iter_pipeline_depth = get<0>(io_tensors); vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); From e65d66fa66f92c0f428e35e49e224dc7e6092a70 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 2 Dec 2019 15:21:07 -0800 Subject: [PATCH 03/67] initialize only pipelined tensors --- ngraph_bridge/ngraph_encapsulate_op.cc | 6 ++++ ngraph_bridge/ngraph_executor.cc | 38 +++++++++++++------------- ngraph_bridge/ngraph_executor.h | 3 +- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 2f261377a..251b11c0f 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -470,6 +470,12 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { " and num of " "input tensors from ctxt ", tf_input_tensors.size(), " do not match")); + OP_REQUIRES( + ctx, num_of_inputs == ng_exec->get_parameters().size(), + errors::Internal("Num of inputs from TensorManager ", num_of_inputs, + " and num of " + "parameters from exec ", + ng_exec->get_parameters().size(), " do not match")); OP_REQUIRES(ctx, 
num_of_outputs == ctx->num_outputs(), errors::Internal("Num of outputs from TensorManager ", diff --git a/ngraph_bridge/ngraph_executor.cc b/ngraph_bridge/ngraph_executor.cc index e75ff55cf..b24e04f07 100644 --- a/ngraph_bridge/ngraph_executor.cc +++ b/ngraph_bridge/ngraph_executor.cc @@ -318,7 +318,7 @@ NGraphExecutor::CreateCallback(const std::string signature, // Create PipelinedTensorStore if (status_ng_exec_pair.first == Status::OK()) { ng_exec = status_ng_exec_pair.second; - auto status_ng_pts_pair = InitializeIOTensorPipeline(ng_exec); + auto status_ng_pts_pair = InitializeIOTensorPipeline(ng_exec, m_tensor_manager->GetPipelinedInputIndexes(), m_tensor_manager->GetPipelinedOutputIndexes()); pts = status_ng_pts_pair.second; return std::make_pair(status_ng_pts_pair.first, std::make_tuple(ng_exec, serialized_ng_func, pts)); @@ -463,7 +463,9 @@ Status NGraphExecutor::ParseNodeAttributes( std::pair> NGraphExecutor::InitializeIOTensorPipeline( - std::shared_ptr ng_exec) { + std::shared_ptr ng_exec, + const vector& pipelined_input_indexes, + const vector& pipelined_output_indexes) { if (!m_executable_can_create_tensor) { return std::make_pair( errors::Internal( @@ -472,27 +474,25 @@ NGraphExecutor::InitializeIOTensorPipeline( nullptr); } // Create these pipelined ng tensors only if needed, else reuse from cache - size_t num_inputs = ng_exec->get_parameters().size(); - size_t num_outputs = ng_exec->get_results().size(); - - if (num_outputs == 0) { - return std::make_pair( - errors::Internal("Bad input/output length. Input size: ", num_inputs, - " Output size: ", num_outputs), - nullptr); - } + size_t num_pipelined_inputs = pipelined_input_indexes.size(); + size_t num_pipelined_outputs = pipelined_output_indexes.size(); // If the input or the output size if 0 then??? 
- NGRAPH_VLOG(5) << "InitializeIOTensorPipeline: In: " << num_inputs - << " Out: " << num_outputs; - PipelinedTensorMatrix pipelined_input_tensors(num_inputs); - PipelinedTensorMatrix pipelined_output_tensors(num_outputs); - for (size_t i = 0; i < num_inputs; i++) { - pipelined_input_tensors[i] = ng_exec->create_input_tensor(i, m_depth); + NGRAPH_VLOG(5) << "InitializeIOTensorPipeline: No. of Pipelined Inputs: " << num_inputs + << " No. of Pipelined Pipelined Outputs: " << num_outputs; + PipelinedTensorMatrix pipelined_input_tensors(num_pipelined_inputs); + PipelinedTensorMatrix pipelined_output_tensors(num_pipelined_outputs); + + for (size_t i = 0; i < num_pipelined_inputs; i++) { + int input_index = pipelined_input_indexes[i]; + pipelined_input_tensors[i] = ng_exec->create_input_tensor(input_index, m_depth); } - for (size_t i = 0; i < num_outputs; i++) { - pipelined_output_tensors[i] = ng_exec->create_output_tensor(i, m_depth); + + for (size_t i = 0; i < num_pipelined_outputs; i++) { + int output_index = pipelined_output_indexes[i]; + pipelined_output_tensors[i] = ng_exec->create_output_tensor(output_index, m_depth); } + shared_ptr pts(new PipelinedTensorsStore( pipelined_input_tensors, pipelined_output_tensors)); diff --git a/ngraph_bridge/ngraph_executor.h b/ngraph_bridge/ngraph_executor.h index d2c715b89..229791cfd 100644 --- a/ngraph_bridge/ngraph_executor.h +++ b/ngraph_bridge/ngraph_executor.h @@ -103,7 +103,8 @@ class NGraphExecutor { // Called from CreateCallback std::pair> InitializeIOTensorPipeline( - std::shared_ptr ng_exec); + std::shared_ptr ng_exec, const vector& pipelined_input_indexes, + const vector& pipelined_output_indexes); // Get tensorflow input tensors, input shapes, static_inputs to Compute // Signature From 71d0cad054a48c9d8982f1bb65533c8d12dce987 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 2 Dec 2019 15:21:07 -0800 Subject: [PATCH 04/67] initialize only pipelined tensors --- ngraph_bridge/ngraph_encapsulate_op.cc | 6 ++++ 
ngraph_bridge/ngraph_executor.cc | 43 ++++++++++++++------------ ngraph_bridge/ngraph_executor.h | 4 ++- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 2f261377a..251b11c0f 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -470,6 +470,12 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { " and num of " "input tensors from ctxt ", tf_input_tensors.size(), " do not match")); + OP_REQUIRES( + ctx, num_of_inputs == ng_exec->get_parameters().size(), + errors::Internal("Num of inputs from TensorManager ", num_of_inputs, + " and num of " + "parameters from exec ", + ng_exec->get_parameters().size(), " do not match")); OP_REQUIRES(ctx, num_of_outputs == ctx->num_outputs(), errors::Internal("Num of outputs from TensorManager ", diff --git a/ngraph_bridge/ngraph_executor.cc b/ngraph_bridge/ngraph_executor.cc index e75ff55cf..c60f49209 100644 --- a/ngraph_bridge/ngraph_executor.cc +++ b/ngraph_bridge/ngraph_executor.cc @@ -318,7 +318,9 @@ NGraphExecutor::CreateCallback(const std::string signature, // Create PipelinedTensorStore if (status_ng_exec_pair.first == Status::OK()) { ng_exec = status_ng_exec_pair.second; - auto status_ng_pts_pair = InitializeIOTensorPipeline(ng_exec); + auto status_ng_pts_pair = InitializeIOTensorPipeline( + ng_exec, m_tensor_manager->GetPipelinedInputIndexes(), + m_tensor_manager->GetPipelinedOutputIndexes()); pts = status_ng_pts_pair.second; return std::make_pair(status_ng_pts_pair.first, std::make_tuple(ng_exec, serialized_ng_func, pts)); @@ -463,7 +465,9 @@ Status NGraphExecutor::ParseNodeAttributes( std::pair> NGraphExecutor::InitializeIOTensorPipeline( - std::shared_ptr ng_exec) { + std::shared_ptr ng_exec, + const vector& pipelined_input_indexes, + const vector& pipelined_output_indexes) { if (!m_executable_can_create_tensor) { return std::make_pair( 
errors::Internal( @@ -472,27 +476,28 @@ NGraphExecutor::InitializeIOTensorPipeline( nullptr); } // Create these pipelined ng tensors only if needed, else reuse from cache - size_t num_inputs = ng_exec->get_parameters().size(); - size_t num_outputs = ng_exec->get_results().size(); - - if (num_outputs == 0) { - return std::make_pair( - errors::Internal("Bad input/output length. Input size: ", num_inputs, - " Output size: ", num_outputs), - nullptr); - } + size_t num_pipelined_inputs = pipelined_input_indexes.size(); + size_t num_pipelined_outputs = pipelined_output_indexes.size(); // If the input or the output size if 0 then??? - NGRAPH_VLOG(5) << "InitializeIOTensorPipeline: In: " << num_inputs - << " Out: " << num_outputs; - PipelinedTensorMatrix pipelined_input_tensors(num_inputs); - PipelinedTensorMatrix pipelined_output_tensors(num_outputs); - for (size_t i = 0; i < num_inputs; i++) { - pipelined_input_tensors[i] = ng_exec->create_input_tensor(i, m_depth); + NGRAPH_VLOG(5) << "InitializeIOTensorPipeline: No. of Pipelined Inputs: " + << num_inputs + << " No. 
of Pipelined Pipelined Outputs: " << num_outputs; + PipelinedTensorMatrix pipelined_input_tensors(num_pipelined_inputs); + PipelinedTensorMatrix pipelined_output_tensors(num_pipelined_outputs); + + for (size_t i = 0; i < num_pipelined_inputs; i++) { + int input_index = pipelined_input_indexes[i]; + pipelined_input_tensors[i] = + ng_exec->create_input_tensor(input_index, m_depth); } - for (size_t i = 0; i < num_outputs; i++) { - pipelined_output_tensors[i] = ng_exec->create_output_tensor(i, m_depth); + + for (size_t i = 0; i < num_pipelined_outputs; i++) { + int output_index = pipelined_output_indexes[i]; + pipelined_output_tensors[i] = + ng_exec->create_output_tensor(output_index, m_depth); } + shared_ptr pts(new PipelinedTensorsStore( pipelined_input_tensors, pipelined_output_tensors)); diff --git a/ngraph_bridge/ngraph_executor.h b/ngraph_bridge/ngraph_executor.h index d2c715b89..a71851c2c 100644 --- a/ngraph_bridge/ngraph_executor.h +++ b/ngraph_bridge/ngraph_executor.h @@ -103,7 +103,9 @@ class NGraphExecutor { // Called from CreateCallback std::pair> InitializeIOTensorPipeline( - std::shared_ptr ng_exec); + std::shared_ptr ng_exec, + const vector& pipelined_input_indexes, + const vector& pipelined_output_indexes); // Get tensorflow input tensors, input shapes, static_inputs to Compute // Signature From 8e429a5bc5fd4ba17150244580b37e25477cf0b0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 2 Dec 2019 17:19:00 -0800 Subject: [PATCH 05/67] GetPrefetchedTensors --- ngraph_bridge/ngraph_encapsulate_op.cc | 10 +++++++++- ngraph_bridge/ngraph_executor.cc | 5 +++-- ngraph_bridge/ngraph_tensor_manager.cc | 13 +++++++++++++ ngraph_bridge/ngraph_tensor_manager.h | 3 +++ test/test_ngraph_tensor_manager.cpp | 2 ++ 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 251b11c0f..a881e0a96 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ 
b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -523,9 +523,15 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { std::tuple io_tensors_next_iter; io_tensors_next_iter = pipelined_tensor_store->get_tensors(); + + // Get prefetched inputs + pipelined_input_tensors_next_iter = get<1>(io_tensors_next_iter); + prefetched_input_tensors_next_iter = tensor_manager->GetPrefetchedTensors( + pipelined_input_tensors_next_iter); + // Save the ngTensors for the next iteration NGraphPrefetchSharedResouce::InputTensorBundle next_input_tensor_bundle{ - get<0>(io_tensors_next_iter), get<1>(io_tensors_next_iter)}; + get<0>(io_tensors_next_iter), prefetched_input_tensors_next_iter}; OP_REQUIRES(ctx, current_iter_pipeline_depth == (!next_input_tensor_bundle.Id), @@ -552,6 +558,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth << " skip count; " << skip_count; if (skip_count >= prefetch_buffer_depth) { + cout << "skip_tf2ng_copy true " << endl; // We have been using the pipelined tensors - therefore do the // following: // 1. Get the next set of IO tensors from the pipelined store @@ -586,6 +593,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { if (!skip_tf2ng_copy) { for (auto i = 0; i < tf_input_tensors.size(); i++) { + cout << "copying inputs true " << endl; ng::element::Type ng_element_type; OP_REQUIRES_OK(ctx, TFDataTypeToNGraphElementType( tf_input_tensors[i].dtype(), &ng_element_type)); diff --git a/ngraph_bridge/ngraph_executor.cc b/ngraph_bridge/ngraph_executor.cc index c60f49209..03153f80a 100644 --- a/ngraph_bridge/ngraph_executor.cc +++ b/ngraph_bridge/ngraph_executor.cc @@ -481,8 +481,9 @@ NGraphExecutor::InitializeIOTensorPipeline( // If the input or the output size if 0 then??? NGRAPH_VLOG(5) << "InitializeIOTensorPipeline: No. of Pipelined Inputs: " - << num_inputs - << " No. 
of Pipelined Pipelined Outputs: " << num_outputs; + << num_pipelined_inputs + << " No. of Pipelined Pipelined Outputs: " + << num_pipelined_outputs; PipelinedTensorMatrix pipelined_input_tensors(num_pipelined_inputs); PipelinedTensorMatrix pipelined_output_tensors(num_pipelined_outputs); diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index eb25252c3..f7757d4f5 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -95,5 +95,18 @@ void NGraphTensorManager::Initialize() { //--------------------------------------------------------------------------- NGraphTensorManager::~NGraphTensorManager() {} +//--------------------------------------------------------------------------- +// NGraphTensorManager::GetPrefetchedTensors +//--------------------------------------------------------------------------- +vector> GetPrefetchedTensors( + const vector>& pipelined_input_tensors) { + vector> prefetched_tensors; + auto prefetched_indexes = GetPipelinedInputIndexesThatArePrefetched(); + for (auto index : prefetched_indexes) { + prefetched_tensors.push_back(pipelined_input_tensors[index]); + } + return prefetched_tensors; +} + } // namespace ngraph_bridge } // namespace tensorflow \ No newline at end of file diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index eddf535c4..74351ee90 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -71,6 +71,9 @@ class NGraphTensorManager { return m_pipelined_input_indexes_prefetched; } + vector> GetPrefetchedTensors( + const vector>& pipelined_input_tensors); + private: void Initialize(); string m_ng_encap_node_name; diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index fdff539c2..a16db7f5f 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -326,6 +326,8 @@ TEST_F(NGraphTensorManagerTest, 
PrefetchNotInPipeline) { ClearCatalog(); } +TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors) {} + } // namespace testing } // namespace ngraph_bridge } // namespace tensorflow \ No newline at end of file From 2bf99a4577b1806c5bc103a25f45ba9957621f3c Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 2 Dec 2019 18:39:06 -0800 Subject: [PATCH 06/67] Added test --- ngraph_bridge/ngraph_encapsulate_op.cc | 9 +- ngraph_bridge/ngraph_tensor_manager.cc | 3 +- ngraph_bridge/ngraph_tensor_manager.h | 3 + test/test_ngraph_tensor_manager.cpp | 136 ++++++++++++++++++++++++- 4 files changed, 143 insertions(+), 8 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index a881e0a96..796336996 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -525,9 +525,12 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { io_tensors_next_iter = pipelined_tensor_store->get_tensors(); // Get prefetched inputs - pipelined_input_tensors_next_iter = get<1>(io_tensors_next_iter); - prefetched_input_tensors_next_iter = tensor_manager->GetPrefetchedTensors( - pipelined_input_tensors_next_iter); + vector> + pipelined_input_tensors_next_iter = get<1>(io_tensors_next_iter); + vector> + prefetched_input_tensors_next_iter = + tensor_manager->GetPrefetchedTensors( + pipelined_input_tensors_next_iter); // Save the ngTensors for the next iteration NGraphPrefetchSharedResouce::InputTensorBundle next_input_tensor_bundle{ diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index f7757d4f5..11daeb218 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -98,7 +98,8 @@ NGraphTensorManager::~NGraphTensorManager() {} //--------------------------------------------------------------------------- // NGraphTensorManager::GetPrefetchedTensors 
//--------------------------------------------------------------------------- -vector> GetPrefetchedTensors( +vector> +NGraphTensorManager::GetPrefetchedTensors( const vector>& pipelined_input_tensors) { vector> prefetched_tensors; auto prefetched_indexes = GetPipelinedInputIndexesThatArePrefetched(); diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 74351ee90..09060dd32 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -24,7 +24,10 @@ #include "tensorflow/core/common_runtime/dma_helper.h" +#include "ngraph/ngraph.hpp" + using namespace std; +namespace ng = ngraph; namespace tensorflow { namespace ngraph_bridge { diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index a16db7f5f..8b81510f4 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -17,6 +17,8 @@ #include "tensorflow/core/common_runtime/dma_helper.h" +#include "ngraph/ngraph.hpp" + #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_tensor_manager.h" #include "ngraph_bridge/ngraph_utils.h" @@ -24,6 +26,7 @@ #include "test/test_utilities.h" using namespace std; +namespace ng = ngraph; namespace tensorflow { @@ -79,12 +82,24 @@ class NGraphTensorManagerTest : public ::testing::Test { iota(vout.begin(), vout.end(), 0); return vout; } + + // Creates ngraph tensor + shared_ptr CreateNGraphScalarTensor( + int value, string backend_type = "INTERPRETER") { + // create scalar tensor + ng::Shape ng_shape_scalar({}); + + // create Backend + auto backend = ng::runtime::Backend::create(backend_type); + + auto temp = backend->create_tensor(ng::element::i32, ng_shape_scalar); + + temp->write(&value, sizeof(value)); + return temp; + } }; TEST(NGraphUtils, FindComplement1) { - bool yes; - Status st = IsNgraphTFLogTensorCopiesEnabled(0, yes); - vector input = {0, 3, 5, 8, 9}; vector complement = FindComplement(10, input); @@ -326,7 
+341,120 @@ TEST_F(NGraphTensorManagerTest, PrefetchNotInPipeline) { ClearCatalog(); } -TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors) {} +TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors) { + string ng_encap_node_name = "xyz_1"; + int ng_encap_cluster_id = 1; + int ng_encap_graph_id = 1; + int number_of_inputs = 5; + int number_of_outputs = 2; + + // expected + vector empty; + vector expected_pipelined_inp_indexes = FillRange(number_of_inputs); + vector expected_pipelined_out_indexes = FillRange(number_of_outputs); + vector expected_prefetched_inp_indexes = {1, 3}; + + EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, + expected_prefetched_inp_indexes); + + NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, + ng_encap_graph_id, number_of_inputs, + number_of_outputs); + + // Allocate tensors for arguments a, b, c + vector> pipelined_input_tensors( + number_of_inputs); + + for (int i = 0; i < number_of_inputs; i++) { + pipelined_input_tensors[i] = CreateNGraphScalarTensor(i); + } + + vector> prefetched_input_tensors = + tensor_manager.GetPrefetchedTensors(pipelined_input_tensors); + ASSERT_EQ(prefetched_input_tensors.size(), + expected_prefetched_inp_indexes.size()); + + for (int i = 0; i < expected_prefetched_inp_indexes.size(); i++) { + int tensor_val = 0; + prefetched_input_tensors[i]->read(&tensor_val, sizeof(tensor_val)); + ASSERT_EQ(tensor_val, expected_prefetched_inp_indexes[i]); + } + + // clean up + ClearCatalog(); +} + +TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors2) { + string ng_encap_node_name = "xyz_1"; + int ng_encap_cluster_id = 1; + int ng_encap_graph_id = 1; + int number_of_inputs = 7; + int number_of_outputs = 4; + + // expected + vector expected_pipelined_inp_indexes, expected_pipelined_out_indexes, + expected_var_inp_indexes, expected_var_out_indexes, + expected_out_indexes_need_copy, expected_prefetched_inp_indexes, + expected_pipelined_inp_indexes_prefetched; + + if 
(ngraph_tf_are_variables_enabled()) { + // expected values + expected_pipelined_inp_indexes = {1, 3, 4, 6}; + expected_prefetched_inp_indexes = {3, 6}; + expected_pipelined_inp_indexes_prefetched = {1, 3}; + expected_pipelined_out_indexes = {0, 2}; + expected_var_inp_indexes = + FindComplement(number_of_inputs, expected_pipelined_inp_indexes); + expected_var_out_indexes = + FindComplement(number_of_outputs, expected_pipelined_out_indexes); + expected_out_indexes_need_copy = {2, 3}; + // enter in catalog + EnterVarInCatalog(ng_encap_graph_id, ng_encap_node_name, + expected_var_inp_indexes, expected_var_out_indexes, + expected_out_indexes_need_copy); + + } else { + expected_pipelined_inp_indexes = FillRange(number_of_inputs); + expected_pipelined_out_indexes = FillRange(number_of_outputs); + expected_prefetched_inp_indexes = {3, 6}; + expected_pipelined_inp_indexes_prefetched = { + 3, 6}; // all inputs are pipelined + + expected_var_inp_indexes = {}; + expected_var_out_indexes = {}; + expected_out_indexes_need_copy = {}; + } + + EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, + expected_prefetched_inp_indexes); + + NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, + ng_encap_graph_id, number_of_inputs, + number_of_outputs); + + // Allocate tensors for arguments a, b, c + vector> pipelined_input_tensors( + expected_pipelined_inp_indexes.size()); + + for (int i = 0; i < pipelined_input_tensors.size(); i++) { + pipelined_input_tensors[i] = + CreateNGraphScalarTensor(expected_pipelined_inp_indexes[i]); + } + + vector> prefetched_input_tensors = + tensor_manager.GetPrefetchedTensors(pipelined_input_tensors); + ASSERT_EQ(prefetched_input_tensors.size(), + expected_prefetched_inp_indexes.size()); + + for (int i = 0; i < expected_prefetched_inp_indexes.size(); i++) { + int tensor_val = 0; + prefetched_input_tensors[i]->read(&tensor_val, sizeof(tensor_val)); + ASSERT_EQ(tensor_val, expected_prefetched_inp_indexes[i]); + } + + // clean 
up + ClearCatalog(); +} } // namespace testing } // namespace ngraph_bridge From ac9e32dea16a3c071048deeeaf146a3d423df770 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 3 Dec 2019 12:34:00 -0800 Subject: [PATCH 07/67] removed test --- test/test_ngraph_tensor_manager.cpp | 46 +---------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index 8b81510f4..e5d9056a2 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -341,50 +341,7 @@ TEST_F(NGraphTensorManagerTest, PrefetchNotInPipeline) { ClearCatalog(); } -TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors) { - string ng_encap_node_name = "xyz_1"; - int ng_encap_cluster_id = 1; - int ng_encap_graph_id = 1; - int number_of_inputs = 5; - int number_of_outputs = 2; - - // expected - vector empty; - vector expected_pipelined_inp_indexes = FillRange(number_of_inputs); - vector expected_pipelined_out_indexes = FillRange(number_of_outputs); - vector expected_prefetched_inp_indexes = {1, 3}; - - EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, - expected_prefetched_inp_indexes); - - NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, - ng_encap_graph_id, number_of_inputs, - number_of_outputs); - - // Allocate tensors for arguments a, b, c - vector> pipelined_input_tensors( - number_of_inputs); - - for (int i = 0; i < number_of_inputs; i++) { - pipelined_input_tensors[i] = CreateNGraphScalarTensor(i); - } - - vector> prefetched_input_tensors = - tensor_manager.GetPrefetchedTensors(pipelined_input_tensors); - ASSERT_EQ(prefetched_input_tensors.size(), - expected_prefetched_inp_indexes.size()); - - for (int i = 0; i < expected_prefetched_inp_indexes.size(); i++) { - int tensor_val = 0; - prefetched_input_tensors[i]->read(&tensor_val, sizeof(tensor_val)); - ASSERT_EQ(tensor_val, expected_prefetched_inp_indexes[i]); - } - - // clean up - 
ClearCatalog(); -} - -TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors2) { +TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors1) { string ng_encap_node_name = "xyz_1"; int ng_encap_cluster_id = 1; int ng_encap_graph_id = 1; @@ -432,7 +389,6 @@ TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors2) { ng_encap_graph_id, number_of_inputs, number_of_outputs); - // Allocate tensors for arguments a, b, c vector> pipelined_input_tensors( expected_pipelined_inp_indexes.size()); From 2db801d7a75abd9869f194be3be33a11d3ca3b79 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 3 Dec 2019 17:47:07 -0800 Subject: [PATCH 08/67] refactor pipeline --- ngraph_bridge/CMakeLists.txt | 1 + ngraph_bridge/ngraph_deassign_clusters.cc | 1 + .../ngraph_encapsulate_get_prefetch.cc | 125 ++++++++++++++++++ .../ngraph_encapsulate_get_prefetch.h | 33 +++++ ngraph_bridge/ngraph_encapsulate_op.cc | 104 ++------------- 5 files changed, 169 insertions(+), 95 deletions(-) create mode 100644 ngraph_bridge/ngraph_encapsulate_get_prefetch.cc create mode 100644 ngraph_bridge/ngraph_encapsulate_get_prefetch.h diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index 0ceb40bd9..4c1b7a3cb 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -60,6 +60,7 @@ set(SRC tf_deadness_analysis.cc prefetch_autotuner.cc ngraph_prefetch_dataset_op.cc + ngraph_encasulate_get_prefetch.cc stats_utils.cc version.cc ) diff --git a/ngraph_bridge/ngraph_deassign_clusters.cc b/ngraph_bridge/ngraph_deassign_clusters.cc index 2f51b3650..3d4a1bc4d 100644 --- a/ngraph_bridge/ngraph_deassign_clusters.cc +++ b/ngraph_bridge/ngraph_deassign_clusters.cc @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ + #include #include #include diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc new file mode 100644 index 000000000..c65fcb71a --- /dev/null +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc @@ -0,0 +1,125 @@ +/******************************************************************************* + * Copyright 2017-2019 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +#include "ngraph_bridge/ngraph_backend_manager.h" + +using namespace std; +namespace ng = ngraph; + +namespace tensorflow { + +namespace ngraph_bridge { + +Status GetPrefetchTensors(Graph* graph) { + cout << "using prefetch env flag " << endl; + // Set the prefetch shared obj if applicable + NGraphPrefetchSharedResouce* shared_data = nullptr; + Status s = ctx->resource_manager()->Lookup( + NGraphPrefetchSharedResouce::CONTAINER_NAME, + NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); + + if (!s.ok()) { + // We are using this for the first time i.e., we need to do the following + // 1. Create the shared data object + // 2. We get another pipelined tensor pair for the current iteration and + // copy the TF tensor to this set and continue with the execution for + // for this iteration. 
+ shared_data = new NGraphPrefetchSharedResouce( + name(), m_parallel_executor->GetOpBackendName(), + m_parallel_executor->GetGraphId(), + m_parallel_executor->GetNgraphClusterId()); + + // Get the set of IO tensors for the next iteration + std::tuple + io_tensors_next_iter; + io_tensors_next_iter = pipelined_tensor_store->get_tensors(); + // Get prefetched inputs + vector> pipelined_input_tensors_next_iter = + get<1>(io_tensors_next_iter); + vector> prefetched_input_tensors_next_iter = + tensor_manager->GetPrefetchedTensors(pipelined_input_tensors_next_iter); + + // Save the prefetched input ngTensors for the next iteration + NGraphPrefetchSharedResouce::InputTensorBundle next_input_tensor_bundle{ + get<0>(io_tensors_next_iter), prefetched_input_tensors_next_iter}; + + OP_REQUIRES(ctx, + current_iter_pipeline_depth == (!next_input_tensor_bundle.Id), + errors::Internal("Current Pipeline Depth is ", + current_iter_pipeline_depth, + " and next iter pipeline depth is also ", + next_input_tensor_bundle.Id)); + + shared_data->AddNextInputTensorBundleForDeviceTransfer( + next_input_tensor_bundle); + + ctx->SetStatus(ctx->resource_manager()->Create( + NGraphPrefetchSharedResouce::CONTAINER_NAME, + NGraphPrefetchSharedResouce::RESOURCE_NAME, shared_data)); + // Continue the execution with the currently supplied TF tensor for the + // last time + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " + "signal prefetching"; + } else { + cout << "using prefetch inputs " << endl; + + int prefetch_buffer_depth = shared_data->GetBufferDepth(); + int skip_count = shared_data->GetSkipCount(); + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth + << " skip count; " << skip_count; + if (skip_count >= prefetch_buffer_depth) { + cout << "skip_tf2ng_copy true " << endl; + // We have been using the pipelined tensors - therefore do the + // following: + // 1. 
Save the prefetched Input tensors for the current iteration + // to the shared data object so that the prefetcher + // can continue with copying the next set of inout tensor to the + // device + // 3. Execute the nGraph call for this iteration using the + // nG prefeteched input tensors we got from the shared data + + // Add the current prefetched tensors for the next iteration + // Get prefetched inputs + vector> prefetched_input_tensors = + tensor_manager->GetPrefetchedTensors(ng_inputs); + NGraphPrefetchSharedResouce::InputTensorBundle + prefetch_input_tensor_bundle{current_iter_pipeline_depth, + prefetched_input_tensors}; + shared_data->AddNextInputTensorBundleForDeviceTransfer( + prefetch_input_tensor_bundle); + + // Update the input_tensors with the one ready for exdcution + auto ng_input_tensor_bundle_ready = + shared_data->GetNextInputTensorBundleReadyForDeviceExecution(); + current_iter_pipeline_depth = ng_input_tensor_bundle_ready.Id; + vector> ng_prefetched_inputs = + ng_input_tensor_bundle_ready.Inputs; + OP_REQUIRES(ctx, current_iter_pipeline_depth == + (!prefetch_input_tensor_bundle.Id), + errors::Internal("Current Pipeline Depth is ", + current_iter_pipeline_depth, + " and next iter pipeline depth is ", "also ", + prefetch_input_tensor_bundle.Id)); + skip_tf2ng_copy = true; + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Using device tensors"; + } + shared_data->IncrSkipCount(); + } +} +} + +} // namespace ngraph_bridge +} // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h new file mode 100644 index 000000000..9f5b459e3 --- /dev/null +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h @@ -0,0 +1,33 @@ +/******************************************************************************* + * Copyright 2017-2019 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *******************************************************************************/ + +#ifndef NGRAPH_TF_BRIDGE_GET_PREFETCH_H +#define NGRAPH_TF_BRIDGE_GET_PREFETCH_H + +#pragma once + +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +namespace ngraph_bridge { + +Status GetPrefetchTensors(Graph* graph); + +} // namespace ngraph_bridge +} // namespace tensorflow + +#endif // NGRAPH_TF_BRIDGE_GET_PREFETCH_H diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 796336996..cedf7ce96 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -40,6 +40,7 @@ #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_builder.h" #include "ngraph_bridge/ngraph_cluster_manager.h" +#include "ngraph_bridge/ngraph_encapsulate_get_prefetch.h" #include "ngraph_bridge/ngraph_encapsulate_impl.h" #include "ngraph_bridge/ngraph_encapsulate_op.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" @@ -451,11 +452,6 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { errors::Internal("Pipeline Depth is not 2, got ", m_parallel_executor->GetTensorPipelineDepth())); - std::tuple io_tensors; - io_tensors = pipelined_tensor_store->get_tensors(); - OP_REQUIRES(ctx, !(std::get<0>(io_tensors) < 0), - errors::Internal("No free tensor available")); - // Get Tensor Manager and some error checking auto tensor_manager = m_parallel_executor->GetTensorManager(); int num_of_inputs = tensor_manager->GetNumberOfInputs(); @@ -487,6 
+483,10 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ng_exec->get_results().size(), " do not match")); // create inputs, outputs, pipelineId + std::tuple io_tensors; + io_tensors = pipelined_tensor_store->get_tensors(); + OP_REQUIRES(ctx, !(std::get<0>(io_tensors) < 0), + errors::Internal("No free tensor available")); int current_iter_pipeline_depth = get<0>(io_tensors); vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); @@ -499,96 +499,10 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { bool skip_tf2ng_copy = false; if (std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) != nullptr) { - cout << "using prefetch env flag " << endl; - NGraphPrefetchSharedResouce::InputTensorBundle prefetch_input_tensor_bundle{ - current_iter_pipeline_depth, ng_inputs}; - // Set the prefetch shared obj if applicable - NGraphPrefetchSharedResouce* shared_data = nullptr; - Status s = ctx->resource_manager()->Lookup( - NGraphPrefetchSharedResouce::CONTAINER_NAME, - NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); - - if (!s.ok()) { - // We are using this for the first time i.e., we need to do the following - // 1. Create the shared data object - // 2. save the input/output nG tensor set to the shared data object - // 3. Get another pipelined tensor pair for the current iteration and - // copy the TF tensor to this set and continue with the execution for - // for this iteration. 
- shared_data = new NGraphPrefetchSharedResouce( - name(), m_parallel_executor->GetOpBackendName(), - m_parallel_executor->GetGraphId(), - m_parallel_executor->GetNgraphClusterId()); - // Get the set of IO tensors for the next iteration - std::tuple - io_tensors_next_iter; - io_tensors_next_iter = pipelined_tensor_store->get_tensors(); - - // Get prefetched inputs - vector> - pipelined_input_tensors_next_iter = get<1>(io_tensors_next_iter); - vector> - prefetched_input_tensors_next_iter = - tensor_manager->GetPrefetchedTensors( - pipelined_input_tensors_next_iter); - - // Save the ngTensors for the next iteration - NGraphPrefetchSharedResouce::InputTensorBundle next_input_tensor_bundle{ - get<0>(io_tensors_next_iter), prefetched_input_tensors_next_iter}; - - OP_REQUIRES(ctx, - current_iter_pipeline_depth == (!next_input_tensor_bundle.Id), - errors::Internal("Current Pipeline Depth is ", - current_iter_pipeline_depth, - " and next iter pipeline depth is also ", - next_input_tensor_bundle.Id)); - - shared_data->AddNextInputTensorBundleForDeviceTransfer( - next_input_tensor_bundle); - - ctx->SetStatus(ctx->resource_manager()->Create( - NGraphPrefetchSharedResouce::CONTAINER_NAME, - NGraphPrefetchSharedResouce::RESOURCE_NAME, shared_data)); - // Continue the execution with the currently supplied TF tensor for the - // last time - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " - "signal prefetching"; - } else { - cout << "using prefetch inputs " << endl; - - int prefetch_buffer_depth = shared_data->GetBufferDepth(); - int skip_count = shared_data->GetSkipCount(); - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth - << " skip count; " << skip_count; - if (skip_count >= prefetch_buffer_depth) { - cout << "skip_tf2ng_copy true " << endl; - // We have been using the pipelined tensors - therefore do the - // following: - // 1. Get the next set of IO tensors from the pipelined store - // 2. 
Save that to the shared data object so that the prefetcher - // can continue with copying the next set of inout tensor to the - // device - // 3. Execute the nGraph call for this iteration using the - // nG tensors we got from the shared data - auto ng_input_tensor_bundle_ready = - shared_data->GetNextInputTensorBundleReadyForDeviceExecution(); - // Add the next set of tensors for the next iteration - shared_data->AddNextInputTensorBundleForDeviceTransfer( - prefetch_input_tensor_bundle); - // Update the input_tensors with the one ready for exdcution - current_iter_pipeline_depth = ng_input_tensor_bundle_ready.Id; - ng_inputs = ng_input_tensor_bundle_ready.Inputs; - OP_REQUIRES(ctx, current_iter_pipeline_depth == - (!prefetch_input_tensor_bundle.Id), - errors::Internal("Current Pipeline Depth is ", - current_iter_pipeline_depth, - " and next iter pipeline depth is ", - "also ", prefetch_input_tensor_bundle.Id)); - skip_tf2ng_copy = true; - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Using device tensors"; - } - shared_data->IncrSkipCount(); - } + Status s = GetPrefetchTensors(&skip_tf2ng_copy); + OP_REQUIRES( + ctx, s.ok(), + errors::Internal("Error encountered when prefetching tensors: ")); } // Allocate the input/ From 9c0e03f207f0c26ae76fccf6df8d3e4dcf2e1ae5 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 4 Dec 2019 11:20:26 -0800 Subject: [PATCH 09/67] Shared data keeps track of prefetched input indexes --- ngraph_bridge/ngraph_prefetch_dataset_op.cc | 22 +++++++++++++++++---- ngraph_bridge/ngraph_prefetch_shared_data.h | 12 ++++++++--- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/ngraph_bridge/ngraph_prefetch_dataset_op.cc b/ngraph_bridge/ngraph_prefetch_dataset_op.cc index 7ff974a1a..18b444611 100644 --- a/ngraph_bridge/ngraph_prefetch_dataset_op.cc +++ b/ngraph_bridge/ngraph_prefetch_dataset_op.cc @@ -417,8 +417,21 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { if (s.ok()) { ngraph::Event evt_dev_cp("Prf Dev Copy", 
"Copy", ""); shared_data->SetBufferDepth(m_buffer_size); + auto ng_input_tensor_bundle = shared_data->GetNextInputTensorBundleForDeviceTransfer(); + auto ng_prefetch_input_indexes = + shared_data->GetPrefetchInputIndexes(); + + int number_of_buffer_elements = buffer_element.value.size(); + if (number_of_buffer_elements != ng_prefetch_input_indexes.size()) { + throw std::runtime_error( + "Prefetch buffer elements size " + + to_string(number_of_buffer_elements) + + " does not match the number of prefetch inputs expected by " + "encap " + + to_string(ng_prefetch_input_indexes.size())); + } // Write to these tensors for (auto i = 0; i < buffer_element.value.size(); i++) { @@ -432,10 +445,11 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { NGRAPH_VLOG(2) << "[PREFETCH] INPUT tensor being written by Prefetch: " << " Value: " << buffer_element.value[i].DebugString(); - ng_input_tensor_bundle.Inputs[i]->write( - current_src_ptr, - ng_input_tensor_bundle.Inputs[i]->get_element_count() * - ng_element_type.size()); + int input_index = ng_prefetch_input_indexes[i]; + ng_input_tensor_bundle.Inputs[input_index]->write( + current_src_ptr, ng_input_tensor_bundle.Inputs[input_index] + ->get_element_count() * + ng_element_type.size()); } catch (const std::exception& exp) { throw exp; } catch (...) 
{ diff --git a/ngraph_bridge/ngraph_prefetch_shared_data.h b/ngraph_bridge/ngraph_prefetch_shared_data.h index 6f140c56c..1fe9054c5 100644 --- a/ngraph_bridge/ngraph_prefetch_shared_data.h +++ b/ngraph_bridge/ngraph_prefetch_shared_data.h @@ -40,11 +40,13 @@ class NGraphPrefetchSharedResouce : public ResourceBase { public: explicit NGraphPrefetchSharedResouce(const std::string& ng_enc_op_name, const std::string& backend_name, - int cluster_id, int graph_id) + int cluster_id, int graph_id, + const vector prefetch_input_indexes) : m_ng_enc_op_name(ng_enc_op_name), m_backend_name(backend_name), m_graph_id(graph_id), - m_cluster_id(cluster_id) {} + m_cluster_id(cluster_id), + m_prefetch_input_indexes(prefetch_input_indexes) {} // Returns a debug string for *this. string DebugString() const override { return "NGraphPrefetchSharedResouce"; } @@ -115,12 +117,16 @@ class NGraphPrefetchSharedResouce : public ResourceBase { void IncrSkipCount() { m_skip_count++; } int GetSkipCount() { return m_skip_count; } + const vector& GetPrefetchInputIndexes() { + return m_prefetch_input_indexes; + } + private: const std::string m_ng_enc_op_name; const std::string m_backend_name; const int m_graph_id; const int m_cluster_id; - + const vector m_prefetch_input_indexes; // We need to maintain two queues as follows: // ----------+------------+------------+------------------------------------+ // Queue | Writer | Reader | Comments | From 8dfc7953bc778a2c54532562729f9b5a310f2d2a Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 4 Dec 2019 13:12:06 -0800 Subject: [PATCH 10/67] Working state --- ngraph_bridge/CMakeLists.txt | 2 +- ngraph_bridge/ngraph_catalog.cc | 4 + .../ngraph_encapsulate_get_prefetch.cc | 101 +---------------- .../ngraph_encapsulate_get_prefetch.h | 2 +- ngraph_bridge/ngraph_encapsulate_op.cc | 107 ++++++++++++++++-- .../ngraph_enter_prefetch_in_catalog.cc | 4 + ngraph_bridge/ngraph_executor.cc | 5 +- ngraph_bridge/ngraph_executor.h | 3 +- 
ngraph_bridge/ngraph_tensor_manager.cc | 8 ++ test/python/test_api.py | 4 - test/test_parallel_executor.cpp | 27 +++-- 11 files changed, 140 insertions(+), 127 deletions(-) diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index 4c1b7a3cb..f48aeef56 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -60,7 +60,7 @@ set(SRC tf_deadness_analysis.cc prefetch_autotuner.cc ngraph_prefetch_dataset_op.cc - ngraph_encasulate_get_prefetch.cc + ngraph_encapsulate_get_prefetch.cc stats_utils.cc version.cc ) diff --git a/ngraph_bridge/ngraph_catalog.cc b/ngraph_bridge/ngraph_catalog.cc index 95b65f506..fdb70790d 100644 --- a/ngraph_bridge/ngraph_catalog.cc +++ b/ngraph_bridge/ngraph_catalog.cc @@ -220,17 +220,21 @@ void NGraphCatalog::AddToPrefetchedInputIndexMap( throw runtime_error("Trying to add an already existing key ( " + key + " ) in PrefetchedInputIndexMap "); } + cout << " AddToPrefetchedInputIndexMap key " << key << endl; NGraphCatalog::prefetched_input_index_map_.insert({key, val}); } bool NGraphCatalog::ExistsInPrefetchedInputIndexMap(const int& graphid, const string& node_name) { string key = NGraphCatalog::CreateNodeKey(graphid, node_name); + cout << " ExistsInPrefetchedInputIndexMap key " << key << endl; return NGraphCatalog::ExistsInPrefetchedInputIndexMap(key); } bool NGraphCatalog::ExistsInPrefetchedInputIndexMap(const string& key) { auto itr = NGraphCatalog::prefetched_input_index_map_.find(key); + cout << " ExistsInPrefetchedInputIndexMap check " + << (itr != NGraphCatalog::prefetched_input_index_map_.end()) << endl; return itr != NGraphCatalog::prefetched_input_index_map_.end(); } diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc index c65fcb71a..b3c70c227 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc @@ -14,112 +14,15 @@ * limitations under the License. 
*******************************************************************************/ -#include "ngraph_bridge/ngraph_backend_manager.h" +#include "ngraph_bridge/ngraph_encapsulate_get_prefetch.h" using namespace std; -namespace ng = ngraph; namespace tensorflow { namespace ngraph_bridge { -Status GetPrefetchTensors(Graph* graph) { - cout << "using prefetch env flag " << endl; - // Set the prefetch shared obj if applicable - NGraphPrefetchSharedResouce* shared_data = nullptr; - Status s = ctx->resource_manager()->Lookup( - NGraphPrefetchSharedResouce::CONTAINER_NAME, - NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); - - if (!s.ok()) { - // We are using this for the first time i.e., we need to do the following - // 1. Create the shared data object - // 2. We get another pipelined tensor pair for the current iteration and - // copy the TF tensor to this set and continue with the execution for - // for this iteration. - shared_data = new NGraphPrefetchSharedResouce( - name(), m_parallel_executor->GetOpBackendName(), - m_parallel_executor->GetGraphId(), - m_parallel_executor->GetNgraphClusterId()); - - // Get the set of IO tensors for the next iteration - std::tuple - io_tensors_next_iter; - io_tensors_next_iter = pipelined_tensor_store->get_tensors(); - // Get prefetched inputs - vector> pipelined_input_tensors_next_iter = - get<1>(io_tensors_next_iter); - vector> prefetched_input_tensors_next_iter = - tensor_manager->GetPrefetchedTensors(pipelined_input_tensors_next_iter); - - // Save the prefetched input ngTensors for the next iteration - NGraphPrefetchSharedResouce::InputTensorBundle next_input_tensor_bundle{ - get<0>(io_tensors_next_iter), prefetched_input_tensors_next_iter}; - - OP_REQUIRES(ctx, - current_iter_pipeline_depth == (!next_input_tensor_bundle.Id), - errors::Internal("Current Pipeline Depth is ", - current_iter_pipeline_depth, - " and next iter pipeline depth is also ", - next_input_tensor_bundle.Id)); - - 
shared_data->AddNextInputTensorBundleForDeviceTransfer( - next_input_tensor_bundle); - - ctx->SetStatus(ctx->resource_manager()->Create( - NGraphPrefetchSharedResouce::CONTAINER_NAME, - NGraphPrefetchSharedResouce::RESOURCE_NAME, shared_data)); - // Continue the execution with the currently supplied TF tensor for the - // last time - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " - "signal prefetching"; - } else { - cout << "using prefetch inputs " << endl; - - int prefetch_buffer_depth = shared_data->GetBufferDepth(); - int skip_count = shared_data->GetSkipCount(); - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth - << " skip count; " << skip_count; - if (skip_count >= prefetch_buffer_depth) { - cout << "skip_tf2ng_copy true " << endl; - // We have been using the pipelined tensors - therefore do the - // following: - // 1. Save the prefetched Input tensors for the current iteration - // to the shared data object so that the prefetcher - // can continue with copying the next set of inout tensor to the - // device - // 3. 
Execute the nGraph call for this iteration using the - // nG prefeteched input tensors we got from the shared data - - // Add the current prefetched tensors for the next iteration - // Get prefetched inputs - vector> prefetched_input_tensors = - tensor_manager->GetPrefetchedTensors(ng_inputs); - NGraphPrefetchSharedResouce::InputTensorBundle - prefetch_input_tensor_bundle{current_iter_pipeline_depth, - prefetched_input_tensors}; - shared_data->AddNextInputTensorBundleForDeviceTransfer( - prefetch_input_tensor_bundle); - - // Update the input_tensors with the one ready for exdcution - auto ng_input_tensor_bundle_ready = - shared_data->GetNextInputTensorBundleReadyForDeviceExecution(); - current_iter_pipeline_depth = ng_input_tensor_bundle_ready.Id; - vector> ng_prefetched_inputs = - ng_input_tensor_bundle_ready.Inputs; - OP_REQUIRES(ctx, current_iter_pipeline_depth == - (!prefetch_input_tensor_bundle.Id), - errors::Internal("Current Pipeline Depth is ", - current_iter_pipeline_depth, - " and next iter pipeline depth is ", "also ", - prefetch_input_tensor_bundle.Id)); - skip_tf2ng_copy = true; - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Using device tensors"; - } - shared_data->IncrSkipCount(); - } -} -} +Status GetPrefetchTensors() { return Status::OK(); } } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h index 9f5b459e3..cc09f3acf 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h @@ -25,7 +25,7 @@ namespace tensorflow { namespace ngraph_bridge { -Status GetPrefetchTensors(Graph* graph); +Status GetPrefetchTensors(); } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index cedf7ce96..500443255 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ 
-121,8 +121,9 @@ void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, OP_REQUIRES_OK(ctx, ctx->GetAttr("ngraph_cluster", &cluster_id)); graph_def = NGraphClusterManager::GetClusterGraph(cluster_id); + string node_name = name(); if (graph_def == nullptr) { - string flib_key = "ngraph_cluster_" + to_string(cluster_id); + string flib_key = node_name; // Read graphdef from function library const FunctionLibraryDefinition flib = *ctx->function_library()->GetFunctionLibraryDefinition(); @@ -158,9 +159,9 @@ void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, } // Create the Executor object - m_parallel_executor = move(unique_ptr( - new NGraphExecutor(s_instance_id, cluster_id, graph_id, encap_subgraph, - backend_name, my_function_cache_depth_in_items))); + m_parallel_executor = move(unique_ptr(new NGraphExecutor( + s_instance_id, cluster_id, graph_id, encap_subgraph, backend_name, + my_function_cache_depth_in_items, node_name))); auto tensor_manager = m_parallel_executor->GetTensorManager(); OP_REQUIRES(ctx, tensor_manager->GetNumberOfInputs() == ctx->num_inputs(), @@ -499,10 +500,100 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { bool skip_tf2ng_copy = false; if (std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) != nullptr) { - Status s = GetPrefetchTensors(&skip_tf2ng_copy); - OP_REQUIRES( - ctx, s.ok(), - errors::Internal("Error encountered when prefetching tensors: ")); + cout << "using prefetch env flag " << endl; + // Set the prefetch shared obj if applicable + NGraphPrefetchSharedResouce* shared_data = nullptr; + Status s = ctx->resource_manager()->Lookup( + NGraphPrefetchSharedResouce::CONTAINER_NAME, + NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); + + if (!s.ok()) { + // We are using this for the first time i.e., we need to do the following + // 1. Create the shared data object + // 2. 
We get another pipelined tensor pair for the current iteration and + // copy the TF tensor to this set and continue with the execution for + // for this iteration. + auto ng_prefetch_input_indexes = + tensor_manager->GetPipelinedInputIndexesThatArePrefetched(); + cout << "ng_prefetch_input_indexes " << ng_prefetch_input_indexes.size() + << endl; + + for (auto inp : ng_prefetch_input_indexes) { + cout << " inp indez " << inp << endl; + } + shared_data = new NGraphPrefetchSharedResouce( + name(), m_parallel_executor->GetOpBackendName(), + m_parallel_executor->GetGraphId(), + m_parallel_executor->GetNgraphClusterId(), ng_prefetch_input_indexes); + + // Get the set of IO tensors for the next iteration + std::tuple + io_tensors_next_iter; + io_tensors_next_iter = pipelined_tensor_store->get_tensors(); + + // Save the prefetched input ngTensors for the next iteration + NGraphPrefetchSharedResouce::InputTensorBundle next_input_tensor_bundle{ + get<0>(io_tensors_next_iter), get<1>(io_tensors_next_iter)}; + + OP_REQUIRES(ctx, + current_iter_pipeline_depth == (!next_input_tensor_bundle.Id), + errors::Internal("Current Pipeline Depth is ", + current_iter_pipeline_depth, + " and next iter pipeline depth is also ", + next_input_tensor_bundle.Id)); + + shared_data->AddNextInputTensorBundleForDeviceTransfer( + next_input_tensor_bundle); + + ctx->SetStatus(ctx->resource_manager()->Create( + NGraphPrefetchSharedResouce::CONTAINER_NAME, + NGraphPrefetchSharedResouce::RESOURCE_NAME, shared_data)); + // Continue the execution with the currently supplied TF tensor for the + // last time + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " + "signal prefetching"; + } else { + cout << "using prefetch inputs " << endl; + + int prefetch_buffer_depth = shared_data->GetBufferDepth(); + int skip_count = shared_data->GetSkipCount(); + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth + << " skip count; " << skip_count; + if (skip_count >= 
prefetch_buffer_depth) { + cout << "skip_tf2ng_copy true " << endl; + // We have been using the pipelined tensors - therefore do the + // following: + // 1. Save the prefetched Input tensors for the current iteration + // to the shared data object so that the prefetcher + // can continue with copying the next set of inout tensor to the + // device + // 3. Execute the nGraph call for this iteration using the + // nG prefeteched input tensors we got from the shared data + + // Add the current prefetched tensors for the next iteration + // Get prefetched inputs + NGraphPrefetchSharedResouce::InputTensorBundle + prefetch_input_tensor_bundle{current_iter_pipeline_depth, + ng_inputs}; + shared_data->AddNextInputTensorBundleForDeviceTransfer( + prefetch_input_tensor_bundle); + + // Update the input_tensors with the one ready for exdcution + auto ng_input_tensor_bundle_ready = + shared_data->GetNextInputTensorBundleReadyForDeviceExecution(); + current_iter_pipeline_depth = ng_input_tensor_bundle_ready.Id; + ng_inputs = ng_input_tensor_bundle_ready.Inputs; + OP_REQUIRES(ctx, current_iter_pipeline_depth == + (!prefetch_input_tensor_bundle.Id), + errors::Internal("Current Pipeline Depth is ", + current_iter_pipeline_depth, + " and next iter pipeline depth is ", + "also ", prefetch_input_tensor_bundle.Id)); + skip_tf2ng_copy = true; + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Using device tensors"; + } + shared_data->IncrSkipCount(); + } } // Allocate the input/ diff --git a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc index 356e907f7..84513087f 100644 --- a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc +++ b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc @@ -62,6 +62,10 @@ Status EnterPrefetchInCatalog(Graph* graph, int graph_id) { } // end loop over input edges if (in_indexes_for_encap.size() > 0) { + for (auto i : in_indexes_for_encap) { + cout << "Enter Prefetch in catalog " << i << endl; + } + try { 
NGraphCatalog::AddToPrefetchedInputIndexMap(graph_id, node->name(), in_indexes_for_encap); diff --git a/ngraph_bridge/ngraph_executor.cc b/ngraph_bridge/ngraph_executor.cc index 03153f80a..be076ec61 100644 --- a/ngraph_bridge/ngraph_executor.cc +++ b/ngraph_bridge/ngraph_executor.cc @@ -62,13 +62,14 @@ namespace ngraph_bridge { NGraphExecutor::NGraphExecutor(int instance_id, int cluster_id, int graph_id, unique_ptr& graph, const string& backend_name, - const int cache_depth) + const int cache_depth, const string& node_name) : m_instance_id(instance_id), m_ngraph_cluster_id(cluster_id), m_graph_id(graph_id), m_graph(std::move(graph)), m_op_backend_name(backend_name), - m_ng_data_cache(cache_depth) { + m_ng_data_cache(cache_depth), + m_node_name(node_name) { // Sanity checks if (m_graph == nullptr) { throw std::runtime_error("Graph is nullptr!"); diff --git a/ngraph_bridge/ngraph_executor.h b/ngraph_bridge/ngraph_executor.h index a71851c2c..e5e554b85 100644 --- a/ngraph_bridge/ngraph_executor.h +++ b/ngraph_bridge/ngraph_executor.h @@ -42,7 +42,8 @@ class NGraphExecutor { // Transforms, compiles and executes TesnorFlow computation graph using nGraph explicit NGraphExecutor(int instance_id, int cluster_id, int graph_id, unique_ptr& graph, - const string& backend_name, const int cache_depth); + const string& backend_name, const int cache_depth, + const string& node_name); ~NGraphExecutor(); diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 11daeb218..9576a7732 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -64,16 +64,24 @@ void NGraphTensorManager::Initialize() { m_pipelined_output_indexes = FindComplement(m_number_of_outputs, m_output_indexes_assigning_variable); + cout << "TM m_ng_encap_graph_id " << m_ng_encap_graph_id << endl; + cout << "TM m_ng_encap_node_name " << m_ng_encap_node_name << endl; if 
(NGraphCatalog::ExistsInPrefetchedInputIndexMap(m_ng_encap_graph_id, m_ng_encap_node_name)) { auto prefetch_indexes = NGraphCatalog::GetIndexesFromPrefetchedInputIndexMap( m_ng_encap_graph_id, m_ng_encap_node_name); + for (auto i : m_prefetched_input_indexes) { + cout << "TM " << i << endl; + } m_prefetched_input_indexes.insert(m_prefetched_input_indexes.begin(), prefetch_indexes.begin(), prefetch_indexes.end()); // keeping the indexes sorted, is helpful in general testing sort(m_prefetched_input_indexes.begin(), m_prefetched_input_indexes.end()); + for (auto i : m_prefetched_input_indexes) { + cout << "TM " << i << endl; + } } // the prefetched input indexes will also be pipelined diff --git a/test/python/test_api.py b/test/python/test_api.py index 7859bb1b9..0bb347ddc 100644 --- a/test/python/test_api.py +++ b/test/python/test_api.py @@ -38,10 +38,6 @@ def test_enable(self): def test_backends_len(self): assert ngraph_bridge.backends_len() - def test_set_backend(self): - ngraph_bridge.set_backend('CPU') - assert ngraph_bridge.get_currently_set_backend_name() == "CPU" - def test_set_backend_invalid(self): try: ngraph_bridge.set_backend('POTATO') diff --git a/test/test_parallel_executor.cpp b/test/test_parallel_executor.cpp index e73b39ee6..39c33c58f 100644 --- a/test/test_parallel_executor.cpp +++ b/test/test_parallel_executor.cpp @@ -60,13 +60,13 @@ TEST(ParallelExecutor, Construction) { // First test with a backend not yet created unique_ptr executor; - ASSERT_THROW(executor = unique_ptr( - new NGraphExecutor(100, 500, 600, input_graph, "bogus", 5)), + ASSERT_THROW(executor = unique_ptr(new NGraphExecutor( + 100, 500, 600, input_graph, "bogus", 5, "xyz_5")), std::runtime_error); // Next test with a null graph not yet created - ASSERT_THROW(executor = unique_ptr( - new NGraphExecutor(100, 500, 600, input_graph, "bogus", 12)), + ASSERT_THROW(executor = unique_ptr(new NGraphExecutor( + 100, 500, 600, input_graph, "bogus", 12, "xyz_12")), std::runtime_error); // Now 
read the graph @@ -74,8 +74,9 @@ TEST(ParallelExecutor, Construction) { // Next test with a backend after creating tf::ngraph_bridge::BackendManager::CreateBackend("INTERPRETER"); - ASSERT_NO_THROW(executor = unique_ptr(new NGraphExecutor( - 100, 500, 600, input_graph, "INTERPRETER", 16))); + ASSERT_NO_THROW( + executor = unique_ptr(new NGraphExecutor( + 100, 500, 600, input_graph, "INTERPRETER", 16, "xyz_10"))); // Now that the object has been cobstructed, test various internal parts // TODO: Create a Test Class and mark that as a friend of the Executor class @@ -93,7 +94,8 @@ TEST(ParallelExecutor, CompilerTest) { ASSERT_OK(LoadGraphFromPbTxt("test_axpy_launchop.pbtxt", input_graph)); tf::ngraph_bridge::BackendManager::CreateBackend("INTERPRETER"); - NGraphExecutor executor(100, 500, 600, input_graph, "INTERPRETER", 10); + NGraphExecutor executor(100, 500, 600, input_graph, "INTERPRETER", 10, + "xyz_10"); // Create the inputs for this graph Tensor x(DT_FLOAT, TensorShape({2, 3})); @@ -132,7 +134,8 @@ TEST(ParallelExecutor, ExecuteOnSingleThread) { unique_ptr input_graph; ASSERT_OK(LoadGraphFromPbTxt("test_axpy_launchop.pbtxt", input_graph)); tf::ngraph_bridge::BackendManager::CreateBackend("INTERPRETER"); - NGraphExecutor executor(100, 500, 600, input_graph, "INTERPRETER", 12); + NGraphExecutor executor(100, 500, 600, input_graph, "INTERPRETER", 12, + "xyz_12"); // Create the inputs for this graph Tensor x(DT_FLOAT, TensorShape({2, 3})); @@ -214,7 +217,7 @@ TEST(ParallelExecutor, ExecuteOnSingleThread8Bit) { } tf::ngraph_bridge::BackendManager::CreateBackend(backend_name); - NGraphExecutor executor(100, 500, 600, input_graph, backend_name, 5); + NGraphExecutor executor(100, 500, 600, input_graph, backend_name, 5, "xyz_5"); // Create the inputs for this graph Tensor x(DT_INT8, TensorShape({2, 2})); @@ -296,7 +299,8 @@ TEST(ParallelExecutor, ExecuteOnMultipleThreads8Bit) { } tf::ngraph_bridge::BackendManager::CreateBackend(backend_name); - NGraphExecutor 
executor(100, 500, 600, input_graph, backend_name, 16); + NGraphExecutor executor(100, 500, 600, input_graph, backend_name, 16, + "xyz_16"); // Create the inputs for this graph Tensor x(DT_INT8, TensorShape({2, 2})); @@ -383,7 +387,8 @@ TEST(ParallelExecutor, ExecuteOnMultipleThreads) { unique_ptr input_graph; ASSERT_OK(LoadGraphFromPbTxt("test_axpy_launchop.pbtxt", input_graph)); tf::ngraph_bridge::BackendManager::CreateBackend("INTERPRETER"); - NGraphExecutor executor(100, 500, 600, input_graph, "INTERPRETER", 16); + NGraphExecutor executor(100, 500, 600, input_graph, "INTERPRETER", 16, + "xyz_16"); // Create the inputs for this graph Tensor x(DT_FLOAT, TensorShape({2, 3})); From 5464b7a42f6eee72d9be9013e4908285872de91c Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 4 Dec 2019 14:40:02 -0800 Subject: [PATCH 11/67] bazel fix --- bazel/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bazel/BUILD b/bazel/BUILD index ce0025adb..15964fd1e 100644 --- a/bazel/BUILD +++ b/bazel/BUILD @@ -38,6 +38,7 @@ cc_library( "ngraph_bridge/ngraph_encapsulate_op.h", "ngraph_bridge/ngraph_data_cache.h", "ngraph_bridge/ngraph_find_replace_prefetchdataset.h", + "ngraph_bridge/ngraph_encapsulate_get_prefetch.h", "ngraph_bridge/ngraph_freshness_tracker.h", "ngraph_bridge/ngraph_mark_for_clustering.h", "ngraph_bridge/ngraph_partial_shapes.h", @@ -79,6 +80,7 @@ cc_library( "ngraph_bridge/ngraph_encapsulate_clusters.cc", "ngraph_bridge/ngraph_encapsulate_impl.cc", "ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc", + "ngraph_bridge/ngraph_encapsulate_get_prefetch.cc", "ngraph_bridge/ngraph_executor.cc", "ngraph_bridge/ngraph_encapsulate_op.cc", "ngraph_bridge/ngraph_freshness_tracker.cc", From 5233e693dd7930754fcbea5ba6e6f9e71c0d4eab Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 4 Dec 2019 14:42:21 -0800 Subject: [PATCH 12/67] Added test --- examples/axpy_pipelined_extended.py | 92 +++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 
100644 examples/axpy_pipelined_extended.py diff --git a/examples/axpy_pipelined_extended.py b/examples/axpy_pipelined_extended.py new file mode 100644 index 000000000..46c2b8baf --- /dev/null +++ b/examples/axpy_pipelined_extended.py @@ -0,0 +1,92 @@ +# ============================================================================== +# Copyright 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import warnings +warnings.filterwarnings('ignore', category=FutureWarning) +import numpy as np + +import tensorflow as tf +tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + +import os +import ngraph_bridge + +import sys + + +def build_simple_model(input_array): + # Convert the numpy array to TF Tensor + input = tf.cast(input_array, tf.float32) + + # Define the Ops + mul = tf.compat.v1.math.multiply(input_array, 5) + add = tf.compat.v1.math.add(mul, 10) + output = add + return output + + +def build_data_pipeline(input_array, map_function, batch_size): + dataset = (tf.data.Dataset.from_tensor_slices( + (tf.constant(input_array) + )).map(map_function).batch(batch_size).prefetch(1)) + + iterator = dataset.make_initializable_iterator() + data_to_be_prefetched_and_used = iterator.get_next() + + return data_to_be_prefetched_and_used, iterator + + +def run_axpy_pipeline(): + input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] + expected_output_array = [-1, -1, 1, -1, -1, -1, -1, 
-1, -1] + output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0] + multiplier = 10 + + for i in range(1, 10): + input_array[i - 1] = input_array[i - 1] * i * multiplier + map_function = lambda x: x * multiplier + batch_size = 1 + pipeline, iterator = build_data_pipeline(input_array, map_function, + batch_size) + model = build_simple_model(pipeline) + + with tf.Session() as sess: + # Initialize the globals and the dataset + sess.run(tf.global_variables_initializer()) + sess.run(iterator.initializer) + + for i in range(1, 10): + # Expected value is: + expected_output_array[i - 1] = ( + (input_array[i - 1] * multiplier) * 5) + 10 + + # Run one iteration + output = sess.run(model) + output_array[i - 1] = output[0] + return input_array, output_array, expected_output_array + + +def main(_): + input_array, output_array, expected_output_array = run_axpy_pipeline() + for i in range(1, 10): + print("Iteration:", i, " Input: ", input_array[i - 1], " Output: ", + output_array[i - 1], " Expected: ", expected_output_array[i - 1]) + sys.stdout.flush() + + +if __name__ == '__main__': + os.environ['NGRAPH_TF_BACKEND'] = "INTERPRETER" + #os.environ['NGRAPH_TF_USE_PREFETCH'] = "1" + tf.app.run(main=main) From 94cc4506288b157f7049eb0cbd855e1da15a2777 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 4 Dec 2019 16:09:44 -0800 Subject: [PATCH 13/67] Changed tests. 
Put pipelined io tensors together to avoid unnecessary complications --- examples/axpy_pipelined.py | 1 - examples/axpy_pipelined_extended.py | 34 +++++++++------- ngraph_bridge/ngraph_encapsulate_op.cc | 45 +++++++++++---------- ngraph_bridge/ngraph_prefetch_dataset_op.cc | 4 +- ngraph_bridge/ngraph_prefetch_shared_data.h | 36 +++++++++-------- 5 files changed, 65 insertions(+), 55 deletions(-) diff --git a/examples/axpy_pipelined.py b/examples/axpy_pipelined.py index 46c2b8baf..8bf3dc2ef 100644 --- a/examples/axpy_pipelined.py +++ b/examples/axpy_pipelined.py @@ -64,7 +64,6 @@ def run_axpy_pipeline(): with tf.Session() as sess: # Initialize the globals and the dataset - sess.run(tf.global_variables_initializer()) sess.run(iterator.initializer) for i in range(1, 10): diff --git a/examples/axpy_pipelined_extended.py b/examples/axpy_pipelined_extended.py index 46c2b8baf..51d3d95e8 100644 --- a/examples/axpy_pipelined_extended.py +++ b/examples/axpy_pipelined_extended.py @@ -18,6 +18,7 @@ import numpy as np import tensorflow as tf +from tensorflow.python.framework import dtypes tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) import os @@ -26,15 +27,18 @@ import sys -def build_simple_model(input_array): +def build_simple_model(input_array, c1, c2): # Convert the numpy array to TF Tensor - input = tf.cast(input_array, tf.float32) + input_f = tf.cast(input_array, tf.float32) # Define the Ops - mul = tf.compat.v1.math.multiply(input_array, 5) - add = tf.compat.v1.math.add(mul, 10) - output = add - return output + pl = tf.placeholder(dtype=dtypes.int32) + pl_f = tf.cast(pl, tf.float32) + mul = tf.compat.v1.math.multiply(input_f, c1) + add = tf.compat.v1.math.add(mul, c2) + add2 = add + pl_f + output = add2 + return output, pl def build_data_pipeline(input_array, map_function, batch_size): @@ -52,28 +56,30 @@ def run_axpy_pipeline(): input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] expected_output_array = [-1, -1, 1, -1, -1, -1, -1, -1, -1] output_array = [0, 0, 0, 
0, 0, 0, 0, 0, 0] - multiplier = 10 + map_multiplier = 10 - for i in range(1, 10): - input_array[i - 1] = input_array[i - 1] * i * multiplier - map_function = lambda x: x * multiplier + map_function = lambda x: x * map_multiplier batch_size = 1 pipeline, iterator = build_data_pipeline(input_array, map_function, batch_size) - model = build_simple_model(pipeline) + + # some constants + c1 = 5.0 + c2 = 10.0 + model, pl = build_simple_model(pipeline, c1, c2) with tf.Session() as sess: # Initialize the globals and the dataset - sess.run(tf.global_variables_initializer()) sess.run(iterator.initializer) for i in range(1, 10): # Expected value is: + # Change it to run on TF if the model gets too complex expected_output_array[i - 1] = ( - (input_array[i - 1] * multiplier) * 5) + 10 + (input_array[i - 1] * map_multiplier) * c1) + c2 + i # Run one iteration - output = sess.run(model) + output = sess.run(model, feed_dict={pl: i}) output_array[i - 1] = output[0] return input_array, output_array, expected_output_array diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 500443255..bb7f9d512 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -532,18 +532,19 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { io_tensors_next_iter = pipelined_tensor_store->get_tensors(); // Save the prefetched input ngTensors for the next iteration - NGraphPrefetchSharedResouce::InputTensorBundle next_input_tensor_bundle{ - get<0>(io_tensors_next_iter), get<1>(io_tensors_next_iter)}; + NGraphPrefetchSharedResouce::IOTensorBundle next_io_tensor_bundle{ + get<0>(io_tensors_next_iter), get<1>(io_tensors_next_iter), + get<2>(io_tensors_next_iter)}; OP_REQUIRES(ctx, - current_iter_pipeline_depth == (!next_input_tensor_bundle.Id), + current_iter_pipeline_depth == (!next_io_tensor_bundle.Id), errors::Internal("Current Pipeline Depth is ", current_iter_pipeline_depth, " and next iter 
pipeline depth is also ", - next_input_tensor_bundle.Id)); + next_io_tensor_bundle.Id)); - shared_data->AddNextInputTensorBundleForDeviceTransfer( - next_input_tensor_bundle); + shared_data->AddNextIOTensorBundleForDeviceTransfer( + next_io_tensor_bundle); ctx->SetStatus(ctx->resource_manager()->Create( NGraphPrefetchSharedResouce::CONTAINER_NAME, @@ -563,7 +564,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { cout << "skip_tf2ng_copy true " << endl; // We have been using the pipelined tensors - therefore do the // following: - // 1. Save the prefetched Input tensors for the current iteration + // 1. Save the prefetched Input/Output tensors for the current iteration // to the shared data object so that the prefetcher // can continue with copying the next set of inout tensor to the // device @@ -572,23 +573,23 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { // Add the current prefetched tensors for the next iteration // Get prefetched inputs - NGraphPrefetchSharedResouce::InputTensorBundle - prefetch_input_tensor_bundle{current_iter_pipeline_depth, - ng_inputs}; - shared_data->AddNextInputTensorBundleForDeviceTransfer( - prefetch_input_tensor_bundle); + NGraphPrefetchSharedResouce::IOTensorBundle prefetch_io_tensor_bundle{ + current_iter_pipeline_depth, ng_inputs, ng_outputs}; + shared_data->AddNextIOTensorBundleForDeviceTransfer( + prefetch_io_tensor_bundle); // Update the input_tensors with the one ready for exdcution - auto ng_input_tensor_bundle_ready = - shared_data->GetNextInputTensorBundleReadyForDeviceExecution(); - current_iter_pipeline_depth = ng_input_tensor_bundle_ready.Id; - ng_inputs = ng_input_tensor_bundle_ready.Inputs; - OP_REQUIRES(ctx, current_iter_pipeline_depth == - (!prefetch_input_tensor_bundle.Id), - errors::Internal("Current Pipeline Depth is ", - current_iter_pipeline_depth, - " and next iter pipeline depth is ", - "also ", prefetch_input_tensor_bundle.Id)); + auto 
ng_io_tensor_bundle_ready = + shared_data->GetNextIOTensorBundleReadyForDeviceExecution(); + current_iter_pipeline_depth = ng_io_tensor_bundle_ready.Id; + ng_inputs = ng_io_tensor_bundle_ready.Inputs; + ng_outputs = ng_io_tensor_bundle_ready.Outputs; + OP_REQUIRES( + ctx, current_iter_pipeline_depth == (!prefetch_io_tensor_bundle.Id), + errors::Internal("Current Pipeline Depth is ", + current_iter_pipeline_depth, + " and next iter pipeline depth is ", "also ", + prefetch_io_tensor_bundle.Id)); skip_tf2ng_copy = true; NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Using device tensors"; } diff --git a/ngraph_bridge/ngraph_prefetch_dataset_op.cc b/ngraph_bridge/ngraph_prefetch_dataset_op.cc index 18b444611..e7c5e737a 100644 --- a/ngraph_bridge/ngraph_prefetch_dataset_op.cc +++ b/ngraph_bridge/ngraph_prefetch_dataset_op.cc @@ -419,7 +419,7 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { shared_data->SetBufferDepth(m_buffer_size); auto ng_input_tensor_bundle = - shared_data->GetNextInputTensorBundleForDeviceTransfer(); + shared_data->GetNextIOTensorBundleForDeviceTransfer(); auto ng_prefetch_input_indexes = shared_data->GetPrefetchInputIndexes(); @@ -459,7 +459,7 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { } // Now add them back to the other queue - shared_data->AddNextInputTensorBundleReadyForDeviceExecution( + shared_data->AddNextIOTensorBundleReadyForDeviceExecution( ng_input_tensor_bundle); shared_data->Unref(); evt_dev_cp.Stop(); diff --git a/ngraph_bridge/ngraph_prefetch_shared_data.h b/ngraph_bridge/ngraph_prefetch_shared_data.h index 1fe9054c5..066ce54a3 100644 --- a/ngraph_bridge/ngraph_prefetch_shared_data.h +++ b/ngraph_bridge/ngraph_prefetch_shared_data.h @@ -63,33 +63,36 @@ class NGraphPrefetchSharedResouce : public ResourceBase { static constexpr const char* NGRAPH_TF_USE_PREFETCH = "NGRAPH_TF_USE_PREFETCH"; - struct InputTensorBundle { + struct IOTensorBundle { int Id; std::vector> Inputs; + std::vector> Outputs; }; - // 
Adds the given nGraph input tensors to write to + // Adds the given nGraph input output tensors to write to + // Uses m_prefetch_input_indexes to figure out which input tensors + // are prefetched and writes into them // This is called by the NGraphEncapOp - void AddNextInputTensorBundleForDeviceTransfer(InputTensorBundle next) { + void AddNextIOTensorBundleForDeviceTransfer(IOTensorBundle next) { m_tf_2_ng.Add(std::move(next)); } - // Returns the Input tensors to be used to copy TF tensors to NG device + // Returns the Input output tensors to be used to copy TF tensors to NG device // This will be called by the prefetcher - InputTensorBundle GetNextInputTensorBundleForDeviceTransfer() { + IOTensorBundle GetNextIOTensorBundleForDeviceTransfer() { return std::move(m_tf_2_ng.GetNextAvailable()); } - // Adds the given nGraph input tensors to write to + // Adds the given nGraph input output tensors to write to // This is called by the prefetcher to add Tensors that are copied // from TF tensor and are now ready for the next iteration - void AddNextInputTensorBundleReadyForDeviceExecution(InputTensorBundle next) { + void AddNextIOTensorBundleReadyForDeviceExecution(IOTensorBundle next) { m_ng_2_tf.Add(std::move(next)); } - // Returns the Input tensors to be ready to be executed by NG device + // Returns the Input output tensors to be ready to be executed by NG device // This will be called by the NGEncOp - InputTensorBundle GetNextInputTensorBundleReadyForDeviceExecution() { + IOTensorBundle GetNextIOTensorBundleReadyForDeviceExecution() { return std::move(m_ng_2_tf.GetNextAvailable()); } @@ -138,19 +141,20 @@ class NGraphPrefetchSharedResouce : public ResourceBase { // // The interaction is as follows: // Iteration Action - // 1 NGEncOp pushes the Input tensors to m_ng_2_tf queue + // 1 NGEncOp pushes the Input/Output tensors to m_ng_2_tf queue // 2 - // Prefetcher pulls Input tensors out of m_ng_2_tf queue and copies + // Prefetcher pulls Input/Output tensors out of 
m_ng_2_tf queue and + // copies // TF - // data + // data to the prefetched inputs // Prefetcher pushes this item to the m_tf_2_ng queue - // NGEncOp pushes the Input tensors to m_ng_2_tf queue - // NGEncOp pulls Input tensors from m_tf_2_ng (from previous + // NGEncOp pushes the Input/Output tensors to m_ng_2_tf queue + // NGEncOp pulls Input/Output tensors from m_tf_2_ng (from previous // iteration) and executes // 3 Repeat - ThreadSafeQueue m_tf_2_ng; - ThreadSafeQueue m_ng_2_tf; + ThreadSafeQueue m_tf_2_ng; + ThreadSafeQueue m_ng_2_tf; int m_prefetch_buffer_depth{-1}; int m_skip_count{0}; From 8bc22bd9045b8f15bed23cc04dbc5120b7278ee7 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 4 Dec 2019 17:37:17 -0800 Subject: [PATCH 14/67] refactored --- .../ngraph_encapsulate_get_prefetch.cc | 203 +++++++++++++++++- .../ngraph_encapsulate_get_prefetch.h | 13 +- ngraph_bridge/ngraph_encapsulate_op.cc | 147 +------------ .../ngraph_enter_prefetch_in_catalog.cc | 7 + 4 files changed, 228 insertions(+), 142 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc index b3c70c227..9a89143b7 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc @@ -15,6 +15,7 @@ *******************************************************************************/ #include "ngraph_bridge/ngraph_encapsulate_get_prefetch.h" +#include "ngraph_bridge/ngraph_prefetch_shared_data.h" using namespace std; @@ -22,7 +23,207 @@ namespace tensorflow { namespace ngraph_bridge { -Status GetPrefetchTensors() { return Status::OK(); } +Status GetPipelinedIOTensorsReadyForExecution( + OpKernelContext* ctx, const std::vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, + std::tuple& + pipelined_io_tensors) { + auto io_tensors = pipelined_tensor_store->get_tensors(); + + int current_iter_pipeline_depth = get<0>(io_tensors); + 
PipelinedTensorVector ng_pipelined_inputs = get<1>(io_tensors); + PipelinedTensorVector ng_pipelined_outputs = get<2>(io_tensors); + auto pipelined_input_indexes = tensor_manager->GetPipelinedInputIndexes(); + auto pipelined_output_indexes = tensor_manager->GetPipelinedInputIndexes(); + + if (current_iter_pipeline_depth < 0) { + return errors::Internal("No free tensor available")); + } + + if (pipelined_input_indexes.size() != ng_pipelined_inputs.size()) { + return errors::Internal("Pipelined input tensors size ", ng_pipelined_inputs.size(), " does not match the no. of pipelined inputs indexes ", pipelined_input_indexes.size())); + } + + if (pipelined_output_indexes.size() != ng_pipelined_outputs.size()) { + return errors::Internal("Pipelined output tensors size ", ng_pipelined_outputs.size(), " does not match the no. of pipelined output indexes ", pipelined_output_indexes.size())); + } + + bool skip_tf2ng_copy = false; + if (std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) != + nullptr) { + cout << "using prefetch env flag " << endl; + // Set the prefetch shared obj if applicable + NGraphPrefetchSharedResouce* shared_data = nullptr; + Status s = ctx->resource_manager()->Lookup( + NGraphPrefetchSharedResouce::CONTAINER_NAME, + NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); + + if (!s.ok()) { + // We are using this for the first time i.e., we need to do the following + // 1. Create the shared data object + // 2. We get another pipelined tensor pair for the current iteration and + // add it to the shared data. 
It will be accessed by prefetcher to copy + // the + // prefetched inputs to device + auto ng_prefetch_input_indexes = + tensor_manager->GetPipelinedInputIndexesThatArePrefetched(); + cout << "ng_prefetch_input_indexes " << ng_prefetch_input_indexes.size() + << endl; + + for (auto inp : ng_prefetch_input_indexes) { + cout << " inp indez " << inp << endl; + } + shared_data = new NGraphPrefetchSharedResouce( + name(), m_parallel_executor->GetOpBackendName(), + m_parallel_executor->GetGraphId(), + m_parallel_executor->GetNgraphClusterId(), ng_prefetch_input_indexes); + + // Get the set of IO tensors for the next iteration + std::tuple + io_tensors_next_iter; + io_tensors_next_iter = pipelined_tensor_store->get_tensors(); + + // Save the prefetched input ngTensors for the next iteration + NGraphPrefetchSharedResouce::IOTensorBundle next_io_tensor_bundle{ + get<0>(io_tensors_next_iter), get<1>(io_tensors_next_iter), + get<2>(io_tensors_next_iter)}; + + if (current_iter_pipeline_depth != (!next_io_tensor_bundle.Id)) { + return errors::Internal("Current Pipeline Depth is ", + current_iter_pipeline_depth, + " and next iter pipeline depth is also ", + next_io_tensor_bundle.Id); + } + + shared_data->AddNextIOTensorBundleForDeviceTransfer( + next_io_tensor_bundle); + + ctx->SetStatus(ctx->resource_manager()->Create( + NGraphPrefetchSharedResouce::CONTAINER_NAME, + NGraphPrefetchSharedResouce::RESOURCE_NAME, shared_data)); + // Continue the execution with the currently supplied TF tensor for the + // last time + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " + "signal prefetching"; + } else { + cout << "using prefetch inputs " << endl; + + int prefetch_buffer_depth = shared_data->GetBufferDepth(); + int skip_count = shared_data->GetSkipCount(); + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth + << " skip count; " << skip_count; + if (skip_count >= prefetch_buffer_depth) { + cout << "skip_tf2ng_copy true " << endl; + // We have 
been using the pipelined tensors - therefore do the + // following: + // 1. Save the prefetched Input/Output tensors for the current iteration + // to the shared data object so that the prefetcher + // can continue with copying the next set of inout tensor to the + // device + // 3. Execute the nGraph call for this iteration using the + // nG prefeteched input tensors we got from the shared data + + // Add the current prefetched tensors for the next iteration + // Get prefetched inputs + NGraphPrefetchSharedResouce::IOTensorBundle prefetch_io_tensor_bundle{ + current_iter_pipeline_depth, ng_pipelined_inputs, + ng_pipelined_outputs}; + shared_data->AddNextIOTensorBundleForDeviceTransfer( + prefetch_io_tensor_bundle); + + // Update the input_tensors with the one ready for exdcution + auto ng_io_tensor_bundle_ready = + shared_data->GetNextIOTensorBundleReadyForDeviceExecution(); + current_iter_pipeline_depth = ng_io_tensor_bundle_ready.Id; + ng_pipelined_inputs = ng_io_tensor_bundle_ready.Inputs; + ng_pipelined_outputs = ng_io_tensor_bundle_ready.Outputs; + if (current_iter_pipeline_depth != (!prefetch_io_tensor_bundle.Id)) { + return errors::Internal("Current Pipeline Depth is ", + current_iter_pipeline_depth, + " and next iter pipeline depth is ", "also ", + prefetch_io_tensor_bundle.Id); + } + skip_tf2ng_copy = true; + NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Using device tensors"; + } + shared_data->IncrSkipCount(); + } + } + + // Allocate the input/ + ngraph::Event event_copy_input_tensor("Copy Pipelined Input Tensors", "", ""); + + if (!skip_tf2ng_copy) { + // All pipelined inputs are copied + + for (auto i = 0; i < pipelined_input_indexes.size(); i++) { + cout << "copying inputs true " << endl; + int index = pipelined_input_indexes[i]; + ng::element::Type ng_element_type; + OP_REQUIRES_OK( + ctx, TFDataTypeToNGraphElementType(tf_input_tensors[index].dtype(), + &ng_element_type)); + + void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[index]); + try 
{ + ng_pipelined_inputs[i]->write( + current_src_ptr, ng_pipelined_inputs[i]->get_element_count() * + ng_element_type.size()); + } catch (const std::exception& exp) { + OP_REQUIRES( + ctx, false, + errors::Internal("Error copying TF tensor to device tensor: ", + exp.what())); + } catch (...) { + OP_REQUIRES( + ctx, false, + errors::Internal("Error copying TF tensor to device tensor")); + } + } + } else { + // All pipelined inputs that are not prefetched are copied + // Note skip_tf2ng_copy will be true only when PREFETCH is enabled via env + // flag + + // Gives the TF input index + auto pipelined_input_indexes_not_prefetched = + tensor_manager->GetPipelinedInputIndexes(); + + // Gives the mapping for corresponding + for (auto i = 0; i < pipelined_input_indexes_not_prefetched.size(); i++) { + cout << "copying inputs true " << endl; + int index = pipelined_input_indexes_not_prefetched[i]; + ng::element::Type ng_element_type; + OP_REQUIRES_OK( + ctx, TFDataTypeToNGraphElementType(tf_input_tensors[index].dtype(), + &ng_element_type)); + + void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[index]); + try { + ng_pipelined_inputs[index]->write( + current_src_ptr, ng_pipelined_inputs[index]->get_element_count() * + ng_element_type.size()); + } catch (const std::exception& exp) { + OP_REQUIRES( + ctx, false, + errors::Internal("Error copying TF tensor to device tensor: ", + exp.what())); + } catch (...) 
{ + OP_REQUIRES( + ctx, false, + errors::Internal("Error copying TF tensor to device tensor")); + } + } + } + event_copy_input_tensor.Stop(); + ngraph::Event::write_trace(event_copy_input_tensor); + + pipelined_io_tensors = make_tuple(current_iter_pipeline_depth, + ng_pipelined_inputs, ng_pipelined_outputs); + + return Status::OK(); +} } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h index cc09f3acf..0774674f1 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h @@ -14,8 +14,8 @@ * limitations under the License. *******************************************************************************/ -#ifndef NGRAPH_TF_BRIDGE_GET_PREFETCH_H -#define NGRAPH_TF_BRIDGE_GET_PREFETCH_H +#ifndef NGRAPH_TF_BRIDGE_GET_PIPELINED_TENSORS_H +#define NGRAPH_TF_BRIDGE_GET_PIPELINED_TENSORS_H #pragma once @@ -25,9 +25,14 @@ namespace tensorflow { namespace ngraph_bridge { -Status GetPrefetchTensors(); +Status GetPipelinedIOTensorsReadyForExecution( + OpKernelContext* ctx, const std::vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, + std::tuple& + pipelined_io_tensors); } // namespace ngraph_bridge } // namespace tensorflow -#endif // NGRAPH_TF_BRIDGE_GET_PREFETCH_H +#endif // NGRAPH_TF_BRIDGE_GET_PIPELINED_TENSORS_H diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index bb7f9d512..99aba41f0 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -483,148 +483,21 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { num_of_outputs, "and number of exec outputs ", ng_exec->get_results().size(), " do not match")); - // create inputs, outputs, pipelineId - std::tuple io_tensors; - io_tensors = pipelined_tensor_store->get_tensors(); - 
OP_REQUIRES(ctx, !(std::get<0>(io_tensors) < 0), - errors::Internal("No free tensor available")); - int current_iter_pipeline_depth = get<0>(io_tensors); + // Get pipelined input output tensors for this iteration + std::tuple + pipelined_io_tensors; + OP_REQUIRES_OK(ctx, GetPipelinedIOTensorsReadyForExecution( + ctx, tf_input_tensors, pipelined_tensor_store, + pipelined_io_tensors)); + + int current_iter_pipeline_depth = get<0>(pipelined_io_tensors); vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); // Assume All inputs and outputs are pipelined // TODO: Fit in variables - ng_inputs = get<1>(io_tensors); - ng_outputs = get<2>(io_tensors); - - bool skip_tf2ng_copy = false; - if (std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) != - nullptr) { - cout << "using prefetch env flag " << endl; - // Set the prefetch shared obj if applicable - NGraphPrefetchSharedResouce* shared_data = nullptr; - Status s = ctx->resource_manager()->Lookup( - NGraphPrefetchSharedResouce::CONTAINER_NAME, - NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); - - if (!s.ok()) { - // We are using this for the first time i.e., we need to do the following - // 1. Create the shared data object - // 2. We get another pipelined tensor pair for the current iteration and - // copy the TF tensor to this set and continue with the execution for - // for this iteration. 
- auto ng_prefetch_input_indexes = - tensor_manager->GetPipelinedInputIndexesThatArePrefetched(); - cout << "ng_prefetch_input_indexes " << ng_prefetch_input_indexes.size() - << endl; - - for (auto inp : ng_prefetch_input_indexes) { - cout << " inp indez " << inp << endl; - } - shared_data = new NGraphPrefetchSharedResouce( - name(), m_parallel_executor->GetOpBackendName(), - m_parallel_executor->GetGraphId(), - m_parallel_executor->GetNgraphClusterId(), ng_prefetch_input_indexes); - - // Get the set of IO tensors for the next iteration - std::tuple - io_tensors_next_iter; - io_tensors_next_iter = pipelined_tensor_store->get_tensors(); - - // Save the prefetched input ngTensors for the next iteration - NGraphPrefetchSharedResouce::IOTensorBundle next_io_tensor_bundle{ - get<0>(io_tensors_next_iter), get<1>(io_tensors_next_iter), - get<2>(io_tensors_next_iter)}; - - OP_REQUIRES(ctx, - current_iter_pipeline_depth == (!next_io_tensor_bundle.Id), - errors::Internal("Current Pipeline Depth is ", - current_iter_pipeline_depth, - " and next iter pipeline depth is also ", - next_io_tensor_bundle.Id)); - - shared_data->AddNextIOTensorBundleForDeviceTransfer( - next_io_tensor_bundle); - - ctx->SetStatus(ctx->resource_manager()->Create( - NGraphPrefetchSharedResouce::CONTAINER_NAME, - NGraphPrefetchSharedResouce::RESOURCE_NAME, shared_data)); - // Continue the execution with the currently supplied TF tensor for the - // last time - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " - "signal prefetching"; - } else { - cout << "using prefetch inputs " << endl; - - int prefetch_buffer_depth = shared_data->GetBufferDepth(); - int skip_count = shared_data->GetSkipCount(); - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth - << " skip count; " << skip_count; - if (skip_count >= prefetch_buffer_depth) { - cout << "skip_tf2ng_copy true " << endl; - // We have been using the pipelined tensors - therefore do the - // following: - // 1. 
Save the prefetched Input/Output tensors for the current iteration - // to the shared data object so that the prefetcher - // can continue with copying the next set of inout tensor to the - // device - // 3. Execute the nGraph call for this iteration using the - // nG prefeteched input tensors we got from the shared data - - // Add the current prefetched tensors for the next iteration - // Get prefetched inputs - NGraphPrefetchSharedResouce::IOTensorBundle prefetch_io_tensor_bundle{ - current_iter_pipeline_depth, ng_inputs, ng_outputs}; - shared_data->AddNextIOTensorBundleForDeviceTransfer( - prefetch_io_tensor_bundle); - - // Update the input_tensors with the one ready for exdcution - auto ng_io_tensor_bundle_ready = - shared_data->GetNextIOTensorBundleReadyForDeviceExecution(); - current_iter_pipeline_depth = ng_io_tensor_bundle_ready.Id; - ng_inputs = ng_io_tensor_bundle_ready.Inputs; - ng_outputs = ng_io_tensor_bundle_ready.Outputs; - OP_REQUIRES( - ctx, current_iter_pipeline_depth == (!prefetch_io_tensor_bundle.Id), - errors::Internal("Current Pipeline Depth is ", - current_iter_pipeline_depth, - " and next iter pipeline depth is ", "also ", - prefetch_io_tensor_bundle.Id)); - skip_tf2ng_copy = true; - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Using device tensors"; - } - shared_data->IncrSkipCount(); - } - } - - // Allocate the input/ - ngraph::Event event_copy_input_tensor("Copy Input Tensor", "", ""); - - if (!skip_tf2ng_copy) { - for (auto i = 0; i < tf_input_tensors.size(); i++) { - cout << "copying inputs true " << endl; - ng::element::Type ng_element_type; - OP_REQUIRES_OK(ctx, TFDataTypeToNGraphElementType( - tf_input_tensors[i].dtype(), &ng_element_type)); - - void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[i]); - try { - ng_inputs[i]->write(current_src_ptr, ng_inputs[i]->get_element_count() * - ng_element_type.size()); - } catch (const std::exception& exp) { - OP_REQUIRES( - ctx, false, - errors::Internal("Error copying TF tensor to 
device tensor: ", - exp.what())); - } catch (...) { - OP_REQUIRES( - ctx, false, - errors::Internal("Error copying TF tensor to device tensor")); - } - } - } - event_copy_input_tensor.Stop(); - ngraph::Event::write_trace(event_copy_input_tensor); + ng_inputs = get<1>(pipelined_io_tensors); + ng_outputs = get<2>(pipelined_io_tensors); // And execute ngraph::Event event_execute_graph("Execute Graph", "", ""); diff --git a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc index 84513087f..8ff3bbd6e 100644 --- a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc +++ b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc @@ -23,6 +23,7 @@ #include "logging/ngraph_log.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_enter_prefetch_in_catalog.h" +#include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_utils.h" using namespace std; @@ -44,6 +45,12 @@ namespace ngraph_bridge { // Status EnterPrefetchInCatalog(Graph* graph, int graph_id) { + if (std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) == + nullptr) { + // if prefetch is not requested return + return Status::OK(); + } + // Go over all the nodes in the graph for (auto node : graph->op_nodes()) { // If the node is a NGraphEncapsulate, go over all it's From 33d4431399a53fbc9f4f008a5dacaa0721031112 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 4 Dec 2019 17:56:53 -0800 Subject: [PATCH 15/67] Indexes utilities --- ngraph_bridge/ngraph_encapsulate_get_prefetch.cc | 7 +++++-- ngraph_bridge/ngraph_tensor_manager.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc index 9a89143b7..f121d5b45 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc @@ -188,11 +188,14 @@ Status 
GetPipelinedIOTensorsReadyForExecution( // Gives the TF input index auto pipelined_input_indexes_not_prefetched = - tensor_manager->GetPipelinedInputIndexes(); + tensor_manager->GetPipelinedInputIndexesNotPrefetched(); + + auto pipelined_input_indexes_not_prefetched = + tensor_manager->GetPipelinedInputIndexesNotPrefetched(); // Gives the mapping for corresponding for (auto i = 0; i < pipelined_input_indexes_not_prefetched.size(); i++) { - cout << "copying inputs true " << endl; + cout << "copying some inputs true " << endl; int index = pipelined_input_indexes_not_prefetched[i]; ng::element::Type ng_element_type; OP_REQUIRES_OK( diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 09060dd32..b22800d16 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -66,14 +66,26 @@ class NGraphTensorManager { return m_pipelined_output_indexes; } + // wrt to all inputs const vector& GetPrefetchedInputIndexes() { return m_prefetched_input_indexes; } + // wrt to all inputs + const vector& GetPipelinedButNotPrefetchedInputIndexes() { + return m_pipelined_not_prefetched_input_indexes; + } + + // wrt to pipelined inputs const vector& GetPipelinedInputIndexesThatArePrefetched() { return m_pipelined_input_indexes_prefetched; } + // wrt to pipelined inputs + const vector& GetPipelinedInputIndexesThatAreNotPrefetched() { + return m_pipelined_input_indexes_not_prefetched; + } + vector> GetPrefetchedTensors( const vector>& pipelined_input_tensors); @@ -94,9 +106,11 @@ class NGraphTensorManager { vector m_pipelined_input_indexes; vector m_pipelined_output_indexes; vector m_pipelined_input_indexes_prefetched; + vector m_pipelined_input_indexes_not_prefetched; //[TODO] Book-keeping for prefetched inputs vector m_prefetched_input_indexes; + vector m_pipelined_not_prefetched_input_indexes; }; } // namespace ngraph_bridge From 1c1aaffcc91bc5769440c40aa2aa09c73660e3c9 Mon Sep 17 00:00:00 2001 From: Shrestha 
Malik Date: Thu, 5 Dec 2019 17:54:40 -0800 Subject: [PATCH 16/67] Fix test --- .../ngraph_encapsulate_get_prefetch.cc | 88 ++++++++++--------- .../ngraph_encapsulate_get_prefetch.h | 13 ++- ngraph_bridge/ngraph_encapsulate_op.cc | 2 +- ngraph_bridge/ngraph_prefetch_shared_data.h | 8 +- ngraph_bridge/ngraph_tensor_manager.cc | 10 +++ ngraph_bridge/ngraph_tensor_manager.h | 14 ++- ngraph_bridge/ngraph_utils.cc | 15 ++++ ngraph_bridge/ngraph_utils.h | 7 ++ 8 files changed, 103 insertions(+), 54 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc index f121d5b45..a8faeecfe 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc @@ -16,6 +16,7 @@ #include "ngraph_bridge/ngraph_encapsulate_get_prefetch.h" #include "ngraph_bridge/ngraph_prefetch_shared_data.h" +#include "ngraph_bridge/ngraph_utils.h" using namespace std; @@ -24,9 +25,9 @@ namespace tensorflow { namespace ngraph_bridge { Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, const std::vector& tf_input_tensors, - const shared_ptr& pipelined_tensor_store, - const shared_ptr& tensor_manager, + OpKernelContext* ctx, std::vector& tf_input_tensors, + shared_ptr& pipelined_tensor_store, + shared_ptr& tensor_manager, std::tuple& pipelined_io_tensors) { auto io_tensors = pipelined_tensor_store->get_tensors(); @@ -35,18 +36,24 @@ Status GetPipelinedIOTensorsReadyForExecution( PipelinedTensorVector ng_pipelined_inputs = get<1>(io_tensors); PipelinedTensorVector ng_pipelined_outputs = get<2>(io_tensors); auto pipelined_input_indexes = tensor_manager->GetPipelinedInputIndexes(); - auto pipelined_output_indexes = tensor_manager->GetPipelinedInputIndexes(); + auto pipelined_output_indexes = tensor_manager->GetPipelinedOutputIndexes(); if (current_iter_pipeline_depth < 0) { - return errors::Internal("No free tensor available")); + return errors::Internal("No free tensor 
available"); } if (pipelined_input_indexes.size() != ng_pipelined_inputs.size()) { - return errors::Internal("Pipelined input tensors size ", ng_pipelined_inputs.size(), " does not match the no. of pipelined inputs indexes ", pipelined_input_indexes.size())); + return errors::Internal( + "Pipelined input tensors size ", ng_pipelined_inputs.size(), + " does not match the no. of pipelined inputs indexes ", + pipelined_input_indexes.size()); } if (pipelined_output_indexes.size() != ng_pipelined_outputs.size()) { - return errors::Internal("Pipelined output tensors size ", ng_pipelined_outputs.size(), " does not match the no. of pipelined output indexes ", pipelined_output_indexes.size())); + return errors::Internal( + "Pipelined output tensors size ", ng_pipelined_outputs.size(), + " does not match the no. of pipelined output indexes ", + pipelined_output_indexes.size()); } bool skip_tf2ng_copy = false; @@ -75,9 +82,8 @@ Status GetPipelinedIOTensorsReadyForExecution( cout << " inp indez " << inp << endl; } shared_data = new NGraphPrefetchSharedResouce( - name(), m_parallel_executor->GetOpBackendName(), - m_parallel_executor->GetGraphId(), - m_parallel_executor->GetNgraphClusterId(), ng_prefetch_input_indexes); + tensor_manager->GetName(), tensor_manager->GetGraphId(), + tensor_manager->GetClusterId(), ng_prefetch_input_indexes); // Get the set of IO tensors for the next iteration std::tuple @@ -159,26 +165,23 @@ Status GetPipelinedIOTensorsReadyForExecution( for (auto i = 0; i < pipelined_input_indexes.size(); i++) { cout << "copying inputs true " << endl; - int index = pipelined_input_indexes[i]; - ng::element::Type ng_element_type; - OP_REQUIRES_OK( - ctx, TFDataTypeToNGraphElementType(tf_input_tensors[index].dtype(), - &ng_element_type)); + int tf_index = pipelined_input_indexes[i]; + cout << "tf index " << tf_index << "ng index " << i << endl; - void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[index]); + ng::element::Type ng_element_type; + 
TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType( + tf_input_tensors[tf_index].dtype(), &ng_element_type)); + void* current_src_ptr = + (void*)DMAHelper::base(&tf_input_tensors[tf_index]); try { ng_pipelined_inputs[i]->write( current_src_ptr, ng_pipelined_inputs[i]->get_element_count() * ng_element_type.size()); } catch (const std::exception& exp) { - OP_REQUIRES( - ctx, false, - errors::Internal("Error copying TF tensor to device tensor: ", - exp.what())); + return errors::Internal("Error copying TF tensor to device tensor: ", + exp.what()); } catch (...) { - OP_REQUIRES( - ctx, false, - errors::Internal("Error copying TF tensor to device tensor")); + return errors::Internal("Error copying TF tensor to device tensor"); } } } else { @@ -186,36 +189,35 @@ Status GetPipelinedIOTensorsReadyForExecution( // Note skip_tf2ng_copy will be true only when PREFETCH is enabled via env // flag - // Gives the TF input index - auto pipelined_input_indexes_not_prefetched = - tensor_manager->GetPipelinedInputIndexesNotPrefetched(); + // Gives the TF input index : wrt to all inputs + auto pipelined_not_prefetched_input_indexes = + tensor_manager->GetPipelinedButNotPrefetchedInputIndexes(); + // Gives the corresponding pipelined input index : wrt pipelined auto pipelined_input_indexes_not_prefetched = - tensor_manager->GetPipelinedInputIndexesNotPrefetched(); + tensor_manager->GetPipelinedInputIndexesThatAreNotPrefetched(); // Gives the mapping for corresponding for (auto i = 0; i < pipelined_input_indexes_not_prefetched.size(); i++) { cout << "copying some inputs true " << endl; - int index = pipelined_input_indexes_not_prefetched[i]; + int tf_index = pipelined_not_prefetched_input_indexes[i]; + int ng_index = pipelined_input_indexes_not_prefetched[i]; ng::element::Type ng_element_type; - OP_REQUIRES_OK( - ctx, TFDataTypeToNGraphElementType(tf_input_tensors[index].dtype(), - &ng_element_type)); - - void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[index]); + cout << 
"tf index " << tf_index << " ng_index " << ng_index << endl; + TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType( + tf_input_tensors[tf_index].dtype(), &ng_element_type)); + void* current_src_ptr = + (void*)DMAHelper::base(&tf_input_tensors[tf_index]); try { - ng_pipelined_inputs[index]->write( - current_src_ptr, ng_pipelined_inputs[index]->get_element_count() * - ng_element_type.size()); + ng_pipelined_inputs[ng_index]->write( + current_src_ptr, + ng_pipelined_inputs[ng_index]->get_element_count() * + ng_element_type.size()); } catch (const std::exception& exp) { - OP_REQUIRES( - ctx, false, - errors::Internal("Error copying TF tensor to device tensor: ", - exp.what())); + return errors::Internal("Error copying TF tensor to device tensor: ", + exp.what()); } catch (...) { - OP_REQUIRES( - ctx, false, - errors::Internal("Error copying TF tensor to device tensor")); + return errors::Internal("Error copying TF tensor to device tensor"); } } } diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h index 0774674f1..33d1476d2 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h +++ b/ngraph_bridge/ngraph_encapsulate_get_prefetch.h @@ -21,15 +21,20 @@ #include "tensorflow/core/graph/graph.h" +#include "logging/ngraph_log.h" +#include "ngraph_bridge/ngraph_pipelined_tensors.h" +#include "ngraph_bridge/ngraph_tensor_manager.h" + +using namespace std; namespace tensorflow { namespace ngraph_bridge { Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, const std::vector& tf_input_tensors, - const shared_ptr& pipelined_tensor_store, - const shared_ptr& tensor_manager, - std::tuple& + OpKernelContext* ctx, vector& tf_input_tensors, + shared_ptr& pipelined_tensor_store, + shared_ptr& tensor_manager, + tuple& pipelined_io_tensors); } // namespace ngraph_bridge diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 99aba41f0..8b62e4d78 100644 --- 
a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -488,7 +488,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { pipelined_io_tensors; OP_REQUIRES_OK(ctx, GetPipelinedIOTensorsReadyForExecution( ctx, tf_input_tensors, pipelined_tensor_store, - pipelined_io_tensors)); + tensor_manager, pipelined_io_tensors)); int current_iter_pipeline_depth = get<0>(pipelined_io_tensors); vector> ng_inputs(num_of_inputs); diff --git a/ngraph_bridge/ngraph_prefetch_shared_data.h b/ngraph_bridge/ngraph_prefetch_shared_data.h index 066ce54a3..8ce683efd 100644 --- a/ngraph_bridge/ngraph_prefetch_shared_data.h +++ b/ngraph_bridge/ngraph_prefetch_shared_data.h @@ -39,11 +39,11 @@ namespace ngraph_bridge { class NGraphPrefetchSharedResouce : public ResourceBase { public: explicit NGraphPrefetchSharedResouce(const std::string& ng_enc_op_name, - const std::string& backend_name, + // const std::string& backend_name, int cluster_id, int graph_id, const vector prefetch_input_indexes) : m_ng_enc_op_name(ng_enc_op_name), - m_backend_name(backend_name), + // m_backend_name(backend_name), m_graph_id(graph_id), m_cluster_id(cluster_id), m_prefetch_input_indexes(prefetch_input_indexes) {} @@ -54,7 +54,7 @@ class NGraphPrefetchSharedResouce : public ResourceBase { // Returns memory used by this resource. 
int64 MemoryUsed() const override { return 0; } std::string GetName() const { return m_ng_enc_op_name; } - std::string GetBackendName() const { return m_backend_name; } + // std::string GetBackendName() const { return m_backend_name; } int GetGraphId() const { return m_graph_id; } int GetClusterId() const { return m_cluster_id; } @@ -126,7 +126,7 @@ class NGraphPrefetchSharedResouce : public ResourceBase { private: const std::string m_ng_enc_op_name; - const std::string m_backend_name; + // const std::string m_backend_name; const int m_graph_id; const int m_cluster_id; const vector m_prefetch_input_indexes; diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 9576a7732..d8b0d6ffb 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -58,6 +58,10 @@ void NGraphTensorManager::Initialize() { m_output_indexes_that_need_copy.push_back(index); } } +#else + m_output_indexes_that_need_copy.resize(m_number_of_outputs); + iota(begin(m_output_indexes_that_need_copy), + end(m_output_indexes_that_need_copy), 0); #endif m_pipelined_input_indexes = FindComplement(m_number_of_inputs, m_input_indexes_from_variables); @@ -96,6 +100,12 @@ void NGraphTensorManager::Initialize() { m_pipelined_input_indexes_prefetched.push_back( position - m_pipelined_input_indexes.begin()); } + + // complements + m_pipelined_input_indexes_not_prefetched = FindComplement( + m_pipelined_input_indexes, m_pipelined_input_indexes_prefetched); + m_pipelined_not_prefetched_input_indexes = + FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); } //--------------------------------------------------------------------------- diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index b22800d16..af2b0f818 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -42,6 +42,12 @@ class NGraphTensorManager { ~NGraphTensorManager(); + 
string GetName() { return m_ng_encap_node_name; } + + int GetClusterId() { return m_ng_encap_cluster_id; } + + int GetGraphId() { return m_ng_encap_graph_id; } + const int& GetNumberOfInputs() { return m_number_of_inputs; } const int& GetNumberOfOutputs() { return m_number_of_outputs; } @@ -98,17 +104,21 @@ class NGraphTensorManager { int m_number_of_outputs; // Book-keeping for weights-on-device optimizations + // indexes wrt all inputs/outputs vector m_input_indexes_from_variables; vector m_output_indexes_assigning_variable; vector m_output_indexes_that_need_copy; - // All indexes that are not for from/to variables + // All indexes that are not from/to variables + // These are pipelined, some of these are also prefetched + // indexes wrt all inputs/outputs vector m_pipelined_input_indexes; vector m_pipelined_output_indexes; + // indexes wrt pipelined inputs vector m_pipelined_input_indexes_prefetched; vector m_pipelined_input_indexes_not_prefetched; - //[TODO] Book-keeping for prefetched inputs + // indexes wrt all inputs vector m_prefetched_input_indexes; vector m_pipelined_not_prefetched_input_indexes; }; diff --git a/ngraph_bridge/ngraph_utils.cc b/ngraph_bridge/ngraph_utils.cc index 686799e94..240ed82c6 100644 --- a/ngraph_bridge/ngraph_utils.cc +++ b/ngraph_bridge/ngraph_utils.cc @@ -57,6 +57,21 @@ vector FindComplement(const int& max_element, return complement; } +// Finds the complement of element_set +// From the superset +// Finds: superset - element_set +// Assumes superset and element_superset are sorted +vector FindComplement(const vector& superset, + const vector& element_set) { + // max size of complement is superset + vector complement(superset.size()); + vector::iterator it = set_difference( + superset.begin(), superset.begin() + superset.size(), element_set.begin(), + element_set.begin() + element_set.size(), complement.begin()); + complement.resize(it - complement.begin()); + return complement; +} + int FindNumberOfNodes(const Graph* graph, 
const string op_type) { int count = 0; for (auto node : graph->nodes()) { diff --git a/ngraph_bridge/ngraph_utils.h b/ngraph_bridge/ngraph_utils.h index 6df01a1dd..daa6cbf5b 100644 --- a/ngraph_bridge/ngraph_utils.h +++ b/ngraph_bridge/ngraph_utils.h @@ -47,6 +47,13 @@ namespace ngraph_bridge { vector FindComplement(const int& max_element, const vector& element_set); +// Finds the complement of element_set +// From the superset +// Finds: superset - element_set +// Assumes superset and element_superset are sorted +vector FindComplement(const vector& element_superset, + const vector& element_set); + int FindNumberOfNodes(const Graph* graph, const string op_type); Status IsNgraphTFLogTensorCopiesEnabled(int graph_id, From 958f62e302bab0162c7e5f057346b1abc7817713 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 10 Dec 2019 17:38:49 -0800 Subject: [PATCH 17/67] renamed the files --- bazel/BUILD | 4 ++-- ngraph_bridge/CMakeLists.txt | 2 +- ngraph_bridge/ngraph_encapsulate_op.cc | 2 +- ...psulate_get_prefetch.cc => ngraph_encapsulate_op_utils.cc} | 2 +- ...capsulate_get_prefetch.h => ngraph_encapsulate_op_utils.h} | 0 5 files changed, 5 insertions(+), 5 deletions(-) rename ngraph_bridge/{ngraph_encapsulate_get_prefetch.cc => ngraph_encapsulate_op_utils.cc} (99%) rename ngraph_bridge/{ngraph_encapsulate_get_prefetch.h => ngraph_encapsulate_op_utils.h} (100%) diff --git a/bazel/BUILD b/bazel/BUILD index 15964fd1e..6e59a58a3 100644 --- a/bazel/BUILD +++ b/bazel/BUILD @@ -36,9 +36,9 @@ cc_library( "ngraph_bridge/ngraph_enter_prefetch_in_catalog.h", "ngraph_bridge/ngraph_executor.h", "ngraph_bridge/ngraph_encapsulate_op.h", + "ngraph_bridge/ngraph_encapsulate_op_utils.h", "ngraph_bridge/ngraph_data_cache.h", "ngraph_bridge/ngraph_find_replace_prefetchdataset.h", - "ngraph_bridge/ngraph_encapsulate_get_prefetch.h", "ngraph_bridge/ngraph_freshness_tracker.h", "ngraph_bridge/ngraph_mark_for_clustering.h", "ngraph_bridge/ngraph_partial_shapes.h", @@ -80,9 +80,9 @@ 
cc_library( "ngraph_bridge/ngraph_encapsulate_clusters.cc", "ngraph_bridge/ngraph_encapsulate_impl.cc", "ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc", - "ngraph_bridge/ngraph_encapsulate_get_prefetch.cc", "ngraph_bridge/ngraph_executor.cc", "ngraph_bridge/ngraph_encapsulate_op.cc", + "ngraph_bridge/ngraph_encapsulate_op_utils.cc", "ngraph_bridge/ngraph_freshness_tracker.cc", "ngraph_bridge/ngraph_mark_for_clustering.cc", "ngraph_bridge/ngraph_partial_shapes.cc", diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index f48aeef56..d895a3028 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -48,6 +48,7 @@ set(SRC ngraph_executor.cc ops/ngraph_ops.cc ngraph_encapsulate_op.cc + ngraph_encapsulate_op_utils.cc ngraph_freshness_tracker.cc ngraph_mark_for_clustering.cc ngraph_partial_shapes.cc @@ -60,7 +61,6 @@ set(SRC tf_deadness_analysis.cc prefetch_autotuner.cc ngraph_prefetch_dataset_op.cc - ngraph_encapsulate_get_prefetch.cc stats_utils.cc version.cc ) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index c87e83c7e..35328586e 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -40,9 +40,9 @@ #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_builder.h" #include "ngraph_bridge/ngraph_cluster_manager.h" -#include "ngraph_bridge/ngraph_encapsulate_get_prefetch.h" #include "ngraph_bridge/ngraph_encapsulate_impl.h" #include "ngraph_bridge/ngraph_encapsulate_op.h" +#include "ngraph_bridge/ngraph_encapsulate_op_utils.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_mark_for_clustering.h" #include "ngraph_bridge/ngraph_pipelined_tensors.h" diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc similarity index 99% rename from ngraph_bridge/ngraph_encapsulate_get_prefetch.cc rename to 
ngraph_bridge/ngraph_encapsulate_op_utils.cc index a8faeecfe..1db2c164e 100644 --- a/ngraph_bridge/ngraph_encapsulate_get_prefetch.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -14,7 +14,7 @@ * limitations under the License. *******************************************************************************/ -#include "ngraph_bridge/ngraph_encapsulate_get_prefetch.h" +#include "ngraph_bridge/ngraph_encapsulate_op_utils.h" #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_utils.h" diff --git a/ngraph_bridge/ngraph_encapsulate_get_prefetch.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h similarity index 100% rename from ngraph_bridge/ngraph_encapsulate_get_prefetch.h rename to ngraph_bridge/ngraph_encapsulate_op_utils.h From bfef9c08d6c02141cbf7ba4bf1eeee6a63c498a7 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 10 Dec 2019 17:56:30 -0800 Subject: [PATCH 18/67] Fixed Prefetch Tests --- test/test_enter_prefetch_in_catalog.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/test_enter_prefetch_in_catalog.cc b/test/test_enter_prefetch_in_catalog.cc index 0d4309260..2d23af0c3 100644 --- a/test/test_enter_prefetch_in_catalog.cc +++ b/test/test_enter_prefetch_in_catalog.cc @@ -46,6 +46,12 @@ namespace ngraph_bridge { namespace testing { TEST(PrefetchCatalogTest, SmallGraph1) { + // Set flag to enable prefetch + list env_vars{"NGRAPH_TF_USE_PREFETCH"}; + const unordered_map& env_map = StoreEnv(env_vars); + SetEnvVariable("NGRAPH_TF_USE_PREFETCH", "1"); + + // Create Graph GraphConstructorOptions opts; opts.allow_internal_ops = true; Graph input_graph(OpRegistry::Global()); @@ -72,9 +78,17 @@ TEST(PrefetchCatalogTest, SmallGraph1) { // Clean up NGraphCatalog::ClearCatalog(); + // Unset, Restore env flga + UnsetEnvVariable("NGRAPH_TF_USE_PREFETCH"); + RestoreEnv(env_map); } TEST(PrefetchCatalogTest, SmallGraph2) { + // Set flag to enable prefetch + list env_vars{"NGRAPH_TF_USE_PREFETCH"}; + const 
unordered_map& env_map = StoreEnv(env_vars); + SetEnvVariable("NGRAPH_TF_USE_PREFETCH", "1"); + GraphConstructorOptions opts; opts.allow_internal_ops = true; Graph input_graph(OpRegistry::Global()); @@ -98,6 +112,9 @@ TEST(PrefetchCatalogTest, SmallGraph2) { // Clean up NGraphCatalog::ClearCatalog(); + // Unset, Restore env flga + UnsetEnvVariable("NGRAPH_TF_USE_PREFETCH"); + RestoreEnv(env_map); } } // namespace testing From 065db005ea99ab10c8ee9f8ca8e7b9e812c089a1 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 14:56:07 -0800 Subject: [PATCH 19/67] fixed tests --- ngraph_bridge/ngraph_tensor_manager.cc | 14 ----- ngraph_bridge/ngraph_tensor_manager.h | 3 - test/test_ngraph_tensor_manager.cpp | 83 +++----------------------- 3 files changed, 8 insertions(+), 92 deletions(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index d8b0d6ffb..5a8db0ccb 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -113,19 +113,5 @@ void NGraphTensorManager::Initialize() { //--------------------------------------------------------------------------- NGraphTensorManager::~NGraphTensorManager() {} -//--------------------------------------------------------------------------- -// NGraphTensorManager::GetPrefetchedTensors -//--------------------------------------------------------------------------- -vector> -NGraphTensorManager::GetPrefetchedTensors( - const vector>& pipelined_input_tensors) { - vector> prefetched_tensors; - auto prefetched_indexes = GetPipelinedInputIndexesThatArePrefetched(); - for (auto index : prefetched_indexes) { - prefetched_tensors.push_back(pipelined_input_tensors[index]); - } - return prefetched_tensors; -} - } // namespace ngraph_bridge } // namespace tensorflow \ No newline at end of file diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index af2b0f818..2fe2aa0a7 100644 --- 
a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -92,9 +92,6 @@ class NGraphTensorManager { return m_pipelined_input_indexes_not_prefetched; } - vector> GetPrefetchedTensors( - const vector>& pipelined_input_tensors); - private: void Initialize(); string m_ng_encap_node_name; diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index e5d9056a2..a4eb96ae2 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -129,11 +129,13 @@ TEST_F(NGraphTensorManagerTest, NoVariablesNoPrefetch) { vector empty; vector expected_pipelined_inp_indexes = FillRange(number_of_inputs); vector expected_pipelined_out_indexes = FillRange(number_of_outputs); + vector expected_out_indexes_need_copy = FillRange(number_of_outputs); // var related ASSERT_EQ(empty, tensor_manager.GetInputIndexesFedByVariables()); ASSERT_EQ(empty, tensor_manager.GetOutputIndexesAssigningVariables()); - ASSERT_EQ(empty, tensor_manager.GetOutputIndexesThatNeedCopy()); + ASSERT_EQ(expected_out_indexes_need_copy, + tensor_manager.GetOutputIndexesThatNeedCopy()); ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexes()); ASSERT_EQ(expected_pipelined_out_indexes, @@ -180,7 +182,7 @@ TEST_F(NGraphTensorManagerTest, HasVariablesNoPrefetch) { expected_var_inp_indexes = {}; expected_var_out_indexes = {}; - expected_out_indexes_need_copy = {}; + expected_out_indexes_need_copy = FillRange(number_of_outputs); expected_prefetched_inp_indexes = {}; } @@ -220,6 +222,7 @@ TEST_F(NGraphTensorManagerTest, NoVariablesHasPrefetch) { vector empty; vector expected_pipelined_inp_indexes = FillRange(number_of_inputs); vector expected_pipelined_out_indexes = FillRange(number_of_outputs); + vector expected_out_indexes_need_copy = FillRange(number_of_outputs); vector expected_prefetched_inp_indexes = {1, 3}; vector expected_pipelined_inp_indexes_prefetched = { 1, 3}; // as all inputs are pipelined @@ 
-234,7 +237,8 @@ TEST_F(NGraphTensorManagerTest, NoVariablesHasPrefetch) { // var related ASSERT_EQ(empty, tensor_manager.GetInputIndexesFedByVariables()); ASSERT_EQ(empty, tensor_manager.GetOutputIndexesAssigningVariables()); - ASSERT_EQ(empty, tensor_manager.GetOutputIndexesThatNeedCopy()); + ASSERT_EQ(expected_out_indexes_need_copy, + tensor_manager.GetOutputIndexesThatNeedCopy()); ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexes()); ASSERT_EQ(expected_pipelined_out_indexes, @@ -289,7 +293,7 @@ TEST_F(NGraphTensorManagerTest, VariablesAndPrefetch) { expected_var_inp_indexes = {}; expected_var_out_indexes = {}; - expected_out_indexes_need_copy = {}; + expected_out_indexes_need_copy = FillRange(number_of_outputs); } EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, @@ -341,77 +345,6 @@ TEST_F(NGraphTensorManagerTest, PrefetchNotInPipeline) { ClearCatalog(); } -TEST_F(NGraphTensorManagerTest, GetPrefetchedTensors1) { - string ng_encap_node_name = "xyz_1"; - int ng_encap_cluster_id = 1; - int ng_encap_graph_id = 1; - int number_of_inputs = 7; - int number_of_outputs = 4; - - // expected - vector expected_pipelined_inp_indexes, expected_pipelined_out_indexes, - expected_var_inp_indexes, expected_var_out_indexes, - expected_out_indexes_need_copy, expected_prefetched_inp_indexes, - expected_pipelined_inp_indexes_prefetched; - - if (ngraph_tf_are_variables_enabled()) { - // expected values - expected_pipelined_inp_indexes = {1, 3, 4, 6}; - expected_prefetched_inp_indexes = {3, 6}; - expected_pipelined_inp_indexes_prefetched = {1, 3}; - expected_pipelined_out_indexes = {0, 2}; - expected_var_inp_indexes = - FindComplement(number_of_inputs, expected_pipelined_inp_indexes); - expected_var_out_indexes = - FindComplement(number_of_outputs, expected_pipelined_out_indexes); - expected_out_indexes_need_copy = {2, 3}; - // enter in catalog - EnterVarInCatalog(ng_encap_graph_id, ng_encap_node_name, - expected_var_inp_indexes, 
expected_var_out_indexes, - expected_out_indexes_need_copy); - - } else { - expected_pipelined_inp_indexes = FillRange(number_of_inputs); - expected_pipelined_out_indexes = FillRange(number_of_outputs); - expected_prefetched_inp_indexes = {3, 6}; - expected_pipelined_inp_indexes_prefetched = { - 3, 6}; // all inputs are pipelined - - expected_var_inp_indexes = {}; - expected_var_out_indexes = {}; - expected_out_indexes_need_copy = {}; - } - - EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, - expected_prefetched_inp_indexes); - - NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, - ng_encap_graph_id, number_of_inputs, - number_of_outputs); - - vector> pipelined_input_tensors( - expected_pipelined_inp_indexes.size()); - - for (int i = 0; i < pipelined_input_tensors.size(); i++) { - pipelined_input_tensors[i] = - CreateNGraphScalarTensor(expected_pipelined_inp_indexes[i]); - } - - vector> prefetched_input_tensors = - tensor_manager.GetPrefetchedTensors(pipelined_input_tensors); - ASSERT_EQ(prefetched_input_tensors.size(), - expected_prefetched_inp_indexes.size()); - - for (int i = 0; i < expected_prefetched_inp_indexes.size(); i++) { - int tensor_val = 0; - prefetched_input_tensors[i]->read(&tensor_val, sizeof(tensor_val)); - ASSERT_EQ(tensor_val, expected_prefetched_inp_indexes[i]); - } - - // clean up - ClearCatalog(); -} - } // namespace testing } // namespace ngraph_bridge } // namespace tensorflow \ No newline at end of file From 30fec1664c6b3b12d50019e40e4d005848ba5aee Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 16:07:03 -0800 Subject: [PATCH 20/67] Removed couts --- ngraph_bridge/ngraph_catalog.cc | 4 ---- ngraph_bridge/ngraph_encapsulate_op.cc | 1 - ngraph_bridge/ngraph_encapsulate_op_utils.cc | 13 ------------- ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc | 4 ---- ngraph_bridge/ngraph_tensor_manager.cc | 8 -------- 5 files changed, 30 deletions(-) diff --git a/ngraph_bridge/ngraph_catalog.cc 
b/ngraph_bridge/ngraph_catalog.cc index fdb70790d..95b65f506 100644 --- a/ngraph_bridge/ngraph_catalog.cc +++ b/ngraph_bridge/ngraph_catalog.cc @@ -220,21 +220,17 @@ void NGraphCatalog::AddToPrefetchedInputIndexMap( throw runtime_error("Trying to add an already existing key ( " + key + " ) in PrefetchedInputIndexMap "); } - cout << " AddToPrefetchedInputIndexMap key " << key << endl; NGraphCatalog::prefetched_input_index_map_.insert({key, val}); } bool NGraphCatalog::ExistsInPrefetchedInputIndexMap(const int& graphid, const string& node_name) { string key = NGraphCatalog::CreateNodeKey(graphid, node_name); - cout << " ExistsInPrefetchedInputIndexMap key " << key << endl; return NGraphCatalog::ExistsInPrefetchedInputIndexMap(key); } bool NGraphCatalog::ExistsInPrefetchedInputIndexMap(const string& key) { auto itr = NGraphCatalog::prefetched_input_index_map_.find(key); - cout << " ExistsInPrefetchedInputIndexMap check " - << (itr != NGraphCatalog::prefetched_input_index_map_.end()) << endl; return itr != NGraphCatalog::prefetched_input_index_map_.end(); } diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 35328586e..72bc9bff7 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -418,7 +418,6 @@ void NGraphEncapsulateOp::Compute(OpKernelContext* ctx) { // ComputeUsingParallelExecutor //--------------------------------------------------------------------------- void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { - cout << "using parallel exec " << endl; // TF input tensors std::vector tf_input_tensors; diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 1db2c164e..55faae905 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -59,7 +59,6 @@ Status GetPipelinedIOTensorsReadyForExecution( bool skip_tf2ng_copy = false; if 
(std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) != nullptr) { - cout << "using prefetch env flag " << endl; // Set the prefetch shared obj if applicable NGraphPrefetchSharedResouce* shared_data = nullptr; Status s = ctx->resource_manager()->Lookup( @@ -75,12 +74,7 @@ Status GetPipelinedIOTensorsReadyForExecution( // prefetched inputs to device auto ng_prefetch_input_indexes = tensor_manager->GetPipelinedInputIndexesThatArePrefetched(); - cout << "ng_prefetch_input_indexes " << ng_prefetch_input_indexes.size() - << endl; - for (auto inp : ng_prefetch_input_indexes) { - cout << " inp indez " << inp << endl; - } shared_data = new NGraphPrefetchSharedResouce( tensor_manager->GetName(), tensor_manager->GetGraphId(), tensor_manager->GetClusterId(), ng_prefetch_input_indexes); @@ -113,14 +107,11 @@ Status GetPipelinedIOTensorsReadyForExecution( NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " "signal prefetching"; } else { - cout << "using prefetch inputs " << endl; - int prefetch_buffer_depth = shared_data->GetBufferDepth(); int skip_count = shared_data->GetSkipCount(); NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth << " skip count; " << skip_count; if (skip_count >= prefetch_buffer_depth) { - cout << "skip_tf2ng_copy true " << endl; // We have been using the pipelined tensors - therefore do the // following: // 1. 
Save the prefetched Input/Output tensors for the current iteration @@ -164,9 +155,7 @@ Status GetPipelinedIOTensorsReadyForExecution( // All pipelined inputs are copied for (auto i = 0; i < pipelined_input_indexes.size(); i++) { - cout << "copying inputs true " << endl; int tf_index = pipelined_input_indexes[i]; - cout << "tf index " << tf_index << "ng index " << i << endl; ng::element::Type ng_element_type; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType( @@ -199,11 +188,9 @@ Status GetPipelinedIOTensorsReadyForExecution( // Gives the mapping for corresponding for (auto i = 0; i < pipelined_input_indexes_not_prefetched.size(); i++) { - cout << "copying some inputs true " << endl; int tf_index = pipelined_not_prefetched_input_indexes[i]; int ng_index = pipelined_input_indexes_not_prefetched[i]; ng::element::Type ng_element_type; - cout << "tf index " << tf_index << " ng_index " << ng_index << endl; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = diff --git a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc index 8ff3bbd6e..75d96e954 100644 --- a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc +++ b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.cc @@ -69,10 +69,6 @@ Status EnterPrefetchInCatalog(Graph* graph, int graph_id) { } // end loop over input edges if (in_indexes_for_encap.size() > 0) { - for (auto i : in_indexes_for_encap) { - cout << "Enter Prefetch in catalog " << i << endl; - } - try { NGraphCatalog::AddToPrefetchedInputIndexMap(graph_id, node->name(), in_indexes_for_encap); diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 5a8db0ccb..7255a0081 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -68,24 +68,16 @@ void NGraphTensorManager::Initialize() { m_pipelined_output_indexes = FindComplement(m_number_of_outputs, 
m_output_indexes_assigning_variable); - cout << "TM m_ng_encap_graph_id " << m_ng_encap_graph_id << endl; - cout << "TM m_ng_encap_node_name " << m_ng_encap_node_name << endl; if (NGraphCatalog::ExistsInPrefetchedInputIndexMap(m_ng_encap_graph_id, m_ng_encap_node_name)) { auto prefetch_indexes = NGraphCatalog::GetIndexesFromPrefetchedInputIndexMap( m_ng_encap_graph_id, m_ng_encap_node_name); - for (auto i : m_prefetched_input_indexes) { - cout << "TM " << i << endl; - } m_prefetched_input_indexes.insert(m_prefetched_input_indexes.begin(), prefetch_indexes.begin(), prefetch_indexes.end()); // keeping the indexes sorted, is helpful in general testing sort(m_prefetched_input_indexes.begin(), m_prefetched_input_indexes.end()); - for (auto i : m_prefetched_input_indexes) { - cout << "TM " << i << endl; - } } // the prefetched input indexes will also be pipelined From 5b0bd525576fc7a1076e463ed1d80f6973c12c49 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 16:12:19 -0800 Subject: [PATCH 21/67] minor --- ngraph_bridge/ngraph_deassign_clusters.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/ngraph_bridge/ngraph_deassign_clusters.cc b/ngraph_bridge/ngraph_deassign_clusters.cc index 3d4a1bc4d..2f51b3650 100644 --- a/ngraph_bridge/ngraph_deassign_clusters.cc +++ b/ngraph_bridge/ngraph_deassign_clusters.cc @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ - #include #include #include From bface4032cd24f32f936edfe4e517e69e14e881e Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 17:07:30 -0800 Subject: [PATCH 22/67] fixed tests --- test/test_ngraph_tensor_manager.cpp | 134 +++++++++++++++++++++------- 1 file changed, 102 insertions(+), 32 deletions(-) diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index a4eb96ae2..0efba0163 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -82,21 +82,6 @@ class NGraphTensorManagerTest : public ::testing::Test { iota(vout.begin(), vout.end(), 0); return vout; } - - // Creates ngraph tensor - shared_ptr CreateNGraphScalarTensor( - int value, string backend_type = "INTERPRETER") { - // create scalar tensor - ng::Shape ng_shape_scalar({}); - - // create Backend - auto backend = ng::runtime::Backend::create(backend_type); - - auto temp = backend->create_tensor(ng::element::i32, ng_shape_scalar); - - temp->write(&value, sizeof(value)); - return temp; - } }; TEST(NGraphUtils, FindComplement1) { @@ -136,6 +121,7 @@ TEST_F(NGraphTensorManagerTest, NoVariablesNoPrefetch) { ASSERT_EQ(empty, tensor_manager.GetOutputIndexesAssigningVariables()); ASSERT_EQ(expected_out_indexes_need_copy, tensor_manager.GetOutputIndexesThatNeedCopy()); + // piplined ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexes()); ASSERT_EQ(expected_pipelined_out_indexes, @@ -143,6 +129,13 @@ TEST_F(NGraphTensorManagerTest, NoVariablesNoPrefetch) { // prefetched ASSERT_EQ(empty, tensor_manager.GetPrefetchedInputIndexes()); + ASSERT_EQ(expected_pipelined_inp_indexes, + tensor_manager.GetPipelinedButNotPrefetchedInputIndexes()); + + // prefetched wrt pipelined + ASSERT_EQ(empty, tensor_manager.GetPipelinedInputIndexesThatArePrefetched()); + ASSERT_EQ(expected_pipelined_inp_indexes, + 
tensor_manager.GetPipelinedInputIndexesThatAreNotPrefetched()); } // Tests scenario when the graph has variables but no prefetched inputs @@ -158,18 +151,31 @@ TEST_F(NGraphTensorManagerTest, HasVariablesNoPrefetch) { // expected vector expected_pipelined_inp_indexes, expected_pipelined_out_indexes, expected_var_inp_indexes, expected_var_out_indexes, - expected_out_indexes_need_copy, expected_prefetched_inp_indexes; + expected_out_indexes_need_copy, expected_prefetched_inp_indexes, + expected_pipelined_not_prefetched_input_indexes, + expected_pipelined_input_indexes_prefetched, + expected_pipelined_input_indexes_not_prefetched; + // expected values if (ngraph_tf_are_variables_enabled()) { - // expected values + // pipelined expected_pipelined_inp_indexes = {1, 3, 4}; expected_pipelined_out_indexes = {1}; + // var expected_var_inp_indexes = FindComplement(number_of_inputs, expected_pipelined_inp_indexes); expected_var_out_indexes = FindComplement(number_of_outputs, expected_pipelined_out_indexes); expected_out_indexes_need_copy = {1}; + + // prefetched expected_prefetched_inp_indexes = {}; + expected_pipelined_not_prefetched_input_indexes = + expected_pipelined_inp_indexes; + + // prefetched relative to pipelined tensors + expected_pipelined_input_indexes_prefetched = {}; + expected_pipelined_input_indexes_not_prefetched = {0, 1, 2}; // enter in catalog EnterVarInCatalog(ng_encap_graph_id, ng_encap_node_name, @@ -177,19 +183,29 @@ TEST_F(NGraphTensorManagerTest, HasVariablesNoPrefetch) { expected_out_indexes_need_copy); } else { + // pipelined expected_pipelined_inp_indexes = FillRange(number_of_inputs); expected_pipelined_out_indexes = FillRange(number_of_outputs); - + // var expected_var_inp_indexes = {}; expected_var_out_indexes = {}; expected_out_indexes_need_copy = FillRange(number_of_outputs); + // prefetched expected_prefetched_inp_indexes = {}; + expected_pipelined_not_prefetched_input_indexes = + expected_pipelined_inp_indexes; + + // prefetched relative to 
pipelined tensors + expected_pipelined_input_indexes_prefetched = {}; + expected_pipelined_input_indexes_not_prefetched = + expected_pipelined_not_prefetched_input_indexes; } NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, ng_encap_graph_id, number_of_inputs, number_of_outputs); + // var ASSERT_EQ(expected_var_inp_indexes, tensor_manager.GetInputIndexesFedByVariables()); ASSERT_EQ(expected_var_out_indexes, @@ -197,14 +213,24 @@ TEST_F(NGraphTensorManagerTest, HasVariablesNoPrefetch) { ASSERT_EQ(expected_out_indexes_need_copy, tensor_manager.GetOutputIndexesThatNeedCopy()); - ASSERT_EQ(expected_prefetched_inp_indexes, - tensor_manager.GetPrefetchedInputIndexes()); - + // pipelined ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexes()); ASSERT_EQ(expected_pipelined_out_indexes, tensor_manager.GetPipelinedOutputIndexes()); + // prefetched + ASSERT_EQ(expected_prefetched_inp_indexes, + tensor_manager.GetPrefetchedInputIndexes()); + ASSERT_EQ(expected_pipelined_not_prefetched_input_indexes, + tensor_manager.GetPipelinedButNotPrefetchedInputIndexes()); + + // prefetched wrt pipelined + ASSERT_EQ(expected_pipelined_input_indexes_prefetched, + tensor_manager.GetPipelinedInputIndexesThatArePrefetched()); + ASSERT_EQ(expected_pipelined_input_indexes_not_prefetched, + tensor_manager.GetPipelinedInputIndexesThatAreNotPrefetched()); + // clean up ClearCatalog(); } @@ -219,13 +245,24 @@ TEST_F(NGraphTensorManagerTest, NoVariablesHasPrefetch) { int number_of_outputs = 2; // expected + // var vector empty; + vector expected_out_indexes_need_copy = FillRange(number_of_outputs); + + // pipelined vector expected_pipelined_inp_indexes = FillRange(number_of_inputs); vector expected_pipelined_out_indexes = FillRange(number_of_outputs); - vector expected_out_indexes_need_copy = FillRange(number_of_outputs); + + // prefetched vector expected_prefetched_inp_indexes = {1, 3}; - vector expected_pipelined_inp_indexes_prefetched = { - 
1, 3}; // as all inputs are pipelined + vector expected_pipelined_not_prefetched_input_indexes = {0, 2, 4}; + + // relative to pipelined tensors + // all pipelined are prefetched + vector expected_pipelined_input_indexes_prefetched = + expected_prefetched_inp_indexes; + vector expected_pipelined_input_indexes_not_prefetched = + expected_pipelined_not_prefetched_input_indexes; EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, expected_prefetched_inp_indexes); @@ -239,6 +276,7 @@ TEST_F(NGraphTensorManagerTest, NoVariablesHasPrefetch) { ASSERT_EQ(empty, tensor_manager.GetOutputIndexesAssigningVariables()); ASSERT_EQ(expected_out_indexes_need_copy, tensor_manager.GetOutputIndexesThatNeedCopy()); + // pipelined ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexes()); ASSERT_EQ(expected_pipelined_out_indexes, @@ -247,9 +285,14 @@ TEST_F(NGraphTensorManagerTest, NoVariablesHasPrefetch) { // prefetched ASSERT_EQ(expected_prefetched_inp_indexes, tensor_manager.GetPrefetchedInputIndexes()); - ASSERT_EQ(expected_pipelined_inp_indexes_prefetched, - tensor_manager.GetPipelinedInputIndexesThatArePrefetched()); + ASSERT_EQ(expected_pipelined_not_prefetched_input_indexes, + tensor_manager.GetPipelinedButNotPrefetchedInputIndexes()); + // prefetched wrt pipelined + ASSERT_EQ(expected_pipelined_input_indexes_prefetched, + tensor_manager.GetPipelinedInputIndexesThatArePrefetched()); + ASSERT_EQ(expected_pipelined_input_indexes_not_prefetched, + tensor_manager.GetPipelinedInputIndexesThatAreNotPrefetched()); // clean up ClearCatalog(); } @@ -266,34 +309,53 @@ TEST_F(NGraphTensorManagerTest, VariablesAndPrefetch) { vector expected_pipelined_inp_indexes, expected_pipelined_out_indexes, expected_var_inp_indexes, expected_var_out_indexes, expected_out_indexes_need_copy, expected_prefetched_inp_indexes, - expected_pipelined_inp_indexes_prefetched; + expected_pipelined_not_prefetched_input_indexes, + expected_pipelined_inp_indexes_prefetched, + 
expected_pipelined_inp_indexes_not_prefetched; if (ngraph_tf_are_variables_enabled()) { // expected values + // pipelined expected_pipelined_inp_indexes = {1, 3, 4, 6}; - expected_prefetched_inp_indexes = {3, 6}; - expected_pipelined_inp_indexes_prefetched = {1, 3}; expected_pipelined_out_indexes = {0, 2}; + // var expected_var_inp_indexes = FindComplement(number_of_inputs, expected_pipelined_inp_indexes); expected_var_out_indexes = FindComplement(number_of_outputs, expected_pipelined_out_indexes); expected_out_indexes_need_copy = {2, 3}; + + // prefetched + expected_prefetched_inp_indexes = {3, 6}; + expected_pipelined_not_prefetched_input_indexes = {1, 4}; + + expected_pipelined_inp_indexes_prefetched = {1, 3}; + expected_pipelined_inp_indexes_not_prefetched = {0, 2}; + // enter in catalog EnterVarInCatalog(ng_encap_graph_id, ng_encap_node_name, expected_var_inp_indexes, expected_var_out_indexes, expected_out_indexes_need_copy); } else { + // pipelined expected_pipelined_inp_indexes = FillRange(number_of_inputs); expected_pipelined_out_indexes = FillRange(number_of_outputs); - expected_prefetched_inp_indexes = {3, 6}; - expected_pipelined_inp_indexes_prefetched = { - 3, 6}; // all inputs are pipelined + // var expected_var_inp_indexes = {}; expected_var_out_indexes = {}; expected_out_indexes_need_copy = FillRange(number_of_outputs); + + // prefetched + expected_prefetched_inp_indexes = {3, 6}; + expected_pipelined_not_prefetched_input_indexes = {0, 1, 2, 4, 5}; + + // prefetched wrt to pipelining + expected_pipelined_inp_indexes_prefetched = + expected_prefetched_inp_indexes; // all inputs are pipelined + expected_pipelined_inp_indexes_not_prefetched = + expected_pipelined_not_prefetched_input_indexes; } EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, @@ -310,6 +372,7 @@ TEST_F(NGraphTensorManagerTest, VariablesAndPrefetch) { tensor_manager.GetOutputIndexesAssigningVariables()); ASSERT_EQ(expected_out_indexes_need_copy, 
tensor_manager.GetOutputIndexesThatNeedCopy()); + // pipelined ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexes()); ASSERT_EQ(expected_pipelined_out_indexes, @@ -318,8 +381,15 @@ TEST_F(NGraphTensorManagerTest, VariablesAndPrefetch) { // prefetched ASSERT_EQ(expected_prefetched_inp_indexes, tensor_manager.GetPrefetchedInputIndexes()); + ASSERT_EQ(expected_pipelined_not_prefetched_input_indexes, + tensor_manager.GetPipelinedButNotPrefetchedInputIndexes()); + + // prefetched wrt pipelined ASSERT_EQ(expected_pipelined_inp_indexes_prefetched, tensor_manager.GetPipelinedInputIndexesThatArePrefetched()); + ASSERT_EQ(expected_pipelined_inp_indexes_not_prefetched, + tensor_manager.GetPipelinedInputIndexesThatAreNotPrefetched()); + // clean up ClearCatalog(); } From d7a735f8a7cea966d57cf60be1f0a82037156d1b Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 19:04:50 -0800 Subject: [PATCH 23/67] added log --- examples/CMakeLists.txt | 6 +++ examples/axpy_pipelined_extended.py | 53 +++++++++++++++++-- ngraph_bridge/ngraph_encapsulate_op.cc | 6 ++- .../ngraph_find_replace_prefetchdataset.h | 2 +- test/python/test_axpy_pipelined.py | 10 ++-- 5 files changed, 66 insertions(+), 11 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index fcf41fd07..b7c73cdc1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -26,4 +26,10 @@ execute_process( COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/axpy_pipelined.py ${CMAKE_CURRENT_BINARY_DIR}/axpy_pipelined.py +) + +execute_process( + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${CMAKE_CURRENT_SOURCE_DIR}/axpy_pipelined_extended.py + ${CMAKE_CURRENT_BINARY_DIR}/axpy_pipelined_extended.py ) \ No newline at end of file diff --git a/examples/axpy_pipelined_extended.py b/examples/axpy_pipelined_extended.py index 51d3d95e8..5c06b6d2e 100644 --- a/examples/axpy_pipelined_extended.py +++ b/examples/axpy_pipelined_extended.py @@ 
-41,6 +41,21 @@ def build_simple_model(input_array, c1, c2): return output, pl +def build_simple_model2(input_array, c1, c2): + # Convert the numpy array to TF Tensor + input_f = tf.cast(input_array, tf.float32) + + # Define the Ops + pl = tf.placeholder(dtype=dtypes.int32) + pl_f = tf.cast(pl, tf.float32) + pl1 = tf.placeholder(dtype=dtypes.int32) + pl1_f = tf.cast(pl1, tf.float32) + mul = tf.compat.v1.math.multiply(pl1_f, input_f) + add = tf.compat.v1.math.add(mul, c1) + sub = add - pl_f + output = sub + c2 + return output, pl, pl1 + def build_data_pipeline(input_array, map_function, batch_size): dataset = (tf.data.Dataset.from_tensor_slices( (tf.constant(input_array) @@ -52,7 +67,7 @@ def build_data_pipeline(input_array, map_function, batch_size): return data_to_be_prefetched_and_used, iterator -def run_axpy_pipeline(): +def run_axpy_pipeline_extended(): input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] expected_output_array = [-1, -1, 1, -1, -1, -1, -1, -1, -1] output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0] @@ -84,8 +99,41 @@ def run_axpy_pipeline(): return input_array, output_array, expected_output_array + +def run_axpy_pipeline_extended2(): + input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] + expected_output_array = [-1, -1, 1, -1, -1, -1, -1, -1, -1] + output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0] + map_multiplier = 10 + + map_function = lambda x: x * map_multiplier + batch_size = 1 + pipeline, iterator = build_data_pipeline(input_array, map_function, + batch_size) + + # some constants + c1 = 5.0 + c2 = 10.0 + model, pl1, pl2 = build_simple_model2(pipeline, c1, c2) + + with tf.Session() as sess: + # Initialize the globals and the dataset + sess.run(iterator.initializer) + + for i in range(1, 10): + # Expected value is: + # Change it to run on TF if the model gets too complex + expected_output_array[i - 1] = ( + (input_array[i - 1] * map_multiplier) * (i+4)) + c1 -i + c2 + + # Run one iteration + output = sess.run(model, feed_dict={pl1: i, pl2: (i+4)}) + output_array[i - 1] = 
output[0] + return input_array, output_array, expected_output_array + + def main(_): - input_array, output_array, expected_output_array = run_axpy_pipeline() + input_array, output_array, expected_output_array = run_axpy_pipeline_extended() for i in range(1, 10): print("Iteration:", i, " Input: ", input_array[i - 1], " Output: ", output_array[i - 1], " Expected: ", expected_output_array[i - 1]) @@ -93,6 +141,5 @@ def main(_): if __name__ == '__main__': - os.environ['NGRAPH_TF_BACKEND'] = "INTERPRETER" #os.environ['NGRAPH_TF_USE_PREFETCH'] = "1" tf.app.run(main=main) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 72bc9bff7..5938bcc87 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -114,6 +114,7 @@ NGraphEncapsulateOp::NGraphEncapsulateOp(OpKernelConstruction* ctx) //--------------------------------------------------------------------------- void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, const string& backend_name) { + NGRAPH_VLOG(1)<<"Create Parallel Executor"< encap_subgraph(new Graph(OpRegistry::Global())); @@ -185,6 +186,7 @@ void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, //--------------------------------------------------------------------------- void NGraphEncapsulateOp::CreateLegacyExecutor(OpKernelConstruction* ctx, const string& backend_name) { + NGRAPH_VLOG(1)<<"Create Legacy Executor"< tf_input_tensors; @@ -600,6 +603,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { // ComputeUsingLegacyExecutor //--------------------------------------------------------------------------- void NGraphEncapsulateOp::ComputeUsingLegacyExecutor(OpKernelContext* ctx) { + NGRAPH_VLOG(1)<<"Compute using Legacy Executor"<set_assigned_device_name(prefetch_node->assigned_device_name()); - string new_name = graph->NewName("NGraph" + prefetch_node->name()); + string new_name = 
graph->NewName("NGraph_" + prefetch_node->name()); replacement->set_name(new_name); std::vector edges; diff --git a/test/python/test_axpy_pipelined.py b/test/python/test_axpy_pipelined.py index 2c769edec..6046fc94e 100644 --- a/test/python/test_axpy_pipelined.py +++ b/test/python/test_axpy_pipelined.py @@ -13,19 +13,18 @@ # For eg. when running the test from ngraph-bridge/build_cmake/test/python # you can add this path as below -#sys.path.insert(0, '../../examples') +sys.path.insert(0, '../../examples') from axpy_pipelined import * +from axpy_pipelined_extended import * class TestAxpyPipelined(NgraphTest): def test_axpy_pipelined(self): prefetch_env = "NGRAPH_TF_USE_PREFETCH" - ngraph_backend_i = "NGRAPH_TF_BACKEND" - env_var_map = self.store_env_variables([prefetch_env, ngraph_backend_i]) + env_var_map = self.store_env_variables([prefetch_env]) self.set_env_variable(prefetch_env, "1") - self.set_env_variable(ngraph_backend_i, "INTERPRETER") input_array, output_array, expected_output_array = run_axpy_pipeline() for i in range(1, 10): print("Iteration:", i, " Input: ", input_array[i - 1], " Output: ", @@ -36,5 +35,4 @@ def test_axpy_pipelined(self): output_array[i - 1], expected_output_array[i - 1], atol=1e-3), "Output and expected output values don't match" self.unset_env_variable(prefetch_env) - self.unset_env_variable(ngraph_backend_i) - self.restore_env_variables(env_var_map) + self.restore_env_variables(env_var_map) \ No newline at end of file From fa301de963f31edb779dbf7c421220dbe67e6d1d Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 19:52:25 -0800 Subject: [PATCH 24/67] Added logs --- ngraph_bridge/ngraph_encapsulate_op.cc | 10 +++++----- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 5938bcc87..631531e7c 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ 
b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -114,7 +114,7 @@ NGraphEncapsulateOp::NGraphEncapsulateOp(OpKernelConstruction* ctx) //--------------------------------------------------------------------------- void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, const string& backend_name) { - NGRAPH_VLOG(1)<<"Create Parallel Executor"< encap_subgraph(new Graph(OpRegistry::Global())); @@ -186,7 +186,7 @@ void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, //--------------------------------------------------------------------------- void NGraphEncapsulateOp::CreateLegacyExecutor(OpKernelConstruction* ctx, const string& backend_name) { - NGRAPH_VLOG(1)<<"Create Legacy Executor"< tf_input_tensors; @@ -596,14 +596,14 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { event_return_tensor.Stop(); ngraph::Event::write_trace(event_return_tensor); - NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Done"; + NGRAPH_VLOG(2) << "COMPUTE: Done "<< name(); } //--------------------------------------------------------------------------- // ComputeUsingLegacyExecutor //--------------------------------------------------------------------------- void NGraphEncapsulateOp::ComputeUsingLegacyExecutor(OpKernelContext* ctx) { - NGRAPH_VLOG(1)<<"Compute using Legacy Executor"<resource_manager()->Lookup( @@ -107,6 +108,7 @@ Status GetPipelinedIOTensorsReadyForExecution( NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " "signal prefetching"; } else { + int prefetch_buffer_depth = shared_data->GetBufferDepth(); int skip_count = shared_data->GetSkipCount(); NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth From 366c78fe90433610c85176067b54f4d504dd0003 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 19:53:12 -0800 Subject: [PATCH 25/67] Added prefetch test --- test/python/test_prefetched.py | 140 +++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) 
create mode 100644 test/python/test_prefetched.py diff --git a/test/python/test_prefetched.py b/test/python/test_prefetched.py new file mode 100644 index 000000000..3eab28c4f --- /dev/null +++ b/test/python/test_prefetched.py @@ -0,0 +1,140 @@ +import sys +import pytest +import getpass +import tensorflow as tf +from tensorflow.python.framework import dtypes +tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + +import ngraph_bridge + +import numpy as np +from common import NgraphTest + +import warnings +warnings.filterwarnings('ignore', category=FutureWarning) + +import ngraph_bridge + +class TestPrefetched(NgraphTest): + def build_data_pipeline(self,input_array, map_function, batch_size): + dataset = (tf.data.Dataset.from_tensor_slices( + (tf.constant(input_array) + )).map(map_function).batch(batch_size).prefetch(1)) + + iterator = dataset.make_initializable_iterator() + data_to_be_prefetched_and_used = iterator.get_next() + return data_to_be_prefetched_and_used, iterator + + + def build_model1(self,input_array, c1, c2): + # Convert the numpy array to TF Tensor + input_f = tf.cast(input_array, tf.float32) + + # Define the Ops + pl1 = tf.placeholder(dtype=dtypes.int32) + pl1_f = tf.cast(pl1, tf.float32) + pl2 = tf.placeholder(dtype=dtypes.int32) + pl2_f = tf.cast(pl2, tf.float32) + + mul = tf.compat.v1.math.multiply(input_f, c1) + add = tf.compat.v1.math.add(mul, pl2_f) + add2 = add + pl1_f + output = add2 - c2 + return output, pl1, pl2 + + def build_model2(self,input_array, c1, c2): + # Convert the numpy array to TF Tensor + input_f = tf.cast(input_array, tf.float32) + + # Define the Ops + pl1 = tf.placeholder(dtype=dtypes.int32) + pl1_f = tf.cast(pl1, tf.float32) + pl2 = tf.placeholder(dtype=dtypes.int32) + pl2_f = tf.cast(pl2, tf.float32) + + mul = tf.compat.v1.math.multiply(pl2_f, input_f) + add = tf.compat.v1.math.add(mul, c2) + add2 = add + pl1_f * c1 + output = add2 + return output, pl1, pl2 + + + def __run_test(self, pipeline_creator, model): + 
# build model + input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] + map_multiplier = 10 + map_function = lambda x: x * map_multiplier + batch_size = 1 + pipeline, iterator = pipeline_creator(input_array, map_function, + batch_size) + + # some constants + c1 = 5.0 + c2 = 10.0 + model, pl1, pl2 = model(pipeline, c1, c2) + + outputs=[] + + sess = tf.Session() + + # Initialize the globals and the dataset + sess.run(iterator.initializer) + + for i in range(1, 10): + output = sess.run(model, feed_dict={pl1: i, pl2: i+3}) + outputs.append(output) + + return outputs + + + # test hangs when "NGRAPH_TF_DISABLE_DEASSIGN_CLUSTERS" is set + def test_prefetch1(self): + # set flags + prefetch_env = "NGRAPH_TF_USE_PREFETCH" + env_var_map = self.store_env_variables([prefetch_env]) + self.set_env_variable(prefetch_env, "1") + + # Run on nGraph + ng_outputs = self.__run_test(self.build_data_pipeline,self.build_model1) + + # Reset Graph + tf.reset_default_graph() + + # Run on TF + disable_tf="NGRAPH_TF_DISABLE" + self.set_env_variable(disable_tf, "1") + tf_outputs = self.__run_test(self.build_data_pipeline,self.build_model1) + + # Compare Values + assert np.allclose(ng_outputs, tf_outputs) + + # unset env variable + self.unset_env_variable(prefetch_env) + self.unset_env_variable(disable_tf) + self.restore_env_variables(env_var_map) + + + def test_prefetch2(self): + # set flags + prefetch_env = "NGRAPH_TF_USE_PREFETCH" + env_var_map = self.store_env_variables([prefetch_env]) + self.set_env_variable(prefetch_env, "1") + + # Run on nGraph + ng_outputs = self.__run_test(self.build_data_pipeline, self.build_model2) + + # Reset Graph + tf.reset_default_graph() + + # Run on TF + disable_tf="NGRAPH_TF_DISABLE" + self.set_env_variable(disable_tf, "1") + tf_outputs = self.__run_test(self.build_data_pipeline, self.build_model2) + + # Compare Values + assert np.allclose(ng_outputs, tf_outputs) + + # unset env variable + self.unset_env_variable(prefetch_env) + self.unset_env_variable(disable_tf) + 
self.restore_env_variables(env_var_map) From f052c6ddef65bc60677b8d5f42c8aec5fee48e18 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 19:56:30 -0800 Subject: [PATCH 26/67] Format, removed extended file --- examples/axpy_pipelined_extended.py | 145 ------------------- ngraph_bridge/ngraph_encapsulate_op.cc | 10 +- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 3 +- test/python/test_prefetched.py | 52 +++---- 4 files changed, 33 insertions(+), 177 deletions(-) delete mode 100644 examples/axpy_pipelined_extended.py diff --git a/examples/axpy_pipelined_extended.py b/examples/axpy_pipelined_extended.py deleted file mode 100644 index 5c06b6d2e..000000000 --- a/examples/axpy_pipelined_extended.py +++ /dev/null @@ -1,145 +0,0 @@ -# ============================================================================== -# Copyright 2019 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -import warnings -warnings.filterwarnings('ignore', category=FutureWarning) -import numpy as np - -import tensorflow as tf -from tensorflow.python.framework import dtypes -tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) - -import os -import ngraph_bridge - -import sys - - -def build_simple_model(input_array, c1, c2): - # Convert the numpy array to TF Tensor - input_f = tf.cast(input_array, tf.float32) - - # Define the Ops - pl = tf.placeholder(dtype=dtypes.int32) - pl_f = tf.cast(pl, tf.float32) - mul = tf.compat.v1.math.multiply(input_f, c1) - add = tf.compat.v1.math.add(mul, c2) - add2 = add + pl_f - output = add2 - return output, pl - - -def build_simple_model2(input_array, c1, c2): - # Convert the numpy array to TF Tensor - input_f = tf.cast(input_array, tf.float32) - - # Define the Ops - pl = tf.placeholder(dtype=dtypes.int32) - pl_f = tf.cast(pl, tf.float32) - pl1 = tf.placeholder(dtype=dtypes.int32) - pl1_f = tf.cast(pl1, tf.float32) - mul = tf.compat.v1.math.multiply(pl1_f, input_f) - add = tf.compat.v1.math.add(mul, c1) - sub = add - pl_f - output = sub + c2 - return output, pl, pl1 - -def build_data_pipeline(input_array, map_function, batch_size): - dataset = (tf.data.Dataset.from_tensor_slices( - (tf.constant(input_array) - )).map(map_function).batch(batch_size).prefetch(1)) - - iterator = dataset.make_initializable_iterator() - data_to_be_prefetched_and_used = iterator.get_next() - - return data_to_be_prefetched_and_used, iterator - - -def run_axpy_pipeline_extended(): - input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] - expected_output_array = [-1, -1, 1, -1, -1, -1, -1, -1, -1] - output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0] - map_multiplier = 10 - - map_function = lambda x: x * map_multiplier - batch_size = 1 - pipeline, iterator = build_data_pipeline(input_array, map_function, - batch_size) - - # some constants - c1 = 5.0 - c2 = 10.0 - model, pl = 
build_simple_model(pipeline, c1, c2) - - with tf.Session() as sess: - # Initialize the globals and the dataset - sess.run(iterator.initializer) - - for i in range(1, 10): - # Expected value is: - # Change it to run on TF if the model gets too complex - expected_output_array[i - 1] = ( - (input_array[i - 1] * map_multiplier) * c1) + c2 + i - - # Run one iteration - output = sess.run(model, feed_dict={pl: i}) - output_array[i - 1] = output[0] - return input_array, output_array, expected_output_array - - - -def run_axpy_pipeline_extended2(): - input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] - expected_output_array = [-1, -1, 1, -1, -1, -1, -1, -1, -1] - output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0] - map_multiplier = 10 - - map_function = lambda x: x * map_multiplier - batch_size = 1 - pipeline, iterator = build_data_pipeline(input_array, map_function, - batch_size) - - # some constants - c1 = 5.0 - c2 = 10.0 - model, pl1, pl2 = build_simple_model2(pipeline, c1, c2) - - with tf.Session() as sess: - # Initialize the globals and the dataset - sess.run(iterator.initializer) - - for i in range(1, 10): - # Expected value is: - # Change it to run on TF if the model gets too complex - expected_output_array[i - 1] = ( - (input_array[i - 1] * map_multiplier) * (i+4)) + c1 -i + c2 - - # Run one iteration - output = sess.run(model, feed_dict={pl1: i, pl2: (i+4)}) - output_array[i - 1] = output[0] - return input_array, output_array, expected_output_array - - -def main(_): - input_array, output_array, expected_output_array = run_axpy_pipeline_extended() - for i in range(1, 10): - print("Iteration:", i, " Input: ", input_array[i - 1], " Output: ", - output_array[i - 1], " Expected: ", expected_output_array[i - 1]) - sys.stdout.flush() - - -if __name__ == '__main__': - #os.environ['NGRAPH_TF_USE_PREFETCH'] = "1" - tf.app.run(main=main) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 631531e7c..fd2b52bb4 100644 --- 
a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -114,7 +114,7 @@ NGraphEncapsulateOp::NGraphEncapsulateOp(OpKernelConstruction* ctx) //--------------------------------------------------------------------------- void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, const string& backend_name) { - NGRAPH_VLOG(1)<<"Create Parallel Executor "< encap_subgraph(new Graph(OpRegistry::Global())); @@ -186,7 +186,7 @@ void NGraphEncapsulateOp::CreateParallelExecutor(OpKernelConstruction* ctx, //--------------------------------------------------------------------------- void NGraphEncapsulateOp::CreateLegacyExecutor(OpKernelConstruction* ctx, const string& backend_name) { - NGRAPH_VLOG(1)<<"Create Legacy Executor "< tf_input_tensors; @@ -596,14 +596,14 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { event_return_tensor.Stop(); ngraph::Event::write_trace(event_return_tensor); - NGRAPH_VLOG(2) << "COMPUTE: Done "<< name(); + NGRAPH_VLOG(2) << "COMPUTE: Done " << name(); } //--------------------------------------------------------------------------- // ComputeUsingLegacyExecutor //--------------------------------------------------------------------------- void NGraphEncapsulateOp::ComputeUsingLegacyExecutor(OpKernelContext* ctx) { - NGRAPH_VLOG(1)<<"Compute using Legacy Executor "<resource_manager()->Lookup( @@ -108,7 +108,6 @@ Status GetPipelinedIOTensorsReadyForExecution( NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: Creating the shared object to " "signal prefetching"; } else { - int prefetch_buffer_depth = shared_data->GetBufferDepth(); int skip_count = shared_data->GetSkipCount(); NGRAPH_VLOG(2) << "[PREFETCH] COMPUTE: DEPTH: " << prefetch_buffer_depth diff --git a/test/python/test_prefetched.py b/test/python/test_prefetched.py index 3eab28c4f..deb5008bd 100644 --- a/test/python/test_prefetched.py +++ b/test/python/test_prefetched.py @@ -15,8 +15,10 @@ import ngraph_bridge + class 
TestPrefetched(NgraphTest): - def build_data_pipeline(self,input_array, map_function, batch_size): + + def build_data_pipeline(self, input_array, map_function, batch_size): dataset = (tf.data.Dataset.from_tensor_slices( (tf.constant(input_array) )).map(map_function).batch(batch_size).prefetch(1)) @@ -25,8 +27,7 @@ def build_data_pipeline(self,input_array, map_function, batch_size): data_to_be_prefetched_and_used = iterator.get_next() return data_to_be_prefetched_and_used, iterator - - def build_model1(self,input_array, c1, c2): + def build_model1(self, input_array, c1, c2): # Convert the numpy array to TF Tensor input_f = tf.cast(input_array, tf.float32) @@ -41,8 +42,8 @@ def build_model1(self,input_array, c1, c2): add2 = add + pl1_f output = add2 - c2 return output, pl1, pl2 - - def build_model2(self,input_array, c1, c2): + + def build_model2(self, input_array, c1, c2): # Convert the numpy array to TF Tensor input_f = tf.cast(input_array, tf.float32) @@ -51,41 +52,39 @@ def build_model2(self,input_array, c1, c2): pl1_f = tf.cast(pl1, tf.float32) pl2 = tf.placeholder(dtype=dtypes.int32) pl2_f = tf.cast(pl2, tf.float32) - + mul = tf.compat.v1.math.multiply(pl2_f, input_f) add = tf.compat.v1.math.add(mul, c2) add2 = add + pl1_f * c1 output = add2 return output, pl1, pl2 - - def __run_test(self, pipeline_creator, model): + def __run_test(self, pipeline_creator, model): # build model input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] map_multiplier = 10 map_function = lambda x: x * map_multiplier batch_size = 1 pipeline, iterator = pipeline_creator(input_array, map_function, - batch_size) + batch_size) # some constants c1 = 5.0 c2 = 10.0 model, pl1, pl2 = model(pipeline, c1, c2) - outputs=[] - + outputs = [] + sess = tf.Session() # Initialize the globals and the dataset sess.run(iterator.initializer) for i in range(1, 10): - output = sess.run(model, feed_dict={pl1: i, pl2: i+3}) + output = sess.run(model, feed_dict={pl1: i, pl2: i + 3}) outputs.append(output) - - return outputs 
+ return outputs # test hangs when "NGRAPH_TF_DISABLE_DEASSIGN_CLUSTERS" is set def test_prefetch1(self): @@ -93,47 +92,50 @@ def test_prefetch1(self): prefetch_env = "NGRAPH_TF_USE_PREFETCH" env_var_map = self.store_env_variables([prefetch_env]) self.set_env_variable(prefetch_env, "1") - + # Run on nGraph - ng_outputs = self.__run_test(self.build_data_pipeline,self.build_model1) + ng_outputs = self.__run_test(self.build_data_pipeline, + self.build_model1) # Reset Graph tf.reset_default_graph() # Run on TF - disable_tf="NGRAPH_TF_DISABLE" + disable_tf = "NGRAPH_TF_DISABLE" self.set_env_variable(disable_tf, "1") - tf_outputs = self.__run_test(self.build_data_pipeline,self.build_model1) + tf_outputs = self.__run_test(self.build_data_pipeline, + self.build_model1) # Compare Values assert np.allclose(ng_outputs, tf_outputs) - + # unset env variable self.unset_env_variable(prefetch_env) self.unset_env_variable(disable_tf) self.restore_env_variables(env_var_map) - def test_prefetch2(self): # set flags prefetch_env = "NGRAPH_TF_USE_PREFETCH" env_var_map = self.store_env_variables([prefetch_env]) self.set_env_variable(prefetch_env, "1") - + # Run on nGraph - ng_outputs = self.__run_test(self.build_data_pipeline, self.build_model2) + ng_outputs = self.__run_test(self.build_data_pipeline, + self.build_model2) # Reset Graph tf.reset_default_graph() # Run on TF - disable_tf="NGRAPH_TF_DISABLE" + disable_tf = "NGRAPH_TF_DISABLE" self.set_env_variable(disable_tf, "1") - tf_outputs = self.__run_test(self.build_data_pipeline, self.build_model2) + tf_outputs = self.__run_test(self.build_data_pipeline, + self.build_model2) # Compare Values assert np.allclose(ng_outputs, tf_outputs) - + # unset env variable self.unset_env_variable(prefetch_env) self.unset_env_variable(disable_tf) From 120621cf45574473315e4a7e6e37f125b1602ea8 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 11 Dec 2019 20:09:55 -0800 Subject: [PATCH 27/67] minor --- examples/CMakeLists.txt | 6 ------ 
test/python/test_axpy_pipelined.py | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b7c73cdc1..0666fee83 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -27,9 +27,3 @@ execute_process( ${CMAKE_CURRENT_SOURCE_DIR}/axpy_pipelined.py ${CMAKE_CURRENT_BINARY_DIR}/axpy_pipelined.py ) - -execute_process( - COMMAND ${CMAKE_COMMAND} -E create_symlink - ${CMAKE_CURRENT_SOURCE_DIR}/axpy_pipelined_extended.py - ${CMAKE_CURRENT_BINARY_DIR}/axpy_pipelined_extended.py -) \ No newline at end of file diff --git a/test/python/test_axpy_pipelined.py b/test/python/test_axpy_pipelined.py index 6046fc94e..42163ecec 100644 --- a/test/python/test_axpy_pipelined.py +++ b/test/python/test_axpy_pipelined.py @@ -16,7 +16,6 @@ sys.path.insert(0, '../../examples') from axpy_pipelined import * -from axpy_pipelined_extended import * class TestAxpyPipelined(NgraphTest): @@ -35,4 +34,4 @@ def test_axpy_pipelined(self): output_array[i - 1], expected_output_array[i - 1], atol=1e-3), "Output and expected output values don't match" self.unset_env_variable(prefetch_env) - self.restore_env_variables(env_var_map) \ No newline at end of file + self.restore_env_variables(env_var_map) From 7fc95071dbad4682c72fc3821730139d5dc1c1ef Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 12 Dec 2019 12:14:43 -0800 Subject: [PATCH 28/67] fixed test --- .../ngraph_find_replace_prefetchdataset.h | 3 ++- ngraph_bridge/ngraph_tensor_manager.cc | 2 +- test/test_ngraph_tensor_manager.cpp | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/ngraph_bridge/ngraph_find_replace_prefetchdataset.h b/ngraph_bridge/ngraph_find_replace_prefetchdataset.h index 7eef658c4..360517665 100644 --- a/ngraph_bridge/ngraph_find_replace_prefetchdataset.h +++ b/ngraph_bridge/ngraph_find_replace_prefetchdataset.h @@ -129,9 +129,10 @@ Status ReplacePrefetch(Graph* graph, Node* prefetch_node) { } // Finally 
remove the current preftetch node - graph->RemoveNode(prefetch_node); NGRAPH_VLOG(4) << "Replaced TF Prefetch Node " << prefetch_node->name() << " with NG Prefetch Node " << replacement->name(); + graph->RemoveNode(prefetch_node); + return Status::OK(); } diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 7255a0081..a659cfa65 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -95,7 +95,7 @@ void NGraphTensorManager::Initialize() { // complements m_pipelined_input_indexes_not_prefetched = FindComplement( - m_pipelined_input_indexes, m_pipelined_input_indexes_prefetched); + m_pipelined_input_indexes.size(), m_pipelined_input_indexes_prefetched); m_pipelined_not_prefetched_input_indexes = FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); } diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index 0efba0163..7eb435bdd 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -107,15 +107,20 @@ TEST_F(NGraphTensorManagerTest, NoVariablesNoPrefetch) { int number_of_inputs = 5; int number_of_outputs = 2; - NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, - ng_encap_graph_id, number_of_inputs, - number_of_outputs); // expected vector empty; vector expected_pipelined_inp_indexes = FillRange(number_of_inputs); vector expected_pipelined_out_indexes = FillRange(number_of_outputs); vector expected_out_indexes_need_copy = FillRange(number_of_outputs); + if (ngraph_tf_are_variables_enabled()) { + EnterVarInCatalog(ng_encap_graph_id, ng_encap_node_name, empty, empty, + expected_out_indexes_need_copy); + } + NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, + ng_encap_graph_id, number_of_inputs, + number_of_outputs); + // var related ASSERT_EQ(empty, tensor_manager.GetInputIndexesFedByVariables()); ASSERT_EQ(empty, 
tensor_manager.GetOutputIndexesAssigningVariables()); @@ -136,6 +141,8 @@ TEST_F(NGraphTensorManagerTest, NoVariablesNoPrefetch) { ASSERT_EQ(empty, tensor_manager.GetPipelinedInputIndexesThatArePrefetched()); ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexesThatAreNotPrefetched()); + // clean up + ClearCatalog(); } // Tests scenario when the graph has variables but no prefetched inputs @@ -264,6 +271,11 @@ TEST_F(NGraphTensorManagerTest, NoVariablesHasPrefetch) { vector expected_pipelined_input_indexes_not_prefetched = expected_pipelined_not_prefetched_input_indexes; + if (ngraph_tf_are_variables_enabled()) { + EnterVarInCatalog(ng_encap_graph_id, ng_encap_node_name, empty, empty, + expected_out_indexes_need_copy); + } + EnterPrefetchInCatalog(ng_encap_graph_id, ng_encap_node_name, expected_prefetched_inp_indexes); From 6d4c03705bf6d3e36fe5c04da9873a5acdc80ee4 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 12 Dec 2019 13:42:16 -0800 Subject: [PATCH 29/67] FindComplement modified --- ngraph_bridge/ngraph_utils.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ngraph_bridge/ngraph_utils.cc b/ngraph_bridge/ngraph_utils.cc index 240ed82c6..e3cad7be8 100644 --- a/ngraph_bridge/ngraph_utils.cc +++ b/ngraph_bridge/ngraph_utils.cc @@ -48,13 +48,7 @@ vector FindComplement(const int& max_element, vector superset(max_element); iota(begin(superset), end(superset), 0); - // max size of complement is superset - vector complement(superset.size()); - vector::iterator it = set_difference( - superset.begin(), superset.begin() + superset.size(), element_set.begin(), - element_set.begin() + element_set.size(), complement.begin()); - complement.resize(it - complement.begin()); - return complement; + return FindComplement(superset, element_set); } // Finds the complement of element_set From 31620bb1c00393560d5e8328aab6b017dfe06945 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 12 Dec 2019 18:46:20 -0800 
Subject: [PATCH 30/67] Apply suggestions from code review Co-Authored-By: kanvi-nervana --- ngraph_bridge/ngraph_prefetch_shared_data.h | 2 +- test/test_enter_prefetch_in_catalog.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_prefetch_shared_data.h b/ngraph_bridge/ngraph_prefetch_shared_data.h index 8ce683efd..f92bfb6a0 100644 --- a/ngraph_bridge/ngraph_prefetch_shared_data.h +++ b/ngraph_bridge/ngraph_prefetch_shared_data.h @@ -90,7 +90,7 @@ class NGraphPrefetchSharedResouce : public ResourceBase { m_ng_2_tf.Add(std::move(next)); } - // Returns the Input output tensors to be ready to be executed by NG device + // Returns the Input output tensors ready to be executed by NG device // This will be called by the NGEncOp IOTensorBundle GetNextIOTensorBundleReadyForDeviceExecution() { return std::move(m_ng_2_tf.GetNextAvailable()); diff --git a/test/test_enter_prefetch_in_catalog.cc b/test/test_enter_prefetch_in_catalog.cc index 2d23af0c3..08cb4a6f5 100644 --- a/test/test_enter_prefetch_in_catalog.cc +++ b/test/test_enter_prefetch_in_catalog.cc @@ -112,11 +112,11 @@ TEST(PrefetchCatalogTest, SmallGraph2) { // Clean up NGraphCatalog::ClearCatalog(); - // Unset, Restore env flga + // Unset, restore env flags UnsetEnvVariable("NGRAPH_TF_USE_PREFETCH"); RestoreEnv(env_map); } } // namespace testing } // namespace ngraph_bridge -} // namespace tensorflow \ No newline at end of file +} // namespace tensorflow From 19154934514964a87090a4c4092883e0458fa80e Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 12 Dec 2019 18:47:22 -0800 Subject: [PATCH 31/67] incorporate review comemnts --- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 3 +-- ngraph_bridge/ngraph_prefetch_shared_data.h | 6 +----- test/python/test_axpy_pipelined.py | 2 +- test/python/test_prefetched.py | 18 ++++++++++++++++++ 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc 
b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 236e15923..7f3da4a32 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -71,8 +71,7 @@ Status GetPipelinedIOTensorsReadyForExecution( // 1. Create the shared data object // 2. We get another pipelined tensor pair for the current iteration and // add it to the shared data. It will be accessed by prefetcher to copy - // the - // prefetched inputs to device + // the prefetched inputs to device auto ng_prefetch_input_indexes = tensor_manager->GetPipelinedInputIndexesThatArePrefetched(); diff --git a/ngraph_bridge/ngraph_prefetch_shared_data.h b/ngraph_bridge/ngraph_prefetch_shared_data.h index 8ce683efd..cb736798c 100644 --- a/ngraph_bridge/ngraph_prefetch_shared_data.h +++ b/ngraph_bridge/ngraph_prefetch_shared_data.h @@ -39,11 +39,9 @@ namespace ngraph_bridge { class NGraphPrefetchSharedResouce : public ResourceBase { public: explicit NGraphPrefetchSharedResouce(const std::string& ng_enc_op_name, - // const std::string& backend_name, int cluster_id, int graph_id, const vector prefetch_input_indexes) : m_ng_enc_op_name(ng_enc_op_name), - // m_backend_name(backend_name), m_graph_id(graph_id), m_cluster_id(cluster_id), m_prefetch_input_indexes(prefetch_input_indexes) {} @@ -54,7 +52,6 @@ class NGraphPrefetchSharedResouce : public ResourceBase { // Returns memory used by this resource. 
int64 MemoryUsed() const override { return 0; } std::string GetName() const { return m_ng_enc_op_name; } - // std::string GetBackendName() const { return m_backend_name; } int GetGraphId() const { return m_graph_id; } int GetClusterId() const { return m_cluster_id; } @@ -90,7 +87,7 @@ class NGraphPrefetchSharedResouce : public ResourceBase { m_ng_2_tf.Add(std::move(next)); } - // Returns the Input output tensors to be ready to be executed by NG device + // Returns the Input output tensors ready to be executed by NG device // This will be called by the NGEncOp IOTensorBundle GetNextIOTensorBundleReadyForDeviceExecution() { return std::move(m_ng_2_tf.GetNextAvailable()); @@ -126,7 +123,6 @@ class NGraphPrefetchSharedResouce : public ResourceBase { private: const std::string m_ng_enc_op_name; - // const std::string m_backend_name; const int m_graph_id; const int m_cluster_id; const vector m_prefetch_input_indexes; diff --git a/test/python/test_axpy_pipelined.py b/test/python/test_axpy_pipelined.py index 42163ecec..c3607d973 100644 --- a/test/python/test_axpy_pipelined.py +++ b/test/python/test_axpy_pipelined.py @@ -13,7 +13,7 @@ # For eg. when running the test from ngraph-bridge/build_cmake/test/python # you can add this path as below -sys.path.insert(0, '../../examples') +# sys.path.insert(0, '../../examples') from axpy_pipelined import * diff --git a/test/python/test_prefetched.py b/test/python/test_prefetched.py index deb5008bd..b964bb63e 100644 --- a/test/python/test_prefetched.py +++ b/test/python/test_prefetched.py @@ -1,3 +1,21 @@ +# ============================================================================== +# Copyright 2018-2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""nGraph TensorFlow bridge prefetch test + +""" import sys import pytest import getpass From a4f9e1fb0500cb9715634003c6798554fa75c7e6 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 13 Dec 2019 14:20:20 -0800 Subject: [PATCH 32/67] addressed review comments --- ngraph_bridge/ngraph_encapsulate_op.cc | 3 ++- ngraph_bridge/ngraph_encapsulate_op_utils.h | 15 +++++++++++++++ ngraph_bridge/ngraph_prefetch_shared_data.h | 10 ++++------ test/test_ngraph_tensor_manager.cpp | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index fd2b52bb4..9a48d8c92 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -499,7 +499,8 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); - // Assume All inputs and outputs are pipelined + // All inputs and outputs are pipelined. 
+ // Of all these pipelined inputs some are prefetched // TODO: Fit in variables ng_inputs = get<1>(pipelined_io_tensors); ng_outputs = get<2>(pipelined_io_tensors); diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h index 33d1476d2..7f48eb09c 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.h +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.h @@ -30,6 +30,21 @@ namespace tensorflow { namespace ngraph_bridge { +// This function does the following +// 1. Gets pipelined tensors for current execution from pipelined tensor store +// (PTS) +// 2. If prefetch is enabled +// a. if prefetch shared resource is not created +// creates it +// gets next set of tensors from PTS and adds it to the shared +// object for prefetching +// b. else +// gets the tensors from prefetch object and adds the tensors from +// step 1 to the prefetch object +// 3. Copies the tf input tensors that are not prefetched to the ngraph +// pipelined input tensors +// + Status GetPipelinedIOTensorsReadyForExecution( OpKernelContext* ctx, vector& tf_input_tensors, shared_ptr& pipelined_tensor_store, diff --git a/ngraph_bridge/ngraph_prefetch_shared_data.h b/ngraph_bridge/ngraph_prefetch_shared_data.h index cb736798c..d42abd7cf 100644 --- a/ngraph_bridge/ngraph_prefetch_shared_data.h +++ b/ngraph_bridge/ngraph_prefetch_shared_data.h @@ -38,9 +38,9 @@ namespace ngraph_bridge { class NGraphPrefetchSharedResouce : public ResourceBase { public: - explicit NGraphPrefetchSharedResouce(const std::string& ng_enc_op_name, - int cluster_id, int graph_id, - const vector prefetch_input_indexes) + explicit NGraphPrefetchSharedResouce( + const std::string& ng_enc_op_name, int cluster_id, int graph_id, + const vector& prefetch_input_indexes) : m_ng_enc_op_name(ng_enc_op_name), m_graph_id(graph_id), m_cluster_id(cluster_id), @@ -140,9 +140,7 @@ class NGraphPrefetchSharedResouce : public ResourceBase { // 1 NGEncOp pushes the Input/Output tensors to m_ng_2_tf 
queue // 2 // Prefetcher pulls Input/Output tensors out of m_ng_2_tf queue and - // copies - // TF - // data to the prefetched inputs + // and copies TF data to the prefetched inputs // Prefetcher pushes this item to the m_tf_2_ng queue // NGEncOp pushes the Input/Output tensors to m_ng_2_tf queue // NGEncOp pulls Input/Output tensors from m_tf_2_ng (from previous diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index 7eb435bdd..d92779efc 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -126,7 +126,7 @@ TEST_F(NGraphTensorManagerTest, NoVariablesNoPrefetch) { ASSERT_EQ(empty, tensor_manager.GetOutputIndexesAssigningVariables()); ASSERT_EQ(expected_out_indexes_need_copy, tensor_manager.GetOutputIndexesThatNeedCopy()); - // piplined + // pipelined ASSERT_EQ(expected_pipelined_inp_indexes, tensor_manager.GetPipelinedInputIndexes()); ASSERT_EQ(expected_pipelined_out_indexes, From 85368442e479d230061f3b21d3f6fd85a58ed6a2 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 13 Dec 2019 14:47:14 -0800 Subject: [PATCH 33/67] remove test-prefetch-2 --- test/python/test_prefetched.py | 43 ---------------------------------- 1 file changed, 43 deletions(-) diff --git a/test/python/test_prefetched.py b/test/python/test_prefetched.py index b964bb63e..a501474cd 100644 --- a/test/python/test_prefetched.py +++ b/test/python/test_prefetched.py @@ -61,22 +61,6 @@ def build_model1(self, input_array, c1, c2): output = add2 - c2 return output, pl1, pl2 - def build_model2(self, input_array, c1, c2): - # Convert the numpy array to TF Tensor - input_f = tf.cast(input_array, tf.float32) - - # Define the Ops - pl1 = tf.placeholder(dtype=dtypes.int32) - pl1_f = tf.cast(pl1, tf.float32) - pl2 = tf.placeholder(dtype=dtypes.int32) - pl2_f = tf.cast(pl2, tf.float32) - - mul = tf.compat.v1.math.multiply(pl2_f, input_f) - add = tf.compat.v1.math.add(mul, c2) - add2 = add + pl1_f * c1 - output = add2 - return 
output, pl1, pl2 - def __run_test(self, pipeline_creator, model): # build model input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -131,30 +115,3 @@ def test_prefetch1(self): self.unset_env_variable(prefetch_env) self.unset_env_variable(disable_tf) self.restore_env_variables(env_var_map) - - def test_prefetch2(self): - # set flags - prefetch_env = "NGRAPH_TF_USE_PREFETCH" - env_var_map = self.store_env_variables([prefetch_env]) - self.set_env_variable(prefetch_env, "1") - - # Run on nGraph - ng_outputs = self.__run_test(self.build_data_pipeline, - self.build_model2) - - # Reset Graph - tf.reset_default_graph() - - # Run on TF - disable_tf = "NGRAPH_TF_DISABLE" - self.set_env_variable(disable_tf, "1") - tf_outputs = self.__run_test(self.build_data_pipeline, - self.build_model2) - - # Compare Values - assert np.allclose(ng_outputs, tf_outputs) - - # unset env variable - self.unset_env_variable(prefetch_env) - self.unset_env_variable(disable_tf) - self.restore_env_variables(env_var_map) From bf8e91892764f89589740533f02138791b59aaf4 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 13 Dec 2019 15:35:30 -0800 Subject: [PATCH 34/67] renamed the vars for indexes relative to pipelined indexes --- ngraph_bridge/ngraph_tensor_manager.cc | 7 ++++--- ngraph_bridge/ngraph_tensor_manager.h | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index a659cfa65..422b217ed 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -89,13 +89,14 @@ void NGraphTensorManager::Initialize() { to_string(pref_index) + " not found in pipelined inputs."); } - m_pipelined_input_indexes_prefetched.push_back( + m_pipelined_input_indexes_that_are_prefetched.push_back( position - m_pipelined_input_indexes.begin()); } // complements - m_pipelined_input_indexes_not_prefetched = FindComplement( - m_pipelined_input_indexes.size(), 
m_pipelined_input_indexes_prefetched); + m_pipelined_input_indexes_that_are_not_prefetched = + FindComplement(m_pipelined_input_indexes.size(), + m_pipelined_input_indexes_that_are_prefetched); m_pipelined_not_prefetched_input_indexes = FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); } diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 2fe2aa0a7..47f118f86 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -84,12 +84,12 @@ class NGraphTensorManager { // wrt to pipelined inputs const vector& GetPipelinedInputIndexesThatArePrefetched() { - return m_pipelined_input_indexes_prefetched; + return m_pipelined_input_indexes_that_are_prefetched; } // wrt to pipelined inputs const vector& GetPipelinedInputIndexesThatAreNotPrefetched() { - return m_pipelined_input_indexes_not_prefetched; + return m_pipelined_input_indexes_that_are_not_prefetched; } private: @@ -112,8 +112,8 @@ class NGraphTensorManager { vector m_pipelined_input_indexes; vector m_pipelined_output_indexes; // indexes wrt pipelined inputs - vector m_pipelined_input_indexes_prefetched; - vector m_pipelined_input_indexes_not_prefetched; + vector m_pipelined_input_indexes_that_are_prefetched; + vector m_pipelined_input_indexes_that_are_not_prefetched; // indexes wrt all inputs vector m_prefetched_input_indexes; From b2300b76f0556561446a933da5b10c3c3d293fa5 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 13 Dec 2019 15:53:46 -0800 Subject: [PATCH 35/67] examples --- examples/axpy_pipelined.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/axpy_pipelined.py b/examples/axpy_pipelined.py index 8bf3dc2ef..00a9aec1d 100644 --- a/examples/axpy_pipelined.py +++ b/examples/axpy_pipelined.py @@ -86,6 +86,5 @@ def main(_): if __name__ == '__main__': - os.environ['NGRAPH_TF_BACKEND'] = "INTERPRETER" #os.environ['NGRAPH_TF_USE_PREFETCH'] = "1" tf.app.run(main=main) From 
5222150b78a0b5d8fe06dbd868679e6941b95379 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Fri, 13 Dec 2019 17:36:23 -0800 Subject: [PATCH 36/67] fixed hang seen when disable deassign --- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 7 ++++++- test/python/test_prefetched.py | 1 - 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 7f3da4a32..2bb71db76 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -57,8 +57,13 @@ Status GetPipelinedIOTensorsReadyForExecution( } bool skip_tf2ng_copy = false; + // Prefetch only if there are input tensors that are prefetched && prefetch + // has been requested + // [TODO] we support prefetching only when there is atmost 1 encap + // that has prefetched inputs if (std::getenv(NGraphPrefetchSharedResouce::NGRAPH_TF_USE_PREFETCH) != - nullptr) { + nullptr && + !(tensor_manager->GetPipelinedInputIndexesThatArePrefetched()).empty()) { NGRAPH_VLOG(2) << "[PREFETCH] NGRAPH_TF_USE_PREFETCH Set"; // Set the prefetch shared obj if applicable NGraphPrefetchSharedResouce* shared_data = nullptr; diff --git a/test/python/test_prefetched.py b/test/python/test_prefetched.py index a501474cd..ea3357436 100644 --- a/test/python/test_prefetched.py +++ b/test/python/test_prefetched.py @@ -88,7 +88,6 @@ def __run_test(self, pipeline_creator, model): return outputs - # test hangs when "NGRAPH_TF_DISABLE_DEASSIGN_CLUSTERS" is set def test_prefetch1(self): # set flags prefetch_env = "NGRAPH_TF_USE_PREFETCH" From f6baec4df46aae451b529001350967da082e29c0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Sat, 14 Dec 2019 16:42:09 -0800 Subject: [PATCH 37/67] Extended TM to store variable shared name --- ngraph_bridge/ngraph_catalog.cc | 10 +++- ngraph_bridge/ngraph_catalog.h | 4 ++ ngraph_bridge/ngraph_tensor_manager.cc | 75 ++++++++++++++++++++++++++ 
ngraph_bridge/ngraph_tensor_manager.h | 17 ++++++ 4 files changed, 104 insertions(+), 2 deletions(-) diff --git a/ngraph_bridge/ngraph_catalog.cc b/ngraph_bridge/ngraph_catalog.cc index 95b65f506..169478467 100644 --- a/ngraph_bridge/ngraph_catalog.cc +++ b/ngraph_bridge/ngraph_catalog.cc @@ -171,12 +171,18 @@ bool NGraphCatalog::ExistsInEncapOutputInfoMap(const string& key) { bool NGraphCatalog::ExistsInEncapOutputInfoMap(const int& graphid, const string& node_name, - const int& input_index) { - string key = NGraphCatalog::CreateNodeKey(graphid, node_name, input_index); + const int& output_index) { + string key = NGraphCatalog::CreateNodeKey(graphid, node_name, output_index); auto itr = NGraphCatalog::encap_output_info_map_.find(key); return itr != NGraphCatalog::encap_output_info_map_.end(); } +const tuple& NGraphCatalog::GetInfoFromEncapOutputInfoMap( + const int& graphid, const string& node_name, const int& output_index) { + string key = NGraphCatalog::CreateNodeKey(graphid, node_name, output_index); + return NGraphCatalog::GetInfoFromEncapOutputInfoMap(key); +} + const tuple& NGraphCatalog::GetInfoFromEncapOutputInfoMap( const string& key) { return NGraphCatalog::encap_output_info_map_.at(key); diff --git a/ngraph_bridge/ngraph_catalog.h b/ngraph_bridge/ngraph_catalog.h index 85c4c45a2..c27b3bccf 100644 --- a/ngraph_bridge/ngraph_catalog.h +++ b/ngraph_bridge/ngraph_catalog.h @@ -138,6 +138,10 @@ class NGraphCatalog { const int& output_index); static const tuple& GetInfoFromEncapOutputInfoMap( const string& key); + + static const tuple& GetInfoFromEncapOutputInfoMap( + const int& graphid, const string& node_name, const int& output_index); + static const string& GetVariableSharedNameFromEncapOutputInfoMap( const string& key); static const bool& GetCopyToTFFromEncapOutputInfoMap(const string& key); diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 422b217ed..e21a8663f 100644 --- 
a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -42,16 +42,44 @@ NGraphTensorManager::NGraphTensorManager(const string ng_encap_node_name, void NGraphTensorManager::Initialize() { #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) + + // input variables book-keeping for (int index = 0; index < m_number_of_inputs; index++) { if (NGraphCatalog::ExistsInInputVariableSharedNameMap( m_ng_encap_graph_id, m_ng_encap_node_name, index)) { m_input_indexes_from_variables.push_back(index); + // store the variable shared name + try { + auto shared_name = NGraphCatalog::GetInputVariableSharedName( + m_ng_encap_graph_id, m_ng_encap_node_name, index); + input_variable_shared_name_map.insert({index, shared_name}); + } catch { + throw runtime_error( + "Could not find variable shared name in catalog for input index " + + to_string(index) + "for encapsulate op " + m_ng_encap_node_name); + } } } + + // output variables book-keeping + // these weights are updated in place for (int index = 0; index < m_number_of_outputs; index++) { if (NGraphCatalog::ExistsInEncapOutputInfoMap( m_ng_encap_graph_id, m_ng_encap_node_name, index)) { m_output_indexes_assigning_variable.push_back(index); + + // store the output variable shared name + copy_to_tf info + try { + auto shared_name_copy_to_tf = + NGraphCatalog::GetInfoFromEncapOutputInfoMap( + m_ng_encap_graph_id, m_ng_encap_node_name, index); + output_variable_info_map.insert({index, shared_name_copy_to_tf}); + } catch { + throw runtime_error( + "Could not find variable shared name and copy_to_tf information in " + "catalog for output index " + + to_string(index) + " for encapsulate op " + m_ng_encap_node_name); + } } if (NGraphCatalog::EncapOutputIndexNeedsCopy(m_ng_encap_graph_id, m_ng_encap_node_name, index)) { @@ -106,5 +134,52 @@ void NGraphTensorManager::Initialize() { //--------------------------------------------------------------------------- NGraphTensorManager::~NGraphTensorManager() {} 
+//--------------------------------------------------------------------------- +// NGraphTensorManager::GetInputVariableSharedName +//--------------------------------------------------------------------------- +Status NGraphTensorManager::GetInputVariableSharedName( + const int& input_index, string* input_var_shared_name) { + auto itr = input_variable_shared_name_map.find(input_index); + if (itr == input_variable_shared_name_map.end()) { + return errors::Internal( + "Could not find shared name for input index in tensor manager ", + input_index); + } + *input_var_shared_name = itr->second; + return Status::OK(); +} + +//--------------------------------------------------------------------------- +// NGraphTensorManager::GetOutputVariableSharedName +//--------------------------------------------------------------------------- +Status NGraphTensorManager::GetOutputVariableSharedName( + const int& output_index, string* output_var_shared_name) { + auto itr = output_variable_info_map.find(output_index); + if (itr == output_variable_info_map.end()) { + return errors::Internal( + "Could not find shared name and copy_to_tf info for output index in " + "tensor manager ", + output_index); + } + *output_var_shared_name = get<0>(itr->second); + return Status::OK(); +} + +//--------------------------------------------------------------------------- +// NGraphTensorManager::GetOutputVariableCopyToTF +//--------------------------------------------------------------------------- +Status NGraphTensorManager::GetOutputVariableCopyToTF( + const int& output_index, bool* output_var_copy_to_tf) { + auto itr = output_variable_info_map.find(output_index); + if (itr == output_variable_info_map.end()) { + return errors::Internal( + "Could not find shared name and copy_to_tf info for output index in " + "tensor manager ", + output_index); + } + *output_var_copy_to_tf = get<1>(itr->second); + return Status::OK(); +} + } // namespace ngraph_bridge } // namespace tensorflow \ No newline at end of 
file diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 47f118f86..df0fc55ac 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -92,6 +92,18 @@ class NGraphTensorManager { return m_pipelined_input_indexes_that_are_not_prefetched; } + // input variable shared name + Status GetInputVariableSharedName(const int& input_index, + string* input_var_shared_name); + + // output variable shared name + Status GetOutputVariableSharedName(const int& output_index, + string* output_var_shared_name); + + // input variable shared name + Status GetOutputVariableCopyToTF(const int& output_index, + bool* output_var_copy_to_tf); + private: void Initialize(); string m_ng_encap_node_name; @@ -107,6 +119,7 @@ class NGraphTensorManager { vector m_output_indexes_that_need_copy; // All indexes that are not from/to variables + // Book-keeping primarily for data pipelining // These are pipelined, some of these are also prefetched // indexes wrt all inputs/outputs vector m_pipelined_input_indexes; @@ -118,6 +131,10 @@ class NGraphTensorManager { // indexes wrt all inputs vector m_prefetched_input_indexes; vector m_pipelined_not_prefetched_input_indexes; + + // Book-keeping for weights-on-device optimizations + unordered_map input_variable_shared_name_map; + unordered_map> output_variable_info_map; }; } // namespace ngraph_bridge From fae2335f45606613ffcf5c1f9016a4a924e292d4 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Sat, 14 Dec 2019 17:19:29 -0800 Subject: [PATCH 38/67] added test --- ngraph_bridge/ngraph_tensor_manager.h | 6 +- test/test_ngraph_tensor_manager.cpp | 88 +++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index df0fc55ac..d0e394dcd 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -92,15 +92,15 @@ class 
NGraphTensorManager { return m_pipelined_input_indexes_that_are_not_prefetched; } - // input variable shared name + // input ng-variable shared name Status GetInputVariableSharedName(const int& input_index, string* input_var_shared_name); - // output variable shared name + // output ng-variable shared name Status GetOutputVariableSharedName(const int& output_index, string* output_var_shared_name); - // input variable shared name + // does output ng-variable's host-TF tensor needs to be updated Status GetOutputVariableCopyToTF(const int& output_index, bool* output_var_copy_to_tf); diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index d92779efc..81d107ced 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -62,6 +62,7 @@ class NGraphTensorManagerTest : public ::testing::Test { ng_encap_graph_id, ng_encap_node_name, indexes_need_copy); } + // Utility to Simulate entering prefetch info in catalog void EnterPrefetchInCatalog(const int& ng_encap_graph_id, const string& ng_encap_node_name, const vector& prefetched_inp_indexes) { @@ -82,6 +83,24 @@ class NGraphTensorManagerTest : public ::testing::Test { iota(vout.begin(), vout.end(), 0); return vout; } + + // Utility to Simulate entering variable shared name info info in catalog + void EnterVarSharedInfoInCatalog( + const int& ng_encap_graph_id, const string& ng_encap_node_name, + const unordered_map& input_var_info_map, + const unordered_map>& output_var_info_map) { + for (auto itr : input_var_info_map) { + string key = NGraphCatalog::CreateNodeKey(ng_encap_graph_id, + ng_encap_node_name, itr.first); + NGraphCatalog::AddToInputVariableSharedNameMap(key, itr.second); + } + + for (auto itr : output_var_info_map) { + string key = NGraphCatalog::CreateNodeKey(ng_encap_graph_id, + ng_encap_node_name, itr.first); + NGraphCatalog::AddToEncapOutputInfoMap(key, itr.second); + } + } }; TEST(NGraphUtils, FindComplement1) { @@ -427,6 +446,75 @@ 
TEST_F(NGraphTensorManagerTest, PrefetchNotInPipeline) { ClearCatalog(); } +// check book-keeping of shared information +TEST_F(NGraphTensorManagerTest, SharedName) { + string ng_encap_node_name = "xyz_1"; + int ng_encap_cluster_id = 1; + int ng_encap_graph_id = 1; + int number_of_inputs = 5; + int number_of_outputs = 2; + + unordered_map input_var_info_map = {{0, "A"}, {3, "C"}}; + unordered_map> output_var_info_map = { + {1, make_tuple("X", false)}, {5, make_tuple("Y", true)}}; + + EnterVarSharedInfoInCatalog(ng_encap_graph_id, ng_encap_node_name, + input_var_info_map, output_var_info_map); + + NGraphTensorManager tensor_manager(ng_encap_node_name, ng_encap_cluster_id, + ng_encap_graph_id, number_of_inputs, + number_of_outputs); + + if (ngraph_tf_are_variables_enabled()) { + string shared_name; + bool copy_to_tf; + // input var + ASSERT_OK(tensor_manager.GetInputVariableSharedName(0, &shared_name)); + ASSERT_EQ(shared_name, "A"); + ASSERT_OK(tensor_manager.GetInputVariableSharedName(3, &shared_name)); + ASSERT_EQ(shared_name, "C"); + + ASSERT_NOT_OK(tensor_manager.GetInputVariableSharedName(2, &shared_name)); + + // output var + ASSERT_OK(tensor_manager.GetOutputVariableSharedName(1, &shared_name)); + ASSERT_EQ(shared_name, "X"); + ASSERT_OK(tensor_manager.GetOutputVariableSharedName(5, &shared_name)); + ASSERT_EQ(shared_name, "Y"); + + ASSERT_NOT_OK(tensor_manager.GetOutputVariableSharedName(2, &shared_name)); + + // output var copy_to_tf + ASSERT_OK(tensor_manager.GetOutputVariableCopyToTF(1, ©_to_tf)); + ASSERT_FALSE(copy_to_tf); + ASSERT_OK(tensor_manager.GetOutputVariableCopyToTF(5, ©_to_tf)); + ASSERT_TRUE(copy_to_tf); + + ASSERT_NOT_OK(tensor_manager.GetOutputVariableCopyToTF(2, ©_to_tf)); + + } else { + string shared_name; + bool copy_to_tf; + // input var + ASSERT_NOT_OK(tensor_manager.GetInputVariableSharedName(0, &shared_name)); + ASSERT_NOT_OK(tensor_manager.GetInputVariableSharedName(3, &shared_name)); + 
ASSERT_NOT_OK(tensor_manager.GetInputVariableSharedName(2, &shared_name)); + + // output var + ASSERT_NOT_OK(tensor_manager.GetOutputVariableSharedName(1, &shared_name)); + ASSERT_NOT_OK(tensor_manager.GetOutputVariableSharedName(5, &shared_name)); + ASSERT_NOT_OK(tensor_manager.GetOutputVariableSharedName(2, &shared_name)); + + // output var copy_to_tf + ASSERT_NOT_OK(tensor_manager.GetOutputVariableCopyToTF(1, ©_to_tf)); + ASSERT_NOT_OK(tensor_manager.GetOutputVariableCopyToTF(5, ©_to_tf)); + ASSERT_NOT_OK(tensor_manager.GetOutputVariableCopyToTF(2, ©_to_tf)); + } + + // clean up + ClearCatalog(); +} + } // namespace testing } // namespace ngraph_bridge } // namespace tensorflow \ No newline at end of file From b1120c17af381e5b08df00441c2573147108ca70 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Sat, 14 Dec 2019 18:43:52 -0800 Subject: [PATCH 39/67] fix axpy var test --- examples/CMakeLists.txt | 6 ++++++ test/python/test_axpy_var_pipelined.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0666fee83..c4c224953 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -27,3 +27,9 @@ execute_process( ${CMAKE_CURRENT_SOURCE_DIR}/axpy_pipelined.py ${CMAKE_CURRENT_BINARY_DIR}/axpy_pipelined.py ) + +execute_process( + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${CMAKE_CURRENT_SOURCE_DIR}/axpy_var_pipelined.py + ${CMAKE_CURRENT_BINARY_DIR}/axpy_var_pipelined.py +) \ No newline at end of file diff --git a/test/python/test_axpy_var_pipelined.py b/test/python/test_axpy_var_pipelined.py index 272e34c56..3f220b600 100644 --- a/test/python/test_axpy_var_pipelined.py +++ b/test/python/test_axpy_var_pipelined.py @@ -13,17 +13,17 @@ # For eg. 
when running the test from ngraph-bridge/build_cmake/test/python # you can add this path as below -#sys.path.insert(0, '../../examples') +# sys.path.insert(0, '../../examples') from axpy_var_pipelined import * -class TestAxpyPipelined(NgraphTest): +class TestAxpyVarPipelined(NgraphTest): - def test_axpy_pipelined(self): - #prefetch_env = "NGRAPH_TF_USE_PREFETCH" - #env_var_map = self.store_env_variables([prefetch_env]) - #self.set_env_variable(prefetch_env, "1") + def test_axpy_var_pipelined(self): + prefetch_env = "NGRAPH_TF_USE_PREFETCH" + env_var_map = self.store_env_variables([prefetch_env]) + self.set_env_variable(prefetch_env, "1") input_array, output_array, expected_output_array = run_axpy_pipeline() for i in range(1, 10): print("Iteration:", i, " Input: ", input_array[i - 1], " Output: ", @@ -33,5 +33,5 @@ def test_axpy_pipelined(self): assert np.allclose( output_array[i - 1], expected_output_array[i - 1], atol=1e-3), "Output and expected output values don't match" - #self.unset_env_variable(prefetch_env) - #self.restore_env_variables(env_var_map) + self.unset_env_variable(prefetch_env) + self.restore_env_variables(env_var_map) From 06c93efd90367ba3201f746e7da3856a3018493a Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Sat, 14 Dec 2019 19:10:34 -0800 Subject: [PATCH 40/67] fixed var tests --- ngraph_bridge/ngraph_tensor_manager.cc | 21 ++++++++------------- test/test_ngraph_tensor_manager.cpp | 11 +++++++++-- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index e21a8663f..52a3e1bfe 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -53,7 +53,7 @@ void NGraphTensorManager::Initialize() { auto shared_name = NGraphCatalog::GetInputVariableSharedName( m_ng_encap_graph_id, m_ng_encap_node_name, index); input_variable_shared_name_map.insert({index, shared_name}); - } catch { + } catch (const std::exception& exp) { 
throw runtime_error( "Could not find variable shared name in catalog for input index " + to_string(index) + "for encapsulate op " + m_ng_encap_node_name); @@ -74,7 +74,7 @@ void NGraphTensorManager::Initialize() { NGraphCatalog::GetInfoFromEncapOutputInfoMap( m_ng_encap_graph_id, m_ng_encap_node_name, index); output_variable_info_map.insert({index, shared_name_copy_to_tf}); - } catch { + } catch (const std::exception& exp) { throw runtime_error( "Could not find variable shared name and copy_to_tf information in " "catalog for output index " + @@ -141,9 +141,8 @@ Status NGraphTensorManager::GetInputVariableSharedName( const int& input_index, string* input_var_shared_name) { auto itr = input_variable_shared_name_map.find(input_index); if (itr == input_variable_shared_name_map.end()) { - return errors::Internal( - "Could not find shared name for input index in tensor manager ", - input_index); + return errors::Internal("Could not find shared name info for input index ", + input_index, " in tensor manager "); } *input_var_shared_name = itr->second; return Status::OK(); @@ -156,10 +155,8 @@ Status NGraphTensorManager::GetOutputVariableSharedName( const int& output_index, string* output_var_shared_name) { auto itr = output_variable_info_map.find(output_index); if (itr == output_variable_info_map.end()) { - return errors::Internal( - "Could not find shared name and copy_to_tf info for output index in " - "tensor manager ", - output_index); + return errors::Internal("Could not find shared name info for output index ", + output_index, " in tensor manager"); } *output_var_shared_name = get<0>(itr->second); return Status::OK(); @@ -172,10 +169,8 @@ Status NGraphTensorManager::GetOutputVariableCopyToTF( const int& output_index, bool* output_var_copy_to_tf) { auto itr = output_variable_info_map.find(output_index); if (itr == output_variable_info_map.end()) { - return errors::Internal( - "Could not find shared name and copy_to_tf info for output index in " - "tensor manager ", - 
output_index); + return errors::Internal("Could not find copy_to_tf info for output index ", + output_index, " in tensor manager"); } *output_var_copy_to_tf = get<1>(itr->second); return Status::OK(); diff --git a/test/test_ngraph_tensor_manager.cpp b/test/test_ngraph_tensor_manager.cpp index 81d107ced..42a570ae3 100644 --- a/test/test_ngraph_tensor_manager.cpp +++ b/test/test_ngraph_tensor_manager.cpp @@ -452,11 +452,13 @@ TEST_F(NGraphTensorManagerTest, SharedName) { int ng_encap_cluster_id = 1; int ng_encap_graph_id = 1; int number_of_inputs = 5; - int number_of_outputs = 2; + int number_of_outputs = 6; unordered_map input_var_info_map = {{0, "A"}, {3, "C"}}; unordered_map> output_var_info_map = { - {1, make_tuple("X", false)}, {5, make_tuple("Y", true)}}; + {1, make_tuple("X", false)}, + {5, make_tuple("Y", true)}, + {0, make_tuple("Z", false)}}; EnterVarSharedInfoInCatalog(ng_encap_graph_id, ng_encap_node_name, input_var_info_map, output_var_info_map); @@ -483,6 +485,8 @@ TEST_F(NGraphTensorManagerTest, SharedName) { ASSERT_EQ(shared_name, "Y"); ASSERT_NOT_OK(tensor_manager.GetOutputVariableSharedName(2, &shared_name)); + ASSERT_OK(tensor_manager.GetOutputVariableSharedName(0, &shared_name)); + ASSERT_EQ(shared_name, "Z"); // output var copy_to_tf ASSERT_OK(tensor_manager.GetOutputVariableCopyToTF(1, ©_to_tf)); @@ -492,6 +496,9 @@ TEST_F(NGraphTensorManagerTest, SharedName) { ASSERT_NOT_OK(tensor_manager.GetOutputVariableCopyToTF(2, ©_to_tf)); + ASSERT_OK(tensor_manager.GetOutputVariableCopyToTF(0, ©_to_tf)); + ASSERT_FALSE(copy_to_tf); + } else { string shared_name; bool copy_to_tf; From f89a28e08b6248da6627cde4f16ab8bb52e05140 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Sat, 14 Dec 2019 19:48:09 -0800 Subject: [PATCH 41/67] Fixed axpy pipelined py --- examples/axpy_var_pipelined.py | 41 +++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/examples/axpy_var_pipelined.py b/examples/axpy_var_pipelined.py 
index 63cbdafb0..017d860e8 100644 --- a/examples/axpy_var_pipelined.py +++ b/examples/axpy_var_pipelined.py @@ -32,10 +32,12 @@ def build_simple_model(input_array, tensor_var, var_modifier, array_multiplier): # Define the Ops mul = tf.compat.v1.math.multiply(input_array, array_multiplier) - tensor_var_assign = tensor_var.assign(tensor_var + var_modifier) - add = tf.compat.v1.math.add(mul, tensor_var_assign) - output = add - return output + add = tf.compat.v1.math.add(mul, tensor_var) + train_step = tensor_var.assign(add + var_modifier) + + with tf.control_dependencies([train_step]): + train_op = tf.no_op('train_op') + return add, train_op def build_data_pipeline(input_array, map_function, batch_size): @@ -51,21 +53,23 @@ def build_data_pipeline(input_array, map_function, batch_size): def run_axpy_pipeline(): input_array = [1, 2, 3, 4, 5, 6, 7, 8, 9] - expected_output_array = [-1, -1, -1, -1, -1, -1, -1, -1, -1] - output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0] + multiplier = 10 - init = tf.constant(10) - var = tf.get_variable('x', initializer=init) - for i in range(1, 10): - input_array[i - 1] = input_array[i - 1] * i * multiplier map_function = lambda x: x * multiplier batch_size = 1 pipeline, iterator = build_data_pipeline(input_array, map_function, batch_size) + var_init = 10 + init = tf.constant([var_init]) + var = tf.get_variable('x', initializer=init) + var_modifier = 1 array_multiplier = 5 model = build_simple_model(pipeline, var, var_modifier, array_multiplier) - var_sum = 11 + + expected_output_array = [] + output_array = [] + var_val = var_init with tf.Session() as sess: # Initialize the globals and the dataset sess.run(tf.global_variables_initializer()) @@ -73,12 +77,14 @@ def run_axpy_pipeline(): for i in range(1, 10): # Expected value is: - expected_output_array[i - 1] = ( - (input_array[i - 1] * multiplier) * array_multiplier) + var_sum - var_sum = var_sum + var_modifier + expected_output = ( + (input_array[i - 1] * multiplier) * array_multiplier) + 
var_val + expected_output_array.append(expected_output) + var_val = expected_output + var_modifier + # Run one iteration - output = sess.run(model) - output_array[i - 1] = output[0] + output, train_op = sess.run(model) + output_array.append(output[0]) return input_array, output_array, expected_output_array @@ -91,6 +97,5 @@ def main(_): if __name__ == '__main__': - os.environ['NGRAPH_TF_BACKEND'] = "INTERPRETER" - #os.environ['NGRAPH_TF_USE_PREFETCH'] = "1" + os.environ['NGRAPH_TF_USE_PREFETCH'] = "1" tf.app.run(main=main) From 25613b670706801af6c3fb4b89d48ca3c1d0ede0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 15:07:49 -0800 Subject: [PATCH 42/67] Read only required outputs --- ngraph_bridge/ngraph_encapsulate_op.cc | 40 +++++++++++++++----------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 9a48d8c92..41106b3fe 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -540,12 +540,11 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ngraph::Event::write_trace(event_execute_graph); // Now prepare the output - ngraph::Event event_copy_output_tensor("Copy Output Tensor", "", ""); - - std::vector> output_copy_events; + // Allocate TF Tensors + vector tf_output_tensors; + ngraph::Event event_allocate_tf_output_tensors("Allocate TF Output Tensor", + "", ""); for (auto i = 0; i < ng_exec->get_results().size(); i++) { - std::unique_ptr event_copy_prep( - new ngraph::Event("Copy Prep", "", "")); auto ng_element = ng_exec->get_results()[i]; auto ng_shape = ng_element->get_shape(); auto ng_element_type = ng_element->get_element_type(); @@ -558,7 +557,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { TensorShape tf_shape(dims); Tensor* tf_output_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, tf_shape, &tf_output_tensor)); - + 
tf_output_tensors.push_back(tf_output_tensor); // Make sure the nGraph-inferred element type agrees with what TensorFlow // expected. ng::element::Type expected_elem_type; @@ -569,26 +568,33 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ctx, ng_element_type == expected_elem_type, errors::Internal("Element type inferred by nGraph does not match " "the element type expected by TensorFlow")); - event_copy_prep->Stop(); - output_copy_events.push_back(std::move(event_copy_prep)); + } + event_allocate_tf_output_tensors.Stop(); + ngraph::Event::write_trace(event_allocate_tf_output_tensors); - // Now copy the nGraph Tensor to Host Tensor - std::unique_ptr event_copy_d2h( - new ngraph::Event("Device to Host Copy", "", "")); - void* dst_ptr = DMAHelper::base(tf_output_tensor); + // Copy Tensors that are required + ngraph::Event event_read_ng_tensors("Read NG Tensor", "", ""); + std::vector> output_copy_events; - ng_outputs[i]->read( - dst_ptr, ng_outputs[i]->get_element_count() * ng_element_type.size()); + auto output_indexes_to_be_copied = + tensor_manager->GetOutputIndexesThatNeedCopy(); + for (auto output_index : output_indexes_to_be_copied) { + // Copy the nGraph Tensor to Host Tensor + std::unique_ptr event_copy_d2h( + new ngraph::Event("Output_" + std::to_string(output_index), "", "")); + ng_outputs[output_index]->read( + tf_output_tensors[output_index], + ng_outputs[output_index]->get_element_count() * + ng_outputs[output_index]->get_element_type().size()); event_copy_d2h->Stop(); output_copy_events.push_back(std::move(event_copy_d2h)); } - for (auto& next : output_copy_events) { ngraph::Event::write_trace(*next.get()); } - event_copy_output_tensor.Stop(); - ngraph::Event::write_trace(event_copy_output_tensor); + event_read_ng_tensors.Stop(); + ngraph::Event::write_trace(event_read_ng_tensors); // Now return them to the cache ngraph::Event event_return_tensor("Return Tensor", "", ""); From ff18bfdc1ac00d4d814eb09bc579f7f73a174240 
Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 15:07:49 -0800 Subject: [PATCH 43/67] Read only required outputs --- ngraph_bridge/ngraph_encapsulate_op.cc | 40 +++++++++++++++----------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 9a48d8c92..323203290 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -540,12 +540,11 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ngraph::Event::write_trace(event_execute_graph); // Now prepare the output - ngraph::Event event_copy_output_tensor("Copy Output Tensor", "", ""); - - std::vector> output_copy_events; + // Allocate TF Tensors + vector tf_output_tensors; + ngraph::Event event_allocate_tf_output_tensors("Allocate TF Output Tensor", + "", ""); for (auto i = 0; i < ng_exec->get_results().size(); i++) { - std::unique_ptr event_copy_prep( - new ngraph::Event("Copy Prep", "", "")); auto ng_element = ng_exec->get_results()[i]; auto ng_shape = ng_element->get_shape(); auto ng_element_type = ng_element->get_element_type(); @@ -558,7 +557,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { TensorShape tf_shape(dims); Tensor* tf_output_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, tf_shape, &tf_output_tensor)); - + tf_output_tensors.push_back(tf_output_tensor); // Make sure the nGraph-inferred element type agrees with what TensorFlow // expected. 
ng::element::Type expected_elem_type; @@ -569,26 +568,33 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ctx, ng_element_type == expected_elem_type, errors::Internal("Element type inferred by nGraph does not match " "the element type expected by TensorFlow")); - event_copy_prep->Stop(); - output_copy_events.push_back(std::move(event_copy_prep)); + } + event_allocate_tf_output_tensors.Stop(); + ngraph::Event::write_trace(event_allocate_tf_output_tensors); - // Now copy the nGraph Tensor to Host Tensor - std::unique_ptr event_copy_d2h( - new ngraph::Event("Device to Host Copy", "", "")); - void* dst_ptr = DMAHelper::base(tf_output_tensor); + // Copy Tensors that are required + ngraph::Event event_read_ng_tensors("Read NG Tensor", "", ""); + std::vector> output_copy_events; - ng_outputs[i]->read( - dst_ptr, ng_outputs[i]->get_element_count() * ng_element_type.size()); + auto output_indexes_to_be_copied = + tensor_manager->GetOutputIndexesThatNeedCopy(); + for (auto output_index : output_indexes_to_be_copied) { + // Copy the nGraph Tensor to Host Tensor + std::unique_ptr event_copy_d2h( + new ngraph::Event("Output_" + std::to_string(output_index), "", "")); + void* dst_ptr = DMAHelper::base(tf_output_tensors[output_index]); + ng_outputs[output_index]->read( + dst_ptr, ng_outputs[output_index]->get_element_count() * + ng_outputs[output_index]->get_element_type().size()); event_copy_d2h->Stop(); output_copy_events.push_back(std::move(event_copy_d2h)); } - for (auto& next : output_copy_events) { ngraph::Event::write_trace(*next.get()); } - event_copy_output_tensor.Stop(); - ngraph::Event::write_trace(event_copy_output_tensor); + event_read_ng_tensors.Stop(); + ngraph::Event::write_trace(event_read_ng_tensors); // Now return them to the cache ngraph::Event event_return_tensor("Return Tensor", "", ""); From 3e3d887375399a64fb85c4ea30bcdcf1f55be725 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 16:00:03 -0800 Subject: 
[PATCH 44/67] Var uses Parallel Executor --- ngraph_bridge/ngraph_encapsulate_op.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 41106b3fe..029118815 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -88,13 +88,14 @@ NGraphEncapsulateOp::NGraphEncapsulateOp(OpKernelConstruction* ctx) ctx, backend != nullptr, errors::Internal("Cannot get the backend object for BE: ", be_name)); -// If we have the VARIABLE capture on then we can't use the -// parallel executor until that support is added. -#if !defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) + // // If we have the VARIABLE capture on then we can't use the + // // parallel executor until that support is added. + // #if !defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) + // m_use_parallel_executor = backend->executable_can_create_tensors(); + // #else + // m_use_parallel_executor = false; + // #endif m_use_parallel_executor = backend->executable_can_create_tensors(); -#else - m_use_parallel_executor = false; -#endif // Override the switch for debugging/testing if (std::getenv("NGRAPH_TF_USE_LEGACY_EXECUTOR") != nullptr) { From 02888d3bb47e4fb7bef2336b45af64a81bafc98c Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 17:44:44 -0800 Subject: [PATCH 45/67] Implemented IOTensorsReadyForExec --- ngraph_bridge/ngraph_encapsulate_op.cc | 18 +++--- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 65 +++++++++++++++++++- ngraph_bridge/ngraph_encapsulate_op_utils.h | 16 ++++- 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 029118815..637c249c6 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -500,11 +500,15 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { 
vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); - // All inputs and outputs are pipelined. - // Of all these pipelined inputs some are prefetched - // TODO: Fit in variables - ng_inputs = get<1>(pipelined_io_tensors); - ng_outputs = get<2>(pipelined_io_tensors); + OP_REQUIRES_OK(ctx, GetIOTensorsReadyForExecution( + ctx, tensor_manager, get<1>(pipelined_io_tensors), + get<2>(pipelined_io_tensors), ng_inputs, ng_outputs)); + + // // All inputs and outputs are pipelined. + // // Of all these pipelined inputs some are prefetched + // // TODO: Fit in variables + // ng_inputs = get<1>(pipelined_io_tensors); + // ng_outputs = get<2>(pipelined_io_tensors); // And execute ngraph::Event event_execute_graph("Execute Graph", "", ""); @@ -581,8 +585,8 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { tensor_manager->GetOutputIndexesThatNeedCopy(); for (auto output_index : output_indexes_to_be_copied) { // Copy the nGraph Tensor to Host Tensor - std::unique_ptr event_copy_d2h( - new ngraph::Event("Output_" + std::to_string(output_index), "", "")); + std::unique_ptr event_copy_d2h(new ngraph::Event( + "D2H_Output_" + std::to_string(output_index), "", "")); ng_outputs[output_index]->read( tf_output_tensors[output_index], ng_outputs[output_index]->get_element_count() * diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 2bb71db76..eddd968e2 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -18,6 +18,10 @@ #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_utils.h" +#if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) +#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" +#endif + using namespace std; namespace tensorflow { @@ -25,9 +29,9 @@ namespace tensorflow { namespace ngraph_bridge { Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, 
std::vector& tf_input_tensors, - shared_ptr& pipelined_tensor_store, - shared_ptr& tensor_manager, + OpKernelContext* ctx, const vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, std::tuple& pipelined_io_tensors) { auto io_tensors = pipelined_tensor_store->get_tensors(); @@ -222,5 +226,60 @@ Status GetPipelinedIOTensorsReadyForExecution( return Status::OK(); } +Status GetTensorFromContext(OpKernelContext* ctx, const string& shared_name, + shared_ptr& ng_tensor) { + // Get shared name from tensor manager + NGraphVar* var; + TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup( + ctx->resource_manager()->default_container(), shared_name, &var)); + ng_tensor = var->ng_tensor(); + var->Unref(); + return Status::OK(); +} + +Status GetIOTensorsReadyForExecution( + OpKernelContext* ctx, const shared_ptr& tensor_manager, + const PipelinedTensorVector& pipelined_in_tensors, + const PipelinedTensorVector& pipelined_out_tensors, + vector>& ng_inputs, + vector>& ng_outputs) { + // Get Variables that are inputs + auto var_input_indexes = tensor_manager->GetInputIndexesFedByVariables(); + for (int input_index : var_input_indexes) { + string shared_name; + TF_RETURN_IF_ERROR( + tensor_manager->GetInputVariableSharedName(input_index, &shared_name)); + TF_RETURN_IF_ERROR( + GetTensorFromContext(ctx, shared_name, ng_inputs[input_index])); + } + + // Get Variables that are outputs + auto var_output_indexes = + tensor_manager->GetOutputIndexesAssigningVariables(); + for (int output_index : var_output_indexes) { + string shared_name; + TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( + output_index, &shared_name)); + TF_RETURN_IF_ERROR( + GetTensorFromContext(ctx, shared_name, ng_outputs[output_index])); + } + + // Fit Pipelined Input Tensors + auto pipelined_input_indexes = tensor_manager->GetPipelinedInputIndexes(); + for (int i = 0; i < pipelined_input_indexes.size(); i++) { + int input_index = 
pipelined_input_indexes[i]; + ng_inputs[input_index] = pipelined_in_tensors[i]; + } + + // Fit Pipelined Output Tensors + auto pipelined_output_indexes = tensor_manager->GetPipelinedOutputIndexes(); + for (int i = 0; i < pipelined_output_indexes.size(); i++) { + int output_index = pipelined_output_indexes[i]; + ng_outputs[output_index] = pipelined_out_tensors[i]; + } + + return Status::OK(); +} + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h index 7f48eb09c..64b2f432a 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.h +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.h @@ -46,12 +46,22 @@ namespace ngraph_bridge { // Status GetPipelinedIOTensorsReadyForExecution( - OpKernelContext* ctx, vector& tf_input_tensors, - shared_ptr& pipelined_tensor_store, - shared_ptr& tensor_manager, + OpKernelContext* ctx, const vector& tf_input_tensors, + const shared_ptr& pipelined_tensor_store, + const shared_ptr& tensor_manager, tuple& pipelined_io_tensors); +Status GetIOTensorsReadyForExecution( + OpKernelContext* ctx, const shared_ptr& tensor_manager, + const PipelinedTensorVector& pipelined_in_tensors, + const PipelinedTensorVector& pipelined_out_tensors, + vector>& ng_inputs, + vector>& ng_outputs); + +Status GetTensorFromContext(OpKernelContext* ctx, const string& shared_name, + shared_ptr& ng_tensor); + } // namespace ngraph_bridge } // namespace tensorflow From ca63606a08c1c82c4a816e2d634612c225045de5 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 17:51:50 -0800 Subject: [PATCH 46/67] Sync for output tensors --- ngraph_bridge/ngraph_encapsulate_op.cc | 4 ++- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 28 +++++++++++++++++++- ngraph_bridge/ngraph_encapsulate_op_utils.h | 7 ++++- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 
637c249c6..9100f0f8a 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -597,10 +597,12 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { for (auto& next : output_copy_events) { ngraph::Event::write_trace(*next.get()); } - event_read_ng_tensors.Stop(); ngraph::Event::write_trace(event_read_ng_tensors); + // Synch Var Output Tensors as required + OP_REQUIRES_OK(ctx, SyncOutputVarTensors(ctx, tensor_manager)); + // Now return them to the cache ngraph::Event event_return_tensor("Return Tensor", "", ""); pipelined_tensor_store->return_tensors(current_iter_pipeline_depth); diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index eddd968e2..6cd009bee 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -226,7 +226,8 @@ Status GetPipelinedIOTensorsReadyForExecution( return Status::OK(); } -Status GetTensorFromContext(OpKernelContext* ctx, const string& shared_name, +Status GetTensorFromContext(const OpKernelContext* ctx, + const string& shared_name, shared_ptr& ng_tensor) { // Get shared name from tensor manager NGraphVar* var; @@ -281,5 +282,30 @@ Status GetIOTensorsReadyForExecution( return Status::OK(); } +Status SyncOutputVarTensors( + const OpKernelContext* ctx, + const shared_ptr& tensor_manager) { + // Get Variables that are outputs + auto var_output_indexes = + tensor_manager->GetOutputIndexesAssigningVariables(); + for (int output_index : var_output_indexes) { + bool copy_to_tf; + TF_RETURN_IF_ERROR( + tensor_manager->GetOutputVariableCopyToTF(output_index, ©_to_tf)); + + if (copy_to_tf) { + string shared_name; + TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( + output_index, &shared_name)); + // Get shared name from tensor manager + NGraphVar* var; + TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup( + ctx->resource_manager()->default_container(), 
shared_name, &var)); + var->copy_ng_to_tf(); + var->Unref(); + } + } +} + } // namespace ngraph_bridge } // namespace tensorflow diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h index 64b2f432a..956a2af50 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.h +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.h @@ -59,9 +59,14 @@ Status GetIOTensorsReadyForExecution( vector>& ng_inputs, vector>& ng_outputs); -Status GetTensorFromContext(OpKernelContext* ctx, const string& shared_name, +Status GetTensorFromContext(const OpKernelContext* ctx, + const string& shared_name, shared_ptr& ng_tensor); +Status SyncOutputVarTensors( + const OpKernelContext* ctx, + const shared_ptr& tensor_manager); + } // namespace ngraph_bridge } // namespace tensorflow From 472b8d8ef5a4c6acd69d814155a3af7ec79112c0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 17:54:57 -0800 Subject: [PATCH 47/67] Fixed output --- ngraph_bridge/ngraph_encapsulate_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 9100f0f8a..9cf0b2b8b 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -587,10 +587,10 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { // Copy the nGraph Tensor to Host Tensor std::unique_ptr event_copy_d2h(new ngraph::Event( "D2H_Output_" + std::to_string(output_index), "", "")); + void* dst_ptr = (void*)DMAHelper::base(tf_output_tensors[output_index]); ng_outputs[output_index]->read( - tf_output_tensors[output_index], - ng_outputs[output_index]->get_element_count() * - ng_outputs[output_index]->get_element_type().size()); + dst_ptr, ng_outputs[output_index]->get_element_count() * + ng_outputs[output_index]->get_element_type().size()); event_copy_d2h->Stop(); output_copy_events.push_back(std::move(event_copy_d2h)); } From 
5576a4714f9b066fc86ea619b71f25f23ad5996c Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 18:52:25 -0800 Subject: [PATCH 48/67] For non var build --- bazel/BUILD | 2 ++ ngraph_bridge/CMakeLists.txt | 2 +- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 4 +--- ngraph_bridge/{enable_variable_ops => }/ngraph_var.cc | 2 +- ngraph_bridge/{enable_variable_ops => }/ngraph_var.h | 0 5 files changed, 5 insertions(+), 5 deletions(-) rename ngraph_bridge/{enable_variable_ops => }/ngraph_var.cc (98%) rename ngraph_bridge/{enable_variable_ops => }/ngraph_var.h (100%) diff --git a/bazel/BUILD b/bazel/BUILD index 7028b6a95..034ff0dec 100644 --- a/bazel/BUILD +++ b/bazel/BUILD @@ -48,6 +48,7 @@ cc_library( "ngraph_bridge/ngraph_tensor_manager.h", "ngraph_bridge/ngraph_timer.h", "ngraph_bridge/ngraph_utils.h", + "ngraph_bridge/ngraph_var.h", "ngraph_bridge/ngraph_version_utils.h", "ngraph_bridge/tf_deadness_analysis.h", "ngraph_bridge/tf_graphcycles.h", @@ -92,6 +93,7 @@ cc_library( "ngraph_bridge/ngraph_tensor_manager.cc", "ngraph_bridge/ngraph_tracked_variable.cc", "ngraph_bridge/ngraph_utils.cc", + "ngraph_bridge/ngraph_var.cc", "ngraph_bridge/tf_deadness_analysis.cc", "ngraph_bridge/tf_graphcycles.cc", "ngraph_bridge/ops/ngraph_ops.cc", diff --git a/ngraph_bridge/CMakeLists.txt b/ngraph_bridge/CMakeLists.txt index 18d218dad..eb104ae3b 100644 --- a/ngraph_bridge/CMakeLists.txt +++ b/ngraph_bridge/CMakeLists.txt @@ -57,6 +57,7 @@ set(SRC ngraph_rewrite_pass.cc ngraph_tensor_manager.cc ngraph_tracked_variable.cc + ngraph_var.cc ngraph_utils.cc tf_graphcycles.cc tf_deadness_analysis.cc @@ -86,7 +87,6 @@ if(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) list(APPEND SRC enable_variable_ops/ngraph_tracked_variable.cc) # new files - list(APPEND SRC enable_variable_ops/ngraph_var.cc) list(APPEND SRC enable_variable_ops/ngraph_assign_op.cc) list(APPEND SRC enable_variable_ops/ngraph_enter_in_catalog.cc) list(APPEND SRC enable_variable_ops/ngraph_remove_ngraphassigns.cc) 
diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 6cd009bee..dac9cd83e 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -18,9 +18,7 @@ #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_utils.h" -#if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" -#endif +#include "ngraph_bridge/ngraph_var.h" using namespace std; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_var.cc b/ngraph_bridge/ngraph_var.cc similarity index 98% rename from ngraph_bridge/enable_variable_ops/ngraph_var.cc rename to ngraph_bridge/ngraph_var.cc index efab9e7c0..1fa6001bf 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_var.cc +++ b/ngraph_bridge/ngraph_var.cc @@ -24,10 +24,10 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_var.h b/ngraph_bridge/ngraph_var.h similarity index 100% rename from ngraph_bridge/enable_variable_ops/ngraph_var.h rename to ngraph_bridge/ngraph_var.h From b1309172ab0dbe6c692802311cb7537b82cc467e Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Mon, 16 Dec 2019 19:33:22 -0800 Subject: [PATCH 49/67] Solved build and fix --- .../enable_variable_ops/ngraph_assign_op.cc | 2 +- .../enable_variable_ops/ngraph_tracked_variable.cc | 2 +- .../enable_variable_ops/ngraph_variable_modifiers.cc | 2 +- .../ngraph_variable_update_ng_tensor_op.cc | 2 +- ngraph_bridge/ngraph_encapsulate_impl.cc | 2 +- ngraph_bridge/ngraph_encapsulate_op.cc | 12 +++++++++++- 
ngraph_bridge/ngraph_encapsulate_op_utils.cc | 6 ++++++ ngraph_bridge/ngraph_executor.cc | 2 +- test/graph_rewrites/test_ng_var_update_ng_tensor.cc | 2 +- test/test_ng_var_update_ng_tensor_kernel.cc | 2 +- 10 files changed, 25 insertions(+), 9 deletions(-) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc index b9f041e8b..8eb1d5336 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc @@ -25,11 +25,11 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc index c034d13c7..f821cadbc 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc @@ -23,11 +23,11 @@ #include "ngraph/event_tracing.hpp" #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc index 5fc190bea..376a596a9 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_modifiers.cc @@ -26,12 +26,12 
@@ #include "ngraph/runtime/backend.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_backend_manager.h" #include "ngraph_bridge/ngraph_catalog.h" #include "ngraph_bridge/ngraph_freshness_tracker.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc index fdb432f79..faee2334d 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc @@ -24,10 +24,10 @@ #include "ngraph/event_tracing.hpp" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" using namespace std; namespace ng = ngraph; diff --git a/ngraph_bridge/ngraph_encapsulate_impl.cc b/ngraph_bridge/ngraph_encapsulate_impl.cc index 7823f0a7d..f2ddf1ecd 100644 --- a/ngraph_bridge/ngraph_encapsulate_impl.cc +++ b/ngraph_bridge/ngraph_encapsulate_impl.cc @@ -45,8 +45,8 @@ #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 9cf0b2b8b..bc1f9d49e 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -49,9 +49,9 @@ #include "ngraph_bridge/ngraph_prefetch_shared_data.h" #include "ngraph_bridge/ngraph_timer.h" #include 
"ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif @@ -546,6 +546,8 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { // Now prepare the output // Allocate TF Tensors + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Allocating TF Output Tensors " + << m_parallel_executor->GetNgraphClusterId(); vector tf_output_tensors; ngraph::Event event_allocate_tf_output_tensors("Allocate TF Output Tensor", "", ""); @@ -578,6 +580,9 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ngraph::Event::write_trace(event_allocate_tf_output_tensors); // Copy Tensors that are required + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Read NG Output Tensors " + << m_parallel_executor->GetNgraphClusterId(); + ngraph::Event event_read_ng_tensors("Read NG Tensor", "", ""); std::vector> output_copy_events; @@ -601,9 +606,14 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ngraph::Event::write_trace(event_read_ng_tensors); // Synch Var Output Tensors as required + NGRAPH_VLOG(4) + << "NGraphEncapsulateOp::Compute Sync NG Output Variable Tensors " + << m_parallel_executor->GetNgraphClusterId(); OP_REQUIRES_OK(ctx, SyncOutputVarTensors(ctx, tensor_manager)); // Now return them to the cache + NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Returning Tensors " + << m_parallel_executor->GetNgraphClusterId(); ngraph::Event event_return_tensor("Return Tensor", "", ""); pipelined_tensor_store->return_tensors(current_iter_pipeline_depth); diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index dac9cd83e..472a72d1e 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -286,12 +286,16 @@ Status SyncOutputVarTensors( // Get 
Variables that are outputs auto var_output_indexes = tensor_manager->GetOutputIndexesAssigningVariables(); + NGRAPH_VLOG(4) << "output indexes size " << var_output_indexes.size(); + for (int output_index : var_output_indexes) { + NGRAPH_VLOG(4) << "Checking Sync For " << output_index; bool copy_to_tf; TF_RETURN_IF_ERROR( tensor_manager->GetOutputVariableCopyToTF(output_index, ©_to_tf)); if (copy_to_tf) { + NGRAPH_VLOG(4) << "Sync NG Output Variable Tensors " << output_index; string shared_name; TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( output_index, &shared_name)); @@ -301,8 +305,10 @@ Status SyncOutputVarTensors( ctx->resource_manager()->default_container(), shared_name, &var)); var->copy_ng_to_tf(); var->Unref(); + NGRAPH_VLOG(4) << "Sync Completed " << output_index; } } + return Status::OK(); } } // namespace ngraph_bridge diff --git a/ngraph_bridge/ngraph_executor.cc b/ngraph_bridge/ngraph_executor.cc index 37e1b8b40..7d4fe2c2a 100644 --- a/ngraph_bridge/ngraph_executor.cc +++ b/ngraph_bridge/ngraph_executor.cc @@ -43,9 +43,9 @@ #include "ngraph_bridge/ngraph_mark_for_clustering.h" #include "ngraph_bridge/ngraph_timer.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/ngraph_catalog.h" #endif diff --git a/test/graph_rewrites/test_ng_var_update_ng_tensor.cc b/test/graph_rewrites/test_ng_var_update_ng_tensor.cc index 0af2c7a57..924c54266 100644 --- a/test/graph_rewrites/test_ng_var_update_ng_tensor.cc +++ b/test/graph_rewrites/test_ng_var_update_ng_tensor.cc @@ -23,10 +23,10 @@ #include "tensorflow/core/platform/test.h" #include "logging/tf_graph_writer.h" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_rewrite_for_tracking.h" #include 
"ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #include "test/test_utilities.h" namespace tensorflow { diff --git a/test/test_ng_var_update_ng_tensor_kernel.cc b/test/test_ng_var_update_ng_tensor_kernel.cc index 51742fcc9..4612d156b 100644 --- a/test/test_ng_var_update_ng_tensor_kernel.cc +++ b/test/test_ng_var_update_ng_tensor_kernel.cc @@ -30,9 +30,9 @@ #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" -#include "ngraph_bridge/enable_variable_ops/ngraph_var.h" #include "ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.h" #include "ngraph_bridge/ngraph_utils.h" +#include "ngraph_bridge/ngraph_var.h" #include "test/test_utilities.h" #include "test/tf_fake_input.h" From f821bace2df1513a3cf6cfedd24500083610fc15 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Tue, 17 Dec 2019 10:04:24 -0800 Subject: [PATCH 50/67] Fix test_flib --- .../ngraph_enter_in_catalog.cc | 14 ++--- ngraph_bridge/ngraph_tensor_manager.cc | 61 +++++++++++++++++++ ngraph_bridge/ngraph_tensor_manager.h | 2 + test/python/test_flib.py | 4 ++ 4 files changed, 73 insertions(+), 8 deletions(-) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc index a456ef6e8..d730cf79d 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc @@ -161,14 +161,12 @@ Status EnterInCatalog(Graph* graph, int graph_id) { } // are there indexes that need copy - if (op_index_to_copy.size() > 0) { - try { - NGraphCatalog::AddToEncapOutputCopyIndexesMap(graph_id, node->name(), - op_index_to_copy); - } catch (const std::exception& exp) { - return errors::Internal( - "Caught exception while entering in catalog: ", exp.what(), "\n"); - } + try { + NGraphCatalog::AddToEncapOutputCopyIndexesMap(graph_id, node->name(), + op_index_to_copy); + } catch (const std::exception& 
exp) { + return errors::Internal("Caught exception while entering in catalog: ", + exp.what(), "\n"); } } // end of node is type NGraphEncapsulate diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 52a3e1bfe..11e0c5579 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -41,6 +41,9 @@ NGraphTensorManager::NGraphTensorManager(const string ng_encap_node_name, } void NGraphTensorManager::Initialize() { + cout << "Number of inputs " << m_number_of_inputs << endl; + cout << "Number of outputs " << m_number_of_outputs << endl; + #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) // input variables book-keeping @@ -86,6 +89,16 @@ void NGraphTensorManager::Initialize() { m_output_indexes_that_need_copy.push_back(index); } } + + // For graphs that were run through AOT + // Graph rewrite is not done + if (!NGraphCatalog::EncapOutputNeedsCopy(m_ng_encap_graph_id, + m_ng_encap_node_name)) { + m_output_indexes_that_need_copy.resize(m_number_of_outputs); + iota(begin(m_output_indexes_that_need_copy), + end(m_output_indexes_that_need_copy), 0); + } + #else m_output_indexes_that_need_copy.resize(m_number_of_outputs); iota(begin(m_output_indexes_that_need_copy), @@ -127,6 +140,54 @@ void NGraphTensorManager::Initialize() { m_pipelined_input_indexes_that_are_prefetched); m_pipelined_not_prefetched_input_indexes = FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); + Print(); +} + +void NGraphTensorManager::Print() { + cout << "Input indexes from Variables" << endl; + for (int index : m_input_indexes_from_variables) { + cout << index << endl; + } + + cout << "Input indexes to Assigns" << endl; + for (int index : m_output_indexes_assigning_variable) { + cout << index << endl; + } + + cout << "Input need copy" << endl; + for (int index : m_output_indexes_that_need_copy) { + cout << index << endl; + } + + cout << "Input pipelined" << endl; + for (int index : 
m_pipelined_input_indexes) { + cout << index << endl; + } + + cout << "Output pipelined" << endl; + for (int index : m_pipelined_output_indexes) { + cout << index << endl; + } + + cout << "prefetched " << endl; + for (int index : m_prefetched_input_indexes) { + cout << index << endl; + } + + cout << "not prefetched " << endl; + for (int index : m_pipelined_not_prefetched_input_indexes) { + cout << index << endl; + } + + cout << "pipelined prefetched " << endl; + for (int index : m_pipelined_input_indexes_that_are_prefetched) { + cout << index << endl; + } + + cout << "pipelined not prefetched " << endl; + for (int index : m_pipelined_input_indexes_that_are_not_prefetched) { + cout << index << endl; + } } //--------------------------------------------------------------------------- diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index d0e394dcd..5f0442043 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -104,6 +104,8 @@ class NGraphTensorManager { Status GetOutputVariableCopyToTF(const int& output_index, bool* output_var_copy_to_tf); + void Print(); + private: void Initialize(); string m_ng_encap_node_name; diff --git a/test/python/test_flib.py b/test/python/test_flib.py index 079e34449..7da55a85f 100644 --- a/test/python/test_flib.py +++ b/test/python/test_flib.py @@ -46,6 +46,10 @@ def test_flib_1(self): res1 = self.with_ngraph(sess_fn) res2 = self.without_ngraph(sess_fn) + print('res1') + print(res1) + print('res2') + print(res2) exp = [np.full((2, 3), 3.0), np.full((2, 3), 0.95257413)] # Note both run on Host (because NgraphEncapsulate can only run on host) assert np.isclose(res1, res2).all() From 1af0efad8a922377f7dbb609d09cf78fce594476 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 13:12:24 -0800 Subject: [PATCH 51/67] Removed Print function --- ngraph_bridge/ngraph_tensor_manager.cc | 30 -------------------------- 
ngraph_bridge/ngraph_tensor_manager.h | 2 -- 2 files changed, 32 deletions(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 11e0c5579..3b342d17b 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -143,36 +143,6 @@ void NGraphTensorManager::Initialize() { Print(); } -void NGraphTensorManager::Print() { - cout << "Input indexes from Variables" << endl; - for (int index : m_input_indexes_from_variables) { - cout << index << endl; - } - - cout << "Input indexes to Assigns" << endl; - for (int index : m_output_indexes_assigning_variable) { - cout << index << endl; - } - - cout << "Input need copy" << endl; - for (int index : m_output_indexes_that_need_copy) { - cout << index << endl; - } - - cout << "Input pipelined" << endl; - for (int index : m_pipelined_input_indexes) { - cout << index << endl; - } - - cout << "Output pipelined" << endl; - for (int index : m_pipelined_output_indexes) { - cout << index << endl; - } - - cout << "prefetched " << endl; - for (int index : m_prefetched_input_indexes) { - cout << index << endl; - } cout << "not prefetched " << endl; for (int index : m_pipelined_not_prefetched_input_indexes) { diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 5f0442043..d0e394dcd 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -104,8 +104,6 @@ class NGraphTensorManager { Status GetOutputVariableCopyToTF(const int& output_index, bool* output_var_copy_to_tf); - void Print(); - private: void Initialize(); string m_ng_encap_node_name; From fe5d3e8f943207401d4f5b30bc81b3491d442f35 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 13:34:00 -0800 Subject: [PATCH 52/67] Added Traces to Encap. 
Some clean up --- ngraph_bridge/ngraph_encapsulate_op.cc | 34 +++++++++++--------------- ngraph_bridge/ngraph_tensor_manager.cc | 18 -------------- 2 files changed, 14 insertions(+), 38 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index bc1f9d49e..12141bf04 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -88,13 +88,7 @@ NGraphEncapsulateOp::NGraphEncapsulateOp(OpKernelConstruction* ctx) ctx, backend != nullptr, errors::Internal("Cannot get the backend object for BE: ", be_name)); - // // If we have the VARIABLE capture on then we can't use the - // // parallel executor until that support is added. - // #if !defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) - // m_use_parallel_executor = backend->executable_can_create_tensors(); - // #else - // m_use_parallel_executor = false; - // #endif + // If backend executable can create tensors we use parallel executor m_use_parallel_executor = backend->executable_can_create_tensors(); // Override the switch for debugging/testing @@ -460,6 +454,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { m_parallel_executor->GetTensorPipelineDepth())); // Get Tensor Manager and some error checking + ngraph::Event event_prepare_ng_tensors("Prepare NG In/Out Tensors", "", ""); auto tensor_manager = m_parallel_executor->GetTensorManager(); int num_of_inputs = tensor_manager->GetNumberOfInputs(); int num_of_outputs = tensor_manager->GetNumberOfOutputs(); @@ -500,15 +495,13 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { vector> ng_inputs(num_of_inputs); vector> ng_outputs(num_of_outputs); + // Prepare NG Input Output Tensors + // Retrofit Variable tensors and pipelined tensors to ng_input and ng_outputs OP_REQUIRES_OK(ctx, GetIOTensorsReadyForExecution( ctx, tensor_manager, get<1>(pipelined_io_tensors), get<2>(pipelined_io_tensors), ng_inputs, ng_outputs)); - - 
// // All inputs and outputs are pipelined. - // // Of all these pipelined inputs some are prefetched - // // TODO: Fit in variables - // ng_inputs = get<1>(pipelined_io_tensors); - // ng_outputs = get<2>(pipelined_io_tensors); + event_prepare_ng_tensors.Stop(); + ngraph::Event::write_trace(event_prepare_ng_tensors); // And execute ngraph::Event event_execute_graph("Execute Graph", "", ""); @@ -548,9 +541,10 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { // Allocate TF Tensors NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Allocating TF Output Tensors " << m_parallel_executor->GetNgraphClusterId(); + + ngraph::Event event_prepare_tf_output_tensors("Prepare TF Output Tensor", "", + ""); vector tf_output_tensors; - ngraph::Event event_allocate_tf_output_tensors("Allocate TF Output Tensor", - "", ""); for (auto i = 0; i < ng_exec->get_results().size(); i++) { auto ng_element = ng_exec->get_results()[i]; auto ng_shape = ng_element->get_shape(); @@ -576,14 +570,11 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { errors::Internal("Element type inferred by nGraph does not match " "the element type expected by TensorFlow")); } - event_allocate_tf_output_tensors.Stop(); - ngraph::Event::write_trace(event_allocate_tf_output_tensors); // Copy Tensors that are required NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Read NG Output Tensors " << m_parallel_executor->GetNgraphClusterId(); - ngraph::Event event_read_ng_tensors("Read NG Tensor", "", ""); std::vector> output_copy_events; auto output_indexes_to_be_copied = @@ -602,14 +593,17 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { for (auto& next : output_copy_events) { ngraph::Event::write_trace(*next.get()); } - event_read_ng_tensors.Stop(); - ngraph::Event::write_trace(event_read_ng_tensors); + event_prepare_tf_output_tensors.Stop(); + ngraph::Event::write_trace(event_prepare_tf_output_tensors); // Synch Var Output 
Tensors as required NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute Sync NG Output Variable Tensors " << m_parallel_executor->GetNgraphClusterId(); + ngraph::Event event_update_ngvar_tensors("Update NGVar Tensors", "", ""); OP_REQUIRES_OK(ctx, SyncOutputVarTensors(ctx, tensor_manager)); + event_update_ngvar_tensors.Stop(); + ngraph::Event::write_trace(event_update_ngvar_tensors); // Now return them to the cache NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Returning Tensors " diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 3b342d17b..ade24ede9 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -140,24 +140,6 @@ void NGraphTensorManager::Initialize() { m_pipelined_input_indexes_that_are_prefetched); m_pipelined_not_prefetched_input_indexes = FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); - Print(); -} - - - cout << "not prefetched " << endl; - for (int index : m_pipelined_not_prefetched_input_indexes) { - cout << index << endl; - } - - cout << "pipelined prefetched " << endl; - for (int index : m_pipelined_input_indexes_that_are_prefetched) { - cout << index << endl; - } - - cout << "pipelined not prefetched " << endl; - for (int index : m_pipelined_input_indexes_that_are_not_prefetched) { - cout << index << endl; - } } //--------------------------------------------------------------------------- From 878572abeba251076ca1c7e23592ac1091379e51 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 14:07:33 -0800 Subject: [PATCH 53/67] Added comments, clean up, etc --- .../ngraph_enter_in_catalog.cc | 1 - ngraph_bridge/ngraph_encapsulate_op_utils.cc | 16 ++++++++++++++-- ngraph_bridge/ngraph_encapsulate_op_utils.h | 16 ++++++++++++++++ ngraph_bridge/ngraph_tensor_manager.cc | 7 ++----- test/python/test_flib.py | 5 +---- 5 files changed, 33 insertions(+), 12 deletions(-) diff --git 
a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc index d730cf79d..c96a4932e 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_enter_in_catalog.cc @@ -160,7 +160,6 @@ Status EnterInCatalog(Graph* graph, int graph_id) { } } - // are there indexes that need copy try { NGraphCatalog::AddToEncapOutputCopyIndexesMap(graph_id, node->name(), op_index_to_copy); diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 472a72d1e..34f4710eb 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -26,6 +26,9 @@ namespace tensorflow { namespace ngraph_bridge { +//--------------------------------------------------------------------------- +// GetPipelinedIOTensorsReadyForExecution +//--------------------------------------------------------------------------- Status GetPipelinedIOTensorsReadyForExecution( OpKernelContext* ctx, const vector& tf_input_tensors, const shared_ptr& pipelined_tensor_store, @@ -224,6 +227,9 @@ Status GetPipelinedIOTensorsReadyForExecution( return Status::OK(); } +//--------------------------------------------------------------------------- +// GetTensorFromContext +//--------------------------------------------------------------------------- Status GetTensorFromContext(const OpKernelContext* ctx, const string& shared_name, shared_ptr& ng_tensor) { @@ -236,6 +242,9 @@ Status GetTensorFromContext(const OpKernelContext* ctx, return Status::OK(); } +//--------------------------------------------------------------------------- +// GetIOTensorsReadyForExecution +//--------------------------------------------------------------------------- Status GetIOTensorsReadyForExecution( OpKernelContext* ctx, const shared_ptr& tensor_manager, const PipelinedTensorVector& pipelined_in_tensors, @@ -280,6 +289,9 @@ Status 
GetIOTensorsReadyForExecution( return Status::OK(); } +//--------------------------------------------------------------------------- +// SyncOutputVarTensors +//--------------------------------------------------------------------------- Status SyncOutputVarTensors( const OpKernelContext* ctx, const shared_ptr& tensor_manager) { @@ -289,20 +301,20 @@ Status SyncOutputVarTensors( NGRAPH_VLOG(4) << "output indexes size " << var_output_indexes.size(); for (int output_index : var_output_indexes) { - NGRAPH_VLOG(4) << "Checking Sync For " << output_index; bool copy_to_tf; TF_RETURN_IF_ERROR( tensor_manager->GetOutputVariableCopyToTF(output_index, ©_to_tf)); if (copy_to_tf) { NGRAPH_VLOG(4) << "Sync NG Output Variable Tensors " << output_index; + // Get shared name from tensor manager string shared_name; TF_RETURN_IF_ERROR(tensor_manager->GetOutputVariableSharedName( output_index, &shared_name)); - // Get shared name from tensor manager NGraphVar* var; TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup( ctx->resource_manager()->default_container(), shared_name, &var)); + // update tensor var->copy_ng_to_tf(); var->Unref(); NGRAPH_VLOG(4) << "Sync Completed " << output_index; diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h index 956a2af50..631171b4a 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.h +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.h @@ -52,6 +52,15 @@ Status GetPipelinedIOTensorsReadyForExecution( tuple& pipelined_io_tensors); +// Assembles the different types of input and output tensors into a right order +// Retrofit Variable tensors and pipelined tensors to ng_input and ng_outputs +// 1. For input indexes that are fed by variables, get the variable tensors from +// context +// 2. For output indexes that are updating variables, get the variable tensors +// from context +// This enable update-in-place +// 3. 
For input and output indexes that are pipelined, get the respective tensor +// Status GetIOTensorsReadyForExecution( OpKernelContext* ctx, const shared_ptr& tensor_manager, const PipelinedTensorVector& pipelined_in_tensors, @@ -59,10 +68,17 @@ Status GetIOTensorsReadyForExecution( vector>& ng_inputs, vector>& ng_outputs); +// Gets the Tensor from OpKernelContext's Container for the given shared_name Status GetTensorFromContext(const OpKernelContext* ctx, const string& shared_name, shared_ptr& ng_tensor); +// Encapsulate Op updates the NGVariable's device tensor in-place +// ie. the NGVariable's backend tensor is updated +// Some of these Variables may be required by the TF ops and they will use the +// host tensor +// These were marked as "copy-to-tf" True in the Rewrite Phase +// We will update these tensors here Status SyncOutputVarTensors( const OpKernelContext* ctx, const shared_ptr& tensor_manager); diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index ade24ede9..50c65dbc1 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -41,11 +41,7 @@ NGraphTensorManager::NGraphTensorManager(const string ng_encap_node_name, } void NGraphTensorManager::Initialize() { - cout << "Number of inputs " << m_number_of_inputs << endl; - cout << "Number of outputs " << m_number_of_outputs << endl; - #if defined(NGRAPH_TF_ENABLE_VARIABLES_AND_OPTIMIZERS) - // input variables book-keeping for (int index = 0; index < m_number_of_inputs; index++) { if (NGraphCatalog::ExistsInInputVariableSharedNameMap( @@ -91,7 +87,8 @@ void NGraphTensorManager::Initialize() { } // For graphs that were run through AOT - // Graph rewrite is not done + // Graph rewrite is not done, and there is no entry in catalog + // If there is not entry in catalog all outputs need to be copied if (!NGraphCatalog::EncapOutputNeedsCopy(m_ng_encap_graph_id, m_ng_encap_node_name)) { 
m_output_indexes_that_need_copy.resize(m_number_of_outputs); diff --git a/test/python/test_flib.py b/test/python/test_flib.py index 7da55a85f..f0c9b5b59 100644 --- a/test/python/test_flib.py +++ b/test/python/test_flib.py @@ -46,10 +46,7 @@ def test_flib_1(self): res1 = self.with_ngraph(sess_fn) res2 = self.without_ngraph(sess_fn) - print('res1') - print(res1) - print('res2') - print(res2) + exp = [np.full((2, 3), 3.0), np.full((2, 3), 0.95257413)] # Note both run on Host (because NgraphEncapsulate can only run on host) assert np.isclose(res1, res2).all() From 85882bdb8847658dcfff555cc93d53eda98e44c3 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 14:08:58 -0800 Subject: [PATCH 54/67] Removed ngraph-var in tracked_variable.cc --- ngraph_bridge/ngraph_tracked_variable.cc | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/ngraph_bridge/ngraph_tracked_variable.cc b/ngraph_bridge/ngraph_tracked_variable.cc index bf277b6c1..765e1a1a2 100644 --- a/ngraph_bridge/ngraph_tracked_variable.cc +++ b/ngraph_bridge/ngraph_tracked_variable.cc @@ -40,30 +40,6 @@ namespace ngraph_bridge { // (Changes: Renamed from LegacyVar, modified to take a TensorShape in // constructor.) -// THIS CLASS IS NOT BEING USED ANYWHERE -class NGraphVar : public ResourceBase { - public: - explicit NGraphVar(DataType dtype, TensorShape shape) - : tensor_(dtype, shape) {} - // Not copyable or movable. 
- NGraphVar(const NGraphVar&) = delete; - NGraphVar& operator=(const NGraphVar&) = delete; - - mutex* mu() { return &mu_; } - Tensor* tensor() { return &tensor_; } - - string DebugString() const override { - return strings::StrCat(DataTypeString(tensor_.dtype()), "/", - tensor_.shape().DebugString()); - } - - private: - mutex mu_; - Tensor tensor_; - - ~NGraphVar() override {} -}; - class NGraphVariableOp : public OpKernel { public: explicit NGraphVariableOp(OpKernelConstruction* context); From 983eb7b4d63c4c4e36f32b9c2eb0ba646d8a07d3 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 15:50:27 -0800 Subject: [PATCH 55/67] ngraph_tracked_variable.cc changes --- ngraph_bridge/ngraph_tracked_variable.cc | 30 +++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/ngraph_bridge/ngraph_tracked_variable.cc b/ngraph_bridge/ngraph_tracked_variable.cc index 765e1a1a2..b8dcf9325 100644 --- a/ngraph_bridge/ngraph_tracked_variable.cc +++ b/ngraph_bridge/ngraph_tracked_variable.cc @@ -40,6 +40,29 @@ namespace ngraph_bridge { // (Changes: Renamed from LegacyVar, modified to take a TensorShape in // constructor.) +// THIS CLASS IS NOT BEING USED ANYWHERE +class NGraphVar : public ResourceBase { + public: + explicit NGraphVar(DataType dtype, TensorShape shape) + : tensor_(dtype, shape) {} + // Not copyable or movable. 
+ NGraphVar(const NGraphVar&) = delete; + NGraphVar& operator=(const NGraphVar&) = delete; + + mutex* mu() { return &mu_; } + Tensor* tensor() { return &tensor_; } + + string DebugString() const override { + return strings::StrCat(DataTypeString(tensor_.dtype()), "/", + tensor_.shape().DebugString()); + } + + private: + mutex mu_; + Tensor tensor_; + ~NGraphVar() override {} +}; + class NGraphVariableOp : public OpKernel { public: explicit NGraphVariableOp(OpKernelConstruction* context); @@ -51,6 +74,8 @@ class NGraphVariableOp : public OpKernel { bool just_looking_; NGraphFreshnessTracker* tracker_; DataType dtype_; + int ng_graph_id_; + string ng_backend_name_; mutex init_mu_; ContainerInfo cinfo_ GUARDED_BY(init_mu_); @@ -74,6 +99,9 @@ NGraphVariableOp::NGraphVariableOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_)); OP_REQUIRES_OK(context, context->GetAttr("just_looking", &just_looking_)); + OP_REQUIRES_OK(context, context->GetAttr("ngraph_graph_id", &ng_graph_id_)); + OP_REQUIRES_OK(context, + context->GetAttr("_ngraph_backend", &ng_backend_name_)); NGRAPH_VLOG(5) << def().name() << ": just looking? 
" << just_looking_; } @@ -93,7 +121,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { initialized_ = true; } auto creator = [this](NGraphVar** var) { - *var = new NGraphVar(dtype_, shape_); + *var = new NGraphVar(dtype_, shape_, ng_backend_name_); //(*var)->tensor()->set_shape(shape_); return Status::OK(); }; From c99202764613534225fef1e9c61d8902cae5aebc Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 16:43:23 -0800 Subject: [PATCH 56/67] added traces --- .../enable_variable_ops/ngraph_assign_op.cc | 2 +- .../ngraph_tracked_variable.cc | 3 ++- .../ngraph_variable_update_ng_tensor_op.cc | 1 + ngraph_bridge/ngraph_encapsulate_op.cc | 2 +- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 17 +++++++++++++++-- ngraph_bridge/ngraph_tracked_variable.cc | 3 ++- 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc index 8eb1d5336..35099bbc7 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_assign_op.cc @@ -83,7 +83,7 @@ class NGraphAssignOp : public OpKernel { void Compute(OpKernelContext* context) override { std::ostringstream oss; - oss << "Execute: Assign_" << my_instance_id << ": " << name(); + oss << "NGAssign::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); NGRAPH_VLOG(4) << "NGraphAssign:: Compute called for: " << def().name() diff --git a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc index f821cadbc..8b5b81f68 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_tracked_variable.cc @@ -119,7 +119,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { << " ,backend_name " << ng_backend_name_; std::ostringstream oss; - oss << "NGraphVariable: " << my_instance_id << ": " << name(); + 
oss << "NGVariable::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); bool log_copies = false; @@ -250,6 +250,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes()); } var->Unref(); + event_compute.Stop(); ngraph::Event::write_trace(event_compute); } diff --git a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc index faee2334d..8755f6f76 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_variable_update_ng_tensor_op.cc @@ -67,6 +67,7 @@ NGraphVariableUpdateNGTensorOp::~NGraphVariableUpdateNGTensorOp() { void NGraphVariableUpdateNGTensorOp::Compute(OpKernelContext* context) { std::ostringstream oss; // Start event tracing + oss << "NGVariableUpdateNGTensor::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); bool log_copies = false; OP_REQUIRES_OK(context, diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 12141bf04..2ce5f46fc 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -397,7 +397,7 @@ NGraphEncapsulateOp::~NGraphEncapsulateOp() { // OpKernel::Compute //--------------------------------------------------------------------------- void NGraphEncapsulateOp::Compute(OpKernelContext* ctx) { - ngraph::Event event_compute("Compute", "", ""); + ngraph::Event event_compute("NGEncap::Compute::" + name(), name(), ""); if (m_use_parallel_executor) { NGRAPH_VLOG(1) << "NGraphEncapsulateOp::Compute: Using Parallel Executor"; diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index 57ee713a5..a2fe10a56 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -159,18 
+159,21 @@ Status GetPipelinedIOTensorsReadyForExecution( // Allocate the input/ ngraph::Event event_copy_input_tensor("Copy Pipelined Input Tensors", "", ""); - + std::vector> input_write_events; if (!skip_tf2ng_copy) { // All pipelined inputs are copied for (auto i = 0; i < pipelined_input_indexes.size(); i++) { int tf_index = pipelined_input_indexes[i]; - ng::element::Type ng_element_type; TF_RETURN_IF_ERROR(TFDataTypeToNGraphElementType( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[tf_index]); + + std::unique_ptr event_copy_h2d( + new ngraph::Event("H2D_Input_" + std::to_string(tf_index), "", "")); + try { ng_pipelined_inputs[i]->write( current_src_ptr, ng_pipelined_inputs[i]->get_element_count() * @@ -181,6 +184,8 @@ Status GetPipelinedIOTensorsReadyForExecution( } catch (...) { return errors::Internal("Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + input_write_events.push_back(std::move(event_copy_h2d)); } } else { // All pipelined inputs that are not prefetched are copied @@ -204,6 +209,8 @@ Status GetPipelinedIOTensorsReadyForExecution( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[tf_index]); + std::unique_ptr event_copy_h2d( + new ngraph::Event("H2D_Input_" + std::to_string(tf_index), "", "")); try { ng_pipelined_inputs[ng_index]->write( current_src_ptr, @@ -215,8 +222,14 @@ Status GetPipelinedIOTensorsReadyForExecution( } catch (...) 
{ return errors::Internal("Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + input_write_events.push_back(std::move(event_copy_h2d)); } } + + for (auto& next : input_write_events) { + ngraph::Event::write_trace(*next.get()); + } event_copy_input_tensor.Stop(); ngraph::Event::write_trace(event_copy_input_tensor); diff --git a/ngraph_bridge/ngraph_tracked_variable.cc b/ngraph_bridge/ngraph_tracked_variable.cc index b8dcf9325..6a55d96a3 100644 --- a/ngraph_bridge/ngraph_tracked_variable.cc +++ b/ngraph_bridge/ngraph_tracked_variable.cc @@ -112,7 +112,7 @@ NGraphVariableOp::~NGraphVariableOp() { tracker_->Unref(); } void NGraphVariableOp::Compute(OpKernelContext* ctx) { mutex_lock l(init_mu_); std::ostringstream oss; - oss << "NGraphVariable: " << my_instance_id << ": " << name(); + oss << "NGVariable::Compute::" << name(); ngraph::Event event_compute(oss.str(), name(), ""); if (!initialized_) { @@ -186,6 +186,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes()); } var->Unref(); + event_compute.Stop(); ngraph::Event::write_trace(event_compute); } From 5ae567a251ac14c96b996557b367db34c2a83bb7 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 16:54:34 -0800 Subject: [PATCH 57/67] fix build --- ngraph_bridge/ngraph_tracked_variable.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ngraph_bridge/ngraph_tracked_variable.cc b/ngraph_bridge/ngraph_tracked_variable.cc index 6a55d96a3..22b1e584e 100644 --- a/ngraph_bridge/ngraph_tracked_variable.cc +++ b/ngraph_bridge/ngraph_tracked_variable.cc @@ -74,8 +74,6 @@ class NGraphVariableOp : public OpKernel { bool just_looking_; NGraphFreshnessTracker* tracker_; DataType dtype_; - int ng_graph_id_; - string ng_backend_name_; mutex init_mu_; ContainerInfo cinfo_ GUARDED_BY(init_mu_); @@ -99,9 +97,6 @@ NGraphVariableOp::NGraphVariableOp(OpKernelConstruction* context) 
OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_)); OP_REQUIRES_OK(context, context->GetAttr("just_looking", &just_looking_)); - OP_REQUIRES_OK(context, context->GetAttr("ngraph_graph_id", &ng_graph_id_)); - OP_REQUIRES_OK(context, - context->GetAttr("_ngraph_backend", &ng_backend_name_)); NGRAPH_VLOG(5) << def().name() << ": just looking? " << just_looking_; } @@ -121,7 +116,7 @@ void NGraphVariableOp::Compute(OpKernelContext* ctx) { initialized_ = true; } auto creator = [this](NGraphVar** var) { - *var = new NGraphVar(dtype_, shape_, ng_backend_name_); + *var = new NGraphVar(dtype_, shape_); //(*var)->tensor()->set_shape(shape_); return Status::OK(); }; From ab33837c1c573e3a8ea2f630561783e6c57429fe Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 19:28:59 -0800 Subject: [PATCH 58/67] Var Rewrite pass calls EnterPrefetchInCatalog, fixed header guard, tensor manager print utility --- .../ngraph_rewrite_pass.cc | 8 ++++ .../ngraph_enter_prefetch_in_catalog.h | 4 +- ngraph_bridge/ngraph_tensor_manager.cc | 42 +++++++++++++++++++ ngraph_bridge/ngraph_tensor_manager.h | 2 + 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc b/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc index b764713ab..ea97ff417 100644 --- a/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc +++ b/ngraph_bridge/enable_variable_ops/ngraph_rewrite_pass.cc @@ -30,6 +30,7 @@ #include "ngraph_bridge/ngraph_cluster_manager.h" #include "ngraph_bridge/ngraph_deassign_clusters.h" #include "ngraph_bridge/ngraph_encapsulate_clusters.h" +#include "ngraph_bridge/ngraph_enter_prefetch_in_catalog.h" #include "ngraph_bridge/ngraph_mark_for_clustering.h" #include "ngraph_bridge/ngraph_rewrite_for_tracking.h" #include "ngraph_bridge/ngraph_utils.h" @@ -255,6 +256,13 @@ class NGraphEncapsulationPass : public NGraphRewritePass { "Graph with NGraphAssigns Optimized/Removed"); } + // 8.
Enter Prefetch in catalog then. + TF_RETURN_IF_ERROR(EnterPrefetchInCatalog(options.graph->get(), idx)); + if (DumpCatalogedGraphs()) { + DumpGraphs(options, idx, "prefetch-cataloged", + "Graph with Prefetched Inputs Entered in Catalog"); + } + return Status::OK(); } diff --git a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h index d7ab8cc9c..534166aa1 100644 --- a/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h +++ b/ngraph_bridge/ngraph_enter_prefetch_in_catalog.h @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ -#ifndef NGRAPH_TF_ENTER_IN_CATALOG_H_ -#define NGRAPH_TF_ENTER_IN_CATALOG_H_ +#ifndef NGRAPH_TF_ENTER_PREFETCH_IN_CATALOG_H_ +#define NGRAPH_TF_ENTER_PREFETCH_IN_CATALOG_H_ #pragma once #include "tensorflow/core/graph/graph.h" diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 84ad7be90..2fbcfa4c9 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -148,6 +148,48 @@ void NGraphTensorManager::Initialize() { m_pipelined_input_indexes_that_are_prefetched); m_pipelined_not_prefetched_input_indexes = FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); + Print(); +} + +//--------------------------------------------------------------------------- +// PrintVector Utility +//--------------------------------------------------------------------------- +void PrintVector(const vector& input_vector, const string title) { + cout << title << endl; + std::stringstream ss; + for (int val : input_vector) { + ss << val << " "; + } + cout << ss.str() << endl; +} + +//--------------------------------------------------------------------------- +// NGraphTensorManager::Print +//--------------------------------------------------------------------------- +void 
NGraphTensorManager::Print() { + cout << "** NGEncapsulate TensorManager:" << m_ng_encap_node_name << " **" + << endl; + + cout << "** Variables Related **" << endl; + PrintVector(m_input_indexes_from_variables, "Input Indexes from Variables"); + PrintVector(m_output_indexes_assigning_variable, + "Output Indexes Referring to Variables"); + PrintVector(m_output_indexes_that_need_copy, "Output Indexes to be Read"); + + cout << "** Pipelined **" << endl; + PrintVector(m_pipelined_input_indexes, "Pipelined Input Indexes"); + PrintVector(m_pipelined_output_indexes, "Pipelined Output Indexes"); + + cout << "** Prefetched **" << endl; + PrintVector(m_prefetched_input_indexes, "Prefetched Input Indexes"); + PrintVector(m_pipelined_not_prefetched_input_indexes, + "Pipelined But Not Prefetched Input Indexes"); + + cout << "** Prefetched wrt pipelined indexes **" << endl; + PrintVector(m_pipelined_input_indexes_that_are_prefetched, + "Prefetched Input Indexes wrt Pipelined Inputs"); + PrintVector(m_pipelined_input_indexes_that_are_not_prefetched, + "Not Prefetched Input Indexes wrt Pipelined Inputs"); } //--------------------------------------------------------------------------- diff --git a/ngraph_bridge/ngraph_tensor_manager.h b/ngraph_bridge/ngraph_tensor_manager.h index 9143241fb..73f2ca9d4 100644 --- a/ngraph_bridge/ngraph_tensor_manager.h +++ b/ngraph_bridge/ngraph_tensor_manager.h @@ -109,6 +109,8 @@ class NGraphTensorManager { Status GetOutputVariableCopyToTF(const int& output_index, bool* output_var_copy_to_tf); + void Print(); + private: void Initialize(); string m_ng_encap_node_name; From f8ae9372295091f7514967307e30f508ce16ace9 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Wed, 18 Dec 2019 19:36:41 -0800 Subject: [PATCH 59/67] small fix --- ngraph_bridge/ngraph_tensor_manager.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 2fbcfa4c9..9cd03d574 100644 --- 
a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -148,7 +148,6 @@ void NGraphTensorManager::Initialize() { m_pipelined_input_indexes_that_are_prefetched); m_pipelined_not_prefetched_input_indexes = FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); - Print(); } //--------------------------------------------------------------------------- From 85f9a390a4dd80c7c1d691d210957ee6614afd63 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 13:37:55 -0800 Subject: [PATCH 60/67] incorporate review comments --- ngraph_bridge/ngraph_tensor_manager.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 9cd03d574..5de0e2e32 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -88,7 +88,7 @@ void NGraphTensorManager::Initialize() { // For graphs that were run through AOT // Graph rewrite is not done, and there is no entry in catalog - // If there is not entry in catalog all outputs need to be copied + // If there is no entry in catalog all outputs need to be copied if (!NGraphCatalog::EncapOutputNeedsCopy(m_ng_encap_graph_id, m_ng_encap_node_name)) { m_output_indexes_that_need_copy.resize(m_number_of_outputs); @@ -155,11 +155,7 @@ void NGraphTensorManager::Initialize() { //--------------------------------------------------------------------------- void PrintVector(const vector& input_vector, const string title) { cout << title << endl; - std::stringstream ss; - for (int val : input_vector) { - ss << val << " "; - } - cout << ss.str() << endl; + cout << ng::join(input_vector) << endl; } //--------------------------------------------------------------------------- From a5f9ff28460e9fc0f45092266b5ade3ea1c56b4c Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 13:38:51 -0800 Subject: [PATCH 61/67] fixed path for axpy pipelined for 
test_ngtf.py --- tools/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/test_utils.py b/tools/test_utils.py index 12f2ead29..4c84a6d79 100755 --- a/tools/test_utils.py +++ b/tools/test_utils.py @@ -108,7 +108,7 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir = os.path.abspath(build_dir) venv_dir = os.path.abspath(venv_dir) mnist_dir = os.path.abspath(build_dir + '/examples/mnist/') - + axpy_dir = os.path.abspath(build_dir + '/examples/') test_dir = os.path.join(build_dir, "test") test_dir = os.path.join(test_dir, "python") @@ -130,7 +130,7 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir) + " --ignore=" + build_dir + "/test/python/bfloat16" env = os.environ.copy() new_paths = venv_dir + '/bin/python3:' + os.path.abspath( - build_dir) + ":" + os.path.abspath(mnist_dir) + build_dir) + ":" + os.path.abspath(mnist_dir) + os.path.abspath(axpy_dir) if 'PYTHONPATH' in env: env["PYTHONPATH"] = new_paths + ":" + env["PYTHONPATH"] else: From b639f9dd7a856a343cd237740b1d52c9b68cf1d0 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 13:55:00 -0800 Subject: [PATCH 62/67] Added more specific tracing for prefetched --- ngraph_bridge/ngraph_encapsulate_op.cc | 4 +++- ngraph_bridge/ngraph_encapsulate_op_utils.cc | 12 ++++++------ ngraph_bridge/ngraph_prefetch_dataset_op.cc | 12 +++++++++--- tools/test_utils.py | 3 ++- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 2ce5f46fc..5acd4eb5d 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -504,7 +504,9 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { ngraph::Event::write_trace(event_prepare_ng_tensors); // And execute - ngraph::Event event_execute_graph("Execute Graph", "", ""); + ngraph::Event event_execute_graph( + "Execute Graph Pipeline Indx" + 
to_string(current_iter_pipeline_depth), + "", ""); BackendManager::LockBackend(m_parallel_executor->GetOpBackendName()); NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute call starting for cluster " diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.cc b/ngraph_bridge/ngraph_encapsulate_op_utils.cc index a2fe10a56..d12494e45 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.cc +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.cc @@ -33,7 +33,7 @@ Status GetPipelinedIOTensorsReadyForExecution( OpKernelContext* ctx, const vector& tf_input_tensors, const shared_ptr& pipelined_tensor_store, const shared_ptr& tensor_manager, - std::tuple& + tuple& pipelined_io_tensors) { auto io_tensors = pipelined_tensor_store->get_tensors(); @@ -89,7 +89,7 @@ Status GetPipelinedIOTensorsReadyForExecution( tensor_manager->GetInputIndexesForPrefetchSharedObject()); // Get the set of IO tensors for the next iteration - std::tuple + tuple io_tensors_next_iter; io_tensors_next_iter = pipelined_tensor_store->get_tensors(); @@ -209,21 +209,21 @@ Status GetPipelinedIOTensorsReadyForExecution( tf_input_tensors[tf_index].dtype(), &ng_element_type)); void* current_src_ptr = (void*)DMAHelper::base(&tf_input_tensors[tf_index]); - std::unique_ptr event_copy_h2d( - new ngraph::Event("H2D_Input_" + std::to_string(tf_index), "", "")); + unique_ptr event_copy_h2d( + new ngraph::Event("H2D_Input_" + to_string(tf_index), "", "")); try { ng_pipelined_inputs[ng_index]->write( current_src_ptr, ng_pipelined_inputs[ng_index]->get_element_count() * ng_element_type.size()); - } catch (const std::exception& exp) { + } catch (const exception& exp) { return errors::Internal("Error copying TF tensor to device tensor: ", exp.what()); } catch (...) 
{ return errors::Internal("Error copying TF tensor to device tensor"); } event_copy_h2d->Stop(); - input_write_events.push_back(std::move(event_copy_h2d)); + input_write_events.push_back(move(event_copy_h2d)); } } diff --git a/ngraph_bridge/ngraph_prefetch_dataset_op.cc b/ngraph_bridge/ngraph_prefetch_dataset_op.cc index 18b946191..5435bfccb 100644 --- a/ngraph_bridge/ngraph_prefetch_dataset_op.cc +++ b/ngraph_bridge/ngraph_prefetch_dataset_op.cc @@ -415,14 +415,15 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { ngraph_bridge::NGraphPrefetchSharedResouce::RESOURCE_NAME, &shared_data); if (s.ok()) { - ngraph::Event evt_dev_cp("Prf Dev Copy", "Copy", ""); shared_data->SetBufferDepth(m_buffer_size); auto ng_input_tensor_bundle = shared_data->GetNextIOTensorBundleForDeviceTransfer(); auto ng_prefetch_input_indexes_map = shared_data->GetPrefetchInputIndexesMap(); - + ngraph::Event evt_dev_cp( + "Prf Dev Copy: Pipe_Ind_" + to_string(ng_input_tensor_bundle.Id), + "Copy", ""); int number_of_buffer_elements = buffer_element.value.size(); if (number_of_buffer_elements != ng_prefetch_input_indexes_map.size()) { @@ -433,7 +434,8 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { "encap " + to_string(ng_prefetch_input_indexes_map.size())); } - + std::vector> + prefetch_input_write_events; // Write to these tensors for (auto itr : ng_prefetch_input_indexes_map) { int ng_index = itr.first; @@ -445,6 +447,8 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { void* current_src_ptr = (void*)DMAHelper::base(&buffer_element.value[tf_index]); + std::unique_ptr event_copy_h2d(new ngraph::Event( + "H2D_PrefetchInput_" + std::to_string(tf_index), "", "")); try { NGRAPH_VLOG(2) << "[PREFETCH] INPUT tensor being written by Prefetch: " @@ -459,6 +463,8 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { throw std::runtime_error( "Error copying TF tensor to device tensor"); } + event_copy_h2d->Stop(); + 
prefetch_input_write_events.push_back(std::move(event_copy_h2d)); } // Now add them back to the other queue diff --git a/tools/test_utils.py b/tools/test_utils.py index 4c84a6d79..427dda0a9 100755 --- a/tools/test_utils.py +++ b/tools/test_utils.py @@ -130,7 +130,8 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir) + " --ignore=" + build_dir + "/test/python/bfloat16" env = os.environ.copy() new_paths = venv_dir + '/bin/python3:' + os.path.abspath( - build_dir) + ":" + os.path.abspath(mnist_dir) + os.path.abspath(axpy_dir) + build_dir) + ":" + os.path.abspath(mnist_dir) + os.path.abspath( + axpy_dir) if 'PYTHONPATH' in env: env["PYTHONPATH"] = new_paths + ":" + env["PYTHONPATH"] else: From 0c40739989d1965dcba5d1799bbc0a13de80cc6f Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 16:05:54 -0800 Subject: [PATCH 63/67] incorporate review comments --- ngraph_bridge/ngraph_encapsulate_op.cc | 2 +- tools/test_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op.cc b/ngraph_bridge/ngraph_encapsulate_op.cc index 5acd4eb5d..4605757ae 100644 --- a/ngraph_bridge/ngraph_encapsulate_op.cc +++ b/ngraph_bridge/ngraph_encapsulate_op.cc @@ -496,7 +496,7 @@ void NGraphEncapsulateOp::ComputeUsingParallelExecutor(OpKernelContext* ctx) { vector> ng_outputs(num_of_outputs); // Prepare NG Input Output Tensors - // Retrofit Variable tensors and pipelined tensors to ng_input and ng_outputs + // Assemble Variable tensors and pipelined tensors to ng_input and ng_outputs OP_REQUIRES_OK(ctx, GetIOTensorsReadyForExecution( ctx, tensor_manager, get<1>(pipelined_io_tensors), get<2>(pipelined_io_tensors), ng_inputs, ng_outputs)); diff --git a/tools/test_utils.py b/tools/test_utils.py index 427dda0a9..207097567 100755 --- a/tools/test_utils.py +++ b/tools/test_utils.py @@ -130,8 +130,8 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir) + " --ignore=" + build_dir + "/test/python/bfloat16" env = 
os.environ.copy() new_paths = venv_dir + '/bin/python3:' + os.path.abspath( - build_dir) + ":" + os.path.abspath(mnist_dir) + os.path.abspath( - axpy_dir) + build_dir) + ":" + os.path.abspath(axpy_dir) + os.path.abspath( + mnist_dir) if 'PYTHONPATH' in env: env["PYTHONPATH"] = new_paths + ":" + env["PYTHONPATH"] else: From 4b21cc1e65a0e67664a9aa5d26343952427c9673 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 16:14:15 -0800 Subject: [PATCH 64/67] minor --- ngraph_bridge/ngraph_encapsulate_op_utils.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ngraph_bridge/ngraph_encapsulate_op_utils.h b/ngraph_bridge/ngraph_encapsulate_op_utils.h index 631171b4a..1a6df4ede 100644 --- a/ngraph_bridge/ngraph_encapsulate_op_utils.h +++ b/ngraph_bridge/ngraph_encapsulate_op_utils.h @@ -52,8 +52,9 @@ Status GetPipelinedIOTensorsReadyForExecution( tuple& pipelined_io_tensors); -// Assembles the different types of input and output tensors into a right order -// Retrofit Variable tensors and pipelined tensors to ng_input and ng_outputs +// Assembles the different types of input and output tensors +// Variable tensors and pipelined tensors are put together in the right order +// into ng_inputs and ng_outputs // 1. For input indexes that are fed by variables, get the variable tensors from // context // 2. For output indexes that are updating variables, get the variable tensors From 36f4bec05a3748681ab6186082aca216ea7d53ba Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 16:49:10 -0800 Subject: [PATCH 65/67] removed print vector. 
added lambda --- ngraph_bridge/ngraph_tensor_manager.cc | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ngraph_bridge/ngraph_tensor_manager.cc b/ngraph_bridge/ngraph_tensor_manager.cc index 5de0e2e32..116c213ec 100644 --- a/ngraph_bridge/ngraph_tensor_manager.cc +++ b/ngraph_bridge/ngraph_tensor_manager.cc @@ -150,18 +150,15 @@ void NGraphTensorManager::Initialize() { FindComplement(m_pipelined_input_indexes, m_prefetched_input_indexes); } -//--------------------------------------------------------------------------- -// PrintVector Utility -//--------------------------------------------------------------------------- -void PrintVector(const vector& input_vector, const string title) { - cout << title << endl; - cout << ng::join(input_vector) << endl; -} - //--------------------------------------------------------------------------- // NGraphTensorManager::Print //--------------------------------------------------------------------------- void NGraphTensorManager::Print() { + auto PrintVector = [](const vector& input_vector, const string title) { + cout << title << endl; + cout << ng::join(input_vector) << endl; + }; + cout << "** NGEncapsulate TensorManager:" << m_ng_encap_node_name << " **" << endl; From bf3b84664a35ec43aa5b5e5d52ccffbbee937728 Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 17:00:18 -0800 Subject: [PATCH 66/67] fix test_utils.py --- tools/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/test_utils.py b/tools/test_utils.py index 207097567..e8f5752d3 100755 --- a/tools/test_utils.py +++ b/tools/test_utils.py @@ -130,7 +130,7 @@ def run_ngtf_pytests(venv_dir, build_dir): build_dir) + " --ignore=" + build_dir + "/test/python/bfloat16" env = os.environ.copy() new_paths = venv_dir + '/bin/python3:' + os.path.abspath( - build_dir) + ":" + os.path.abspath(axpy_dir) + os.path.abspath( + build_dir) + ":" + os.path.abspath(axpy_dir) + ":" + os.path.abspath( mnist_dir) 
if 'PYTHONPATH' in env: env["PYTHONPATH"] = new_paths + ":" + env["PYTHONPATH"] From 8e346e8122f888155595c51f31ac529e443efbaf Mon Sep 17 00:00:00 2001 From: Shrestha Malik Date: Thu, 19 Dec 2019 17:46:18 -0800 Subject: [PATCH 67/67] write prefetch traces --- ngraph_bridge/ngraph_prefetch_dataset_op.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ngraph_bridge/ngraph_prefetch_dataset_op.cc b/ngraph_bridge/ngraph_prefetch_dataset_op.cc index 5435bfccb..7c131bcce 100644 --- a/ngraph_bridge/ngraph_prefetch_dataset_op.cc +++ b/ngraph_bridge/ngraph_prefetch_dataset_op.cc @@ -448,7 +448,7 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { void* current_src_ptr = (void*)DMAHelper::base(&buffer_element.value[tf_index]); std::unique_ptr event_copy_h2d(new ngraph::Event( - "H2D_PrefetchInput_" + std::to_string(tf_index), "", "")); + "H2D_PrefetchInput_" + std::to_string(tf_index), "Copy", "")); try { NGRAPH_VLOG(2) << "[PREFETCH] INPUT tensor being written by Prefetch: " @@ -467,6 +467,10 @@ class NGraphPrefetchDatasetOp::Dataset : public DatasetBase { prefetch_input_write_events.push_back(std::move(event_copy_h2d)); } + for (auto& next : prefetch_input_write_events) { + ngraph::Event::write_trace(*next.get()); + } + // Now add them back to the other queue shared_data->AddNextIOTensorBundleReadyForDeviceExecution( ng_input_tensor_bundle);