Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Branch 163409348 #11837

Merged
merged 31 commits into from Jul 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
32e198f
[TF:XLA] Add tf.cross support.
cdleary Jul 27, 2017
fd5de46
[XLA] Add regression test for a corner case using Reduce that current…
tensorflower-gardener Jul 27, 2017
43036ac
Remove unnecessary break statements.
tensorflower-gardener Jul 27, 2017
4404aa7
[XLA] Add TODO comment explaining why the IsScalar check exists.
tensorflower-gardener Jul 27, 2017
a49fe03
Remove platform bridge for grpc_response_reader.
Jul 27, 2017
a524701
Sets the incarnation number even when the attribute is set.
tensorflower-gardener Jul 27, 2017
631a364
[XLA] Add Reduce, DynamicSlice and DynamicSliceUpdate to HloEvaluator.
kayzhu Jul 27, 2017
dd1f0cd
Supports lookup devices by fullname either in the canonical form or the
tensorflower-gardener Jul 27, 2017
cda80a7
[tpu profiler] Dump HLO graphs in profile responses to the log direct…
Jul 27, 2017
722f6f3
Fix TensorForest's saveable object names so loading a savedmodel works.
tensorflower-gardener Jul 27, 2017
e4a5c53
["Variable", "VariableV2", "VarHandleOp"] is the default for ps_ops=None
tfboyd Jul 27, 2017
34cbf16
Update Dataset API documentation.
jsimsa Jul 27, 2017
a36bca2
Remove ShapeWithoutPadding() utility function, as it is no longer nee…
tayo Jul 27, 2017
ddd8e21
[XLA] Consolidate all similar main()s in tests into a single target.
eliben Jul 27, 2017
2139e7d
[tf.contrib.data] map expects a nested structure.
tensorflower-gardener Jul 27, 2017
376bb06
Look inside functions to see which node types are used.
petewarden Jul 27, 2017
86ca350
Further BUILD cleanup
tensorflower-gardener Jul 27, 2017
ae3119d
Small changes to op framework.
tensorflower-gardener Jul 27, 2017
3b97f1f
Change to only run one round of matmul benchmark.
Jul 27, 2017
8bc0236
PiperOrigin-RevId: 163366493
tensorflower-gardener Jul 27, 2017
9f131bd
Internal change
tensorflower-gardener Jul 27, 2017
4653d37
[XLA] Change type to appease GPU builds.
eliben Jul 27, 2017
613bf1c
fix asan test failure in SingleMachineTest::ReleaseMemoryAfterDestruc…
Jul 27, 2017
2265108
C API: Groundwork for experimenting with TF_Tensor in device memory.
asimshankar Jul 27, 2017
e5353c9
Don't prune nodes that have reference inputs.
tensorflower-gardener Jul 27, 2017
28373cf
Adds preliminary support for Cloud TPUs with Cluster Resolvers. This …
Jul 27, 2017
6b7314d
Consolidating the code to fill the partition's function library
tensorflower-gardener Jul 27, 2017
ce1c7f0
Properly include logging header in xla_internal_test_main
eliben Jul 27, 2017
d5cc143
Increase timeout to deflake the test.
tensorflower-gardener Jul 27, 2017
905abb1
Test asserts should have `expected` first.
tensorflower-gardener Jul 27, 2017
5a32c7d
Merge commit for internal changes
Jul 28, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions tensorflow/c/BUILD
Expand Up @@ -61,6 +61,7 @@ tf_cuda_library(
"//tensorflow/cc:grad_ops",
"//tensorflow/cc:scope_internal",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:lib",
Expand Down
93 changes: 71 additions & 22 deletions tensorflow/c/c_api.cc
Expand Up @@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/cc/saved_model/loader.h"
#endif
#include "tensorflow/c/c_api_internal.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/common_runtime/shape_refiner.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/log_memory.h"
Expand Down Expand Up @@ -79,6 +80,7 @@ using tensorflow::TensorId;
using tensorflow::TensorShape;
using tensorflow::TensorShapeProto;
using tensorflow::error::Code;
using tensorflow::errors::FailedPrecondition;
using tensorflow::errors::InvalidArgument;
using tensorflow::gtl::ArraySlice;
using tensorflow::mutex_lock;
Expand Down Expand Up @@ -179,6 +181,26 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,

} // namespace

// Wraps a buffer that is not backed by any session-owned device:
// device_owner_ is null (so on_cpu() reports true) and device_index_ gets
// the -1 sentinel.
TF_BufferAndDevice::TF_BufferAndDevice(TensorBuffer* buffer)
: buffer_(buffer), device_owner_(nullptr), device_index_(-1) {}

// Wraps a buffer whose memory belongs to a device owned by `session`
// (session->devices[device_index]). Increments the session's
// num_outstanding_buffers under the session mutex so the session cannot be
// deleted while this wrapper is alive (TF_DeleteSession checks the count).
TF_BufferAndDevice::TF_BufferAndDevice(TensorBuffer* buffer,
TF_Session* session, int device_index)
: buffer_(buffer), device_owner_(session), device_index_(device_index) {
mutex_lock l(device_owner_->mu);
device_owner_->num_outstanding_buffers++;
}

// Drops this wrapper's reference on the TensorBuffer and, if the buffer was
// device-backed, decrements the owning session's outstanding-buffer count
// under the session mutex (balancing the increment in the constructor).
TF_BufferAndDevice::~TF_BufferAndDevice() {
buffer_->Unref();
if (device_owner_ != nullptr) {
mutex_lock l(device_owner_->mu);
device_owner_->num_outstanding_buffers--;
}
}

TF_Tensor::~TF_Tensor() { delete buffer; }

TF_Tensor* TF_AllocateTensor(TF_DataType dtype, const int64_t* dims,
int num_dims, size_t len) {
void* data = allocate_tensor("TF_AllocateTensor", len);
Expand Down Expand Up @@ -211,33 +233,35 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
buf->deallocator_ = deallocator;
buf->deallocator_arg_ = deallocator_arg;
}
return new TF_Tensor{dtype, TensorShape(dimvec), buf};
return new TF_Tensor{dtype, TensorShape(dimvec), new TF_BufferAndDevice(buf)};
}

TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) {
// It is safe to move the Tensor if and only if we own the unique reference to
// it. In that case, we might as well not delete and reallocate, but a future
// implementation might need to do so.
if (tensor->buffer->RefCountIsOne() &&
tensor->buffer->root_buffer()->RefCountIsOne() &&
tensor->buffer->OwnsMemory()) {
TensorBuffer* buf = tensor->buffer->buffer();
if (buf->RefCountIsOne() && buf->root_buffer()->RefCountIsOne() &&
buf->OwnsMemory()) {
return tensor;
}
return nullptr;
}

void TF_DeleteTensor(TF_Tensor* t) {
t->buffer->Unref();
delete t;
}
void TF_DeleteTensor(TF_Tensor* t) { delete t; }

// Simple read-only accessors over the TF_Tensor's stored dtype and shape.
TF_DataType TF_TensorType(const TF_Tensor* t) { return t->dtype; }
int TF_NumDims(const TF_Tensor* t) { return t->shape.dims(); }
int64_t TF_Dim(const TF_Tensor* t, int dim_index) {
// Cast to the fixed-width int64_t type exposed by the C API.
return static_cast<int64_t>(t->shape.dim_size(dim_index));
}
size_t TF_TensorByteSize(const TF_Tensor* t) { return t->buffer->size(); }
void* TF_TensorData(const TF_Tensor* t) { return t->buffer->data(); }
// Byte size of the underlying (possibly device-resident) buffer.
size_t TF_TensorByteSize(const TF_Tensor* t) {
return t->buffer->buffer()->size();
}
// Returns a pointer to the tensor's data, or NULL when the backing buffer is
// not in host memory (matches the contract documented in c_api.h).
void* TF_TensorData(const TF_Tensor* t) {
if (t->buffer->on_cpu()) return t->buffer->buffer()->data();
return nullptr;
}

// --------------------------------------------------------------------------
size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
Expand Down Expand Up @@ -396,7 +420,8 @@ namespace tensorflow {

Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) {
if (src->dtype != TF_STRING) {
*dst = TensorCApi::MakeTensor(src->dtype, src->shape, src->buffer);
*dst =
TensorCApi::MakeTensor(src->dtype, src->shape, src->buffer->buffer());
return Status::OK();
}
// TF_STRING tensors require copying since Tensor class expects a sequence of
Expand Down Expand Up @@ -437,7 +462,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src) {
TensorBuffer* buf = TensorCApi::Buffer(src);
buf->Ref();
return new TF_Tensor{static_cast<TF_DataType>(src.dtype()), src.shape(),
buf};
new TF_BufferAndDevice(buf)};
}
// DT_STRING tensors require a copying since TF_Tensor.buffer expects a flatly
// encoded sequence of strings.
Expand Down Expand Up @@ -2119,6 +2144,17 @@ void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx,

// TF_Session functions ----------------------------------------------

// Initializes the wrapper around session `s` and graph `g`. Device
// information is filled in best-effort: if Session::LocalDeviceManager
// fails, device_mgr stays null and `devices` stays empty.
TF_Session::TF_Session(tensorflow::Session* s, TF_Graph* g)
: session(s),
graph(g),
last_num_graph_nodes(0),
device_mgr(nullptr),
num_outstanding_buffers(0) {
if (s->LocalDeviceManager(&device_mgr).ok()) {
devices = device_mgr->ListDevices();
}
}

TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
TF_Status* status) {
Session* session;
Expand Down Expand Up @@ -2149,7 +2185,6 @@ TF_Session* TF_LoadSessionFromSavedModel(
return nullptr;
#else
mutex_lock l(graph->mu);

if (!graph->name_map.empty()) {
status->status = InvalidArgument("Graph is non-empty.");
return nullptr;
Expand Down Expand Up @@ -2203,16 +2238,30 @@ void TF_CloseSession(TF_Session* s, TF_Status* status) {
}

void TF_DeleteSession(TF_Session* s, TF_Status* status) {
status->status = Status::OK();
TF_Graph* const graph = s->graph;
if (graph != nullptr) {
graph->mu.lock();
graph->num_sessions -= 1;
const bool del = graph->delete_requested && graph->num_sessions == 0;
graph->mu.unlock();
if (del) delete graph;
{
mutex_lock l(s->mu);
if (s->num_outstanding_buffers > 0) {
// This can probably be relaxed: An alternative might be to mark
// this session for deletion and do the actual delete only when
// the last TF_BufferAndDevice has been deleted.
status->status = FailedPrecondition(
s->num_outstanding_buffers,
" TF_Tensor objects with memory backed by a device "
"owned by this TF_Session are still alive. Release "
"them using TF_DeleteTensor and retry");
return;
}
status->status = Status::OK();
TF_Graph* const graph = s->graph;
if (graph != nullptr) {
graph->mu.lock();
graph->num_sessions -= 1;
const bool del = graph->delete_requested && graph->num_sessions == 0;
graph->mu.unlock();
if (del) delete graph;
}
delete s->session;
}
delete s->session;
delete s;
}

Expand Down
3 changes: 3 additions & 0 deletions tensorflow/c/c_api.h
Expand Up @@ -263,6 +263,9 @@ TF_CAPI_EXPORT extern int64_t TF_Dim(const TF_Tensor* tensor, int dim_index);
TF_CAPI_EXPORT extern size_t TF_TensorByteSize(const TF_Tensor*);

// Return a pointer to the underlying data buffer.
//
// Returns NULL if the underlying data is not in host memory
// (for example, if it refers to addresses in GPU memory).
TF_CAPI_EXPORT extern void* TF_TensorData(const TF_Tensor*);

// --------------------------------------------------------------------------
Expand Down
53 changes: 45 additions & 8 deletions tensorflow/c/c_api_internal.h
Expand Up @@ -18,19 +18,25 @@ limitations under the License.

#include "tensorflow/c/c_api.h"

#include <vector>
#include <unordered_map>
#include <vector>

#include "tensorflow/core/common_runtime/shape_refiner.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/common_runtime/shape_refiner.h"
#include "tensorflow/core/public/session.h"

namespace tensorflow {
class Device;
class DeviceMgr;
} // namespace tensorflow
class TF_BufferAndDevice;

// Internal structures used by the C API. These are likely to change and should
// not be depended on.
Expand All @@ -40,9 +46,11 @@ struct TF_Status {
};

struct TF_Tensor {
~TF_Tensor();

TF_DataType dtype;
tensorflow::TensorShape shape;
tensorflow::TensorBuffer* buffer;
TF_BufferAndDevice* buffer;
};

struct TF_SessionOptions {
Expand Down Expand Up @@ -100,12 +108,19 @@ struct TF_Operation {
};

struct TF_Session {
TF_Session(tensorflow::Session* s, TF_Graph* g)
: session(s), graph(g), last_num_graph_nodes(0) {}
TF_Session(tensorflow::Session* s, TF_Graph* g);

tensorflow::Session* session;
TF_Graph* graph;

tensorflow::mutex mu;
int last_num_graph_nodes;

// NOTE(ashankar): Experimental fields to help keep the
// buffers of a TF_Tensor pinned in device memory.
const tensorflow::DeviceMgr* device_mgr; // Owned by session.
std::vector<tensorflow::Device*> devices; // Owned by device_mgr.
int num_outstanding_buffers GUARDED_BY(mu);
};

struct TF_ImportGraphDefOptions {
Expand All @@ -116,6 +131,28 @@ struct TF_DeviceList {
std::vector<tensorflow::DeviceAttributes> response;
};

// TF_BufferAndDevice encapsulates the memory addresses of data backing a Tensor
// and the device (e.g., GPU or host) whose memory the addresses refer to.
class TF_BufferAndDevice {
public:
// Wraps a buffer with no owning device (treated as host-resident).
explicit TF_BufferAndDevice(tensorflow::TensorBuffer* buffer);
// Wraps a buffer backed by session->devices[device_index]; bumps the
// session's outstanding-buffer count for this wrapper's lifetime.
TF_BufferAndDevice(tensorflow::TensorBuffer* buffer, TF_Session* session,
int device_index);
~TF_BufferAndDevice();

// The wrapped, reference-counted buffer (Unref'd by the destructor).
tensorflow::TensorBuffer* buffer() const { return buffer_; }
// Device whose memory backs buffer(), or null when there is none.
tensorflow::Device* device() const {
if (device_owner_ == nullptr) return nullptr;
return device_owner_->devices[device_index_];
}
// True when the buffer is not pinned to a session-owned device.
bool on_cpu() const { return device() == nullptr; }

private:
tensorflow::TensorBuffer* buffer_;
TF_Session* device_owner_; // Not owned; null for host-resident buffers.
const int device_index_; // Index into device_owner_->devices; -1 if unused.
};

namespace tensorflow {

class TensorCApi {
Expand Down
11 changes: 5 additions & 6 deletions tensorflow/compiler/plugin/executor/executable.cc
Expand Up @@ -30,17 +30,17 @@ ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)

ExecutorExecutable::~ExecutorExecutable() {}

static se::DeviceMemoryBase AllocateSingleOutput(sep::ExecutorExecutor* executor,
const Literal& literal) {
static se::DeviceMemoryBase AllocateSingleOutput(
sep::ExecutorExecutor* executor, const Literal& literal) {
int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
void* buf = executor->Allocate(size);
const void* src = literal.InternalData();
memcpy(buf, src, size);
return se::DeviceMemoryBase(buf, size);
}

static se::DeviceMemoryBase AllocateOutputBuffer(sep::ExecutorExecutor* executor,
const Literal& literal) {
static se::DeviceMemoryBase AllocateOutputBuffer(
sep::ExecutorExecutor* executor, const Literal& literal) {
const Shape& shape = literal.shape();
if (shape.element_type() != xla::TUPLE) {
return AllocateSingleOutput(executor, literal);
Expand Down Expand Up @@ -97,7 +97,7 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
// Execute the graph using the evaluator
HloEvaluator evaluator;
TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> output,
evaluator.Evaluate(computation, arg_literals_ptrs));
evaluator.Evaluate(*computation, arg_literals_ptrs));

// Copy the result into the return buffer
perftools::gputools::StreamExecutor* executor(stream->parent());
Expand Down Expand Up @@ -140,6 +140,5 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}


} // namespace executorplugin
} // namespace xla
18 changes: 18 additions & 0 deletions tensorflow/compiler/tests/binary_ops_test.py
Expand Up @@ -773,6 +773,24 @@ def testTranspose(self):
np.array([1, 0], dtype=np.int32),
expected=np.array([[1, 3], [2, 4]], dtype=dtype))

def testCross(self):
  """Checks the XLA tf.cross kernel against hand-computed cross products.

  Covers an all-zero batch, a single pair of 3-vectors, and a batched
  (2, 3) input, for every supported floating-point dtype.
  """
  cases = [
      # (lhs, rhs, expected lhs x rhs); the trailing dimension is always 3.
      (np.zeros((4, 3)), np.zeros((4, 3)), np.zeros((4, 3))),
      ([1, 2, 3], [4, 5, 6], [-3, 6, -3]),
      ([[1, 2, 3], [10, 11, 12]], [[4, 5, 6], [40, 50, 60]],
       [[-3, 6, -3], [60, -120, 60]]),
  ]
  for dtype in self.float_types:
    for lhs, rhs, want in cases:
      self._testBinary(
          gen_math_ops.cross,
          np.array(lhs, dtype=dtype),
          np.array(rhs, dtype=dtype),
          expected=np.array(want, dtype=dtype))


# Standard TensorFlow test-runner entry point.
if __name__ == "__main__":
googletest.main()
1 change: 1 addition & 0 deletions tensorflow/compiler/tf2xla/kernels/BUILD
Expand Up @@ -24,6 +24,7 @@ tf_kernel_library(
"concat_op.cc",
"const_op.cc",
"conv_ops.cc",
"cross_op.cc",
"cwise_ops.cc",
"depthwise_conv_ops.cc",
"diag_op.cc",
Expand Down