TensorFlow: upstream changes to git.
Change 109321497
	Move all images to the images directory to make docs versioning easier
	- adjust all paths in the docs to point to the new locations
	- remove some now redundant section-order tags added for the old website
Change 109317807
	Added a kernel op to compute the eigendecomposition of a self-adjoint matrix.

	Added a new kernel op called self_adjoint_eig (and a batch_self_adjoint_eig) that
	computes the eigendecomposition of a self-adjoint matrix. The return value is
	the eigenvalues (as a row vector) concatenated with the eigenvectors.
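For illustration, a minimal usage sketch, assuming the op is exposed in Python as tf.self_adjoint_eig and that, per the description above, it returns a single [n+1, n] tensor whose first row holds the eigenvalues:

import tensorflow as tf

# Hedged sketch, not verified against the new op's exact API.
matrix = tf.constant([[2.0, 1.0], [1.0, 2.0]])  # self-adjoint (symmetric)
packed = tf.self_adjoint_eig(matrix)

with tf.Session() as sess:
    result = sess.run(packed)

eigenvalues = result[0]    # first row: the eigenvalues
eigenvectors = result[1:]  # remaining rows: the eigenvectors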
Change 109310773
	Change `_read32()` in the MNIST input example to return an int.

	Currently we return a 1-D numpy array with a single element. NumPy has
	recently deprecated treating such an array as a scalar, and as a result
	this tutorial fails. The fix returns the 0th element of the array instead.
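For illustration, a hedged sketch of the shape of the fix (the helper name comes from the MNIST tutorial; the exact code may differ):

import numpy as np

def _read32(bytestream):
    # MNIST files store sizes as 4-byte big-endian unsigned ints.
    dt = np.dtype(np.uint32).newbyteorder('>')
    # frombuffer returns a 1-element array; index element 0 so callers
    # get a scalar instead of relying on deprecated array-as-scalar use.
    return np.frombuffer(bytestream.read(4), dtype=dt)[0]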
Change 109301269
	Re-arrange TensorBoard demo files.
Change 109273589
	add ci_build for ci.tensorflow.org
Change 109260293
	Speed up the NodeDef -> OpKernel conversion by not spending time generating
	an error message for a missing "_kernel" attr that will be thrown away.
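The pattern, as a hedged Python sketch with hypothetical names (the actual C++ change is in AttrSlice::Find in node_def_util.cc, shown below):

def find_attr(attrs, attr_name, node_def=None):
    # Sketch: build the expensive part of an error message only when it
    # is likely to be surfaced.
    if attr_name in attrs:
        return attrs[attr_name]
    msg = "No attr named '%s' in NodeDef:" % attr_name
    # Internal attrs (leading "_") are commonly and correctly absent,
    # and attaching the full NodeDef is relatively expensive, so skip it.
    if not attr_name.startswith("_") and node_def is not None:
        msg += " " + str(node_def)
    raise KeyError(msg)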
Change 109257179
	TensorFlow: make event_file_loader_test hermetic by using tempfile
	instead of fixed filenames. Without this change, running
	event_file_loader_test twice in the same client (locally)
	causes it to fail, because the second run writes into the same file
	and appends another event instead of starting from scratch.
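The technique, as a hedged sketch (hypothetical test helper, not the actual test code):

import os
import tempfile

def fresh_event_file_path():
    # Each run gets a brand-new file, so a second invocation in the same
    # client cannot append events to leftovers from the first.
    fd, path = tempfile.mkstemp(suffix='.events')
    os.close(fd)
    return path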
Change 109256464
	Minor cleanup in TensorBoard server code
Change 109255382
	Change to reduce critical section times in gpu_event_mgr.h:
	(1) Call stream->ThenRecordEvent outside the EventMgr critical section
	(2) Do memory deallocation outside the critical section

	Speeds up one configuration of ptb_word_lm from 2924 words per
	second (wps) to 3278 wps on my desktop machine with a Titan X.
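The pattern described, as a hedged Python sketch with hypothetical event objects (the actual change is in the EventMgr C++ code below):

import threading

lock = threading.Lock()
pending = []

def poll_and_free():
    # Do only the bookkeeping under the lock...
    with lock:
        done = [e for e in pending if e.is_complete()]
        for e in done:
            pending.remove(e)
    # ...and do the slow work (deallocation, callbacks) outside it, so
    # other threads spend less time blocked on the lock.
    for e in done:
        e.release_memory()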
Change 109254843
	Fix use of uninitialized memory in test.
Change 109250995
	python_config.sh needs a license header

	Otherwise the license test fails.
Change 109249914
	add ci_build for ci.tensorflow.org
Change 109249397
	Fixes segfaults in reduce_sum on complex inputs on the GPU.

	Fixes #357
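A hedged repro sketch for the class of failure fixed here, using the 0.x-era Python API (assumes a CUDA-enabled build with a visible GPU):

import tensorflow as tf

# Summing a complex64 tensor on the GPU; the fix pins the
# "reduction_indices" input of the GPU Sum kernel to host memory.
x = tf.complex(tf.ones([4, 4]), tf.zeros([4, 4]))
s = tf.reduce_sum(x, 0)

with tf.Session() as sess:
    print(sess.run(s))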

Change 109245652
	add ci_build for ci.tensorflow.org

Base CL: 109321563
Vijay Vasudevan committed Dec 3, 2015
1 parent bb7a7a8 commit a4806a3
Showing 64 changed files with 1,016 additions and 298 deletions.
4 changes: 3 additions & 1 deletion tensorflow/cc/BUILD
@@ -2,7 +2,9 @@
 # TensorFlow is a computational framework, primarily for use in machine
 # learning applications.
 
-package(default_visibility = ["//tensorflow:internal"])
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
4 changes: 3 additions & 1 deletion tensorflow/core/BUILD
@@ -2,7 +2,9 @@
 # TensorFlow is a computational framework, primarily for use in machine
 # learning applications.
 
-package(default_visibility = ["//tensorflow:internal"])
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
 
 package_group(name = "friends")
 
34 changes: 17 additions & 17 deletions tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -40,13 +40,13 @@ EventMgr::~EventMgr() {
     delete e;
   }
   while (!used_events_.empty()) {
-    InUse* ue = &used_events_[0];
-    delete ue->event;
-    delete ue->mem;
-    if (ue->bufrec.buf) {
-      ue->bufrec.alloc->DeallocateRaw(ue->bufrec.buf);
+    delete used_events_[0].event;
+    delete used_events_[0].mem;
+    if (used_events_[0].bufrec.buf) {
+      used_events_[0].bufrec.alloc->DeallocateRaw(used_events_[0].bufrec.buf);
     }
-    if (ue->func != nullptr) threadpool_.Schedule(ue->func);
+    if (used_events_[0].func != nullptr)
+      threadpool_.Schedule(used_events_[0].func);
     used_events_.pop_front();
   }
 }
@@ -60,17 +60,15 @@ EventMgr::~EventMgr() {
 void EventMgr::PollLoop() {
   while (!stop_polling_.HasBeenNotified()) {
     Env::Default()->SleepForMicroseconds(1 * 1000);
-    ToFreeVector to_free;
     {
       mutex_lock l(mu_);
-      PollEvents(true, &to_free);
+      PollEvents(true);
     }
-    FreeMemory(to_free);
   }
   polling_stopped_.Notify();
 }
 
-void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
+void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu) {
   VLOG(2) << "QueueInUse free_events_ " << free_events_.size()
           << " used_events_ " << used_events_.size();
   // Events are created on demand, and repeatedly reused. There is no
@@ -79,9 +77,10 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
     free_events_.push_back(new gpu::Event(exec_));
     free_events_.back()->Init();
   }
-  *e = free_events_.back();
+  gpu::Event* e = free_events_.back();
   free_events_.pop_back();
-  iu.event = *e;
+  stream->ThenRecordEvent(e);
+  iu.event = e;
   used_events_.push_back(iu);
 }
 
@@ -104,8 +103,7 @@ void EventMgr::QueueInUse(gpu::Stream* stream, InUse iu, gpu::Event** e) {
 // GPU memory use to spike needlessly. An alternative strategy would
 // be to throttle new Op execution until the pending event queue
 // clears.
-void EventMgr::PollEvents(bool is_dedicated_poller,
-                          gtl::InlinedVector<InUse, 4>* to_free) {
+void EventMgr::PollEvents(bool is_dedicated_poller) {
   VLOG(2) << "PollEvents free_events_ " << free_events_.size()
           << " used_events_ " << used_events_.size();
   // Sweep the remaining events in order. If this is the dedicated
@@ -125,9 +123,11 @@ void EventMgr::PollEvents(bool is_dedicated_poller,
       if (!is_dedicated_poller) return;  // quit processing queue
       break;
     case gpu::Event::Status::kComplete:
-      // Make a copy of the InUse record so we can free it after releasing
-      // the lock
-      to_free->push_back(iu);
+      delete iu.mem;
+      if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
+      // The function must be called in another thread, outside of
+      // the mutex held here.
+      if (iu.func != nullptr) threadpool_.Schedule(iu.func);
       free_events_.push_back(iu.event);
       // Mark this InUse record as completed.
       iu.event = nullptr;
74 changes: 18 additions & 56 deletions tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -18,10 +18,8 @@ limitations under the License.
 
 #include <deque>
 #include <vector>
-#include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/port.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/public/tensor.h"
@@ -49,15 +47,9 @@ class EventMgr {
   // currently enqueued on *stream have completed.
   inline void ThenDeleteTensors(perftools::gputools::Stream* stream,
                                 std::vector<Tensor>* tensors) {
-    ToFreeVector to_free;
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(mu_);
-      QueueTensors(stream, tensors, &e);
-      PollEvents(false, &to_free);
-    }
-    stream->ThenRecordEvent(e);
-    FreeMemory(to_free);
+    mutex_lock l(mu_);
+    QueueTensors(stream, tensors);
+    PollEvents(false);
   }
 
   struct BufRec {
@@ -69,28 +61,16 @@ class EventMgr {
   // on it as soon as all events currently enqueued on *stream have completed.
   inline void ThenDeleteBuffer(perftools::gputools::Stream* stream,
                                BufRec bufrec) {
-    ToFreeVector to_free;
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(mu_);
-      QueueBuffer(stream, bufrec, &e);
-      PollEvents(false, &to_free);
-    }
-    stream->ThenRecordEvent(e);
-    FreeMemory(to_free);
+    mutex_lock l(mu_);
+    QueueBuffer(stream, bufrec);
+    PollEvents(false);
   }
 
   inline void ThenExecute(perftools::gputools::Stream* stream,
                           std::function<void()> func) {
-    ToFreeVector to_free;
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(mu_);
-      QueueFunc(stream, func, &e);
-      PollEvents(false, &to_free);
-    }
-    stream->ThenRecordEvent(e);
-    FreeMemory(to_free);
+    mutex_lock l(mu_);
+    QueueFunc(stream, func);
+    PollEvents(false);
   }
 
  private:
@@ -105,50 +85,32 @@ class EventMgr {
     std::function<void()> func;
   };
 
-  typedef gtl::InlinedVector<InUse, 4> ToFreeVector;
-
-  void FreeMemory(const ToFreeVector& to_free) {
-    for (const auto& iu : to_free) {
-      delete iu.mem;
-      if (iu.bufrec.buf) iu.bufrec.alloc->DeallocateRaw(iu.bufrec.buf);
-      // The function must be called in another thread.
-      if (iu.func != nullptr) threadpool_.Schedule(iu.func);
-    }
-  }
-
   // Stream-enqueue an unused Event and save with it a collection of
   // Tensors and/or a BufRec to be deleted only after the Event
   // records.
-  void QueueInUse(perftools::gputools::Stream* stream, InUse in_use,
-                  ::perftools::gputools::Event** e)
+  void QueueInUse(perftools::gputools::Stream* stream, InUse in_use)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   void QueueTensors(perftools::gputools::Stream* stream,
-                    std::vector<Tensor>* tensors,
-                    ::perftools::gputools::Event** e)
+                    std::vector<Tensor>* tensors)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr}, e);
+    QueueInUse(stream, {nullptr, tensors, BufRec(), nullptr});
   }
 
-  void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec,
-                   ::perftools::gputools::Event** e)
+  void QueueBuffer(perftools::gputools::Stream* stream, BufRec bufrec)
      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr}, e);
+    QueueInUse(stream, {nullptr, nullptr, bufrec, nullptr});
   }
 
   void QueueFunc(perftools::gputools::Stream* stream,
-                 std::function<void()> func, ::perftools::gputools::Event** e)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    QueueInUse(stream, {nullptr, nullptr, BufRec(), func}, e);
+                 std::function<void()> func) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    QueueInUse(stream, {nullptr, nullptr, BufRec(), func});
  }
 
   // This function should be called at roughly the same tempo as
   // QueueTensors() to check whether pending events have recorded,
-  // and then retire them. It appends InUse elements that need cleanup
-  // to "*to_free". The caller should call FreeMemory(to_free)
-  // when this returns.
-  void PollEvents(bool is_dedicated_poller, ToFreeVector* to_free)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  // and then retire them.
+  void PollEvents(bool is_dedicated_poller) EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // An internal polling loop that runs at a low frequency to clear
   // straggler Events.
16 changes: 4 additions & 12 deletions tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -42,21 +42,13 @@ class TEST_EventMgrHelper {
 
   void QueueTensors(perftools::gputools::Stream* stream,
                     std::vector<Tensor>* tensors) {
-    ::perftools::gputools::Event* e;
-    {
-      mutex_lock l(em_->mu_);
-      em_->QueueTensors(stream, tensors, &e);
-    }
-    stream->ThenRecordEvent(e);
+    mutex_lock l(em_->mu_);
+    em_->QueueTensors(stream, tensors);
   }
 
   void PollEvents(bool is_dedicated_poller) {
-    EventMgr::ToFreeVector to_free;
-    {
-      mutex_lock l(em_->mu_);
-      em_->PollEvents(is_dedicated_poller, &to_free);
-    }
-    em_->FreeMemory(to_free);
+    mutex_lock l(em_->mu_);
+    em_->PollEvents(is_dedicated_poller);
   }
 
  private:
5 changes: 4 additions & 1 deletion tensorflow/core/framework/node_def_util.cc
@@ -79,7 +79,10 @@ Status AttrSlice::Find(const string& attr_name,
     return Status::OK();
   }
   Status s = errors::NotFound("No attr named '", attr_name, "' in NodeDef:");
-  if (ndef_) {
+  // Skip AttachDef for internal attrs since it is a little bit
+  // expensive and it is common for them to correctly not be included
+  // in a NodeDef.
+  if (!StringPiece(attr_name).starts_with("_") && ndef_) {
     s = AttachDef(s, *ndef_);
   }
   return s;
7 changes: 4 additions & 3 deletions tensorflow/core/kernels/cholesky_op.cc
@@ -46,7 +46,7 @@ class CholeskyOp
     const int64 rows = input_matrix_shape.dim_size(0);
     if (rows > (1LL << 20)) {
       // A big number to cap the cost in case overflow.
-      return kint32max;
+      return kint64max;
     } else {
       return rows * rows * rows;
     }
@@ -69,8 +69,9 @@ class CholeskyOp
     // Perform the actual LL^T Cholesky decomposition. This will only use
     // the lower triangular part of data_in by default. The upper triangular
     // part of the matrix will not be read.
-    Eigen::LLT<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic,
-                             Eigen::RowMajor>> llt_decomposition(input);
+    Eigen::LLT<
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+        llt_decomposition(input);
 
     // Output the lower triangular in a dense form.
     *output = llt_decomposition.matrixL();
2 changes: 1 addition & 1 deletion tensorflow/core/kernels/determinant_op.cc
@@ -44,7 +44,7 @@ class DeterminantOp
     const int64 rows = input_matrix_shape.dim_size(0);
     if (rows > (1LL << 20)) {
       // A big number to cap the cost in case overflow.
-      return kint32max;
+      return kint64max;
     } else {
       return rows * rows * rows;
     }
2 changes: 1 addition & 1 deletion tensorflow/core/kernels/matrix_inverse_op.cc
@@ -45,7 +45,7 @@ class MatrixInverseOp
     const int64 rows = input_matrix_shape.dim_size(0);
     if (rows > (1LL << 20)) {
       // A big number to cap the cost in case overflow.
-      return kint32max;
+      return kint64max;
     } else {
      return rows * rows * rows;
     }
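A quick hedged sanity check on the kint32max -> kint64max change repeated in cholesky_op, determinant_op, and matrix_inverse_op above: the cost function returns an int64, and the cap only applies when rows exceeds 2^20, where the cubic cost is already far beyond int32 range, so kint32max was an inconsistently small sentinel:

rows = 1 << 20
print(rows ** 3)    # 2**60, about 1.15e18: fits in int64, far beyond int32
print(2 ** 31 - 1)  # kint32max, about 2.1e9: too small for an int64 cost cap
print(2 ** 63 - 1)  # kint64max: consistent with the int64 return type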
5 changes: 4 additions & 1 deletion tensorflow/core/kernels/reduction_ops_sum.cc
@@ -44,7 +44,10 @@ REGISTER_GPU_KERNELS(float);
 #undef REGISTER_GPU_KERNELS
 
 REGISTER_KERNEL_BUILDER(
-    Name("Sum").Device(DEVICE_GPU).TypeConstraint<complex64>("T"),
+    Name("Sum")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<complex64>("T")
+        .HostMemory("reduction_indices"),
     ReductionOp<GPUDevice, complex64, Eigen::internal::SumReducer<complex64>>);
 
 #endif
