Branch 144675800 #6895

Merged Jan 17, 2017 (52 commits)

Commits
c915f33
Update generated Python Op docs.
tensorflower-gardener Jan 13, 2017
202d00b
Take VariableV2 into account.
tensorflower-gardener Jan 13, 2017
04b3070
[XLA] Add a flag do_prefix to hlo_graph_dumper::DumpText()
tensorflower-gardener Jan 13, 2017
1e1e598
Change comment to use C++, not Python, syntax.
tensorflower-gardener Jan 13, 2017
19d41f0
Docs: Tweaks to the versioning guarantees before the 1.0 release.
asimshankar Jan 13, 2017
c94aece
Automated rollback of change 144270020
tensorflower-gardener Jan 13, 2017
b6671e4
Fix reference to now-deleted function field.
tensorflower-gardener Jan 13, 2017
16cc887
Make weight broadcasting more strict (in particular, catch the case w…
tensorflower-gardener Jan 13, 2017
29122ef
Stopgap removal of "features" input alternative, which fails with Spa…
davidsoergel Jan 13, 2017
9f97bba
[TF:XLA] Add support for specifying a device instance for randomized …
hawkinsp Jan 13, 2017
d414c79
Add control_dependencies parameter to ImportGraphDefOptions.
skye Jan 13, 2017
62d90be
Implement softplus_inverse op.
jvdillon Jan 13, 2017
f1a6dc1
Update generated Python Op docs.
tensorflower-gardener Jan 13, 2017
7c96ead
tfdbg: add Session.sess_str check to Session wrapper base class
caisq Jan 13, 2017
8f89336
Fix issues with shape inference during graph import.
petewarden Jan 13, 2017
1c51201
Fix SavedModel export when predictions is a single tensor and output_…
davidsoergel Jan 13, 2017
c4e3d4a
Enable customization of architecture dependent compiler optimizations…
martinwicke Jan 13, 2017
fb067ef
C API: Package rules for a C library and header files.
asimshankar Jan 13, 2017
a0ddcca
Implement Vector Student's T-distribution. This distribution is an Af…
jvdillon Jan 13, 2017
c847263
Enable record filtering while still allowing for downstream queuing t…
tensorflower-gardener Jan 13, 2017
8803dfa
Refactor KMeansClustering estimator to inherit from Estimator.
tensorflower-gardener Jan 13, 2017
705cc93
Do parallel_stack as a graph rewrite instead of python code.
tensorflower-gardener Jan 13, 2017
f6d87b8
Update ops-related pbtxt files.
tensorflower-gardener Jan 13, 2017
0d1008e
Update generated Python Op docs.
tensorflower-gardener Jan 13, 2017
0b7e66c
LinearOperatorScaledIdentity added to contrib/linalg/
langmore Jan 13, 2017
152f9fc
Fix some bugs in tf.Learn SavedModel APIs.
tensorflower-gardener Jan 13, 2017
3699dfe
Provide multiple implementations of RPC responses on the fetch path.
mrry Jan 13, 2017
a22f020
Update generated Python Op docs.
tensorflower-gardener Jan 13, 2017
b24a0b2
Makes the ops used by ParallelConcat internal.
tensorflower-gardener Jan 13, 2017
3b596e0
XLA: Drop dependency on unused internal C++ op lib.
tensorflower-gardener Jan 13, 2017
9438ace
Move Input/Output/etc. to tensorflow namespace from
tensorflower-gardener Jan 13, 2017
e4a235a
Internal change.
tensorflower-gardener Jan 14, 2017
edb10c6
internal only change
blakehechtman Jan 14, 2017
d06e669
Bug fix in SavedModel shim's named signature up-conversion for classi…
kirilg Jan 14, 2017
7ab67d8
Automated rollback of change 142694447
tensorflower-gardener Jan 14, 2017
8eb161e
Add support for passes that run post-partitioning to OptimizationRegi…
hawkinsp Jan 14, 2017
5ab7874
Fix breakage by recent changes in tf.summary.
tensorflower-gardener Jan 14, 2017
e720678
Default naming of scopes should continue, not reset, after jumps to r…
Jan 15, 2017
9654cf9
Add tf.tables_initializer as a replacement for tf.initialize_all_tabl…
tensorflower-gardener Jan 15, 2017
bbd9022
Update generated Python Op docs.
tensorflower-gardener Jan 15, 2017
a704573
Fix a breakage in python3 test
caisq Jan 15, 2017
6987e97
Adding nav entries for Layers tutorial, and making a few small format…
sandersk Jan 16, 2017
acdbd68
Create --config opt for compiling fully optimized binaries.
gunan Jan 16, 2017
44c5580
Updated bucket_ops to support different batch sizes per bucket.
tensorflower-gardener Jan 16, 2017
e1436d1
Update generated Python Op docs.
tensorflower-gardener Jan 16, 2017
1580d3b
tfdbg: Add GrpcDebugWrapperSession
caisq Jan 16, 2017
dbd7a2c
Update generated Python Op docs.
tensorflower-gardener Jan 16, 2017
26201f5
Fixed string formatting and valid task listing.
tensorflower-gardener Jan 16, 2017
cddb86f
Remove tf.cloud from public API until it's ready.
Jan 17, 2017
97c8933
Automated rollback of change 144470928
Jan 17, 2017
0b56eab
Use integer division to compute number of updates.
Jan 17, 2017
19ff874
Merge commit for internal changes
Jan 17, 2017
15 changes: 15 additions & 0 deletions configure
@@ -49,6 +49,15 @@ while true; do
# Retry
done

## Set up architecture-dependent optimization flags.
if [ -z "$CC_OPT_FLAGS" ]; then
default_cc_opt_flags="-march=native"
read -p "Please specify optimization flags to use during compilation [Default is $default_cc_opt_flags]: " CC_OPT_FLAGS
if [ -z "$CC_OPT_FLAGS" ]; then
CC_OPT_FLAGS=$default_cc_opt_flags
fi
fi

if is_windows; then
TF_NEED_GCP=0
TF_NEED_HDFS=0
@@ -148,6 +157,12 @@ fi
# Invoke python_config and set up symlinks to python includes
./util/python/python_config.sh --setup "$PYTHON_BIN_PATH"

# Append CC optimization flags to bazel.rc
echo >> tools/bazel.rc
for opt in $CC_OPT_FLAGS; do
echo "build:opt --cxxopt=$opt --copt=$opt" >> tools/bazel.rc
done

# Run the gen_git_source to create links where bazel can track dependencies for
# git hash propagation
GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
2 changes: 0 additions & 2 deletions tensorflow/cc/framework/ops.cc
@@ -17,7 +17,6 @@ limitations under the License.
#include "tensorflow/core/lib/hash/hash.h"

namespace tensorflow {
namespace ops {

Operation::Operation(Node* n) : inputs_(GetInputs(n)), node_(n) {}

@@ -110,5 +109,4 @@ Input::Initializer::Initializer(
tensor = t;
}

} // namespace ops
} // namespace tensorflow
16 changes: 14 additions & 2 deletions tensorflow/cc/framework/ops.h
@@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/core/lib/strings/strcat.h"

namespace tensorflow {
namespace ops {

class Output;

@@ -193,6 +192,7 @@ class Input {
// * A scalar, or a multi-dimensional tensor specified as a recursive
// initializer list. This enables directly passing constants as
// inputs to op wrappers.
// * A Tensor object.
Input(const Output& o) : output_(o) {} // NOLINT(runtime/explicit)

template <typename T, typename = typename std::enable_if<
@@ -249,7 +249,7 @@ typedef std::vector<Output> OutputList;
class InputList {
public:
// Implicitly convert a list of outputs to a list of inputs. This is useful to
// write code such as tf.Concat(tf.Split(x, 4)).
// write code such as ops::Concat(ops::Split(x, 4)).
InputList(const OutputList& out) { // NOLINT(runtime/explicit)
for (auto const& x : out) {
inputs_.push_back(x);
@@ -284,7 +284,19 @@ class InputList {
std::vector<Input> inputs_;
};

// These symbols used to live in the ops namespace, so we temporarily
// declare some aliases there. TODO(josh11b): Delete this!
namespace ops {

using ::tensorflow::Input;
using ::tensorflow::InputList;
using ::tensorflow::Operation;
using ::tensorflow::Output;
using ::tensorflow::OutputHash;
using ::tensorflow::OutputList;

} // namespace ops

} // namespace tensorflow

#endif // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_
6 changes: 2 additions & 4 deletions tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -602,12 +602,10 @@ Status EncapsulateSubgraphsPass::Run(
std::unique_ptr<Graph>* subgraph, std::vector<int>* input_permutation,
std::vector<int>* output_permutation, NodeDef* node) {
// Optimize the subgraph.
Graph* g = subgraph->release();
OptimizeGraph(flr.get(), &g);
subgraph->reset(g);
OptimizeGraph(flr.get(), subgraph);

std::vector<bool> const_args(input_permutation->size());
TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*g, &const_args));
TF_RETURN_IF_ERROR(BackwardsConstAnalysis(**subgraph, &const_args));

// Compute a permutation of the arguments such that the constant arguments
// are first.
37 changes: 24 additions & 13 deletions tensorflow/compiler/tests/randomized_tests.cc
@@ -28,7 +28,7 @@ limitations under the License.
// Run tests, comparing the Tensorflow CPU operators with their XLA-compiled
// counterparts:
// randomized_tests \
// --tf_xla_test_use_jit=true --tf_xla_test_device=CPU \
// --tf_xla_test_use_jit=true --tf_xla_test_device=CPU:0 \
// --tf_xla_test_repetitions=20

// TODO(phawkins): add tests for:
@@ -50,6 +50,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/op_kernel.h"
@@ -66,6 +67,7 @@ limitations under the License.
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace {
@@ -76,9 +78,8 @@ int32 tf_xla_test_repetitions = 20;
string* tf_xla_test_device_ptr; // initial value set in main()
bool tf_xla_test_use_jit = true;

string DeviceTypeToDeviceName(DeviceType type) {
return strings::StrCat("/job:localhost/replica:0/task:0/device:", type.type(),
":0");
string LocalDeviceToFullDeviceName(const string& device) {
return strings::StrCat("/job:localhost/replica:0/task:0/device:", device);
}

constexpr std::array<DataType, 3> kAllXlaTypes = {
@@ -575,9 +576,14 @@ Status TensorsAreClose(const Tensor& a, const Tensor& b, double atol,

void OpTest::ExpectTfAndXlaOutputsAreClose(const OpTestBuilder& builder,
double atol, double rtol) {
string cpu_device = DeviceTypeToDeviceName(DEVICE_CPU);
DeviceType test_device_type(*tf_xla_test_device_ptr);
string test_device = DeviceTypeToDeviceName(test_device_type);
string cpu_device =
LocalDeviceToFullDeviceName(strings::StrCat(DEVICE_CPU, ":0"));
string test_device = LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr);

DeviceNameUtils::ParsedName parsed_name;
ASSERT_TRUE(
DeviceNameUtils::ParseLocalName(*tf_xla_test_device_ptr, &parsed_name));
DeviceType test_device_type(parsed_name.type);
++num_tests_;

GraphDef graph;
@@ -2058,7 +2064,7 @@ TEST_F(OpTest, ZerosLike) {
} // namespace tensorflow

int main(int argc, char** argv) {
tensorflow::tf_xla_test_device_ptr = new tensorflow::string("GPU");
tensorflow::tf_xla_test_device_ptr = new tensorflow::string("GPU:0");
std::vector<tensorflow::Flag> flag_list = {
tensorflow::Flag(
"tf_xla_random_seed", &tensorflow::tf_xla_random_seed,
@@ -2085,13 +2091,18 @@ int main(int argc, char** argv) {
LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
return 2;
}
// XLA devices register kernels at construction time; create and destroy all
// known devices to make sure the kernels are registered.
// XLA devices register kernels at construction time; create all known devices
// to make sure the kernels are registered.
std::vector<tensorflow::Device*> devices;
TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices(
tensorflow::SessionOptions(), "", &devices));
for (tensorflow::Device* device : devices) {
delete device;
}
tensorflow::DeviceMgr device_mgr(devices);

tensorflow::Device* ignored;
TF_QCHECK_OK(
device_mgr.LookupDevice(*tensorflow::tf_xla_test_device_ptr, &ignored))
<< "Unknown test device (" << *tensorflow::tf_xla_test_device_ptr
<< "). Did you build in the right configuration (e.g., is CUDA enabled)?";

return RUN_ALL_TESTS();
}
1 change: 0 additions & 1 deletion tensorflow/compiler/tf2xla/BUILD
@@ -97,7 +97,6 @@ cc_test(
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:function_ops",
"//tensorflow/cc:ops",
"//tensorflow/cc:sendrecv_ops",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla/client:client_library",
4 changes: 1 addition & 3 deletions tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -186,9 +186,7 @@ Status XlaCompiler::CompileFunctionBody(
// for devices other than CPU.
OptimizerOptions opts;
GraphOptimizer optimizer(opts);
Graph* g = graph.release();
OptimizeGraph(flr, &g);
graph.reset(g);
OptimizeGraph(flr, &graph);

if (VLOG_IS_ON(1)) {
dump_graph::DumpGraphToFile(
1 change: 0 additions & 1 deletion tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/cc/framework/ops.h"
#include "tensorflow/cc/ops/function_ops.h"
#include "tensorflow/cc/ops/sendrecv_ops.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
#include "tensorflow/compiler/xla/client/client_library.h"
26 changes: 7 additions & 19 deletions tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -51,18 +51,9 @@ bool IsLiteralWithValue(const HloInstruction* operand, int value) {

// Returns whether the given transpose produces a result which is bit-wise
// identical to its operand and thus may be replaced with a bitcast.
bool TransposeIsBitcast(
const HloInstruction* transpose,
const AlgebraicSimplifier::ValidBitcastCallback& valid_bitcast_callback) {
bool TransposeIsBitcast(const HloInstruction* transpose) {
CHECK_EQ(HloOpcode::kTranspose, transpose->opcode());
const HloInstruction* operand = transpose->operand(0);

// Can't insert bitcasts if the compiler used a memory layout which isn't
// compatible.
if (!valid_bitcast_callback(operand->shape(), transpose->shape())) {
return false;
}

return ShapeUtil::TransposeIsBitcast(operand->shape(), transpose->shape(),
transpose->dimensions());
}
@@ -80,11 +71,8 @@ bool ReshapeIsBitcast(
const HloInstruction* operand = reshape->operand(0);
// Can't insert bitcasts if the compiler used a memory layout which isn't
// compatible.
if (!valid_bitcast_callback(operand->shape(), reshape->shape())) {
return false;
}

return ShapeUtil::ReshapeIsBitcast(operand->shape(), reshape->shape());
return ShapeUtil::ReshapeIsBitcast(operand->shape(), reshape->shape()) &&
valid_bitcast_callback(operand->shape(), reshape->shape());
}
} // namespace

@@ -199,7 +187,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
// Whether layout is considered during transformation.
bool is_layout_sensitive_;

// Callback used to determine if a bitcast is valid.
// Callback used to determine if a bitcast is possible.
AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_;
};

@@ -287,7 +275,8 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
HloInstruction* rhs) {
// A/1 => A
VLOG(10) << "trying transform [A/1 => A]: " << divide->ToString();
if (IsLiteralWithValue(rhs, 1) && ReplaceInstructionIfSameShape(divide, lhs)) {
if (IsLiteralWithValue(rhs, 1) &&
ReplaceInstructionIfSameShape(divide, lhs)) {
return Status::OK();
}

@@ -717,8 +706,7 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
return Status::OK();
}

if (is_layout_sensitive_ &&
TransposeIsBitcast(transpose, valid_bitcast_callback_)) {
if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) {
ReplaceWithBitcast(transpose);
return Status::OK();
}
8 changes: 5 additions & 3 deletions tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -26,9 +26,11 @@ namespace xla {
// A pass which performs AlgebraicSimplications.
class AlgebraicSimplifier : public HloPass {
public:
// Given two shapes, determines if it is valid to bitcast between them.
// Precondition: the two shapes have layouts and have the same number of
// elements.
// Given two shapes, determines if it is valid to bitcast between them after
// considering platform dependent effects on layout like alignment
// restrictions.
// Precondition: the two shapes have layouts, the same number of
// elements and ShapeUtil::ReshapeIsBitcast returns true.
using ValidBitcastCallback = std::function<bool(const Shape&, const Shape&)>;

// If is_layout_sensitive is true, then the simplifier preserves layout during
6 changes: 4 additions & 2 deletions tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -495,11 +495,13 @@ string DumpGraph(const HloComputation& computation, const string& label,
}

void DumpText(const HloModule& module, const string& label,
const string& directory_path) {
const string& directory_path, bool do_prefix) {
Env* env = Env::Default();
TF_CHECK_OK(env->RecursivelyCreateDir(directory_path));
string prefix = StrCat(env->NowMicros());
string path = JoinPath(directory_path, StrCat(prefix, "-", label, ".txt"));
string filename =
do_prefix ? StrCat(prefix, "-", label, ".txt") : StrCat(label, ".txt");
string path = JoinPath(directory_path, filename);
TF_CHECK_OK(WriteStringToFile(env, path, module.ToString()));
}

6 changes: 5 additions & 1 deletion tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -33,8 +33,12 @@ string DumpGraph(const HloComputation& computation, const string& label,

// Dumps the HloModule::ToString() as a file into the provided directory path
// suffixed with the provided label.
//
// If do_prefix is true, a timestamp will be prepended onto the label to
// construct a filename in the directory path; otherwise, the label is used
// as the filename directly.
void DumpText(const HloModule& module, const string& label,
const string& directory_path);
const string& directory_path, bool do_prefix = true);

// Abstract interface for classes that render DOT graphs.
class GraphRendererInterface {
11 changes: 10 additions & 1 deletion tensorflow/contrib/cmake/CMakeLists.txt
@@ -27,6 +27,7 @@ option(tensorflow_BUILD_ALL_KERNELS "Build all OpKernels" ON)
option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib/..." ON)
option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)

if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
@@ -67,7 +68,15 @@ if(WIN32)
endif()

if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-fno-exceptions -std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -std=c++11")
endif()

if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH)
include(CheckCXXCompilerFlag)
CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_OPT_ARCH_NATIVE_SUPPORTED)
if (COMPILER_OPT_ARCH_NATIVE_SUPPORTED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
endif()
endif()

# External dependencies
17 changes: 17 additions & 0 deletions tensorflow/contrib/distributions/BUILD
@@ -448,6 +448,23 @@ cuda_py_tests(
tags = ["nomsan"], # disable to avoid false positives from scipy.
)

cuda_py_tests(
name = "vector_student_t_test",
size = "medium",
srcs = ["python/kernel_tests/vector_student_t_test.py"],
additional_deps = [
":distributions_py",
":distributions_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
"//third_party/py/numpy",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework",
"//tensorflow/python:framework_for_generated_wrappers",
"//tensorflow/python:framework_test_lib",
"//tensorflow/python:platform_test",
],
)

cuda_py_tests(
name = "uniform_test",
size = "small",
6 changes: 6 additions & 0 deletions tensorflow/contrib/distributions/__init__.py
@@ -93,6 +93,11 @@

@@kl
@@RegisterKL

## Utilities

@@softplus_inverse

"""
from __future__ import absolute_import
from __future__ import division
@@ -110,6 +115,7 @@
from tensorflow.contrib.distributions.python.ops.dirichlet_multinomial import *
from tensorflow.contrib.distributions.python.ops.distribution import *
from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform
from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse
from tensorflow.contrib.distributions.python.ops.exponential import *
from tensorflow.contrib.distributions.python.ops.gamma import *
from tensorflow.contrib.distributions.python.ops.inverse_gamma import *