
Branch 146699987 #7307

Merged
merged 45 commits on Feb 6, 2017
Commits
1809c7a
GridRNN creates multiple internal cell instances - one for each dim.
ebrevdo Feb 3, 2017
a66671c
Initial refactor for RichTextLines
tensorflower-gardener Feb 3, 2017
21cdbea
Added a Tensorboard plugin that will serve data from a TensorFlow deb…
tensorflower-gardener Feb 3, 2017
123139d
Propagate devices when instantiating function bodies.
hawkinsp Feb 3, 2017
53579a6
Update doc on tf.local_variables() to refer to
tensorflower-gardener Feb 3, 2017
c74da56
Update generated Python Op docs.
tensorflower-gardener Feb 3, 2017
1ce134d
Redirecting slim.create_train_op to tf.contrib.training.create_train_op.
Feb 3, 2017
8567b76
Ensure that the function definition has been built when evaluating th…
hawkinsp Feb 3, 2017
56f266a
Drop TopKV2 C++ API alias, use TopK instead.
tensorflower-gardener Feb 3, 2017
6abf7cf
Add new scheduled sampler to new seq2seq api.
ebrevdo Feb 3, 2017
987e3d9
Metagraph round-tripping for resource variables.
alextp Feb 3, 2017
64d3922
[XLA:CPU] Switch dynamic_ops_test to use Parameter (instead of Consta…
tensorflower-gardener Feb 3, 2017
8aa897e
Add an op to execute a subgraph fused and transferred by GraphTransfe…
tensorflower-gardener Feb 3, 2017
c43c858
Add tensorflow/core/kernels/depthwise_conv_op.cc to tf_op_files to br…
andrewharp Feb 4, 2017
d6df72d
Go: Enable custom headers in files generated by genop.
asimshankar Feb 4, 2017
d08b6eb
Add TODOs on several REGISTER_OP calls that should get deprecated.
tensorflower-gardener Feb 4, 2017
02f276c
TensorBoard: Migrate vz_{sorting,line_chart} to webfiles
jart Feb 4, 2017
8cfb9ac
This implemented Jeff's idea to avoid most of the refcount related at…
yuanbyu Feb 4, 2017
24cd70f
Replace calculations in scheduled sampling code with gather/scatter_nd.
ebrevdo Feb 4, 2017
779a666
Do not log test results using the text format, the JSON format is suf…
tensorflower-gardener Feb 4, 2017
644b2e5
Unit tests for logistic, OneHotCategorical, RelaxedBernoulli, Relaxed…
tensorflower-gardener Feb 4, 2017
cb02b74
Update generated Python Op docs.
tensorflower-gardener Feb 4, 2017
a544546
A few changes to get tf_cc_logged_benchmark's running under tensorflo…
tensorflower-gardener Feb 4, 2017
288f88e
Disable broken tensorboard test in windows cmake build.
gunan Feb 4, 2017
d8bc30c
Small change to new doc generator.
tensorflower-gardener Feb 4, 2017
05fc817
Update SavedModel half_plus_two data and generation code.
sukritiramesh Feb 4, 2017
79c4c01
Go: Check in generated code for ops.
asimshankar Feb 4, 2017
59e6f82
tfdbg: stepper: use dumped intermediate tensors during cont() calls
caisq Feb 4, 2017
653e35a
Set up directory structure to open source distributed tensorflow benc…
saeta Feb 5, 2017
8246291
Allow lookup_table_ops to have bools as values.
tensorflower-gardener Feb 5, 2017
c8a5ad4
Make legacy_seq2seq seq2seq models perform a **deep** copy of the inc…
ebrevdo Feb 5, 2017
6bb3d98
Fix typo in documentation of KMeansClustering.
tensorflower-gardener Feb 6, 2017
6bf1999
Update generated Python Op docs.
tensorflower-gardener Feb 6, 2017
153853c
Explicitly colocate the read with resource handle.
alextp Feb 6, 2017
1b6fe5a
Comment out some of the more expensive microbenchmark cases for tf.mu…
Feb 6, 2017
96cbc38
tfdbg stepper: display last updated variables
caisq Feb 6, 2017
de94a13
Streamlined representation of PendingCounts data structure.
jeffreyadean Feb 6, 2017
1991af6
Fixes a couple of mis-formatted error messages.
adarob Feb 6, 2017
4d6aa1b
[XLA] Add LiteralUtil::IsAllFloat.
Feb 6, 2017
f798ce6
[TF:XLA] Remove duplicate main() from randomized_tests.
hawkinsp Feb 6, 2017
06429f5
[XLA] Fix parameter type of IsLiteralWithValue in algebraic_simplifie…
Feb 6, 2017
4596aaf
[XLA] Add CodegenTestBase::set_fast_math_disabled.
Feb 6, 2017
fb5aad0
Fixes a bug where we were trying to access weight tensor for inferenc…
Feb 6, 2017
e770204
[XLA] On GPUs, lower pow(A, .5) as sqrt(A) and pow(A, -.5) as 1/sqrt(A).
Feb 6, 2017
2541596
Merge commit for internal changes
caisq Feb 6, 2017
1 change: 0 additions & 1 deletion tensorflow/compiler/tests/BUILD
@@ -284,7 +284,6 @@ cc_library(
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core/kernels:ops_util",
],
11 changes: 11 additions & 0 deletions tensorflow/compiler/xla/literal_util.cc
@@ -912,6 +912,17 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
}
}

/* static */ bool LiteralUtil::IsAllFloat(const Literal& literal, float value) {
switch (literal.shape().element_type()) {
case F32:
return AllElementsEqualValue<float>(literal, value);
case F64:
return AllElementsEqualValue<double>(literal, value);
default:
return false;
}
}

/* static */ bool LiteralUtil::IsZero(
const Literal& literal, tensorflow::gtl::ArraySlice<int64> indices) {
switch (literal.shape().element_type()) {
11 changes: 11 additions & 0 deletions tensorflow/compiler/xla/literal_util.h
@@ -355,6 +355,17 @@ class LiteralUtil {
// true.
static bool IsAll(const Literal& literal, int8 value);

// Like IsAll(const Literal&, int8), except we check whether the literal is
// equal to a particular floating-point number.
//
// If the literal is not a floating-point value, this always returns false.
//
// This casts value to the type of literal, then compares using ==. The usual
// admonishments about floating-point equality checks apply. We expect you to
// use this to check for values that can be expressed precisely as a float,
// e.g. -0.5.
static bool IsAllFloat(const Literal& literal, float value);

// Returns whether the literal is zero at the specified index. The literal
// must be an array.
static bool IsZero(const Literal& literal,
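To make the floating-point-equality caveat in the comment above concrete: IsAllFloat casts value to the literal's element type and compares with ==, so only values that are exactly representable in binary (such as -0.5) behave predictably. A standalone sketch of the pitfall, independent of the XLA API and added here purely as an illustration, not part of this PR:

// Mirrors the cast-then-compare behavior described in the IsAllFloat comment.
#include <cstdio>

int main() {
  // 0.5 is exactly representable, so the float/double comparison agrees.
  std::printf("0.5 matches: %d\n", static_cast<double>(0.5f) == 0.5);  // prints 1

  // 0.1 is not, so comparing a float-valued 0.1 against a double literal fails.
  std::printf("0.1 matches: %d\n", static_cast<double>(0.1f) == 0.1);  // prints 0
  return 0;
}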
27 changes: 27 additions & 0 deletions tensorflow/compiler/xla/literal_util_test.cc
@@ -382,6 +382,33 @@ TEST_F(LiteralUtilTest, IsAll) {
-1));
}

TEST_F(LiteralUtilTest, IsAllFloat) {
// IsAllFloat always returns false when the literal is not floating-point.
EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<bool>(false), 0));
EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<int8>(0), 0));
EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<uint8>(0), 0));
EXPECT_FALSE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<int>(0), 0));

EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(0), 0));
EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(.5), .5));
EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(-.5), -.5));
EXPECT_FALSE(
LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<float>(-.5), -.49));
EXPECT_FALSE(LiteralUtil::IsAllFloat(
*LiteralUtil::CreateR2<float>({{0, 0, 0}, {0, .1, 0}}), 0));
EXPECT_TRUE(LiteralUtil::IsAllFloat(
*LiteralUtil::CreateR2<float>({{.5, .5, .5}, {.5, .5, .5}}), .5));

EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(0), 0));
EXPECT_TRUE(LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(.5), .5));
EXPECT_TRUE(
LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(-.5), -.5));
EXPECT_FALSE(
LiteralUtil::IsAllFloat(*LiteralUtil::CreateR0<double>(-.5), -.49));
EXPECT_FALSE(LiteralUtil::IsAllFloat(
*LiteralUtil::CreateR2<double>({{0, 0, 0}, {0, .1, 0}}), 0));
}

TEST_F(LiteralUtilTest, IsZero) {
auto scalar_zero = LiteralUtil::CreateR0<float>(0.0f);
auto scalar_one = LiteralUtil::CreateR0<float>(1.0f);
2 changes: 1 addition & 1 deletion tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -46,7 +46,7 @@ namespace xla {
namespace {

// Returns whether operand is a literal with the given value.
bool IsLiteralWithValue(const HloInstruction* operand, int value) {
bool IsLiteralWithValue(const HloInstruction* operand, int8 value) {
return operand->opcode() == HloOpcode::kConstant &&
LiteralUtil::IsAll(operand->literal(), value);
}
2 changes: 1 addition & 1 deletion tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -111,7 +111,7 @@ std::set<BufferAllocation> BufferAssignment::GetAllocations(
const BufferAllocation& BufferAssignment::GetAllocation(
BufferAllocation::Index index) const {
CHECK(index >= 0 && index < allocations_.size())
<< "Allocation index " << index << "is out of range.";
<< "Allocation index " << index << " is out of range.";
return allocations_[index];
}

1 change: 1 addition & 0 deletions tensorflow/compiler/xla/service/gpu/BUILD
@@ -176,6 +176,7 @@ cc_library(
srcs = ["elemental_ir_emitter.cc"],
hdrs = ["elemental_ir_emitter.h"],
deps = [
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
136 changes: 108 additions & 28 deletions tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -29,6 +29,7 @@ limitations under the License.
#include "external/llvm/include/llvm/IR/Intrinsics.h"
#include "external/llvm/include/llvm/IR/Module.h"
#include "external/llvm/include/llvm/IR/Type.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
@@ -49,43 +50,80 @@ namespace gpu {

using llvm_ir::IrArray;
using llvm_ir::SetToFirstInsertPoint;
using tensorflow::strings::StrAppend;

// Returns whether operand is a floating-point literal with the given value.
bool IsFPLiteralWithValue(const HloInstruction* operand, float value) {
return operand->opcode() == HloOpcode::kConstant &&
LiteralUtil::IsAllFloat(operand->literal(), value);
}

GpuElementalIrEmitter::GpuElementalIrEmitter(
const HloModuleConfig& hlo_module_config, llvm::Module* module,
llvm::IRBuilder<>* ir_builder, NestedComputer compute_nested)
: ElementalIrEmitter(hlo_module_config, module, ir_builder),
hlo_module_config_(hlo_module_config),
compute_nested_(std::move(compute_nested)) {}

StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
const string& callee_name,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
PrimitiveType output_type) const {
// Binary math functions tranform are of type [T] -> T.
for (PrimitiveType input_type : input_types) {
if (output_type != input_type) {
return Unimplemented("Input type ≠ output type: %s ≠ %s",
PrimitiveType_Name(input_type).c_str(),
// The libdevice math functions differentiate between "double" and "float" by
// appending an 'f' to the function's name.
string munged_callee = callee_name;
switch (output_type) {
case F32:
StrAppend(&munged_callee, "f");
break;
case F64:
break;
default:
return Unimplemented("Bad type for libdevice math call: %s",
PrimitiveType_Name(output_type).c_str());
}
}
return EmitMathCall(munged_callee, operands, input_types, output_type);
}

// The libdevice math functions differentiate between "double" and "float" by
// appending an 'f' to the function's name.
string function_name = callee_name;
StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall(
const string& callee_name,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
PrimitiveType output_type) const {
// llvm intrinsics differentiate between float/double functions via the ".f32"
// and ".f64" suffixes.
string munged_callee = callee_name;
switch (output_type) {
case F32:
function_name += 'f';
StrAppend(&munged_callee, ".f32");
break;
case F64:
StrAppend(&munged_callee, ".f64");
break;
default:
return Unimplemented("Bad type for math call: %s",
return Unimplemented("Bad type for llvm intrinsic math call: %s",
PrimitiveType_Name(output_type).c_str());
}
return EmitMathCall(munged_callee, operands, input_types, output_type);
}

StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
const string& callee_name,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
PrimitiveType output_type) const {
// Binary math functions tranform are of type [T] -> T.
for (PrimitiveType input_type : input_types) {
if (output_type != input_type) {
return Unimplemented("Input type ≠ output type: %s ≠ %s",
PrimitiveType_Name(input_type).c_str(),
PrimitiveType_Name(output_type).c_str());
}
}

return EmitDeviceFunctionCall(
function_name, operands, input_types, output_type,
callee_name, operands, input_types, output_type,
{llvm::Attribute::ReadNone, llvm::Attribute::NoUnwind});
}
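
As an aside, not part of the diff: the name munging performed by EmitLibdeviceMathCall and EmitLlvmIntrinsicMathCall above boils down to two suffixing rules. A minimal standalone sketch, with the helper and enum names invented for illustration:

#include <string>

enum FpType { F32, F64 };

// libdevice names get an 'f' appended for single precision:
//   "__nv_exp" -> "__nv_expf" for F32; unchanged for F64.
std::string MungeLibdeviceName(const std::string& name, FpType type) {
  return type == F32 ? name + "f" : name;
}

// LLVM intrinsics are suffixed with the element type instead:
//   "llvm.sqrt" -> "llvm.sqrt.f32" or "llvm.sqrt.f64".
std::string MungeIntrinsicName(const std::string& name, FpType type) {
  return name + (type == F32 ? ".f32" : ".f64");
}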

@@ -97,21 +135,63 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
PrimitiveType output_type = op->shape().element_type();
switch (op->opcode()) {
case HloOpcode::kRemainder: {
return EmitMathCall("__nv_fmod", {lhs_value, rhs_value},
{lhs_input_type, rhs_input_type}, output_type);
return EmitLibdeviceMathCall("__nv_fmod", {lhs_value, rhs_value},
{lhs_input_type, rhs_input_type},
output_type);
}
case HloOpcode::kPower: {
return EmitMathCall("__nv_pow", {lhs_value, rhs_value},
{lhs_input_type, rhs_input_type}, output_type);
return EmitPowerOp(op, lhs_value, rhs_value);
}
default:
return ElementalIrEmitter::EmitFloatBinaryOp(op, lhs_value, rhs_value);
}
}

StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPowerOp(
const HloInstruction* op, llvm::Value* lhs_value,
llvm::Value* rhs_value) const {
CHECK_EQ(op->opcode(), HloOpcode::kPower);
PrimitiveType lhs_input_type = op->operand(0)->shape().element_type();
PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
PrimitiveType output_type = op->shape().element_type();
llvm::Type* llvm_ty = lhs_value->getType();

auto make_sqrt = [&, this]() -> StatusOr<llvm::Value*> {
// NVPTX has four relevant square root instructions:
// sqrt.approx{.ftz}.f32
// sqrt.rn{.ftz}.f32
// sqrt.rn.f64
// rsqrt.approx.f64
// We rely on LLVM's NVPTX backend to pick the right one based on our
// fast-math options. (If fast-math is enabled, llvm may compute the 64-bit
// sqrt from the rsqrt approximation.)
return EmitLlvmIntrinsicMathCall("llvm.sqrt", {lhs_value}, {lhs_input_type},
output_type);
};

const HloInstruction* rhs = op->operand(1);
if (IsFPLiteralWithValue(rhs, .5)) {
VLOG(10) << "emitting pow(A, .5) as sqrt(A): " << op->ToString();
return make_sqrt();
}

if (!hlo_module_config_.fast_math_disabled() &&
IsFPLiteralWithValue(rhs, -.5)) {
VLOG(10) << "emitting pow(A, -.5) as 1/sqrt(A): " << op->ToString();
// LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX
// rsqrt.approx instruction.
TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt());
return ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt);
}

VLOG(10) << "emitting pow as regular call to pow(): " << op->ToString();
return EmitLibdeviceMathCall("__nv_pow", {lhs_value, rhs_value},
{lhs_input_type, rhs_input_type}, output_type);
}

StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitErfcInv(
PrimitiveType prim_type, llvm::Value* value) const {
return EmitMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type);
return EmitLibdeviceMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type);
}

StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
@@ -120,20 +200,20 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
PrimitiveType output_type = op->shape().element_type();
switch (op->opcode()) {
case HloOpcode::kExp:
return EmitMathCall("__nv_exp", {operand_value}, {input_type},
output_type);
return EmitLibdeviceMathCall("__nv_exp", {operand_value}, {input_type},
output_type);
case HloOpcode::kFloor:
return EmitMathCall("__nv_floor", {operand_value}, {input_type},
output_type);
return EmitLibdeviceMathCall("__nv_floor", {operand_value}, {input_type},
output_type);
case HloOpcode::kCeil:
return EmitMathCall("__nv_ceil", {operand_value}, {input_type},
output_type);
return EmitLibdeviceMathCall("__nv_ceil", {operand_value}, {input_type},
output_type);
case HloOpcode::kLog:
return EmitMathCall("__nv_log", {operand_value}, {input_type},
output_type);
return EmitLibdeviceMathCall("__nv_log", {operand_value}, {input_type},
output_type);
case HloOpcode::kTanh:
return EmitMathCall("__nv_tanh", {operand_value}, {input_type},
output_type);
return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
output_type);
default:
return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value);
}
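The decision logic in EmitPowerOp above amounts to a three-way choice on the exponent. A compact restatement, for illustration only (it returns the name of the chosen lowering rather than emitting IR, and is not code from this PR):

#include <string>

std::string ChoosePowLowering(float exponent, bool fast_math_disabled) {
  if (exponent == 0.5f) {
    // pow(A, .5) -> sqrt(A); emitted unconditionally.
    return "llvm.sqrt";
  }
  if (!fast_math_disabled && exponent == -0.5f) {
    // pow(A, -.5) -> 1 / sqrt(A); only when fast math is allowed, since the
    // NVPTX backend may lower the division into rsqrt.approx.
    return "1 / llvm.sqrt";
  }
  // General case: fall back to the libdevice pow call.
  return "__nv_pow";
}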
31 changes: 27 additions & 4 deletions tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -64,7 +64,12 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
llvm::Value* EmitThreadId() const override;

private:
// Emit IR to call a device function named "callee_name" on the given
// Emits IR for op, which must have opcode kPower.
StatusOr<llvm::Value*> EmitPowerOp(const HloInstruction* op,
llvm::Value* lhs_value,
llvm::Value* rhs_value) const;

// Emits IR to call a device function named "callee_name" on the given
// operand. Returns the IR value that represents the return value.
llvm::Value* EmitDeviceFunctionCall(
const string& callee_name,
@@ -73,15 +78,33 @@
PrimitiveType output_type,
tensorflow::gtl::ArraySlice<llvm::Attribute::AttrKind> attributes) const;

// Emit IR to call a device function of type [T] -> T. It adjusts the
// callee_name to account for float/double types.
// Returns the IR value that represents the return value.
// Emits IR to call an LLVM intrinsic of type [T] -> T. Adjusts
// callee_name according to T. Returns the IR value that represents the
// return value of the function.
StatusOr<llvm::Value*> EmitLlvmIntrinsicMathCall(
const string& callee_name,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
PrimitiveType output_type) const;

// Emits IR to call a libdevice function of type [T] -> T. Adjusts
// callee_name according to T. Returns the IR value that represents the
// return value of the function.
StatusOr<llvm::Value*> EmitLibdeviceMathCall(
const string& callee_name,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
PrimitiveType output_type) const;

// Emits IR to call a function of type [T] -> T. Does not munge callee_name.
// Returns the IR value that represents the return value of the function.
StatusOr<llvm::Value*> EmitMathCall(
const string& callee_name,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
PrimitiveType output_type) const;

const HloModuleConfig& hlo_module_config_;
NestedComputer compute_nested_;
};

1 change: 1 addition & 0 deletions tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -45,6 +45,7 @@ std::unique_ptr<Executable> CodegenTestBase::CompileToExecutable(
std::unique_ptr<HloModule> hlo_module) {
auto module_config = MakeUnique<HloModuleConfig>(
hlo_module->entry_computation()->ComputeProgramShape());
module_config->set_fast_math_disabled(fast_math_disabled_);
return backend_->compiler()
->Compile(std::move(hlo_module), std::move(module_config),
test_hlo_dumper_, backend_->default_stream_executor())
5 changes: 5 additions & 0 deletions tensorflow/compiler/xla/tests/codegen_test_base.h
@@ -41,6 +41,9 @@ class CodegenTestBase : public HloTestBase {
void CompileAndVerifyIr(std::unique_ptr<HloModule> hlo_module,
const string& pattern);

// Sets the fast-math-disabled flag on the config we use when compiling.
void set_fast_math_disabled(bool disabled) { fast_math_disabled_ = disabled; }

protected:
// Compiles hlo_module to an executable, CHECK-failing if this fails.
std::unique_ptr<Executable> CompileToExecutable(
@@ -49,6 +52,8 @@ class CodegenTestBase : public HloTestBase {
// Runs FileCheck with the given pattern over the given string and EXPECTs
// that FileCheck succeeded in matching the input.
void RunFileCheck(const string& input, const string& pattern);

bool fast_math_disabled_ = false;
};

} // namespace xla
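A hypothetical test exercising the new hook might look like the sketch below. The fixture name and the MakePowMinusHalfModule helper are assumptions made for illustration; only set_fast_math_disabled and CompileAndVerifyIr come from this PR:

class GpuPowCodegenTest : public CodegenTestBase {};

TEST_F(GpuPowCodegenTest, PowMinusHalfFallsBackWhenFastMathDisabled) {
  // MakePowMinusHalfModule is assumed to build an HLO module that computes
  // pow(x, -0.5) for some f32 parameter x.
  std::unique_ptr<HloModule> module = MakePowMinusHalfModule();

  // With fast math disabled, EmitPowerOp must not use the 1/sqrt rewrite,
  // so the emitted IR should still contain a libdevice pow call.
  set_fast_math_disabled(true);
  CompileAndVerifyIr(std::move(module), "CHECK: __nv_pow");
}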