[DataType] Add bfloat16 (apache#5601)

trevor-m · Jun 30, 2020 · 3d10f79 · 3d10f79
1 parent 675c585
commit 3d10f79
Show file tree

Hide file tree

Showing 10 changed files with 676 additions and 2 deletions.
diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
@@ -53,6 +53,7 @@ class DataType {
     kUInt = kDLUInt,
     kFloat = kDLFloat,
     kHandle = TVMArgTypeCode::kTVMOpaqueHandle,
+    kBFloat = kDLBfloat,
     kCustomBegin = 129
   };
   /*! \brief default constructor */
@@ -72,6 +73,9 @@ class DataType {
     data_.code = static_cast<uint8_t>(code);
     data_.bits = static_cast<uint8_t>(bits);
     data_.lanes = static_cast<uint16_t>(lanes);
+    if (code == kBFloat) {
+      CHECK_EQ(bits, 16);
+    }
   }
   /*! \return The type code. */
   int code() const { return static_cast<int>(data_.code); }
@@ -89,6 +93,8 @@ class DataType {
   bool is_float() const { return code() == DataType::kFloat; }
   /*! \return whether type is a float16 type. */
   bool is_float16() const { return is_float() && bits() == 16; }
+  /*! \return whether type is a bfloat16 type. */
+  bool is_bfloat16() const { return code() == DataType::kBFloat && bits() == 16; }
   /*! \return whether type is an int type. */
   bool is_int() const { return code() == DataType::kInt; }
   /*! \return whether type is an uint type. */
@@ -283,6 +289,8 @@ inline const char* DLDataTypeCode2Str(DLDataTypeCode type_code) {
       return "float";
     case DataType::kHandle:
       return "handle";
+    case kDLBfloat:
+      return "bfloat";
     default:
       LOG(FATAL) << "unknown type_code=" << static_cast<int>(type_code);
       return "";
@@ -349,6 +357,9 @@ inline DLDataType String2DLDataType(std::string s) {
     t.bits = 1;
     t.lanes = 1;
     return t;
+  } else if (s.substr(0, 6) == "bfloat") {
+    t.code = DataType::kBFloat;
+    scan = s.c_str() + 6;
   } else if (s.substr(0, 6) == "custom") {
     t.code = ParseCustomDatatype(s, &scan);
   } else {

diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h
@@ -751,7 +751,7 @@ inline PrimExpr MakeConstScalar(DataType t, ValueType value) {
       return LargeUIntImm(t, static_cast<int64_t>(low), static_cast<int64_t>(high));
     }
   }
-  if (t.is_float()) return FloatImm(t, static_cast<double>(value));
+  if (t.is_float() || t.is_bfloat16()) return FloatImm(t, static_cast<double>(value));
   // For now, we store const scalar values of custom datatypes within doubles; later, during the
   // datatypes lowering pass, we will lower the value to its true representation in the format
   // specified by the datatype.

diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
@@ -321,6 +321,13 @@ TVM_DLL Pass CombineContextCall();
  */
 TVM_DLL Pass NarrowDataType(int target_bits);
 
+/*!
+ * \brief Legalize bf16 typed Ops. Add a cast to fp32
+ *   before Ops, then add a cast back to bf16.
+ * \return The pass.
+ */
+TVM_DLL Pass BF16Legalize();
+
 /*!
  * \brief Rewrite the pointer content type of arguments,
  *  as well as Alloc internal to the function to use

diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
@@ -54,6 +54,7 @@ class DataTypeCode(object):
     UINT = 1
     FLOAT = 2
     HANDLE = 3
+    BFLOAT = 4
 
 
 class DataType(ctypes.Structure):
@@ -65,7 +66,8 @@ class DataType(ctypes.Structure):
         DataTypeCode.INT : 'int',
         DataTypeCode.UINT : 'uint',
         DataTypeCode.FLOAT : 'float',
-        DataTypeCode.HANDLE : 'handle'
+        DataTypeCode.HANDLE : 'handle',
+        DataTypeCode.BFLOAT : 'bfloat'
     }
     def __init__(self, type_str):
         super(DataType, self).__init__()
@@ -96,6 +98,9 @@ def __init__(self, type_str):
             self.type_code = DataTypeCode.HANDLE
             bits = 64
             head = ""
+        elif head.startswith("bfloat"):
+            self.type_code = DataTypeCode.BFLOAT
+            head = head[6:]
         elif head.startswith("custom"):
             # pylint: disable=import-outside-toplevel
             import tvm.runtime._ffi_api

diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
@@ -176,6 +176,7 @@ def lower(sch,
     pass_list += [
         tvm.tir.transform.InjectPrefetch(),
         tvm.tir.transform.StorageFlatten(64, instrument_bound_checkers),
+        tvm.tir.transform.BF16Legalize(),
         tvm.tir.transform.NarrowDataType(32),
         tvm.tir.transform.Simplify(),
     ]

diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
@@ -226,6 +226,56 @@ def RemoveNoOp():
     """
     return _ffi_api.RemoveNoOp()
 
+def BF16Legalize():
+    """Legalize bf16-typed ops.
+    Runs the BF16Promote, BF16CastElimination and BF16TypeLowering passes.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.BF16Legalize()
+
+def BF16Promote():
+    """Promote bf16 ops to fp32: cast the operands to fp32
+    before each op, then cast the result back to bf16.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.BF16Promote()
+
+def BF16CastElimination():
+    """Eliminate redundant casts between fp32 and bf16.
+    Checks if the AST has the pattern:
+    castto32(castto16(some_fp32_op(...)))
+    The redundant casts are generated by BF16Promote for multiple
+    bf16 ops in a row, e.g.:
+    X[i] + Y[i] + T[i] =>
+    bf16((float32(bf16((float32(X[i]) + float32(Y[i])))) + float32(T[i])))
+    After this pass:
+    bf16(float32(X[i]) + float32(Y[i]) + float32(T[i]))
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.BF16CastElimination()
+
+def BF16TypeLowering():
+    """Replace all bf16 types with uint16. Also lower the casts
+    between fp32 and bf16.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.BF16TypeLowering()
 
 def RewriteUnsafeSelect():
     """Detect and rewrite unsafe select that contains memory access.

diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
@@ -162,6 +162,7 @@ IRModule lower(te::Schedule sch, const Array<te::Tensor>& args, const std::strin
   pass_list.push_back(tir::transform::InjectPrefetch());
   pass_list.push_back(tir::transform::StorageFlatten(64, instrument_bound_checkers));
   // Phase 1
+  pass_list.push_back(tir::transform::BF16Legalize());
   pass_list.push_back(tir::transform::NarrowDataType(32));
   pass_list.push_back(tir::transform::Simplify());
   pass_list.push_back(tir::transform::LoopPartition());