Commit cfabc13

Update method names to support intx and floatx changes (#775)
1 parent: f538027

13 files changed (+54, −55 lines)

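The change is a pure rename: every call site keeps its arguments, and only the constructor name changes, with the new `intx`/`floatx` suffix spelling out the target dtype family. A minimal before/after sketch (the weight `W` is illustrative; the argument list follows the int8 weight-only call sites in this diff):

```python
import torch
from torchao.quantization.quant_primitives import MappingType
from torchao.dtypes import to_affine_quantized_intx  # was: to_affine_quantized

W = torch.randn(64, 128)  # high-precision ("hp") weight to quantize

# Before: to_affine_quantized(W, ...)
# After: identical arguments, renamed constructor.
quantized = to_affine_quantized_intx(
    W,
    MappingType.SYMMETRIC,              # symmetric int8 mapping
    (1, W.shape[1]),                    # per-row quantization block
    torch.int8,                         # integer target dtype -> "intx"
    eps=torch.finfo(torch.float32).eps,
    zero_point_dtype=torch.int64,
)
```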

docs/source/api_ref_dtypes.rst

Lines changed: 3 additions & 1 deletion
```diff
@@ -11,7 +11,9 @@ torchao.dtypes
    :nosignatures:

    to_nf4
-   to_affine_quantized
+   to_affine_quantized_intx
+   to_affine_quantized_floatx
+   to_affine_quantized_intx_static
    AffineQuantizedTensor

 ..
```

test/dtypes/test_affine_quantized.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -10,9 +10,6 @@
     int8_dynamic_activation_int8_semi_sparse_weight,
     float8_weight_only,
 )
-from torchao.dtypes import (
-    to_affine_quantized,
-)
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5

 import torch
```

test/hqq/test_hqq_affine.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,7 +1,7 @@
 import unittest
 import torch
 from torchao.dtypes.affine_quantized_tensor import (
-    to_affine_quantized,
+    to_affine_quantized_intx,
     ZeroPointDomain,
     PlainAQTLayout,
     PlainLayoutType,
@@ -49,7 +49,7 @@ def _eval_hqq(nbits, layout_type):
     if isinstance(layout_type, TensorCoreTiledLayoutType):
         target_dtype = torch.uint8 if TORCH_VERSION_AT_LEAST_2_5 else torch.int32

-    q_tensor_hqq = to_affine_quantized(
+    q_tensor_hqq = to_affine_quantized_intx(
         input_float=W,
         mapping_type=mapping_type,
         block_size=block_size,
```

torchao/dtypes/__init__.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -3,8 +3,8 @@
 from .uint4 import UInt4Tensor
 from .affine_quantized_tensor import (
     AffineQuantizedTensor,
-    to_affine_quantized,
-    to_affine_quantized_static,
+    to_affine_quantized_intx,
+    to_affine_quantized_intx_static,
     to_affine_quantized_floatx,
     LayoutType,
     PlainLayoutType,
@@ -17,8 +17,8 @@
     "to_nf4",
     "UInt4Tensor"
     "AffineQuantizedTensor",
-    "to_affine_quantized",
-    "to_affine_quantized_static",
+    "to_affine_quantized_intx",
+    "to_affine_quantized_intx_static",
     "to_affine_quantized_floatx",
     "LayoutType",
     "PlainLayoutType",
```

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 12 additions & 12 deletions
```diff
@@ -187,7 +187,7 @@ def __tensor_unflatten__(
         )

     @classmethod
-    def from_float(
+    def from_hp_to_intx(
         cls,
         input_float: torch.Tensor,
         mapping_type: MappingType,
@@ -213,16 +213,16 @@ def from_float(
             group_size = max(block_size)
             compute_dtype = zero_point_dtype if (zero_point_dtype is not None) else input_float.dtype
             device = input_float.device
-            int_data, scale, zero_point, _ = quantize_affine_hqq(input_float, nbits=nbits, group_size=group_size, axis=axis, compute_dtype=compute_dtype, device=device, verbose=False, raw_output=False)
-            int_data = int_data.to(target_dtype)
+            data, scale, zero_point, _ = quantize_affine_hqq(input_float, nbits=nbits, group_size=group_size, axis=axis, compute_dtype=compute_dtype, device=device, verbose=False, raw_output=False)
+            data = data.to(target_dtype)
         else:
             scale, zero_point = choose_qparams_affine(input_float, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, scale_dtype, zero_point_dtype, preserve_zero, zero_point_domain)
-            int_data = quantize_affine(input_float, block_size, scale, zero_point, target_dtype, quant_min, quant_max, zero_point_domain)
+            data = quantize_affine(input_float, block_size, scale, zero_point, target_dtype, quant_min, quant_max, zero_point_domain)
             # Note: output will be uint8 tensor for sub byte tensors for now

-        int_data = layout_type.post_process(int_data)
+        data = layout_type.post_process(data)
         layout_tensor_ctr = get_layout_tensor_constructor(type(layout_type))
-        layout_tensor = layout_tensor_ctr(int_data, scale, zero_point, layout_type)
+        layout_tensor = layout_tensor_ctr(data, scale, zero_point, layout_type)
         return cls(
             layout_tensor,
             block_size,
@@ -234,7 +234,7 @@ def from_float(
         )

     @classmethod
-    def from_float_static(
+    def from_hp_to_intx_static(
         cls,
         input_float: torch.Tensor,
         scale: torch.Tensor,
@@ -266,15 +266,15 @@ def from_float_static(
         )

     @classmethod
-    def from_float_to_floatx(
+    def from_hp_to_floatx(
         cls,
         input_float: torch.Tensor,
         block_size: Tuple[int, ...],
         target_dtype: torch.dtype = torch.float8_e4m3fn,
         layout_type: LayoutType = PlainLayoutType(),
     ):
         if target_dtype in FP8_TYPES:
-            cls.from_float(
+            return cls.from_hp_to_intx(
                 input_float=input_float,
                 mapping_type=MappingType.SYMMETRIC,
                 block_size=block_size,
@@ -1004,9 +1004,9 @@ def _(func, types, args, kwargs):
     )
     return return_and_correct_aliasing(func, args, kwargs, new)

-to_affine_quantized = AffineQuantizedTensor.from_float
-to_affine_quantized_static = AffineQuantizedTensor.from_float_static
-to_affine_quantized_floatx = AffineQuantizedTensor.from_float_to_floatx
+to_affine_quantized_intx = AffineQuantizedTensor.from_hp_to_intx
+to_affine_quantized_intx_static = AffineQuantizedTensor.from_hp_to_intx_static
+to_affine_quantized_floatx = AffineQuantizedTensor.from_hp_to_floatx

 if TORCH_VERSION_AT_LEAST_2_5:
     # Allow a model with AffineQuantizedTensor weights to be loaded with `weights_only=True`
```
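The module-level names remain thin aliases for the classmethods, so the alias and the classmethod are interchangeable. A sketch of the equivalence (tensor and settings are illustrative):

```python
import torch
from torchao.dtypes.affine_quantized_tensor import (
    AffineQuantizedTensor,
    to_affine_quantized_intx,
)
from torchao.quantization.quant_primitives import MappingType

W = torch.randn(32, 64)

# The alias is literally AffineQuantizedTensor.from_hp_to_intx, so these match:
q1 = to_affine_quantized_intx(W, MappingType.SYMMETRIC, (1, W.shape[1]), torch.int8)
q2 = AffineQuantizedTensor.from_hp_to_intx(W, MappingType.SYMMETRIC, (1, W.shape[1]), torch.int8)
assert type(q1) is type(q2) is AffineQuantizedTensor
```

Note the floatx hunk also fixes a latent bug: `from_float_to_floatx` dropped its result, while `from_hp_to_floatx` now returns `cls.from_hp_to_intx(...)`.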

torchao/prototype/hqq/example.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -1,7 +1,7 @@
 import torch
 from torchao.prototype.hqq.core import HQQQuantizer
 from torchao.dtypes.affine_quantized_tensor import (
-    to_affine_quantized,
+    to_affine_quantized_intx,
     ZeroPointDomain,
     PlainAQTLayout,
     PlainLayoutType,
@@ -38,7 +38,7 @@

 for nbits in list(range(2, 9))[::-1]:
     print('------------------------------------------------------------------------------')
-    q_tensor_default = to_affine_quantized(
+    q_tensor_default = to_affine_quantized_intx(
         input_float=W,
         mapping_type=mapping_type,
         block_size=block_size,
@@ -57,7 +57,7 @@
     # nbits 4 | Default Dot product error 0.005926903802901506


-    q_tensor_hqq = to_affine_quantized(
+    q_tensor_hqq = to_affine_quantized_intx(
         input_float=W,
         mapping_type=mapping_type,
         block_size=block_size,
@@ -99,7 +99,7 @@
     # nbits 4 | Default Dot product error 0.0015244047390297055


-    q_tensor_hqq = to_affine_quantized(
+    q_tensor_hqq = to_affine_quantized_intx(
         input_float=W,
         mapping_type=mapping_type,
         block_size=block_size,
```

torchao/quantization/README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -82,7 +82,7 @@ as an example:
 ```python
 import torch
 from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
-from torchao.dtypes import to_affine_quantized
+from torchao.dtypes import to_affine_quantized_intx
 import copy
 from torchao.quantization.quant_api import (
     quantize_,
@@ -142,9 +142,9 @@ speedup: 2.2715200981216173

 What we do underlying the APIs are roughly the following:
 ```
-from torchao.dtypes import to_affine_quantized
+from torchao.dtypes import to_affine_quantized_intx
 def int8wo_quant(weight):
-    return to_affine_quantized(weight, MappingType.SYMMETRIC, (1, weight.shape[1]), torch.int8, eps=torch.finfo(torch.float32).eps, zero_point_dtype=torch.int64)
+    return to_affine_quantized_intx(weight, MappingType.SYMMETRIC, (1, weight.shape[1]), torch.int8, eps=torch.finfo(torch.float32).eps, zero_point_dtype=torch.int64)

 for n, m in model.named_modules():
     if isinstance(m, torch.nn.Linear):
````
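For comparison, the same int8 weight-only transform is what the high-level API applies for you; a sketch, assuming the `int8_weight_only` helper exported by `torchao.quantization.quant_api` in this release:

```python
import torch
from torchao.quantization.quant_api import quantize_, int8_weight_only

model = torch.nn.Sequential(torch.nn.Linear(128, 64)).to(torch.bfloat16)
# Swaps each Linear weight for an AffineQuantizedTensor built by
# to_affine_quantized_intx (see the quant_api.py hunks below).
quantize_(model, int8_weight_only())
```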

torchao/quantization/autoquant.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -284,7 +284,7 @@ def from_float(cls, weight):
         # return weight

         # avoid circular dep
-        from torchao.dtypes import to_affine_quantized
+        from torchao.dtypes import to_affine_quantized_intx
         # weight settings
         mapping_type = MappingType.SYMMETRIC
         def get_weight_block_size(x):
@@ -306,10 +306,10 @@ def get_per_token_block_size(x):
         input_quant_min = -127
         input_quant_max = 127
         layout_type = PlainLayoutType()
-        input_quant_func = lambda x: to_affine_quantized(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)
+        input_quant_func = lambda x: to_affine_quantized_intx(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)

         block_size = get_weight_block_size(weight)
-        weight = to_affine_quantized(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype, layout_type=layout_type)
+        weight = to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype, layout_type=layout_type)
         weight = super(AQInt8DynamicallyQuantizedLinearWeight, cls).from_float(weight, input_quant_func)
         return weight
@@ -371,7 +371,7 @@ def from_float(cls, weight):
         eps = torch.finfo(torch.float32).eps
         zero_point_dtype = torch.int64
         block_size = (1, weight.shape[1])
-        return super(AQWeightOnlyQuantizedLinearWeight, cls).from_float(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)
+        return super(AQWeightOnlyQuantizedLinearWeight, cls).from_hp_to_intx(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)


 class AQWeightOnlyQuantizedLinearWeight2(AQWeightOnlyQuantizedLinearWeight, AQMixin):
```
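The input path above quantizes activations per token: the block has size 1 on every leading dimension and spans the full last dimension. A standalone sketch of that recipe (the helper here reimplements `get_per_token_block_size` for illustration):

```python
import torch
from torchao.dtypes import to_affine_quantized_intx
from torchao.quantization.quant_primitives import MappingType

def per_token_block_size(x: torch.Tensor):
    # one quantization group per token: size 1 everywhere except the last dim
    return (1,) * (x.dim() - 1) + (x.shape[-1],)

x = torch.randn(2, 16, 128, dtype=torch.float16)
xq = to_affine_quantized_intx(
    x, MappingType.SYMMETRIC, per_token_block_size(x), torch.int8,
    eps=1e-5, quant_min=-127, quant_max=127,  # reduced range, as in the hunk
    scale_dtype=torch.float32 if x.dtype == torch.float16 else None,
)
```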

torchao/quantization/prototype/mixed_precision/scripts/naive_intNwo.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -21,7 +21,7 @@ def intN_weight_only(group_size=32, n=8, symmetric=False):
     # for asymmetric quantization
     def apply_intN_weight_only_quant_asym(weight):
         # avoid circular dependency
-        from torchao.dtypes import to_affine_quantized
+        from torchao.dtypes import to_affine_quantized_intx
         mapping_type = MappingType.ASYMMETRIC
         block_size = (1, group_size)
         target_dtype = torch.uint8
@@ -31,20 +31,20 @@ def apply_intN_weight_only_quant_asym(weight):
         preserve_zero = True
         zero_point_dtype = torch.int64
         zero_point_domain = ZeroPointDomain.INT
-        return to_affine_quantized(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype)#, preserve_zero=preserve_zero,zero_point_domain=zero_point_domain)
+        return to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype)#, preserve_zero=preserve_zero,zero_point_domain=zero_point_domain)

     # for symmetric quantization
     def apply_intN_weight_only_quant_sym(weight):
         # avoid circular dependency
-        from torchao.dtypes import to_affine_quantized
+        from torchao.dtypes import to_affine_quantized_intx
         mapping_type = MappingType.SYMMETRIC
         block_size = (1, group_size)
         target_dtype = torch.int8
         quant_min = -2**(n-1)
         quant_max = 2**(n-1)-1
         eps = 1e-6
         zero_point_dtype = torch.int64
-        return to_affine_quantized(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps=eps, zero_point_dtype=zero_point_dtype)
+        return to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps=eps, zero_point_dtype=zero_point_dtype)

     try:
         assert n in [8, 6, 5, 4, 3, 2], "n must be one of [8, 6, 5, 4, 3, 2]"
```
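In the symmetric branch the range follows directly from the bit width n: `quant_min = -2**(n-1)` and `quant_max = 2**(n-1) - 1`, so n = 4 gives [-8, 7]. A quick check of that arithmetic over the accepted widths:

```python
for n in [8, 6, 5, 4, 3, 2]:  # widths accepted by intN_weight_only
    quant_min, quant_max = -2 ** (n - 1), 2 ** (n - 1) - 1
    print(f"n={n}: [{quant_min}, {quant_max}]")
# n=8: [-128, 127], n=6: [-32, 31], ..., n=2: [-2, 1]
```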

torchao/quantization/prototype/qat/api.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -49,7 +49,7 @@ def int8_dynamic_activation_int4_weight_fake_quantize(group_size=32):
     quantize_(model, int8_dynamic_activation_int4_weight_fake_quantize(group_size=32))
     """
     # avoid circular dep
-    from torchao.dtypes import to_affine_quantized
+    from torchao.dtypes import to_affine_quantized_intx

     def _apply_weight_fake_quant(weight: torch.Tensor):
         mapping_type = MappingType.SYMMETRIC
```

torchao/quantization/quant_api.py

Lines changed: 10 additions & 10 deletions
```diff
@@ -23,7 +23,7 @@

 from torchao.dtypes.uintx.Uintx import UintxLayoutType
 from torchao.dtypes import (
-    to_affine_quantized,
+    to_affine_quantized_intx,
     TensorCoreTiledLayoutType,
     PlainLayoutType,
     AffineQuantizedTensor,
@@ -323,11 +323,11 @@ def quantize_(
     # You can also add your own apply_tensor_subclass by manually calling tensor subclass constructor
     # on weight

-    from torchao.dtypes import to_affine_quantized
+    from torchao.dtypes import to_affine_quantized_intx

     # weight only uint4 asymmetric groupwise quantization
     groupsize = 32
-    apply_weight_quant = lambda x: to_affine_quantized(
+    apply_weight_quant = lambda x: to_affine_quantized_intx(
         x, "asymmetric", (1, groupsize), torch.int32, 0, 15, 1e-6,
         zero_point_dtype=torch.bfloat16, preserve_zero=False, zero_point_domain="float")

@@ -356,7 +356,7 @@ def filter_fn(module: nn.Module, fqn: str) -> bool:
 def _int8_asymm_per_token_quant(x: torch.Tensor) -> torch.Tensor:
     mapping_type = MappingType.ASYMMETRIC
     target_dtype = torch.int8
-    return to_affine_quantized(x, mapping_type, _get_per_token_block_size(x), target_dtype)
+    return to_affine_quantized_intx(x, mapping_type, _get_per_token_block_size(x), target_dtype)

 def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
     if weight.shape[-1] % group_size != 0:
@@ -373,7 +373,7 @@ def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
     # input settings
     input_quant_func = _int8_asymm_per_token_quant

-    weight = to_affine_quantized(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps)
+    weight = to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps)
     weight = to_linear_activation_quantized(weight, input_quant_func)
     return weight

@@ -424,7 +424,7 @@ def apply_int4_weight_only_quant(weight, use_hqq=False):
         preserve_zero = False
         zero_point_dtype = torch.bfloat16
         zero_point_domain = ZeroPointDomain.FLOAT
-        return to_affine_quantized(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero, zero_point_domain=zero_point_domain, layout_type=layout_type)
+        return to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero, zero_point_domain=zero_point_domain, layout_type=layout_type)

     return _get_linear_subclass_inserter(apply_int4_weight_only_quant)

@@ -439,7 +439,7 @@ def apply_int8wo_quant(weight):
         eps = torch.finfo(torch.float32).eps
         zero_point_dtype = torch.int64
         block_size = (1, weight.shape[1])
-        return to_affine_quantized(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)
+        return to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)

     return _get_linear_subclass_inserter(apply_int8wo_quant)

@@ -449,7 +449,7 @@ def _int8_symm_per_token_reduced_range_quant(x: torch.Tensor) -> torch.Tensor:
     eps = 1e-5
     quant_min = -127
     quant_max = 127
-    return to_affine_quantized(x, mapping_type, _get_per_token_block_size(x), target_dtype, eps=eps, quant_min=quant_min, quant_max=quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)
+    return to_affine_quantized_intx(x, mapping_type, _get_per_token_block_size(x), target_dtype, eps=eps, quant_min=quant_min, quant_max=quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)


 def int8_dynamic_activation_int8_weight(layout_type=PlainLayoutType()):
@@ -475,7 +475,7 @@ def get_weight_block_size(x):
     input_quant_func = _int8_symm_per_token_reduced_range_quant

     block_size = get_weight_block_size(weight)
-    weight = to_affine_quantized(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype, layout_type=layout_type)
+    weight = to_affine_quantized_intx(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype, layout_type=layout_type)
     weight = to_linear_activation_quantized(weight, input_quant_func)
     return weight

@@ -527,7 +527,7 @@ def apply_uintx_weight_only_quant(weight):
         zero_point_dtype = torch.int32
         zero_point_domain = ZeroPointDomain.INT

-        return to_affine_quantized(
+        return to_affine_quantized_intx(
             weight, mapping_type, block_size, dtype,
             eps=eps, zero_point_dtype=zero_point_dtype,
             zero_point_domain=zero_point_domain,
```
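The docstring hunk above shows the manual route: pass your own `apply_tensor_subclass` callable to `quantize_`. A sketch of that uint4 groupwise recipe, spelled with explicit enums rather than the docstring's string shorthands:

```python
import torch
from torchao.dtypes import to_affine_quantized_intx
from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain

groupsize = 32

def apply_weight_quant(x: torch.Tensor):
    # uint4 values [0, 15] stored in int32, one group per `groupsize` columns
    return to_affine_quantized_intx(
        x, MappingType.ASYMMETRIC, (1, groupsize), torch.int32, 0, 15, 1e-6,
        zero_point_dtype=torch.bfloat16, preserve_zero=False,
        zero_point_domain=ZeroPointDomain.FLOAT,
    )
```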

tutorials/calibration_flow/gptq_like.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -33,7 +33,7 @@
 import gc
 from typing import Tuple, Dict, Any
 from torchao.quantization.utils import compute_error
-from torchao.dtypes import to_affine_quantized_static
+from torchao.dtypes import to_affine_quantized_intx_static
 from torchao.quantization import quantize_
 from torchao.quantization import to_linear_activation_quantized
 from torchao.quantization import LinearActivationQuantizedTensor
@@ -229,7 +229,7 @@ def _apply_activation_static_quant(observed_linear):

     # activation quantization
     act_scale, act_zero_point = observed_linear.input_scale, observed_linear.input_zp
-    input_quant_func = lambda x: to_affine_quantized_static(x, act_scale, act_zero_point, x.shape, target_dtype)
+    input_quant_func = lambda x: to_affine_quantized_intx_static(x, act_scale, act_zero_point, x.shape, target_dtype)
     observed_linear.weight = torch.nn.Parameter(to_linear_activation_quantized(observed_linear.weight, input_quant_func), requires_grad=False)

     del observed_linear.input_scale
```
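Unlike the dynamic variants, the static constructor takes calibrated scale and zero-point instead of deriving them from the input. A standalone sketch (the qparam values stand in for ones an observer would record):

```python
import torch
from torchao.dtypes import to_affine_quantized_intx_static

x = torch.randn(8, 128)
act_scale = torch.tensor(0.02)                       # from calibration (illustrative)
act_zero_point = torch.tensor(0, dtype=torch.int64)  # illustrative

# block_size == x.shape -> a single scale/zero_point for the whole tensor
xq = to_affine_quantized_intx_static(x, act_scale, act_zero_point, x.shape, torch.int8)
```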
