
Commit ed83ae2

Unpin nightly version (#593)
Summary: Previously there were some inductor errors, so we pinned the nightly version. Those should be fixed by pytorch/pytorch#132096, and we can no longer call `unwrap_tensor_subclass` before `torch.compile` on 2.5+ nightlies (the calls are now guarded to torch versions before 2.5).

Test Plan: fix CI errors

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 00529fa commit ed83ae2
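
Every code change below instantiates the same pattern: guard `unwrap_tensor_subclass` so it only runs on torch versions before 2.5. A minimal sketch of that pattern (using `int8_weight_only` as a stand-in for whichever quantization API a given call site applies):

```python
import torch
from torchao.quantization import quantize_, int8_weight_only
from torchao.utils import TORCH_VERSION_AFTER_2_5, unwrap_tensor_subclass

def quantize_and_compile(model: torch.nn.Module) -> torch.nn.Module:
    quantize_(model, int8_weight_only())
    # On torch < 2.5, tensor subclass weights must be unwrapped into
    # plain parameters before torch.compile; 2.5+ nightlies compile
    # them directly, so the unwrap step is skipped there.
    if not TORCH_VERSION_AFTER_2_5:
        unwrap_tensor_subclass(model)
    return torch.compile(model, mode="max-autotune")
```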

File tree

12 files changed: +58 -35 lines

.github/workflows/regression_test.yml

Lines changed: 2 additions & 2 deletions

@@ -33,7 +33,7 @@ jobs:
       gpu-arch-version: "12.1"
     - name: CUDA Nightly
       runs-on: linux.g5.12xlarge.nvidia.gpu
-      torch-spec: '--pre torch==2.5.0.dev20240728+cu121 --index-url https://download.pytorch.org/whl/nightly/cu121'
+      torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
       gpu-arch-type: "cuda"
       gpu-arch-version: "12.1"
     - name: CPU 2.2.2
@@ -48,7 +48,7 @@ jobs:
       gpu-arch-version: ""
     - name: CPU Nightly
       runs-on: linux.4xlarge
-      torch-spec: '--pre torch==2.5.0.dev20240728 --index-url https://download.pytorch.org/whl/nightly/cpu'
+      torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu'
       gpu-arch-type: "cpu"
       gpu-arch-version: ""
 

test/integration/test_integration.py

Lines changed: 15 additions & 5 deletions

@@ -101,21 +101,24 @@
 def _int8wo_api(mod):
     if TORCH_VERSION_AFTER_2_4:
         quantize_(mod, int8_weight_only(), set_inductor_config=False)
-        unwrap_tensor_subclass(mod)
+        if not TORCH_VERSION_AFTER_2_5:
+            unwrap_tensor_subclass(mod)
     else:
         change_linear_weights_to_int8_woqtensors(mod)
 
 def _int8da_int8w_api(mod):
     if TORCH_VERSION_AFTER_2_4:
         quantize_(mod, int8_dynamic_activation_int8_weight(), set_inductor_config=False)
-        unwrap_tensor_subclass(mod)
+        if not TORCH_VERSION_AFTER_2_5:
+            unwrap_tensor_subclass(mod)
     else:
         change_linear_weights_to_int8_dqtensors(mod)
 
 def _int4wo_api(mod):
     if TORCH_VERSION_AFTER_2_4:
         quantize_(mod, int4_weight_only(), set_inductor_config=False)
-        unwrap_tensor_subclass(mod)
+        if not TORCH_VERSION_AFTER_2_5:
+            unwrap_tensor_subclass(mod)
     else:
         change_linear_weights_to_int4_woqtensors(mod)
 
@@ -853,7 +856,8 @@ def api(mod):
             kwargs_copy["group_size"] = groupsize
             del kwargs_copy["groupsize"]
             quantize_(mod, int4_weight_only(**kwargs_copy))
-            unwrap_tensor_subclass(mod)
+            if not TORCH_VERSION_AFTER_2_5:
+                unwrap_tensor_subclass(mod)
         else:
             change_linear_weights_to_int4_woqtensors(mod, **kwargs)
 
@@ -985,6 +989,9 @@ def forward(self, x):
         # save quantized state_dict
         api(model)
 
+        # make sure the model is still runnable
+        model(x)
+
         torch.save(model.state_dict(), "test.pth")
         # get quantized reference
         model_qc = torch.compile(model, mode="max-autotune")
@@ -1004,7 +1011,9 @@ def forward(self, x):
         model.load_state_dict(state_dict, assign=True)
         model = model.to(device=test_device, dtype=test_dtype).eval()
 
-        # get quantized reference
+        # make sure the model is still runnable
+        model(x)
+
         model_qc = torch.compile(model, mode="max-autotune")
         test = model_qc(x).detach()
 
@@ -1013,6 +1022,7 @@ def forward(self, x):
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(is_fbcode(), "'PlainAQTLayout' object has no attribute 'int_data'")
+    @unittest.skipIf(TORCH_VERSION_AFTER_2_5, "Can't save local lambda function for tensor subclass")
     @torch.no_grad()
     def test_save_load_dqtensors(self, device, dtype):
         if device == "cpu":
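
The two added `model(x)` calls give the save/load test an eager-mode smoke check on both sides of serialization, so a broken quantized model fails with a plain traceback before `torch.compile` is involved. A condensed sketch of that flow (the model class and input shape are hypothetical stand-ins for the test fixtures, and the quantization step is elided):

```python
import torch
from torch import nn

class M(nn.Module):  # hypothetical stand-in for the test model
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(64, 32)

    def forward(self, x):
        return self.linear(x)

model, x = M().eval(), torch.randn(2, 64)
# (quantization via one of the _int*_api helpers above would happen here)
model(x)  # make sure the model is still runnable before saving
torch.save(model.state_dict(), "test.pth")

model2 = M().eval()
model2.load_state_dict(torch.load("test.pth"), assign=True)
model2(x)  # make sure the model is still runnable after reload
test = torch.compile(model2, mode="max-autotune")(x)
```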

test/sparsity/test_sparse_api.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@
     int8_dynamic_activation_int8_weight,
     quantize_,
 )
-from torchao.utils import TORCH_VERSION_AFTER_2_3, unwrap_tensor_subclass
+from torchao.utils import TORCH_VERSION_AFTER_2_3
 from torch.testing._internal.common_utils import TestCase
 

torchao/_models/llama/eval.py

Lines changed: 3 additions & 1 deletion

@@ -22,6 +22,7 @@
 import time
 from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer
 from torchao._models.llama.model import prepare_inputs_for_model
+from torchao.utils import TORCH_VERSION_AFTER_2_5
 
 def run_evaluation(
     checkpoint_path: Path,
@@ -88,7 +89,8 @@ def run_evaluation(
         model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length)
         model = quantizer.quantize(model, inputs).to(device)
     else:
-        unwrap_tensor_subclass(model)
+        if not TORCH_VERSION_AFTER_2_5:
+            unwrap_tensor_subclass(model)
 
     if compile:
         model = torch.compile(model, mode="max-autotune", fullgraph=True)

torchao/_models/llama/generate.py

Lines changed: 4 additions & 2 deletions

@@ -13,6 +13,7 @@
 import torch._dynamo.config
 import torch._inductor.config
 from torchao.utils import get_model_size_in_bytes
+from torchao.utils import TORCH_VERSION_AFTER_2_5
 
 def device_sync(device):
     if "cuda" in device:
@@ -115,7 +116,7 @@ def generate(
         from model import AffineQuantizedKVCache
         from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
         _replace_with_custom_fn_if_matches_filter(
-            model,
+            model, 
             AffineQuantizedKVCache.from_float,
             lambda x, y: isinstance(x, torchao._models.llama.model.KVCache),
         )
@@ -232,7 +233,8 @@ def main(
             # do autoquantization
             model.finalize_autoquant()
         else:
-            unwrap_tensor_subclass(model)
+            if not TORCH_VERSION_AFTER_2_5:
+                unwrap_tensor_subclass(model)
 
     model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9
 
torchao/_models/sam/eval_combo.py

Lines changed: 6 additions & 3 deletions

@@ -12,6 +12,7 @@
 from torchao.quantization import quantize_, int8_dynamic_activation_int8_weight, int4_weight_only
 from torchao.sparsity import sparsify_, apply_fake_sparsity, int8_dynamic_activation_int8_semi_sparse_weight, semi_sparse_weight
 from torchao.utils import unwrap_tensor_subclass
+from torchao.utils import TORCH_VERSION_AFTER_2_5
 
 torch._dynamo.config.cache_size_limit = 50000
 
@@ -284,7 +285,8 @@ def run(
 
     if compress == "int8_dynamic_quant":
         quantize_(predictor.model.image_encoder, int8_dynamic_activation_int8_weight())
-        predictor.model.image_encoder = unwrap_tensor_subclass(predictor.model.image_encoder)
+        if not TORCH_VERSION_AFTER_2_5:
+            predictor.model.image_encoder = unwrap_tensor_subclass(predictor.model.image_encoder)
     elif compress == "sparse_mlp_only":
         def mlp_only(mod, name):
             return isinstance(mod, torch.nn.Linear) and 'mlp' in name
@@ -316,7 +318,8 @@ def mlp_only(mod, name):
         sparsify_(predictor.model.image_encoder,
                   semi_sparse_weight(),
                   mlp_lin2_only)
-        predictor.model.image_encoder = unwrap_tensor_subclass(predictor.model.image_encoder)
+        if not TORCH_VERSION_AFTER_2_5:
+            predictor.model.image_encoder = unwrap_tensor_subclass(predictor.model.image_encoder)
 
     else:
         assert compress is None, f"Unsupported compress mode {compress}"
@@ -401,6 +404,6 @@ def mlp_only(mod, name):
     vals = ",".join(map(str, [device, sam_model_type, batch_size, max_memory_allocated_bytes, max_memory_allocated_percentage, img_s, batch_ms_batch_size, mIoU, use_compile,
                               use_half, compress, use_compile_decoder, use_rel_pos, pad_input_image_batch, num_workers, num_batches, num_images, profile_path, memory_path]))
     f.write(vals+"\n")
-
+
 if __name__ == '__main__':
     fire.Fire(run)

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 3 additions & 4 deletions

@@ -607,25 +607,24 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             quantize_affine,
         )
         from torchao.quantization.utils import unpack_tinygemm_scales_and_zeros
+        scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
 
         cur_shape = self.shape
         assert len(cur_shape) == 4
         inner_k_tiles = cur_shape[-1] * 2
         original_shape = (cur_shape[0] * 8, cur_shape[1] * (inner_k_tiles * 16))
         eye_shape = original_shape[1]
-        block_size = (1, 32)
+        groupsize = int(original_shape[1] / scale.shape[-2])
+        block_size = (1, groupsize)
         device = self.device
         original_dtype = torch.bfloat16
-        groupsize = 32
         target_dtype = torch.int32
         quant_min = 0
         quant_max = 15
         zero_point_domain = ZeroPointDomain.FLOAT
         assert len(block_size) == 2 and block_size[0] == 1
-        groupsize = block_size[-1]
         dequantized = torch.ops.aten._weight_int4pack_mm(torch.eye(eye_shape, device=device, dtype=original_dtype), self.packed_weight, groupsize, self.scale_and_zero)
         dequantized = dequantized.t().contiguous()
-        scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
         # TODO: move this to `unpack_tinygemm_scales_and_zeros`?
         scale = scale.reshape(scale.shape[:-1]).contiguous()
         zero = zero.reshape(zero.shape[:-1]).contiguous()
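
The `get_plain` fix stops hardcoding `groupsize = 32` and instead recovers the group size from the unpacked scale tensor, so int4 tensors packed with other group sizes dequantize correctly. A worked example of the arithmetic (the shapes are illustrative; the real `scale` layout comes from `unpack_tinygemm_scales_and_zeros`):

```python
# Suppose the unpacked weight is (N, K) = (4096, 4096) and it was
# quantized with group_size = 128, giving K / 128 = 32 groups per row.
original_shape = (4096, 4096)
scale_shape = (4096, 32, 1)  # illustrative: scale.shape[-2] is the group count

groupsize = int(original_shape[1] / scale_shape[-2])
assert groupsize == 128      # recovered from the shapes, not assumed to be 32
block_size = (1, groupsize)
```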

torchao/quantization/README.md

Lines changed: 4 additions & 1 deletion

@@ -109,8 +109,11 @@ group_size = 32
 quantize_(m, int4_weight_only(group_size=group_size))
 
 # temporary workaround for tensor subclass + torch.compile
+# NOTE: this is only needed for torch versions before 2.5
+from torchao.utils import TORCH_VERSION_AFTER_2_5
 from torchao.utils import unwrap_tensor_subclass
-m = unwrap_tensor_subclass(m)
+if not TORCH_VERSION_AFTER_2_5:
+    unwrap_tensor_subclass(m)
 # compile the model to improve performance
 m = torch.compile(m, mode='max-autotune')
 
torchao/quantization/quant_api.py

Lines changed: 7 additions & 7 deletions

@@ -337,6 +337,9 @@ def int8_dynamic_activation_int4_weight(group_size=32):
     size is more fine grained
     """
     def apply_int8_dynamic_activation_int4_weight_quant(weight):
+        if weight.shape[-1] % group_size != 0:
+            return weight
+
         # avoid circular dep
         from torchao.dtypes import to_affine_quantized
 
@@ -379,6 +382,9 @@ def int4_weight_only(group_size=128, inner_k_tiles=8):
     `inner_k_tiles`: parameter for int4 mm kernel, choices are [8, 4, 2]
     """
     def apply_int4_weight_only_quant(weight):
+        if weight.shape[-1] % group_size != 0:
+            return weight
+
         # avoid circular dep
         from torchao.dtypes import to_affine_quantized
         from torchao.dtypes import TensorCoreTiledLayoutType
@@ -438,18 +444,12 @@ def get_weight_block_size(x):
         zero_point_dtype = torch.int64
 
         # input settings
-        def get_per_token_block_size(x):
-            block_size = list(x.shape)
-            for i in range(len(block_size)-1):
-                block_size[i] = 1
-            return block_size
-
         input_mapping_type = MappingType.SYMMETRIC
         input_target_dtype = torch.int8
         input_eps = 1e-5
         input_quant_min = -127
         input_quant_max = 127
-        input_quant_func = lambda x: to_affine_quantized(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)
+        input_quant_func = lambda x: to_affine_quantized(x, input_mapping_type, _get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)
 
         block_size = get_weight_block_size(weight)
         weight = to_affine_quantized(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype, layout_type=layout_type)
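
The two new early returns make `int4_weight_only` and `int8_dynamic_activation_int4_weight` leave a layer in full precision when its last weight dimension is not evenly divisible by `group_size`, instead of failing inside the quantization kernel. A small sketch of the guard in isolation (hypothetical shapes):

```python
import torch

def can_group(weight: torch.Tensor, group_size: int = 128) -> bool:
    # mirrors the new guard: weights that cannot be split into equal
    # groups along the last dimension are returned unquantized
    return weight.shape[-1] % group_size == 0

print(can_group(torch.randn(32, 4096)))  # True:  4096 % 128 == 0 -> quantize
print(can_group(torch.randn(32, 1000)))  # False: 1000 % 128 != 0 -> skip
```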

torchao/quantization/quant_primitives.py

Lines changed: 3 additions & 1 deletion

@@ -233,6 +233,7 @@ def _quantize_affine_no_dtype_cast(
     # TODO: validations
     # TODO: validate scale/zero_point dimensions are compatible with block_size
     assert input.dtype in [torch.float32, torch.float16, torch.bfloat16], f"Unsupported input dtype: {input.dtype}"
+    assert len(block_size) == input.dim(), f"Got input dim:{input.dim()}, block_size: {block_size}"
     shape_for_reduction, reduction_dims = _get_reduction_params(block_size, input.size())
     original_shape = input.shape
     input = input.view(shape_for_reduction)
@@ -349,6 +350,7 @@ def _dequantize_affine_no_dtype_check(
     zero_point_domain: str = ZeroPointDomain.INT.name,
     output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
+    assert len(block_size) == input.dim(), f"Got input dim:{input.dim()}, block_size: {block_size}"
     shape_for_reduction, reduction_dims = _get_reduction_params(block_size, input.size())
     original_shape = input.shape
     input = input.view(shape_for_reduction)
@@ -589,7 +591,7 @@ def _choose_qparams_affine(
     if zero_point_dtype is None:
         zero_point_dtype = input.dtype
 
-    assert len(block_size) == input.dim()
+    assert len(block_size) == input.dim(), f"Got input dim:{input.dim()}, block_size: {block_size}"
     shape_for_reduction, reduction_dims = _get_reduction_params(block_size, input.size())
     input = input.view(shape_for_reduction)

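Each assertion now fires with the offending shapes in its message whenever `block_size` does not have exactly one entry per input dimension, which is what a caller sees when passing, say, a rank-3 block size for a rank-2 tensor:

```python
import torch

x = torch.randn(8, 1024)   # rank-2 input
block_size = (1, 1, 1024)  # rank-3 block_size: one entry too many

try:
    assert len(block_size) == x.dim(), \
        f"Got input dim:{x.dim()}, block_size: {block_size}"
except AssertionError as e:
    print(e)  # Got input dim:2, block_size: (1, 1, 1024)
```
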
torchao/quantization/utils.py

Lines changed: 7 additions & 7 deletions

@@ -129,6 +129,13 @@ def guard_dtype_size(tensor_arg, arg_name, dtype=None, size=None):
     if size is not None and tensor_arg.size() != size:
         raise ValueError(f"Expected Tensor argument {arg_name} to have size {size}, but got {tensor_arg.size()} instead.")
 
+def _get_per_token_block_size(x: torch.Tensor) -> List[int]:
+    block_size = []
+    for _ in range(len(x.shape)-1):
+        block_size.append(1)
+    block_size.append(x.shape[-1])
+    return block_size
+
 # taken from
 # https://github.com/mit-han-lab/smoothquant/blob/2f87951dacfb9238d8d657f52ae83a82a3c9ba0c/smoothquant/fake_quant.py#L26
 # and slightly modified
@@ -492,10 +499,3 @@ def recommended_inductor_config_setter():
     torch._inductor.config.fx_graph_cache = True
     torch._inductor.config.triton.unique_kernel_names = True
     torch.set_float32_matmul_precision("high")
-
-def _get_per_token_block_size(x: torch.Tensor) -> List[int]:
-    block_size = []
-    for i in range(len(x.shape)-1):
-        block_size.append(1)
-    block_size.append(x.shape[-1])
-    return block_size
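
`_get_per_token_block_size` is relocated within `utils.py` (unchanged in behavior) and is now the shared helper that the `quant_api.py` lambda above calls. Its output, for reference (a sketch, assuming the function is importable from `torchao.quantization.utils` at this commit):

```python
import torch
from torchao.quantization.utils import _get_per_token_block_size

x = torch.randn(4, 16, 1024)         # (batch, seq_len, hidden)
print(_get_per_token_block_size(x))  # [1, 1, 1024]: one quantization
                                     # group per token, covering the
                                     # full hidden dimension
```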

tutorials/quantize_vit/run_vit_b_quant.py

Lines changed: 3 additions & 1 deletion

@@ -31,8 +31,10 @@
 ## compilation configs end
 
 # temporary workaround for the API to work with torch.compile
+from torchao.utils import TORCH_VERSION_AFTER_2_5
 from torchao.utils import unwrap_tensor_subclass
-unwrap_tensor_subclass(model)
+if not TORCH_VERSION_AFTER_2_5:
+    unwrap_tensor_subclass(model)
 
 model = torch.compile(model, mode='max-autotune')