Add support for CPU Inference

Refer: AutoGPTQ/AutoGPTQ#385
Signed-off-by: Vivek Khandelwal <vivek@nod-labs.com>
vivekkhandelwal1 · Oct 31, 2023 · 78342d1 · 78342d1
1 parent 8e7588b
commit 78342d1
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
@@ -240,10 +240,10 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st
                     out_features = layer.weight.shape[1]
                 if not (self.desc_act) or self.group_size == -1:
                     new_layer = QuantLinear(
-                        self.bits, self.group_size, in_features, out_features, True, use_cuda_fp16=self.use_cuda_fp16
+                        self.bits, self.group_size, in_features, out_features, True, use_cuda_fp16=self.use_cuda_fp16, weight_dtype=layer.weight.dtype
                     )
                 else:
-                    new_layer = QuantLinear(self.bits, self.group_size, in_features, out_features, True)
+                    new_layer = QuantLinear(self.bits, self.group_size, in_features, out_features, True, weight_dtype=layer.weight.dtype)
                 new_layer.device = device
                 setattr(module, attr, new_layer.to(device))
         for name1, child in module.named_children():

diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
@@ -35,7 +35,7 @@
 TORCH_MINIMUM_VERSION = packaging.version.parse("1.11.0")
 TRANSFORMERS_MINIMUM_VERSION = packaging.version.parse("4.25.0")
 DIFFUSERS_MINIMUM_VERSION = packaging.version.parse("0.18.0")
-AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.4.2")
+AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0")
 
 
 # This is the minimal required version to support some ONNX Runtime features