
Commit 6fa8ba1

Remove neural_compressor dependency in MatMulNBits (#24627)
### Description
As titled: removes the neural_compressor dependency from the MatMulNBits quantizer.

### Motivation and Context
The dependency is no longer needed.
1 parent c51d67b commit 6fa8ba1
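
For context, a minimal usage sketch of the RTN weight-only path this commit touches. It assumes the MatMulNBitsQuantizer and RTNWeightOnlyQuantConfig names defined in matmul_nbits_quantizer.py and that quantizer.model exposes ONNXModel.save_model_to_file after process(); the file paths are placeholders, not from this commit:

import onnx

from onnxruntime.quantization.matmul_nbits_quantizer import (
    MatMulNBitsQuantizer,
    RTNWeightOnlyQuantConfig,
)

# Load the float model to be quantized ("model.onnx" is a placeholder path).
model = onnx.load("model.onnx")

# Configure 4-bit weight-only quantization of MatMul weights using RTN.
quantizer = MatMulNBitsQuantizer(
    model,
    block_size=128,       # per-block quantization granularity
    is_symmetric=True,    # symmetric 4-bit quantization
    algo_config=RTNWeightOnlyQuantConfig(),  # RTN algorithm config
)

# process() dispatches RTN/GPTQ configs to int4_quant_algo(); after this
# commit that path no longer probes for the neural_compressor package.
quantizer.process()
quantizer.model.save_model_to_file("model_int4.onnx", use_external_data_format=True)
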

File tree: 1 file changed (+1, -17 lines)

onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py

Lines changed: 1 addition & 17 deletions
@@ -8,15 +8,13 @@
 
 import argparse
 import copy
-import importlib
 import logging
 import os
 
 import numpy as np
 import numpy.typing as npt
 import onnx
 from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
-from packaging import version
 
 from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_matmul_8bits, quantize_qdq_matmul_4bits
 
@@ -1356,21 +1354,7 @@ def process(self):
             self.model = ONNXModel(self.model)  # Ensure the model is wrapped back into ONNXModel
             self.model.clean_initializers()
         else:
-            # use Intel® Neural Compressor for RTN or GPTQ weight-only quantize algorithm
-            try:
-                importlib.import_module("neural_compressor")
-            except Exception as e:
-                logging.error(f"{e}.")
-                raise RuntimeError(
-                    "neural-compressor is not correctly installed. Please check your environment."
-                ) from e
-
-            import neural_compressor
-
-            assert version.parse(neural_compressor.__version__) >= version.parse("2.3.2"), (
-                "Require neural-compressor >= 2.3.2 to support weight only quantization!"
-            )
-
+            # RTN or GPTQ weight-only quantize algorithm
             self.int4_quant_algo()
 