diff --git a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
index a7815eb..b87407f 100644
--- a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
+++ b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
@@ -6,7 +6,6 @@
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
@@ -17,9 +16,7 @@ def run(
     assert quant_type in ("fp8", "nvfp4"), "unsupported"
 
     # Load model.
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -34,6 +31,16 @@ def run(
                 "re:.*shared_expert.*",
             ]
         )
+    elif model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+        ignore_list.extend(
+            [
+                "re:.*self_attn",
+                "re:.*router",
+                "re:.*vision_model.*",
+                "re:.*multi_modal_projector.*",
+                "Llama4TextAttention",
+            ]
+        )
 
     if quant_type == "fp8":
         # Configure the quantization algorithm and scheme.
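
For context, a minimal sketch of how the rest of run() presumably consumes ignore_list after this change, following the standard llm-compressor oneshot flow. Everything outside the hunks above is an assumption: the "lm_head" base entry, the FP8_DYNAMIC scheme name, and the generation sanity check are illustrative, not part of the patch.

# Minimal sketch, not part of the patch: how run() presumably wires the
# ignore list into the llm-compressor recipe for the fp8 path. The
# "lm_head" base entry and the FP8_DYNAMIC scheme name are assumptions.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Base ignore list (assumed) plus the Llama-4-Scout entries added in this diff:
# attention, router, and multimodal modules are kept in high precision.
ignore_list = ["lm_head"]
ignore_list.extend(
    [
        "re:.*self_attn",
        "re:.*router",
        "re:.*vision_model.*",
        "re:.*multi_modal_projector.*",
        "Llama4TextAttention",
    ]
)

# FP8 quantization of the remaining Linear layers with dynamic activation scales.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=ignore_list)
oneshot(model=model, recipe=recipe)

# Dispatch across available devices and sanity-check generation on the quantized model.
dispatch_for_generation(model)
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))

With attention, router, and multimodal projector modules ignored, quantization is effectively limited to the text model's MLP/expert projections, which hold most of the weight volume.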