From 8fc7288da1b8164bfe5beead7da611704ef2c135 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 5 Nov 2025 04:05:36 -0800
Subject: [PATCH] llm_compressor example of llama 4 scout

Summary:

Test Plan:

```
with-proxy python quantize_hf_model_with_llm_compressor.py --model_name "meta-llama/Llama-4-Scout-17B-16E-Instruct"
python inspect_llm_compressor_output.py --dir_name data/llmcompressor/fp8-Llama-4-Scout-17B-16E-Instruct/
```

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .../quantize_hf_model_with_llm_compressor.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
index a7815eb..b87407f 100644
--- a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
+++ b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
@@ -6,7 +6,6 @@
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
@@ -17,9 +16,7 @@ def run(
     assert quant_type in ("fp8", "nvfp4"), "unsupported"
 
     # Load model.
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -34,6 +31,16 @@ def run(
                 "re:.*shared_expert.*",
             ]
         )
+    elif model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+        ignore_list.extend(
+            [
+                "re:.*self_attn",
+                "re:.*router",
+                "re:.*vision_model.*",
+                "re:.*multi_modal_projector.*",
+                "Llama4TextAttention",
+            ]
+        )
 
     if quant_type == "fp8":
         # Configure the quantization algorithm and scheme.