From 8fc7288da1b8164bfe5beead7da611704ef2c135 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 5 Nov 2025 04:05:36 -0800
Subject: [PATCH] llm_compressor example of llama 4 scout

Summary:

Test Plan:

```
with-proxy python quantize_hf_model_with_llm_compressor.py --model_name "meta-llama/Llama-4-Scout-17B-16E-Instruct"
python inspect_llm_compressor_output.py --dir_name data/llmcompressor/fp8-Llama-4-Scout-17B-16E-Instruct/
```

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .../quantize_hf_model_with_llm_compressor.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
index a7815eb..b87407f 100644
--- a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
+++ b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
@@ -6,7 +6,6 @@
 from llmcompressor.modifiers.quantization import QuantizationModifier
 from llmcompressor.utils import dispatch_for_generation
 
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
@@ -17,9 +16,7 @@ def run(
     assert quant_type in ("fp8", "nvfp4"), "unsupported"
 
     # Load model.
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, torch_dtype=torch.bfloat16
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
     print(model)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -34,6 +31,16 @@ def run(
                 "re:.*shared_expert.*",
             ]
         )
+    elif model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+        ignore_list.extend(
+            [
+                "re:.*self_attn",
+                "re:.*router",
+                "re:.*vision_model.*",
+                "re:.*multi_modal_projector.*",
+                "Llama4TextAttention",
+            ]
+        )
 
     if quant_type == "fp8":
         # Configure the quantization algorithm and scheme.