From 376b96172b812beac0906e930ee940260628f196 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Fri, 3 Oct 2025 07:02:33 -0700
Subject: [PATCH] refactor hf scripts

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .gitignore                                   |  1 +
 hf_torchao_vllm/README.md                    |  2 +-
 .../inspect_llm_compressor_output.py         | 26 ++++++++++
 hf_torchao_vllm/inspect_torchao_output.py    | 31 +++++++++++++
 .../quantize_hf_model_with_llm_compressor.py | 48 ++++++++++++++++++++
 .../quantize_hf_model_with_torchao.py        |  2 +-
 .../utils/inspect_llm_compressor_output.py   | 21 ---------
 .../utils/inspect_torchao_output.py          | 24 ----------
 .../quantize_hf_model_with_llm_compressor.py | 40 ----------------
 9 files changed, 108 insertions(+), 87 deletions(-)
 create mode 100644 hf_torchao_vllm/inspect_llm_compressor_output.py
 create mode 100644 hf_torchao_vllm/inspect_torchao_output.py
 create mode 100644 hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
 delete mode 100644 hf_torchao_vllm/utils/inspect_llm_compressor_output.py
 delete mode 100644 hf_torchao_vllm/utils/inspect_torchao_output.py
 delete mode 100644 hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py

diff --git a/.gitignore b/.gitignore
index c024a81..5203c5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__/
 hf_torchao_vllm/data
+hf_torchao_vllm/sparse_logs
diff --git a/hf_torchao_vllm/README.md b/hf_torchao_vllm/README.md
index fcfef92..7e51ad1 100644
--- a/hf_torchao_vllm/README.md
+++ b/hf_torchao_vllm/README.md
@@ -7,5 +7,5 @@ Example
 python quantize_hf_model_with_torchao.py --model_name "Qwen/Qwen1.5-MoE-A2.7B" --experts_only_qwen_1_5_moe_a_2_7b True --save_model_to_disk True --quant_type nvfp4
 
 # run the model from above in vLLM
-python run_quantized_model_in_vllm.py --model_name "data/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
+python run_quantized_model_in_vllm.py --model_name "data/torchao/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
diff --git a/hf_torchao_vllm/inspect_llm_compressor_output.py b/hf_torchao_vllm/inspect_llm_compressor_output.py
new file mode 100644
index 0000000..825abc6
--- /dev/null
+++ b/hf_torchao_vllm/inspect_llm_compressor_output.py
@@ -0,0 +1,26 @@
+# inspects the output of a model created with llm-compressor
+# via the `quantize_hf_model_with_llm_compressor.py` script
+
+import safetensors
+import json
+import fire
+
+def run(
+    dir_name: str = 'data/llmcompressor/opt-125m-FP8-Dynamic',
+):
+    json_config_name = f'{dir_name}/config.json'
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+        # pretty-print the config
+        print(json.dumps(data, indent=2))
+
+    # inspect the model, saved in safetensors format
+    model_name = f'{dir_name}/model.safetensors'
+    with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
+        print(f.metadata())
+        for k in f.keys():
+            t = f.get_tensor(k)
+            print(k, t.shape, t.dtype)
+
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/hf_torchao_vllm/inspect_torchao_output.py b/hf_torchao_vllm/inspect_torchao_output.py
new file mode 100644
index 0000000..edd4369
--- /dev/null
+++ b/hf_torchao_vllm/inspect_torchao_output.py
@@ -0,0 +1,31 @@
+# inspects the output of a model created with torchao
+# via the `quantize_hf_model_with_torchao.py` script
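+#
+# example usage, pointing at this script's default dir_name:
+#   python inspect_torchao_output.py --dir_name data/torchao/fp8-opt-125m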
+
+import json
+import torch
+import torchao # this is needed to run torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor])
+import fire
+
+# not sure why I still need this
+torch.serialization.add_safe_globals([getattr])
+
+def run(dir_name: str = 'data/torchao/fp8-opt-125m'):
+    json_config_name = f'{dir_name}/config.json'
+
+    # inspect the config
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+        # pretty-print the config
+        print(json.dumps(data, indent=2))
+
+    # inspect the data
+    model_name = f'{dir_name}/pytorch_model.bin'
+    state_dict = torch.load(model_name, weights_only=True)
+    for k, v in state_dict.items():
+        print(k, v.shape, type(v))
+
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
new file mode 100644
index 0000000..93da539
--- /dev/null
+++ b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
@@ -0,0 +1,48 @@
+# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
+
+import fire
+
+def run():
+
+    # MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+    MODEL_ID = "facebook/opt-125m"
+
+    # Load model.
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+    # Configure the quantization algorithm and scheme.
+    # In this case, we:
+    # * quantize the weights to fp8 with per-channel scales via PTQ
+    # * quantize the activations to fp8 with dynamic per-token scales
+    recipe = QuantizationModifier(
+        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+    )
+
+    # Apply quantization.
+    oneshot(model=model, recipe=recipe)
+
+    # Confirm generations of the quantized model look sane.
+    print("========== SAMPLE GENERATION ==============")
+    dispatch_for_generation(model)
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+        model.device
+    )
+    output = model.generate(input_ids, max_new_tokens=20)
+    print(tokenizer.decode(output[0]))
+    print("==========================================")
+
+    # Save to disk in compressed-tensors format.
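+    # e.g. MODEL_ID "facebook/opt-125m" is saved to "data/llmcompressor/opt-125m-FP8-Dynamic"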
+    SAVE_DIR = "data/llmcompressor/" + MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
+    model.save_pretrained(SAVE_DIR)
+    tokenizer.save_pretrained(SAVE_DIR)
+
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/hf_torchao_vllm/quantize_hf_model_with_torchao.py b/hf_torchao_vllm/quantize_hf_model_with_torchao.py
index f41f9a8..38114a9 100644
--- a/hf_torchao_vllm/quantize_hf_model_with_torchao.py
+++ b/hf_torchao_vllm/quantize_hf_model_with_torchao.py
@@ -252,7 +252,7 @@ def main(
     # Set default output directory based on model base name if not provided
     if output_dir is None:
         model_base_name = model_name.split("/")[-1]
-        output_dir = f"data/{quant_type}-{model_base_name}"
+        output_dir = f"data/torchao/{quant_type}-{model_base_name}"
 
     # Convert to args-like object for compatibility with the rest of the code
     args = Namespace(
diff --git a/hf_torchao_vllm/utils/inspect_llm_compressor_output.py b/hf_torchao_vllm/utils/inspect_llm_compressor_output.py
deleted file mode 100644
index 61863d3..0000000
--- a/hf_torchao_vllm/utils/inspect_llm_compressor_output.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# inspects the output of model created with llm-compressor
-# via the `run_llm_compressor.py` script
-
-import safetensors
-import json
-
-# inspect the config
-dir_name = 'opt-125m-FP8-Dynamic'
-json_config_name = f'{dir_name}/config.json'
-with open(json_config_name, 'r') as f:
-    data = json.load(f)
-    # TODO: pretty print
-    print(json.dumps(data, indent=2))
-
-# inpect the model, saved in safetensors format
-model_name = f'{dir_name}/model.safetensors'
-with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
-    print(f.metadata())
-    for k in f.keys():
-        t = f.get_tensor(k)
-        print(k, t.shape, t.dtype)
diff --git a/hf_torchao_vllm/utils/inspect_torchao_output.py b/hf_torchao_vllm/utils/inspect_torchao_output.py
deleted file mode 100644
index 457faaf..0000000
--- a/hf_torchao_vllm/utils/inspect_torchao_output.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# inspects the output of model created with torchao
-# via the `torchao_hf_script.py` script
-
-import json
-import torch
-import torchao # this is needed to run torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor])
-
-# not sure why I still need this
-torch.serialization.add_safe_globals([getattr])
-
-dir_name = 'data/fp8-opt-125m'
-json_config_name = f'{dir_name}/config.json'
-
-# inspect the config
-with open(json_config_name, 'r') as f:
-    data = json.load(f)
-    # TODO: pretty print
-    print(json.dumps(data, indent=2))
-
-# inspect the data
-model_name = f'{dir_name}/pytorch_model.bin'
-state_dict = torch.load(model_name, weights_only=True)
-for k, v in state_dict.items():
-    print(k, v.shape, type(v))
diff --git a/hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py
deleted file mode 100644
index 616e250..0000000
--- a/hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
-from llmcompressor.utils import dispatch_for_generation
-
-# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-MODEL_ID = "facebook/opt-125m"
-
-# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-# Configure the quantization algorithm and scheme.
-# In this case, we:
-# * quantize the weights to fp8 with per channel via ptq
-# * quantize the activations to fp8 with dynamic per token
-recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
-)
-
-# Apply quantization.
-oneshot(model=model, recipe=recipe)
-
-# Confirm generations of the quantized model look sane.
-print("========== SAMPLE GENERATION ==============")
-dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=20)
-print(tokenizer.decode(output[0]))
-print("==========================================")
-
-# Save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
-model.save_pretrained(SAVE_DIR)
-tokenizer.save_pretrained(SAVE_DIR)