From 376b96172b812beac0906e930ee940260628f196 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Fri, 3 Oct 2025 07:02:33 -0700
Subject: [PATCH] refactor hf scripts

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .gitignore                                   |  1 +
 hf_torchao_vllm/README.md                    |  2 +-
 .../inspect_llm_compressor_output.py         | 26 ++++++++++
 hf_torchao_vllm/inspect_torchao_output.py    | 31 +++++++++++++
 .../quantize_hf_model_with_llm_compressor.py | 48 ++++++++++++++++++++
 .../quantize_hf_model_with_torchao.py        |  2 +-
 .../utils/inspect_llm_compressor_output.py   | 21 ---------
 .../utils/inspect_torchao_output.py          | 24 ----------
 .../quantize_hf_model_with_llm_compressor.py | 40 ----------------
 9 files changed, 108 insertions(+), 87 deletions(-)
 create mode 100644 hf_torchao_vllm/inspect_llm_compressor_output.py
 create mode 100644 hf_torchao_vllm/inspect_torchao_output.py
 create mode 100644 hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
 delete mode 100644 hf_torchao_vllm/utils/inspect_llm_compressor_output.py
 delete mode 100644 hf_torchao_vllm/utils/inspect_torchao_output.py
 delete mode 100644 hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py

diff --git a/.gitignore b/.gitignore
index c024a81..5203c5c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__/
 hf_torchao_vllm/data
+hf_torchao_vllm/sparse_logs
diff --git a/hf_torchao_vllm/README.md b/hf_torchao_vllm/README.md
index fcfef92..7e51ad1 100644
--- a/hf_torchao_vllm/README.md
+++ b/hf_torchao_vllm/README.md
@@ -7,5 +7,5 @@ Example
 python quantize_hf_model_with_torchao.py --model_name "Qwen/Qwen1.5-MoE-A2.7B" --experts_only_qwen_1_5_moe_a_2_7b True --save_model_to_disk True --quant_type nvfp4
 
 # run the model from above in vLLM
-python run_quantized_model_in_vllm.py --model_name "data/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
+python run_quantized_model_in_vllm.py --model_name "data/torchao/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
diff --git a/hf_torchao_vllm/inspect_llm_compressor_output.py b/hf_torchao_vllm/inspect_llm_compressor_output.py
new file mode 100644
index 0000000..825abc6
--- /dev/null
+++ b/hf_torchao_vllm/inspect_llm_compressor_output.py
@@ -0,0 +1,26 @@
+# inspects the output of a model created with llm-compressor
+# via the `quantize_hf_model_with_llm_compressor.py` script
+
+import safetensors
+import json
+import fire
+
+def run(
+    dir_name: str = 'data/llmcompressor/opt-125m-FP8-Dynamic',
+):
+    json_config_name = f'{dir_name}/config.json'
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+        # pretty-print the config
+        print(json.dumps(data, indent=2))
+
+    # inspect the model, saved in safetensors format
+    model_name = f'{dir_name}/model.safetensors'
+    with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
+        print(f.metadata())
+        for k in f.keys():
+            t = f.get_tensor(k)
+            print(k, t.shape, t.dtype)
+
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/hf_torchao_vllm/inspect_torchao_output.py b/hf_torchao_vllm/inspect_torchao_output.py
new file mode 100644
index 0000000..edd4369
--- /dev/null
+++ b/hf_torchao_vllm/inspect_torchao_output.py
@@ -0,0 +1,31 @@
+# inspects the output of a model created with torchao
+# via the `quantize_hf_model_with_torchao.py` script
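+#
+# example usage, pointing at this script's default dir_name:
+#   python inspect_torchao_output.py --dir_name data/torchao/fp8-opt-125m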
+
+import json
+import torch
+import torchao # this is needed to run torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor])
+import fire
+
+# not sure why I still need this
+torch.serialization.add_safe_globals([getattr])
+
+def run(dir_name: str = 'data/torchao/fp8-opt-125m'):
+    json_config_name = f'{dir_name}/config.json'
+
+    # inspect the config
+    with open(json_config_name, 'r') as f:
+        data = json.load(f)
+        # pretty-print the config
+        print(json.dumps(data, indent=2))
+
+    # inspect the data
+    model_name = f'{dir_name}/pytorch_model.bin'
+    state_dict = torch.load(model_name, weights_only=True)
+    for k, v in state_dict.items():
+        print(k, v.shape, type(v))
+
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
new file mode 100644
index 0000000..93da539
--- /dev/null
+++ b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
@@ -0,0 +1,48 @@
+# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
+
+import fire
+
+def run():
+
+    # MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+    MODEL_ID = "facebook/opt-125m"
+
+    # Load model.
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+    # Configure the quantization algorithm and scheme.
+    # In this case, we:
+    # * quantize the weights to fp8 with per-channel scales via PTQ
+    # * quantize the activations to fp8 with dynamic per-token scales
+    recipe = QuantizationModifier(
+        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+    )
+
+    # Apply quantization.
+    oneshot(model=model, recipe=recipe)
+
+    # Confirm generations of the quantized model look sane.
+    print("========== SAMPLE GENERATION ==============")
+    dispatch_for_generation(model)
+    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+        model.device
+    )
+    output = model.generate(input_ids, max_new_tokens=20)
+    print(tokenizer.decode(output[0]))
+    print("==========================================")
+
+    # Save to disk in compressed-tensors format.
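+    # e.g. MODEL_ID "facebook/opt-125m" is saved to "data/llmcompressor/opt-125m-FP8-Dynamic"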
+    SAVE_DIR = "data/llmcompressor/" + MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
+    model.save_pretrained(SAVE_DIR)
+    tokenizer.save_pretrained(SAVE_DIR)
+
+if __name__ == '__main__':
+    fire.Fire(run)
diff --git a/hf_torchao_vllm/quantize_hf_model_with_torchao.py b/hf_torchao_vllm/quantize_hf_model_with_torchao.py
index f41f9a8..38114a9 100644
--- a/hf_torchao_vllm/quantize_hf_model_with_torchao.py
+++ b/hf_torchao_vllm/quantize_hf_model_with_torchao.py
@@ -252,7 +252,7 @@ def main(
     # Set default output directory based on model base name if not provided
     if output_dir is None:
         model_base_name = model_name.split("/")[-1]
-        output_dir = f"data/{quant_type}-{model_base_name}"
+        output_dir = f"data/torchao/{quant_type}-{model_base_name}"
 
     # Convert to args-like object for compatibility with the rest of the code
     args = Namespace(
diff --git a/hf_torchao_vllm/utils/inspect_llm_compressor_output.py b/hf_torchao_vllm/utils/inspect_llm_compressor_output.py
deleted file mode 100644
index 61863d3..0000000
--- a/hf_torchao_vllm/utils/inspect_llm_compressor_output.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# inspects the output of model created with llm-compressor
-# via the `run_llm_compressor.py` script
-
-import safetensors
-import json
-
-# inspect the config
-dir_name = 'opt-125m-FP8-Dynamic'
-json_config_name = f'{dir_name}/config.json'
-with open(json_config_name, 'r') as f:
-    data = json.load(f)
-    # TODO: pretty print
-    print(json.dumps(data, indent=2))
-
-# inpect the model, saved in safetensors format
-model_name = f'{dir_name}/model.safetensors'
-with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
-    print(f.metadata())
-    for k in f.keys():
-        t = f.get_tensor(k)
-        print(k, t.shape, t.dtype)
diff --git a/hf_torchao_vllm/utils/inspect_torchao_output.py b/hf_torchao_vllm/utils/inspect_torchao_output.py
deleted file mode 100644
index 457faaf..0000000
--- a/hf_torchao_vllm/utils/inspect_torchao_output.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# inspects the output of model created with torchao
-# via the `torchao_hf_script.py` script
-
-import json
-import torch
-import torchao # this is needed to run torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor])
-
-# not sure why I still need this
-torch.serialization.add_safe_globals([getattr])
-
-dir_name = 'data/fp8-opt-125m'
-json_config_name = f'{dir_name}/config.json'
-
-# inspect the config
-with open(json_config_name, 'r') as f:
-    data = json.load(f)
-    # TODO: pretty print
-    print(json.dumps(data, indent=2))
-
-# inspect the data
-model_name = f'{dir_name}/pytorch_model.bin'
-state_dict = torch.load(model_name, weights_only=True)
-for k, v in state_dict.items():
-    print(k, v.shape, type(v))
diff --git a/hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py
deleted file mode 100644
index 616e250..0000000
--- a/hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
-from llmcompressor.utils import dispatch_for_generation
-
-# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
-MODEL_ID = "facebook/opt-125m"
-
-# Load model.
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-# Configure the quantization algorithm and scheme.
-# In this case, we:
-# * quantize the weights to fp8 with per channel via ptq
-# * quantize the activations to fp8 with dynamic per token
-recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
-)
-
-# Apply quantization.
-oneshot(model=model, recipe=recipe)
-
-# Confirm generations of the quantized model look sane.
-print("========== SAMPLE GENERATION ==============")
-dispatch_for_generation(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=20)
-print(tokenizer.decode(output[0]))
-print("==========================================")
-
-# Save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
-model.save_pretrained(SAVE_DIR)
-tokenizer.save_pretrained(SAVE_DIR)