From 848e553b6a4e8924b82aa69d943944bc712fa769 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Tue, 30 Sep 2025 13:09:41 -0700
Subject: [PATCH] add scripts for inspecting torchao and llm-compressor output

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .../inspect_llm_compressor_output.py      | 21 ++++++++++
 torchao_hf_vllm/inspect_torchao_output.py | 24 +++++++++++
 torchao_hf_vllm/run_llm_compressor.py     | 40 +++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 torchao_hf_vllm/inspect_llm_compressor_output.py
 create mode 100644 torchao_hf_vllm/inspect_torchao_output.py
 create mode 100644 torchao_hf_vllm/run_llm_compressor.py

diff --git a/torchao_hf_vllm/inspect_llm_compressor_output.py b/torchao_hf_vllm/inspect_llm_compressor_output.py
new file mode 100644
index 0000000..61863d3
--- /dev/null
+++ b/torchao_hf_vllm/inspect_llm_compressor_output.py
@@ -0,0 +1,21 @@
+# inspects the output of a model created with llm-compressor
+# via the `run_llm_compressor.py` script
+
+import safetensors
+import json
+
+# inspect the config
+dir_name = 'opt-125m-FP8-Dynamic'
+json_config_name = f'{dir_name}/config.json'
+with open(json_config_name, 'r') as f:
+    data = json.load(f)
+    # pretty-print the config
+    print(json.dumps(data, indent=2))
+
+# inspect the model, saved in safetensors format
+model_name = f'{dir_name}/model.safetensors'
+with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
+    print(f.metadata())
+    for k in f.keys():
+        t = f.get_tensor(k)
+        print(k, t.shape, t.dtype)
diff --git a/torchao_hf_vllm/inspect_torchao_output.py b/torchao_hf_vllm/inspect_torchao_output.py
new file mode 100644
index 0000000..457faaf
--- /dev/null
+++ b/torchao_hf_vllm/inspect_torchao_output.py
@@ -0,0 +1,24 @@
+# inspects the output of a model created with torchao
+# via the `torchao_hf_script.py` script
+
+import json
+import torch
+import torchao  # importing torchao runs torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor]), which torch.load below requires
+
+# torch.load(weights_only=True) on this checkpoint also requires getattr to be allowlisted
+torch.serialization.add_safe_globals([getattr])
+
+dir_name = 'data/fp8-opt-125m'
+json_config_name = f'{dir_name}/config.json'
+
+# inspect the config
+with open(json_config_name, 'r') as f:
+    data = json.load(f)
+    # pretty-print the config
+    print(json.dumps(data, indent=2))
+
+# inspect the weights, saved with torch.save
+model_name = f'{dir_name}/pytorch_model.bin'
+state_dict = torch.load(model_name, weights_only=True)
+for k, v in state_dict.items():
+    print(k, v.shape, type(v))
diff --git a/torchao_hf_vllm/run_llm_compressor.py b/torchao_hf_vllm/run_llm_compressor.py
new file mode 100644
index 0000000..616e250
--- /dev/null
+++ b/torchao_hf_vllm/run_llm_compressor.py
@@ -0,0 +1,40 @@
+# adapted from https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "facebook/opt-125m"
+
+# Load model.
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Configure the quantization algorithm and scheme.
+# In this case, we:
+# * quantize the weights to fp8 with per-channel scales via PTQ
+# * quantize the activations to fp8 with dynamic per-token scales
+recipe = QuantizationModifier(
+    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+)
+
+# Apply quantization.
+oneshot(model=model, recipe=recipe)
+
+# Confirm generations of the quantized model look sane.
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
+    model.device
+)
+output = model.generate(input_ids, max_new_tokens=20)
+print(tokenizer.decode(output[0]))
+print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR)
+tokenizer.save_pretrained(SAVE_DIR)
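
Note (not part of the patch): since run_llm_compressor.py saves a compressed-tensors
checkpoint, a minimal sketch of serving it back with vLLM could look like the
following, assuming a vLLM install that supports the compressed-tensors FP8 format
and that it is run from the directory containing the `opt-125m-FP8-Dynamic` output:

    from vllm import LLM, SamplingParams

    # point vLLM at the directory written by model.save_pretrained(SAVE_DIR) above
    llm = LLM(model="opt-125m-FP8-Dynamic")
    outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
    print(outputs[0].outputs[0].text)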
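
Similarly, inspect_torchao_output.py reads a checkpoint produced by
`torchao_hf_script.py`, which is not included in this patch. For context, a
hypothetical sketch of how a `data/fp8-opt-125m` checkpoint could be produced via
the transformers TorchAoConfig integration; the exact config class and save flags
are assumptions, not taken from the patch:

    import torch
    from transformers import AutoModelForCausalLM, TorchAoConfig
    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig

    # quantize linear weights and activations to fp8 while loading (assumed recipe)
    quant_config = TorchAoConfig(quant_type=Float8DynamicActivationFloat8WeightConfig())
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m", torch_dtype=torch.bfloat16, quantization_config=quant_config
    )
    # torchao tensor subclasses are saved with torch.save, hence pytorch_model.bin
    model.save_pretrained("data/fp8-opt-125m", safe_serialization=False)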