21 changes: 21 additions & 0 deletions torchao_hf_vllm/inspect_llm_compressor_output.py
@@ -0,0 +1,21 @@
# inspects the output of a model created with llm-compressor
# via the `run_llm_compressor.py` script

import safetensors
import json

# inspect the config
dir_name = 'opt-125m-FP8-Dynamic'
json_config_name = f'{dir_name}/config.json'
with open(json_config_name, 'r') as f:
    data = json.load(f)
print(json.dumps(data, indent=2))

# inspect the model, saved in safetensors format
model_name = f'{dir_name}/model.safetensors'
with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
    print(f.metadata())
    for k in f.keys():
        t = f.get_tensor(k)
        print(k, t.shape, t.dtype)
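To sanity-check actual values rather than just shapes, one option is to dequantize a single weight. A minimal sketch, assuming compressed-tensors naming where each quantized Linear stores a float8 `weight` next to a per-channel `weight_scale`; the layer prefix below is hypothetical, so confirm it against `f.keys()` first:

import torch
import safetensors

dir_name = 'opt-125m-FP8-Dynamic'
with safetensors.safe_open(f'{dir_name}/model.safetensors', framework='pt', device='cpu') as f:
    prefix = 'model.decoder.layers.0.self_attn.q_proj'  # hypothetical key, check f.keys()
    w = f.get_tensor(f'{prefix}.weight')            # float8_e4m3fn, [out, in]
    scale = f.get_tensor(f'{prefix}.weight_scale')  # per-output-channel scale
    w_fp32 = w.to(torch.float32) * scale.to(torch.float32)
    print(w_fp32.abs().max(), w_fp32.mean())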
24 changes: 24 additions & 0 deletions torchao_hf_vllm/inspect_torchao_output.py
@@ -0,0 +1,24 @@
# inspects the output of a model created with torchao
# via the `torchao_hf_script.py` script

import json
import torch
import torchao  # importing torchao runs torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor])

# not sure why I still need this
torch.serialization.add_safe_globals([getattr])

dir_name = 'data/fp8-opt-125m'
json_config_name = f'{dir_name}/config.json'

# inspect the config
with open(json_config_name, 'r') as f:
    data = json.load(f)
print(json.dumps(data, indent=2))

# inspect the data
model_name = f'{dir_name}/pytorch_model.bin'
state_dict = torch.load(model_name, weights_only=True)
for k, v in state_dict.items():
    print(k, v.shape, type(v))
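If the global allowlist feels too broad, PyTorch also offers a scoped variant. A minimal sketch, assuming a PyTorch version that ships the torch.serialization.safe_globals context manager (why `getattr` ends up in the allowlist is unclear, per the note in the script above):

import torch
import torchao

# allowlist only for the duration of this load
with torch.serialization.safe_globals([torchao.quantization.Float8Tensor, getattr]):
    state_dict = torch.load('data/fp8-opt-125m/pytorch_model.bin', weights_only=True)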
40 changes: 40 additions & 0 deletions torchao_hf_vllm/run_llm_compressor.py
@@ -0,0 +1,40 @@
# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID = "facebook/opt-125m"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per-channel scales via PTQ
# * quantize the activations to fp8 with dynamic per-token scales
recipe = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
)
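# Rough illustration only (not llm-compressor internals): per-channel FP8
# weight quantization computes one scale per output channel, e.g.
#   scale[i] = weight[i, :].abs().amax() / 448.0   # 448 = max finite e4m3 value
#   weight_q[i] = (weight[i] / scale[i]).to(torch.float8_e4m3fn)
# and dynamic per-token activation scales are computed analogously at runtime.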

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
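A natural next step, not included in this diff, is loading the saved checkpoint for inference. A minimal sketch, assuming a vLLM install with compressed-tensors support:

from vllm import LLM, SamplingParams

# point vLLM at the directory written by save_pretrained above
llm = LLM(model="opt-125m-FP8-Dynamic")
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)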