21 changes: 21 additions & 0 deletions torchao_hf_vllm/inspect_llm_compressor_output.py
@@ -0,0 +1,21 @@
# inspects the output of a model created with llm-compressor
# via the `run_llm_compressor.py` script

import safetensors
import json

# inspect the config
dir_name = 'opt-125m-FP8-Dynamic'
json_config_name = f'{dir_name}/config.json'
with open(json_config_name, 'r') as f:
    data = json.load(f)
print(json.dumps(data, indent=2))

# inspect the model, saved in safetensors format
model_name = f'{dir_name}/model.safetensors'
with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
    print(f.metadata())
    for k in f.keys():
        t = f.get_tensor(k)
        print(k, t.shape, t.dtype)
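To sanity-check actual values rather than just shapes, one option is to dequantize a single weight. A minimal sketch, assuming compressed-tensors naming where each quantized Linear stores a float8 `weight` next to a per-channel `weight_scale`; the layer prefix below is hypothetical, so confirm it against `f.keys()` first:

import torch
import safetensors

dir_name = 'opt-125m-FP8-Dynamic'
with safetensors.safe_open(f'{dir_name}/model.safetensors', framework='pt', device='cpu') as f:
    prefix = 'model.decoder.layers.0.self_attn.q_proj'  # hypothetical key, check f.keys()
    w = f.get_tensor(f'{prefix}.weight')            # float8_e4m3fn, [out, in]
    scale = f.get_tensor(f'{prefix}.weight_scale')  # per-output-channel scale
    w_fp32 = w.to(torch.float32) * scale.to(torch.float32)
    print(w_fp32.abs().max(), w_fp32.mean())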
24 changes: 24 additions & 0 deletions torchao_hf_vllm/inspect_torchao_output.py
@@ -0,0 +1,24 @@
# inspects the output of a model created with torchao
# via the `torchao_hf_script.py` script

import json
import torch
import torchao  # importing torchao runs torch.serialization.add_safe_globals([torchao.quantization.Float8Tensor])

# not sure why I still need this
torch.serialization.add_safe_globals([getattr])

dir_name = 'data/fp8-opt-125m'
json_config_name = f'{dir_name}/config.json'

# inspect the config
with open(json_config_name, 'r') as f:
    data = json.load(f)
print(json.dumps(data, indent=2))

# inspect the data
model_name = f'{dir_name}/pytorch_model.bin'
state_dict = torch.load(model_name, weights_only=True)
for k, v in state_dict.items():
    print(k, v.shape, type(v))
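If the global allowlist feels too broad, PyTorch also offers a scoped variant. A minimal sketch, assuming a PyTorch version that ships the torch.serialization.safe_globals context manager (why `getattr` ends up in the allowlist is unclear, per the note in the script above):

import torch
import torchao

# allowlist only for the duration of this load
with torch.serialization.safe_globals([torchao.quantization.Float8Tensor, getattr]):
    state_dict = torch.load('data/fp8-opt-125m/pytorch_model.bin', weights_only=True)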
40 changes: 40 additions & 0 deletions torchao_hf_vllm/run_llm_compressor.py
@@ -0,0 +1,40 @@
# https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID = "facebook/opt-125m"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per-channel scales via PTQ
# * quantize the activations to fp8 with dynamic per-token scales
recipe = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
)
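# Rough illustration only (not llm-compressor internals): per-channel FP8
# weight quantization computes one scale per output channel, e.g.
#   scale[i] = weight[i, :].abs().amax() / 448.0   # 448 = max finite e4m3 value
#   weight_q[i] = (weight[i] / scale[i]).to(torch.float8_e4m3fn)
# and dynamic per-token activation scales are computed analogously at runtime.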

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
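A natural next step, not included in this diff, is loading the saved checkpoint for inference. A minimal sketch, assuming a vLLM install with compressed-tensors support:

from vllm import LLM, SamplingParams

# point vLLM at the directory written by save_pretrained above
llm = LLM(model="opt-125m-FP8-Dynamic")
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)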