1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
 __pycache__/
 hf_torchao_vllm/data
+hf_torchao_vllm/sparse_logs
2 changes: 1 addition & 1 deletion hf_torchao_vllm/README.md
@@ -7,5 +7,5 @@ Example
 python quantize_hf_model_with_torchao.py --model_name "Qwen/Qwen1.5-MoE-A2.7B" --experts_only_qwen_1_5_moe_a_2_7b True --save_model_to_disk True --quant_type nvfp4
 
 # run the model from above in vLLM
-python run_quantized_model_in_vllm.py --model_name "data/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
+python run_quantized_model_in_vllm.py --model_name "data/torchao/nvfp4-Qwen1.5-MoE-A2.7B" --compile False
 ```
26 changes: 26 additions & 0 deletions hf_torchao_vllm/inspect_llm_compressor_output.py
@@ -0,0 +1,26 @@
# inspects the output of a model created with llm-compressor
# via the `quantize_hf_model_with_llm_compressor.py` script

import safetensors
import json
import fire

def run(
    dir_name: str = 'data/llmcompressor/opt-125m-FP8-Dynamic',
):
    json_config_name = f'{dir_name}/config.json'
    with open(json_config_name, 'r') as f:
        data = json.load(f)
    # pretty-print the config
    print(json.dumps(data, indent=2))

    # inspect the model, saved in safetensors format
    model_name = f'{dir_name}/model.safetensors'
    with safetensors.safe_open(model_name, framework='pt', device='cpu') as f:
        print(f.metadata())
        for k in f.keys():
            t = f.get_tensor(k)
            print(k, t.shape, t.dtype)

if __name__ == '__main__':
    fire.Fire(run)
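Side note (not part of the diff): when scanning a compressed-tensors checkpoint like the one above, it can help to filter the safetensors walk down to just the quantization parameters. A minimal sketch, assuming llm-compressor's FP8 checkpoints store scales under a `weight_scale` key suffix (an assumption about its naming convention):

```python
import safetensors

def print_quant_params(model_path: str) -> None:
    # walk the checkpoint and keep only the quantization scale tensors;
    # the `weight_scale` suffix is assumed from llm-compressor's naming
    with safetensors.safe_open(model_path, framework='pt', device='cpu') as f:
        for k in f.keys():
            if k.endswith('weight_scale'):
                t = f.get_tensor(k)
                print(k, t.shape, t.dtype)

print_quant_params('data/llmcompressor/opt-125m-FP8-Dynamic/model.safetensors')
```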
28 changes: 28 additions & 0 deletions hf_torchao_vllm/inspect_torchao_output.py
@@ -0,0 +1,28 @@
# inspects the output of a model created with torchao
# via the `quantize_hf_model_with_torchao.py` script

import json
import torch
import torchao  # importing torchao registers torchao.quantization.Float8Tensor as a safe global via torch.serialization.add_safe_globals
import fire

# not sure why I still need this
torch.serialization.add_safe_globals([getattr])

def run(dir_name: str = 'data/torchao/fp8-opt-125m'):
    json_config_name = f'{dir_name}/config.json'

    # inspect the config
    with open(json_config_name, 'r') as f:
        data = json.load(f)
    # pretty-print the config
    print(json.dumps(data, indent=2))

    # inspect the data
    model_name = f'{dir_name}/pytorch_model.bin'
    state_dict = torch.load(model_name, weights_only=True)
    for k, v in state_dict.items():
        print(k, v.shape, type(v))

if __name__ == '__main__':
    fire.Fire(run)
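Side note (not part of the diff): `torch.load(..., weights_only=True)` only reconstructs types that have been allowlisted, which is why the script needs both the `torchao` import and the extra `getattr` registration. A minimal sketch of the same allowlisting done explicitly, with the checkpoint path assumed from the script's default:

```python
import torch
import torchao

# with weights_only=True, torch.load refuses to rebuild arbitrary objects;
# tensor subclasses such as torchao.quantization.Float8Tensor (and the
# getattr calls used while rebuilding them) must be registered first
torch.serialization.add_safe_globals(
    [torchao.quantization.Float8Tensor, getattr]
)
state_dict = torch.load(
    'data/torchao/fp8-opt-125m/pytorch_model.bin', weights_only=True
)
```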
47 changes: 47 additions & 0 deletions hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
@@ -0,0 +1,47 @@
# based on https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_w8a8_fp8/llama3_example.py

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

import fire

def run():

    # MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
    MODEL_ID = "facebook/opt-125m"

    # Load model.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Configure the quantization algorithm and scheme.
    # In this case, we:
    # * quantize the weights to fp8 with per-channel scales via PTQ
    # * quantize the activations to fp8 with dynamic per-token scales
    recipe = QuantizationModifier(
        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
    )

    # Apply quantization.
    oneshot(model=model, recipe=recipe)

    # Confirm generations of the quantized model look sane.
    print("========== SAMPLE GENERATION ==============")
    dispatch_for_generation(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
        model.device
    )
    output = model.generate(input_ids, max_new_tokens=20)
    print(tokenizer.decode(output[0]))
    print("==========================================")

    # Save to disk in compressed-tensors format.
    SAVE_DIR = "data/llmcompressor/" + MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)

if __name__ == '__main__':
    fire.Fire(run)
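Side note (not part of the diff): once saved, the compressed-tensors checkpoint can be loaded back for a smoke test. A minimal sketch using vLLM's Python API, assuming vLLM infers the FP8 compressed-tensors scheme from the checkpoint's config (this mirrors what `run_quantized_model_in_vllm.py` does for the torchao models):

```python
from vllm import LLM, SamplingParams

# load the FP8 compressed-tensors checkpoint saved by the script above
llm = LLM(model="data/llmcompressor/opt-125m-FP8-Dynamic")
outputs = llm.generate(
    ["Hello my name is"], SamplingParams(max_tokens=20)
)
print(outputs[0].outputs[0].text)
```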
2 changes: 1 addition & 1 deletion hf_torchao_vllm/quantize_hf_model_with_torchao.py
@@ -252,7 +252,7 @@ def main(
     # Set default output directory based on model base name if not provided
     if output_dir is None:
         model_base_name = model_name.split("/")[-1]
-        output_dir = f"data/{quant_type}-{model_base_name}"
+        output_dir = f"data/torchao/{quant_type}-{model_base_name}"
 
     # Convert to args-like object for compatibility with the rest of the code
     args = Namespace(
21 changes: 0 additions & 21 deletions hf_torchao_vllm/utils/inspect_llm_compressor_output.py

This file was deleted.

24 changes: 0 additions & 24 deletions hf_torchao_vllm/utils/inspect_torchao_output.py

This file was deleted.

40 changes: 0 additions & 40 deletions hf_torchao_vllm/utils/quantize_hf_model_with_llm_compressor.py

This file was deleted.