diff --git a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
index 473c49f..df76a22 100644
--- a/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
+++ b/hf_torchao_vllm/quantize_hf_model_with_llm_compressor.py
@@ -84,7 +84,20 @@ def tokenize(sample):
     # * quantize the weights to fp4 with per group 16 via ptq
     # * calibrate a global_scale for activations, which will be used to
     #   quantize activations to fp4 on the fly
-    recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])
+    recipe = QuantizationModifier(
+        targets="Linear",
+        scheme="NVFP4",
+        ignore=[
+            "lm_head",
+            # for Qwen MoE, but ok to just hardcode here for now
+            # https://github.com/vllm-project/llm-compressor/blob/33ef5f497a9801893764c6a2c880cb1f560067fa/examples/quantizing_moe/qwen_example.py#L10
+            "re:.*mlp.gate$",
+            "re:.*mlp.shared_expert_gate$",
+            # also skip attention and shared expert, to focus on MoE for now
+            "re:.*self_attn.*",
+            "re:.*shared_expert.*",
+        ],
+    )
 
     # Apply quantization.
     oneshot(
diff --git a/hf_torchao_vllm/quantize_hf_model_with_torchao.py b/hf_torchao_vllm/quantize_hf_model_with_torchao.py
index a6c6546..a699424 100644
--- a/hf_torchao_vllm/quantize_hf_model_with_torchao.py
+++ b/hf_torchao_vllm/quantize_hf_model_with_torchao.py
@@ -121,14 +121,21 @@ def get_quantization_config(args):
         expert_fqn_to_config = {}
         # TODO(future PR): this is annoying, I should be able to use a regex here
         for layer_idx in range(24):
-            for expert_idx in range(60):
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj"] = single_config
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.q_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.k_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.v_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.o_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.gate_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.up_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.down_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
+            expert_fqn_to_config[f"lm_head"] = None
         module_fqn_to_config = ModuleFqnToConfig({
-            "_default": None,
+            "_default": single_config,
             **expert_fqn_to_config,
         })
+
         return TorchAoConfig(
             quant_type=module_fqn_to_config,
         )
@@ -162,12 +169,18 @@ def get_quantization_config(args):
         expert_fqn_to_config = {}
         # TODO(future PR): this is annoying, I should be able to use a regex here
         for layer_idx in range(24):
-            for expert_idx in range(60):
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj"] = single_config
-                expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj"] = single_config
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.q_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.k_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.v_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.self_attn.o_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.gate"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.gate_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.up_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert.down_proj"] = None
+            expert_fqn_to_config[f"model.layers.{layer_idx}.mlp.shared_expert_gate"] = None
+            expert_fqn_to_config[f"lm_head"] = None
         module_fqn_to_config = ModuleFqnToConfig({
-            "_default": None,
+            "_default": single_config,
             **expert_fqn_to_config,
         })
         return TorchAoConfig(