# Model Quantization 模型量化 



## 1. AWQ 量化

https://qwen.readthedocs.io/en/latest/quantization/awq.html

In [None]:
!git clone https://github.com/casper-hansen/AutoAWQ.git
!cd AutoAWQ
!pip install -e .

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Where the original checkpoint is read from and where the AWQ result is written.
model_path = "your_model_path"
quant_path = "your_quantized_model_path"

# Quantization settings; a later cell hands this dict to model.quantize().
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Create the tokenizer and the AutoAWQ model wrapper from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    safetensors=True,
)


# 截至 2024 年 7 月 18 日，AutoAWQ 尚不支持量化 Qwen2 MoE 模型。

In [None]:
# Prepare the calibration data: a plain list in which every entry is one text
# sample. The fine-tuning data is reused for calibration here, so each
# conversation is first rendered to text with the ChatML chat template.
# NOTE(review): `dataset` is not defined in any visible cell — presumably an
# iterable of chat-message lists; define it before running this cell.
data = [
    tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False).strip()
    for msg in dataset
]

In [None]:
# Run AWQ quantization, calibrating on the text samples collected in `data`.
model.quantize(
    tokenizer,
    quant_config=quant_config,
    calib_data=data,
)

In [None]:
# Write the quantized model to `quant_path` (safetensors, shard size "4GB"),
# then save the tokenizer files into the same directory.
model.save_quantized(
    quant_path,
    safetensors=True,
    shard_size="4GB",
)
tokenizer.save_pretrained(quant_path)

## 2. GPTQ 量化

https://qwen.readthedocs.io/en/latest/quantization/gptq.html

In [None]:
!git clone https://github.com/AutoGPTQ/AutoGPTQ
!cd AutoGPTQ
!pip install -e .

In [None]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

# Source checkpoint and destination directory for the quantized model.
model_path = "your_model_path"
quant_path = "your_quantized_model_path"

# Length limit used by the calibration-data cell when slicing token ids.
max_len = 8192

# GPTQ hyperparameters.
quantize_config = BaseQuantizeConfig(
    bits=8,  # 4 or 8
    group_size=128,
    damp_percent=0.01,
    # desc_act=False can significantly speed up inference, at the cost of
    # slightly worse perplexity.
    desc_act=False,
    static_groups=False,
    sym=True,
    true_sequential=True,
    model_name_or_path=None,
    model_file_base_name="model",
)

# Create the tokenizer and the AutoGPTQ model wrapper.
# For loading the model across multiple GPUs, see
# https://github.com/AutoGPTQ/AutoGPTQ/blob/main/docs/tutorial/02-Advanced-Model-Loading-and-Best-Practice.md
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoGPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config,
)

In [None]:
# Load the model with an explicit per-device memory budget passed as `max_memory`.
# This rebinds `model`, replacing whatever an earlier cell assigned to it.
# NOTE(review): keys 0-3 are presumably GPU indices, each limited to "20GB" —
# adjust the count and limit to the hardware actually available.
model = AutoGPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config,
    max_memory=dict.fromkeys(range(4), "20GB"),
)

In [None]:
import torch

# Build the GPTQ calibration examples: one dict per conversation holding the
# token ids and a matching attention mask, both shaped (1, seq_len).
# NOTE(review): `dataset` is not defined in any visible cell — presumably an
# iterable of chat-message lists; define it before running this cell.
data = []
for msg in dataset:
    text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
    model_inputs = tokenizer([text])
    # The tokenizer was called with a one-element batch, so `input_ids` is a
    # list containing a single token-id list. Truncate that sequence itself;
    # slicing the outer list with [:max_len] would only cut the batch axis and
    # leave over-long samples untouched.
    token_ids = model_inputs.input_ids[0][:max_len]
    input_ids = torch.tensor([token_ids], dtype=torch.int)
    if tokenizer.pad_token_id is None:
        # No pad token configured: nothing can be padding, so attend to everything.
        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
    else:
        attention_mask = input_ids.ne(tokenizer.pad_token_id)
    data.append(dict(input_ids=input_ids, attention_mask=attention_mask))

In [None]:
import logging

# Configure root logging at INFO level with timestamped records before
# starting quantization.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Quantize using the calibration examples prepared in `data`.
# cache_examples_on_gpu=False presumably keeps the cached examples off the GPU
# to save device memory — confirm against the AutoGPTQ documentation.
model.quantize(data, cache_examples_on_gpu=False)

In [None]:
# Write the quantized model to `quant_path` in safetensors format, then save
# the tokenizer files into the same directory.
model.save_quantized(
    quant_path,
    use_safetensors=True,
)
tokenizer.save_pretrained(quant_path)

## 3. GGUF 格式

https://qwen.readthedocs.io/en/latest/quantization/gguf.html


为了更好地保留模型质量，建议先对模型进行 AWQ 量化，再将其转换为 GGUF 格式。


In [None]:
!git clone git@github.com:ggerganov/llama.cpp.git
cd llama.cpp

In [None]:
!python convert-hf-to-gguf.py Qwen/Qwen2-7B-Instruct --outfile models/7B/qwen2-7b-instruct-fp16.gguf

In [None]:
# Quantize the fp16 GGUF file into a q4_0 GGUF file (input path, output path, type).
# NOTE(review): assumes a `llama-quantize` binary already exists in the current
# directory; no build step is visible in this notebook — confirm it was built.
!./llama-quantize models/7B/qwen2-7b-instruct-fp16.gguf models/7B/qwen2-7b-instruct-q4_0.gguf q4_0

# 可将上面命令中的 q4_0 替换为其他量化类型，例如：q2_k, q3_k_m, q4_0, q4_k_m, q5_0, q5_k_m, q6_k, and q8_0