In [1]:
import os

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Root directory that holds the local model checkpoints.
# It defaults to the original location on the rented machine, but can be
# redirected with the AWQ_MODEL_ROOT environment variable so the notebook also
# runs on hosts with a different disk layout.
# os.path.join(..., "") guarantees a trailing separator, which the later cells
# rely on because they build paths as `model_path + <name>`.
model_path = os.path.join(
    os.environ.get("AWQ_MODEL_ROOT") or "/root/autodl-tmp/models/", ""
)
# Sub-path (below model_path) of the base model that is going to be quantized.
model_id = "facebook/opt-6.7b"

In [2]:
# The weights and the tokenizer are both read from the same local checkpoint directory.
checkpoint_dir = model_path + model_id

# Load the base (not yet quantized) model through AutoAWQ with device_map="cuda".
model = AutoAWQForCausalLM.from_pretrained(
    checkpoint_dir,
    device_map="cuda",
)
# Load the matching tokenizer; it is handed to model.quantize() in a later cell.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_dir,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from datasets import load_dataset

# Prepare the calibration dataset.
# This machine is rented from a cloud provider in mainland China, where the default
# calibration set (mit-han-lab/pile-val-backup) is very hard to download, so a very
# small dataset picked from Hugging Face is read from a local parquet file and
# pre-processed here instead.
dataset = load_dataset(
    "parquet",
    data_files={"train": "/root/autodl-tmp/datasets/nampdn-ai/mini-en/data.parquet"},
)

# Cut every text down to at most 512 in length to shorten the quantization run.
myDatasets = [row["text"][:512] for row in dataset["train"]]

In [4]:
# Quantization settings; passed to model.quantize() as `quant_config`.
quant_config = dict(
    zero_point=True,   # enable zero-point quantization
    q_group_size=128,  # quantization group size
    w_bit=4,           # bit width used for the weights
    version="GEMM",    # AWQ version string
)

In [5]:
# Start the quantization run.
# Calibrate on our own dataset, and only on its first 10000 items, to keep the run time down.
calibration_texts = myDatasets[:10000]
model.quantize(
    tokenizer,
    quant_config=quant_config,
    calib_data=calibration_texts,
)

AWQ: 100%|██████████| 32/32 [14:48<00:00, 27.78s/it]


量化结束后显卡的信息如下：

```
Mon Jan  1 00:37:32 2024       （新年快乐~）
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla V100S-PCI...  On   | 00000000:00:0B.0 Off |                  Off |
| N/A   30C    P0    36W / 250W |   1062MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
+-----------------------------------------------------------------------------+
```

In [None]:
# Save the model weights.
# The instructor's code could not save the model: it failed with an error saying that
# `model` has no `save_pretrained` method.
# Reading the AutoAWQ source code turned up the `save_quantized` method, which saves
# successfully. (As an aside: AutoAWQ's introduction on GitHub has not been updated either...)

# from transformers import AwqConfig

# # Adjust the config so that it is compatible with the transformers integration
# quantization_config = AwqConfig(
#     bits=quant_config["w_bit"],
#     group_size=quant_config["q_group_size"],
#     zero_point=quant_config["zero_point"],
#     version=quant_config["version"].lower(),
# ).to_dict()

# # The pretrained transformers model is stored in the `model` attribute; a dict has to be passed
# model.model.config.quantization_config = quantization_config

new_model_id = "my-opt6.7b-AWQ"
# The tokenizer files and the quantized weights go into the same output directory.
output_dir = model_path + new_model_id

tokenizer.save_pretrained(output_dir)
model.save_quantized(output_dir)