In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from awq import AutoAWQForCausalLM

# 检查 PyTorch 和 CUDA
print(f"PyTorch 版本: {torch.__version__}")
print(f"CUDA 可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA 版本: {torch.version.cuda}")
    print(f"GPU 设备: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA 不可用，将使用 CPU")

# 检查 transformers
try:
    model_name = "facebook/opt-1.3b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    print("transformers 库验证成功")
except Exception as e:
    print(f"transformers 库验证失败: {str(e)}")

# 检查 AWQ
try:
    # 使用一个小型的 AWQ 量化模型进行测试
    model_name_or_path = "TheBloke/Llama-2-7B-Chat-AWQ"
    model = AutoAWQForCausalLM.from_quantized(
        model_name_or_path,
        fuse_layers=True,
        trust_remote_code=False,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
    print("AWQ 库验证成功")
except Exception as e:
    print(f"AWQ 库验证失败: {str(e)}")

# 简单推理测试
if torch.cuda.is_available() and 'model' in locals():
    try:
        prompt = "Hello, world!"
        inputs = tokenizer(prompt, return_tensors="pt").to(0)
        outputs = model.generate(**inputs, max_new_tokens=10)
        print("推理测试结果:", tokenizer.decode(outputs[0], skip_special_tokens=True))
    except Exception as e:
        print(f"推理测试失败: {str(e)}")

ImportError: /root/anaconda3/envs/AliOpenAPI/lib/python3.11/site-packages/awq_inference_engine.cpython-311-x86_64-linux-gnu.so: undefined symbol: _ZN3c104cuda9SetDeviceEi

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

# 模型名称，可替换为"facebook/opt-6.7b"
model_name_or_path = "facebook/opt-1.3b"

# 配置GPTQ量化参数
quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="wikitext2",
    desc_act=False,
)

# 量化模型
print("开始量化模型...")
quant_model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=quantization_config,
    device_map='auto',
    torch_dtype=torch.float16
)

# 检查量化正确性
print("\n检查量化是否成功：")
first_layer = quant_model.model.decoder.layers[0].self_attn.q_proj
print(f"量化权重(qweight)是否存在: {'qweight' in first_layer.__dict__}")
print(f"量化零点(qzeros)数据类型: {first_layer.qzeros.dtype if 'qzeros' in first_layer.__dict__ else '未量化'}")

# 保存量化模型
save_path = "opt-1.3b-gptq-4bit"
quant_model.save_pretrained(save_path)
print(f"\n量化模型已保存至: {save_path}")

# 加载分词器
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 使用默认数据集量化的模型进行测试（加入指定文本）
text = "Merry Christmas! I'm glad to"
inputs = tokenizer(text, return_tensors="pt").to(0)

out = quant_model.generate(** inputs, max_new_tokens=64)
print(f"\n默认数据集量化模型生成结果:")
print(tokenizer.decode(out[0], skip_special_tokens=True))

# 自定义数据集量化演示
custom_dataset = [
    "Quantization helps reduce model size significantly.",
    "GPTQ is an efficient post-training quantization method for large language models."
]

custom_quant_config = GPTQConfig(
    bits=4,
    group_size=128,
    desc_act=False,
    dataset=custom_dataset
)

print("\n使用自定义数据集进行量化...")
custom_quant_model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=custom_quant_config,
    device_map="auto",
    torch_dtype=torch.float16
)

# 使用自定义数据集量化的模型进行测试（加入指定文本）
text = "Merry Christmas! I'm glad to"
inputs = tokenizer(text, return_tensors="pt").to(0)

out = custom_quant_model.generate(**inputs, max_new_tokens=64)
print(f"\n自定义数据集量化模型生成结果:")
print(tokenizer.decode(out[0], skip_special_tokens=True))

开始量化模型...


  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.



[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


Detected gptqmodel and auto-gptq, will use gptqmodel


pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Detected gptqmodel and auto-gptq, will use gptqmodel


model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Quantizing model.decoder.layers blocks :   0%|          | 0/24 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 1/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 1/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 2/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 2/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 3/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 3/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 4/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 4/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 5/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 5/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 6/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 6/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 7/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 7/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 8/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 8/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 9/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 9/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 10/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 10/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 11/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 11/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 12/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 12/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 13/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 13/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 14/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 14/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 15/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 15/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 16/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 16/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 17/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 17/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 18/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 18/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 19/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 19/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 20/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 20/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 21/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 21/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 22/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 22/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 23/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 23/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 24/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 24/24...
INFO:optimum.gptq.quantizer:Packing model...


[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`   


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.out_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.q_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.v_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.fc1
INFO:optimum.gptq.quantizer:model.decoder.layers.0.fc2
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.out_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.q_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.v_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.fc1
INFO:optimum.gptq.quantizer:model.decoder.layers.1.fc2
INFO:optimum.gptq.quantizer:model.decoder.layers.2.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.2.self_att

[32mINFO[0m  Optimize: `TritonV2QuantLinear` compilation triggered.                   

检查量化是否成功：
量化权重(qweight)是否存在: False
量化零点(qzeros)数据类型: 未量化

量化模型已保存至: opt-1.3b-gptq-4bit

默认数据集量化模型生成结果:
Merry Christmas! I'm glad to see you're still around.
I'm still around, just not on reddit.

使用自定义数据集进行量化...


Detected gptqmodel and auto-gptq, will use gptqmodel
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Detected gptqmodel and auto-gptq, will use gptqmodel


Quantizing model.decoder.layers blocks :   0%|          | 0/24 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 1/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 1/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 1/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 2/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 2/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 2/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 3/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 3/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 3/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 4/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 4/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 4/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 5/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 5/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 5/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 6/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 6/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 6/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 7/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 7/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 7/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 8/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 8/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 8/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 9/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 9/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 9/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 10/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 10/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 10/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 11/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 11/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 11/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 12/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 12/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 12/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 13/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 13/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 13/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 14/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 14/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 14/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 15/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 15/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 15/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 16/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 16/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 16/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 17/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 17/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 17/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 18/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 18/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 18/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 19/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 19/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 19/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 20/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 20/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 20/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 21/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 21/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 21/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 22/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 22/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 22/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 23/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 23/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 23/24...
INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 24/24
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]


Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 24/24...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 24/24...
INFO:optimum.gptq.quantizer:Packing model...


[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`   


INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.out_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.q_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.v_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.fc1
INFO:optimum.gptq.quantizer:model.decoder.layers.0.fc2
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.out_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.q_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.v_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.fc1
INFO:optimum.gptq.quantizer:model.decoder.layers.1.fc2
INFO:optimum.gptq.quantizer:model.decoder.layers.2.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.2.self_attn.out_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.2.self_attn.q_proj
INFO:optimum.gptq.quantize


自定义数据集量化模型生成结果:
Merry Christmas! I'm glad to the

































































In [2]:
quant_model.model.decoder.layers[0].self_attn.q_proj.__dict__

{'training': True,
 '_parameters': {},
 '_buffers': {'qweight': tensor([[-1968072024, -1771280518,  1789289879,  ...,  1215579015,
            2053435976, -1753460124],
          [  967351451, -1785370714, -1778418564,  ...,  -878163915,
            -110706971,  2105174792],
          [  844485784, -1236563623,  -882554554,  ..., -1999525490,
           -1734957961,  2020046922],
          ...,
          [ 1821035160, -1280943212, -1672110441,  ..., -1186514519,
            1723308183, -1263167625],
          [-1516574247, -1471756296, -1236731976,  ..., -2004788788,
            1468696729,  1719643110],
          [ -942189433, -1148750501, -1505130619,  ..., -1702438745,
             -60205431,  -912684712]], device='cuda:0', dtype=torch.int32),
  'qzeros': tensor([[-2004318072, -2004318072, -2004318072,  ..., -2004318072,
           -2004318072, -2004318072],
          [-2004318072, -2004318072, -2004318072,  ..., -2004318072,
           -2004318072, -2004318072],
          [-2004318

In [3]:
text = "Merry Christmas! I'm glad to"
inputs = tokenizer(text, return_tensors="pt").to(0)

out = custom_quant_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))

Merry Christmas! I'm glad to the































































