In [2]:
from hqq.core.quantize import *
# Two HQQ quantization presets, both with a group size of 64.
# NOTE(review): the variable names do not match the configured bit widths
# (q4_config is 8-bit, q3_config is 2-bit) — confirm which is intended.
q4_config    = BaseQuantizeConfig(nbits=8, group_size=64) 
q3_config    = BaseQuantizeConfig(nbits=2, group_size=64)

# Disabled alternative: a mapping covering every attention and MLP projection.
# quant_config = {'self_attn.q_proj':q4_config,
#   'self_attn.k_proj':q4_config,
#   'self_attn.v_proj':q4_config,
#   'self_attn.o_proj':q4_config,

#   'mlp.gate_proj':q3_config,
#   'mlp.up_proj'  :q3_config,
#   'mlp.down_proj':q3_config,
# }
# Active mapping: only 'mlp.up_proj' is listed, and it uses q3_config.
quant_config = {
  'mlp.up_proj'  :q3_config,
}

In [None]:
import torch
import os
import sys
# Make the local llama3-8b directory importable; presumably this is where the
# custom `modeling_llama` module imported below lives — TODO confirm.
sys.path.append('/mnt/storage/zyx/llama3-8b')
# Expose only GPU index 5 to this process via CUDA_VISIBLE_DEVICES.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
from transformers import AutoTokenizer, BitsAndBytesConfig
from modeling_llama import LlamaForCausalLM

# Load the pretrained Llama-3-8B model and its tokenizer from the path below.
model_name = "/mnt/storage/zyx/Meta-Llama-3-8B"


llm = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    use_cache=True,
    torch_dtype=torch.float32,
    # attn_implementation="flash_attention_2"
) 
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:


# Quantize `llm` with HQQ using the `quant_config` mapping, float32 compute
# dtype and target device 'cuda'. The return value is not used, so the model
# is presumably modified in place — TODO confirm against the HQQ docs.
from hqq.models.hf.base import AutoHQQHFModel
AutoHQQHFModel.quantize_model(llm, quant_config=quant_config, compute_dtype=torch.float32, device='cuda')

In [None]:

# tokenizer.padding_side = "right"

from datasets import load_dataset
def preprocess_data(batch):
    """Tokenize a batch of texts and attach causal-LM labels.

    Args:
        batch: Mapping with a 'text' entry holding the raw strings to tokenize.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens, returned as
        PyTorch tensors) with an added 'labels' entry. Labels are a copy of
        'input_ids' in which every padded position (attention_mask == 0) is
        replaced by -100.
    """
    # Convert the raw text into model inputs with the notebook-level tokenizer.
    inputs = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    labels = inputs["input_ids"].clone()
    # Padding must not contribute to the loss. The mask is used instead of the
    # pad token id because the pad token is the EOS token, and genuine EOS
    # tokens should still be scored. -100 is the default ignore index of
    # PyTorch's cross-entropy loss; this assumes the local modeling_llama
    # keeps that convention — TODO confirm.
    labels[inputs["attention_mask"] == 0] = -100
    inputs["labels"] = labels
    return inputs

# Keep only the model-input features and drop everything else
def select_features(example):
    """Return a new dict holding only the model-input fields of ``example``.

    The result contains 'input_ids', 'attention_mask' and 'labels', in that
    order; any other entry of ``example`` is dropped. A missing field raises
    KeyError.
    """
    wanted_fields = ('input_ids', 'attention_mask', 'labels')
    return {field: example[field] for field in wanted_fields}

# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Load the dataset from a local directory (the original note calls it the
# C4 validation set — not verifiable from this path alone).
c4 = load_dataset('/home/lz/workspace/llama2-7b/HQQ/notebooks/draft')
# Tokenize the dataset in batches.
c4_dataset = c4.map(preprocess_data, batched=True)
# c4_dataset = c4_dataset.map(select_features, batched=True)
# Return only the model-input columns, formatted as torch tensors.
c4_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# c4_dataset
# NOTE(review): despite the variable name, only the first 10 examples of the
# 'validation' split are selected here.
top_four_thousand_data = c4_dataset['validation'].select(range(10))

import numpy as np

def set_seed(seed):
    """Seed the global RNGs of Python's ``random``, NumPy and PyTorch.

    Args:
        seed: Integer seed applied to all three generators.
    """
    # Local import: the notebook does not import `random` at cell level.
    import random

    # Seed the stdlib generator as well, so code that relies on `random`
    # is reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

from torch.utils.data import DataLoader
from tqdm import tqdm
set_seed(42)

# Data loader over the evaluation subset.
batch_size = 4
# dataloader = DataLoader(c4_dataset['validation'], batch_size=batch_size)
dataloader = DataLoader(top_four_thousand_data, batch_size=batch_size)

# Running sum of per-batch losses and the number of batches seen.
total_loss = 0.0
num_batches = 0

# Put the model in evaluation mode so training-only behaviour (e.g. dropout)
# cannot perturb the measured loss.
llm.eval()

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(llm.device)
        attention_mask = batch['attention_mask'].to(llm.device)
        labels = batch['labels'].to(llm.device)

        outputs = llm(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1
        if num_batches % 100 == 0:
            print(f"[{num_batches}], Eval Loss: {total_loss / (num_batches)}")

# Mean of the per-batch losses; NaN (instead of ZeroDivisionError) when the
# loader produced no batches.
eval_loss = total_loss / num_batches if num_batches else float("nan")
print(f"Eval Loss: {eval_loss}")

In [None]:
def profle_svdllm(name, model, calib_loader, dev):
    """Profile the inputs of every ``up_proj`` module and factor their Gram matrix.

    For each module whose qualified name contains "up_proj", a forward hook
    accumulates S = sum over calibration batches of X^T X, where X is the
    module's first positional input (cast to float32). S is then
    eigendecomposed as Q diag(L) Q^T and Q' = Q diag(sqrt(L)) is stored, with
    eigenvalues sorted in descending order, so that Q' Q'^T reproduces S
    (plus the diagonal shift, when one had to be applied).

    Args:
        name: Model family label. It does not influence the computation and
            is kept only so existing call sites keep working.
        model: ``torch.nn.Module`` to profile; it is moved to ``dev``. The
            accumulated matrix is left on each profiled module as
            ``raw_scaling_diag_matrix``.
        calib_loader: Iterable of batches, each a mapping with an
            'input_ids' tensor that is fed to ``model``.
        dev: Device used for the model and for the decompositions.

    Returns:
        dict mapping each profiled module name to a dict
        ``{module_name: Q_prime}`` with ``Q_prime`` a float64 CPU tensor.
        NOTE(review): the previous code stored one shared dict (containing
        every module) under every key; each entry now owns its own dict, so
        ``result[n][n]`` is unchanged while the aliasing is gone.

    Raises:
        ValueError: if a profiled module saw no calibration input, if an
            accumulated matrix is not symmetric, or if its eigenvalues
            contain NaN/Inf.
    """
    model.to(dev)
    print("Start obtaining the whitening matrix...")

    def hook(module, inputs, output):
        inp = inputs[0].detach().float()
        if inp.dim() == 2:   # un-batched activations (e.g. OPT): add a batch axis
            inp = inp.unsqueeze(0)
        adds = torch.matmul(inp.transpose(1, 2), inp)
        adds_sum = torch.sum(adds, dim=0)
        module.raw_scaling_diag_matrix += adds_sum
        del inp, adds, adds_sum
        torch.cuda.empty_cache()

    # Resolve the target modules once, under a name that does not shadow the
    # `name` argument.
    targets = [
        (module_name, module)
        for module_name, module in model.named_modules()
        if "up_proj" in module_name
    ]

    # Send inputs to the device of the model being profiled rather than to a
    # notebook-level global.
    model_device = getattr(model, "device", None)
    if model_device is None:
        model_device = next(model.parameters()).device

    handles = []
    for _, module in targets:
        module.raw_scaling_diag_matrix = 0
        handles.append(module.register_forward_hook(hook))
    try:
        # Calibration only needs activations; skipping autograd avoids
        # building a graph for every batch.
        with torch.no_grad():
            for batch in tqdm(calib_loader):
                model(batch['input_ids'].to(model_device))
    finally:
        # Remove only the hooks registered here, even if a forward pass fails,
        # leaving any hooks installed by other code untouched.
        for handle in handles:
            handle.remove()
    torch.cuda.empty_cache()

    profiling_mat = {}
    # The message is kept as-is; the factorization below is an
    # eigendecomposition.
    print("Start Cholesky Decomposition...")

    for module_name, module in targets:
        stats = module.raw_scaling_diag_matrix
        if not torch.is_tensor(stats):
            raise ValueError(f"No calibration activations were recorded for {module_name}.")
        covariance = stats.double().to(dev)
        if not torch.allclose(covariance, covariance.t(), atol=1e-6):
            raise ValueError("Covariance matrix is not symmetric.")

        # Eigendecomposition of the symmetric matrix (upper triangle is read).
        Lambda, Q = torch.linalg.eigh(covariance, UPLO='U')
        if torch.isnan(Lambda).any() or torch.isinf(Lambda).any():
            raise ValueError("Lambda contains NaN or Inf values.")

        if (Lambda < 0).any():
            # Despite the wording of the message, the spectrum is shifted:
            # a multiple of the identity is added so the smallest eigenvalue
            # becomes about 2e-6, and the matrix is decomposed again.
            print("Lambda contains negative values. Clamping to zero.")
            shift = -Lambda.min() + 2e-6
            identity = torch.eye(
                covariance.shape[0], dtype=covariance.dtype, device=covariance.device
            )
            covariance = covariance + shift * identity
            Lambda, Q = torch.linalg.eigh(covariance, UPLO='U')
            print(f"Lambda min: {Lambda.min().item()}, Lambda max: {Lambda.max().item()}")

        # Sort eigenvalues and eigenvectors in descending order.
        order = torch.argsort(Lambda, descending=True)
        Lambda = Lambda[order]
        Q = Q[:, order]

        # Q' = Q * sqrt(Lambda): scale column j of Q by sqrt(Lambda[j]).
        # The clamp guards sqrt against round-off negatives.
        Q_prime = Q * torch.sqrt(Lambda.clamp_min(0))
        profiling_mat[module_name] = {module_name: Q_prime.cpu()}
    return profiling_mat
# Keep the profiling result in a variable so later cells can use it; as a bare
# last expression the notebook would only echo its (very large) repr.
profiling_mat = profle_svdllm("llama", llm, dataloader, "cuda")