<a href="https://colab.research.google.com/github/zhouzhouwei/QuEST_test/blob/main/src/HadamardTesting_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from huggingface_hub import snapshot_download

PATH = "../QuEST-800M-INT4"
snapshot_download(repo_id="ISTA-DASLab/QuEST-800M-INT4", local_dir=PATH)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

summary.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

main.pt:   0%|          | 0.00/12.3G [00:00<?, ?B/s]

'/nfs/scistore19/alistgrp/apanfero/QuEST/QuEST-800M-INT4'

In [None]:
import json

from optim.utils import load_checkpoint
from models.utils import get_model


class DotDict(dict):
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(f"'DotDict' object has no attribute '{key}'")

    def __setattr__(self, key, value):
        self[key] = value


with open(f"{PATH}/summary.json", "r") as f:
    config = json.load(f)


In [10]:
import numpy as np
from scipy.linalg import hadamard

def hadamard_transform_cpu(x):
    """
    纯 CPU 实现 Hadamard 变换（仅支持 2 的幂次维度）
    x: 形状为 [..., n] 的张量，n 必须是 2 的幂
    """
    n = x.shape[-1]
    assert (n & (n - 1)) == 0, f"维度 {n} 不是 2 的幂，Hadamard 矩阵仅支持 2^k 维度"

    H = hadamard(n, dtype=np.float32) / np.sqrt(n)  # 归一化
    return x @ H.T

In [None]:
import torch


In [13]:
import numpy as np

X = np.random.rand(32, 64)
print("矩阵 X 的形状:", X.shape)
xhat = hadamard_transform_cpu(X)
print("变换后的矩阵 xhat 的形状:", xhat.shape)

矩阵 X 的形状: (32, 64)
变换后的矩阵 xhat 的形状: (32, 64)


In [17]:
import torch
import numpy as np

X = np.random.rand(32, 64)
print("矩阵 X 的形状:", X.shape)
xhat = hadamard_transform_cpu(X)
print("变换后的矩阵 xhat 的形状:", xhat.shape)
# Convert numpy array xhat to torch.Tensor
x_had_torch = torch.from_numpy(xhat)

# Perform the calculation
std = torch.sqrt(torch.mean(x_had_torch**2, dim=-1, keepdim=True)) + 1e-8

print("x_had_torch 的形状:", x_had_torch.shape)

OPTIMAL_GAUSSIAN_SCALES = {
    1: 0.7978845587140913,
    1.585: 1.2240089519030855,
    2: 1.4935346200015913,
    3: 2.051068354131873,
    4: 2.513930578568423,
    5: 2.9160938834961225,
    6: 3.276597282593217,
    7: 3.6010497188221655,
    8: 3.884938678807525,
}

scale = OPTIMAL_GAUSSIAN_SCALES[4] * std
n_levels = 2 ** 4
step = 2 * scale / (n_levels - 1)
x_clip = torch.clamp(x_had_torch, -scale, scale)
xq = torch.round((x_clip + scale) / step)
xq_dequant = xq * step - scale
print(f"xq dequant: {xq_dequant}")

矩阵 X 的形状: (32, 64)
变换后的矩阵 xhat 的形状: (32, 64)
x_had_torch 的形状: torch.Size([32, 64])
xq dequant: tensor([[ 1.4540, -0.0969, -0.4847,  ...,  0.4847, -0.0969, -0.2908],
        [ 1.4976,  0.4992, -0.4992,  ..., -0.2995,  0.0998,  0.0998],
        [ 1.3419, -0.6262,  0.2684,  ..., -0.0895, -0.0895,  0.0895],
        ...,
        [ 1.4858,  0.2972,  0.4953,  ...,  0.0991, -0.2972,  0.2972],
        [ 1.4431,  0.0962,  0.0962,  ...,  0.6734, -0.2886,  0.0962],
        [ 1.3060, -0.0871,  0.2612,  ...,  0.7836, -0.0871, -0.0871]],
       dtype=torch.float64)


In [18]:
import torch
import numpy as np

# Ensure X is a torch tensor for comparison with xq_dequant
X_torch = torch.from_numpy(X)

# Calculate Mean Squared Error (MSE)
mse = torch.mean((X_torch - xq_dequant)**2)

# Calculate Mean Absolute Error (MAE)
mae = torch.mean(torch.abs(X_torch - xq_dequant))

print(f"MSE 误差: {mse.item()}")
print(f"MAE 误差: {mae.item()}")

MSE 误差: 0.44150285651567595
MAE 误差: 0.5553469577408101


In [None]:
import torch
from torch import nn
import torch.nn.functional as F

from fast_hadamard_transform import hadamard_transform

from models.quantization.base_linear import OPTIMAL_GAUSSIAN_SCALES, HadamardTrustQuantizer, HalfHadamardTrustQuantizer


def quantize_pack_hadamard_dense(x: torch.Tensor, quantizer: HadamardTrustQuantizer):
    assert quantizer.centered
    x_had = hadamard_transform(x.reshape(-1, 128), scale=2 ** (-7/2)).reshape(x.shape)

    std = torch.sqrt(torch.mean(x_had**2, dim=-1, keepdim=True)) + 1e-8
    scale = OPTIMAL_GAUSSIAN_SCALES[quantizer.bits] * std

    step = 2 * scale / (quantizer.n_levels - 1)
    x_clip = torch.clamp(x_had, -scale, scale)
    xq = torch.round((x_clip + scale) / step)

    assert xq.min() >= 0 and xq.max() < quantizer.n_levels
    return xq, scale, step
    # ^ note: xq is in rotated space!

def dequantize_dense(xq, scale, step):
    return xq * step - scale

weight = torch.rand(2, 128).cuda()
quantizer = HadamardTrustQuantizer(bits=4)
ref = quantizer(weight)
xq, scale, step = quantize_pack_hadamard_dense(weight, quantizer)
deq = dequantize_dense(xq, scale, step)

torch.testing.assert_close(hadamard_transform(ref.reshape(-1, 128), scale=2 ** (-7/2)).reshape(ref.shape), deq, rtol=1e-3, atol=1e-3)

In [None]:
from models.quantization.base_linear import QuantizedLinear

class Linear4bit(nn.Module):
    def __init__(self, quantizer_linear):
        super().__init__()

        assert isinstance(quantizer_linear.weight_quantizer, HadamardTrustQuantizer)
        assert isinstance(quantizer_linear.activation_quantizer, HadamardTrustQuantizer)

        self.activation_quantizer = quantizer_linear.activation_quantizer

        wq = dequantize_dense(*quantize_pack_hadamard_dense(quantizer_linear.weight, quantizer_linear.weight_quantizer))
        self.register_buffer("wq", wq)
        self.bias = quantizer_linear.bias

    def forward(self, x):
        x = dequantize_dense(*quantize_pack_hadamard_dense(x, self.activation_quantizer))
        return F.linear(x, self.wq, self.bias)


def replace_linears(model):
    for name, module in model.named_children():
        if isinstance(module, QuantizedLinear):
            model._modules[name] = Linear4bit(module)
        else:
            replace_linears(module)
    return model

In [None]:
class PseudoDdp(nn.Module):
    def __init__(self, model):
        super().__init__()
        self._orig_mod = nn.ModuleDict({
            "module": model,
        })

class PseudoLoader:
    def load_state_dict(self, *args, **kwargs):
        pass

model = PseudoDdp(get_model(DotDict(config['args'])))
model.load_state_dict(torch.load(f"{PATH}/main.pt"))
model = model.cuda()
model = model._orig_mod["module"]
model = replace_linears(model)

  model.load_state_dict(torch.load(f"{PATH}/main.pt"))


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

In [None]:
def generate_text_greedily(model, tokenizer, prompt, max_length=50, device='cuda'):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    for _ in range(max_length):
        with torch.no_grad():
            outputs = model(input_ids, get_logits=True)
            logits = outputs['logits'][:, -1, :]

        next_token_id = torch.argmax(logits, dim=-1).unsqueeze(-1)
        input_ids = torch.cat([input_ids, next_token_id], dim=-1)

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

generated_text = generate_text_greedily(model, tokenizer, "Hi!", max_length=20)
print(generated_text)


Hi! Sign in to let us know how The Coffee House was?
by jennifer1


In [None]:
numel = 0
for name, param in model.named_buffers():
    numel += param.numel()
    # print(name, param.numel())

print(numel/1e6)

822.083584
