In [2]:
!git clone https://github.com/vlonekika/Llama_Quant_Int4_Triton.git

Cloning into 'Llama_Quant_Int4_Triton'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 39 (delta 11), reused 38 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (39/39), 7.24 KiB | 3.62 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [4]:
!pip install -q transformers accelerate bitsandbytes torch datasets

In [5]:
import os
os.chdir("/content/Llama_Quant_Int4_Triton")

In [6]:
import time
import torch
import pandas as pd

from datasets import load_dataset
from research_tools.quant import quantize_int4
from research_tools.matmul import matmul_int4
from research_tools.utils import quantize_model, model_memory, perplexity, compute_speed
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# comparing Pytorch quantize vs triton quantize

In [7]:
# PyTorch quantize
def quantize_int4_torch_fast(w):
    w = w.contiguous()
    n_rows, n_cols = w.shape

    min_val = w.min(dim=1, keepdim=True).values
    max_val = w.max(dim=1, keepdim=True).values
    scale = ((max_val - min_val) / 15.0).clamp(min=1e-6)
    zero_point = ((-min_val / scale).round()).clamp(0, 15)

    scales = scale.squeeze(1).to(torch.float16)
    zeros = zero_point.squeeze(1).to(torch.float16)

    q = ((w / scale + zero_point).round()).clamp(0, 15).to(torch.int32)

    q_padded = torch.zeros((n_rows, (n_cols + 7) // 8 * 8), dtype=torch.int32, device=w.device)
    q_padded[:, :n_cols] = q
    shifts = torch.arange(8, device=w.device, dtype=torch.int32) * 4
    q_packed = (q_padded.view(n_rows, -1, 8) << shifts).sum(dim=2)

    return q_packed, scales, zeros

def matmul_int4_torch_fast(x, w_packed, w_scales, w_zeros):
    M, K = x.shape
    N = w_packed.shape[0]

    w_rows = torch.zeros((N, K), device=x.device, dtype=torch.float32)
    for i in range(8):
        mask = i < (K // 8 * 8)
        vals = (w_packed[:, :(K // 8)] >> (i * 4)) & 0xF
        w_rows[:, i::8] = (vals.float() - w_zeros[:, None]) * w_scales[:, None]

    output = (x.float() @ w_rows.T).to(torch.bfloat16)
    return output

#Benchmark quantize
def benchmark_quantize(w):
    results = []

    block_sizes = [32, 64, 128, 256, 512]
    for BLOCK_SIZE in block_sizes:
        torch.cuda.synchronize()
        start = time.time()
        quantize_int4(w, BLOCK_SIZE=BLOCK_SIZE)
        torch.cuda.synchronize()
        elapsed = (time.time() - start) * 1000
        results.append({"method": "triton", "BLOCK_SIZE": BLOCK_SIZE, "time_ms": elapsed})

    #PyTorch
    torch.cuda.synchronize()
    start = time.time()
    quantize_int4_torch_fast(w)
    torch.cuda.synchronize()
    elapsed = (time.time() - start) * 1000
    results.append({"method": "torch", "BLOCK_SIZE": None, "time_ms": elapsed})

    df = pd.DataFrame(results)
    return df, df.loc[df['time_ms'].idxmin()]

#Benchmark matmul
def benchmark_matmul(x, w_packed, w_scales, w_zeros):
    results = []

    configs = [
        {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16},
        {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 16},
        {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32},
        {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
        {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32},
        {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64},
        {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64},
        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64},
        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128},
        {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128},
    ]

    #Triton
    for cfg in configs:
        torch.cuda.synchronize()
        start = time.time()
        matmul_int4(
            x, w_packed, w_scales, w_zeros,
            BLOCK_M=cfg["BLOCK_M"],
            BLOCK_N=cfg["BLOCK_N"],
            BLOCK_K=cfg["BLOCK_K"]
        )
        torch.cuda.synchronize()
        elapsed = (time.time() - start) * 1000
        results.append({**cfg, "method": "triton", "time_ms": elapsed})

    #PyTorch
    torch.cuda.synchronize()
    start = time.time()
    matmul_int4_torch_fast(x, w_packed, w_scales, w_zeros)
    torch.cuda.synchronize()
    elapsed = (time.time() - start) * 1000
    results.append({"BLOCK_M": None, "BLOCK_N": None, "BLOCK_K": None, "method": "torch", "time_ms": elapsed})

    df = pd.DataFrame(results)
    return df, df.loc[df['time_ms'].idxmin()]

device = "cuda"
hidden_size = 1024
batch_size = 512
w = torch.randn(hidden_size, hidden_size, device=device)
x = torch.randn(batch_size, hidden_size, device=device)
packed, scales, zeros = quantize_int4(w)

df_quantize, best_quantize = benchmark_quantize(w)
df_matmul, best_matmul = benchmark_matmul(x, packed, scales, zeros)

print("Quantize timings:")
print(df_quantize)
print("Best quantize config:")
print(best_quantize)

print("\nMatmul timings:")
print(df_matmul)
print("Best matmul config:")
print(best_matmul)

df_quantize.to_excel("df_quantize.xlsx", index=False)
df_matmul.to_excel("df_matmul.xlsx", index=False)

Quantize timings:
   method  BLOCK_SIZE     time_ms
0  triton        32.0    3.177881
1  triton        64.0    0.679255
2  triton       128.0    2.689838
3  triton       256.0    2.497435
4  triton       512.0    2.451181
5   torch         NaN  177.739859
Best quantize config:
method          triton
BLOCK_SIZE        64.0
time_ms       0.679255
Name: 1, dtype: object

Matmul timings:
    BLOCK_M  BLOCK_N  BLOCK_K  method    time_ms
0      32.0     32.0     16.0  triton  20.141602
1      32.0     64.0     16.0  triton   7.694244
2      32.0     64.0     32.0  triton   7.466555
3      64.0     64.0     32.0  triton   6.075144
4      64.0    128.0     32.0  triton   6.243229
5      64.0    128.0     64.0  triton   5.468130
6     128.0     64.0     64.0  triton   5.099058
7     128.0    128.0     64.0  triton   4.697323
8     128.0    128.0    128.0  triton   4.979134
9     256.0    128.0    128.0  triton  17.190456
10      NaN      NaN      NaN   torch  66.637039
Best matmul config:
BLOCK

# (X16@W4^T) vs (X16@W16^T)

In [8]:
def benchmark_triton_vs_vanilla(batch_sizes, hidden_size=2048, n_iters=10):

    block_configs = [
        {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16},
        {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 16},
        {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32},
        {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
        {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32},
        {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64},
        {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64},
        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64},
        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128},
        {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128},
    ]

    results = []

    for batch in batch_sizes:

        x = torch.randn(batch, hidden_size, device="cuda", dtype=torch.bfloat16)
        w = torch.randn(hidden_size, hidden_size, device="cuda")

        w_packed, w_scales, w_zeros = quantize_int4(w)

        for cfg in block_configs:
            BLOCK_M, BLOCK_N, BLOCK_K = cfg["BLOCK_M"], cfg["BLOCK_N"], cfg["BLOCK_K"]

            matmul_int4(x, w_packed, w_scales, w_zeros, BLOCK_M, BLOCK_N, BLOCK_K)

            torch.cuda.synchronize()
            t0 = time.time()
            for _ in range(n_iters):
                matmul_int4(x, w_packed, w_scales, w_zeros, BLOCK_M, BLOCK_N, BLOCK_K)
            torch.cuda.synchronize()
            elapsed = (time.time() - t0) * 1000 / n_iters

            results.append({
                "batch": batch,
                "method": "Triton X@W4^T",
                "BLOCK_M": BLOCK_M,
                "BLOCK_N": BLOCK_N,
                "BLOCK_K": BLOCK_K,
                "time_ms": elapsed
            })

        W16_T = w.t().contiguous()
        torch.cuda.synchronize()
        t1 = time.time()
        for _ in range(n_iters):
            out16 = (x.float() @ W16_T.float()).to(torch.bfloat16)
        torch.cuda.synchronize()
        elapsed16 = (time.time() - t1) * 1000 / n_iters

        results.append({
            "batch": batch,
            "method": "Pytorch X16@W16^T",
            "BLOCK_M": None,
            "BLOCK_N": None,
            "BLOCK_K": None,
            "time_ms": elapsed16
        })

    df = pd.DataFrame(results)
    return df

token_cnt = [128, 512, 2048]
hidden_size = 2048
df = benchmark_triton_vs_vanilla(token_cnt, hidden_size=hidden_size, n_iters=5)

print("Results")
print(df)

df.to_excel("matmul_quant_vs_vanila.xlsx", index=False)

Results
    batch             method  BLOCK_M  BLOCK_N  BLOCK_K     time_ms
0     128      Triton X@W4^T     32.0     32.0     16.0    7.605314
1     128      Triton X@W4^T     32.0     64.0     16.0    5.153370
2     128      Triton X@W4^T     32.0     64.0     32.0    4.768324
3     128      Triton X@W4^T     64.0     64.0     32.0    3.366280
4     128      Triton X@W4^T     64.0    128.0     32.0    3.164387
5     128      Triton X@W4^T     64.0    128.0     64.0    3.157234
6     128      Triton X@W4^T    128.0     64.0     64.0    2.455187
7     128      Triton X@W4^T    128.0    128.0     64.0    4.667568
8     128      Triton X@W4^T    128.0    128.0    128.0    4.669571
9     128      Triton X@W4^T    256.0    128.0    128.0   12.510014
10    128  Pytorch X16@W16^T      NaN      NaN      NaN    0.297260
11    512      Triton X@W4^T     32.0     32.0     16.0   12.401915
12    512      Triton X@W4^T     32.0     64.0     16.0    7.595062
13    512      Triton X@W4^T     32.0   

# wiki

In [9]:
MODEL_NAME = 'unsloth/Llama-3.2-1B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

model_fp16 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto'
)

fp16_size = model_memory(model_fp16)
print("FP16 model")
fp16_ppl = perplexity(model_fp16, tokenizer, dataset, samples=256)
fp16_speed = compute_speed(model_fp16, tokenizer, dataset, batch_size=16, samples=256)

print(f"Perplexity: {fp16_ppl:.2f}, memory size: {fp16_size:.2f} MB")

del model_fp16
torch.cuda.empty_cache()

model_quant = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto'
)

model_int4 = quantize_model(model_quant)

int4_size = model_memory(model_int4)
print("INT4 quantized model")
int4_ppl = perplexity(model_int4, tokenizer, dataset, samples=256)
int4_speed = compute_speed(model_int4, tokenizer, dataset, batch_size=16, samples=256)

print(f"Perplexity: {int4_ppl:.2f}, memory size: {int4_size:.2f} MB")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


FP16 model


100%|██████████| 256/256 [00:06<00:00, 41.89it/s]


len: 128 Batch 16: avg time 155.45ms
len: 512 Batch 16: avg time 355.38ms
len: 2048 Batch 16: avg time 382.21ms
Perplexity: 28.03, memory size: 4714.26 MB
INT4 quantized model


100%|██████████| 256/256 [01:03<00:00,  4.04it/s]


len: 128 Batch 16: avg time 3172.17ms
len: 512 Batch 16: avg time 7442.40ms
len: 2048 Batch 16: avg time 7446.27ms
Perplexity: 45.23, memory size: 1002.26 MB
