# Notebook – GPTQ_Compress.ipynb
Convert a Hugging Face model to 4‑bit GPTQ for vLLM

## Install dependencies
```bash
pip install auto-gptq transformers accelerate
```

In [None]:
import json
import os
import random
from pathlib import Path

# BaseQuantizeConfig is AutoGPTQ's own config class; GPTQConfig belongs to
# transformers and is not exported by auto_gptq.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

# Model to quantize; override through the BASE_MODEL environment variable.
BASE_MODEL = os.getenv("BASE_MODEL", "mistral-7b-instruct-v0.3")
OUT_DIR = Path(BASE_MODEL + "-gptq")
# parents=True: a BASE_MODEL containing "/" (e.g. "org/name") yields a nested
# output path whose parent directory may not exist yet.
OUT_DIR.mkdir(parents=True, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Build tiny calibration set (replace with domain snippets for best results)
calib_sentences = ["Hello world!"] * 128
# AutoGPTQ calibrates on tokenized examples (input_ids / attention_mask),
# not on raw strings.
calib_examples = [tokenizer(sentence) for sentence in calib_sentences]

# Quantization hyper-parameters only; the calibration data is passed to
# quantize() below rather than stored in the config.
quant_cfg = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False,
)

# NOTE(review): trust_remote_code=True executes Python shipped with the model
# repository -- only use it with sources you trust.
# device_map is intentionally not passed: AutoGPTQ handles device placement
# itself while loading and quantizing.
model = AutoGPTQForCausalLM.from_pretrained(
    BASE_MODEL,
    quant_cfg,
    trust_remote_code=True,
)

# Run the actual GPTQ pass; without this call the saved weights would not be
# quantized.
model.quantize(calib_examples)

# save_quantized writes the packed weights together with the quantization
# config into OUT_DIR.
model.save_quantized(str(OUT_DIR), use_safetensors=True)
tokenizer.save_pretrained(OUT_DIR)
print(f"Saved GPTQ model to {OUT_DIR}")
