In [None]:
# !pip install --upgrade transformers --break-system-packages

In [None]:
# !pip install auto-round --break-system-packages

In [14]:
import os
import torch
from auto_round import AutoRound
from huggingface_hub import HfApi, create_repo, notebook_login, get_token
from transformers import AutoModelForCausalLM, AutoTokenizer


In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


PyTorch Version: 2.10.0+cu126
CUDA Available: True
CUDA Version: 12.6
GPU Name: NVIDIA RTX A4000
VRAM: 15.7 GB


In [4]:
MODEL_ID = "Qwen/Qwen3-1.7B"
HF_USER = "Vishva007"
OUTPUT_BASE_DIR = "./AutoRound"

In [5]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# "AWQ",            # Best for Nvidia GPUs (vLLM, TGI)
# "GPTQ",           # Good compatibility
# "AutoRound",      # Intel default format (Requires auto-round lib to run)
# "GGUF",           # For llama.cpp / Olama

TARGET_FORMATS = "AWQ"

# High-End GPU Tuning Parameters (A40/A6000/L40)
TUNING_CONFIG = {
    "group_size": 128,
    "sym": True,
    "iters": 256,          # High accuracy (Production grade)
    "nsamples": 256,        # More calibration data
    "batch_size": 8,        # Faster on 48GB VRAM
    "seqlen": 2048,
    "low_gpu_mem_usage": False,   # Keep on GPU for speed
    "enable_torch_compile": True, # JIT acceleration
    # "quant_nontext_module": False # Keep Vision Tower in BF16 (Crucial for VLM accuracy)
}

In [7]:
def push_to_hub(local_dir, repo_name, token):
    """Creates repo and uploads folder to Hugging Face."""
    full_repo_id = f"{HF_USER}/{repo_name}"
    print(f"\n[Hub] Pushing {local_dir} to {full_repo_id}...")
    
    try:
        api = HfApi()
        create_repo(full_repo_id, repo_type="model", exist_ok=True, private=False, token=token)
        
        api.upload_folder(
            folder_path=local_dir,
            repo_id=full_repo_id,
            repo_type="model",
            token=token
        )
        print(f"[Hub] ✅ Successfully uploaded: https://huggingface.co/{full_repo_id}")
    except Exception as e:
        print(f"[Hub] ❌ Error uploading: {e}")

In [None]:
use_fp16 = TARGET_FORMATS in ["AWQ", "GPTQ"]

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, 
    dtype=torch.float16 if use_fp16 else "auto",
    trust_remote_code=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
ar = AutoRound(
        model,              
        tokenizer, 
        scheme="W4A16",
        **TUNING_CONFIG
    )

[38;20m2026-02-01 05:23:55 INFO base.py L391: using torch.float16 for quantization tuning[0m


In [10]:
base_name = MODEL_ID.split("/")[-1]

if TARGET_FORMATS == "AWQ":
    dir_suffix = "W4A16-AWQ"
    format_arg = "auto_awq"
elif TARGET_FORMATS == "GPTQ":
    dir_suffix = "W4A16-GPTQ"
    format_arg = "auto_gptq"
else: # AutoRound
    dir_suffix = "W4A16-AutoRound"
    format_arg = "auto_round"

save_dir = os.path.join(OUTPUT_BASE_DIR, dir_suffix)


ar.quantize_and_save(
    format=format_arg, 
    output_dir=save_dir, 
    inplace=False
)

[38;20m2026-02-01 05:23:56 INFO base.py L1729: start to cache block inputs[0m
[38;20m2026-02-01 05:24:02 INFO base.py L1744: caching done[0m
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[38;20mquantized 7/7 layers in the block, loss iter 0: 0.002010 -> iter 224: 0.000584,'peak_ram': 6.33GB, 'peak_vram': 7.9GB[0m
Quantizing model.layers.1:   4%|▎         | 1/28 [00:33<14:59, 33.30s/it][38;20mquantized 7/7 layers in the block, loss iter 0: 0.003459 -> iter 234: 0.001175,'peak_ram': 6.33GB, 'peak_vram': 7.94GB[0m
Quantizing model.layers.2:   7%|▋         | 2/28 [01:04<13:55, 32.14s/it][38;20mquantized 7/7 layers in the block, loss iter 0: 0.049776 -> iter 90: 0.024743,'peak_ram': 6.33GB, 'peak_vram': 7.94GB[0m
Quantizing model.layers.3:  11%|█         | 3/28 [01:35<13:09, 31.56s/it][38;20mquantized 7/7 layers in the block, loss iter 0: 0.035735 -> iter 239: 0.024846,'peak_ram': 6.33GB, 'peak_vram': 7.94GB[0m
Quantizing

(Qwen3ForCausalLM(
   (model): Qwen3Model(
     (embed_tokens): Embedding(151936, 2048)
     (layers): ModuleList(
       (0-27): 28 x Qwen3DecoderLayer(
         (self_attn): Qwen3Attention(
           (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
           (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
           (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
           (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
           (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
           (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
         )
         (mlp): Qwen3MLP(
           (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
           (up_proj): Linear(in_features=2048, out_features=6144, bias=False)
           (down_proj): Linear(in_features=6144, out_features=2048, bias=False)
           (act_fn): SiLUActivation()
         )
         (input_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
         

In [15]:
hf_token = get_token()

if hf_token:
    push_to_hub(save_dir, f"{base_name}-W4A16-AutoRound-{TARGET_FORMATS}", hf_token)
else:
    print("❌ No HF token found. Skipping upload.")



[Hub] Pushing ./AutoRound/W4A16-AWQ to Vishva007/Qwen3-1.7B-W4A16-AutoRound-AWQ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

[Hub] ✅ Successfully uploaded: https://huggingface.co/Vishva007/Qwen3-1.7B-W4A16-AutoRound-AWQ
