In [None]:
# !pip install --upgrade transformers

In [None]:
# !pip install auto-round

In [1]:
import os
import torch
from auto_round import AutoRound
from huggingface_hub import HfApi, create_repo

In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


PyTorch Version: 2.8.0+cu126
CUDA Available: True
CUDA Version: 12.6
GPU Name: NVIDIA GeForce RTX 4090
VRAM: 23.5 GB


In [19]:
MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"
HF_USER = "Vishva007"
OUTPUT_BASE_DIR = "./AutoRound"

In [8]:
HF_TOKEN = os.getenv("HF_TOKEN")

In [None]:
TARGET_FORMATS = [
    "AWQ",            # Best for Nvidia GPUs (vLLM, TGI)
    # "GPTQ",         # Good compatibility
    "AutoRound",      # Intel default format (Requires auto-round lib to run)
    # "GGUF",         # For llama.cpp / Olama
]

# High-End GPU Tuning Parameters (A40/A6000/L40)
TUNING_CONFIG = {
    "group_size": 128,
    "sym": True,
    "iters": 1000,          # High accuracy (Production grade)
    "nsamples": 512,        # More calibration data
    "batch_size": 8,        # Faster on 48GB VRAM
    "seqlen": 2048,
    "low_gpu_mem_usage": False,   # Keep on GPU for speed
    "enable_torch_compile": True, # JIT acceleration
    "quant_nontext_module": False # Keep Vision Tower in BF16 (Crucial for VLM accuracy)
}

In [None]:
def push_to_hub(local_dir, repo_name, token):
    """Creates repo and uploads folder to Hugging Face."""
    full_repo_id = f"{HF_USER}/{repo_name}"
    print(f"\n[Hub] Pushing {local_dir} to {full_repo_id}...")
    
    try:
        api = HfApi(token=token)
        create_repo(full_repo_id, repo_type="model", exist_ok=True, private=False, token=token)
        
        api.upload_folder(
            folder_path=local_dir,
            repo_id=full_repo_id,
            repo_type="model",
            token=token
        )
        print(f"[Hub] ✅ Successfully uploaded: https://huggingface.co/{full_repo_id}")
    except Exception as e:
        print(f"[Hub] ❌ Error uploading: {e}")

In [14]:
ar = AutoRound(
        MODEL_ID,
        scheme="W4A16",
        **TUNING_CONFIG
    )

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

404 Client Error. (Request ID: Root=1-697cd268-03c062925dfe3dd45fefea95;11f64133-6eb0-40b8-b08e-c6d2bd09520a)

Entry Not Found for url: https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507/resolve/main/model_index.json.


Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]

[38;20m2026-01-30 15:46:51 INFO base.py L391: using torch.bfloat16 for quantization tuning[0m


In [15]:
ar.quantize()

[38;20m2026-01-30 15:47:35 INFO base.py L1729: start to cache block inputs[0m


README.md:   0%|          | 0.00/373 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

data/train-00000-of-00001-4746b8785c874c(…):   0%|          | 0.00/33.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1216 [00:00<?, ? examples/s]

[38;20m2026-01-30 15:48:01 INFO base.py L1744: caching done[0m
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELI

(Qwen3ForCausalLM(
   (model): Qwen3Model(
     (embed_tokens): Embedding(151936, 2560)
     (layers): ModuleList(
       (0-35): 36 x Qwen3DecoderLayer(
         (self_attn): Qwen3Attention(
           (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
           (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
           (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
           (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
           (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
           (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
         )
         (mlp): Qwen3MLP(
           (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
           (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
           (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
           (act_fn): SiLUActivation()
         )
         (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
         

In [21]:
base_name = MODEL_ID.split("/")[-1]
    
if "AWQ" in TARGET_FORMATS:
        save_dir = os.path.join(OUTPUT_BASE_DIR, "W4A16-AWQ")
        print("\n💾 Exporting to AWQ...")
        ar.save_quantized(save_dir, format="auto_awq", inplace=False)
        push_to_hub(save_dir, f"{base_name}-W4A16-AutoRound-AWQ", HF_TOKEN)

if "GPTQ" in TARGET_FORMATS:
        save_dir = os.path.join(OUTPUT_BASE_DIR, "W4A16-GPTQ")
        print("\n💾 Exporting to GPTQ...")
        ar.save_quantized(save_dir, format="auto_gptq", inplace=False)
        push_to_hub(save_dir, f"{base_name}-W4A16-AutoRound-GPTQ", HF_TOKEN)

if "AutoRound" in TARGET_FORMATS:
        save_dir = os.path.join(OUTPUT_BASE_DIR, "W4A16-AutoRound")
        print("\n💾 Exporting to AutoRound format...")
        ar.save_quantized(save_dir, format="auto_round", inplace=False)
        push_to_hub(save_dir, f"{base_name}-W4A16-AutoRound", HF_TOKEN)


[38;20m2026-01-30 16:43:09 INFO export.py L96: Saving quantized model to auto_awq format[0m



💾 Exporting to AWQ...


packing model.layers.35.mlp.down_proj: 100%|██████████| 252/252 [00:03<00:00, 74.00it/s]   


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]




[Hub] Pushing ./AutoRound/W4A16-AWQ to Vishva007/Qwen3-4B-Instruct-2507-W4A16-AutoRound-AWQ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

[Hub] ✅ Successfully uploaded: https://huggingface.co/Vishva007/Qwen3-4B-Instruct-2507-W4A16-AutoRound-AWQ

💾 Exporting to AutoRound format...


packing model.layers.35.mlp.down_proj: 100%|██████████| 252/252 [00:04<00:00, 62.01it/s]   


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]




[Hub] Pushing ./AutoRound/W4A16-AutoRound to Vishva007/Qwen3-4B-Instruct-2507-W4A16-AutoRound...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

[Hub] ✅ Successfully uploaded: https://huggingface.co/Vishva007/Qwen3-4B-Instruct-2507-W4A16-AutoRound
