In [2]:
# !pip install --upgrade "transformers<5.0"

In [4]:
# !pip install git+https://github.com/intel/auto-round.git

In [1]:
import os
import torch
from auto_round import AutoRound
from huggingface_hub import HfApi, create_repo, notebook_login, get_token
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


PyTorch Version: 2.8.0+cu126
CUDA Available: True
CUDA Version: 12.6
GPU Name: NVIDIA GeForce RTX 3090
VRAM: 23.6 GB


In [4]:
MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
HF_USER = "Vishva007"
OUTPUT_BASE_DIR = "./AutoRound"

In [5]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
TUNING_CONFIG = {
    "group_size": 128,
    "sym": True,
    "iters": 800,  # High accuracy (Production grade)
    "nsamples": 512,  # More calibration data
    "batch_size": 2,  # Faster on 48GB VRAM
    "seqlen": 2048,
    "low_gpu_mem_usage": False,  # Keep on GPU for speed
    "enable_torch_compile": True,  # JIT acceleration
    "quant_nontext_module": False,  # Keep Vision Tower in BF16 (Crucial for VLM accuracy)
}

In [7]:
def push_to_hub(local_dir, repo_name, token):
    """Creates repo and uploads folder to Hugging Face."""
    full_repo_id = f"{HF_USER}/{repo_name}"
    print(f"\n[Hub] Pushing {local_dir} to {full_repo_id}...")

    try:
        api = HfApi()
        create_repo(
            full_repo_id, repo_type="model", exist_ok=True, private=False, token=token
        )

        api.upload_folder(
            folder_path=local_dir, repo_id=full_repo_id, repo_type="model", token=token
        )
        print(f"[Hub] ✅ Successfully uploaded: https://huggingface.co/{full_repo_id}")
    except Exception as e:
        print(f"[Hub] ❌ Error uploading: {e}")

In [8]:
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID, dtype=torch.float16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

tokenizer = processor.tokenizer


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/390 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

In [9]:
ar = AutoRound(
    model=model,
    tokenizer=tokenizer,
    processor=processor,
    scheme="W4A16",
    model_dtype="fp16",
    **TUNING_CONFIG,
)

[38;20m2026-02-07 17:40:17 INFO autoround.py L165: using MLLM mode for multimodal model.[0m
[38;20m2026-02-07 17:40:18 INFO base.py L486: using torch.float16 for quantization tuning[0m


In [10]:
# SINGLE CALL to save all 3 formats to the same output directory
# The files will exist side-by-side or merged in this folder.
ar.quantize_and_save(
    OUTPUT_BASE_DIR, format="auto_gptq,auto_awq,auto_round", inplace=True
)

[38;20m2026-02-07 17:40:20 INFO base.py L1739: start to cache block inputs[0m


README.md:   0%|          | 0.00/373 [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

data/train-00000-of-00001-4746b8785c874c(…):   0%|          | 0.00/33.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1216 [00:00<?, ? examples/s]

  freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
cache block inputs: 100%|██████████| 512/512 [00:08<00:00, 60.85it/s]
[38;20m2026-02-07 17:41:20 INFO base.py L1754: caching done[0m
  extern_kernels.mm(reinterpret_tensor(buf1, (4096, 2560), (2560, 1), 0), reinterpret_tensor(arg6_1, (2560, 4096), (1, 2560), 0), out=buf2)
  extern_kernels.mm(reinterpret_tensor(buf1, (4096, 2560), (2560, 1), 0), reinterpret_tensor(arg8_1, (2560, 1024), (1, 2560), 0), out=buf5)
  extern_kernels.mm(reinterpret_tensor(buf1, (4096, 2560), (2560, 1), 0), reinterpret_tensor(arg10_1, (2560, 1024), (1, 2560), 0), out=buf8)
  extern_kernels.mm(reinterpret_tensor(buf12, (4096, 4096), (4096, 1), 0), reinterpret_tensor(arg11_1, (4096, 2560), (1, 4096), 0), out=buf17)
  extern_kernels.mm(reinterpret_tensor(buf19, (4096, 2560), (2560, 1), 0), reinterpret_tensor(arg13_1, (2560, 9728), (1, 2560), 0), out=buf20)
  extern_kernels.mm(reinterpret_tensor(buf19, (4096, 2560), (2560, 1), 0

(Qwen3VLForConditionalGeneration(
   (model): Qwen3VLModel(
     (visual): Qwen3VLVisionModel(
       (patch_embed): Qwen3VLVisionPatchEmbed(
         (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))
       )
       (pos_embed): Embedding(2304, 1024)
       (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
       (blocks): ModuleList(
         (0-23): 24 x Qwen3VLVisionBlock(
           (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
           (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
           (attn): Qwen3VLVisionAttention(
             (qkv): Linear(in_features=1024, out_features=3072, bias=True)
             (proj): Linear(in_features=1024, out_features=1024, bias=True)
           )
           (mlp): Qwen3VLVisionMLP(
             (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)
             (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)
             (act_fn): GELUTanh()
           

In [11]:
base_name = MODEL_ID.split("/")[-1]
hf_token = get_token()

In [12]:
path_autoround = os.path.join(OUTPUT_BASE_DIR, "auto-round-auto-gptq")
path_gptq = os.path.join(OUTPUT_BASE_DIR, "auto-gptq")
path_awq = os.path.join(OUTPUT_BASE_DIR, "auto-awq")

In [13]:
if hf_token:
    # 1. AutoRound Repo
    # Verify path exists before uploading
    if os.path.exists(path_autoround):
        push_to_hub(path_autoround, f"{base_name}-W4A16-AutoRound", hf_token)
    else:
        print(f"⚠️ Could not find AutoRound output at {path_autoround}")

    # 2. GPTQ Repo
    if os.path.exists(path_gptq):
        push_to_hub(path_gptq, f"{base_name}-W4A16-AutoRound-GPTQ", hf_token)
    else:
        print(f"⚠️ Could not find GPTQ output at {path_gptq}")

    # 3. AWQ Repo
    if os.path.exists(path_awq):
        push_to_hub(path_awq, f"{base_name}-W4A16-AutoRound-AWQ", hf_token)
    else:
        print(f"⚠️ Could not find AWQ output at {path_awq}")


[Hub] Pushing ./AutoRound/auto-round-auto-gptq to Vishva007/Qwen3-VL-4B-Instruct-W4A16-AutoRound...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

[Hub] ✅ Successfully uploaded: https://huggingface.co/Vishva007/Qwen3-VL-4B-Instruct-W4A16-AutoRound

[Hub] Pushing ./AutoRound/auto-gptq to Vishva007/Qwen3-VL-4B-Instruct-W4A16-AutoRound-GPTQ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

[Hub] ✅ Successfully uploaded: https://huggingface.co/Vishva007/Qwen3-VL-4B-Instruct-W4A16-AutoRound-GPTQ

[Hub] Pushing ./AutoRound/auto-awq to Vishva007/Qwen3-VL-4B-Instruct-W4A16-AutoRound-AWQ...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

[Hub] ✅ Successfully uploaded: https://huggingface.co/Vishva007/Qwen3-VL-4B-Instruct-W4A16-AutoRound-AWQ
