In [1]:
import gc, json, torch, logging, os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot
from llmcompressor import configure_logger, LoggerConfig

from evaluation_utils import evaluate_mmlu, mmlu_harness_hf, display_metric

# Disable llmcompressor's logger entirely: no console level and no log file
# are configured, and any previously registered loggers are cleared.
# Presumably this keeps the cell output limited to progress bars and the
# evaluation summaries.
configure_logger(LoggerConfig(
    disabled=True,
    clear_loggers=True,
    console_log_level=None,
    log_file=None,
    log_file_level=None
))


In [2]:
# --- Model / Hub identifiers -------------------------------------------------
model_repo = "brygotti/MNLP_M2_mcqa_model"  # checkpoint that gets quantised
hub_prefix = "TheS3b/Qwen3-0.6B-SmoothQuant-W8A8-calib"  # base name for HF pushes; the calibration size is appended

# --- Quantisation settings ---------------------------------------------------
BITS = 8                        # weight bit-width, used to build the "W{BITS}A8" scheme
BLOCK_SIZE = 64                 # forwarded as `block_size` to both modifiers
MAX_SEQUENCE_LENGTH = 2048      # truncation length for calibration prompts
prompt_sizes = [20, 200, 2000]  # number of calibration prompts per run

# --- Runtime environment -----------------------------------------------------
# NOTE(review): presumably read by a quantisation backend to skip ExLlama
# kernels; it is set after the imports, so confirm it is picked up in time.
os.environ["EXLLAMA_KERNELS_AVAILABLE"] = "0"
# Suppress every log record at INFO level and below, for all loggers.
logging.disable(logging.INFO)

# Collects the evaluation metrics of every run, keyed by a run label.
all_metrics = {}

In [3]:
# Load the tokenizer once; the same instance is used for filtering,
# tokenising the calibration prompts, evaluation and the Hub push.
# NOTE: trust_remote_code=True lets code shipped in the model repo run locally.
tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)

# Load the calibration corpus; it is filtered further by prompt length and
# relevance right below.
calibration_data = load_dataset("TheS3b/unified-dataset-filtered-430K")
def is_valid_prompt(example, min_len=64, max_len=256, thresh=0.5):
    """Return True when `example` qualifies as a calibration prompt.

    An example is kept when the token count of its prompt lies in
    [min_len, max_len] and the mean of its two relevance scores is strictly
    greater than `thresh`.

    Args:
        example: Mapping with a "prompt" string and numeric "relevance1" and
            "relevance2" entries.
        min_len: Minimum number of prompt tokens (inclusive).
        max_len: Maximum number of prompt tokens (inclusive).
        thresh: Exclusive lower bound on the mean relevance score.

    Returns:
        Whether the example passes both the length and the relevance check.
    """
    # Only the token count matters here, so skip return_tensors="pt":
    # allocating a tensor for every row of the corpus is pure overhead.
    n_tokens = len(tokenizer(example["prompt"])["input_ids"])
    if not min_len <= n_tokens <= max_len:
        return False
    mean_relevance = (example["relevance1"] + example["relevance2"]) * 0.5
    return mean_relevance > thresh
# Keep only the prompts accepted by `is_valid_prompt`, shuffle with a fixed
# seed, and use the "train" split as the pool of calibration prompts.
filtered_calibration_set = (
    calibration_data
    .filter(is_valid_prompt, batched=False)
    .shuffle(seed=42)
)["train"]

def tokenise(sample):
    """Tokenise the "prompt" field of one calibration sample.

    The prompt is truncated to MAX_SEQUENCE_LENGTH tokens, is not padded, and
    no special tokens are added.

    Args:
        sample: Mapping that holds the raw text under "prompt".

    Returns:
        Whatever the tokenizer returns for that prompt.
    """
    encode_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    return tokenizer(sample["prompt"], **encode_options)

# Held-out multiple-choice set used to score every quantised model.
eval_ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]

# Fail fast: each run takes the first `size` rows of the filtered pool, so
# check that the pool is large enough before any expensive work starts.
largest_request = max(prompt_sizes, default=0)
if largest_request > len(filtered_calibration_set):
    raise ValueError(
        f"Requested {largest_request} calibration prompts but only "
        f"{len(filtered_calibration_set)} passed filtering."
    )

# Create the output folder up front so results cannot be lost to a missing
# directory after a long quantisation run.
metrics_path = "Results/smooth_quant_metrics_calibration.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

for size in prompt_sizes:
    print(f"\n── SmoothQuant W{BITS}A8 with {size} calibration prompts ──\n")
    gc.collect()
    torch.cuda.empty_cache()

    # Build the tokenised calibration dataset from the first `size` prompts.
    calib_ds = filtered_calibration_set.select(range(size)).map(
        tokenise, remove_columns=filtered_calibration_set.column_names
    )

    # Fresh FP16 copy of the model for every run; oneshot() modifies it in place.
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
    )

    try:
        # SmoothQuant first, then GPTQ; lm_head is excluded from both.
        recipe = [
            SmoothQuantModifier(
                smoothing_strength=0.8,
                ignore=["lm_head"],
                num_calibration_steps=size,
                block_size=BLOCK_SIZE,
            ),
            GPTQModifier(
                scheme=f"W{BITS}A8",
                targets="Linear",
                ignore=["lm_head"],
                block_size=BLOCK_SIZE,
            ),
        ]

        # One-shot quantisation pass
        oneshot(
            model=model,
            dataset=calib_ds,
            recipe=recipe,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=size,
        )

        # NOTE(review): the recorded run reports ~1.2 GB peak VRAM for the
        # first size and ~2.6 GB for the later ones; check whether
        # evaluate_mmlu resets CUDA peak-memory statistics before comparing
        # VRAM numbers across runs.
        model.eval()
        device = next(model.parameters()).device
        metrics = evaluate_mmlu(eval_ds, model, tokenizer, device, mmlu_harness_hf)
        display_metric(f"SmoothQuant W{BITS}A8 Size {size}", metrics)
        key = f"SmoothQuant W{BITS}A8 calib{size}"
        all_metrics[key] = metrics

        # Persist the metrics BEFORE touching the network: a failed upload
        # must not discard the evaluation that was just computed.
        with open(metrics_path, "w") as f:
            json.dump(all_metrics, f, indent=2)

        push_name = f"{hub_prefix}{size}"
        tokenizer.push_to_hub(push_name)
        model.push_to_hub(push_name)
    finally:
        # Release the model even when a step above raised, so a re-run of the
        # cell does not start with the previous model still referenced.
        del model
        gc.collect()
        torch.cuda.empty_cache()



── SmoothQuant W8A8 with 20 calibration prompts ──



20it [00:02,  8.14it/s]
Preparing intermediates cache: 100%|██████████| 20/20 [00:00<00:00, 1947.99it/s]
(1/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 148.77it/s]
(1/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 481.89it/s]
(2/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 353.38it/s]
(2/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 556.38it/s]
(3/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 351.24it/s]
(3/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 528.83it/s]
(4/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 350.78it/s]
(4/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 587.58it/s]
(5/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 350.20it/s]
(5/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 649.55it/s]
(6/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 355.10it/s]
(6/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 664.03it/s]
(7/29): Calibrating: 100%|██████████| 20/20 [00:00<00:0


**SmoothQuant W8A8 Size 20 Evaluation Results**
- Accuracy              :  46.79 %
- Avg. inference time   :  709.5 ms
- Throughput (tok/s)    :  588.1
- Avg. peak VRAM        : 1207.4 MB
- Score Acc/VRAM        :  0.388 



tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Quantized Compression: 100%|██████████| 703/703 [00:02<00:00, 290.96it/s]


model.safetensors:   0%|          | 0.00/1.06G [00:00<?, ?B/s]


── SmoothQuant W8A8 with 200 calibration prompts ──



200it [00:07, 26.12it/s]
Preparing intermediates cache: 100%|██████████| 200/200 [00:00<00:00, 2375.76it/s]
(1/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 314.20it/s]
(1/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 503.03it/s]
(2/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 354.49it/s]
(2/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 661.51it/s]
(3/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 356.35it/s]
(3/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 664.13it/s]
(4/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 356.67it/s]
(4/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 663.76it/s]
(5/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 357.17it/s]
(5/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 661.46it/s]
(6/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 354.04it/s]
(6/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 662.26it/s]
(7/29): Calibrating: 100%|██


**SmoothQuant W8A8 Size 200 Evaluation Results**
- Accuracy              :  46.94 %
- Avg. inference time   :  710.6 ms
- Throughput (tok/s)    :  587.2
- Avg. peak VRAM        : 2642.5 MB
- Score Acc/VRAM        :  0.178 



tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Quantized Compression: 100%|██████████| 703/703 [00:02<00:00, 311.24it/s]


model.safetensors:   0%|          | 0.00/1.06G [00:00<?, ?B/s]


── SmoothQuant W8A8 with 2000 calibration prompts ──



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

2000it [01:15, 26.40it/s]
Preparing intermediates cache: 100%|██████████| 2000/2000 [00:00<00:00, 2404.71it/s]
(1/29): Calibrating: 100%|██████████| 2000/2000 [00:06<00:00, 318.23it/s]
(1/29): Propagating: 100%|██████████| 2000/2000 [00:04<00:00, 431.12it/s]
(2/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 355.76it/s]
(2/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 608.47it/s]
(3/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 348.34it/s]
(3/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 645.07it/s]
(4/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 355.97it/s]
(4/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 646.69it/s]
(5/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 356.05it/s]
(5/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 645.79it/s]
(6/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 355.85it/s]
(6/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 629.22it/s]
(


**SmoothQuant W8A8 Size 2000 Evaluation Results**
- Accuracy              :  46.99 %
- Avg. inference time   :  712.0 ms
- Throughput (tok/s)    :  586.0
- Avg. peak VRAM        : 2645.0 MB
- Score Acc/VRAM        :  0.178 



tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Quantized Compression: 100%|██████████| 703/703 [00:03<00:00, 181.85it/s]


model.safetensors:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

In [6]:
# --- Model / Hub identifiers -------------------------------------------------
model_repo = "brygotti/MNLP_M2_mcqa_model"  # checkpoint that gets quantised
# NOTE(review): the prefix hard-codes "W4A8" while BITS is a list; every bit
# width in BITS is pushed under this same name.
hub_prefix = "TheS3b/Qwen3-0.6B-SmoothQuant-W4A8-calib"  # base name for HF pushes; `size` is appended

# --- Quantisation settings ---------------------------------------------------
# NOTE(review): BITS was a plain int in the earlier config cell and is a list
# here; re-running the earlier experiment after this cell iterates differently.
BITS = [4]                  # weight bit-widths to run, one "W{BIT}A8" scheme each
BLOCK_SIZE = 64             # forwarded as `block_size` to both modifiers
MAX_SEQUENCE_LENGTH = 2048  # truncation length for calibration prompts
size = 200                  # number of calibration prompts

# --- Runtime environment -----------------------------------------------------
# NOTE(review): presumably read by a quantisation backend to skip ExLlama
# kernels; it is set after the imports, so confirm it is picked up in time.
os.environ["EXLLAMA_KERNELS_AVAILABLE"] = "0"
# Suppress every log record at INFO level and below, for all loggers.
logging.disable(logging.INFO)

# Collects the evaluation metrics of every run, keyed by a run label.
all_metrics = {}

In [7]:
# Load the tokenizer once; the same instance is used for filtering,
# tokenising the calibration prompts, evaluation and the Hub push.
# NOTE: trust_remote_code=True lets code shipped in the model repo run locally.
tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)

# Load the calibration corpus; it is filtered further by prompt length and
# relevance right below.
# NOTE(review): this repeats the load/filter steps of the calibration-size
# experiment earlier in the notebook.
calibration_data = load_dataset("TheS3b/unified-dataset-filtered-430K")
def is_valid_prompt(example, min_len=64, max_len=256, thresh=0.5):
    """Return True when `example` qualifies as a calibration prompt.

    An example is kept when the token count of its prompt lies in
    [min_len, max_len] and the mean of its two relevance scores is strictly
    greater than `thresh`.

    Args:
        example: Mapping with a "prompt" string and numeric "relevance1" and
            "relevance2" entries.
        min_len: Minimum number of prompt tokens (inclusive).
        max_len: Maximum number of prompt tokens (inclusive).
        thresh: Exclusive lower bound on the mean relevance score.

    Returns:
        Whether the example passes both the length and the relevance check.
    """
    # Only the token count matters here, so skip return_tensors="pt":
    # allocating a tensor for every row of the corpus is pure overhead.
    n_tokens = len(tokenizer(example["prompt"])["input_ids"])
    if not min_len <= n_tokens <= max_len:
        return False
    mean_relevance = (example["relevance1"] + example["relevance2"]) * 0.5
    return mean_relevance > thresh
# Keep only the prompts accepted by `is_valid_prompt`, shuffle with a fixed
# seed, and use the "train" split as the pool of calibration prompts.
filtered_calibration_set = (
    calibration_data
    .filter(is_valid_prompt, batched=False)
    .shuffle(seed=42)
)["train"]

def tokenise(sample):
    """Tokenise the "prompt" field of one calibration sample for llm-compressor.

    llm-compressor expects tokenised samples. The prompt is truncated to
    MAX_SEQUENCE_LENGTH tokens, is not padded, and no special tokens are added.

    Args:
        sample: Mapping that holds the raw text under "prompt".

    Returns:
        Whatever the tokenizer returns for that prompt.
    """
    encode_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    return tokenizer(sample["prompt"], **encode_options)

# Held-out multiple-choice set used to score every quantised model.
eval_ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]

# Fail fast: the run takes the first `size` rows of the filtered pool, so
# check that the pool is large enough before any expensive work starts.
if size > len(filtered_calibration_set):
    raise ValueError(
        f"Requested {size} calibration prompts but only "
        f"{len(filtered_calibration_set)} passed filtering."
    )

# Every bit width is pushed to f"{hub_prefix}{size}". Refuse to start when a
# requested width is not the one named in hub_prefix; otherwise a model would
# be uploaded under a misleading name or overwrite another run's upload.
unnamed_bits = [bit for bit in BITS if f"W{bit}A8" not in hub_prefix]
if unnamed_bits:
    raise ValueError(
        f"hub_prefix {hub_prefix!r} does not name the scheme for bit width(s) "
        f"{unnamed_bits}; adjust hub_prefix or BITS before pushing."
    )

# Create the output folder up front so results cannot be lost to a missing
# directory after a long quantisation run.
metrics_path = "Results/smooth_quant_metrics_bits.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

# The calibration set does not depend on the bit width, so build it once.
calib_ds = filtered_calibration_set.select(range(size)).map(
    tokenise, remove_columns=filtered_calibration_set.column_names
)

for BIT in BITS:
    print(f"\n-- SmoothQuant W{BIT}A8 with {size} calibration prompts --\n")
    gc.collect()
    torch.cuda.empty_cache()

    # Fresh FP16 copy of the model (no quantisation yet); oneshot() modifies
    # it in place.
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
    )

    try:
        # SmoothQuant first, then GPTQ; lm_head is excluded from both.
        recipe = [
            SmoothQuantModifier(
                smoothing_strength=0.8,
                ignore=["lm_head"],
                num_calibration_steps=size,
                block_size=BLOCK_SIZE,
            ),
            GPTQModifier(
                scheme=f"W{BIT}A8",
                targets="Linear",
                ignore=["lm_head"],
                block_size=BLOCK_SIZE,
            ),
        ]

        # One-shot quantisation pass
        oneshot(
            model=model,
            dataset=calib_ds,
            recipe=recipe,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=size,
        )

        model.eval()
        device = next(model.parameters()).device
        metrics = evaluate_mmlu(eval_ds, model, tokenizer, device, mmlu_harness_hf)
        display_metric(f"SmoothQuant W{BIT}A8 Size {size}", metrics)
        key = f"SmoothQuant W{BIT}A8 calib{size}"
        all_metrics[key] = metrics

        # Persist the metrics BEFORE touching the network: a failed upload
        # must not discard the evaluation that was just computed.
        with open(metrics_path, "w") as f:
            json.dump(all_metrics, f, indent=2)

        push_name = f"{hub_prefix}{size}"
        tokenizer.push_to_hub(push_name)
        model.push_to_hub(push_name)
    finally:
        # Release the model even when a step above raised, so a re-run of the
        # cell does not start with the previous model still referenced.
        del model
        gc.collect()
        torch.cuda.empty_cache()



-- SmoothQuant W4A8 with 200 calibration prompts --



200it [00:07, 25.70it/s]
Preparing intermediates cache: 100%|██████████| 200/200 [00:00<00:00, 2359.67it/s]
(1/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 310.43it/s]
(1/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 476.17it/s]
(2/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 350.43it/s]
(2/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 648.11it/s]
(3/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 352.17it/s]
(3/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 644.95it/s]
(4/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 351.20it/s]
(4/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 628.56it/s]
(5/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 348.04it/s]
(5/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 642.18it/s]
(6/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 351.31it/s]
(6/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 642.73it/s]
(7/29): Calibrating: 100%|██


**SmoothQuant W4A8 Size 200 Evaluation Results**
- Accuracy              :  43.73 %
- Avg. inference time   : 2731.0 ms
- Throughput (tok/s)    :  152.8
- Avg. peak VRAM        : 2661.8 MB
- Score Acc/VRAM        :  0.164 



tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Quantized Compression: 100%|██████████| 703/703 [00:16<00:00, 43.91it/s]


model.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]