In [1]:
import gc, json, torch, logging, os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot
from llmcompressor import configure_logger, LoggerConfig

from evaluation_utils import evaluate_mmlu, mmlu_harness_hf, display_metric

# Configure llm-compressor's logger as disabled, with existing loggers cleared
# and no console level, log file or log-file level set.
_llmc_logger_config = LoggerConfig(
    disabled=True,
    clear_loggers=True,
    console_log_level=None,
    log_file=None,
    log_file_level=None,
)
configure_logger(_llmc_logger_config)


In [2]:
# --- Experiment 1 configuration: calibration-set size sweep -----------------

# Checkpoint that gets quantised, and the Hub repo prefix the results are
# pushed under (the calibration size is appended to the prefix for each run).
model_repo = "brygotti/MNLP_M3_mcqa_model"
hub_prefix = "TheS3b/Qwen3-0.6B-SmoothQuant-W8A8-calib"

# Quantisation settings.
BITS = 8                    # weight bit-width, used to build the "W{BITS}A8" scheme
BLOCK_SIZE = 64             # block_size handed to the modifiers
MAX_SEQUENCE_LENGTH = 2048  # truncation length for calibration prompts

# Number of calibration prompts used by each run of the sweep.
prompt_sizes = [20, 200, 2000]

# Suppress log records of severity INFO and below for the whole process.
logging.disable(logging.INFO)
os.environ.update(EXLLAMA_KERNELS_AVAILABLE="0")

# Collected metrics, one entry per run label.
all_metrics = dict()

In [3]:
# Load the tokenizer a single time; it is shared by every run in this cell.
tokenizer = AutoTokenizer.from_pretrained(
    model_repo,
    trust_remote_code=True,
)

# Pool of calibration prompts; each run takes its first `size` rows.
filtered_calibration_set = load_dataset(
    "TheS3b/MNLP_M3_quantized_dataset"
)["train"]

def tokenise(sample):
    """Tokenise the ``"prompt"`` field of one calibration sample.

    The text is truncated to ``MAX_SEQUENCE_LENGTH`` tokens, left unpadded and
    encoded without special tokens.

    Args:
        sample: Mapping that has a ``"prompt"`` entry.

    Returns:
        The value returned by the module-level ``tokenizer`` for that text.
    """
    encode_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    prompt_text = sample["prompt"]
    return tokenizer(prompt_text, **encode_options)

# Multiple-choice test split used to score every quantised model.
eval_ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]

# Metrics file, rewritten after every run. Create its directory up front so a
# missing "Results/" folder cannot discard a finished run.
metrics_path = "Results/smooth_quant_metrics_calibration.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

# Fail fast: an oversized request would otherwise only surface inside
# `select(range(size))`, after the smaller (expensive) runs have completed.
n_calibration_available = len(filtered_calibration_set)
if max(prompt_sizes, default=0) > n_calibration_available:
    raise ValueError(
        f"prompt_sizes={prompt_sizes} asks for more calibration prompts than "
        f"the {n_calibration_available} available in filtered_calibration_set"
    )

# NOTE(review): the recorded outputs report ~1.2 GB average peak VRAM for the
# first run but ~2.6 GB for the following ones. Presumably memory from the
# previous run is still referenced after `del model` — TODO confirm before
# comparing the Acc/VRAM scores across runs.
for size in prompt_sizes:
    print(f"\n── SmoothQuant W{BITS}A8 with {size} calibration prompts ──\n")
    gc.collect()
    torch.cuda.empty_cache()

    # Tokenised calibration subset: the first `size` prompts, with the original
    # columns dropped so only the tokenizer output remains.
    calib_ds = filtered_calibration_set.select(range(size)).map(
        tokenise, remove_columns=filtered_calibration_set.column_names
    )

    # Fresh FP16 copy of the base model for every run.
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
    )

    # SmoothQuant followed by GPTQ; `lm_head` is excluded from both steps.
    # NOTE(review): `block_size` looks like a GPTQ setting — confirm that
    # SmoothQuantModifier actually consumes it.
    recipe = [
        SmoothQuantModifier(
            smoothing_strength=0.8,
            ignore=["lm_head"],
            num_calibration_steps=size,
            block_size=BLOCK_SIZE,
        ),
        GPTQModifier(
            scheme=f"W{BITS}A8",
            targets="Linear",
            ignore=["lm_head"],
            block_size=BLOCK_SIZE,
        ),
    ]

    # One-shot quantisation pass over the calibration subset.
    oneshot(
        model=model,
        dataset=calib_ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=size,
    )

    # Evaluate the quantised model on the device it already lives on.
    model.eval()
    device = next(model.parameters()).device
    metrics = evaluate_mmlu(eval_ds, model, tokenizer, device, mmlu_harness_hf)
    display_metric(f"SmoothQuant W{BITS}A8 Size {size}", metrics)
    key = f"SmoothQuant W{BITS}A8 calib{size}"
    all_metrics[key] = metrics

    # Persist the metrics before uploading, so a failed push (network, auth)
    # does not lose the results of this run.
    with open(metrics_path, "w") as f:
        json.dump(all_metrics, f, indent=2)

    push_name = f"{hub_prefix}{size}"
    tokenizer.push_to_hub(push_name)
    model.push_to_hub(push_name)

    # Drop the model and release cached GPU memory before the next run.
    del model
    gc.collect()
    torch.cuda.empty_cache()



── SmoothQuant W8A8 with 20 calibration prompts ──



20it [00:01, 16.37it/s]
Preparing intermediates cache: 100%|██████████| 20/20 [00:00<00:00, 1937.73it/s]
(1/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 263.25it/s]
(1/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 495.45it/s]
(2/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 352.21it/s]
(2/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 585.94it/s]
(3/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 359.20it/s]
(3/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 655.38it/s]
(4/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 355.29it/s]
(4/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 658.50it/s]
(5/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 356.65it/s]
(5/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 665.11it/s]
(6/29): Calibrating: 100%|██████████| 20/20 [00:00<00:00, 349.72it/s]
(6/29): Propagating: 100%|██████████| 20/20 [00:00<00:00, 621.05it/s]
(7/29): Calibrating: 100%|██████████| 20/20 [00:00<00:0


**SmoothQuant W8A8 Size 20 Evaluation Results**
- Accuracy              :  46.13 %
- Avg. inference time   :  699.2 ms
- Throughput (tok/s)    :  596.7
- Avg. peak VRAM        : 1207.4 MB
- Score Acc/VRAM        :  0.382 



Quantized Compression: 100%|██████████| 703/703 [00:02<00:00, 239.51it/s]


model.safetensors:   0%|          | 0.00/1.06G [00:00<?, ?B/s]


── SmoothQuant W8A8 with 200 calibration prompts ──



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

200it [00:07, 26.08it/s]
Preparing intermediates cache: 100%|██████████| 200/200 [00:00<00:00, 2134.11it/s]
(1/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 315.81it/s]
(1/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 494.80it/s]
(2/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 350.85it/s]
(2/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 580.85it/s]
(3/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 357.74it/s]
(3/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 667.44it/s]
(4/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 357.91it/s]
(4/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 668.79it/s]
(5/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 358.17it/s]
(5/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 668.52it/s]
(6/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 357.76it/s]
(6/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 664.66it/s]
(7/29): Calibrating: 100%|██


**SmoothQuant W8A8 Size 200 Evaluation Results**
- Accuracy              :  46.13 %
- Avg. inference time   :  703.5 ms
- Throughput (tok/s)    :  593.1
- Avg. peak VRAM        : 2642.5 MB
- Score Acc/VRAM        :  0.175 



Quantized Compression: 100%|██████████| 703/703 [00:02<00:00, 258.25it/s]


model.safetensors:   0%|          | 0.00/1.06G [00:00<?, ?B/s]


── SmoothQuant W8A8 with 2000 calibration prompts ──



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

2000it [01:17, 25.78it/s]
Preparing intermediates cache: 100%|██████████| 2000/2000 [00:00<00:00, 2143.97it/s]
(1/29): Calibrating: 100%|██████████| 2000/2000 [00:06<00:00, 317.50it/s]
(1/29): Propagating: 100%|██████████| 2000/2000 [00:04<00:00, 413.35it/s]
(2/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 342.42it/s]
(2/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 637.03it/s]
(3/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 356.83it/s]
(3/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 642.35it/s]
(4/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 345.70it/s]
(4/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 653.09it/s]
(5/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 356.08it/s]
(5/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 658.94it/s]
(6/29): Calibrating: 100%|██████████| 2000/2000 [00:05<00:00, 355.25it/s]
(6/29): Propagating: 100%|██████████| 2000/2000 [00:03<00:00, 648.76it/s]
(


**SmoothQuant W8A8 Size 2000 Evaluation Results**
- Accuracy              :  46.48 %
- Avg. inference time   :  715.1 ms
- Throughput (tok/s)    :  583.5
- Avg. peak VRAM        : 2642.0 MB
- Score Acc/VRAM        :  0.176 



Quantized Compression: 100%|██████████| 703/703 [00:02<00:00, 267.27it/s]


model.safetensors:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

In [4]:
# --- Experiment 2 configuration: weight bit-width at a fixed calibration size

# Checkpoint that gets quantised, and the Hub repo prefix the result is pushed
# under (the calibration size is appended to the prefix).
model_repo = "brygotti/MNLP_M3_mcqa_model"
hub_prefix = "TheS3b/Qwen3-0.6B-SmoothQuant-W4A8-calib"

# Quantisation settings. Note that BITS is a list here: one run per entry.
BITS = [4]
BLOCK_SIZE = 64             # block_size handed to the modifiers
MAX_SEQUENCE_LENGTH = 2048  # truncation length for calibration prompts
size = 200                  # number of calibration prompts

# Suppress log records of severity INFO and below for the whole process.
logging.disable(logging.INFO)
os.environ.update(EXLLAMA_KERNELS_AVAILABLE="0")

# Collected metrics, one entry per run label (starts empty for this experiment).
all_metrics = dict()

In [5]:
# Load the tokenizer a single time for this experiment.
tokenizer = AutoTokenizer.from_pretrained(
    model_repo,
    trust_remote_code=True,
)

# Same pre-filtered calibration pool as in the calibration-size experiment.
filtered_calibration_set = load_dataset(
    "TheS3b/MNLP_M3_quantized_dataset"
)["train"]

# Tokenisation helper: llm-compressor is given already-tokenised samples.
def tokenise(sample):
    """Tokenise the ``"prompt"`` field of one calibration sample.

    The text is truncated to ``MAX_SEQUENCE_LENGTH`` tokens, left unpadded and
    encoded without special tokens.

    Args:
        sample: Mapping that has a ``"prompt"`` entry.

    Returns:
        The value returned by the module-level ``tokenizer`` for that text.
    """
    encode_options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    prompt_text = sample["prompt"]
    return tokenizer(prompt_text, **encode_options)

# Multiple-choice test split used to score the quantised model.
eval_ds = load_dataset(
    "brygotti/NLP4Education_english_single_mcq_4_choices"
)["test"]

# Calibration data is built once, outside the loop: take the first `size`
# prompts, then tokenise them and drop the original columns.
calib_ds = filtered_calibration_set.select(range(size))
calib_ds = calib_ds.map(
    tokenise,
    remove_columns=filtered_calibration_set.column_names,
)

# Metrics file, rewritten after every run. Create its directory up front so a
# missing "Results/" folder cannot discard a finished run.
metrics_path = "Results/smooth_quant_metrics_bits.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

for BIT in BITS:
    # The Hub repo name is built from `hub_prefix` and `size` only, so it does
    # not change with BIT. Refuse to run a bit-width the prefix does not name,
    # rather than quantising and then uploading into a mislabelled repo.
    scheme = f"W{BIT}A8"
    if scheme not in hub_prefix:
        raise ValueError(
            f"hub_prefix {hub_prefix!r} does not mention scheme {scheme!r}; "
            "pushing would store this model under the wrong repository name"
        )

    print(f"\n-- SmoothQuant W{BIT}A8 with {size} calibration prompts --\n")
    gc.collect()
    torch.cuda.empty_cache()

    # Fresh FP16 copy of the base model (not quantised yet).
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
    )

    # SmoothQuant followed by GPTQ; `lm_head` is excluded from both steps.
    # NOTE(review): `block_size` looks like a GPTQ setting — confirm that
    # SmoothQuantModifier actually consumes it.
    recipe = [
        SmoothQuantModifier(
            smoothing_strength=0.8,
            ignore=["lm_head"],
            num_calibration_steps=size,
            block_size=BLOCK_SIZE,
        ),
        GPTQModifier(
            scheme=scheme,
            targets="Linear",
            ignore=["lm_head"],
            block_size=BLOCK_SIZE,
        ),
    ]

    # One-shot quantisation pass over the calibration subset.
    oneshot(
        model=model,
        dataset=calib_ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=size,
    )

    # Evaluate the quantised model on the device it already lives on.
    model.eval()
    device = next(model.parameters()).device
    metrics = evaluate_mmlu(eval_ds, model, tokenizer, device, mmlu_harness_hf)
    display_metric(f"SmoothQuant W{BIT}A8 Size {size}", metrics)
    key = f"SmoothQuant W{BIT}A8 calib{size}"
    all_metrics[key] = metrics

    # Persist the metrics before uploading, so a failed push (network, auth)
    # does not lose the results of this run.
    with open(metrics_path, "w") as f:
        json.dump(all_metrics, f, indent=2)

    push_name = f"{hub_prefix}{size}"
    tokenizer.push_to_hub(push_name)
    model.push_to_hub(push_name)

    # Drop the model and release cached GPU memory before the next run.
    del model
    gc.collect()
    torch.cuda.empty_cache()



-- SmoothQuant W4A8 with 200 calibration prompts --



200it [00:07, 25.53it/s]
Preparing intermediates cache: 100%|██████████| 200/200 [00:00<00:00, 2113.43it/s]
(1/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 315.03it/s]
(1/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 456.17it/s]
(2/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 347.76it/s]
(2/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 608.32it/s]
(3/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 354.86it/s]
(3/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 551.72it/s]
(4/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 353.94it/s]
(4/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 648.36it/s]
(5/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 354.95it/s]
(5/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 656.98it/s]
(6/29): Calibrating: 100%|██████████| 200/200 [00:00<00:00, 355.05it/s]
(6/29): Propagating: 100%|██████████| 200/200 [00:00<00:00, 661.32it/s]
(7/29): Calibrating: 100%|██


**SmoothQuant W4A8 Size 200 Evaluation Results**
- Accuracy              :  43.12 %
- Avg. inference time   : 2551.4 ms
- Throughput (tok/s)    :  163.5
- Avg. peak VRAM        : 2651.0 MB
- Score Acc/VRAM        :  0.163 



Quantized Compression: 100%|██████████| 703/703 [00:16<00:00, 42.64it/s]


model.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]