In [1]:
from datasets import load_dataset, Features, Value
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from torch.utils.data import DataLoader
import torch
import gc
from tqdm.notebook import tqdm
from transformers import GPTQConfig
import os
import logging
from typing import Dict, List, Tuple
import numpy as np
import json
from evaluation_utils import evaluate_mmlu, mmlu_harness_hf, display_metric

In [2]:
# Step 1: download the MCQ benchmark from the Hugging Face Hub and show its
# split layout, the column names of the test split and the number of test rows.
ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")
print(
    ds,
    ds["test"].column_names,
    len(ds["test"]),
    sep="\n",
)

DatasetDict({
    test: Dataset({
        features: ['id', 'question', 'choices', 'answer'],
        num_rows: 1962
    })
})
['id', 'question', 'choices', 'answer']
1962


In [3]:
# Hub repository of the pretrained checkpoint, evaluated below as "Base Model".
base_repo  = "Qwen/Qwen3-0.6B-Base"
# Hub repository of the fine-tuned checkpoint, evaluated below as "SFT Model"
# and used as the starting point for the quantized variants.
model_repo = "brygotti/MNLP_M3_mcqa_model"

# Collects the result of each evaluate_mmlu call, keyed by a display label
# such as 'Base Model'; later serialized to JSON.
all_metrics: dict = {}

# Simple Quantization

In [4]:
# Baseline: evaluate the unquantized base checkpoint on the test split.

# Drop whatever model is still bound to `model` (a previous cell, or an earlier
# run of this one). While that reference is alive the collection below cannot
# release its GPU memory, and the old and new weights would share the device.
model = None
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(base_repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_repo,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the first parameter; handed to the evaluation helper.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("Base Model", metrics)
all_metrics['Base Model'] = metrics

Evaluating: 100%|██████████| 1962/1962 [05:07<00:00,  6.37it/s]


**Base Model Evaluation Results**
- Accuracy              :  44.34 %
- Avg. inference time   :  156.0 ms
- Throughput (tok/s)    : 2674.5
- Avg. peak VRAM        : 2402.8 MB
- Score Acc/VRAM        :  0.185 






In [5]:
# Evaluate the fine-tuned (SFT) checkpoint without any quantization.

# Release the model loaded by the previous cell before cleaning up: as long as
# `model` still references it, gc.collect()/empty_cache() free nothing and both
# sets of weights would sit on the GPU while the new checkpoint is loaded.
model = None
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the first parameter; handed to the evaluation helper.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("SFT Model", metrics)
all_metrics['SFT Model'] = metrics

Evaluating: 100%|██████████| 1962/1962 [05:06<00:00,  6.40it/s]


**SFT Model Evaluation Results**
- Accuracy              :  46.38 %
- Avg. inference time   :  155.4 ms
- Throughput (tok/s)    : 2685.7
- Avg. peak VRAM        : 2402.8 MB
- Score Acc/VRAM        :  0.193 






In [6]:
# bitsandbytes 4-bit quantization of the fine-tuned checkpoint:
# load quantized, evaluate, then publish tokenizer and model to the Hub.

# Release the model loaded by the previous cell before cleaning up: as long as
# `model` still references it, gc.collect()/empty_cache() free nothing and both
# sets of weights would sit on the GPU while the new checkpoint is loaded.
model = None
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
# NF4 quantization type for the weights, bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the first parameter; handed to the evaluation helper.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("BnB 4-bit Model", metrics)
all_metrics['BnB 4-bit Model'] = metrics

# Publish the tokenizer together with the quantized weights under one repo id.
tokenizer.push_to_hub("TheS3b/Qwen3-0.6B-bnb-4bit")
model.push_to_hub("TheS3b/Qwen3-0.6B-bnb-4bit")

Evaluating: 100%|██████████| 1962/1962 [07:43<00:00,  4.23it/s]



**BnB 4-bit Model Evaluation Results**
- Accuracy              :  43.48 %
- Avg. inference time   :  235.6 ms
- Throughput (tok/s)    : 1771.1
- Avg. peak VRAM        :  616.0 MB
- Score Acc/VRAM        :  0.706 



No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-4bit/commit/348cd17ce9d851721bc6ae9e1ffcedd2743121ad', commit_message='Upload Qwen3ForCausalLM', commit_description='', oid='348cd17ce9d851721bc6ae9e1ffcedd2743121ad', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-4bit', endpoint='https://huggingface.co', repo_type='model', repo_id='TheS3b/Qwen3-0.6B-bnb-4bit'), pr_revision=None, pr_num=None)

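Weights are quantized to 4 bits (NF4) and stored packed, two 4-bit values per 8-bit integer. During inference the weights are dequantized on the fly and intermediate results are computed and stored as 16-bit floats (`bfloat16`, as set by `bnb_4bit_compute_dtype`).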

In [7]:
# bitsandbytes 8-bit (LLM.int8) quantization of the fine-tuned checkpoint:
# load quantized, evaluate, then publish tokenizer and model to the Hub.

# Release the model loaded by the previous cell before cleaning up: as long as
# `model` still references it, gc.collect()/empty_cache() free nothing and both
# sets of weights would sit on the GPU while the new checkpoint is loaded.
model = None
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_has_fp16_weight=False, # keep all weights int8 on GPU, better VRAM
    llm_int8_enable_fp32_cpu_offload=False
)

model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the first parameter; handed to the evaluation helper.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("BnB 8-bit Model", metrics)
all_metrics['BnB 8-bit Model'] = metrics

# Publish the tokenizer together with the quantized weights under one repo id.
tokenizer.push_to_hub("TheS3b/Qwen3-0.6B-bnb-8bit")
model.push_to_hub("TheS3b/Qwen3-0.6B-bnb-8bit")

Evaluating: 100%|██████████| 1962/1962 [16:04<00:00,  2.03it/s]



**BnB 8-bit Model Evaluation Results**
- Accuracy              :  46.69 %
- Avg. inference time   :  490.6 ms
- Throughput (tok/s)    :  850.5
- Avg. peak VRAM        : 1231.2 MB
- Score Acc/VRAM        :  0.379 



No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/753M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-8bit/commit/3d034290eb7cfaea1955a60b25388aaf8a4004d3', commit_message='Upload Qwen3ForCausalLM', commit_description='', oid='3d034290eb7cfaea1955a60b25388aaf8a4004d3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-8bit', endpoint='https://huggingface.co', repo_type='model', repo_id='TheS3b/Qwen3-0.6B-bnb-8bit'), pr_revision=None, pr_num=None)

In [8]:
# Intermediate save of results, so the metrics collected so far survive a
# kernel restart and can be read back from disk.
# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists instead of failing with FileNotFoundError after the
# evaluations above have already been run.
os.makedirs("Results", exist_ok=True)
with open("Results/quantization_metrics.json", "w") as f:
    json.dump(all_metrics, f, indent=2)


# GPTQ

In [9]:
# Read the previously saved metrics back from disk so that new results can be
# added to the same dictionary.
with open("Results/quantization_metrics.json") as metrics_file:
    all_metrics = json.load(metrics_file)

In [10]:
# GPTQ 4-bit sweep: quantize the fine-tuned checkpoint with calibration sets of
# increasing size, evaluate each variant, push it to the Hub and persist the
# accumulated metrics after every run.
hub_prefix = "TheS3b/Qwen3-0.6B-GPTQ-4bit-rel0.5-calib"  # base for model names

# Suppress log records of level INFO and below from here on.
logging.disable(logging.INFO)
# NOTE(review): presumably opts out of the ExLlama kernels — confirm that the
# GPTQ backend in use actually reads this variable.
os.environ["EXLLAMA_KERNELS_AVAILABLE"] = "0"

# Load tokenizer once: it does not depend on the calibration size, so the same
# instance is used for evaluation and upload in every iteration.
tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)

filtered_calibration_set = load_dataset("TheS3b/MNLP_M3_quantized_dataset")["train"]

# Prompt sizes to test
prompt_sizes = [20, 200, 2000]

# Dataset to evaluate on
eval_ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]

# Drop any model still bound by an earlier cell so the first iteration's
# cleanup can actually release its GPU memory before quantization starts.
model = None

for size in prompt_sizes:
    print(f"\n -- Quantizing with {size} calibration prompts -- \n")

    gc.collect()
    torch.cuda.empty_cache()

    # First `size` prompts of the calibration set.
    calibration_prompts = filtered_calibration_set.select(range(size))["prompt"]

    quant_config = GPTQConfig(
        bits=4,
        tokenizer=model_repo,
        dataset=calibration_prompts,
    )

    # Loading with a GPTQConfig that carries a calibration dataset runs the
    # quantization as part of from_pretrained (see the progress bars it prints).
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        device_map="cuda",
        trust_remote_code=True,
        quantization_config=quant_config
    )

    model.eval()
    device = next(model.parameters()).device

    metrics = evaluate_mmlu(eval_ds, model, tokenizer, device, mmlu_harness_hf)

    display_metric(f"GPTQ Model Size {size}", metrics)

    # Save under a unique key and push
    key = f"GPTQ 4bit calib{size}"
    all_metrics[key] = metrics

    push_name = f"{hub_prefix}{size}"
    tokenizer.push_to_hub(push_name)
    model.push_to_hub(push_name)

    # Persist after every variant so a failure in a later run loses nothing.
    with open("Results/quantization_metrics.json", "w") as f:
        json.dump(all_metrics, f, indent=2)

    # Free this variant before the next one is quantized.
    del model
    gc.collect()
    torch.cuda.empty_cache()


 -- Quantizing with 20 calibration prompts -- 

ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for compatibililty.


Quantizing model.layers blocks :   0%|          | 0/28 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Evaluating: 100%|██████████| 1962/1962 [05:01<00:00,  6.51it/s]



**GPTQ Model Size 20 Evaluation Results**
- Accuracy              :  43.12 %
- Avg. inference time   :  152.5 ms
- Throughput (tok/s)    : 2735.3
- Avg. peak VRAM        :  613.2 MB
- Score Acc/VRAM        :  0.703 



No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]


 -- Quantizing with 200 calibration prompts -- 



Quantizing model.layers blocks :   0%|          | 0/28 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 1962/1962 [05:03<00:00,  6.47it/s]



**GPTQ Model Size 200 Evaluation Results**
- Accuracy              :  43.83 %
- Avg. inference time   :  153.6 ms
- Throughput (tok/s)    : 2716.5
- Avg. peak VRAM        :  592.8 MB
- Score Acc/VRAM        :  0.739 



No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]


 -- Quantizing with 2000 calibration prompts -- 



Quantizing model.layers blocks :   0%|          | 0/28 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 1962/1962 [05:02<00:00,  6.50it/s]



**GPTQ Model Size 2000 Evaluation Results**
- Accuracy              :  43.93 %
- Avg. inference time   :  152.9 ms
- Throughput (tok/s)    : 2728.8
- Avg. peak VRAM        :  592.8 MB
- Score Acc/VRAM        :  0.741 



No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]