In [1]:
from datasets import load_dataset, Features, Value
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from torch.utils.data import DataLoader
import torch
import gc
from tqdm.notebook import tqdm
from transformers import GPTQConfig
import os
import logging
from typing import Dict, List, Tuple
import numpy as np
import json


In [2]:
import time
from typing import Dict, List

import torch
from tqdm import tqdm

# Option labels in presentation order: position i labels the i-th answer choice
# of a question and is also the index used to identify the gold answer.
LETTER_INDICES: List[str] = ["A", "B", "C", "D"]


class Doc:
    """
    One evaluation item: the prompt text (`query`), the candidate
    continuations to score (`choices`) and the position of the correct
    candidate within `choices` (`gold_index`).
    """

    def __init__(self, query: str, choices: List[str], gold_index: int):
        # Values are stored as given; nothing is copied or validated.
        self.query, self.choices, self.gold_index = query, choices, gold_index

def mmlu_harness_hf(
    ex, topic: str = "advanced master-level STEM courses"
) -> Doc:
    """
    Convert a raw multiple-choice example into a Doc object understood by
    the evaluator.

    Args:
        ex: Mapping with the keys "question" (str), "choices" (sequence of
            option texts) and "answer". The answer may be either an option
            letter from LETTER_INDICES (case and surrounding whitespace are
            ignored) or an integer position into the options.
        topic: Subject named in the header line of the prompt.

    Returns:
        A Doc whose `query` is the formatted prompt ending in "Answer:",
        whose `choices` are the option letters that were actually shown
        (each prefixed with a space) and whose `gold_index` is the position
        of the correct letter.

    Raises:
        ValueError: if the answer does not refer to one of the shown options.
    """
    question = ex["question"]
    choices = ex["choices"]
    answer = ex["answer"]

    # Only letters that label a displayed option are valid candidates.
    # Options beyond len(LETTER_INDICES) are dropped, exactly as zip() would.
    letters = LETTER_INDICES[: len(choices)]

    prompt = f"The following are multiple choice questions about {topic}.\n\n"
    prompt += question + "\n"
    for letter, text in zip(letters, choices):
        prompt += f"{letter}. {text}\n"
    prompt += "Answer:"

    if isinstance(answer, int) and not isinstance(answer, bool):
        # Index-style gold labels (the format used by the standard MMLU dumps).
        gold_ix = answer
        if not 0 <= gold_ix < len(letters):
            raise ValueError(
                f"answer index {answer!r} is out of range for {len(letters)} options"
            )
    else:
        label = str(answer).strip().upper()
        if label not in letters:
            raise ValueError(
                f"answer {answer!r} is not one of the option letters {letters}"
            )
        gold_ix = letters.index(label)

    # prepend a space before each candidate, as required by the original prompt logic
    return Doc(prompt, [f" {c}" for c in letters], gold_ix)

@torch.no_grad()
def score_choice(
    model: torch.nn.Module,
    tokenizer,
    device: torch.device,
    prompt: str,
    choice: str,
) -> float:
    """
    Score `choice` as a continuation of `prompt`.

    The prompt is tokenized alone and together with the choice; the tokens of
    the joint encoding that lie beyond the prompt's token count are treated as
    the choice tokens. The return value is the sum of the log-probabilities
    the model assigns to those tokens (0.0 when there are none).
    """

    def encode(text: str):
        # Special tokens are disabled so the prompt's token count can be used
        # as an offset into the joint encoding.
        return tokenizer(
            text, return_tensors="pt", add_special_tokens=False
        ).to(device)

    context = encode(prompt)
    full = encode(prompt + choice)
    token_ids = full.input_ids

    # Single forward pass over prompt + choice.
    outputs = model(input_ids=token_ids, attention_mask=full.attention_mask)
    token_log_probs = torch.log_softmax(outputs.logits, dim=-1)

    first_choice_pos = context.input_ids.size(1)
    score = 0.0
    choice_token_ids = token_ids[0, first_choice_pos:].tolist()
    for position, target in enumerate(choice_token_ids, start=first_choice_pos):
        # The logits at position t predict the token at t + 1, hence the -1.
        score += token_log_probs[0, position - 1, target].item()

    return score

def evaluate_mmlu(
    dataset,
    model: torch.nn.Module,
    tokenizer,
    device: torch.device,
    harness_fn,
) -> Dict[str, float]:
    """
    Evaluate `model` on a multiple-choice dataset and collect cost metrics.

    For every example, `harness_fn` turns the raw record into a Doc, each
    candidate in `doc.choices` is scored with `score_choice`, and the
    highest-scoring candidate is taken as the prediction.

    Args:
        dataset: Iterable of raw examples accepted by `harness_fn`.
        model: Causal language model handed to `score_choice`.
        tokenizer: Tokenizer matching `model`.
        device: Device the inputs are moved to; CUDA-only statistics are
            gathered when `device.type == "cuda"`.
        harness_fn: Callable mapping one raw example to a Doc.

    Returns:
        Dict with the keys
        - "accuracy": fraction of examples predicted correctly,
        - "avg_time_s": mean wall-clock seconds spent scoring one example,
        - "tokens_per_s": tokens fed to the model per second of scoring time,
        - "avg_peak_vram_MB": mean per-example peak of allocated CUDA memory,
          in bytes / 1024**2 (NaN when not running on CUDA),
        - "score_acc_over_vram": 1000 * accuracy / avg_peak_vram_MB.
        Every value is NaN when `dataset` yields no examples.
    """
    nan = float("nan")
    on_cuda = device.type == "cuda"

    correct = total = 0
    total_time = 0.0
    total_tokens = 0
    per_example_peaks: List[int] = []

    for example in tqdm(dataset, desc="Evaluating"):
        # Prompt construction and token counting are bookkeeping, not model
        # inference, so they run before the timer starts.
        doc = harness_fn(example)
        for choice in doc.choices:
            # same construction we pass to the model
            ids = tokenizer(doc.query + choice,
                            add_special_tokens=False).input_ids
            total_tokens += len(ids)

        if on_cuda:
            torch.cuda.reset_peak_memory_stats(device)

        start = time.perf_counter()

        scores = [
            score_choice(model, tokenizer, device, doc.query, c)
            for c in doc.choices
        ]

        if on_cuda:
            # Wait for queued kernels so the elapsed time covers the GPU work.
            torch.cuda.synchronize(device)

        total_time += time.perf_counter() - start

        if on_cuda:
            per_example_peaks.append(torch.cuda.max_memory_allocated(device))

        pred = int(torch.argmax(torch.tensor(scores)))
        correct += (pred == doc.gold_index)
        total += 1

    if total == 0:
        # Nothing was evaluated: report NaNs instead of dividing by zero.
        return {
            "accuracy": nan,
            "avg_time_s": nan,
            "tokens_per_s": nan,
            "avg_peak_vram_MB": nan,
            "score_acc_over_vram": nan,
        }

    accuracy = correct / total
    avg_time = total_time / total
    tokens_per_s = total_tokens / total_time if total_time > 0 else nan
    avg_peak_vram = (
        (sum(per_example_peaks) / len(per_example_peaks)) / 1024**2
        if per_example_peaks else nan
    )
    # A NaN or zero VRAM figure fails the `> 0` test, so the ratio is NaN then.
    score_acc_over_vram = (
        1000 * accuracy / avg_peak_vram if avg_peak_vram > 0 else nan
    )

    return {
        "accuracy": accuracy,
        "avg_time_s": avg_time,
        "tokens_per_s": tokens_per_s,
        "avg_peak_vram_MB": avg_peak_vram,
        "score_acc_over_vram": score_acc_over_vram,
    }


def display_metric(name, metrics):
    """
    Print a summary of one evaluation run under the heading `name`.

    `metrics` must provide the keys "accuracy", "avg_time_s", "tokens_per_s",
    "avg_peak_vram_MB" and "score_acc_over_vram". Accuracy is shown as a
    percentage and the average time in milliseconds.
    """
    report_lines = [
        "",
        f"**{name} Evaluation Results**",
        f"- Accuracy              : {metrics['accuracy'] * 100:6.2f} %",
        f"- Avg. inference time   : {metrics['avg_time_s'] * 1_000:6.1f} ms",
        f"- Throughput (tok/s)    : {metrics['tokens_per_s']:6.1f}",
        f"- Avg. peak VRAM        : {metrics['avg_peak_vram_MB']:6.1f} MB",
        f"- Score Acc/VRAM        : {metrics['score_acc_over_vram']:6.3f} ",
        "",
    ]
    print("\n".join(report_lines))


In [3]:
# 1) Load the evaluation benchmark from the Hub and show its layout:
#    the DatasetDict summary, the test split's column names and its row count.
ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")
print(ds, ds["test"].column_names, len(ds["test"]), sep="\n")

DatasetDict({
    test: Dataset({
        features: ['id', 'question', 'choices', 'answer'],
        num_rows: 1962
    })
})
['id', 'question', 'choices', 'answer']
1962


In [4]:
# Hub repo of the pretrained base checkpoint, evaluated below as "Base Model".
base_repo  = "Qwen/Qwen3-0.6B-Base"
# Hub repo of the checkpoint evaluated below as "SFT Model"; it is also the
# starting point for every quantized variant in this notebook.
model_repo = "brygotti/MNLP_M2_mcqa_model"

# Evaluation results keyed by a human-readable run label; filled in cell by cell.
all_metrics = {}

# Simple Quantization

In [5]:
# Reference run: the unquantized base checkpoint.
# Reclaim Python garbage and cached CUDA blocks before loading a model.
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(base_repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_repo, trust_remote_code=True, device_map="auto"
).eval()
# Inputs are sent to whichever device holds the model's first parameter.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

all_metrics['Base Model'] = metrics
display_metric("Base Model", metrics)

Evaluating: 100%|██████████| 1962/1962 [05:06<00:00,  6.41it/s]


**Base Model Evaluation Results**
- Accuracy              :  44.34 %
- Avg. inference time   :  155.1 ms
- Throughput (tok/s)    : 2690.3
- Avg. peak VRAM        : 2402.8 MB
- Score Acc/VRAM        :  0.185 






In [6]:
# Reference run: the fine-tuned checkpoint without any quantization.
# Reclaim Python garbage and cached CUDA blocks before loading a model.
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_repo, trust_remote_code=True, device_map="auto"
).eval()
# Inputs are sent to whichever device holds the model's first parameter.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

all_metrics['SFT Model'] = metrics
display_metric("SFT Model", metrics)

Evaluating: 100%|██████████| 1962/1962 [05:11<00:00,  6.31it/s]


**SFT Model Evaluation Results**
- Accuracy              :  46.84 %
- Avg. inference time   :  157.6 ms
- Throughput (tok/s)    : 2648.0
- Avg. peak VRAM        : 2402.8 MB
- Score Acc/VRAM        :  0.195 






In [7]:
# bitsandbytes 4-bit (NF4) run on the fine-tuned checkpoint.
# Reclaim Python garbage and cached CUDA blocks before loading a model.
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quant_config,
).eval()
# Inputs are sent to whichever device holds the model's first parameter.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

all_metrics['BnB 4-bit Model'] = metrics
display_metric("BnB 4-bit Model", metrics)

# Publish tokenizer and quantized weights; the model push stays last so its
# CommitInfo is the cell's displayed result.
tokenizer.push_to_hub("TheS3b/Qwen3-0.6B-bnb-4bit")
model.push_to_hub("TheS3b/Qwen3-0.6B-bnb-4bit")

Evaluating: 100%|██████████| 1962/1962 [07:54<00:00,  4.13it/s]



**BnB 4-bit Model Evaluation Results**
- Accuracy              :  43.27 %
- Avg. inference time   :  241.0 ms
- Throughput (tok/s)    : 1731.5
- Avg. peak VRAM        :  616.0 MB
- Score Acc/VRAM        :  0.702 



No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-4bit/commit/d33c0d03e07e8e28b6385d59666d64c53e9fb918', commit_message='Upload Qwen3ForCausalLM', commit_description='', oid='d33c0d03e07e8e28b6385d59666d64c53e9fb918', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-4bit', endpoint='https://huggingface.co', repo_type='model', repo_id='TheS3b/Qwen3-0.6B-bnb-4bit'), pr_revision=None, pr_num=None)

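With the bitsandbytes NF4 configuration above, each weight is quantized to 4 bits and two such values are packed into one 8-bit integer for storage. During inference the weights are dequantized on the fly and the computations (and their intermediate results) use 16-bit floats (`bfloat16`, as set by `bnb_4bit_compute_dtype`).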

In [8]:
# bitsandbytes 8-bit (LLM.int8) run on the fine-tuned checkpoint.
# Reclaim Python garbage and cached CUDA blocks before loading a model.
gc.collect()
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,  # keep all weights int8 on GPU, better VRAM
)

model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quant_config,
).eval()
# Inputs are sent to whichever device holds the model's first parameter.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

all_metrics['BnB 8-bit Model'] = metrics
display_metric("BnB 8-bit Model", metrics)

# Publish tokenizer and quantized weights; the model push stays last so its
# CommitInfo is the cell's displayed result.
tokenizer.push_to_hub("TheS3b/Qwen3-0.6B-bnb-8bit")
model.push_to_hub("TheS3b/Qwen3-0.6B-bnb-8bit")

Evaluating: 100%|██████████| 1962/1962 [16:28<00:00,  1.99it/s]



**BnB 8-bit Model Evaluation Results**
- Accuracy              :  47.30 %
- Avg. inference time   :  502.5 ms
- Throughput (tok/s)    :  830.3
- Avg. peak VRAM        : 1231.5 MB
- Score Acc/VRAM        :  0.384 



No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-8bit/commit/5c61abc5243fe0f7cb7c2d2ec2c0552af4bdaec6', commit_message='Upload Qwen3ForCausalLM', commit_description='', oid='5c61abc5243fe0f7cb7c2d2ec2c0552af4bdaec6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TheS3b/Qwen3-0.6B-bnb-8bit', endpoint='https://huggingface.co', repo_type='model', repo_id='TheS3b/Qwen3-0.6B-bnb-8bit'), pr_revision=None, pr_num=None)

In [9]:
# Intermediate save of results
import json

# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists first (`os` is imported in the notebook's first cell).
os.makedirs("Results", exist_ok=True)
with open("Results/quantization_metrics.json", "w") as f:
    json.dump(all_metrics, f, indent=2)


# GPTQ

In [10]:
# Read the saved metrics back so the GPTQ runs below add their entries to them.
with open("Results/quantization_metrics.json") as metrics_file:
    all_metrics = json.loads(metrics_file.read())

In [11]:
# Stem of the Hub repo names for the GPTQ checkpoints; the number of
# calibration prompts is appended to it for each run.
hub_prefix = "TheS3b/Qwen3-0.6B-GPTQ-4bit-rel0.5-calib"

# Set EXLLAMA_KERNELS_AVAILABLE to "0" (presumably to keep the GPTQ backend off
# the ExLlama kernels — TODO confirm) and suppress log records at INFO level
# and below.
os.environ["EXLLAMA_KERNELS_AVAILABLE"] = "0"
logging.disable(logging.INFO)

# Tokenizer of the checkpoint being quantized; the calibration-prompt length
# filter measures prompts with it.
tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)

# Pool of candidate calibration prompts (filtered before use).
calibration_data = load_dataset("TheS3b/unified-dataset-filtered-430K")

def is_valid_prompt(example, min_len=64, max_len=256, thresh=0.5):
    """
    Decide whether a calibration example is kept.

    An example passes when the mean of its "relevance1" and "relevance2"
    values is strictly greater than `thresh` and its "prompt" tokenizes (with
    the module-level `tokenizer`) to between `min_len` and `max_len` tokens,
    both bounds inclusive.
    """
    # Cheap test first: rows that fail on relevance are rejected without
    # paying for tokenization.
    mean_relevance = (example["relevance1"] + example["relevance2"]) * 0.5
    if not mean_relevance > thresh:
        return False

    # Only the token count is needed, so take the length of the plain id list
    # instead of materializing a tensor.
    n_tokens = len(tokenizer(example["prompt"])["input_ids"])
    return min_len <= n_tokens <= max_len

# Keep only prompts accepted by is_valid_prompt, then shuffle with a fixed seed
# so every calibration subset below is a reproducible prefix of one ordering.
filtered_calibration_set = calibration_data.filter(lambda ex: is_valid_prompt(ex), batched=False).shuffle(seed=42)["train"]

# Numbers of calibration prompts to compare
prompt_sizes = [20, 200, 2000]

# Fail before any quantization starts if the filtered pool cannot serve the
# largest run; otherwise the error would only surface after the smaller runs.
n_available = len(filtered_calibration_set)
if max(prompt_sizes) > n_available:
    raise ValueError(
        f"Only {n_available} calibration prompts passed the filter, "
        f"but {max(prompt_sizes)} are required."
    )

# Dataset to evaluate on
eval_ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]

# Results file, rewritten after every run; make sure its folder exists.
metrics_path = "Results/quantization_metrics.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

for size in prompt_sizes:
    print(f"\n -- Quantizing with {size} calibration prompts -- \n")

    gc.collect()
    torch.cuda.empty_cache()

    # GPTQConfig expects a plain list of strings for a custom calibration set,
    # so convert whatever column type the dataset returns.
    calibration_prompts = list(filtered_calibration_set.select(range(size))["prompt"])

    quant_config = GPTQConfig(
        bits=4,
        tokenizer=model_repo,
        dataset=calibration_prompts,
    )

    # The tokenizer loaded before the loop is reused: GPTQConfig receives the
    # repo name rather than this object, so there is no need to reload it.
    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        device_map="cuda",
        trust_remote_code=True,
        quantization_config=quant_config
    )

    model.eval()
    device = next(model.parameters()).device

    metrics = evaluate_mmlu(eval_ds, model, tokenizer, device, mmlu_harness_hf)

    display_metric(f"GPTQ Model Size {size}", metrics)

    # Save under a unique key
    key = f"GPTQ 4bit calib{size}"
    all_metrics[key] = metrics

    # Persist the results before uploading, so a failed push (network, auth)
    # does not throw away the evaluation that was just computed.
    with open(metrics_path, "w") as f:
        json.dump(all_metrics, f, indent=2)

    push_name = f"{hub_prefix}{size}"
    tokenizer.push_to_hub(push_name)
    model.push_to_hub(push_name)

    # Drop the quantized model so the next run starts from a clean GPU.
    del model
    gc.collect()
    torch.cuda.empty_cache()


 -- Quantizing with 20 calibration prompts -- 


[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


Quantizing model.layers blocks :   0%|          | 0/28 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`   


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


[32mINFO[0m  Optimize: `TritonV2QuantLinear` compilation triggered.                   


Evaluating: 100%|██████████| 1962/1962 [08:02<00:00,  4.07it/s]



**GPTQ Model Size 20 Evaluation Results**
- Accuracy              :  44.60 %
- Avg. inference time   :  244.6 ms
- Throughput (tok/s)    : 1706.0
- Avg. peak VRAM        :  599.3 MB
- Score Acc/VRAM        :  0.744 



No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.



 -- Quantizing with 200 calibration prompts -- 



Quantizing model.layers blocks :   0%|          | 0/28 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`   


Evaluating: 100%|██████████| 1962/1962 [08:04<00:00,  4.05it/s]



**GPTQ Model Size 200 Evaluation Results**
- Accuracy              :  43.83 %
- Avg. inference time   :  245.9 ms
- Throughput (tok/s)    : 1696.9
- Avg. peak VRAM        :  595.0 MB
- Score Acc/VRAM        :  0.737 



No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.



 -- Quantizing with 2000 calibration prompts -- 



Quantizing model.layers blocks :   0%|          | 0/28 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`   


Evaluating: 100%|██████████| 1962/1962 [08:00<00:00,  4.09it/s]



**GPTQ Model Size 2000 Evaluation Results**
- Accuracy              :  44.90 %
- Avg. inference time   :  243.7 ms
- Throughput (tok/s)    : 1712.2
- Avg. peak VRAM        :  595.0 MB
- Score Acc/VRAM        :  0.755 



No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
