In [6]:
from datasets import load_dataset, Features, Value
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from torch.utils.data import DataLoader
import torch
import gc
from tqdm.notebook import tqdm
from transformers import GPTQConfig
import os
import logging
from typing import Dict, List, Tuple
import numpy as np
import json
from evaluation_utils import evaluate_mmlu, mmlu_harness_hf, display_metric

# EfficientQAT 4-bit group size 64

In [2]:
# Accumulator for the evaluation results of this section; later cells store
# one metrics object per run label in it.
all_metrics = dict()

In [3]:
# Evaluate the EfficientQAT 4-bit / group-size-64 checkpoint on the test split
# of the NLP4Education 4-choice MCQ dataset and record the metrics.

# Drop references left over from a previous run of an evaluation cell first:
# as long as `model` / `tokenizer` still point at the old objects,
# gc.collect() and empty_cache() cannot give their GPU memory back.
# Plain assignment (rather than `del`) keeps this safe on a fresh kernel.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

repo_id = 'TheS3b/Qwen3-EfficientQAT-w4g64'

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the model's first parameter; handed to the evaluator below.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
# evaluate_mmlu / mmlu_harness_hf come from the local evaluation_utils module.
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("QAT-W4G64", metrics)
all_metrics['QAT-W4G64'] = metrics


[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          
[32mINFO[0m   Kernel: Auto-selection: adding candidate `ExllamaV2QuantLinear`         


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Evaluating: 100%|██████████| 1962/1962 [04:50<00:00,  6.77it/s]


**QAT-W4G64 Evaluation Results**
- Accuracy              :  45.87 %
- Avg. inference time   :  147.0 ms
- Throughput (tok/s)    : 2838.9
- Avg. peak VRAM        :  792.1 MB
- Score Acc/VRAM        :  0.579 






# EfficientQAT 2-bit group size 64

In [4]:
# Evaluate the EfficientQAT 2-bit / group-size-64 checkpoint on the test split
# of the NLP4Education 4-choice MCQ dataset and record the metrics.

# Release the model and tokenizer from the previous evaluation cell before
# collecting: while the notebook globals still reference them, gc.collect()
# and empty_cache() cannot free their GPU memory, and the old and new model
# would both be resident during loading.
# Plain assignment (rather than `del`) keeps this safe on a fresh kernel.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

repo_id = 'TheS3b/Qwen3-EfficientQAT-w2g64'

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the model's first parameter; handed to the evaluator below.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
# evaluate_mmlu / mmlu_harness_hf come from the local evaluation_utils module.
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("QAT-W2G64", metrics)
all_metrics['QAT-W2G64'] = metrics

[32mINFO[0m   Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`          


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


[32mINFO[0m  Optimize: `TritonV2QuantLinear` compilation triggered.                   


Evaluating: 100%|██████████| 1962/1962 [07:46<00:00,  4.21it/s]


**QAT-W2G64 Evaluation Results**
- Accuracy              :  37.26 %
- Avg. inference time   :  236.6 ms
- Throughput (tok/s)    : 1763.8
- Avg. peak VRAM        :  486.4 MB
- Score Acc/VRAM        :  0.766 






In [5]:
# Persist the metrics collected so far as pretty-printed JSON.
# Create the output directory first: open(..., "w") raises FileNotFoundError
# when "Results/" does not exist yet (e.g. on a fresh checkout), which would
# lose the results of the long evaluation runs above.
os.makedirs("Results", exist_ok=True)
with open("Results/QAT-metrics.json", "w") as f:
    json.dump(all_metrics, f, indent=2)

# MMLU

In [7]:
# Start a fresh accumulator for the MMLU runs, which are saved to their own
# JSON file at the end of this section.
all_metrics = dict()

In [8]:
# Evaluate the EfficientQAT 4-bit / group-size-64 checkpoint on the test split
# of the "brygotti/mmlu" dataset and record the metrics.

# Release the model and tokenizer from any previous evaluation cell before
# collecting: while the notebook globals still reference them, gc.collect()
# and empty_cache() cannot free their GPU memory.
# Plain assignment (rather than `del`) keeps this safe on a fresh kernel.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

repo_id = 'TheS3b/Qwen3-EfficientQAT-w4g64'

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the model's first parameter; handed to the evaluator below.
device = next(model.parameters()).device

ds = load_dataset("brygotti/mmlu")["test"]
# evaluate_mmlu / mmlu_harness_hf come from the local evaluation_utils module.
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("QAT-W4G64-MMLU", metrics)
all_metrics['QAT-W4G64-MMLU'] = metrics

[32mINFO[0m   Kernel: Auto-selection: adding candidate `ExllamaV2QuantLinear`         


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Evaluating: 100%|██████████| 14042/14042 [35:39<00:00,  6.56it/s]


**QAT-W4G64-MMLU Evaluation Results**
- Accuracy              :  49.77 %
- Avg. inference time   :  151.3 ms
- Throughput (tok/s)    : 3184.7
- Avg. peak VRAM        :  922.4 MB
- Score Acc/VRAM        :  0.540 






In [9]:
# Evaluate the EfficientQAT 2-bit / group-size-64 checkpoint on the test split
# of the "brygotti/mmlu" dataset and record the metrics.

# Release the model and tokenizer from the previous evaluation cell before
# collecting: while the notebook globals still reference them, gc.collect()
# and empty_cache() cannot free their GPU memory, and the old and new model
# would both be resident during loading.
# Plain assignment (rather than `del`) keeps this safe on a fresh kernel.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

repo_id = 'TheS3b/Qwen3-EfficientQAT-w2g64'

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the model's first parameter; handed to the evaluator below.
device = next(model.parameters()).device

ds = load_dataset("brygotti/mmlu")["test"]
# evaluate_mmlu / mmlu_harness_hf come from the local evaluation_utils module.
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("QAT-W2G64-MMLU", metrics)
all_metrics['QAT-W2G64-MMLU'] = metrics

[32mINFO[0m   Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`          


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Evaluating: 100%|██████████| 14042/14042 [56:25<00:00,  4.15it/s] 


**QAT-W2G64-MMLU Evaluation Results**
- Accuracy              :  37.72 %
- Avg. inference time   :  240.1 ms
- Throughput (tok/s)    : 2007.7
- Avg. peak VRAM        :  495.8 MB
- Score Acc/VRAM        :  0.761 






In [10]:
# Evaluate the GPTQ 4-bit checkpoint on the test split of the "brygotti/mmlu"
# dataset and record the metrics alongside the QAT results.

# Release the model and tokenizer from the previous evaluation cell before
# collecting: while the notebook globals still reference them, gc.collect()
# and empty_cache() cannot free their GPU memory, and the old and new model
# would both be resident during loading.
# Plain assignment (rather than `del`) keeps this safe on a fresh kernel.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

repo_id = 'TheS3b/Qwen3-0.6B-GPTQ-4bit-rel0.5-calib200'

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the model's first parameter; handed to the evaluator below.
device = next(model.parameters()).device

ds = load_dataset("brygotti/mmlu")["test"]
# evaluate_mmlu / mmlu_harness_hf come from the local evaluation_utils module.
metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric("GPTQ-4bit-MMLU", metrics)
all_metrics['GPTQ-4bit-MMLU'] = metrics

tokenizer_config.json:   0%|          | 0.00/5.41k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

[32mINFO[0m   Kernel: Auto-selection: adding candidate `MarlinQuantLinear`            


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

Evaluating: 100%|██████████| 14042/14042 [35:13<00:00,  6.64it/s]


**GPTQ-4bit-MMLU Evaluation Results**
- Accuracy              :  48.02 %
- Avg. inference time   :  149.5 ms
- Throughput (tok/s)    : 3224.6
- Avg. peak VRAM        :  714.3 MB
- Score Acc/VRAM        :  0.672 






In [11]:
# Persist the MMLU metrics collected in this section as pretty-printed JSON.
# Create the output directory first: open(..., "w") raises FileNotFoundError
# when "Results/" does not exist yet (e.g. on a fresh checkout), which would
# lose the results of the long evaluation runs above.
os.makedirs("Results", exist_ok=True)
with open("Results/QAT-metrics-MMLU.json", "w") as f:
    json.dump(all_metrics, f, indent=2)