In [1]:
from datasets import load_dataset, Features, Value
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from torch.utils.data import DataLoader
import torch
import gc
from tqdm.notebook import tqdm
from transformers import GPTQConfig
import os
import logging
from typing import Dict, List, Tuple
import numpy as np
import json
from evaluation_utils import evaluate_mmlu, mmlu_harness_hf, display_metric

# EfficientQAT 4-bit group size 64

In [2]:
# Accumulator for evaluation results: run label -> metrics returned by evaluate_mmlu.
# Filled in by the evaluation cells and written to disk by the final cell.
all_metrics = dict()

In [None]:
# Evaluate the EfficientQAT w4g64 checkpoint on the "test" split of the
# NLP4Education 4-choice MCQ dataset and record its metrics in `all_metrics`.
run_name = "QAT-W4G64"
repo_id = 'TheS3b/Qwen3-EfficientQAT-w4g64'

# Drop any model/tokenizer left over from a previous cell run *before*
# collecting. While these globals still reference the old objects,
# gc.collect() / empty_cache() cannot release their memory, and the old and
# new checkpoints would both be resident while from_pretrained() runs.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

# NOTE: trust_remote_code=True executes Python code shipped in the Hub repo;
# only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the model's first parameter, handed to the evaluation helper.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
# Evaluation needs no gradients; this is a no-op if evaluate_mmlu already
# disables them internally.
with torch.no_grad():
    metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric(run_name, metrics)
all_metrics[run_name] = metrics


INFO  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
INFO  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.
INFO  Kernel: Auto-selection: adding candidate `ExllamaV2QuantLinear`


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Evaluating:   8%|▊         | 162/1962 [00:24<04:23,  6.83it/s]

# EfficientQAT 2-bit group size 64

In [None]:
# Evaluate the EfficientQAT w2g64 checkpoint on the "test" split of the
# NLP4Education 4-choice MCQ dataset and record its metrics in `all_metrics`.
run_name = "QAT-W2G64"
repo_id = 'TheS3b/Qwen3-EfficientQAT-w2g64'

# Release the previously evaluated model *before* loading this one. While
# the globals `model` / `tokenizer` still reference the old objects,
# gc.collect() / empty_cache() cannot free their memory, so both checkpoints
# would be resident while from_pretrained() runs.
model = tokenizer = None
gc.collect()
torch.cuda.empty_cache()

# NOTE: trust_remote_code=True executes Python code shipped in the Hub repo;
# only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
# Device of the model's first parameter, handed to the evaluation helper.
device = next(model.parameters()).device

ds = load_dataset("brygotti/NLP4Education_english_single_mcq_4_choices")["test"]
# Evaluation needs no gradients; this is a no-op if evaluate_mmlu already
# disables them internally.
with torch.no_grad():
    metrics = evaluate_mmlu(ds, model, tokenizer, device, mmlu_harness_hf)

display_metric(run_name, metrics)
all_metrics[run_name] = metrics

In [None]:
# Persist every collected run's metrics as pretty-printed JSON.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError after all the evaluation work has already been done.
os.makedirs("Results", exist_ok=True)
# NOTE(review): json.dump raises TypeError on values that are not plain JSON
# types (e.g. numpy scalars) — confirm evaluate_mmlu returns plain Python values.
with open("Results/QAT-metrics.json", "w", encoding="utf-8") as f:
    json.dump(all_metrics, f, indent=2)