# MMLU Evaluation – LLaMA-2-7B LoRA vs QLoRA

This notebook evaluates two fine-tuned models:

- 4-bit QLoRA on LLaMA-2-7B
- 16-bit LoRA on LLaMA-2-7B

Both models are scored on the 5-shot MMLU benchmark (multiple choice) using the same test split, and their accuracies are then compared.


1. Setup & Dependencies

In [None]:
import gc
import json
import os

import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Pick the CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

2. Configurations

In [None]:
# Hugging Face Hub id of the base checkpoint that both adapters are attached to.
BASE_MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# All adapters and the score file live under one results directory.
RESULTS_DIR = "results"

# Adapter checkpoints loaded by the two evaluation cells.
ADAPTER_DIR_4BIT = f"{RESULTS_DIR}/llama7b_4bit_qlora"
ADAPTER_DIR_16BIT = f"{RESULTS_DIR}/llama7b_16bit_lora"

# JSON file the final accuracies are written to.
MMLU_RESULTS_PATH = f"{RESULTS_DIR}/mmlu_scores.json"

# Disabled alternative: read the qlora repo JSONs from a local data directory.
#MMLU_JSON_DIR = "data/mmlu"  # e.g. after cloning / downloading qlora data
#MMLU_JSON_FILE = "five_shot_mmlu_test.json"

3. Load 5-shot MMLU dataset

In [None]:
!git clone https://github.com/artidoro/qlora.git
MMLU_DIR = "qlora/data/mmlu"

mmlu = load_dataset(
    "json",
    data_files={"test": f"{MMLU_DIR}/five_shot_mmlu_test.json"},
)["test"]

print("MMLU 5-shot test size:", len(mmlu))
print(mmlu[0])

4. Scoring functions

In [None]:
# Candidate answer letters, in the order their scores are compared.
LETTERS = list("ABCD")


def format_example_5shot(ex):
    """Return the prompt text of a dataset record, i.e. its "input" field."""
    prompt_text = ex["input"]
    return prompt_text

def score_choice(prompt, choice_letter):
    """
    Compute log-probability of the answer letter (" A"/" B"/...) given the prompt.

    Uses the notebook-level globals `tokenizer` and `model`, which must be
    assigned before this function is called.

    Args:
        prompt: Prompt text to condition on.
        choice_letter: Candidate answer letter; it is scored as
            " " + choice_letter.

    Returns:
        float: Sum of the log-probabilities of the answer tokens, each
        conditioned on the prompt and the preceding answer tokens.
    """
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    # No special tokens here: the answer is a continuation of the prompt.
    # NOTE(review): the answer is tokenized separately from the prompt, so its
    # token split may differ from tokenizing prompt + answer as one string -
    # confirm this matches how the adapters were trained.
    choice_ids = tokenizer(" " + choice_letter, add_special_tokens=False).input_ids
    choice_ids = torch.tensor([choice_ids]).to(model.device)

    input_ids = torch.cat([prompt_ids, choice_ids], dim=1)

    with torch.no_grad():
        logits = model(input_ids).logits

    # The logits at position t predict token t + 1, so the answer tokens are
    # predicted by the `choice_len` positions that end one before the last.
    choice_len = choice_ids.shape[1]
    logits_for_choice = logits[:, -choice_len - 1:-1, :]

    # Upcast before the softmax: if the model returns bf16/fp16 logits, the
    # log-probabilities would otherwise be rounded to very few significant
    # digits, which can create ties or flip the ranking of close candidates.
    log_probs = F.log_softmax(logits_for_choice.float(), dim=-1)
    token_log_probs = log_probs.gather(-1, choice_ids.unsqueeze(-1)).squeeze(-1)
    return float(token_log_probs.sum().cpu())

def evaluate_mmlu_5shot(dataset, max_samples=None, verbose_every=50):
    """
    Compute multiple-choice accuracy over `dataset`.

    For every example the prompt is scored once per letter in `LETTERS` with
    `score_choice`; the highest-scoring letter is the prediction and is
    compared with the first non-whitespace character of the "output" field.

    Args:
        dataset: Iterable of records with "input" and "output" fields.
        max_samples: Evaluate only the first `max_samples` examples;
            None evaluates everything.
        verbose_every: Print the running accuracy every this many examples;
            None or 0 disables progress output.

    Returns:
        float: Fraction of correct predictions, or 0.0 when no example was
        evaluated.
    """
    n_correct = 0
    n_total = 0

    for i, ex in enumerate(dataset):
        if max_samples is not None and i >= max_samples:
            break

        prompt = format_example_5shot(ex)

        # One score per candidate letter; argmax returns the first maximum on ties.
        scores = [score_choice(prompt, letter) for letter in LETTERS]
        pred_letter = LETTERS[int(np.argmax(scores))]

        gold_letter = str(ex["output"]).strip()[0]

        if pred_letter == gold_letter:
            n_correct += 1
        n_total += 1

        # Truthiness check: verbose_every=0 means "no progress output" instead
        # of raising ZeroDivisionError in the modulo below.
        if verbose_every and (i + 1) % verbose_every == 0:
            print(f"{i+1} examples, running accuracy = {n_correct / n_total:.4f}")

    return n_correct / n_total if n_total > 0 else 0.0

6. Evaluate 4-bit QLoRA

In [None]:
# Tokenizer of the base checkpoint (slow implementation); EOS is used as pad token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model and wrap it with the QLoRA adapter in one
# expression, so only the wrapped model is bound to a notebook-level name.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
    ),
    ADAPTER_DIR_4BIT,
)
model.eval()

# Score the full MMLU test split (no sample cap).
acc_4bit = evaluate_mmlu_5shot(mmlu, max_samples=None)
print(f"4-bit QLoRA MMLU 5-shot accuracy: {acc_4bit:.4%}")

7. Evaluate 16-bit LoRA

In [None]:
# Release whatever model is currently bound (e.g. the 4-bit one) BEFORE
# loading the bf16 weights. Without this the old model stays referenced while
# `from_pretrained` runs, so both models occupy memory at the same time.
# Assigning None (rather than `del`) also works when no model was loaded yet.
model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Reload the tokenizer (same base model) so this cell can run on its own.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# base model in bf16
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# attach LoRA adapter
model = PeftModel.from_pretrained(model, ADAPTER_DIR_16BIT)
model.eval()

# Score the full MMLU test split (no sample cap).
acc_16bit = evaluate_mmlu_5shot(mmlu, max_samples=None)
print(f"16-bit LoRA MMLU 5-shot accuracy: {acc_16bit:.4%}")

8. Compare & Save Results

In [None]:
# Create the output directory only when the path has one: os.path.dirname
# returns "" for a bare file name, and os.makedirs("") raises.
results_dir = os.path.dirname(MMLU_RESULTS_PATH)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Load the existing scores file if present, so other entries are preserved.
if os.path.exists(MMLU_RESULTS_PATH):
    with open(MMLU_RESULTS_PATH, "r") as f:
        scores = json.load(f)
else:
    scores = {}

# Add or overwrite the two entries produced by this notebook.
scores["llama2_7b_4bit_qlora"] = {
    "mmlu_5shot_accuracy": float(acc_4bit),
}
scores["llama2_7b_16bit_lora"] = {
    "mmlu_5shot_accuracy": float(acc_16bit),
}

with open(MMLU_RESULTS_PATH, "w") as f:
    json.dump(scores, f, indent=2)

# Side-by-side comparison of the two runs.
print(f"Accuracy difference (16-bit LoRA - 4-bit QLoRA): {acc_16bit - acc_4bit:+.4%}")

print("\nSaved MMLU scores to", MMLU_RESULTS_PATH)
print(json.dumps(scores, indent=2))