In [1]:
import os
import time
import math
import shutil
import torch
import numpy as np
import evaluate
from datasets import load_dataset, DownloadConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    default_data_collator,
    EvalPrediction,
)
from optimum.intel import INCTrainer, INCModelForCausalLM
from neural_compressor import QuantizationAwareTrainingConfig

2025-09-03 23:43:00.992905: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-03 23:43:01.006372: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-03 23:43:01.023874: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-03 23:43:01.029380: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-03 23:43:01.041980: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# ─── SETTINGS ────────────────────────────────────────────────────────────────
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
PROMPT = (
    "Over the next decade, sustainable energy solutions will revolutionize "
    "global power grids, reducing carbon footprints and fostering resilient "
    "communities through innovative storage and distribution technologies."
)
PERP_TEXT = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast "
    "to the natural intelligence displayed by humans and animals. Leading AI textbooks "
    "define the field as the study of intelligent agents: any system that perceives "
    "its environment and takes actions that maximize its chance of achieving its goals."
)
MAX_NEW_TOKENS = 50
OUTPUT_DIR = "qat_tinyllama"

In [3]:
def load_model_and_tokenizer(model_id: str):
    # 1. Load the model
    model = AutoModelForCausalLM.from_pretrained(model_id)
    # (Optional) save memory during training
    model.gradient_checkpointing_enable()

    # 2. Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    # 3. Make sure we have a pad token
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer


In [4]:
def prepare_datasets(tokenizer, block_size=8, train_size=1000, eval_size=500):
    # 1) Load the raw Wikitext-2 dataset
    raw = load_dataset("wikitext", "wikitext-2-raw-v1")

    # 2) Tokenization + label prep
    def _tokenize(examples):
        out = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=block_size,
        )
        out["labels"] = out["input_ids"].copy()
        return out

    tokenized = raw.map(
        _tokenize,
        batched=True,
        remove_columns=["text"],
    )

    # 3) Select the subsets
    train_ds = tokenized["train"].select(range(train_size))
    eval_ds  = tokenized["validation"].select(range(eval_size))

    return train_ds, eval_ds


In [5]:
def create_inc_trainer(model, tokenizer, train_ds, quant_config, output_dir):
    from transformers import TrainingArguments
    from optimum.intel import INCTrainer
    from transformers import default_data_collator

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        fp16=torch.cuda.is_available(),
        optim="adamw_bnb_8bit",
        eval_strategy="no",          # <- replaces deprecated evaluation_strategy
        save_strategy="no",          # <- NO mid-training checkpoints
        save_only_model=True,        # <- smaller, model-only saves when we do save
        logging_steps=50,
        save_total_limit=1,
        report_to=[],                # (quiet logs; optional)
    )

    trainer = INCTrainer(
        model=model,
        quantization_config=quant_config,
        args=training_args,
        train_dataset=train_ds,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )
    return trainer

In [6]:
def make_compute_ppl_fn(pad_token_id: int):
    """
    Returns a compute_metrics function that knows the pad_token_id.
    """
    def compute_ppl(pred: EvalPrediction):
        # 1) Unpack
        logits = pred.predictions         # np array (batch, seq_len, vocab_size)
        labels = pred.label_ids           # np array (batch, seq_len)

        # 2) Shift so each token predicts the next one
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]

        # 3) Flatten
        flat_logits = shift_logits.reshape(-1, shift_logits.shape[-1])
        flat_labels = shift_labels.reshape(-1)

        # 4) To torch
        logits_t = torch.from_numpy(flat_logits)
        labels_t = torch.from_numpy(flat_labels)

        # 5) CE loss ignoring pad_token_id
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
        loss = loss_fct(logits_t, labels_t)

        # 6) Return perplexity
        return {"eval_perplexity": torch.exp(loss).item()}
    return compute_ppl

In [7]:
def run_qat_and_evaluate(trainer, eval_ds, tokenizer):
    # 1) Train with QAT
    trainer.train()

    # 2) Small slice for eval
    small_eval = eval_ds.select(range(min(len(eval_ds), 100)))

    # 3) Build a ppl_fn that closes over pad_token_id
    ppl_fn = make_compute_ppl_fn(tokenizer.pad_token_id)

    # 4) Set up evaluation INCTrainer
    eval_args = TrainingArguments(
        output_dir=trainer.args.output_dir,
        per_device_eval_batch_size=1,
        fp16=True,
        eval_strategy="no",
        save_strategy="no",
        logging_steps=50,
    )
    eval_trainer = INCTrainer(
        model=trainer.model,
        args=eval_args,
        eval_dataset=small_eval,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        compute_metrics=ppl_fn,  # << use the closure here
    )

    # 5) Evaluate and return metrics
    return eval_trainer.evaluate()

In [8]:
def save_and_load_qat_model(trainer, output_dir):
    # 1) Save to the specified directory
    trainer.save_model(output_dir)

    # 2) Load it back as an INCModelForCausalLM
    loaded_model = INCModelForCausalLM.from_pretrained(output_dir)

    return loaded_model

In [9]:
def measure_latency_and_throughput(model, tokenizer, prompt: str, device, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_len = inputs["input_ids"].size(1)

    # warm-up
    _ = model.generate(**inputs, max_new_tokens=5)
    if device.type == "cuda":
        torch.cuda.synchronize()

    # timed generation
    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    if device.type == "cuda":
        torch.cuda.synchronize()
    end = time.time()

    latency = end - start
    gen_tokens = outputs.size(1) - input_len
    return latency, gen_tokens / latency

def measure_peak_mem_and_perplexity(model, tokenizer, text: str, device):
    inputs = tokenizer(text, return_tensors="pt").to(device)

    torch.cuda.reset_peak_memory_stats(device)
    with torch.no_grad():
        out = model(**inputs, labels=inputs["input_ids"])
    torch.cuda.synchronize()
    peak_mem_mib = torch.cuda.max_memory_allocated(device) / 1024**2
    perplexity = math.exp(out.loss.item())
    return peak_mem_mib, perplexity

In [10]:

# 1) Load model & tokenizer
model, tokenizer = load_model_and_tokenizer(MODEL_NAME)

# 2) Prepare datasets
train_ds, eval_ds = prepare_datasets(tokenizer)

# 3) QAT configuration
quant_config = QuantizationAwareTrainingConfig()

# 4) Create and run QAT trainer
qat_trainer = create_inc_trainer(model, tokenizer, train_ds, quant_config, OUTPUT_DIR)
metrics = run_qat_and_evaluate(qat_trainer, eval_ds, tokenizer)
print(f"Final perplexity: {metrics['eval_perplexity']:.2f}")

# 5) Save & load quantized model
qat_model = save_and_load_qat_model(qat_trainer, OUTPUT_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qat_model.to(device)

# 6) Benchmarks
latency, throughput = measure_latency_and_throughput(qat_model, tokenizer, PROMPT, device)
print(f"Latency     : {latency:.3f} s")
print(f"Throughput  : {throughput:.1f} tokens/s")

peak_mem, ppl = measure_peak_mem_and_perplexity(qat_model, tokenizer, PERP_TEXT, device)
print(f"Peak GPU memory     : {peak_mem:.1f} MiB")
print(f"Next-token perplexity: {ppl:.3f}")
size_mb = sum(os.path.getsize(os.path.join(OUTPUT_DIR, f)) for f in os.listdir(OUTPUT_DIR)) / 1024**2

print(f"Quantized model size : {size_mb:.2f} MB")
print(f"Average latency      : {latency*1000:.1f} ms")
print(f"Throughput           : {throughput:.1f} tokens/sec")
if peak_mem is not None:
    print(f"Peak GPU memory      : {peak_mem:.1f} MB")


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,3.3974
100,2.3487
150,2.3516
200,2.0726
250,2.0966


Final perplexity: 55.14
Latency     : 1.180 s
Throughput  : 42.4 tokens/s
Peak GPU memory     : 10660.4 MiB
Next-token perplexity: 10.961
Quantized model size : 4198.67 MB
Average latency      : 1179.8 ms
Throughput           : 42.4 tokens/sec
Peak GPU memory      : 10660.4 MB


In [23]:
def eval_ppl_cpu_texts(model, tokenizer, texts, max_length=64, batch_size=1):
    import math, torch
    model.eval().to("cpu")
    total_loss = 0.0
    total_tokens = 0
    with torch.inference_mode():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            labels = enc["input_ids"].clone()
            # ignore pad positions in loss
            pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
            attention = enc["attention_mask"]
            labels[attention == 0] = -100
            out = model(**enc, labels=labels)
            # HF loss is mean over non -100 positions; weight by #tokens to combine batches
            n_tokens = (labels != -100).sum().item()
            total_loss += float(out.loss) * max(1, n_tokens)
            total_tokens += max(1, n_tokens)
    return math.exp(total_loss / max(1, total_tokens))


# Exercise 1 — Activation fake-quant sensitivity (CPU, per-layer, no training)

Simulate QAT-style activation quantization one layer at a time with a forward pre-hook (8-bit symmetric per-tensor). Measure perplexity drift vs baseline.

In [24]:
def activation_fake_quant_sensitivity_cpu(
    model,
    tokenizer,
    texts=None,
    max_length=64,
    batch_size=1,
    last_k_linear=8,   # test only the last K Linear layers to keep it light
):
    """
    TODO: For each of the last K nn.Linear layers:
      - add a forward pre-hook that fake-quantizes the *input activation* to int8 (symmetric)
      - run a tiny CPU perplexity eval
      - remove the hook
    Returns a DataFrame with baseline and per-layer PPL deltas.
    """
    raise NotImplementedError


In [25]:
df_act_sens = activation_fake_quant_sensitivity_cpu(model, tokenizer, last_k_linear=8)
df_act_sens


Unnamed: 0,layer,ppl_with_act_fakequant,ppl_baseline,delta_ppl
0,model.layers.21.mlp.up_proj,10.71728,10.961409,-0.244129
1,model.layers.21.mlp.gate_proj,10.731192,10.961409,-0.230217
2,lm_head,10.904405,10.961409,-0.057004
3,model.layers.21.self_attn.v_proj,10.955287,10.961409,-0.006121
4,model.layers.21.self_attn.q_proj,10.959927,10.961409,-0.001482
5,model.layers.21.self_attn.o_proj,10.968125,10.961409,0.006716
6,model.layers.21.self_attn.k_proj,10.974499,10.961409,0.013091
7,model.layers.21.mlp.down_proj,11.220519,10.961409,0.25911


# Exercise 2 — CPU decoding strategy benchmark (no quant, no training)

Benchmark latency & tokens/sec on CPU for common decoding setups (greedy / top-k / top-p / beam), across a few prompt lengths. Uses your model as-is; pure eval.

In [26]:
def cpu_decoding_strategy_benchmark(
    model,
    tokenizer,
    base_text="This is a short prompt for CPU decoding benchmark. ",
    prompt_lengths=(16, 64, 256),
    new_tokens=64,
    runs=3,
):
    """
    TODO: On CPU, measure latency and throughput for different decoding strategies
    across several prompt lengths.
    Returns a DataFrame.
    """
    import time, numpy as np, torch, pandas as pd
    from copy import deepcopy

    model.eval().to("cpu")
    torch.set_grad_enabled(False)

    strategies = {
        "greedy": dict(do_sample=False),
        "topk":   dict(do_sample=True, top_k=50, temperature=1.0),
        "topp":   dict(do_sample=True, top_p=0.9, temperature=1.0),
        "beam4":  dict(do_sample=False, num_beams=4, early_stopping=True),
    }

    def build_prompt(L):
        ids = tokenizer(base_text, return_tensors="pt")["input_ids"][0].tolist()
        if len(ids) < 4:
            ids = (ids or [tokenizer.eos_token_id]) * 8
        reps = max(1, (L + len(ids) - 1) // len(ids))
        return tokenizer.decode(ids * reps)[: L]  # approximate length in tokens

    rows = []
    for L in prompt_lengths:
        prompt = build_prompt(L)
        enc = tokenizer(prompt, return_tensors="pt")
        input_len = enc["input_ids"].size(1)

        for name, gen_kwargs in strategies.items():
            # warmup
            _ = model.generate(**enc, max_new_tokens=8, **gen_kwargs)
            # timed runs
            times = []
            for _ in range(runs):
                t0 = time.time()
                _ = model.generate(**enc, max_new_tokens=new_tokens, **gen_kwargs)
                times.append(time.time() - t0)
            lat_ms = float(np.mean(times) * 1000.0)
            toks_per_s = new_tokens / float(np.mean(times))
            rows.append({
                "prompt_len_tokens(approx)": L,
                "strategy": name,
                "latency_ms": lat_ms,
                "throughput_tokens_per_s": toks_per_s,
            })

    return pd.DataFrame(rows).sort_values(["prompt_len_tokens(approx)", "strategy"]).reset_index(drop=True)


In [None]:
def cpu_decoding_strategy_benchmark(
    model,
    tokenizer,
    base_text="This is a short prompt for CPU decoding benchmark. ",
    prompt_lengths=(16, 64, 256),
    new_tokens=64,
    runs=3,
):
    """
    Measure CPU decoding latency & throughput across decoding strategies and prompt lengths.

    Returns
    -------
    pd.DataFrame with columns:
      ['prompt_len_tokens(approx)', 'strategy', 'latency_ms', 'throughput_tokens_per_s']

    Student TODOs:
      - Ensure the model is on CPU and in eval() mode.
      - Define a few decoding strategies (greedy, top-k, top-p, beam).
      - Build prompts that roughly match target token lengths.
      - Warm up each strategy before timing.
      - Time multiple runs and average.
      - Return a tidy DataFrame.
    """
    # --- Imports (keep local so function is self-contained) ---
    import time
    import numpy as np
    import torch
    import pandas as pd

    # ---------- TODO 1: Put model in eval mode and move to CPU ----------
    # Hints:
    #   - Use model.eval()
    #   - Use model.to("cpu")
    #   - Disable grads with torch.set_grad_enabled(False)
    # YOUR CODE HERE
    # model.eval()
    # model.to("cpu")
    # torch.set_grad_enabled(False)

    # ---------- TODO 2: Define decoding strategies ----------
    # Provide a small set with different search behaviors:
    #   - "greedy": no sampling
    #   - "topk":   do_sample=True, top_k=50
    #   - "topp":   do_sample=True, top_p=0.9
    #   - "beam4":  beam search with num_beams=4 (no sampling)
    # Keep temperature at 1.0 unless you want them to explore.
    strategies = {
    }

    # ---------- Helper: approximate a prompt with ~L tokens ----------
    # TODO 3: Implement build_prompt(L) so it returns a string whose tokenized
    # length is approximately L. You can:
    #   - tokenize base_text once to get a token list
    #   - repeat it to exceed L
    #   - decode back to string and (optionally) trim by tokens or characters
    def build_prompt(L: int) -> str:
        raise NotImplementedError("build_prompt(L) not implemented")

    rows = []

    # ---------- Main loop over prompt lengths ----------
    for L in prompt_lengths:
        # TODO 4: Build prompt and encode once on CPU
        #   - Use tokenizer(prompt, return_tensors="pt")
        #   - Keep the encoded dict on CPU (no .to('cuda'))
        prompt = None  # = build_prompt(L)
        enc = None     # = tokenizer(prompt, return_tensors="pt")
        # Optional: record the actual input length from enc["input_ids"].size(1)
        # input_len = enc["input_ids"].size(1)

        # ---------- Loop over decoding strategies ----------
        for name, gen_kwargs in strategies.items():
            # TODO 5: Warm-up (short generate) to trigger kernels/JIT
            #   - e.g., max_new_tokens=8
            # with torch.inference_mode():
            #     _ = model.generate(**enc, max_new_tokens=8, **gen_kwargs)

            # TODO 6: Timed runs (average over `runs`)
            times = []

            # TODO 7: Compute metrics

            # TODO 8: Append a result row

    # ---------- TODO 9: Return a tidy DataFrame sorted by prompt length then strategy ----------
    # df = pd.DataFrame(rows).sort_values(["prompt_len_tokens(approx)", "strategy"]).reset_index(drop=True)
    # return df
    raise NotImplementedError("Return the DataFrame once all TODOs are implemented")


In [27]:
df_cpu_decode = cpu_decoding_strategy_benchmark(model, tokenizer, prompt_lengths=(16, 64, 256), new_tokens=32, runs=2)
df_cpu_decode


Unnamed: 0,prompt_len_tokens(approx),strategy,latency_ms,throughput_tokens_per_s
0,16,beam4,5062.797904,6.320616
1,16,greedy,2567.549944,12.463243
2,16,topk,2583.953261,12.384125
3,16,topp,2620.415092,12.211806
4,64,beam4,5132.56824,6.234695
5,64,greedy,2692.276001,11.885854
6,64,topk,2410.951018,13.272771
7,64,topp,2791.317344,11.464121
8,256,beam4,5807.430387,5.510182
9,256,greedy,2800.542116,11.426359
