In [1]:
import math
import os
import time

import torch
import numpy as np
import evaluate
from datasets import load_dataset, DownloadConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    default_data_collator,
    EvalPrediction,
)
from optimum.intel import INCTrainer, INCModelForCausalLM
from neural_compressor import QuantizationAwareTrainingConfig

2025-08-04 23:14:29.789858: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-04 23:14:29.804082: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-04 23:14:29.822090: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-04 23:14:29.827774: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-04 23:14:29.840933: I tensorflow/core/platform/cpu_feature_guar

In [None]:
# ─── SETTINGS ────────────────────────────────────────────────────────────────
# Notebook-wide constants for the QAT pipeline.
# Model identifier that start() passes to load_model_and_tokenizer().
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Prompt that start() feeds to the latency / throughput benchmark.
PROMPT = (
    "Over the next decade, sustainable energy solutions will revolutionize "
    "global power grids, reducing carbon footprints and fostering resilient "
    "communities through innovative storage and distribution technologies."
)
# Reference passage that start() scores in the peak-memory / perplexity benchmark.
PERP_TEXT = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast "
    "to the natural intelligence displayed by humans and animals. Leading AI textbooks "
    "define the field as the study of intelligent agents: any system that perceives "
    "its environment and takes actions that maximize its chance of achieving its goals."
)
# Presumably the generation length for the latency benchmark; it equals the
# default `max_new_tokens` of measure_latency_and_throughput().
MAX_NEW_TOKENS = 50
# Directory handed to the trainer factory and to save/load, and scanned by
# start() to report the on-disk model size.
OUTPUT_DIR = "qat_tinyllama"

In [None]:
def load_model_and_tokenizer(model_id: str):
    """Exercise placeholder: build the causal LM and its tokenizer.

    Not implemented yet -- calling this raises ``NotImplementedError``.

    Steps to implement (TODO):
      1. Load the model with ``AutoModelForCausalLM.from_pretrained(model_id)``.
      2. Optionally enable gradient checkpointing on the model.
      3. Load the tokenizer with ``AutoTokenizer.from_pretrained(model_id)``.
      4. Set ``tokenizer.pad_token = tokenizer.eos_token`` and set
         ``model.config.pad_token_id`` as well.
      5. Return ``(model, tokenizer)``.
    """
    raise NotImplementedError


In [None]:
def prepare_datasets(tokenizer, block_size=8, train_size=1000, eval_size=500):
    """Exercise placeholder: build the tokenized train / eval datasets.

    Not implemented yet -- calling this raises ``NotImplementedError``.

    Steps to implement (TODO):
      1. Load the dataset with ``load_dataset("wikitext", "wikitext-2-raw-v1")``.
      2. Tokenize it with ``tokenizer`` in blocks of ``block_size`` tokens and
         set ``labels`` equal to ``input_ids``.
      3. Keep the first ``train_size`` examples as ``train_ds`` and the first
         ``eval_size`` examples as ``eval_ds``.
      4. Return ``(train_ds, eval_ds)``.
    """
    raise NotImplementedError


In [None]:
def create_inc_trainer(model, tokenizer, train_ds, quant_config, output_dir):
    """Exercise placeholder: build the ``INCTrainer`` used for QAT.

    Not implemented yet -- calling this raises ``NotImplementedError``.

    Steps to implement (TODO):
      1. Create ``TrainingArguments`` for quantization-aware training with:
           - num_train_epochs=1
           - per_device_train_batch_size=1
           - gradient_accumulation_steps=4
           - fp16=True
           - optim="adamw_bnb_8bit"
           - evaluation_strategy="no"
           - save_strategy="epoch"
      2. Build the trainer from ``model``, the quantization config, the
         arguments, the training dataset, ``tokenizer`` and a data collator.
      3. Return the ``INCTrainer`` instance.

    NOTE(review): no data collator is passed in; ``default_data_collator`` is
    imported at the top of the notebook and is presumably the intended one --
    confirm.
    NOTE(review): newer transformers releases appear to spell
    ``evaluation_strategy`` as ``eval_strategy`` -- check the installed version.
    """
    raise NotImplementedError


In [None]:
def compute_ppl(pred: EvalPrediction):
    """Exercise placeholder: perplexity metric computed from an ``EvalPrediction``.

    Not implemented yet -- calling this raises ``NotImplementedError``.

    Steps to implement (TODO):
      1. Shift logits and labels by one position, then flatten them.
      2. Compute ``CrossEntropyLoss(ignore_index=pad_token_id)``.
      3. Return ``{"eval_perplexity": exp(loss).item()}``.

    NOTE(review): ``pad_token_id`` is not a parameter of this function and no
    module-level tokenizer is visible in the notebook; the implementation has
    to decide where that id comes from.
    """
    raise NotImplementedError


In [None]:
def run_qat_and_evaluate(trainer, eval_ds, tokenizer):
    """Exercise placeholder: run QAT, then evaluate the trained model.

    Not implemented yet -- calling this raises ``NotImplementedError``.

    Steps to implement (TODO):
      1. Call ``trainer.train()``.
      2. Create an ``INCTrainer`` for evaluation with
         ``compute_metrics=compute_ppl``.
      3. Call ``eval_trainer.evaluate()`` on a small subset.
      4. Return the evaluation metrics.
    """
    raise NotImplementedError

In [None]:
def save_and_load_qat_model(trainer, output_dir):
    """Exercise placeholder: persist the QAT model and reload it.

    Not implemented yet -- calling this raises ``NotImplementedError``.

    Steps to implement (TODO):
      1. Call ``trainer.save_model()``.
      2. Load the model back with
         ``INCModelForCausalLM.from_pretrained(output_dir)``.
      3. Return the loaded model.
    """
    raise NotImplementedError

In [None]:
def measure_latency_and_throughput(model, tokenizer, prompt: str, device, max_new_tokens=50):
    """Time a single ``model.generate`` call and derive token throughput.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)`` whose
            result supports ``.size(1)``.
        tokenizer: Called as ``tokenizer(prompt, return_tensors="pt")``; the
            result must support ``.to(device)`` and ``["input_ids"]``.
        prompt: Text to generate from.
        device: ``torch.device`` the tokenized inputs are moved to.
        max_new_tokens: Generation budget for the timed call.

    Returns:
        ``(latency_seconds, tokens_per_second)``. The token count is the output
        length minus the prompt length, i.e. only newly generated positions.
        If the measured latency is exactly zero the throughput is reported as
        ``float("inf")`` instead of raising ``ZeroDivisionError``.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].size(1)
    on_cuda = device.type == "cuda"

    # Warm-up: a short untimed generation so one-off setup cost is not
    # attributed to the measured run.
    model.generate(**inputs, max_new_tokens=5)
    if on_cuda:
        torch.cuda.synchronize()

    # perf_counter is monotonic and high-resolution, unlike wall-clock
    # time.time(), which can jump when the system clock is adjusted.
    started = time.perf_counter()
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    if on_cuda:
        # Wait for queued GPU work so it is included in the measurement.
        torch.cuda.synchronize()
    latency = time.perf_counter() - started

    gen_tokens = outputs.size(1) - prompt_len
    throughput = gen_tokens / latency if latency > 0 else float("inf")
    return latency, throughput

def measure_peak_mem_and_perplexity(model, tokenizer, text: str, device):
    """Run one forward pass over ``text`` and report peak GPU memory and perplexity.

    Args:
        model: Callable as ``model(**inputs, labels=...)`` returning an object
            with a scalar ``loss`` attribute.
        tokenizer: Called as ``tokenizer(text, return_tensors="pt")``; the
            result must support ``.to(device)`` and ``["input_ids"]``.
        text: Passage to score.
        device: ``torch.device`` the tokenized inputs are moved to.

    Returns:
        ``(peak_mem_mib, perplexity)``.
        ``peak_mem_mib`` is ``torch.cuda.max_memory_allocated(device)`` in MiB
        when ``device`` is a CUDA device, and ``None`` otherwise, because the
        CUDA memory statistics are not available on CPU.
        ``perplexity`` is ``exp(out.loss)``, where the loss is whatever the
        model reports when the labels are the input ids (presumably the mean
        next-token cross-entropy for a causal LM -- confirm for the model used).
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    on_cuda = device.type == "cuda"

    if on_cuda:
        # Reset the counter so the peak reflects this forward pass only.
        torch.cuda.reset_peak_memory_stats(device)
    with torch.no_grad():
        out = model(**inputs, labels=inputs["input_ids"])

    peak_mem_mib = None
    if on_cuda:
        torch.cuda.synchronize()
        peak_mem_mib = torch.cuda.max_memory_allocated(device) / 1024**2

    perplexity = math.exp(out.loss.item())
    return peak_mem_mib, perplexity

In [None]:
def start():
    """Run the full pipeline: load, QAT, evaluate, save/reload, benchmark.

    Reads the module-level settings (``MODEL_NAME``, ``PROMPT``, ``PERP_TEXT``,
    ``MAX_NEW_TOKENS``, ``OUTPUT_DIR``), calls the helper functions defined in
    this notebook in order, and prints the resulting metrics. Returns ``None``.

    Any exception raised by a helper propagates to the caller.
    """
    # 1) Load model & tokenizer
    model, tokenizer = load_model_and_tokenizer(MODEL_NAME)

    # 2) Prepare datasets
    train_ds, eval_ds = prepare_datasets(tokenizer)

    # 3) QAT configuration
    quant_config = QuantizationAwareTrainingConfig()

    # 4) Create and run QAT trainer
    qat_trainer = create_inc_trainer(model, tokenizer, train_ds, quant_config, OUTPUT_DIR)
    metrics = run_qat_and_evaluate(qat_trainer, eval_ds, tokenizer)
    print(f"Final perplexity: {metrics['eval_perplexity']:.2f}")

    # 5) Save & load quantized model
    qat_model = save_and_load_qat_model(qat_trainer, OUTPUT_DIR)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    qat_model.to(device)

    # 6) Benchmarks
    # Pass the configured generation length explicitly so the MAX_NEW_TOKENS
    # setting actually controls the benchmark.
    latency, throughput = measure_latency_and_throughput(
        qat_model, tokenizer, PROMPT, device, max_new_tokens=MAX_NEW_TOKENS
    )
    print(f"Latency     : {latency:.3f} s")
    print(f"Throughput  : {throughput:.1f} tokens/s")

    peak_mem, ppl = measure_peak_mem_and_perplexity(qat_model, tokenizer, PERP_TEXT, device)
    # peak_mem may be None when no GPU statistic is available; formatting
    # None with ':.1f' would raise TypeError.
    if peak_mem is not None:
        print(f"Peak GPU memory     : {peak_mem:.1f} MiB")
    print(f"Next-token perplexity: {ppl:.3f}")

    # 7) On-disk size: sum only regular files directly inside OUTPUT_DIR.
    # getsize() on a sub-directory returns the size of the directory entry,
    # not of its contents, so directories are skipped.
    entry_paths = (os.path.join(OUTPUT_DIR, name) for name in os.listdir(OUTPUT_DIR))
    size_mb = sum(os.path.getsize(path) for path in entry_paths if os.path.isfile(path)) / 1024**2

    print(f"Quantized model size : {size_mb:.2f} MB")
    print(f"Average latency      : {latency*1000:.1f} ms")
    print(f"Throughput           : {throughput:.1f} tokens/sec")
    if peak_mem is not None:
        # The value is bytes / 1024**2, i.e. MiB, matching the line printed above.
        print(f"Peak GPU memory      : {peak_mem:.1f} MiB")


In [None]:
# Entry point of the notebook: execute the whole pipeline defined in start().
start()