In [None]:
from huggingface_hub import login

login(token="")

In [2]:
import time, torch, numpy as np, pandas as pd
from tqdm import trange
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

# ----------------------------------------------------------
# CONFIG
# ----------------------------------------------------------
MODEL_NAME = "meta-llama/Llama-3.2-3B"
NUM_RUNS = 1000
MAX_NEW_TOKENS = 32
PROMPT = "Explain one interesting fact about large language models."

# ----------------------------------------------------------
# DEVICE
# ----------------------------------------------------------
if torch.backends.mps.is_available():
    DEVICE = "mps"  # MacBook M3
elif torch.cuda.is_available():
    DEVICE = "cuda"  # GPU cluster (A100/H100)
else:
    DEVICE = "cpu"

print(f"Running on device: {DEVICE}")

# ----------------------------------------------------------
# PRECISIONS TO TEST
# ----------------------------------------------------------
precisions = ["fp16", "int4"]

# ----------------------------------------------------------
# UTILITIES
# ----------------------------------------------------------
def setup_model(precision):
    fp16_model = "meta-llama/Llama-3.2-3B"
    awq_int4_model = "casperhansen/llama-3.2-3b-instruct-awq"  # already quantized

    if precision == "fp16":
        print("Loading FP16 model...")
        model = AutoModelForCausalLM.from_pretrained(
            fp16_model,
            dtype=torch.float16,
            device_map="auto"
        )
        model.to(DEVICE)

    elif precision == "int4":
        if not torch.cuda.is_available():
            print("Skipping INT4 (requires CUDA).")
            return None

        print("Loading INT4 AWQ model...")
        model = AutoModelForCausalLM.from_pretrained(
            "casperhansen/llama-3.2-3b-instruct-awq", 
            device_map="auto",
            trust_remote_code=True)

    else:
        raise ValueError(f"Unknown precision mode: {precision}")

    return model


def clear_cache():
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    elif DEVICE == "mps":
        torch.mps.empty_cache()
    else:
        pass


def get_mem_mb():
    if DEVICE == "cuda":
        return torch.cuda.memory_allocated() / (1024**2)
    else:
        return 0.0  # MPS/CPU memory metrics not available in PyTorch


@torch.no_grad()
def benchmark_model(model, tokenizer):
    latencies, peak_mems, avg_mems = [], [], []
    for i in trange(NUM_RUNS):
        clear_cache()
        inputs = tokenizer(PROMPT, return_tensors="pt").to(DEVICE)
        start = time.time()
        _ = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
        if DEVICE == "cuda":
            torch.cuda.synchronize()
        end = time.time()

        latency = end - start
        peak_mem = torch.cuda.max_memory_allocated() / (1024 ** 2) if DEVICE == "cuda" else 0.0
        avg_mem = get_mem_mb()

        latencies.append(latency)
        peak_mems.append(peak_mem)
        avg_mems.append(avg_mem)

    lat = np.array(latencies)
    pm = np.array(peak_mems)
    am = np.array(avg_mems)

    results = {
        "precision": precision,
        "mean_latency_s": lat.mean(),
        "median_latency_s": np.median(lat),
        "p95_latency_s": np.percentile(lat, 95),
        "p99_latency_s": np.percentile(lat, 99),
        "mean_peak_mem_mb": pm.mean(),
        "mean_avg_mem_mb": am.mean(),
    }
    return results

Running on device: cuda


In [4]:
# ----------------------------------------------------------
# MAIN
# ----------------------------------------------------------
import os

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

all_results = []
csv_path = "llama3.2_all_precisions_latency_memory.csv"

if not os.path.exists(csv_path):
    pd.DataFrame(columns=[
        "precision", "mean_latency", "median_latency", "p95_latency", "p99_latency",
        "peak_memory_mb", "avg_memory_mb"
    ]).to_csv(csv_path, index=False)


for precision in precisions:
    model = setup_model(precision)
    if model is None:
        continue  # skip INT8/INT4 on Mac
    model.eval()
    print(f"\n=== Running benchmark: {precision.upper()} ===")
    results = benchmark_model(model, tokenizer)
    all_results.append(results)

    pd.DataFrame([results]).to_csv(csv_path, mode='a', header=False, index=False)

    print(f"\n--- {precision.upper()} Summary ---")
    for k, v in results.items():
        if k != "precision":
            print(f"{k}: {v:.4f}")
    print(f"Results written to {csv_path}\n")

    del model
    clear_cache()

# ----------------------------------------------------------
# OUTPUT
# ----------------------------------------------------------
if all_results:
    df = pd.DataFrame(all_results)
    print("\n===== SUMMARY =====")
    print(df.round(3))
    df.to_csv("llama3.2_all_precisions_latency_memory.csv", index=False)
    print("\nResults saved to llama3.2_all_precisions_latency_memory.csv")
else:
    print("No models ran (probably running on Mac without CUDA).")


Loading FP16 model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


=== Running benchmark: FP16 ===


  0%|          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/1000 [00:00<13:04,  1.27it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/1000 [00:01<13:02,  1.27it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/1000 [00:02<13:01,  1.28it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 4/1000 [00:03<13:00,  1.28it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 5/1000 [00:03<12:59,  1.28it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 6/1000 [00:04<12:58,  1.28it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 7/1000 [00:05<12:59,  1.27it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 8/1000 [00:06<12:57,  1


--- FP16 Summary ---
mean_latency_s: 0.7637
median_latency_s: 0.7706
p95_latency_s: 0.8098
p99_latency_s: 0.8169
mean_peak_mem_mb: 12280.1745
mean_avg_mem_mb: 12269.4639
Results written to llama3.2_all_precisions_latency_memory.csv

Loading INT4 AWQ model...


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/




=== Running benchmark: INT4 ===


  0%|          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/1000 [00:01<28:30,  1.71s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/1000 [00:03<25:12,  1.52s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/1000 [00:04<24:07,  1.45s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 4/1000 [00:05<23:44,  1.43s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 5/1000 [00:07<23:38,  1.43s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 6/1000 [00:08<23:18,  1.41s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 7/1000 [00:10<23:05,  1.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 8/1000 [00:11<22:56,  1


--- INT4 Summary ---
mean_latency_s: 1.3712
median_latency_s: 1.3650
p95_latency_s: 1.4143
p99_latency_s: 1.4340
mean_peak_mem_mb: 8310.7285
mean_avg_mem_mb: 8299.9639
Results written to llama3.2_all_precisions_latency_memory.csv


===== SUMMARY =====
  precision  mean_latency_s  median_latency_s  p95_latency_s  p99_latency_s  \
0      fp16           0.764             0.771          0.810          0.817   
1      int4           1.371             1.365          1.414          1.434   

   mean_peak_mem_mb  mean_avg_mem_mb  
0         12280.174        12269.464  
1          8310.729         8299.964  

Results saved to llama3.2_all_precisions_latency_memory.csv





In [11]:
model.save_pretrained("llama3.2-3B-local")
tokenizer.save_pretrained("llama3.2-3B-local")

('llama3.2-3B-local/tokenizer_config.json',
 'llama3.2-3B-local/special_tokens_map.json',
 'llama3.2-3B-local/tokenizer.json')