<a href="https://colab.research.google.com/github/ubiiii/coa-llm-quantization/blob/main/notebooks/coa-llm-quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Test with a larger model (GPT-2 Medium - 355M parameters)
print("Loading GPT-2 Medium for better quantization comparison...")

# Load baseline GPT-2 Medium
model_gpt2 = AutoModelForCausalLM.from_pretrained("gpt2-medium")
tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2-medium")

# Test baseline
inputs_gpt2 = tokenizer_gpt2("Hello, how are you?", return_tensors="pt").to('cuda')

start_time = time.time()
with torch.no_grad():
    outputs_gpt2 = model_gpt2.generate(**inputs_gpt2, max_new_tokens=10, do_sample=False)
end_time = time.time()

speed_gpt2 = 10 / (end_time - start_time)
print(f"GPT-2 Medium Baseline: {speed_gpt2:.2f} tokens/sec")

Loading GPT-2 Medium for better quantization comparison...


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RuntimeError: Expected all tensors to be on the same device, but got index is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__index_select)

In [6]:
# Create 8-bit configuration
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)

# Load 8-bit model
print("Loading 8-bit quantized model...")
model_8bit = AutoModelForCausalLM.from_pretrained(
    "microsoft/DialoGPT-small",
    quantization_config=quantization_config,
    device_map="auto"
)

print("✅ 8-bit model loaded!")

# Test 8-bit speed (with device fix)
print("Testing 8-bit inference...")

# Move inputs to GPU
inputs_gpu = {k: v.to('cuda') for k, v in inputs.items()}

start_time = time.time()
with torch.no_grad():
    outputs_8bit = model_8bit.generate(**inputs_gpu, max_new_tokens=10, do_sample=False)
end_time = time.time()

generated_text_8bit = tokenizer.decode(outputs_8bit[0], skip_special_tokens=True)
new_tokens_8bit = len(outputs_8bit[0]) - len(inputs_gpu['input_ids'][0])
speed_8bit = new_tokens_8bit / (end_time - start_time)

print(f"8-bit Generated: {generated_text_8bit}")
print(f"8-bit Speed: {speed_8bit:.2f} tokens/second")

Loading 8-bit quantized model...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ 8-bit model loaded!
Testing 8-bit inference...
8-bit Generated: Hello, how are you? Good morning everyone!
8-bit Speed: 5.58 tokens/second


In [7]:
print("=== QUANTIZATION COMPARISON ===")
print(f"Baseline (FP16): {speed_baseline:.2f} tokens/sec")
print(f"8-bit Quantized: {speed_8bit:.2f} tokens/sec")
print(f"Speedup: {speed_8bit/speed_baseline:.2f}x")
print(f"Memory reduction: ~50% (estimated)")

=== QUANTIZATION COMPARISON ===
Baseline (FP16): 10.75 tokens/sec
8-bit Quantized: 5.58 tokens/sec
Speedup: 0.52x
Memory reduction: ~50% (estimated)


In [3]:
import time

# Test prompt
prompt = "Hello, how are you?"
inputs = tokenizer(prompt, return_tensors="pt")

# Test baseline speed
print("Testing baseline inference...")
start_time = time.time()
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10, do_sample=False)
end_time = time.time()

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
new_tokens = len(outputs[0]) - len(inputs.input_ids[0])
speed_baseline = new_tokens / (end_time - start_time)

print(f"Baseline Generated: {generated_text}")
print(f"Baseline Speed: {speed_baseline:.2f} tokens/second")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Testing baseline inference...
Baseline Generated: Hello, how are you? Good morning everyone!
Baseline Speed: 10.75 tokens/second


In [2]:
# Load baseline model (FP16)
print("Loading baseline model...")
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

print("✅ Baseline model loaded!")
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")

Loading baseline model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Baseline model loaded!
Model size: 124.4M parameters


In [1]:
import torch
import transformers
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

print("✅ PyTorch version:", torch.__version__)
print("✅ Transformers version:", transformers.__version__)
print("✅ BitsAndBytes version:", bnb.__version__)
print("✅ CUDA available:", torch.cuda.is_available())
print("✅ GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

✅ PyTorch version: 2.8.0+cu126
✅ Transformers version: 4.57.0
✅ BitsAndBytes version: 0.48.1
✅ CUDA available: True
✅ GPU: Tesla T4


In [5]:
# Install BitsAndBytes for INT8 quantization
!pip install bitsandbytes --no-cache-dir

# Verify installation
import bitsandbytes as bnb
print("✅ BitsAndBytes installed successfully!")
print("Version:", bnb.__version__)

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m345.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1
✅ BitsAndBytes installed successfully!
Version: 0.48.1


In [4]:
# Test basic inference
import time

# Test prompt
prompt = "Hello, how are you?"
inputs = tokenizer(prompt, return_tensors="pt")

print("Testing inference...")
start_time = time.time()
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10, do_sample=False)
end_time = time.time()

# Calculate speed
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
new_tokens = len(outputs[0]) - len(inputs.input_ids[0])
speed = new_tokens / (end_time - start_time)

print(f"Generated: {generated_text}")
print(f"Speed: {speed:.2f} tokens/second")
print(f"Time: {end_time - start_time:.3f} seconds")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Testing inference...
Generated: Hello, how are you? Good morning everyone!
Speed: 12.77 tokens/second
Time: 0.391 seconds


In [3]:
# Test basic functionality
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load a small test model
model_name = "microsoft/DialoGPT-small"
print(f"Loading {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

print("✅ Model loaded successfully!")
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")

Loading microsoft/DialoGPT-small...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Model loaded successfully!
Model size: 124.4M parameters


In [2]:
# Check installed packages
!pip list | grep -E "(torch|transformers|autoawq|bitsandbytes|datasets|accelerate)"

# Check Python version
import sys
print("Python version:", sys.version)

# Check CUDA
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU:", torch.cuda.get_device_name(0))

accelerate                               1.10.1
datasets                                 4.0.0
sentence-transformers                    5.1.1
tensorflow-datasets                      4.9.9
torch                                    2.8.0+cu126
torchao                                  0.10.0
torchaudio                               2.8.0+cu126
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.23.0+cu126
transformers                             4.56.2
vega-datasets                            0.9.0
Python version: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
GPU: Tesla T4
