# Quantization Fundamentals for Model Compression

In [23]:
# !pip install torch==2.1.1
# !pip install transformers==4.35.0
!pip install quanto==0.0.11

Collecting quanto==0.0.11
  Using cached quanto-0.0.11-py3-none-any.whl (22 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.1.1->quanto==0.0.11)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.1.1->quanto==0.0.11)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.1.1->quanto==0.0.11)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.1.1->quanto==0.0.11)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2.1.1->quanto==0.0.11)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=2.1.1->quanto==0.0.11)
  Using cached nv

In [2]:
import torch

## Downcasting

Get a random pytorch tensor with float32 and size=100

In [3]:
tensor_fp32 = torch.rand(100, dtype = torch.float32)

In [4]:
tensor_fp32[:5]

tensor([0.6573, 0.7227, 0.5918, 0.5869, 0.6816])

Let's downcast the tensor to bfloat16 using the "to" method

In [5]:
tensor_fp32_to_bf16 = tensor_fp32.to(dtype = torch.bfloat16)

In [6]:
tensor_fp32_to_bf16[:5]

tensor([0.6562, 0.7227, 0.5898, 0.5859, 0.6797], dtype=torch.bfloat16)

## Compare Models in Different Dtypes

In [13]:
from transformers import BertForMaskedLM, BertTokenizer

# Initialize a tokenizer and a pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Tokenize an example text
text = "Let's compare the memory footprint of open-source models to understand the impact of different data types for a language task. In this blog we use BertForMaskedLM for example, if you would like to explore different modalities, the course uses BlipForConditionalGeneration for images."
input_ids = tokenizer(text, return_tensors="pt").input_ids

In [15]:
# Get logits from the original FP32 model
logits_fp32 = model(input_ids).logits

# Convert to BF16
model_bf16 = model.to(torch.bfloat16)

# Get logits from the BF16 models
logits_bf16 = model_bf16(input_ids).logits

In [16]:
# Calculate mean and max differences between FP32 and BF16
mean_diff = torch.abs(logits_bf16 - logits_fp32).mean().item()
max_diff = torch.abs(logits_bf16 - logits_fp32).max().item()

print("Mean difference between FP32 and BF16:", mean_diff)
print("Max difference between FP32 and BF16:", max_diff)

Mean difference between FP32 and BF16: 0.03718937560915947
Max difference between FP32 and BF16: 0.4945411682128906


In [18]:
# Memory footprint in bytes for FP32
memory_fp32 = model.get_memory_footprint()
# Memory footprint in bytes for BF16
memory_bf16 = model_bf16.get_memory_footprint()
print("Memory footprint (FP32):", memory_fp32)
print("Memory footprint (BF16):", memory_bf16)

Memory footprint (FP32): 219036788
Memory footprint (BF16): 219036788


### Compare memory footprint

In [21]:
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Footprint of the fp32 model in bytes:  438065384
Footprint of the fp32 model in MBs:  438.065384


In [None]:
fp32_mem_footprint = model.get_memory_footprint()
print("Footprint of the fp32 model in bytes: ",
      fp32_mem_footprint)
print("Footprint of the fp32 model in MBs: ",
      fp32_mem_footprint/1e+6)

In [20]:
model_bf16 = BertForMaskedLM.from_pretrained("bert-base-uncased",
                               torch_dtype=torch.bfloat16,
                               ignore_mismatched_sizes=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Footprint of the bf16 model in MBs:  219.036788
Relative diff: 0.500009350202389


In [None]:
bf16_mem_footprint = model_bf16.get_memory_footprint()

# Get the relative difference
relative_diff = bf16_mem_footprint / fp32_mem_footprint

print("Footprint of the bf16 model in MBs: ",
      bf16_mem_footprint/1e+6)
print(f"Relative diff: {relative_diff}")

## Quanto Library

In [24]:
from quanto import quantize, freeze

Let's see how to quantize the model in 8-bit precision

In [25]:
quantize(model, weights=torch.int8, activations=None)
print(model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): QLinear(in_features=768, out_features=768, bias=True)
              (key): QLinear(in_features=768, out_features=768, bias=True)
              (value): QLinear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): QLinear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [26]:
freeze(model)