<a href="https://www.kaggle.com/code/vasumanmishra/afpq-implementation?scriptVersionId=180892392" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Notebook shell command: install PyTorch (plus torchvision/torchaudio) and
# Hugging Face transformers. Only torch and transformers are imported by the
# code visible in this notebook.
!pip install torch torchvision torchaudio transformers



In [2]:
import torch
from transformers import AutoModel, AutoTokenizer
import time

# Load the model and tokenizer from Hugging Face
# The same checkpoint name is passed to both loaders; `model_name`, `model`
# and `tokenizer` are module-level names reused by the code further down.
model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to measure inference time
def measure_inference_time(model, tokenizer, text, num_runs=10):
    """Return the average wall-clock seconds of one forward pass over ``text``.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``;
            its result is unpacked as keyword arguments for ``model``.
        text: Input handed to the tokenizer.
        num_runs: Number of timed forward passes; must be at least 1.

    Returns:
        Total elapsed seconds divided by ``num_runs``.

    Raises:
        ValueError: If ``num_runs`` is less than 1 (an average over zero or
            a negative number of runs is meaningless).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Tokenization happens once, outside the timed region.
    inputs = tokenizer(text, return_tensors="pt")

    # Gradients are not needed for timing inference, so disable them for the
    # whole loop. perf_counter() is monotonic and high-resolution, unlike
    # time.time(), which can jump if the system clock is adjusted.
    with torch.no_grad():
        start_time = time.perf_counter()
        for _ in range(num_runs):
            model(**inputs)
        elapsed = time.perf_counter() - start_time

    return elapsed / num_runs

# Define the AFPQ quantization function
def afpq_quantization(tensor, bits=4):
    """Quantize ``tensor`` to signed integer codes with per-sign scales.

    Positive entries are divided by ``pos_scale`` and negative entries by
    ``neg_scale``, so the largest positive value maps to ``qmax`` and the
    most negative value maps to ``qmin``. The result is rounded, clamped to
    ``[qmin, qmax]`` and stored as ``torch.int8``.

    Args:
        tensor: Tensor to quantize. An empty tensor is allowed.
        bits: Width of the signed integer code, from 2 to 8 inclusive.

    Returns:
        A tuple ``(quantized_tensor, pos_scale, neg_scale)`` where
        ``quantized_tensor`` is an ``int8`` tensor shaped like ``tensor`` and
        the two scales are Python floats. A scale is ``1.0`` when the tensor
        has no values of that sign.

    Raises:
        ValueError: If ``bits`` is outside ``[2, 8]``. With fewer than 2 bits
            ``qmax`` is 0 (division by zero for positive values); with more
            than 8 bits the codes would silently wrap when cast to int8.
    """
    if not 2 <= bits <= 8:
        raise ValueError(
            f"bits must be between 2 and 8 for int8 storage, got {bits}"
        )

    qmin = -(2 ** (bits - 1))
    qmax = (2 ** (bits - 1)) - 1

    pos_values = tensor[tensor > 0]
    neg_values = tensor[tensor < 0]

    # Both scales are positive: max(pos) / qmax and min(neg) / qmin (negative
    # divided by negative). Fall back to 1.0 when a sign is absent.
    pos_scale = pos_values.max().item() / qmax if pos_values.numel() > 0 else 1.0
    neg_scale = neg_values.min().item() / qmin if neg_values.numel() > 0 else 1.0

    # clamp() zeroes the opposite sign, so each element contributes to exactly
    # one of the two terms and the sum recombines them.
    pos_tensor = tensor.clamp(min=0) / pos_scale
    neg_tensor = tensor.clamp(max=0) / neg_scale

    quantized_tensor = pos_tensor + neg_tensor
    quantized_tensor = quantized_tensor.round().clamp(qmin, qmax).to(torch.int8)

    return quantized_tensor, pos_scale, neg_scale

# Apply AFPQ to all model parameters and convert to lower precision
def _afpq_dequantize(quantized, pos_scale, neg_scale):
    """Map integer codes back to float32 using the sign-specific scales."""
    codes = quantized.to(torch.float32)
    # Positive codes were produced with pos_scale and negative codes with
    # neg_scale, so each sign is rescaled with its own factor.
    return codes.clamp(min=0) * pos_scale + codes.clamp(max=0) * neg_scale


quantized_model_state_dict = {}
scales = {}

for name, param in model.named_parameters():
    quantized_data, pos_scale, neg_scale = afpq_quantization(param.data, bits=4)
    quantized_model_state_dict[name] = quantized_data
    # Dots are replaced because the scales are later registered as buffers,
    # and buffer names may not contain ".".
    buffer_prefix = name.replace('.', '_')
    scales[f"{buffer_prefix}_pos_scale"] = torch.tensor(pos_scale, dtype=torch.float32)
    scales[f"{buffer_prefix}_neg_scale"] = torch.tensor(neg_scale, dtype=torch.float32)

# Save quantized model parameters and scales
torch.save(quantized_model_state_dict, "quantized_model_state_dict.pth")
torch.save(scales, "scales.pth")

print("Quantized model saved successfully!")

# Measure and compare model sizes and inference time
# Sizes use each tensor's real element size instead of hard-coded byte counts.
# The already-loaded `model` is reused: quantization above only reads its
# parameters, so there is no need to load the checkpoint again.
original_model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 2)
quantized_model_size = sum(
    t.numel() * t.element_size() for t in quantized_model_state_dict.values()
) / (1024 ** 2)  # codes are stored as int8

original_model = model
text = "This is a sample text for inference measurement."

original_inference_time = measure_inference_time(original_model, tokenizer, text)

# Load the quantized model state dict for inference
# The integer codes must be dequantized first: loading them directly would
# copy raw code values (e.g. -8..7) into the float weights and ignore the
# scales entirely.
dequantized_state_dict = {}
for name, quantized_data in quantized_model_state_dict.items():
    buffer_prefix = name.replace('.', '_')
    dequantized_state_dict[name] = _afpq_dequantize(
        quantized_data,
        scales[f"{buffer_prefix}_pos_scale"],
        scales[f"{buffer_prefix}_neg_scale"],
    )

quantized_model = AutoModel.from_pretrained(model_name)
# strict=False: the dict is built from named_parameters() only, so any
# non-parameter entries of the model's state dict are absent.
quantized_model.load_state_dict(dequantized_state_dict, strict=False)
for name, buffer in scales.items():
    quantized_model.register_buffer(name, buffer)

# NOTE: the dequantized weights are float32, so this times the same float
# computation as the original model; it reflects the effect of quantization
# on the weights, not a low-precision kernel speed-up.
quantized_inference_time = measure_inference_time(quantized_model, tokenizer, text)

print(f"Original model size: {original_model_size:.2f} MB")
print(f"Quantized model size: {quantized_model_size:.2f} MB")
print(f"Original model inference time: {original_inference_time:.6f} seconds")
print(f"Quantized model inference time: {quantized_inference_time:.6f} seconds")


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Quantized model saved successfully!
Original model size: 417.64 MB
Quantized model size: 104.41 MB
Original model inference time: 0.061620 seconds
Quantized model inference time: 0.051524 seconds
