In [None]:
!pip install optimum
!pip install auto-gptq

In [2]:
import torch
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize

# Quantization: Compressing LLMs

### Comparing the in-memory size of the normal model with the quantized model
#### We will be working with facebook/opt-125m

In [None]:
# Target device used for the baseline model and for all tokenized inputs.
device = "cuda"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the unquantized baseline (tokenizer + causal LM) and move it to `device`.
model_checkpoint = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
model = model.to(device)

In [6]:
# Memory footprint of the baseline model, converted from bytes by dividing by 1024 * 1024.
unquantized_memory_footprint = model.get_memory_footprint() / (1024 * 1024)

#### For details on the `GPTQConfig` parameters, see the [Hugging Face documentation](https://huggingface.co/docs/transformers/v4.33.0/en/main_classes/quantization)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit GPTQ configuration using the "c4" dataset and the tokenizer loaded above.
quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
)

# Passing `quantization_config` to `from_pretrained` yields the quantized model.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)

In [8]:
# Memory footprint of the quantized model, converted from bytes by dividing by 1024 * 1024.
quantized_memory_footprint = quantized_model.get_memory_footprint() / (1024 * 1024)

In [9]:
# Report both footprints and the difference between them on a single line.
size_report = f'Original Model Size:{unquantized_memory_footprint} MB, Quantized Model Size:{quantized_memory_footprint} MB Difference:{unquantized_memory_footprint-quantized_memory_footprint} MB'
print(size_report)

Original Model Size:477.75 MB, Quantized Model Size:119.2734375 MB Difference:358.4765625 MB


#### Check the Quantized Model

In [10]:
# Inspect the query projection of the first decoder layer in the quantized model.
# Its buffers contain `qweight` and `qzeros` tensors in int32 format, whereas the
# normal model has only weights and bias in FP16/FP32 format.
vars(quantized_model.model.decoder.layers[0].self_attn.q_proj)

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict([('qweight',
               tensor([[ 1711760090, -1248295259, -2025411892,  ..., -1486452502,
                         2019142072, -1735820810],
                       [-2000132747,  -578262345,  1484081337,  ..., -1230600537,
                        -2019252040, -2023311003],
                       [ -710293851, -1153090188,  1431922298,  ..., -1768449094,
                         2042194587, -2004125258],
                       ...,
                       [-1183500136, -1494510422, -1772782904,  ..., -1518753378,
                         -411710600,  -392845654],
                       [-1990626701,  1469278281,  1469864108,  ...,  1740208533,
                        -1732560507, -1738077576],
                       [ 2015914598,  2040232821,  2005572185,  ..., -1463179655,
                        -1450400136, -2024523156]], device='cuda:0', dtype=torch.int32)),
              ('qzeros',
               tensor(

# Comparing outputs

#### Quantized Model output

In [11]:
# Tokenize the prompt, move the tensors to `device`, and generate with the quantized model.
text = "ML is different from classical algorithm in the fact that"
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

out = quantized_model.generate(
    **inputs,
    max_length=100,
    min_new_tokens=20,
)
# Decode only the first returned sequence, dropping special tokens.
print(tokenizer.decode(out[0], skip_special_tokens=True))

ML is different from classical algorithm in the fact that it is a very complex algorithm.                                                                                  


#### Normal model output

In [12]:
# Same prompt and generation settings, this time with the unquantized baseline model.
text = "ML is different from classical algorithm in the fact that"
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

out = model.generate(
    **inputs,
    max_length=100,
    min_new_tokens=20,
)
# Decode only the first returned sequence, dropping special tokens.
print(tokenizer.decode(out[0], skip_special_tokens=True))

ML is different from classical algorithm in the fact that it is not a linear algorithm.
I'm not sure what you mean by linear.
It's a linear algorithm that is not linear in that it is not a linear algorithm.
I'm not sure what you mean by linear.
It's a linear algorithm that is not linear in that it is not a linear algorithm.
I'm not sure what you mean by linear.
It's a linear algorithm that is not linear in


### Quantizing a model with a custom dataset

In [None]:
# Quantize the same checkpoint again, this time passing a user-supplied list of
# strings as the dataset instead of a named dataset.
# The tokenizer is loaded first, from the same identifier as the model, and is
# passed to GPTQConfig explicitly so that the config, tokenizer and model are
# guaranteed to refer to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    desc_act=False,
    dataset=["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."],
    tokenizer=tokenizer,
)

quant_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

In [14]:
# Generate with the model quantized on the custom dataset, using the same prompt
# and settings as the earlier comparisons.
# Alternative prompt to try: "Hello my name is"
text = "ML is different from classical algorithm in the fact that"
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

out = quant_model.generate(
    **inputs,
    max_length=100,
    min_new_tokens=20,
)
# Decode only the first returned sequence, dropping special tokens.
print(tokenizer.decode(out[0], skip_special_tokens=True))

ML is different from classical algorithm in the fact that.                                                                                        
