In [None]:
%pip install transformers
%pip install -U "huggingface_hub[cli]"

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import OPTForCausalLM, AutoTokenizer, AutoConfig

from huggingface_hub import HfApi, create_repo
from huggingface_hub import hf_hub_download

# make sure to use your own auth token as I'll be deleting it after publish this post.
!huggingface-cli login --token hf_THkbLhyIHHmluGkwwnzpXOvR########## 
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

2024-06-29 09:20:25.786831: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-29 09:20:25.786951: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-29 09:20:25.951224: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  pid, fd = os.forkpty()


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
# Define quantizer class QuantizedLinearLayer and its functions quantize and foward function
class QuantizedLinearLayer(nn.Module):
    def __init__(self, in_features, out_features, bias = True, dtype=torch.float32):
        super().__init__()
        
        self.register_buffer("weight", torch.randint(-8, 7, (out_features, in_features)).to(torch.int8).to(device))
        self.register_buffer("scale", torch.randn((out_features), dtype=dtype).to(device))

        if bias:
            self.register_buffer("bias", torch.randn((1, out_features), dtype=dtype).to(device))
        else:
            self.bias = None

# For 4-bit quantization
    def quantize(self, weight):
        weight_f32 = weight.clone().to(torch.float32).to(device)
        scale = weight_f32.abs().max(dim=-1).values/7
        scale = scale.to(weight.dtype)

        quantized_weight = torch.clamp(torch.round(weight/scale.unsqueeze(1)), -8, 7).to(torch.int8).to(device)    
        # Further quantized the weight to 4-bit by PACKING THE WEIGHT
        quantized_weight4bit = pack_weights(quantized_weight) 

        self.weight = quantized_weight4bit    
        self.scale = scale
    def forward(self, input):
        # unpack the self.weight first
        unpacked_weight = unpack_weights(self.weight)     
        print(unpacked_weight.shape)
        output = F.linear(input, unpacked_weight.to(input.dtype)) * self.scale
        if self.bias is not None:
            output = output + self.bias
        return output
    
def replace_linearlayer(base_model, quantizer_class, exception_list, quantized=True):
    for name, child in base_model.named_children():
        if isinstance(child, nn.Linear) and not any([x == name for x in exception_list]):
            old_bias = child.bias
            old_weight = child.weight
            in_features = child.in_features
            out_features = child.out_features

          # intantiate a quantizer class layer
            if quantized:
                quantizer_layer = quantizer_class(in_features, out_features, old_bias is not None, old_weight.dtype).to(device)
            else:
                in_features = in_features//2
                quantizer_layer = quantizer_class(in_features, out_features, old_bias is not None, old_weight.dtype).to(device)

          # replace the name with quantizer_module layer
            setattr(base_model, name, quantizer_layer)

          # since the base_model name is now replaced with quantizer_module, we can call its quantize function to quantize the old_weight. the weight of the quantizer layer is a quantized weight with int8 type
            if quantized:
                getattr(base_model, name).quantize(old_weight)

          # we can also update the quantizer module bias with the old_bias if it is not none
            if old_bias is not None:
                getattr(base_model, name).bias = old_bias

        # if the child has further any sub linear layer, we can invoke the function again and loop inside the child. Pass the child in place of base_model
        else:
            replace_linearlayer(child, quantizer_class, exception_list, quantized=quantized)
          

In [3]:
def pack_weights(quantized_weight8bit):
  # given a tensor with 2bit encoded value encoded_weight4bit. total number of value in encoded_weight4bit * bits(2bit encoded in this case). that should be divisible by 8. 
  # why divisible by 8? becuase we're storing number of 2bit encoded value in new 8bit tensor which sholud fit. PyTorch only support int8 precisio, not int2 or 4.     
    bits = 4
    if quantized_weight8bit.shape[-1] * bits % 8 != 0:
        raise ValueError("encoded_weight4bit.shape[0] * bits shoul be divisible by 8")

  # total number of int8 values after int2 are packed in  
    num_values = quantized_weight8bit.shape[-1] * bits // 8

  # total number of 2-bit value within a single int8 packed tensor. num_values is total number of int8 packed tensor.
    num_steps = 8 // bits    #8 is the total number of bit in 8bit and dividing by bits can give the total number of individual 2-bit encoded value located/packed inside that single int8 value tensor  
    packed_weights = torch.zeros((quantized_weight8bit.shape[0], num_values), dtype=torch.int8).to(device)
    weight_index = 0
    
    for row in range(quantized_weight8bit.shape[0]):
        weight_index = 0
        for i in range(num_values):
            for j in range(num_steps):
                if j==0 and quantized_weight8bit[row,weight_index] < 0:   
                    encoded_weight4bit_zero = 0 # First value of packed_tensor shouldn't be negative - for now can't find logic yet
                    packed_weights[row, i] |= encoded_weight4bit_zero << bits * j        
                else:
                    packed_weights[row, i] |= quantized_weight8bit[row,weight_index] << bits * j
                weight_index += 1
    return packed_weights
  
def unpack_weights(packed_weights):
  # how many 2-bit value are there in the entire packed_tensor
  # first calcualte totals bits = packed_tensor.shape[0](total no of values in the packed tensor ) * 8 (each value is a unsigned 8bit tensor)
  # then divide it by unpacked bits or original bits that we encoded - bits  
    bits = 4
    packed_weights = packed_weights.to(torch.int8).to(device)
    num_values = packed_weights.shape[-1] * 8 // bits

  # number of steps is how many encoded bits value is in the single packed tensor value
    num_steps = 8 // bits

  #lets initialized a unpacked_tensor with zero and later to be update with actual 2bit encoded value
  # first we'll just extract encoded value in int8 and later we'll extract only the 2-bit part, we'll see how
    unpacked_weights = torch.zeros((packed_weights.shape[0],num_values), dtype = torch.int8).to(device)
    
    for row in range(packed_weights.shape[0]):
        unpacked_index = 0
        for i in range(packed_weights.shape[-1]):
            for j in range(num_steps):
                unpacked_weights[row, unpacked_index] |= packed_weights[row, i] >> bits * j
                unpacked_index += 1
            mask = 2**bits - 1
            unpacked_weights[row] &= mask
            unpacked_weights[row] = optimize_unpacked_weights(unpacked_weights[row])
    return unpacked_weights

def optimize_unpacked_weights(unpacked_weights):
    updated_unpacked_weights = torch.zeros(unpacked_weights.shape[0], dtype=torch.int8).to(device)
    for i in range(unpacked_weights.shape[0]):
        a_binary = format(unpacked_weights[i].item(), '04b')
        a3=int(a_binary[0])
        a2=int(a_binary[1])
        a1=int(a_binary[2])
        a0=int(a_binary[3])

        if a3 == 1:
            updated_unpacked_weights[i] = -a3*pow(2, 3) + a2*pow(2, 2) + a1*pow(2, 1) + a0*pow(2, 0)
        else:
            updated_unpacked_weights[i]= a3*pow(2, 3) + a2*pow(2, 2) + a1*pow(2, 1) + a0*pow(2, 0)
    return updated_unpacked_weights

In [6]:
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype=torch.bfloat16)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), ep

In [7]:
print("facebook/opt-125m: base model architecture")
print("-"*50)
print(model)

facebook/opt-125m: base model architecture
--------------------------------------------------
OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_

In [8]:
model_memory_size_before_quantization = model.get_memory_footprint()
print(f"Total memory size before quantization (in GB): {model_memory_size_before_quantization / 1e+9}")

Total memory size before quantization (in GB): 0.250478592


In [9]:
# Let's perform inference on this facebook/opt-125m base model
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
pipe("Malaysia is a beautiful country and ", max_new_tokens=50)

[{'generated_text': "Malaysia is a beautiful country and  I'm glad to see it is getting better.\nI'm glad to see Malaysia getting better.  I'm glad to see Malaysia getting better.  I'm glad to see Malaysia getting better.  I'm glad to see Malaysia getting better."}]

In [10]:
# The base model seems to be working good. Lets start quantizing this facebook/opt-125m model by calling our custom quantizer which we've builded earlier.
replace_linearlayer(model, QuantizedLinearLayer, ["lm_head"], quantized=True)
print("facebook/opt-125m: quantized model architecture")
print("-"*50)
print(model)

facebook/opt-125m: quantized model architecture
--------------------------------------------------
OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): QuantizedLinearLayer()
            (v_proj): QuantizedLinearLayer()
            (q_proj): QuantizedLinearLayer()
            (out_proj): QuantizedLinearLayer()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): QuantizedLinearLayer()
          (fc2): QuantizedLinearLayer()
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(i

In [11]:
model_memory_size_after_quantization = model.get_memory_footprint()
print(f"Total memory size after quantization (in GB): {model_memory_size_after_quantization / 1e+9}")

Total memory size after quantization (in GB): 0.123242496


In [12]:
# save the quantize model and push to huggingface hub for later inferencing use for all users
# saving in local directory
torch.save(model.state_dict(), "quantized_opt125_state_dict.pth")

In [13]:
api = HfApi()

api.upload_file(
    path_or_fileobj="quantized_opt125_state_dict.pth",
    path_in_repo = "quantized_opt125_state_dict.pth",
    repo_id = "tamangmilan/quantized_facebook_opt_125m",    
)

quantized_opt125_state_dict.pth:   0%|          | 0.00/123M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tamangmilan/quantized_facebook_opt_125m/commit/9698c0f8724a93d04f46db33df3eb27390f8ac57', commit_message='Upload quantized_opt125_state_dict.pth with huggingface_hub', commit_description='', oid='9698c0f8724a93d04f46db33df3eb27390f8ac57', pr_url=None, pr_revision=None, pr_num=None)

In [4]:
# AutoConfig gets all the skeleton of model architecture 
config = AutoConfig.from_pretrained("facebook/opt-125m")

# using this skeleton, we'll initialize the model withouht weight, just empty model with everythign else same of the architecture
with torch.device("meta"):
  new_model = OPTForCausalLM(config)

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
print(new_model)



config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), ep

In [5]:
# Replace only the linear layer but dont perform quantization,hence quantized=False.
replace_linearlayer(new_model, QuantizedLinearLayer, ["lm_head"], quantized=False)
print(new_model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): QuantizedLinearLayer()
            (v_proj): QuantizedLinearLayer()
            (q_proj): QuantizedLinearLayer()
            (out_proj): QuantizedLinearLayer()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): QuantizedLinearLayer()
          (fc2): QuantizedLinearLayer()
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=50272, bias=False)
)


In [6]:
state_dict_cache_path = hf_hub_download(
    repo_id="tamangmilan/quantized_facebook_opt_125m",    
    filename="quantized_opt125_state_dict.pth"
)
state_dict = torch.load(state_dict_cache_path)

quantized_opt125_state_dict.pth:   0%|          | 0.00/123M [00:00<?, ?B/s]

In [7]:
new_model.load_state_dict(state_dict, strict=True, assign=True)

<All keys matched successfully>

In [9]:
# Let's perform inference on this gemma-2b base model
pipe = pipeline("text-generation", model=new_model, tokenizer=tokenizer)
pipe("Malaysia is a beautiful country and ", max_new_tokens=50)

torch.Size([768, 768])
torch.Size([768, 768])



KeyboardInterrupt

