In [1]:
# Make sure to install these two libraries
# !pip install transformers
# !pip install -U "huggingface_hub[cli]"  #For hugging face authentication

# First of all, import all the necessary libraries.
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Since we're taking basemodel-facebook/opt-350m from huggingface,
# we must authenticate first. Please create your own token with huggingface

!huggingface-cli login --token hf_THkbLhyIHHmluGkwwnzpXOvR##########

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
# Define QuantizedLinearLayer class
class QuantizedLinearLayer(nn.Module):
    """Inference-only stand-in for ``nn.Linear`` that stores its weight as int8.

    The layer keeps an int8 ``weight`` buffer of shape ``(out_features, in_features)``
    plus one ``scale`` per output row (symmetric, per-channel quantization).
    ``forward`` casts the int8 weight to the input dtype, applies ``F.linear`` and
    multiplies the result by ``scale``, which is equivalent to multiplying by the
    de-quantized weight.

    All state is stored with ``register_buffer`` instead of ``nn.Parameter`` because
    the layer is not meant to be trained and buffers are never given gradients.

    Args:
        in_features: number of input features (same meaning as in ``nn.Linear``).
        out_features: number of output features.
        bias: whether the layer adds a bias to its output.
        dtype: dtype of the ``scale`` and ``bias`` buffers.
    """

    def __init__(self, in_features, out_features, bias = True, dtype=torch.float32):
        super().__init__()

        int8_range = torch.iinfo(torch.int8)

        # Placeholder weight covering the full signed int8 range. The upper bound of
        # torch.randint is exclusive, hence ``max + 1`` so that 127 can be drawn too.
        self.register_buffer(
            "weight",
            torch.randint(int8_range.min, int8_range.max + 1, (out_features, in_features)).to(torch.int8),
        )

        # One scale per output row; it multiplies the output of the int8 matmul.
        self.register_buffer("scale", torch.randn((out_features), dtype=dtype))

        # The bias is optional. Shape (1, out_features) broadcasts over the batch.
        # A missing bias is registered as a ``None`` buffer so the name stays a buffer.
        if bias:
            self.register_buffer("bias", torch.randn((1, out_features), dtype=dtype))
        else:
            self.register_buffer("bias", None)

    def quantize(self, weight):
        """Quantize a 2-D floating point ``weight`` to int8 and store it with its scales.

        Uses symmetric per-channel quantization: every row gets its own scale
        ``max(|row|) / 127`` and is stored as ``round(row / scale)`` clamped to the
        int8 range. Per-channel scales cost a little more memory than a single
        per-tensor scale but track each row's value range more closely.

        Args:
            weight: tensor of shape ``(out_features, in_features)``. It may be an
                ``nn.Parameter``; nothing computed here is tracked by autograd.
        """
        Qmin = torch.iinfo(torch.int8).min
        Qmax = torch.iinfo(torch.int8).max

        # Without no_grad the stored scale would carry a grad_fn and keep the fp32
        # copy of the original weight alive for as long as the layer exists.
        with torch.no_grad():
            # Do the arithmetic in fp32 regardless of the storage dtype.
            weight_f32 = weight.to(torch.float32)

            # Largest magnitude of each row mapped onto the largest int8 value.
            scale = (weight_f32.abs().max(dim=-1).values / Qmax).to(weight.dtype)

            # Divide by the scale exactly as it will be stored, so quantization and
            # de-quantization agree. Rows whose scale is 0 (all-zero rows, or a scale
            # that underflowed in the storage dtype) are divided by 1 instead of 0,
            # which quantizes them to 0 instead of producing NaN.
            divisor = scale.to(torch.float32)
            divisor = torch.where(divisor == 0, torch.ones_like(divisor), divisor)

            quantized_weight = torch.clamp(
                torch.round(weight_f32 / divisor.unsqueeze(-1)), Qmin, Qmax
            ).to(torch.int8)

        self.weight = quantized_weight
        self.scale = scale

    def forward(self, input):
        """Apply the linear map using the int8 weight, the per-row scales and the bias."""
        # The int8 weight is cast to the input dtype for the matmul; scaling the
        # output per feature is equivalent to scaling each weight row beforehand.
        output = F.linear(input, self.weight.to(input.dtype)) * self.scale
        if self.bias is not None:
            output = output + self.bias
        return output

def replace_linearlayer(base_model, quantizer_class, exclude_list, quantized=True):
    """Recursively replace the ``nn.Linear`` layers of ``base_model`` in place.

    Every direct or nested child that is an ``nn.Linear`` and whose attribute name
    is not in ``exclude_list`` is swapped for a ``quantizer_class`` instance built
    from the old layer's ``in_features``, ``out_features``, bias presence and weight
    dtype.

    Args:
        base_model: module whose children are inspected; it is modified in place.
        quantizer_class: class called as
            ``quantizer_class(in_features, out_features, has_bias, dtype)``. Its
            instances must provide ``quantize(weight)`` and a ``bias`` attribute.
        exclude_list: child names (e.g. ``"lm_head"``) that must stay ``nn.Linear``.
        quantized: when True the old weight is quantized into the new layer. When
            False only the layer is swapped and its weight/scale keep whatever the
            constructor put there (useful to build a skeleton before loading
            already-quantized values).

    Returns:
        None.
    """
    # Snapshot the children: the loop body replaces entries of the module being walked.
    for name, child in list(base_model.named_children()):
        if isinstance(child, nn.Linear) and name not in exclude_list:
            old_bias = child.bias
            old_weight = child.weight

            # Build the replacement with the same geometry as the layer it replaces.
            quantizer_layer = quantizer_class(
                child.in_features, child.out_features, old_bias is not None, old_weight.dtype
            )
            # Keep the new layer on the device of the layer it replaces; otherwise the
            # constructor's buffers stay on the default device when quantized=False.
            quantizer_layer.to(old_weight.device)

            setattr(base_model, name, quantizer_layer)

            # Hand over a detached weight so nothing computed from it is tied to the
            # autograd graph of the discarded parameter.
            if quantized:
                quantizer_layer.quantize(old_weight.detach())

            # Carry the bias over as a plain tensor. Assigning the nn.Parameter itself
            # would register it as a trainable parameter on an inference-only layer.
            if old_bias is not None:
                quantizer_layer.bias = old_bias.detach()

        # Anything else (containers, excluded layers, other module types) is searched
        # for nested linear layers with the same settings.
        else:
            replace_linearlayer(child, quantizer_class, exclude_list, quantized=quantized)



In [3]:
# The tokenizer and the weights come from the same Hub checkpoint.
# torch_dtype=torch.bfloat16 asks for the weights to be loaded in bfloat16.
checkpoint = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [4]:
# Print the module tree of the untouched model so it can be compared with the
# tree printed after the linear layers have been replaced.
base_banner = "facebook/opt-350m: base model architecture before quantization"
print(base_banner, "-"*50, model, sep="\n")

facebook/opt-350m: base model architecture
--------------------------------------------------
OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): 

In [5]:
# Baseline: footprint of the model as loaded, before any layer is replaced.
# The value is divided by 1e9 so it is reported in (decimal) GB.
model_memory_size_before_quantization = model.get_memory_footprint()
size_before_gb = model_memory_size_before_quantization / 1e+9
print(f"Total memory size before quantization (in GB): {size_before_gb}")

Total memory size before quantization (in GB): 0.662392832


In [6]:
# Generate a continuation with the base (not yet quantized) model.
# The same prompt is used again after quantization so both outputs can be compared.
base_prompt = "Malaysia is a beautiful country and "
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
pipe(base_prompt, max_new_tokens=50)

[{'generated_text': 'Malaysia is a beautiful country and  I would love to visit it.\nI would love to visit Malaysia too.'}]

In [7]:
# Replace the linear layers of `model` in place with QuantizedLinearLayer.
#   - ["lm_head"] lists the layer names that must stay as regular nn.Linear.
#   - quantized=True also converts the existing weights to int8.
#     With quantized=False the layers are only swapped and the weights are not
#     quantized: this builds an empty skeleton into which a quantized model that
#     was saved earlier (e.g. on the Hugging Face Hub) can later be loaded.
# NOTE: this mutates `model`; reload it before re-running the cells above.
layers_to_keep = ["lm_head"]
replace_linearlayer(model, QuantizedLinearLayer, layers_to_keep, quantized=True)

quantized_banner = "facebook/opt-350m: quantized model architecture"
print(quantized_banner, "-"*50, model, sep="\n")

facebook/opt-350m: quantized model architecture
--------------------------------------------------
OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): QuantizedLinearLayer()
      (project_in): QuantizedLinearLayer()
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): QuantizedLinearLayer()
            (v_proj): QuantizedLinearLayer()
            (q_proj): QuantizedLinearLayer()
            (out_proj): QuantizedLinearLayer()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): QuantizedLinearLayer()
          (fc2): QuantizedLinearLayer()
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head)

In [8]:
# Footprint of the same model object after its linear layers were replaced;
# compare with the value printed before quantization.
model_memory_size_after_quantization = model.get_memory_footprint()
size_after_gb = model_memory_size_after_quantization / 1e+9
print(f"Total memory size after quantization (in GB): {size_after_gb}")

Total memory size after quantization (in GB): 0.359799808


In [9]:
# Run the same prompt through the quantized model to check that the generated
# text is still sensible after the weights were converted to int8.
quantized_prompt = "Malaysia is a beautiful country and "
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
pipe(quantized_prompt, max_new_tokens=50)

[{'generated_text': 'Malaysia is a beautiful country and  I would love to visit it.\nI would love to visit Malaysia too.'}]