In [15]:
from datasets import load_dataset

import torch
import torch.nn as nn

import os

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)


### Activation Scaling ###

Before quantization we need to calculate the scaling factors. In the activations, some channels differ strongly from the rest of the channels. To make them better suited for quantization, a per-channel scaling factor is calculated: the per-channel maximum of the activations over a small calibration set.

As a first step, we work with the calibration dataset. It can be obtained in two ways:

1. Load it at run time with `load_dataset`.
2. Load it from a pre-downloaded JSON file stored locally.

First, load it at run time.


In [49]:
# Fetch the validation split of the raw WikiText-2 corpus.
ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation")

# Skip empty / whitespace-only rows and keep the first 512 that remain.
calib_lines = [line for line in ds["text"] if line and line.strip()][:512]

# `longest` is selected by character count; note that the "len(...)" values
# printed below are whitespace-separated word counts, not character counts.
longest = max(calib_lines, key=len)

# Summary of the calibration set and of its first entry.
print(
    f"len(calib_lines): {len(calib_lines)}",
    f"calib_lines[0]: {calib_lines[0]}",
    f"len(calib_lines[0]): {len(calib_lines[0].split())}",
    sep="\n",
)

# The longest entry and its word count.
print(
    f"longest: {longest}",
    f"len(longest): {len(longest.split())}",
    sep="\n",
)

len(calib_lines): 512
calib_lines[0]:  = Homarus gammarus = 

len(calib_lines[0]): 4
longest:  Meridian is rightly considered an architectural treasure trove being one the nations most intact cities from the turn of the last century . Architecture students from around the nation and Canada are known to visit Meridian in groups as part of their coursework due to numerous structures in the city having been designed by noted architects . The only home in the US south designed by noted Canadian born Architect Louis S. Curtiss , famous for inventing the glass curtain wall skyscraper , is extant on Highland Park . The Frank Fort designed Threefoot Building is generally considered one of the best Art Deco skyscrapers in the US and is often compared to Detroit 's famed Fisher Building . Noted California Architect Wallace Neff designed a number of homes in Meridian as well as in the Alabama Black Belt which adjoins the city across the nearby Alabama State line . He had relatives in Meridian and

In [52]:
# Load the tokenizer for this model and pass each calibration sentence through it.
# The tokenizer is given one sentence at a time and returns a dictionary with the
# token-id tensor and the attention mask; that dictionary can be fed to the model.
# Decoding the token ids shows what the original sentence was.

# NOTE(review): machine-specific absolute path, not used by this cell; kept so the
# name stays defined for any later cell that relies on it.
data_path = '/home/tahir/workspace2/EntroQ/dataset/val.jsonl.zst'
MODEL_NAME = 'facebook/opt-125m'
NUM_SAMPLES = 512          # number of calibration sentences to tokenize
SEQ_LEN = 512              # truncation length applied to every sentence
MODEL_MAX_LENGTH = 512     # max length the tokenizer is configured with

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=MODEL_MAX_LENGTH)

# Keep every encoding instead of overwriting a single variable on each iteration,
# and slice by NUM_SAMPLES so a calibration set shorter than NUM_SAMPLES does not
# raise IndexError.
calib_encodings = [
    tokenizer(line, return_tensors="pt", max_length=SEQ_LEN, truncation=True)
    for line in calib_lines[:NUM_SAMPLES]
]

# The inspection below looks at the last tokenized sentence only.
input_ids = calib_encodings[-1]

print(f"input_ids['input_ids']: {input_ids['input_ids']}")
print("=================")
print("Decoded:", tokenizer.decode(input_ids["input_ids"][0]))
print("=================")
print(f"calib_lines[-1]: {calib_lines[-1]}")
print(f"input_ids: {input_ids}")




input_ids['input_ids']: tensor([[    2,    20, 19826,  2372,    32,    31,     5,  3072,   787,    12,
          1039, 10103,  2156,     8,    51,    32,  6329,  8260,    11,   494,
             8,   587,   479, 14201,  2156,  5676,  4775,  5621,    32,  1537,
           227,   587,     8,   779,   479,  8314,  5621,    11,     5,  1035,
           377,  3805,     7,    28,    55,  8067,     8,   747,  2592,  3841,
         14489,  5315,   479,  1437, 50118]])
Decoded: </s> The prevailing winds are from the west @-@ southwest, and they are normally strongest in March and April. Brief, intense thunderstorms are common between April and October. Thunderstorms in the summer months tend to be more isolated and often produce dry lightning strikes. 

calib_lines[-1]:  The prevailing winds are from the west @-@ southwest , and they are normally strongest in March and April . Brief , intense thunderstorms are common between April and October . Thunderstorms in the summer months tend to be more

In [53]:
# Same tokenization walk-through as before, but the calibration text is read from
# a pre-downloaded dataset file stored on disk.

# NOTE(review): machine-specific absolute path; point this at your own copy of
# the file (ideally via a configurable data directory) before running.
data_path = '/home/tahir/workspace2/EntroQ/dataset/val.jsonl.zst'
MODEL_NAME = 'facebook/opt-125m'
NUM_SAMPLES = 512          # number of rows to tokenize
SEQ_LEN = 512              # truncation length applied to every row
MODEL_MAX_LENGTH = 512     # max length the tokenizer is configured with


# The "json" loader is asked for the "train" split of the local file.
dataset = load_dataset("json", data_files=data_path, split="train")
print(f"dataset: {dataset}")
print("==============")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=MODEL_MAX_LENGTH)

# Keep every encoding instead of overwriting a single variable on each iteration,
# and cap the row count at len(dataset) so a file with fewer than NUM_SAMPLES
# rows does not raise an index error.
dataset_encodings = [
    tokenizer(dataset[i]["text"], return_tensors="pt", max_length=SEQ_LEN, truncation=True)
    for i in range(min(NUM_SAMPLES, len(dataset)))
]

# The inspection below looks at the last tokenized row only. It matches
# dataset[-1] as long as the dataset has at most NUM_SAMPLES rows.
input_ids = dataset_encodings[-1]


print(f"input_ids['input_ids']: {input_ids['input_ids']}")
print("=================")
print("Decoded:", tokenizer.decode(input_ids["input_ids"][0]))
print("=================")
print(f"dataset[-1]['text']: {dataset[-1]['text']}")
print(f"input_ids: {input_ids}")

dataset: Dataset({
    features: ['text'],
    num_rows: 512
})




input_ids['input_ids']: tensor([[    2,    20, 19826,  2372,    32,    31,     5,  3072,   787,    12,
          1039, 10103,  2156,     8,    51,    32,  6329,  8260,    11,   494,
             8,   587,   479, 14201,  2156,  5676,  4775,  5621,    32,  1537,
           227,   587,     8,   779,   479,  8314,  5621,    11,     5,  1035,
           377,  3805,     7,    28,    55,  8067,     8,   747,  2592,  3841,
         14489,  5315,   479,  1437,  1437]])
Decoded: </s> The prevailing winds are from the west @-@ southwest, and they are normally strongest in March and April. Brief, intense thunderstorms are common between April and October. Thunderstorms in the summer months tend to be more isolated and often produce dry lightning strikes.  
dataset[-1]['text']:  The prevailing winds are from the west @-@ southwest , and they are normally strongest in March and April . Brief , intense thunderstorms are common between April and October . Thunderstorms in the summer months tend to be 