# Implementation of Quantization from Scratch
In this notebook, we gonna try to implement the quantization from scratch using numpy.

## 1. Imports

In [1]:
import numpy as np

# Suppress scientific notation
np.set_printoptions(suppress=True)

## 2. Generate parameters
In LLMs, quantization would be done on the model weights. Consider a 1d weight matrix.

In [2]:
# Generate randomly distributed parameters
params = np.random.uniform(low=-50, high=150, size=20)

Let's round the numbers to the second decimal place

In [3]:
params = np.round(params, 2)

# Print the parameters
print(params)

[ 73.01 -42.44 114.48  13.72  77.91 145.29  93.61 104.81  90.27  85.97
  81.49 110.62 100.19 -39.48  92.56 109.81  66.74 108.35 109.48  11.7 ]


### 3. Helper Methods
Let's create list of helper methods for quantization such as clamp, asymmetric_quantize, symmetric_quantize, etc.

**Clamp Function**

In [4]:
def clamp(params_q: np.array, lower_bound: int, upper_bound: int) -> np.array:
    params_q[params_q < lower_bound] = lower_bound
    params_q[params_q > upper_bound] = upper_bound
    return params_q

**Asymmetric Quantization with Min-Max Approach**

In [5]:
def asymmetric_quantization(params: np.array, bits: int) -> tuple[np.array, float, int]:
    # Calculate the scale and zero point
    alpha = np.max(params)
    beta = np.min(params)
    scale = (alpha - beta) / (2**bits-1)
    zero = -1*np.round(beta / scale)
    lower_bound, upper_bound = 0, 2**bits-1
    
    # Quantize the parameters
    quantized = clamp(np.round(params / scale + zero), lower_bound, upper_bound).astype(np.int32)
    return quantized, scale, zero

**Asymmetric De-quantization with Min-Max Approach**

In [6]:
def asymmetric_dequantize(params_q: np.array, scale: float, zero: int) -> np.array:
    return (params_q - zero) * scale

**Symmetric Quantization with Min-Max Approach**

In [7]:
def symmetric_quantization(params: np.array, bits: int) -> tuple[np.array, float]:
    # Calculate the scale
    alpha = np.max(np.abs(params))
    scale = alpha / (2**(bits-1)-1)
    lower_bound = -(2**(bits-1)-1)
    upper_bound = 2**(bits-1)-1
    # Quantize the parameters
    quantized = clamp(np.round(params / scale), lower_bound, upper_bound).astype(np.int32)
    return quantized, scale

**Symmetric De-quantization with Min-Max Approach**

In [8]:
def symmetric_dequantize(params_q: np.array, scale: float) -> np.array:
    return params_q * scale

**Quantization Error**

In [9]:
def quantization_error(params: np.array, params_q: np.array):
    # calculate the MSE
    return np.mean((params - params_q)**2)

## 4. Apply
Now, apply both asymmetric and symmetric quantization on above generated parameters.

In [10]:
(asymmetric_q, asymmetric_scale, asymmetric_zero) = asymmetric_quantization(params, 8) # 8bit representation
(symmetric_q, symmetric_scale) = symmetric_quantization(params, 8) # 8bit representation

print(f'Original:')
print(np.round(params, 2))
print('')
print(f'Asymmetric scale: {asymmetric_scale}, zero: {asymmetric_zero}')
print(asymmetric_q)
print('')
print(f'Symmetric scale: {symmetric_scale}')
print(symmetric_q)

Original:
[ 73.01 -42.44 114.48  13.72  77.91 145.29  93.61 104.81  90.27  85.97
  81.49 110.62 100.19 -39.48  92.56 109.81  66.74 108.35 109.48  11.7 ]

Asymmetric scale: 0.7361960784313725, zero: 58.0
[157   0 214  77 164 255 185 200 181 175 169 208 194   4 184 207 149 205
 207  74]

Symmetric scale: 1.144015748031496
[ 64 -37 100  12  68 127  82  92  79  75  71  97  88 -35  81  96  58  95
  96  10]


Now, again de-quantize the quantized values and check how much error is there.

In [11]:
# Dequantize the parameters back to 32 bits
params_deq_asymmetric = asymmetric_dequantize(asymmetric_q, asymmetric_scale, asymmetric_zero)
params_deq_symmetric = symmetric_dequantize(symmetric_q, symmetric_scale)

print(f'Original:')
print(np.round(params, 2))
print('')
print(f'Dequantize Asymmetric:')
print(np.round(params_deq_asymmetric,2))
print('')
print(f'Dequantize Symmetric:')
print(np.round(params_deq_symmetric, 2))

Original:
[ 73.01 -42.44 114.48  13.72  77.91 145.29  93.61 104.81  90.27  85.97
  81.49 110.62 100.19 -39.48  92.56 109.81  66.74 108.35 109.48  11.7 ]

Dequantize Asymmetric:
[ 72.88 -42.7  114.85  13.99  78.04 145.03  93.5  104.54  90.55  86.13
  81.72 110.43 100.12 -39.75  92.76 109.69  66.99 108.22 109.69  11.78]

Dequantize Symmetric:
[ 73.22 -42.33 114.4   13.73  77.79 145.29  93.81 105.25  90.38  85.8
  81.23 110.97 100.67 -40.04  92.67 109.83  66.35 108.68 109.83  11.44]


In [12]:
# Calculate the quantization error
print(f'{"Asymmetric error: ":>20}{np.round(quantization_error(params, params_deq_asymmetric), 2)}')
print(f'{"Symmetric error: ":>20}{np.round(quantization_error(params, params_deq_symmetric), 2)}')

  Asymmetric error: 0.05
   Symmetric error: 0.08


Now, we gonna use different approach while choosing the value for $\alpha$ and $\beta$

## Percentile
Now, we have to create new function for asymmetric_quantization

In [13]:
def asymmetric_quantization_percentile(params: np.array, bits: int, percentile: float = 99.99) -> tuple[np.array, float, int]:
    # find the percentile value
    alpha = np.percentile(params, percentile)
    beta = np.percentile(params, 100-percentile)
    scale = (alpha - beta) / (2**bits-1)
    zero = -1*np.round(beta / scale)
    lower_bound, upper_bound = 0, 2**bits-1
    quantized = clamp(np.round(params / scale + zero), lower_bound, upper_bound).astype(np.int32)
    return quantized, scale, zero

Now, to see the power of percentile. We first going to add an outlier inside  parameters

In [14]:
# Introduce an outlier
params[-1] = 1000

In [15]:
params

array([  73.01,  -42.44,  114.48,   13.72,   77.91,  145.29,   93.61,
        104.81,   90.27,   85.97,   81.49,  110.62,  100.19,  -39.48,
         92.56,  109.81,   66.74,  108.35,  109.48, 1000.  ])

## 5. Compare min-max and percentile range selection strategies

In [16]:
(asymmetric_q, asymmetric_scale, asymmetric_zero) = asymmetric_quantization(params, 8)
(asymmetric_q_percentile, asymmetric_scale_percentile, asymmetric_zero_percentile) = asymmetric_quantization_percentile(params, 8)

print(f'Original:')
print(np.round(params, 2))
print('')
print(f'Asymmetric (min-max) scale: {asymmetric_scale}, zero: {asymmetric_zero}')
print(asymmetric_q)
print(f'')
print(f'Asymmetric (percentile) scale: {asymmetric_scale_percentile}, zero: {asymmetric_zero_percentile}')
print(asymmetric_q_percentile)

Original:
[  73.01  -42.44  114.48   13.72   77.91  145.29   93.61  104.81   90.27
   85.97   81.49  110.62  100.19  -39.48   92.56  109.81   66.74  108.35
  109.48 1000.  ]

Asymmetric (min-max) scale: 4.088, zero: 10.0
[ 28   0  38  13  29  46  33  36  32  31  30  37  35   0  33  37  26  37
  37 255]

Asymmetric (percentile) scale: 4.081609517647049, zero: 10.0
[ 28   0  38  13  29  46  33  36  32  31  30  37  35   0  33  37  26  37
  37 255]


In [17]:
# Dequantize the parameters back to 32 bits
params_deq_asymmetric = asymmetric_dequantize(asymmetric_q, asymmetric_scale, asymmetric_zero)
params_deq_asymmetric_percentile = asymmetric_dequantize(asymmetric_q_percentile, asymmetric_scale_percentile, asymmetric_zero_percentile)

print(f'Original:')
print(np.round(params, 2))
print('')
print(f'De-quantized (min-max):')
print(np.round(params_deq_asymmetric,2))
print('')
print(f'De-quantized (percentile):')
print(np.round(params_deq_asymmetric_percentile,2))

Original:
[  73.01  -42.44  114.48   13.72   77.91  145.29   93.61  104.81   90.27
   85.97   81.49  110.62  100.19  -39.48   92.56  109.81   66.74  108.35
  109.48 1000.  ]

De-quantized (min-max):
[  73.58  -40.88  114.46   12.26   77.67  147.17   94.02  106.29   89.94
   85.85   81.76  110.38  102.2   -40.88   94.02  110.38   65.41  110.38
  110.38 1001.56]

De-quantized (percentile):
[ 73.47 -40.82 114.29  12.24  77.55 146.94  93.88 106.12  89.8   85.71
  81.63 110.2  102.04 -40.82  93.88 110.2   65.31 110.2  110.2  999.99]


In [18]:
# Calculate the quantization error
print(f'{"Error (min-max) excluding outlier: ":>40}{np.round(quantization_error(params[:-1], params_deq_asymmetric[:-1]),2)}')
print(f'{"Error (percentile) excluding outlier: ":>40}{np.round(quantization_error(params[:-1], params_deq_asymmetric_percentile[:-1]),2)}')

     Error (min-max) excluding outlier: 1.38
  Error (percentile) excluding outlier: 1.23
