In [12]:
# Simple example of packing: eight 4-bit values are folded into one 32-bit word.
W_BIT = 4                # width of one quantized value, in bits
PACK_NUM = 32 // W_BIT   # number of values that fit in a 32-bit word

vals = [0, 4, 8, 14, 2, 6, 10, 12]
assert len(vals) == PACK_NUM
# Every value must fit in W_BIT bits, otherwise the shifts below would make
# neighbouring nibbles bleed into each other.
assert all(0 <= val < (1 << W_BIT) for val in vals)

# Original AWQ packing code, quoted for reference only (it is not executed).
# Note that it indexes columns through `order_map`; the toy loop below packs
# the values in plain index order instead.
#
#   for i in range(pack_num):
#       qweight_col = intweight[:, col * pack_num + order_map[i]]
#       qweight[:, col] |= qweight_col << (i * awq_linear.w_bit)

# Values are packed right to left: vals[0] occupies the least-significant
# nibble and vals[-1] the most-significant one.
packed = 0
for idx, val in enumerate(vals):
    packed |= val << (idx * W_BIT)

assert packed == 0b11001010011000101110100001000000

# Zero-pad to the full word width. A bare 'b' format drops leading zeros, so
# the fixed-position slices below would silently misalign whenever the top
# nibble (vals[-1]) is smaller than 8.
packed_str = format(packed, '032b')
assert len(packed_str) == 32

# The string reads most-significant bit first, so vals[0] is the LAST nibble.
assert packed_str[28:] == '0000'    # vals[0] == 0
assert packed_str[24:28] == '0100'  # vals[1] == 4
assert packed_str[20:24] == '1000'  # vals[2] == 8
assert packed_str[0:4] == '1100'    # vals[7] == 12

In [22]:
# This is a breakdown of (numpy broadcasting
# + unpacking inside the original VLLM AWQ implementation)
# I think the original implementation is wrong
# NOTE(review): the original kernel itself is not shown here; the steps below
# are this notebook's reconstruction of it -- confirm against the real source.
import numpy as np

BLOCK_K = 2
PACKED_BLOCK_N = 2
BLOCK_N = 4
P = 2  # number of 4-bit values stored in each packed uint8
assert BLOCK_N == PACKED_BLOCK_N * P
# Packed:
# [1, 4]
# [17, 33]
# Unpacked (low nibble first, which is the order implied by shifter == [0, 4]):
# [1, 0, 4, 0]
# [1, 1, 1, 2]

b = np.array([[0b00000001, 0b00000100], [0b00010001, 0b00100001]], dtype=np.uint8)
shifter = np.array([0, 1]) * 4
AWQ_MASK = 0b1111     # Set the mask value to select the lower 4 bits of an 8-bit int

assert b.shape == (BLOCK_K, PACKED_BLOCK_N)

# Step 1: replicate every packed byte P times along a new middle axis.
# Shifting by a (1, P, 1) array of zeros leaves the values untouched; it is
# only there to trigger broadcasting to (BLOCK_K, P, PACKED_BLOCK_N).
expanded_b = b[:, None, :] >> np.array([0, 0])[None, :, None]
assert expanded_b.shape == (BLOCK_K, P, PACKED_BLOCK_N)
assert expanded_b.tolist() == [
    [[1, 4], [1, 4]],
    [[17, 33], [17, 33]],
]

# Step 2: broadcast the per-slot shift amounts to the same shape, again by
# combining them with zeros (this time zeros shaped like `b`).
empty_b = np.zeros_like(b)
expanded_shifter = shifter[None, :, None] >> empty_b[:, None, :]
assert expanded_shifter.shape == (BLOCK_K, P, PACKED_BLOCK_N)
assert expanded_shifter.tolist() == [
    [[0, 0], [4, 4]],
    [[0, 0], [4, 4]],
]

# Step 3: shift each copy by its slot's amount and keep the low 4 bits.
shifted = expanded_b >> expanded_shifter
masked = shifted & AWQ_MASK
assert masked.shape == (BLOCK_K, P, PACKED_BLOCK_N)

# Why the result looks wrong: the nibble axis (P) sits BEFORE the packed-column
# axis, so flattening the last two axes as-is lists every low nibble of a row
# first and every high nibble afterwards.
flattened = masked.reshape(BLOCK_K, BLOCK_N)
assert flattened.tolist() == [[1, 4, 0, 0], [1, 1, 1, 2]]

# Swapping the last two axes before flattening keeps the nibbles of each packed
# byte next to each other, which matches the "Unpacked" layout listed above.
interleaved = masked.transpose(0, 2, 1).reshape(BLOCK_K, BLOCK_N)
assert interleaved.tolist() == [[1, 0, 4, 0], [1, 1, 1, 2]]

masked_out = masked.tolist()
print(masked_out) # wrong

[[[1, 4], [0, 0]], [[1, 1], [1, 2]]]


In [21]:
# Broadcasting semantics, step by step.
# Summary:
# - Indexing with `None` inserts a new axis (dimension)
# - In terms of the printed array, that is one extra pair of brackets, either
#    - wrapped inside the preceding dimension (e.g. b[:, None]), or
#    - wrapped around the following dimension (e.g. b[None, :])
# - After the brackets are in place, the values are repeated along the new
#   axis until the shape matches the other operand
b = np.array([[1, 2, 3], [4, 5, 6]])

# (2, 1, 3) + (1, 2, 1) -> (2, 2, 3): each row of `b` shows up twice.
zero_pair = np.array([0, 0])
b_expanded = b[:, None, :] + zero_pair[None, :, None]
assert b_expanded.tolist() == [[row, row] for row in b.tolist()]

# (1, 3, 1) + (2, 1, 3) -> (2, 3, 3): each element of `b2` fills a whole row,
# and that 3x3 block is repeated once per row of `b`.
b2 = np.array([1, 2, 3])
zeros_like_b = np.zeros_like(b)
b_expanded = b2[None, :, None] + zeros_like_b[:, None, :]
assert b_expanded.tolist() == [
    [[value] * 3 for value in (1, 2, 3)]
    for _ in range(2)
]

[[[1 1 1]
  [2 2 2]
  [3 3 3]]

 [[1 1 1]
  [2 2 2]
  [3 3 3]]]
