The main idea here is that I want to guess what are the most important keys ahead of time ()

In [1]:
# For boba

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import torch
print(torch.cuda.device_count())

2


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "meta-llama/Llama-2-7b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(model_name,torch_dtype=torch.float16, attn_implementation='sdpa', device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [3]:
from typing import Optional, Tuple
from transformers.cache_utils import Cache
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
import math
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from time import perf_counter

import numpy as np
import torch
import triton
import triton.language as tl

In [4]:
@triton.jit
def _triton_block_sparse_attn_fwd_kernel(
    Q, K, seqlens, sm_scale,
    block_index,
    Out,
    stride_qz, stride_qh, stride_qm, stride_qk,
    stride_kz, stride_kh, stride_kn, stride_kk,
    stride_oz, stride_oh, stride_om, stride_ok,
    Z, H, N_CTX,
    NUM_ROWS, MAX_BLOCKS_PRE_ROW,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    dtype: tl.constexpr,
):
    start_m = tl.program_id(0)
    off_hz = tl.program_id(1)

    seqlen = tl.load(seqlens + off_hz // H)
    if start_m * BLOCK_M >= seqlen:
        return

    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    offs_d = tl.arange(0, BLOCK_DMODEL)

    qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
    kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh

    q_ptrs = Q + qo_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
    k_ptrs = K + kv_offset + offs_d[:, None] * stride_kk
    o_ptrs = Out + qo_offset + offs_m[:, None] * stride_om + offs_d[None, :] * stride_ok

    blocks_ptr = block_index + (off_hz * NUM_ROWS + start_m) * MAX_BLOCKS_PRE_ROW


    tl.static_print(f"O_ptrs {o_ptrs.shape[0]} {o_ptrs.shape[1]}")
    tl.static_print(f"Offs_n {offs_n.shape[0]} {offs_n.shape[0]}")

    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    # acc = tl.zeros([BLOCK_M, MAX_BLOCKS_PRE_ROW * BLOCK_N], dtype=tl.float32)
    # scale sm_scale by log_2(e) and use
    # 2^x instead of exp in the loop because CSE and LICM
    # don't work as expected with `exp` in the loop
    qk_scale = sm_scale * 1.44269504
    # load q: it will stay in SRAM throughout
    q = tl.load(q_ptrs)
    q = (q * qk_scale).to(dtype)

    # loop over k, v and update accumulator
    m_mask = offs_m[:, None] < seqlen
    block_count = tl.minimum((start_m + 1) * BLOCK_M // BLOCK_N, MAX_BLOCKS_PRE_ROW)

    for sparse_block_idx in range(block_count):
        real_block_idx = tl.load(blocks_ptr + sparse_block_idx)
        start_n = real_block_idx * BLOCK_N
        cols = start_n + offs_n
        # -- load k, v --
        k = tl.load(k_ptrs + cols[None, :] * stride_kn)
        k = k.to(dtype)
        # -- compute qk --
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        # if start_n + BLOCK_N < seqlen:
        #     qk = tl.where(m_mask, qk, float("-inf"))
        # else:
        causal_mask = cols[None, :] <= offs_m[:, None]
        qk = tl.where(m_mask & causal_mask, qk, float("-inf"))
        qk += tl.dot(q, k)
        # -- compute scaling constant --
        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
        alpha = tl.math.exp2(m_i - m_i_new)
        p = tl.math.exp2(qk - m_i_new[:, None])
        # -- scale and update acc --
        # acc_scale = l_i * 0 + alpha  # workaround some compiler bug
        # acc *= acc_scale[:, None]
        store_cols = (sparse_block_idx * BLOCK_N) + offs_n
        tl.static_print(f"P shape: {p.shape[0]} {p.shape[1]}")
        tl.static_print(f"Store cols shape: {store_cols.shape[0]}")
        tl.static_print(f"Regular cols shape: {cols.shape[0]}")
        tl.store(o_ptrs + store_cols[None, :], p.to(dtype), mask=m_mask)
        # acc += tl.dot(p.to(dtype), v)
        # -- update m_i and l_i --
        l_i = l_i * alpha + tl.sum(p, 1)
        m_i = m_i_new

    # write back O
    # acc /= l_i[:, None]
    # tl.store(o_ptrs, acc.to(dtype), mask=m_mask)


def _triton_block_sparse_attention_scores(
    q,                 # [BATCH, N_HEADS, N_CTX, D_HEAD]
    k,                 # [BATCH, N_HEADS, N_CTX, D_HEAD]
    seqlens,           # [BATCH, ]
    block_index,       # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), MAX_BLOCKS_PRE_ROW]
    sm_scale,
    block_size_M=64,
    block_size_N=64,
) -> torch.Tensor:
    # shape constraints
    Lq, Lk = q.shape[-1], k.shape[-1]
    assert Lq == Lk
    assert Lk in {16, 32, 64, 128}
    print("Q shape: ", q.shape , " K shape: ", k.shape, " Seqlens shape: ", seqlens.shape, " Block index: ", block_index.shape)

    o = torch.zeros((q.shape[0], q.shape[1], q.shape[2], block_index.shape[-1] * block_size_N)).to(q.device)

    print("Output shape: ", o.shape)

    grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
    dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
    _triton_block_sparse_attn_fwd_kernel[grid](
        q, k, seqlens, sm_scale,
        block_index,
        o,
        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
        q.shape[0], q.shape[1], q.shape[2],
        block_index.shape[-2], block_index.shape[-1],
        BLOCK_M=block_size_M, BLOCK_N=block_size_N,
        BLOCK_DMODEL=Lk,
        dtype=dtype,
        num_warps=4, num_stages=2,
    )

    return o

In [5]:
from typing import Optional
import einops
from minference.ops.block_sparse_flash_attention import _triton_block_sparse_attention

def _build_block_index(
    query: torch.Tensor,     # [BATCH, N_HEADS, N_CTX, D_HEAD]
    key: torch.Tensor,       # [BATCH, N_HEADS, N_CTX, D_HEAD]
    top_k: int,
    block_size_M: int = 64,
    block_size_N: int = 64,
):
    batch_size, num_heads, context_size, head_dim = query.shape
    print("Query shape in _build_block_index: ", query.shape, " Key shape in _build_block_index: ", key.shape)
    query_pool = query.reshape((batch_size, num_heads, -1, block_size_M, head_dim)).mean(dim=-2)
    key_pool = key.reshape((batch_size, num_heads, -1, block_size_N, head_dim)).mean(dim=-2)

    print("After taking the mean: query shape: ", query_pool.shape, " key shape: ", key_pool.shape)
    arange_M = torch.arange(query_pool.shape[-2], dtype=torch.int32, device=query.device) * block_size_M
    arange_N = torch.arange(key_pool.shape[-2], dtype=torch.int32, device=key.device) * block_size_N
    p_pool = torch.einsum(f'bhmk, bhnk -> bhmn', query_pool, key_pool)
    p_pool = p_pool.where(arange_M[None, None, :, None] >= arange_N[None, None, None, :], -torch.inf)
    top_k = min(top_k, context_size // block_size_N)
    print("Top k: ", top_k)
    return torch.topk(p_pool, top_k, dim=-1).indices.to(torch.int32).sort(dim=-1).values

def _build_block_index_subselect(
    query: torch.Tensor,     # [BATCH, N_HEADS, N_CTX, D_HEAD]
    key: torch.Tensor,       # [BATCH, N_HEADS, N_CTX, D_HEAD]
    top_k: int,
    block_size_M: int = 32,
    block_size_N: int = 32,
    subselect_scale: int = 8
):
    print(f"Subselect input shapes: Query shape: {query.shape} Key shape: {key.shape} Subselect scale: {subselect_scale}")
    batch_size, num_heads, context_size, head_dim = query.shape
    query_pool = query.reshape((batch_size, num_heads, -1, block_size_M, head_dim)).mean(dim=-2)
    key_pool = key.reshape((batch_size, num_heads, -1, block_size_N, head_dim)).mean(dim=-2)
    
    subselect_block_index = _build_block_index(query_pool, key_pool, top_k * 2, block_size_M=subselect_scale, block_size_N=subselect_scale) # [BATCH, N_HEADS, N_CTX // BLOCK_SIZE_M, TOP_K]
    seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
    
    print(f"Block sparse attention scores measurement: ", query_pool.shape, key_pool.shape)
    p_pool = _triton_block_sparse_attention_scores(
        query_pool, key_pool, seqlens, subselect_block_index, subselect_scale, subselect_scale
    )
    return p_pool


def block_sparse_attention(
    query: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
    key: torch.Tensor,    # [BATCH, N_HEADS, N_CTX, D_HEAD]
    value: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
    top_k: int,
    block_size_M: int = 32,
    block_size_N: int = 32,
    subselect_scale: int = 8,
):
    print(f"Params: block_m {block_size_M} block_n {block_size_N} subselect_scale {subselect_scale}")
    batch_size, num_heads, context_size, head_dim = query.shape
    seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
    sm_scale = head_dim ** -0.5
    block_index = _build_block_index_subselect(query, key, top_k, block_size_N, block_size_N, subselect_scale)
    out = _triton_block_sparse_attention(query, key, value, seqlens, block_index, sm_scale, block_size_M, block_size_N)
    return out[..., :context_size, :]



In [6]:
def sdpa_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # print("Running custom forward function")
    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if attention_mask is not None:
        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
            )

    # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
    # Reference: https://github.com/pytorch/pytorch/issues/112577.
    if query_states.device.type == "cuda" and attention_mask is not None:
        query_states = query_states.contiguous()
        key_states = key_states.contiguous()
        value_states = value_states.contiguous()

    # generate weights for which tokens are important or not

    # Apply padding to q and k
    max_block_size = 16 * 16 # apparently this is the only way that it's allowed. 
    k_len = key_states.shape[-2]
    pad_len_q = 0 if (q_len % max_block_size) == 0 else (max_block_size - (q_len % max_block_size))
    pad_len_k = 0 if (k_len % max_block_size) == 0 else (max_block_size - (k_len % max_block_size))
    if pad_len_q > 0:
        bq, hq, tq, dq = query_states.shape
        pad_matrix = torch.zeros((bq, hq, pad_len_q, dq), device=query_states.device)
        query_states = torch.cat((query_states, pad_matrix), dim=-2)  # Padding on the last dimension
    if pad_len_k > 0:
        bk, hk, tk, dk = key_states.shape
        pad_matrix = torch.zeros((bk, hk, pad_len_k, dk), device=query_states.device)
        key_states = torch.cat((key_states, pad_matrix), dim=-2)  # Padding on the last dimension
        value_states = torch.cat((value_states, pad_matrix), dim=-2)
    print("Padding length: ", pad_len_q, pad_len_k, "Q states: ", query_states.shape, " Key states: ", key_states.shape)

    # attn_output = torch.nn.functional.scaled_dot_product_attention(
    #     q_blocks,
    #     k_blocks,
    #     v_blocks,
    #     attn_mask=attention_mask,
    #     dropout_p=self.attention_dropout if self.training else 0.0,
    #     # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
    #     is_causal=self.is_causal and attention_mask is None and q_len > 1,
    # )

    # TODO: rejoin the output to fit the desired size.
    output = torch.empty_like(query_states)
    for head in range(query_states.size(1)):
        q = query_states[:, head, :, :].unsqueeze(1)
        k = key_states[:, head, :, :].unsqueeze(1)
        v = value_states[:, head, :, :].unsqueeze(1)
        # if self.layer_idx >= self.starting_layer and not self.is_search:
        attn_output = block_sparse_attention(q, k, v, head, block_size_M=16, block_size_N=16, subselect_scale=16) 
        # else:
        #     attn_output = torch.nn.functional.scaled_dot_product_attention(
        #         q,
        #         k,
        #         v,
        #         attn_mask=attention_mask,
        #         is_causal=self.is_causal
        #     )
        output[:, head:head + 1] = attn_output

    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) # q_len is padded to be of the same length for everything

    attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value

for i in range(len(model.model.layers)):
    model.model.layers[i].self_attn.forward = sdpa_forward.__get__(model.model.layers[i].self_attn, type(model.model.layers[i].self_attn))

In [7]:
sample_text = open("snapkv_full.txt", "r", encoding="utf-8").read()
encoded_tokens = tokenizer(sample_text, return_tensors="pt")
for key in encoded_tokens:
    encoded_tokens[key] = encoded_tokens[key].cuda()

# find a way to start by chunking this portion

# there shouuld be a mix between the global attention and the attention that only the last few tokens (query) pay

past_key_values = None
outputs = model.generate(**encoded_tokens, output_attentions=True, return_dict_in_generate=True, max_new_tokens=1)

Padding length:  79 79 Q states:  torch.Size([1, 32, 17664, 128])  Key states:  torch.Size([1, 32, 17664, 128])
Params: block_m 16 block_n 16 subselect_scale 16
Subselect input shapes: Query shape: torch.Size([1, 1, 17664, 128]) Key shape: torch.Size([1, 1, 17664, 128]) Subselect scale: 16
Query shape in _build_block_index:  torch.Size([1, 1, 1104, 128])  Key shape in _build_block_index:  torch.Size([1, 1, 1104, 128])
After taking the mean: query shape:  torch.Size([1, 1, 69, 128])  key shape:  torch.Size([1, 1, 69, 128])
Top k:  0
Block sparse attention scores measurement:  torch.Size([1, 1, 1104, 128]) torch.Size([1, 1, 1104, 128])
Q shape:  torch.Size([1, 1, 1104, 128])  K shape:  torch.Size([1, 1, 1104, 128])  Seqlens shape:  torch.Size([1])  Block index:  torch.Size([1, 1, 69, 0])
Output shape:  torch.Size([1, 1, 1104, 0])
O_ptrs 16 128
Offs_n 64 64
P shape: 16 64
Store cols shape: 64
Regular cols shape: 64


CompilationError: at 82:17:
        # -- compute scaling constant --
        m_i_new = tl.maximum(m_i, tl.max(qk, 1))
        alpha = tl.math.exp2(m_i - m_i_new)
        p = tl.math.exp2(qk - m_i_new[:, None])
        # -- scale and update acc --
        # acc_scale = l_i * 0 + alpha  # workaround some compiler bug
        # acc *= acc_scale[:, None]
        store_cols = (sparse_block_idx * BLOCK_N) + offs_n
        tl.static_print(f"P shape: {p.shape[0]} {p.shape[1]}")
        tl.static_print(f"Store cols shape: {store_cols.shape[0]}")
        tl.static_print(f"Regular cols shape: {cols.shape[0]}")
        tl.store(o_ptrs + store_cols[None, :], p.to(dtype), mask=m_mask)
                 ^
ValueError('Cannot make_shape_compatible: incompatible dimensions at index 1: 128 and 64')