In [1]:
from torch import nn
import torch

class MockTransformerLayer(nn.Module):
    """Stand-in for a transformer block: a single 1024 -> 1024 linear projection."""

    def __init__(self):
        super().__init__()
        # The submodule stays registered under the name `fc` so parameter names are unchanged.
        self.fc = nn.Linear(in_features=1024, out_features=1024)

    def forward(self, x):
        """Apply the linear projection to `x` and return the result."""
        projected = self.fc(x)
        return projected

class MockTransformer(nn.Module):
    """A stack of `n_layers` MockTransformerLayer modules applied one after another."""

    def __init__(self, n_layers):
        super().__init__()
        # Layers are created in order, so parameter initialisation consumes the RNG
        # in the same sequence as building them from a list would.
        self.layers = nn.ModuleList()
        for _ in range(n_layers):
            self.layers.append(MockTransformerLayer())

    def forward(self, x):
        """Feed `x` through every layer in order and return the final output."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

In [None]:
import contextlib

from torch.cuda import Stream


# Fail fast: the stream, the pinned host allocation and the device tensor below all
# need a working CUDA runtime, so check for it before touching any of them.
assert torch.cuda.is_available()
device = torch.device("cuda", torch.cuda.current_device())

# Side stream that `inner` uses when the host-to-device copy should not be enqueued
# on the current stream.
s = Stream()

# Number of float32 elements in each benchmark tensor.
NUM_ELEMENTS = 1024**2 * 5

torch.manual_seed(42)
t1_cpu_pinned = torch.randn(NUM_ELEMENTS, pin_memory=True)
t2_cpu_paged = torch.randn(NUM_ELEMENTS, pin_memory=False)
# Allocate on `device` (the current device) so the compute tensor lives on the same
# GPU as the stream `s` and as the destination of the copies issued by `inner`.
t3_cuda = torch.randn(NUM_ELEMENTS, device=device)


# The function we want to profile
def inner(pinned: bool, streamed: bool):
    """Issue one host-to-device copy and one independent GPU computation, then wait for both.

    Args:
        pinned: copy from the page-locked tensor `t1_cpu_pinned` when True, otherwise
            from the pageable tensor `t2_cpu_paged`.
        streamed: enqueue the copy on the side stream `s` when True, otherwise on the
            current stream.

    Returns:
        None. The function is called only for the GPU work it enqueues.
    """
    source = t1_cpu_pinned if pinned else t2_cpu_paged
    with torch.cuda.stream(s) if streamed else contextlib.nullcontext():
        # Held in a local so the destination tensor stays referenced until the
        # synchronisation at the end of the function.
        copied = source.to(device, non_blocking=True)
        # Record on the stream the copy was actually enqueued on: inside the
        # `torch.cuda.stream(s)` context that is `s`, otherwise it is the current
        # stream. Recording unconditionally on `s` would produce an event that
        # does not track the copy when `streamed` is False.
        copy_done = torch.cuda.current_stream().record_event()
    # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
    #  done in the other stream
    cubed = t3_cuda * t3_cuda * t3_cuda
    compute_done = torch.cuda.current_stream().record_event()
    # Block the host until both the copy and the multiplication have finished.
    copy_done.synchronize()
    compute_done.synchronize()


# Our profiler: profiles the `inner` function and stores the results in a .json file
def benchmark_with_profiler(
    pinned,
    streamed,
) -> None:
    """Profile repeated calls to `inner` and export the result as a Chrome trace.

    Args:
        pinned: forwarded to `inner(pinned=...)`.
        streamed: forwarded to `inner(streamed=...)`.

    The trace is written to ``trace_streamed{0|1}_pinned{0|1}.json`` in the current
    working directory.
    """
    # Private PyTorch hook; presumably enables recording of CUDA synchronisation
    # events in the trace - TODO confirm against the installed PyTorch version.
    torch._C._profiler._set_cuda_sync_enabled_val(True)
    skip_first, wait, warmup, active = 1, 1, 1, 2
    # The schedule consumes skip_first + wait + warmup + active steps before its
    # recording window is complete. Leaving `skip_first` out of the loop count ends
    # the loop one step early, so the last active step would record nothing.
    num_steps = skip_first + wait + warmup + active
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(
            wait=wait, warmup=warmup, active=active, repeat=1, skip_first=skip_first
        ),
        with_stack=True,
        record_shapes=True,
        profile_memory=True,
    ) as prof:
        for _ in range(num_steps):
            inner(streamed=streamed, pinned=pinned)
            # Advance the profiler schedule after every profiled call.
            prof.step()
    prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json")

In [3]:
def benchmark_with_profiler(fn) -> None:
    torch._C._profiler._set_cuda_sync_enabled_val(True)
    wait, warmup, active = 1, 1, 2
    num_steps = wait + warmup + active
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        schedule=torch.profiler.schedule(
            wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1
        ),
    ) as prof:
        for step_idx in range(1, num_steps + 1):
            fn()
            prof.step()
    prof.export_chrome_trace(f"trace.json")

In [None]:
import torch
from torch import nn
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple
from k_means_constrained import KMeansConstrained

# Single-worker thread pool. MockTransformerLayer.forward submits `cluster` to it
# before running its linear layers and collects the result afterwards.
executor = ThreadPoolExecutor(max_workers=1)

def reorder_random(act: np.ndarray, slice_size: int = 32) -> np.ndarray:
    """Return a random permutation of the row indices of `act`.

    Args:
        act: array whose first dimension gives the number of indices to permute.
        slice_size: accepted for signature compatibility; not used.

    Returns:
        A permutation of ``0 .. act.shape[0] - 1`` drawn from NumPy's global RNG.
    """
    return np.random.permutation(act.shape[0])

def reorder_smat(act: torch.Tensor, slice_size: int, threshold: float = 0.7) -> torch.Tensor:
    """Build a row permutation that lists the rows of each `_smat` cluster contiguously.

    Args:
        act: tensor whose rows are clustered; forwarded to `_smat`.
        slice_size: forwarded to `_smat`.
        threshold: distance threshold forwarded to `_smat`.

    Returns:
        An int64 tensor of length ``act.shape[0]`` holding row indices, cluster by
        cluster, in the order `_smat` returned them.
    """
    groups = _smat(act, slice_size, threshold)
    ordered_rows = [row_idx for group in groups for row_idx, _ in group]

    perm = torch.zeros(act.shape[0], dtype=torch.int64)
    # Fill from the front, exactly as an element-by-element write would.
    perm[: len(ordered_rows)] = torch.tensor(ordered_rows, dtype=torch.int64)
    return perm

def _smat(act: torch.Tensor, slice_size: int, threshold: float) -> List[List[Tuple[int, torch.Tensor]]]:
    V = list(enumerate(act))
    C = []
    
    def dist(v1: torch.Tensor, v2: torch.Tensor) -> float:
        intersection = torch.sum(v1 & v2).item()
        union = torch.sum(v1 | v2).item()
        epsilon = 1e-8
        return 1.0 - (intersection + epsilon) / (union + epsilon)
    
    while V:
        v = V.pop(0)
        c = [v]
        pc = v[1]
        
        for w in V[:]:
            if dist(w[1], pc) <= threshold:
                V.remove(w)
                c.append(w)
                pc = pc | w[1]
        C.append(c)
    
    return C

# Dedicated CUDA stream on which `cluster` issues its device<->host copies.
stream = torch.cuda.Stream()

def cluster(device_tensor):
    """Copy `device_tensor` to the host, compute an SMaT row permutation, and return it on the GPU.

    Both transfers are enqueued on the module-level `stream` and the host waits on an
    event after each one.

    NOTE(review): nothing here orders `stream` after the stream that produced
    `device_tensor` - confirm the producer has finished before this is called.

    Args:
        device_tensor: tensor to cluster row-wise.

    Returns:
        The permutation from `reorder_smat`, copied to a tensor on 'cuda'.
    """
    # Device -> host: stage the values in a pinned buffer of the same shape and dtype.
    pinned_host = torch.empty_like(device_tensor, device='cpu', pin_memory=True)
    with torch.cuda.stream(stream):
        pinned_host.copy_(device_tensor, non_blocking=True)
        d2h_done = stream.record_event()
    d2h_done.synchronize()

    # Threshold the values into a 0/1 int8 mask and cluster its rows.
    # (`reorder_random` on `pinned_host.numpy()` is the alternative tried previously.)
    active_mask = (pinned_host > 0.01).to(torch.int8)
    perm_cpu = reorder_smat(active_mask, 32)

    # Host -> device: ship the permutation back on the same side stream.
    perm_gpu = torch.empty_like(perm_cpu, device='cuda')
    with torch.cuda.stream(stream):
        perm_gpu.copy_(perm_cpu, non_blocking=True)
        h2d_done = stream.record_event()
    h2d_done.synchronize()
    return perm_gpu

class MockTransformerLayer(torch.nn.Module):
    """Two bias-free float16 linear layers (4096 -> 14336 -> 4096).

    The forward pass also submits `cluster` for the input to the module-level
    `executor` and waits for it before returning.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=4096, out_features=14336, bias=False, dtype=torch.float16)
        self.fc2 = nn.Linear(in_features=14336, out_features=4096, bias=False, dtype=torch.float16)

    def forward(self, x):
        """Run fc1 then fc2 on `x` while `cluster(x)` executes on the worker thread."""
        # Start clustering first so it can proceed while the linear layers run.
        pending_perm = executor.submit(cluster, x)
        hidden = self.fc1(x)
        out = self.fc2(hidden)
        # Wait for the worker; the permutation is computed but not applied to `out`.
        perm = pending_perm.result()
        return out

model = MockTransformerLayer().cuda()
x = torch.randn(512, 4096, dtype=torch.float16, device='cuda')

model.eval()


# `eval()` only switches layer behaviour; it does not turn autograd off. Without
# `no_grad`, every profiled forward pass would also record autograd state for the
# layer weights, which is not part of what this benchmark is meant to measure.
@torch.no_grad()
def fn():
    """One profiled step: a forward pass followed by a full device synchronisation."""
    model(x)
    # Wait for all queued GPU work so the step boundary covers the whole forward pass.
    torch.cuda.synchronize()


benchmark_with_profiler(fn)


In [8]:
import numpy as np
from k_means_constrained import KMeansConstrained
from torch.utils.benchmark import Timer

# Benchmark input: 512 samples x 4096 features drawn from NumPy's global RNG.
arr = np.random.rand(512, 4096)
# Boolean variant, currently disabled:
# arr = arr > 0.5

# 16 clusters, each constrained to exactly 32 samples (16 * 32 == 512 rows).
clf = KMeansConstrained(n_clusters=16, size_min=32, size_max=32, n_jobs=-1)

# The statement is evaluated with `clf` and `arr` injected as its globals.
timer = Timer(stmt='clf.fit_predict(arr)', globals=dict(clf=clf, arr=arr))

# Time a single fit.
result = timer.timeit(number=1)

# Report the measured time.
print(f"Time taken for clf.fit_predict(arr): {result.mean:.6f} seconds")

Time taken for clf.fit_predict(arr): 1.752438 seconds


In [None]:
from transformers import LlamaForCausalLM
