In [None]:
import os
import sys
sys.path.append("../train-vanilla-transformer/")
import torch
from transformer import Transformer
import matplotlib.pyplot as plt
from tqdm import tqdm

# Initialize model parameters
vocab_size = 256  # Using a small vocab size for testing
hidden_dim = 512
n_layers = 6
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create model
model = Transformer(depth=n_layers, hidden_dim=hidden_dim, vocab_size=vocab_size, device=device)
model = model.to(device)
model.eval()  # Set to evaluation mode

# Generate sample input
batch_size = 1
seq_len = 512
x = torch.randint(0, vocab_size, (batch_size, seq_len)).to(device)

# Run inference
with torch.no_grad():
    out = model(x)


ModuleNotFoundError: No module named 'transformer'

In [None]:
import os
import sys
import time
import torch
from transformer import Transformer
from generation import TransformerGenerator # Import the KV cache generator
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

# Initialize model parameters
vocab_size = 256  # Using a small vocab size for testing
hidden_dim = 512
n_layers = 6
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create model
model = Transformer(depth=n_layers, hidden_dim=hidden_dim, vocab_size=vocab_size, device=device)
model = model.to(device)
model.eval()  # Set to evaluation mode

# Create KV cache generator instance
generator = TransformerGenerator(model)

# Profile decoding across different numbers of tokens to generate
input_seq_len = 512  # Fixed input sequence length
num_decode_tokens = [512, 2048, 4096, 8192]  # Number of tokens to decode
decode_times_no_kv = []
decode_times_kv = []
n_runs = 1

with torch.no_grad():
    for n_tokens in tqdm(num_decode_tokens):
        batch_size = 1
        times_for_this_n_no_kv = []
        times_for_this_n_kv = []

        for _ in range(n_runs):
            # Create input sequence (prompt)
            prompt_tensor = torch.randint(0, vocab_size, (batch_size, input_seq_len)).to(device)
            prompt_list = prompt_tensor.squeeze(0).tolist() # For KV cache generator

            # --- Time the decoding WITHOUT KV Cache ---
            x = prompt_tensor.clone() # Use a copy for the non-KV version
            start_time_no_kv = time.time()
            # Autoregressive decoding (no KV cache)
            for _ in range(n_tokens):
                logits = model(x)  # Get next token logits
                # logits is [B, S, V] from model.unemb, logits[:, -1:] is [B, 1, V]
                next_token = torch.argmax(logits[:, -1:, :], dim=-1)  # [B, 1]
                x = torch.cat([x, next_token], dim=1)  # Append to sequence
            torch.cuda.synchronize()  # Wait for CUDA operations to complete
            end_time_no_kv = time.time()
            times_for_this_n_no_kv.append((end_time_no_kv - start_time_no_kv) * 1000)  # Convert to ms

            # --- Time the decoding WITH KV Cache ---
            start_time_kv = time.time()
            # Autoregressive decoding (with KV cache) using TransformerGenerator
            _ = generator.generate(prompt_list, n_tokens)
            torch.cuda.synchronize() # Wait for CUDA operations to complete
            end_time_kv = time.time()
            times_for_this_n_kv.append((end_time_kv - start_time_kv) * 1000) # Convert to ms

        # Take average across runs
        avg_time_no_kv = np.mean(times_for_this_n_no_kv)
        decode_times_no_kv.append(avg_time_no_kv)
        avg_time_kv = np.mean(times_for_this_n_kv)
        decode_times_kv.append(avg_time_kv)


# Plot results
plt.figure(figsize=(10, 6))
plt.plot(num_decode_tokens, decode_times_no_kv, marker='o', label='No KV Cache')
plt.plot(num_decode_tokens, decode_times_kv, marker='s', label='With KV Cache')
plt.xlabel('Number of Decoded Tokens')
plt.ylabel('Total Time (ms)')
plt.title(f'Decoding Latency vs Number of Generated Tokens\n(Input Length: {input_seq_len}, Averaged over {n_runs} runs)')
plt.grid(True)
plt.legend()
plt.yscale('log')
plt.xscale('log')
plt.show()

# Print average time per token
print(f"\nAverage time per decoded token (averaged over {n_runs} runs):")
print("----------------------------------------------------")
print("Decoding Method | Tokens Decoded | Avg ms/token")
print("----------------------------------------------------")
for n_tokens, total_time_no_kv, total_time_kv in zip(num_decode_tokens, decode_times_no_kv, decode_times_kv):
    avg_per_token_no_kv = total_time_no_kv / n_tokens if n_tokens > 0 else 0
    avg_per_token_kv = total_time_kv / n_tokens if n_tokens > 0 else 0
    print(f"No KV Cache     | {n_tokens:14d} | {avg_per_token_no_kv:12.2f}")
    print(f"With KV Cache   | {n_tokens:14d} | {avg_per_token_kv:12.2f}")
    print("----------------------------------------------------")


ModuleNotFoundError: No module named 'transformer'

In [2]:
import os
import sys
import time
try:
    os.chdir("/n/home11/tanishqkumar/gravity-chamber/fundamentals/llm")
except FileNotFoundError:
    print("Warning: Could not change directory. Make sure the path is correct.")

import torch
from transformer import Transformer
from generation import TransformerGenerator
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

vocab_size = 256
hidden_dim = 512
n_layers = 6
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model with layer_idx for attention layers
model = Transformer(depth=n_layers, hidden_dim=hidden_dim, vocab_size=vocab_size, device=device) # , gqa=True
model = model.to(device)
model.eval()

# Initialize generator with model
generator = TransformerGenerator(model)

input_seq_len = 512
num_decode_tokens = [256]
n_runs = 20

with torch.no_grad():
    for n_tokens in tqdm(num_decode_tokens):
        batch_size = 1
        times_for_this_n_kv = []

        for _ in range(n_runs):
            # Generate random prompt
            prompt_tensor = torch.randint(0, vocab_size, (batch_size, input_seq_len)).to(device)
            prompt_list = prompt_tensor.squeeze(0).tolist()

            # Time the generation
            start_time_kv = time.time()
            generated = generator.generate(prompt_list, n_tokens)
            torch.cuda.synchronize()
            end_time_kv = time.time()
            times_for_this_n_kv.append((end_time_kv - start_time_kv) * 1000)


        # Calculate and print metrics
        avg_time_kv = np.mean(times_for_this_n_kv)
        tokens_per_sec = n_tokens / (avg_time_kv / 1000)
        print(f"Tokens per second with KV cache: {tokens_per_sec:.2f}")


  0%|                                                                                                                                                  | 0/1 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.53s/it]

Tokens per second with KV cache: 378.62



