<a href="https://colab.research.google.com/github/usp787/CS5800_Final_Project_KV_Cache/blob/Code/kv_cache_code_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil
import os
from typing import List, Dict, Tuple

In [21]:
# Set style for better-looking plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [22]:
# CONFIGURATION - Easy to Edit

CONFIG = {
    # Experiment parameters
    'output_lengths': [10, 50, 100, 250, 500],  # Token lengths to test
    'trials_per_length': 3,                      # Number of trials per length
    'initial_prompt': "The future of artificial intelligence",  # Starting prompt

    # Generation parameters
    'temperature': 0.7,
    'top_k': 50,
    'do_sample': True,

    # Model
    'model_name': 'distilgpt2'
}

In [23]:
# ### 3. Load Model and Tokenizer

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
model = AutoModelForCausalLM.from_pretrained(CONFIG['model_name'])

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on: {device}")
print(f"Model parameters: {model.num_parameters():,}")

Loading model and tokenizer...
Model loaded on: cuda
Model parameters: 81,912,576


In [24]:
# ### 4. Memory Tracking Utilities

# In[4]:


def get_memory_usage() -> Dict[str, float]:
    """Get current memory usage in MB"""
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()

    memory_data = {
        'ram_mb': memory_info.rss / 1024 / 1024,  # Resident Set Size
    }

    # GPU memory if available
    if torch.cuda.is_available():
        memory_data['gpu_allocated_mb'] = torch.cuda.memory_allocated() / 1024 / 1024
        memory_data['gpu_reserved_mb'] = torch.cuda.memory_reserved() / 1024 / 1024

    return memory_data

def get_model_size() -> float:
    """Calculate model size in MB"""
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    return (param_size + buffer_size) / 1024 / 1024

In [25]:
# ### 5. Token Generation WITHOUT KV Cache

# In[5]:


def generate_without_cache(
    prompt: str,
    max_new_tokens: int,
    measure_memory: bool = True
) -> Dict:
    """
    Generate tokens WITHOUT KV cache and measure performance

    Returns:
        Dictionary with timing and memory metrics
    """
    # Encode prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    prompt_length = input_ids.shape[1]

    # Get initial memory
    if measure_memory:
        initial_memory = get_memory_usage()
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    # Generate WITHOUT cache
    model.config.use_cache = False  # Disable KV cache

    start_time = time.perf_counter()

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            use_cache=False,  # CRITICAL: No KV cache
            do_sample=CONFIG['do_sample'],
            temperature=CONFIG['temperature'],
            top_k=CONFIG['top_k'],
            pad_token_id=tokenizer.pad_token_id
        )

    end_time = time.perf_counter()
    generation_time = end_time - start_time

    # Get final memory
    if measure_memory:
        final_memory = get_memory_usage()
        memory_used = {
            key: final_memory.get(key, 0) - initial_memory.get(key, 0)
            for key in initial_memory.keys()
        }
    else:
        memory_used = {}

    # Calculate tokens generated
    total_tokens = output.shape[1]
    tokens_generated = total_tokens - prompt_length

    # Decode output (optional - for verification)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return {
        'generation_time': generation_time,
        'tokens_generated': tokens_generated,
        'time_per_token': generation_time / tokens_generated if tokens_generated > 0 else 0,
        'prompt_length': prompt_length,
        'total_length': total_tokens,
        'memory_used': memory_used,
        'generated_text': generated_text
    }


In [26]:
# ### 6. Run Experiments

# In[6]:


print("=" * 80)
print("STARTING EXPERIMENTS - WITHOUT KV CACHE")
print("=" * 80)
print(f"\nInitial Prompt: '{CONFIG['initial_prompt']}'")
print(f"Output Lengths: {CONFIG['output_lengths']}")
print(f"Trials per Length: {CONFIG['trials_per_length']}")
print(f"Model Size: {get_model_size():.2f} MB")
print("\n")

# Store all results
all_results = []

# Run experiments for each output length
for output_length in CONFIG['output_lengths']:
    print(f"\n--- Testing Output Length: {output_length} tokens ---")

    for trial in range(CONFIG['trials_per_length']):
        print(f"  Trial {trial + 1}/{CONFIG['trials_per_length']}...", end=" ")

        # Run generation
        result = generate_without_cache(
            CONFIG['initial_prompt'],
            output_length,
            measure_memory=True
        )

        # Store results
        result_record = {
            'output_length': output_length,
            'trial': trial + 1,
            'generation_time_ms': result['generation_time'] * 1000,
            'time_per_token_ms': result['time_per_token'] * 1000,
            'tokens_generated': result['tokens_generated'],
            'prompt_length': result['prompt_length'],
            'ram_used_mb': result['memory_used'].get('ram_mb', 0),
        }

        # Add GPU memory if available
        if 'gpu_allocated_mb' in result['memory_used']:
            result_record['gpu_allocated_mb'] = result['memory_used']['gpu_allocated_mb']
            result_record['gpu_reserved_mb'] = result['memory_used']['gpu_reserved_mb']

        all_results.append(result_record)

        print(f"✓ {result['generation_time']*1000:.2f}ms ({result['time_per_token']*1000:.2f}ms/token)")

print("\n" + "=" * 80)
print("EXPERIMENTS COMPLETE")
print("=" * 80)


STARTING EXPERIMENTS - WITHOUT KV CACHE

Initial Prompt: 'The future of artificial intelligence'
Output Lengths: [10, 50, 100, 250, 500]
Trials per Length: 3
Model Size: 318.47 MB



--- Testing Output Length: 10 tokens ---
  Trial 1/3... ✓ 135.58ms (13.56ms/token)
  Trial 2/3... ✓ 155.91ms (15.59ms/token)
  Trial 3/3... ✓ 175.58ms (17.56ms/token)

--- Testing Output Length: 50 tokens ---
  Trial 1/3... ✓ 861.40ms (17.23ms/token)
  Trial 2/3... ✓ 355.00ms (7.10ms/token)
  Trial 3/3... ✓ 282.32ms (5.65ms/token)

--- Testing Output Length: 100 tokens ---
  Trial 1/3... ✓ 132.74ms (5.53ms/token)
  Trial 2/3... ✓ 581.77ms (5.82ms/token)
  Trial 3/3... ✓ 565.56ms (5.66ms/token)

--- Testing Output Length: 250 tokens ---
  Trial 1/3... ✓ 1607.90ms (6.43ms/token)
  Trial 2/3... ✓ 1633.66ms (6.53ms/token)
  Trial 3/3... ✓ 180.16ms (6.01ms/token)

--- Testing Output Length: 500 tokens ---
  Trial 1/3... ✓ 4791.14ms (9.58ms/token)
  Trial 2/3... ✓ 5165.88ms (10.33ms/token)
  Trial 3/3... ✓ 150.2