<a href="https://colab.research.google.com/github/usp787/CS5800_Final_Project_KV_Cache/blob/Code/kv_cache_code_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil
import os
from typing import List, Dict, Tuple

In [3]:
# Set style for better-looking plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [4]:
# CONFIGURATION - Easy to Edit

CONFIG = {
    # Experiment parameters
    'output_lengths': [10, 50, 100, 250, 500],  # Token lengths to test
    'trials_per_length': 3,                      # Number of trials per length
    'initial_prompt': "The future of artificial intelligence",  # Starting prompt

    # Generation parameters
    'temperature': 0.7,
    'top_k': 50,
    'do_sample': True,

    # Model
    'model_name': 'distilgpt2'
}

In [5]:
# SETUP - Load Model and Tokenizer

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
model = AutoModelForCausalLM.from_pretrained(CONFIG['model_name'])

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on: {device}")
print(f"Model parameters: {model.num_parameters():,}")

Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded on: cuda
Model parameters: 81,912,576


In [6]:
# MEMORY TRACKING UTILITIES

def get_memory_usage() -> Dict[str, float]:
    """Get current memory usage in MB"""
    process = psutil.Process(os.getpid())
    memory_info = process.memory_info()

    memory_data = {
        'ram_mb': memory_info.rss / 1024 / 1024,  # Resident Set Size
    }

    # GPU memory if available
    if torch.cuda.is_available():
        memory_data['gpu_allocated_mb'] = torch.cuda.memory_allocated() / 1024 / 1024
        memory_data['gpu_reserved_mb'] = torch.cuda.memory_reserved() / 1024 / 1024

    return memory_data

def get_model_size() -> float:
    """Calculate model size in MB"""
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
    return (param_size + buffer_size) / 1024 / 1024

In [7]:
# TOKEN GENERATION WITHOUT KV CACHE

def generate_without_cache(
    prompt: str,
    max_new_tokens: int,
    measure_memory: bool = True
) -> Dict:
    """
    Generate tokens WITHOUT KV cache and measure performance

    Returns:
        Dictionary with timing and memory metrics
    """
    # Encode prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    prompt_length = input_ids.shape[1]

    # Get initial memory
    if measure_memory:
        initial_memory = get_memory_usage()
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    # Generate WITHOUT cache
    model.config.use_cache = False  # Disable KV cache

    start_time = time.perf_counter()

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            use_cache=False,  # CRITICAL: No KV cache
            do_sample=CONFIG['do_sample'],
            temperature=CONFIG['temperature'],
            top_k=CONFIG['top_k'],
            pad_token_id=tokenizer.pad_token_id
        )

    end_time = time.perf_counter()
    generation_time = end_time - start_time

    # Get final memory
    if measure_memory:
        final_memory = get_memory_usage()
        memory_used = {
            key: final_memory.get(key, 0) - initial_memory.get(key, 0)
            for key in initial_memory.keys()
        }
    else:
        memory_used = {}

    # Calculate tokens generated
    total_tokens = output.shape[1]
    tokens_generated = total_tokens - prompt_length

    # Decode output (optional - for verification)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return {
        'generation_time': generation_time,
        'tokens_generated': tokens_generated,
        'time_per_token': generation_time / tokens_generated if tokens_generated > 0 else 0,
        'prompt_length': prompt_length,
        'total_length': total_tokens,
        'memory_used': memory_used,
        'generated_text': generated_text
    }

In [8]:
# RUN EXPERIMENTS

print("\n" + "="*80)
print("STARTING EXPERIMENTS - WITHOUT KV CACHE")
print("="*80)
print(f"\nInitial Prompt: '{CONFIG['initial_prompt']}'")
print(f"Output Lengths: {CONFIG['output_lengths']}")
print(f"Trials per Length: {CONFIG['trials_per_length']}")
print(f"Model Size: {get_model_size():.2f} MB")
print("\n")


STARTING EXPERIMENTS - WITHOUT KV CACHE

Initial Prompt: 'The future of artificial intelligence'
Output Lengths: [10, 50, 100, 250, 500]
Trials per Length: 3
Model Size: 318.47 MB




In [9]:
all_results = []

# Run experiments for each output length
for output_length in CONFIG['output_lengths']:
    print(f"\n--- Testing Output Length: {output_length} tokens ---")

    for trial in range(CONFIG['trials_per_length']):
        print(f"  Trial {trial + 1}/{CONFIG['trials_per_length']}...", end=" ")

        # Run generation
        result = generate_without_cache(
            CONFIG['initial_prompt'],
            output_length,
            measure_memory=True
        )

        # Store results
        result_record = {
            'output_length': output_length,
            'trial': trial + 1,
            'generation_time_ms': result['generation_time'] * 1000,
            'time_per_token_ms': result['time_per_token'] * 1000,
            'tokens_generated': result['tokens_generated'],
            'prompt_length': result['prompt_length'],
            'ram_used_mb': result['memory_used'].get('ram_mb', 0),
        }

        # Add GPU memory if available
        if 'gpu_allocated_mb' in result['memory_used']:
            result_record['gpu_allocated_mb'] = result['memory_used']['gpu_allocated_mb']
            result_record['gpu_reserved_mb'] = result['memory_used']['gpu_reserved_mb']

        all_results.append(result_record)

        print(f"✓ {result['generation_time']*1000:.2f}ms ({result['time_per_token']*1000:.2f}ms/token)")

print("\n" + "="*80)
print("EXPERIMENTS COMPLETE")
print("="*80)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Testing Output Length: 10 tokens ---
  Trial 1/3... ✓ 1166.77ms (116.68ms/token)
  Trial 2/3... ✓ 56.01ms (5.60ms/token)
  Trial 3/3... ✓ 58.52ms (5.85ms/token)

--- Testing Output Length: 50 tokens ---
  Trial 1/3... ✓ 306.55ms (6.13ms/token)
  Trial 2/3... ✓ 296.56ms (5.93ms/token)
  Trial 3/3... ✓ 289.70ms (5.79ms/token)

--- Testing Output Length: 100 tokens ---
  Trial 1/3... ✓ 573.22ms (5.73ms/token)
  Trial 2/3... ✓ 73.22ms (6.66ms/token)
  Trial 3/3... ✓ 575.61ms (5.76ms/token)

--- Testing Output Length: 250 tokens ---
  Trial 1/3... ✓ 986.93ms (5.95ms/token)
  Trial 2/3... ✓ 52.08ms (5.79ms/token)
  Trial 3/3... ✓ 55.95ms (5.59ms/token)

--- Testing Output Length: 500 tokens ---
  Trial 1/3... ✓ 4984.32ms (9.97ms/token)
  Trial 2/3... ✓ 4829.33ms (9.66ms/token)
  Trial 3/3... ✓ 1273.40ms (6.24ms/token)

EXPERIMENTS COMPLETE


In [10]:
# DATA ANALYSIS

# Convert to DataFrame
df = pd.DataFrame(all_results)

# Calculate summary statistics
summary_stats = df.groupby('output_length').agg({
    'generation_time_ms': ['mean', 'std', 'min', 'max'],
    'time_per_token_ms': ['mean', 'std'],
    'ram_used_mb': ['mean', 'max']
}).round(2)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(summary_stats)

# Save results to CSV
csv_filename = 'kv_cache_baseline_results.csv'
df.to_csv(csv_filename, index=False)
print(f"\n✓ Results saved to: {csv_filename}")


SUMMARY STATISTICS
              generation_time_ms                            time_per_token_ms  \
                            mean      std      min      max              mean   
output_length                                                                   
10                        427.10   640.57    56.01  1166.77             42.71   
50                        297.60     8.47   289.70   306.55              5.95   
100                       407.35   289.37    73.22   575.61              6.05   
250                       364.98   538.62    52.08   986.93              5.78   
500                      3695.68  2099.19  1273.40  4984.32              8.62   

                     ram_used_mb          
                 std        mean     max  
output_length                             
10             64.06      164.57  493.63  
50              0.17        0.23    0.52  
100             0.53        0.11    0.18  
250             0.18        0.05    0.14  
500             2.07        0.

In [None]:
# Token generation with KV_cache