<a href="https://colab.research.google.com/github/usp787/CS5800_Final_Project_KV_Cache/blob/Code/kv_cache_code_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil
import os
from typing import Dict, List
import gc

In [None]:
# Benchmark settings read by the experiment functions in this notebook
CONFIG = dict(
    # New-token budgets to benchmark, one experiment group per entry
    output_lengths=[10, 25, 50, 75, 100, 250, 500, 750, 1000],
    trials_per_length=5,  # timed runs per (output length, cache setting)
    warmup_runs=3,  # untimed runs executed before each configuration
    initial_prompt="The future of artificial intelligence",
    # Sampling options forwarded to model.generate
    temperature=0.7,
    top_k=50,
    do_sample=True,
    # Identifier handed to the from_pretrained loaders
    model_name='distilgpt2',
)

In [None]:
def setup_model_and_tokenizer():
    """Load the configured tokenizer and causal LM and place the model on a device.

    The model name comes from ``CONFIG['model_name']``. If the tokenizer has
    no pad token, its EOS token is reused as the pad token. The model is moved
    to CUDA when available (CPU otherwise) and switched to evaluation mode.

    Returns:
        Tuple ``(model, tokenizer, device)``.
    """
    print("Loading model and tokenizer...")
    model_name = CONFIG['model_name']
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    model.to(device)
    model.eval()  # evaluation mode for the benchmark

    print(f"Model loaded on: {device}")
    print(f"Model parameters: {model.num_parameters():,}")

    return model, tokenizer, device

In [None]:
def get_memory_usage() -> Dict[str, float]:
    """Report current CUDA memory counters, each divided by 1024 twice (MB).

    Returns:
        ``{'gpu_allocated_mb': ..., 'gpu_reserved_mb': ...}`` when CUDA is
        available, otherwise an empty dict.
    """
    if not torch.cuda.is_available():
        return {}

    # Block until queued GPU work has finished before reading the counters.
    torch.cuda.synchronize()
    allocated_mb = torch.cuda.memory_allocated() / 1024 / 1024
    reserved_mb = torch.cuda.memory_reserved() / 1024 / 1024
    return {
        'gpu_allocated_mb': allocated_mb,
        'gpu_reserved_mb': reserved_mb,
    }

In [None]:
def clear_memory():
    """Free unreachable Python objects, then release cached GPU memory.

    The garbage collector runs first: tensors that are only kept alive by
    reference cycles still occupy their GPU blocks until they are collected,
    and ``torch.cuda.empty_cache()`` can only release blocks that are no
    longer in use. Emptying the cache before collecting would leave that
    memory reserved.

    On machines without CUDA only the garbage collection step runs.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Wait for queued GPU work to finish before releasing cached blocks.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

In [None]:
def generate_tokens(
    model,
    tokenizer,
    device,
    prompt: str,
    max_new_tokens: int,
    use_cache: bool,
    measure_memory: bool = False
) -> Dict:
    """
    Generate a continuation of ``prompt`` and time it, with or without KV cache.

    Sampling options (``do_sample``, ``temperature``, ``top_k``) are read from
    the module-level ``CONFIG``. ``model.config.use_cache`` is set to
    ``use_cache`` for the duration of the call and restored afterwards, so the
    shared model object is left as it was found.

    Args:
        model: Object exposing ``generate(...)`` and a ``config`` attribute.
        tokenizer: Object exposing ``encode(...)`` and ``pad_token_id``.
        device: Device the encoded prompt is moved to.
        prompt: Text to continue.
        max_new_tokens: Value forwarded to ``model.generate`` as
            ``max_new_tokens``.
        use_cache: If True, use KV cache; if False, disable it.
        measure_memory: If True, clear memory first and report memory deltas
            relative to the state just before generation.

    Returns:
        Dict with:
            ``generation_time``: wall-clock seconds spent in ``generate``.
            ``tokens_generated``: output length minus prompt length.
            ``time_per_token``: ``generation_time / tokens_generated``
                (0 when no token was generated).
            ``prompt_length``: number of prompt tokens.
            ``total_length``: number of tokens in the returned sequence.
            ``memory_used``: ``{}`` unless ``measure_memory`` is True and
                memory counters are available; otherwise the end-of-run
                deltas of every counter returned by ``get_memory_usage``
                plus ``gpu_peak_allocated_mb``, the peak allocation reached
                during generation relative to the starting allocation.
            ``use_cache``: the ``use_cache`` argument, echoed back.
    """
    # Encode prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    prompt_length = input_ids.shape[1]
    cuda_available = torch.cuda.is_available()

    # Take the memory baseline before generation
    initial_memory = {}
    if measure_memory:
        clear_memory()
        initial_memory = get_memory_usage()
        if cuda_available:
            # Restart peak tracking so the peak read later belongs to this run.
            torch.cuda.reset_peak_memory_stats()

    # Set cache configuration, remembering the old value so it can be restored.
    # If the attribute is missing, fall back to the requested value.
    previous_use_cache = getattr(model.config, 'use_cache', use_cache)
    model.config.use_cache = use_cache
    try:
        # Synchronize GPU before timing
        if cuda_available:
            torch.cuda.synchronize()

        start_time = time.perf_counter()

        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                use_cache=use_cache,
                do_sample=CONFIG['do_sample'],
                temperature=CONFIG['temperature'],
                top_k=CONFIG['top_k'],
                pad_token_id=tokenizer.pad_token_id,
                attention_mask=torch.ones_like(input_ids)  # Explicit attention mask
            )

        # Synchronize GPU after generation so queued work is included
        if cuda_available:
            torch.cuda.synchronize()

        generation_time = time.perf_counter() - start_time
    finally:
        # Do not leak this call's cache setting into later uses of the model.
        model.config.use_cache = previous_use_cache

    # Measure memory after generation
    memory_used = {}
    if measure_memory:
        final_memory = get_memory_usage()
        memory_used = {
            key: final_memory.get(key, 0) - baseline
            for key, baseline in initial_memory.items()
        }
        # The end-of-run delta only sees tensors still alive after generate()
        # returned; the peak counter also captures allocations that were made
        # and freed while generating.
        if cuda_available and 'gpu_allocated_mb' in initial_memory:
            peak_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
            memory_used['gpu_peak_allocated_mb'] = peak_mb - initial_memory['gpu_allocated_mb']

    # Calculate metrics from the actual output length rather than the request
    total_tokens = output.shape[1]
    tokens_generated = total_tokens - prompt_length

    return {
        'generation_time': generation_time,
        'tokens_generated': tokens_generated,
        'time_per_token': generation_time / tokens_generated if tokens_generated > 0 else 0,
        'prompt_length': prompt_length,
        'total_length': total_tokens,
        'memory_used': memory_used,
        'use_cache': use_cache
    }

In [None]:
def warmup_model(model, tokenizer, device, max_new_tokens: int, use_cache: bool):
    """Run untimed generations so first-run overhead is not measured later.

    Calls ``generate_tokens`` ``CONFIG['warmup_runs']`` times with the
    configured prompt and the given length and cache setting, discarding the
    results, then clears memory. Progress is printed on a single line.
    """
    print(f"  Warming up (use_cache={use_cache})...", end=" ")

    warmup_prompt = CONFIG['initial_prompt']
    for _run in range(CONFIG['warmup_runs']):
        # Result is intentionally ignored; only the side effect matters.
        generate_tokens(
            model,
            tokenizer,
            device,
            warmup_prompt,
            max_new_tokens,
            use_cache,
            measure_memory=False,
        )

    clear_memory()
    print("✓")

In [None]:
def run_experiments(model, tokenizer, device):
    """Benchmark generation for every configured length, with and without KV cache.

    For each entry of ``CONFIG['output_lengths']`` and for ``use_cache`` in
    (True, False): warm up, then run ``CONFIG['trials_per_length']`` timed
    trials. Memory is measured on the first trial of each configuration only.
    Progress is printed as the run proceeds.

    Returns:
        ``pd.DataFrame`` with one row per trial: ``output_length``,
        ``use_cache``, ``trial`` (1-based), ``generation_time_ms``,
        ``time_per_token_ms``, ``tokens_generated``, plus one column per
        memory counter on rows where memory was measured.
    """
    heavy_rule = "=" * 80

    print("\n" + heavy_rule)
    print("STARTING KV CACHE COMPARISON EXPERIMENTS")
    print(heavy_rule)
    print(f"Prompt: '{CONFIG['initial_prompt']}'")
    print(f"Output lengths: {CONFIG['output_lengths']}")
    print(f"Trials per configuration: {CONFIG['trials_per_length']}")
    print(f"Warmup runs: {CONFIG['warmup_runs']}")

    records = []

    for output_length in CONFIG['output_lengths']:
        print(f"\n{'─'*80}")
        print(f"Testing output length: {output_length} tokens")
        print(f"{'─'*80}")

        # Cached configuration first, then the uncached one
        for use_cache in (True, False):
            if use_cache:
                cache_str = "WITH"
            else:
                cache_str = "WITHOUT"
            print(f"\n{cache_str} KV Cache:")

            # Untimed runs for this exact configuration
            warmup_model(model, tokenizer, device, output_length, use_cache)

            for trial_number in range(1, CONFIG['trials_per_length'] + 1):
                print(f"  Trial {trial_number}/{CONFIG['trials_per_length']}...", end=" ")

                result = generate_tokens(
                    model, tokenizer, device,
                    CONFIG['initial_prompt'],
                    output_length,
                    use_cache,
                    measure_memory=(trial_number == 1)  # first trial only
                )

                elapsed_ms = result['generation_time'] * 1000
                per_token_ms = result['time_per_token'] * 1000

                record = {
                    'output_length': output_length,
                    'use_cache': use_cache,
                    'trial': trial_number,
                    'generation_time_ms': elapsed_ms,
                    'time_per_token_ms': per_token_ms,
                    'tokens_generated': result['tokens_generated'],
                }

                # Copy memory deltas across; an empty dict adds nothing
                for counter, delta in result['memory_used'].items():
                    record[f'{counter}'] = delta

                records.append(record)

                print(f"✓ {elapsed_ms:.2f}ms ({per_token_ms:.2f}ms/token)")

                # Small delay between trials
                time.sleep(0.1)

    print("\n" + heavy_rule)
    print("EXPERIMENTS COMPLETE")
    print(heavy_rule)

    return pd.DataFrame(records)

In [None]:
def analyze_results(df: pd.DataFrame):
    """Print summary statistics and per-length speedups for the trial records.

    Args:
        df: Frame with ``output_length``, ``use_cache``,
            ``generation_time_ms`` and ``time_per_token_ms`` columns.

    Returns:
        The rounded summary frame, grouped by output length and cache usage.
    """
    heavy_rule = "=" * 80
    light_rule = "─" * 80

    print("\n" + heavy_rule)
    print("STATISTICAL ANALYSIS")
    print(heavy_rule)

    # Aggregate timing statistics per (output length, cache setting)
    aggregations = {
        'generation_time_ms': ['mean', 'std', 'min', 'max'],
        'time_per_token_ms': ['mean', 'std']
    }
    grouped = df.groupby(['output_length', 'use_cache'])
    summary = grouped.agg(aggregations).round(2)

    print("\nGeneration Time Summary (ms):")
    print(summary)

    print("\n" + light_rule)
    print("SPEEDUP ANALYSIS")
    print(light_rule)

    # Row masks for the two cache settings, shared by every length
    cached_rows = df['use_cache'].eq(True)
    uncached_rows = df['use_cache'].eq(False)

    for length in df['output_length'].unique():
        at_length = df['output_length'].eq(length)
        with_cache = df[at_length & cached_rows]['time_per_token_ms'].mean()
        without_cache = df[at_length & uncached_rows]['time_per_token_ms'].mean()
        # Ratio of mean per-token latencies: uncached over cached
        speedup = without_cache / with_cache

        print(f"\nLength {length} tokens:")
        print(f"  WITH cache:    {with_cache:.2f} ms/token")
        print(f"  WITHOUT cache: {without_cache:.2f} ms/token")
        print(f"  Speedup:       {speedup:.2f}x")

    return summary

In [None]:
# Main execution: load the model, run the benchmark, then print the analysis.
# The names bound here (model, tokenizer, device, results_df, summary) are
# notebook-level globals.
if __name__ == "__main__":
    # Setup: load the model and tokenizer named in CONFIG and pick the device
    model, tokenizer, device = setup_model_and_tokenizer()

    # Run experiments: one DataFrame row per timed trial
    results_df = run_experiments(model, tokenizer, device)

    # Save raw results (currently disabled; uncomment to write the CSV)
    #results_df.to_csv('kv_cache_results.csv', index=False)
    #print("\nResults saved to 'kv_cache_results.csv'")

    # Analyze: print the summary statistics and per-length speedups
    summary = analyze_results(results_df)

Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded on: cuda
Model parameters: 81,912,576

STARTING KV CACHE COMPARISON EXPERIMENTS
Prompt: 'The future of artificial intelligence'
Output lengths: [10, 25, 50, 75, 100, 250, 500, 750, 1000]
Trials per configuration: 5
Warmup runs: 3

────────────────────────────────────────────────────────────────────────────────
Testing output length: 10 tokens
────────────────────────────────────────────────────────────────────────────────

WITH KV Cache:
  Warming up (use_cache=True)... ✓
  Trial 1/5... ✓ 80.02ms (8.00ms/token)
  Trial 2/5... ✓ 56.01ms (5.60ms/token)
  Trial 3/5... ✓ 55.91ms (5.59ms/token)
  Trial 4/5... ✓ 58.61ms (5.86ms/token)
  Trial 5/5... ✓ 68.72ms (6.87ms/token)

WITHOUT KV Cache:
  Warming up (use_cache=False)... ✓
  Trial 1/5... ✓ 59.18ms (5.92ms/token)
  Trial 2/5... ✓ 57.00ms (5.70ms/token)
  Trial 3/5... ✓ 56.90ms (5.69ms/token)
  Trial 4/5... ✓ 60.08ms (6.01ms/token)
  Trial 5/5... ✓ 57.25ms (5.73ms/token)

──────────────────────────────────────────────────────