# Zenith Performance Benchmark - Proper Backend

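This notebook benchmarks the **real** Zenith optimization pipeline (the `zenith` backend for `torch.compile`) against a plain PyTorch baseline, rather than exercising a pass-through backend.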

**Hardware:** NVIDIA Tesla T4 (Google Colab)
**Model:** TinyLlama 1.1B with LoRA fine-tuning

In [None]:
# Install dependencies
!pip install -q transformers peft datasets trl accelerate bitsandbytes
# Install Zenith from GitHub (latest with integrations module)
!pip install -q git+https://github.com/vibeswithkk/ZENITH.git

In [None]:
# Report the PyTorch build and, when a CUDA device is visible, its name and memory.
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {cuda_ok}")
if cuda_ok:
    # Device index 0 is the only GPU this notebook inspects.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    # total_memory is in bytes; divide by 1e9 to show decimal gigabytes.
    print(f"VRAM: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Importing zenith is expected to register the 'zenith' torch.compile backend
# as a side effect of the import itself.
import zenith
print(f"Zenith Version: {zenith.__version__}")

# Ask Zenith's own integration module whether it considers the backend registered.
from zenith.integrations.torch_dynamo import is_registered
registered = is_registered()
print(f"Zenith Backend Registered: {registered}")

# Cross-check against the backend names TorchDynamo reports, when this
# PyTorch build exposes torch._dynamo at all.
dynamo = getattr(torch, "_dynamo", None)
if dynamo is not None:
    backends = dynamo.list_backends()
    print(f"Available backends: {backends}")
    assert "zenith" in backends, "Zenith backend not registered!"

In [None]:
# Setup imports
import gc
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

def clean_memory():
    """Free unreferenced Python objects and reset CUDA memory bookkeeping.

    Runs a garbage-collection pass and then, only when a CUDA device is
    available, releases cached allocator blocks and resets the peak-memory
    counters so a later ``torch.cuda.max_memory_allocated()`` reflects just
    the work done after this call.

    On a runtime without CUDA only the garbage collection is performed,
    because resetting the peak statistics needs a usable CUDA device and can
    fail otherwise.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

In [None]:
# Benchmark function
def run_benchmark(use_zenith: bool, steps: int = 50, model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0") -> dict:
    """Fine-tune a LoRA-wrapped causal LM for ``steps`` optimizer steps and time it.

    The model and tokenizer are loaded from ``model_name``, wrapped with a
    LoRA adapter, optionally compiled with the ``zenith`` torch.compile
    backend, and trained with TRL's ``SFTTrainer`` on the first ``steps * 2``
    rows of the ``tatsu-lab/alpaca`` training split.

    Args:
        use_zenith: When true, compile the model with ``backend="zenith"``
            before training; otherwise train the eager PyTorch model.
        steps: Value passed as ``max_steps`` to the trainer; also determines
            how many dataset rows are loaded.
        model_name: Hugging Face model identifier to load.

    Returns:
        A dict with the keys ``"mode"`` (label of the run), ``"total_time"``
        (seconds spent inside ``trainer.train()``), ``"peak_vram_gb"`` (peak
        CUDA memory allocated since the start of the run, in GiB) and
        ``"steps"``.

    Note:
        torch.compile compiles lazily on the first forward pass, so for the
        Zenith run the measured time includes compilation.
    """
    mode_name = "ZENITH (Real Backend)" if use_zenith else "PYTORCH (Baseline)"
    print(f"\n{'='*20} {mode_name} {'='*20}")

    clean_memory()

    # Decide the precision once so model loading and the trainer flags agree,
    # and so bf16 support is only queried when a CUDA device exists.
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.is_bf16_supported()
    use_fp16 = has_cuda and not use_bf16

    # Load model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16 if use_bf16 else torch.float16,
        device_map="auto"
    )

    # Apply LoRA
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)

    # Apply Zenith optimization (REAL backend, not pass-through!)
    if use_zenith:
        print("Applying Zenith optimization via torch.compile...")
        # Compile the PEFT wrapper in place. Assigning a compiled module to
        # `model.model` would only register an extra, unused submodule on the
        # wrapper: its forward dispatches through `base_model`, so the
        # compiled module would never run. In-place compilation keeps the
        # object a PeftModel and routes every `model(...)` call through the
        # 'zenith' backend.
        model.compile(backend="zenith")
        print("Zenith compilation complete!")

    # Dataset
    dataset = load_dataset("tatsu-lab/alpaca", split=f"train[:{steps*2}]")

    def format_prompt(sample):
        """Render one dataset row as an instruction/response training prompt."""
        return f"### Instruction:\n{sample['instruction']}\n\n### Response:\n{sample['output']}"

    # Trainer
    args = SFTConfig(
        output_dir=f"./results_{mode_name.replace(' ', '_')}",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        logging_steps=10,
        max_steps=steps,
        fp16=use_fp16,
        bf16=use_bf16,
        report_to="none",
        packing=False
    )

    # The model is already a PeftModel, so peft_config is deliberately not
    # passed again: doing so is redundant and, depending on the TRL version,
    # is either ignored or rejected.
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=args,
        processing_class=tokenizer,
        formatting_func=format_prompt
    )

    # Train and time. CUDA kernels run asynchronously, so synchronize on both
    # sides of the measurement and use a monotonic high-resolution clock.
    if has_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()
    trainer.train()
    if has_cuda:
        torch.cuda.synchronize()
    end = time.perf_counter()

    # Results
    total_time = end - start
    # Peak counters were reset by clean_memory() at the start of this run.
    peak_mem = torch.cuda.max_memory_allocated() / 1024**3 if has_cuda else 0.0

    results = {
        "mode": mode_name,
        "total_time": total_time,
        "peak_vram_gb": peak_mem,
        "steps": steps
    }

    print(f"Total Time: {total_time:.2f}s")
    print(f"Peak VRAM: {peak_mem:.2f} GB")

    # Drop the large objects before the next run so it starts from a clean slate.
    del model, trainer, dataset
    clean_memory()

    return results

In [None]:
# Print the header, then run both configurations with the same step count.
print("=" * 60, "  ZENITH PERFORMANCE BENCHMARK - PROPER BACKEND", "=" * 60, sep="\n")

# Eager PyTorch run: the reference the Zenith run is compared against.
pytorch_results = run_benchmark(False, steps=50)

# Same workload, compiled with the Zenith backend.
zenith_results = run_benchmark(True, steps=50)

In [None]:
# Summarise both runs: wall-clock totals, relative time saved, and peak VRAM.
print("\n" + "=" * 60, "  BENCHMARK RESULTS", "=" * 60, sep="\n")

baseline_time = pytorch_results['total_time']
zenith_time = zenith_results['total_time']

# Percentage of the baseline's training time saved by the Zenith run;
# positive means Zenith finished sooner, negative means it was slower.
speedup = (baseline_time - zenith_time) / baseline_time * 100

print(f"\nPyTorch Baseline: {baseline_time:.2f}s")
print(f"Zenith Optimized: {zenith_time:.2f}s")
print(f"\nSpeedup: {speedup:+.2f}%")

baseline_vram = pytorch_results['peak_vram_gb']
zenith_vram = zenith_results['peak_vram_gb']
print(f"\nPeak VRAM (PyTorch): {baseline_vram:.2f} GB")
print(f"Peak VRAM (Zenith):  {zenith_vram:.2f} GB")

## Expected Results

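With the **proper** Zenith backend (not a pass-through), the expected outcome is:
- Training speedup: +5-15%, reported above as the percentage reduction in total training time relative to the PyTorch baseline
- Same or lower peak VRAM usage
- Numerical accuracy preserved (MSE ~ 0); note that this notebook does not itself measure numerical accuracy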