# vLLM GPU Inference Example

This notebook demonstrates how to use vLLM for high-performance LLM inference on your RTX 4090.

In [None]:
# Check GPU status
# Reports the PyTorch build, whether CUDA is usable, and (when it is) the name
# and total memory of device 0, then prints the raw `nvidia-smi` table.
import torch
import subprocess

cuda_available = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_available}")
if cuda_available:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    # Integer division floors to whole GiB, so the value can read slightly
    # below the card's marketed capacity.
    print(f"GPU memory: {device_props.total_memory // (1024**3)} GB")
else:
    print("GPU device: None")
    print("GPU memory: 0 GB")

# Show nvidia-smi
print("\nNVIDIA-SMI Output:")
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
except FileNotFoundError:
    # No NVIDIA driver tools on PATH (CPU-only machine, minimal container, ...).
    # Report it instead of aborting the cell with a traceback.
    print("nvidia-smi was not found on PATH; skipping driver report.")
else:
    print(result.stdout)
    if result.returncode != 0:
        # On failure nvidia-smi may write its diagnostic to stderr; surface it.
        print(f"nvidia-smi exited with code {result.returncode}:")
        print(result.stderr)

In [None]:
# Imports for the inference cells below: `time` for the timing measurements,
# and vLLM's engine (`LLM`) and decoding configuration (`SamplingParams`).
import time

from vllm import LLM, SamplingParams

print("vLLM imported successfully!")

In [None]:
# Load a small model for demonstration
# You can replace this with larger models like "meta-llama/Llama-2-7b-chat-hf" if you have access.
# NOTE: a 7B model in fp16 needs roughly 13-14 GB for its weights alone, which
# exceeds 50% of a 24 GB card -- raise gpu_memory_utilization before swapping it in.

print("Loading model...")
llm = LLM(
    model="microsoft/DialoGPT-small",  # Small model for demo
    # trust_remote_code is deliberately left at its default (False): enabling it
    # lets the model repository execute arbitrary Python on this machine, and
    # this checkpoint is a standard GPT-2 architecture that does not need it.
    gpu_memory_utilization=0.5,  # Fraction of GPU memory vLLM may use (weights + KV cache)
    max_model_len=512  # Limit sequence length (prompt + generated tokens)
)
print("Model loaded successfully!")

In [None]:
# Create sampling parameters (also reused by the batch-size experiment below)
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=100
)

# Test prompts
prompts = [
    "Hello, how are you today?",
    "What is artificial intelligence?",
    "Explain machine learning in simple terms.",
    "What are the benefits of GPU acceleration?"
]

print(f"Generating responses for {len(prompts)} prompts...\n")

# Measure inference time.
# perf_counter() is monotonic and high-resolution, so the interval cannot be
# distorted by system clock adjustments the way time.time() can.
generation_start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.perf_counter() - generation_start

# Display results
for i, output in enumerate(outputs):
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt {i+1}: {prompt}")
    print(f"Response: {generated_text}")
    print("-" * 50)

# All prompts are processed together in one batched call, so the per-prompt
# figure is total time divided by batch size, not the latency of one request.
print(f"\nTotal inference time: {elapsed:.2f} seconds")
print(f"Average time per prompt: {elapsed / len(prompts):.2f} seconds")

# Sampled completions differ in length, so tokens/second is the fairer metric.
generated_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
print(f"Throughput: {generated_tokens / elapsed:.1f} generated tokens/second")

In [None]:
# Demonstrate batch processing benefits
# Runs the same prompt at several batch sizes, records the average wall time
# per prompt for each, plots the curve, and reports the fastest batch size.
import matplotlib.pyplot as plt
import numpy as np

# Test different batch sizes
batch_sizes = [1, 2, 4, 8]
times = []

test_prompt = "Tell me about the future of AI."

for batch_size in batch_sizes:
    prompts_batch = [test_prompt] * batch_size

    # perf_counter() is monotonic; cell-local names keep the results of the
    # previous cell (outputs, timings) intact when this cell is re-run.
    batch_start = time.perf_counter()
    batch_outputs = llm.generate(prompts_batch, sampling_params)
    batch_elapsed = time.perf_counter() - batch_start

    avg_time_per_prompt = batch_elapsed / batch_size
    times.append(avg_time_per_prompt)

    print(f"Batch size {batch_size}: {avg_time_per_prompt:.3f}s per prompt")

# NOTE: sampling is stochastic (temperature > 0), so completions can stop at
# different lengths from run to run; expect some noise in these numbers.

# Plot results
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(batch_sizes, times, 'bo-', linewidth=2, markersize=8)
ax.set_xlabel('Batch Size')
ax.set_ylabel('Average Time per Prompt (seconds)')
ax.set_title('vLLM Batch Processing Performance on RTX 4090')
ax.grid(True, alpha=0.3)
plt.show()

best_index = int(np.argmin(times))
print(f"\nBest performance at batch size {batch_sizes[best_index]} with {times[best_index]:.3f}s per prompt")

## Conclusion

This notebook demonstrated:
1. GPU detection and status checking
2. vLLM model loading and configuration
3. Single and batch inference
4. Performance measurement across batch sizes

### Next Steps:
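- Try larger models like Llama 2 7B (raise `gpu_memory_utilization` first); Llama 2 13B needs about 26 GB for fp16 weights, so it only fits a 24 GB RTX 4090 with quantization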
- Experiment with different sampling parameters
- Implement streaming responses for real-time applications
- Explore quantization for memory efficiency