## Step 1: Verify Dual GPU Environment

In [None]:
import subprocess
import os

print("="*70)
print("üîç DUAL GPU ENVIRONMENT CHECK")
print("="*70)

# Get GPU info
result = subprocess.run(
    ["nvidia-smi", "--query-gpu=index,name,memory.total,compute_cap", "--format=csv,noheader"],
    capture_output=True, text=True
)

gpus = result.stdout.strip().split('\n')
print(f"\nüìä Detected {len(gpus)} GPU(s):")
for gpu in gpus:
    print(f"   {gpu}")

if len(gpus) >= 2:
    print("\n‚úÖ Dual T4 environment confirmed!")
    print("   Total VRAM: 30GB (15GB √ó 2)")
else:
    print("\n‚ö†Ô∏è Only 1 GPU detected!")
    print("   Enable 'GPU T4 x2' in Kaggle settings.")

# CUDA version
print("\nüìä CUDA Version:")
!nvcc --version | grep release

## Step 2: Install llamatelemetry v0.1.0

In [None]:
%%time
# Install llamatelemetry v0.1.0 (force fresh install to ensure correct binaries)
!pip install -q --no-cache-dir --force-reinstall git+https://github.com/llamatelemetry/llamatelemetry.git@v0.1.0
!pip install -q huggingface_hub sseclient-py

import llamatelemetry
print(f"‚úÖ llamatelemetry {llamatelemetry.__version__} installed")

## Step 3: Understanding Multi-GPU Options

llama.cpp provides several flags for multi-GPU configuration:

In [None]:
from llamatelemetry.api.multigpu import MultiGPUConfig, SplitMode, GPUInfo

# Section banner.
print("="*70, "üìã MULTI-GPU CONFIGURATION OPTIONS", "="*70, sep="\n")

# Static reference text describing the multi-GPU command-line flags.
print("""
üîπ --tensor-split, -ts
   Distributes VRAM usage across GPUs.
   Example: --tensor-split 0.5,0.5 (50% each GPU)
   Example: --tensor-split 0.7,0.3 (70% GPU0, 30% GPU1)

üîπ --split-mode, -sm  
   How to split the model across GPUs:
   ‚Ä¢ 'layer' - Split by transformer layers (default, recommended)
   ‚Ä¢ 'row'   - Split by matrix rows (can be slower)
   ‚Ä¢ 'none'  - Disable multi-GPU (single GPU only)

üîπ --main-gpu, -mg
   Primary GPU for small tensors and scratch buffers.
   Default: 0 (first GPU)

üîπ --n-gpu-layers, -ngl
   Number of layers to offload to GPU(s).
   Use 99 to offload all layers.
""")

# List every member of the SplitMode enum as "NAME: value".
print("\nüìã Split Modes:")
for split_mode in SplitMode:
    print("   {}: {}".format(split_mode.name, split_mode.value))

## Step 4: GPU Info Utility

In [None]:
from llamatelemetry.api.multigpu import detect_gpus, get_free_vram

print("="*70, "üìä GPU INFORMATION", "="*70, sep="\n")

# detect_gpus() supplies one entry per GPU; the fields read below are
# id, name, memory_total_gb, memory_free_gb and compute_capability.
gpus = detect_gpus()

for info in gpus:
    report_lines = [
        f"\nüîπ GPU {info.id}: {info.name}",
        f"   Total VRAM: {info.memory_total_gb:.1f} GB",
        f"   Free VRAM: {info.memory_free_gb:.1f} GB",
    ]
    # Compute capability is only shown when the entry carries a truthy value.
    if info.compute_capability:
        report_lines.append(f"   Compute Capability: {info.compute_capability}")
    print("\n".join(report_lines))

# Free (not total) memory is what gets summed for the "available" figure.
total_vram = sum(info.memory_free_gb for info in gpus)
print(f"\nüìä Total Available VRAM: {total_vram:.1f} GB")

## Step 5: Download a Larger Model for Multi-GPU Testing

We'll use Gemma-3-4B, which benefits from dual-GPU distribution.

In [None]:
%%time
from huggingface_hub import hf_hub_download
import os

# Repository and file name of the 4B GGUF model used for the multi-GPU tests.
MODEL_REPO = "unsloth/gemma-3-4b-it-GGUF"
MODEL_FILE = "gemma-3-4b-it-Q4_K_M.gguf"

print(
    f"üì• Downloading {MODEL_FILE}...",
    "   This ~2.5GB model will be split across both GPUs.",
    sep="\n",
)

# hf_hub_download returns the path of the file; later cells reuse
# model_path when launching the server.
model_path = hf_hub_download(
    repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir="/kaggle/working/models"
)

# Report the on-disk size in GiB (bytes / 1024**3).
size_gb = os.stat(model_path).st_size / (1024**3)
print(f"\n‚úÖ Model downloaded: {model_path}", f"   Size: {size_gb:.2f} GB", sep="\n")

## Step 6: Tensor-Split Configurations

Different ways to distribute the model across GPUs.

In [None]:
from llamatelemetry.api.multigpu import MultiGPUConfig, SplitMode

print("="*70, "üìã TENSOR-SPLIT CONFIGURATIONS", "="*70, sep="\n")

# Catalogue of example tensor-split layouts. Each entry carries a display
# name, the per-GPU split fractions, a description and a suggested use case.
configs = [
    dict(
        name="Equal Split (50/50)",
        tensor_split=[0.5, 0.5],
        description="Equal distribution across both GPUs",
        use_case="Default, balanced workload",
    ),
    dict(
        name="GPU 0 Heavy (70/30)",
        tensor_split=[0.7, 0.3],
        description="More VRAM on GPU 0",
        use_case="When GPU 1 needed for other tasks",
    ),
    dict(
        name="GPU 0 Only (100/0)",
        tensor_split=[1.0, 0.0],
        description="Single GPU mode",
        use_case="When GPU 1 reserved for RAPIDS/Graphistry",
    ),
]

# Print the catalogue, numbering entries from 1.
for position, entry in enumerate(configs, start=1):
    print(
        f"\nüîπ Config {position}: {entry['name']}",
        f"   Tensor Split: {entry['tensor_split']}",
        f"   Description: {entry['description']}",
        f"   Use Case: {entry['use_case']}",
        sep="\n",
    )

## Step 7: Start Server with Dual-GPU Configuration

In [None]:
from llamatelemetry.server import ServerManager
from llamatelemetry.api.multigpu import SplitMode

print("="*70, "üöÄ STARTING DUAL-GPU SERVER", "="*70, sep="\n")

# Dual-GPU configuration parameters. Every key is forwarded below as a
# keyword argument of the same name to start_server().
dual_config = dict(
    model_path=model_path,
    host="127.0.0.1",
    port=8080,
    # Multi-GPU settings
    gpu_layers=99,              # Offload all layers
    tensor_split="0.5,0.5",     # Equal split (as comma-separated string)
    # Performance
    ctx_size=8192,
    batch_size=1024,
    # Parallelism
    n_parallel=4,
)

print(
    "\nüìã Dual-GPU Configuration:",
    f"   Model: {model_path.rsplit('/', 1)[-1]}",
    "   Tensor Split: GPU0=50%, GPU1=50%",
    f"   Context Size: {dual_config['ctx_size']}",
    sep="\n",
)

# The manager is pointed at the same host/port the server is asked to bind.
server = ServerManager(server_url="http://{host}:{port}".format(**dual_config))
print("\nüöÄ Starting server...")

try:
    # Failures are reported rather than re-raised, so the cell always finishes.
    server.start_server(**dual_config, timeout=120, verbose=True)
    print("\n‚úÖ Dual-GPU server started successfully!")
except Exception as err:
    print(f"\n‚ùå Server failed to start: {err}")

## Step 8: Verify Multi-GPU Distribution

In [None]:
import subprocess

print("="*70, "üìä GPU MEMORY DISTRIBUTION", "="*70, sep="\n")

# Ask nvidia-smi for one CSV row per GPU: index, name, used and total memory,
# and utilization (the utilization column is queried but not displayed).
result = subprocess.run(
    [
        "nvidia-smi",
        "--query-gpu=index,name,memory.used,memory.total,utilization.gpu",
        "--format=csv,noheader",
    ],
    capture_output=True,
    text=True,
)

print("\nüìä GPU Memory After Model Load:")
for row in result.stdout.strip().split('\n'):
    fields = row.split(', ')
    # Rows with fewer than four fields (e.g. empty output) are skipped.
    if len(fields) < 4:
        continue
    idx, name, used, total = fields[:4]
    print(f"   GPU {idx}: {used} / {total}")

print("\nüí° Both GPUs should show VRAM usage if tensor-split is working.")

## Step 9: Benchmark Multi-GPU Performance

In [None]:
import time
from llamatelemetry.api.client import LlamaCppClient

print("="*70)
print("üìä MULTI-GPU PERFORMANCE BENCHMARK")
print("="*70)

# Client for the server started earlier on 127.0.0.1:8080.
client = LlamaCppClient(base_url="http://127.0.0.1:8080")

# Longer prompts to test multi-GPU throughput
prompts = [
    "Write a detailed explanation of how GPU parallelism works in deep learning.",
    "Explain the architecture of a transformer model step by step.",
    "Describe the CUDA programming model and its key concepts.",
    "What are the advantages of using multiple GPUs for inference?",
    "Explain tensor parallelism vs pipeline parallelism.",
]

print(f"\nüèÉ Running benchmark with {len(prompts)} prompts...\n")

# Running totals across all prompts.
total_input_tokens = 0
total_output_tokens = 0
total_time = 0

for i, prompt in enumerate(prompts, 1):
    # perf_counter() is a monotonic high-resolution clock, so the measured
    # interval cannot be skewed by system clock adjustments (unlike time.time()).
    start = time.perf_counter()

    response = client.chat.create(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.7
    )

    # The interval covers the whole request, so the rate below is end-to-end
    # (request round trip), not pure decode speed.
    elapsed = time.perf_counter() - start
    input_tokens = response.usage.prompt_tokens
    output_tokens = response.usage.completion_tokens

    total_input_tokens += input_tokens
    total_output_tokens += output_tokens
    total_time += elapsed

    # Guard against a zero interval so a degenerate measurement cannot raise
    # ZeroDivisionError and abort the benchmark.
    tok_per_sec = output_tokens / elapsed if elapsed > 0 else 0.0
    print(f"   Prompt {i}: {output_tokens} tokens in {elapsed:.2f}s ({tok_per_sec:.1f} tok/s)")

# Same zero guard for the aggregate rate (also covers an empty prompt list).
average_tok_per_sec = total_output_tokens / total_time if total_time > 0 else 0.0

print(f"\nüìä Benchmark Results:")
print(f"   Total Input Tokens: {total_input_tokens}")
print(f"   Total Output Tokens: {total_output_tokens}")
print(f"   Total Time: {total_time:.2f}s")
print(f"   Average Generation Speed: {average_tok_per_sec:.1f} tokens/second")

## Step 10: Test Different Split Configurations

In [None]:
import time

# Shut the running server down so a later cell can start it again with a
# different tensor split.
print("üõë Stopping server for reconfiguration...")
server.stop_server()

# Fixed 2-second pause before reporting completion
# (presumably to let the process release its resources - TODO confirm).
time.sleep(2)
print("‚úÖ Server stopped")

In [None]:
print("="*70)
print("üîß TESTING 70/30 SPLIT CONFIGURATION")
print("="*70)

# 70/30 split - more on GPU 0
config_70_30 = {
    "model_path": model_path,
    "host": "127.0.0.1",
    "port": 8080,
    "gpu_layers": 99,
    "tensor_split": "0.7,0.3",  # 70% GPU0, 30% GPU1
    "ctx_size": 8192,
}

print(f"\nüìã Configuration:")
print(f"   Tensor Split: GPU0=70%, GPU1=30%")
print(f"   Use Case: When GPU1 needs memory for other tasks")

try:
    server.start_server(
        model_path=config_70_30['model_path'],
        host=config_70_30['host'],
        port=config_70_30['port'],
        gpu_layers=config_70_30['gpu_layers'],
        ctx_size=config_70_30['ctx_size'],
        timeout=60,
        verbose=True,
        tensor_split=config_70_30['tensor_split']
    )
    print("\n‚úÖ 70/30 split server started!")
    
    # Check memory distribution
    print("\nüìä Memory Distribution:")
    !nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv
except Exception as e:
    print(f"\n‚ùå Failed to start: {e}")

## Step 11: Split-GPU Mode for LLM + RAPIDS

Configure for running LLM on GPU 0 while reserving GPU 1 for RAPIDS/Graphistry.

In [None]:
import time

# Stop the current server, then wait a fixed 2 seconds before continuing.
server.stop_server()
time.sleep(2)

print("="*70, "üéØ SPLIT-GPU MODE: LLM + RAPIDS", "="*70, sep="\n")

print("""
This configuration runs the LLM entirely on GPU 0,
leaving GPU 1 free for RAPIDS/cuGraph/Graphistry.

‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ       GPU 0 (15GB)      ‚îÇ        GPU 1 (15GB)           ‚îÇ
‚îÇ  ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê    ‚îÇ   ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê ‚îÇ
‚îÇ  ‚îÇ  llama-server   ‚îÇ    ‚îÇ   ‚îÇ  RAPIDS / Graphistry    ‚îÇ ‚îÇ
‚îÇ  ‚îÇ  (Full Model)   ‚îÇ    ‚îÇ   ‚îÇ  (Graph Visualization)  ‚îÇ ‚îÇ
‚îÇ  ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò    ‚îÇ   ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
""")

# Single GPU configuration (GPU 0 only)
split_gpu_config = {
    "model_path": model_path,
    "host": "127.0.0.1",
    "port": 8080,
    "gpu_layers": 99,
    "tensor_split": "1.0,0.0",  # 100% on GPU 0
    "ctx_size": 4096,  # Smaller context to fit in single GPU
}

try:
    server.start_server(
        model_path=split_gpu_config['model_path'],
        host=split_gpu_config['host'],
        port=split_gpu_config['port'],
        gpu_layers=split_gpu_config['gpu_layers'],
        ctx_size=split_gpu_config['ctx_size'],
        timeout=60,
        verbose=True,
        tensor_split=split_gpu_config['tensor_split']
    )
    print("\n‚úÖ Split-GPU mode server started!")
    print("   GPU 0: llama-server")
    print("   GPU 1: Available for RAPIDS/Graphistry")
    
    print("\nüìä Memory Distribution:")
    !nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv
except Exception as e:
    print(f"\n‚ùå Failed to start: {e}")

## Step 12: Verify GPU 1 is Free for RAPIDS

In [None]:
import subprocess

print("="*70)
print("üìä GPU 1 AVAILABILITY CHECK")
print("="*70)

# One CSV row per GPU: index, memory used, memory free.
result = subprocess.run(
    ["nvidia-smi", "--query-gpu=index,memory.used,memory.free", "--format=csv,noheader"],
    capture_output=True, text=True
)

# Select GPU 1 by its reported index field rather than by row position, so the
# check reads the right device even if rows are missing or ordered differently.
gpu1_info = None
for line in result.stdout.strip().split('\n'):
    fields = [field.strip() for field in line.split(',')]
    if len(fields) >= 3 and fields[0] == "1":
        gpu1_info = fields
        break

if gpu1_info is None:
    # Say so explicitly instead of silently printing nothing.
    print("\nWARNING: GPU 1 not found in nvidia-smi output - cannot verify availability.")
else:
    used = gpu1_info[1]
    free = gpu1_info[2]

    print(f"\nüìä GPU 1 Status:")
    print(f"   Memory Used: {used}")
    print(f"   Memory Free: {free}")

    # Parse free memory; the value is expected to look like "<number> MiB".
    try:
        free_mb = int(free.replace(' MiB', ''))
    except ValueError:
        free_mb = None

    if free_mb is None:
        print(f"\nWARNING: Could not parse free memory value: {free!r}")
    elif free_mb > 14000:  # > 14GB free
        print(f"\n‚úÖ GPU 1 has {free_mb/1024:.1f} GB free - Ready for RAPIDS!")
    else:
        print(f"\n‚ö†Ô∏è GPU 1 has limited free memory.")

## Step 13: Quick RAPIDS Verification on GPU 1

In [None]:
import os

# Remember any existing setting so it can be restored afterwards instead of
# being discarded.
previous_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
# Force RAPIDS to use GPU 1.
# NOTE(review): this presumably only takes effect if CUDA has not already been
# initialised in this kernel process - confirm on a fresh kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

print("="*70)
print("üî• RAPIDS ON GPU 1 VERIFICATION")
print("="*70)

try:
    import cudf
    import cupy as cp

    # Create a small cuDF DataFrame on GPU 1
    df = cudf.DataFrame({
        'x': range(1000),
        'y': range(1000, 2000)
    })

    print(f"\n‚úÖ cuDF working on GPU 1")
    print(f"   DataFrame shape: {df.shape}")
    print(f"   Memory used: {df.memory_usage().sum() / 1024:.2f} KB")

    # Verify GPU. Device 0 in the filtered view = actual GPU 1.
    # The device name is read from the runtime device properties;
    # Device.attributes only exposes numeric attributes and has no 'Name'
    # entry, so indexing it by 'Name' raises KeyError.
    print(f"\nüìä cuPy GPU Info:")
    device_props = cp.cuda.runtime.getDeviceProperties(0)
    print(f"   Name: {device_props['name'].decode()}")

except ImportError as e:
    print(f"\n‚ö†Ô∏è RAPIDS not available: {e}")
    print("   Install with: pip install cudf-cu12 cuml-cu12")
finally:
    # Reset CUDA_VISIBLE_DEVICES even if the check above raised, so the
    # override does not leak into processes started by later cells.
    if previous_visible_devices is None:
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = previous_visible_devices

## Step 14: Cleanup

In [None]:
print("üõë Stopping server...")
server.stop_server()

print("\n‚úÖ Server stopped")
print("\nüìä Final GPU Status:")
!nvidia-smi --query-gpu=index,memory.used,memory.free --format=csv

## 📚 Summary

You've learned:
1. ✅ Multi-GPU configuration options (tensor-split, split-mode)
2. ✅ Equal split (50/50) for maximum model size
3. ✅ Asymmetric split (70/30) for mixed workloads
4. ✅ Single-GPU mode (100/0) for LLM + RAPIDS
5. ✅ Performance benchmarking across GPUs

## Configuration Quick Reference

| Use Case | Tensor Split | GPU 0 | GPU 1 |
|----------|--------------|-------|-------|
| Max Model Size | 0.5, 0.5 | LLM (50%) | LLM (50%) |
| LLM + Light Task | 0.7, 0.3 | LLM (70%) | LLM (30%) |
| LLM + RAPIDS | 1.0, 0.0 | LLM (100%) | RAPIDS (100%) |

---

**Next:** [04-gguf-quantization](04-gguf-quantization-llamatelemetry-v0.1.0.ipynb)