# Real-Time Performance Monitoring

**Duration:** ~25 min | **Platform:** Kaggle dual Tesla T4

This notebook demonstrates **real-time monitoring** of LLM inference —
continuous GPU metrics, inference latency tracking, throughput analysis,
and performance profiling.

### What you'll learn
1. Background GPU monitoring with `start_sampler()`
2. Inference latency tracking
3. Throughput analysis and charts
4. Performance profiling across configurations
5. Live monitoring in Jupyter

In [None]:
!pip install -q git+https://github.com/llamatelemetry/llamatelemetry.git@v1.2.0
!pip install -q matplotlib

import llamatelemetry
from llamatelemetry.llama import ServerManager, LlamaCppClient
from llamatelemetry.gpu import start_sampler, snapshot
from huggingface_hub import hf_hub_download
import time

# Register this notebook's telemetry under a dedicated service name.
llamatelemetry.init(service_name="realtime-monitoring")

# Download (or reuse from the cache directory) the quantized GGUF model file.
model_source = {
    "repo_id": "bartowski/google_gemma-3-1b-it-GGUF",
    "filename": "google_gemma-3-1b-it-Q4_K_M.gguf",
    "cache_dir": "/root/.cache/huggingface",
}
model_path = hf_hub_download(**model_source)

# Server launch options.
# NOTE(review): tensor_split="0.5,0.5" presumably divides the model evenly
# across the two T4 GPUs, and gpu_layers=99 presumably offloads every layer —
# confirm against the ServerManager documentation.
server_options = {
    "model_path": model_path,
    "gpu_layers": 99,
    "tensor_split": "0.5,0.5",
    "ctx_size": 2048,
}

mgr = ServerManager()
mgr.start_server(**server_options)
# Block until the server reports ready (60 s timeout) before creating a client.
mgr.wait_until_ready(timeout=60)

# NOTE(review): no port is passed to start_server(), so this URL assumes the
# server's default port is 8090 — confirm.
client = LlamaCppClient(base_url="http://127.0.0.1:8090")
print("Ready")

## GPU Background Monitoring

`start_sampler()` launches a background thread that captures GPU metrics
at regular intervals without blocking inference.

In [None]:
# Sustained workload: eight short prompts sent one after another.
prompts = [
    "Explain backpropagation in detail.",
    "What is the difference between supervised and unsupervised learning?",
    "Describe the attention mechanism in transformers.",
    "How does batch normalization work?",
    "What are the advantages of residual connections?",
    "Explain the concept of regularization.",
    "What is transfer learning and why is it useful?",
    "Describe the architecture of a GAN.",
]

# Start background sampling at 500ms intervals
handle = start_sampler(interval_ms=500)

try:
    # Responses are discarded; this loop only exists to generate GPU load
    # while the sampler is recording.
    for prompt in prompts:
        client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=64, temperature=0.7,
        )
finally:
    # Always stop the sampler, even if a request fails, so it does not keep
    # running in the background after the cell errors out.
    handle.stop()

# Collect whatever was recorded while the workload ran.
samples = handle.get_snapshots()
print(f"Collected {len(samples)} GPU samples over {len(prompts)} requests")

# Per-GPU min / max / average of memory use and utilization
for gpu_id in (0, 1):
    per_gpu = [snap for snap in samples if snap.gpu_id == gpu_id]
    if not per_gpu:
        # No samples were recorded for this GPU index; print nothing for it.
        continue

    mem = [snap.mem_used_mb for snap in per_gpu]
    util = [snap.utilization_pct for snap in per_gpu]
    # Averages use floor division.
    mem_avg = sum(mem) // len(mem)
    util_avg = sum(util) // len(util)

    print(f"\nGPU {gpu_id}:")
    print(f"  Memory: {min(mem)}-{max(mem)} MB (avg {mem_avg} MB)")
    print(f"  Utilization: {min(util)}-{max(util)}% (avg {util_avg}%)")

## Inference Latency Tracking

Measure per-request latency with traced spans.

In [None]:
# Per-request measurements appended by latency_test(); re-running this cell
# resets them to empty lists.
latencies = []      # request latency in milliseconds
token_counts = []   # completion tokens reported for each request
token_rates = []    # completion tokens per second for each request


@llamatelemetry.workflow(name="latency-test")
def latency_test(client, prompts, max_tokens=64):
    """Send each prompt as a chat completion and record its timing.

    For every prompt, one request is issued inside an "inference" span and
    three values are appended to the module-level lists ``latencies``
    (milliseconds), ``token_counts`` and ``token_rates`` (tokens/second).

    Args:
        client: Object exposing ``chat.completions.create(...)`` whose
            response carries ``usage.completion_tokens``.
        prompts: Iterable of user-message strings, sent one at a time.
        max_tokens: Value passed as ``max_tokens`` on every request.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    for prompt in prompts:
        # The span object itself is not needed, so it is not bound to a name.
        with llamatelemetry.span("inference"):
            t0 = time.perf_counter()
            resp = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens, temperature=0.7,
            )
            elapsed_ms = (time.perf_counter() - t0) * 1000

        tokens = resp.usage.completion_tokens
        # Guard against a zero elapsed time so the rate never divides by zero.
        tps = tokens / (elapsed_ms / 1000) if elapsed_ms > 0 else 0

        latencies.append(elapsed_ms)
        token_counts.append(tokens)
        token_rates.append(tps)

import numpy as np

# Twenty prompts that differ only in the fact number.
test_prompts = [
    f"Tell me interesting fact #{n} about neural networks." for n in range(1, 21)
]
latency_test(client, test_prompts)

# Summarise the recorded latencies: mean plus the 50th/95th/99th percentiles.
mean_ms = np.mean(latencies)
p50_ms, p95_ms, p99_ms = np.percentile(latencies, [50, 95, 99])
mean_rate = np.mean(token_rates)

print(f"Latency statistics ({len(latencies)} requests):")
print(f"  Mean: {mean_ms:.0f} ms")
print(f"  P50:  {p50_ms:.0f} ms")
print(f"  P95:  {p95_ms:.0f} ms")
print(f"  P99:  {p99_ms:.0f} ms")
print(f"  Avg tokens/s: {mean_rate:.1f}")

## Throughput Analysis

Measure how latency and throughput vary with the maximum output length (five requests per setting).

In [None]:
import matplotlib.pyplot as plt

def _timed_completion(max_tok):
    """Send one fixed-prompt request; return (seconds elapsed, completion tokens)."""
    started = time.perf_counter()
    reply = client.chat.completions.create(
        messages=[{"role": "user", "content": "Explain gradient descent."}],
        max_tokens=max_tok, temperature=0.7,
    )
    return time.perf_counter() - started, reply.usage.completion_tokens


# Sweep over output-length caps, five requests per setting.
output_lengths = [16, 32, 64, 128, 256]
throughput_results = []

for max_tok in output_lengths:
    runs = [_timed_completion(max_tok) for _ in range(5)]
    times = [secs for secs, _ in runs]
    total_tokens = sum(count for _, count in runs)

    avg_time = np.mean(times)
    # Aggregate rate: all tokens generated divided by all time spent.
    avg_tps = total_tokens / sum(times)
    throughput_results.append((max_tok, avg_time * 1000, avg_tps))
    print(f"  max_tokens={max_tok:4d}: {avg_time*1000:.0f} ms avg, {avg_tps:.1f} tok/s")

# Split the (max_tokens, latency_ms, tok/s) tuples into one series per column.
lengths, avg_latencies, avg_tps = (list(column) for column in zip(*throughput_results))

# Two side-by-side panels sharing the same x-axis values.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (avg_latencies, "steelblue", "Latency (ms)", "Latency vs Output Length"),
    (avg_tps, "green", "Tokens/sec", "Throughput vs Output Length"),
]
for ax, (series, colour, y_label, title) in zip(axes, panels):
    ax.plot(lengths, series, "o-", color=colour, linewidth=2)
    ax.set_xlabel("Max Output Tokens")
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Performance Profiling

Profile the latency test results: plot the latency distribution and the per-request token generation rate.

In [None]:
# Summary statistics shared by the reference lines and their legend labels.
mean_latency = np.mean(latencies)
p95_latency = np.percentile(latencies, 95)
mean_rate = np.mean(token_rates)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
hist_ax, rate_ax = axes

# Left panel: latency histogram with mean and P95 markers.
hist_ax.hist(latencies, bins=15, color="steelblue", edgecolor="black", alpha=0.8)
hist_ax.axvline(mean_latency, color="red", linestyle="--", label=f"Mean: {mean_latency:.0f} ms")
hist_ax.axvline(p95_latency, color="orange", linestyle="--", label=f"P95: {p95_latency:.0f} ms")
hist_ax.set_xlabel("Latency (ms)")
hist_ax.set_ylabel("Count")
hist_ax.set_title("Latency Distribution")
hist_ax.legend()

# Right panel: tokens/sec for each request in the order it was issued.
request_index = range(len(token_rates))
rate_ax.plot(request_index, token_rates, "o-", color="green", markersize=4)
rate_ax.axhline(mean_rate, color="red", linestyle="--", label=f"Mean: {mean_rate:.1f} tok/s")
rate_ax.set_xlabel("Request #")
rate_ax.set_ylabel("Tokens/sec")
rate_ax.set_title("Token Generation Rate Over Time")
rate_ax.legend()

plt.tight_layout()
plt.show()

## Live GPU Dashboard (Jupyter)

Display live GPU utilization and inference metrics in Jupyter.

In [None]:
from IPython.display import clear_output

# Live monitoring loop (runs for 10 iterations)
handle = start_sampler(interval_ms=1000)

try:
    for iteration in range(10):
        # Run inference
        t0 = time.perf_counter()
        resp = client.chat.completions.create(
            messages=[{"role": "user", "content": f"Fact #{iteration+1} about deep learning."}],
            max_tokens=32, temperature=0.7,
        )
        latency = (time.perf_counter() - t0) * 1000

        tokens = resp.usage.completion_tokens
        # Same zero-latency guard as the latency test, so the rate never
        # divides by zero.
        tok_per_s = tokens / (latency / 1000) if latency > 0 else 0

        # Get latest GPU snapshot
        latest = handle.get_latest()

        # wait=True defers clearing until new output arrives, avoiding flicker.
        clear_output(wait=True)
        print(f"=== Live Dashboard (iteration {iteration+1}/10) ===")
        print(f"\nInference: {latency:.0f} ms | {tokens} tokens | "
              f"{tok_per_s:.1f} tok/s")

        # NOTE(review): `latest` only gates the display; the values shown come
        # from a fresh snapshot() call rather than from the sampler — confirm
        # this is intended.
        if latest:
            for s in snapshot():
                # 20-cell bar, one cell per 5 % utilization. Cast to int and
                # clamp to 0..20 so a float or out-of-range reading cannot
                # break the string repetition or overflow the bar.
                bar_len = max(0, min(20, int(s.utilization_pct) // 5))
                bar = "█" * bar_len + "░" * (20 - bar_len)
                print(f"\nGPU {s.gpu_id}: [{bar}] {s.utilization_pct}%")
                print(f"  Memory: {s.mem_used_mb}/{s.mem_total_mb} MB")
                print(f"  Temp: {s.temp_c}°C | Power: {s.power_w:.0f} W")
finally:
    # Always stop the sampler, even if a request or the rendering fails.
    handle.stop()

print("\n--- Monitoring complete ---")

## Summary — Monitoring Checklist

- [x] **GPU monitoring**: `start_sampler()` for continuous background metrics
- [x] **Latency tracking**: Per-request timing with traced spans
- [x] **Throughput analysis**: Tokens/sec across different output lengths
- [x] **Performance profiling**: Histogram and time-series visualization
- [x] **Live dashboard**: Real-time Jupyter display with `clear_output()`

In [None]:
# Tear down in the reverse order of setup: first stop the inference server
# that mgr.start_server() launched, then close out telemetry.
mgr.stop_server()
# NOTE(review): presumably flush() exports any buffered telemetry before
# shutdown() releases the telemetry resources — confirm against the
# llamatelemetry documentation.
llamatelemetry.flush()
llamatelemetry.shutdown()
print("Done.")