# End-to-End Production Workflow

**Duration:** ~45 min | **Platform:** Kaggle dual Tesla T4

This notebook demonstrates a **complete production workflow**: model selection,
server deployment, inference pipelines, multi-turn chat, batch processing,
GPU monitoring, and observability export.

### What you'll learn
1. Model selection and registry
2. Optimal server deployment
3. Traced inference pipelines
4. Multi-turn chat with ChatEngine
5. Batch processing with session tracking
6. GPU monitoring and observability
7. Graceful shutdown

In [None]:
!pip install -q git+https://github.com/llamatelemetry/llamatelemetry.git@v1.2.0

import llamatelemetry

# Initialize the SDK once for this notebook run. The two OTLP arguments are
# optional and left commented out here; uncomment them and fill in the token
# to export to Grafana Cloud.
llamatelemetry.init(
    service_name="production-workflow",
    environment="kaggle",
    # otlp_endpoint="https://otlp-gateway-prod-us-central-0.grafana.net/otlp",
    # otlp_headers={"Authorization": "Basic <token>"},
)
# Printing the SDK version doubles as a check that init() returned normally.
print(f"llamatelemetry {llamatelemetry.version()} initialized")

## Step 1 — Model Selection

Choose the best model for your task and hardware.

In [None]:
from huggingface_hub import hf_hub_download
from llamatelemetry.llama import parse_gguf_header, get_model_summary

@llamatelemetry.task(name="select-model")
def select_model():
    """Fetch the Gemma 3 1B instruct GGUF file and print its header details.

    Requests the Q4_K_M file from the Hugging Face Hub (using the cache
    directory given below), prints the model summary plus the architecture,
    size and context length read from the GGUF header, and returns the local
    path of the file.
    """
    gguf_path = hf_hub_download(
        repo_id="bartowski/google_gemma-3-1b-it-GGUF",
        filename="google_gemma-3-1b-it-Q4_K_M.gguf",
        cache_dir="/root/.cache/huggingface",
    )

    header = parse_gguf_header(gguf_path)
    print(f"Selected model: {get_model_summary(gguf_path)}")

    # Architecture and context length both live on the header's metadata.
    meta = header.metadata
    print(f"  Architecture: {meta.architecture}")
    print(f"  Size: {header.size_mb:.0f} MB")
    print(f"  Context: {meta.context_length}")
    return gguf_path

model_path = select_model()

## Step 2 — Server Deployment

Deploy with optimal settings for dual T4.

In [None]:
from llamatelemetry.llama import ServerManager, LlamaCppClient

@llamatelemetry.task(name="deploy-server")
def deploy_server(model_path):
    """Start llama-server for ``model_path`` and return a client connected to it.

    The server is started with ``gpu_layers=99``, an even ``0.5,0.5`` tensor
    split, a 2048-token context, batch size 512 and two parallel slots, then
    waited on for up to 60 seconds before a health check is made.

    Args:
        model_path: Local path of the GGUF model file to serve.

    Returns:
        A ``(mgr, client)`` tuple: the ``ServerManager`` used to start (and
        later stop) the server, and a ``LlamaCppClient`` for sending requests.

    Raises:
        Any exception raised while waiting for readiness or checking health.
        In that case the server is stopped before the exception propagates.
    """
    mgr = ServerManager()
    mgr.start_server(
        model_path=model_path,
        gpu_layers=99,
        tensor_split="0.5,0.5",
        ctx_size=2048,
        batch_size=512,
        n_parallel=2,
    )

    try:
        mgr.wait_until_ready(timeout=60)

        # NOTE(review): this URL assumes the server listens on port 8090, but
        # no port is passed to start_server() — confirm 8090 is its default.
        client = LlamaCppClient(base_url="http://127.0.0.1:8090")
        health = client.health()
    except BaseException:
        # On this path the caller never receives `mgr`, so nobody else could
        # stop the server; stop it here (also on Ctrl-C / kernel interrupt)
        # rather than leave it running and holding GPU memory.
        mgr.stop_server()
        raise

    print(f"Server deployed: {health.status} ({health.slots_idle} idle slots)")
    return mgr, client

mgr, client = deploy_server(model_path)

## Step 3 — Inference Pipeline

Build a traced inference pipeline with three spans: prepare request → LLM completion → process response.

In [None]:
import time

@llamatelemetry.workflow(name="inference-pipeline")
def run_inference(client, prompt, max_tokens=128):
    """Run one traced chat completion and return its text with timing stats.

    Args:
        client: Object exposing ``chat.completions.create``.
        prompt: Text sent as the single user message.
        max_tokens: Token limit passed through to the completion call.

    Returns:
        A dict with keys ``"text"`` (the first choice's message content),
        ``"tokens"`` (completion token count reported in the response usage),
        ``"latency_ms"`` (wall-clock time of the completion call) and
        ``"tokens_per_sec"`` (tokens divided by that latency; ``0.0`` if the
        measured latency is zero).
    """
    with llamatelemetry.span("prepare-request"):
        messages = [{"role": "user", "content": prompt}]

    # The span object itself is not used, so it is not bound to a name.
    with llamatelemetry.span("llm-completion"):
        t0 = time.perf_counter()
        resp = client.chat.completions.create(
            messages=messages, max_tokens=max_tokens, temperature=0.7,
        )
        latency_ms = (time.perf_counter() - t0) * 1000

    with llamatelemetry.span("process-response"):
        completion_tokens = resp.usage.completion_tokens
        # Guard the division: a zero reading (coarse timer, stubbed client)
        # would otherwise raise ZeroDivisionError instead of returning stats.
        if latency_ms > 0:
            tokens_per_sec = completion_tokens / (latency_ms / 1000)
        else:
            tokens_per_sec = 0.0
        result = {
            "text": resp.choices[0].message.content,
            "tokens": completion_tokens,
            "latency_ms": latency_ms,
            "tokens_per_sec": tokens_per_sec,
        }

    return result

# Send one question through the pipeline and show the stats it reports.
question = "What are the key components of a transformer architecture?"
result = run_inference(client, question)
stats = f"{result['tokens']} tokens, {result['latency_ms']:.0f} ms, {result['tokens_per_sec']:.1f} tok/s"
print(f"Response ({stats}):")
print(result["text"])

## Step 4 — Multi-Turn Chat

`ChatEngine` manages conversation history and context automatically.

In [None]:
from llamatelemetry.chat import ChatEngine

# One engine instance carries the whole conversation below.
system_prompt = "You are a helpful AI assistant specializing in machine learning."
chat = ChatEngine(
    engine=client,
    system_prompt=system_prompt,
    max_history=10,
    max_tokens=128,
    temperature=0.7,
)

# Three user turns; the later ones refer back to the earlier ones.
turns = [
    "What is gradient descent?",
    "How does the learning rate affect it?",
    "What's a good default learning rate?",
]

for turn in turns:
    # Record the user message, request a reply, then record that reply.
    chat.add_user_message(turn)
    response = chat.complete()
    chat.add_assistant_message(response)
    print(f"User: {turn}\nAssistant: {response}\n")

history = chat.get_history()
print(f"Chat history: {len(history)} messages")

## Step 5 — Batch Processing

Process multiple prompts with session-level tracking.

In [None]:
@llamatelemetry.workflow(name="batch-inference")
def batch_process(client, prompts, *, session_id="batch-job-001", max_tokens=64):
    """Run each prompt as its own chat completion inside one telemetry session.

    Args:
        client: Object exposing ``chat.completions.create``.
        prompts: Iterable of prompt strings, processed sequentially in order.
        session_id: Identifier passed to ``llamatelemetry.session``. Defaults
            to the original fixed id; pass a distinct value per batch so that
            separate runs are not recorded under the same session.
        max_tokens: Token limit passed to every completion call.

    Returns:
        A list with one dict per prompt, in input order, holding ``"prompt"``,
        ``"response"`` (the first choice's message content) and ``"tokens"``
        (completion token count reported in the response usage).
    """
    results = []
    with llamatelemetry.session(session_id):
        for i, prompt in enumerate(prompts):
            # One span per request, tagged with the prompt's position.
            with llamatelemetry.span(f"request-{i}", prompt_index=i):
                resp = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens, temperature=0.7,
                )
                results.append({
                    "prompt": prompt,
                    "response": resp.choices[0].message.content,
                    "tokens": resp.usage.completion_tokens,
                })
    return results

batch_prompts = [
    "Define overfitting in one sentence.",
    "What is regularization?",
    "Explain dropout in neural networks.",
    "What is batch normalization?",
    "Define the bias-variance tradeoff.",
]

results = batch_process(client, batch_prompts)
for r in results:
    # A response may be missing/None; treat that as an empty answer rather
    # than failing on the slice below.
    answer = r["response"] or ""
    # Show at most 100 characters, and add the ellipsis only when something
    # was actually cut off.
    preview = answer if len(answer) <= 100 else f"{answer[:100]}..."
    print(f"Q: {r['prompt']}")
    print(f"A: {preview} ({r['tokens']} tokens)\n")

## Step 6 — GPU Monitoring

Continuous GPU monitoring with a background sampler.

In [None]:
from llamatelemetry.gpu import start_sampler, snapshot

# Start background monitoring (one sample requested every 500 ms)
handle = start_sampler(interval_ms=500)

# Run a workload. The sampler is stopped in `finally` so that a failed
# request does not leave it running in the background.
try:
    for _ in range(5):
        client.chat.completions.create(
            messages=[{"role": "user", "content": "Write a detailed explanation of backpropagation."}],
            max_tokens=128,
        )
finally:
    handle.stop()

samples = handle.get_snapshots()

# Analyze: report the min-max memory use and utilization seen on each GPU.
if samples:
    print(f"Collected {len(samples)} GPU samples")

    # Group in one pass by the GPU ids that actually appear in the samples,
    # instead of assuming exactly GPUs 0 and 1.
    samples_by_gpu = {}
    for s in samples:
        samples_by_gpu.setdefault(s.gpu_id, []).append(s)

    for gpu_id in sorted(samples_by_gpu):
        gpu_samples = samples_by_gpu[gpu_id]
        mem_values = [s.mem_used_mb for s in gpu_samples]
        util_values = [s.utilization_pct for s in gpu_samples]
        print(f"  GPU {gpu_id}: mem {min(mem_values)}-{max(mem_values)} MB, util {min(util_values)}-{max(util_values)}%")
else:
    # Make the empty case visible instead of printing nothing at all.
    print("No GPU samples collected")

## Step 7 — Observability Export

Export traces and metrics to Grafana Cloud (or any OTLP-compatible backend).

In [None]:
from llamatelemetry.kaggle import auto_configure_grafana_cloud

# Try to set up Grafana Cloud export from the Kaggle secrets
# GRAFANA_CLOUD_ORG_ID and GRAFANA_CLOUD_API_TOKEN. This is best-effort:
# any error is reported as a message and the notebook carries on.
try:
    configured = auto_configure_grafana_cloud()
    status = (
        "Grafana Cloud configured — traces will be exported"
        if configured
        else "Grafana Cloud not configured — set Kaggle secrets to enable"
    )
except Exception as exc:
    status = f"Grafana Cloud setup: {exc}"
print(status)

# Flush pending telemetry, passing a 5-second timeout
llamatelemetry.flush(timeout_s=5.0)
print("Telemetry flushed")

## Step 8 — Graceful Shutdown

Always shut down cleanly to flush telemetry and release GPU resources.

In [None]:
# Final status report, followed by the shutdown sequence. The report is only
# diagnostic, so the shutdown steps sit in `finally`: if the health check or
# the GPU snapshot fails, the server is still stopped and telemetry is still
# flushed before the error surfaces.
try:
    health = client.health()
    print(f"Server status: {health.status}")

    mem = snapshot()
    for s in mem:
        print(f"GPU {s.gpu_id}: {s.mem_used_mb}/{s.mem_total_mb} MB")
finally:
    # Shutdown sequence
    try:
        mgr.stop_server()          # Stop llama-server
    finally:
        # Runs even if stopping the server raised.
        llamatelemetry.flush()     # Flush remaining telemetry
        llamatelemetry.shutdown()  # Release SDK resources

print("\nProduction workflow complete — all resources released.")

## Production Checklist

- [x] Model selected and validated with `parse_gguf_header()`
- [x] Server deployed with optimal tensor-split
- [x] Inference pipeline traced with `@workflow` / `@task`
- [x] Multi-turn chat with `ChatEngine`
- [x] Batch processing with `session()` tracking
- [x] GPU monitoring with `start_sampler()`
- [x] Observability export to Grafana Cloud
- [x] Graceful shutdown with `flush()` + `shutdown()`

## Architecture Diagram

```
┌─────────────┐     ┌──────────────────┐     ┌─────────────────┐
│  Notebook   │────▶│  llamatelemetry  │────▶│  Grafana Cloud  │
│  (Client)   │     │  SDK             │     │ (Traces/Metrics)│
└─────────────┘     └──────────────────┘     └─────────────────┘
       │                     │
       │              ┌──────┴──────┐
       │              │             │
       ▼              ▼             ▼
┌─────────────┐ ┌──────────┐ ┌──────────┐
│ llama-server│ │  GPU 0   │ │  GPU 1   │
│ (port 8090) │ │ (T4 15GB)│ │ (T4 15GB)│
└─────────────┘ └──────────┘ └──────────┘
```