In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [22]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [1]:
# ==============================================================================
# Step 1: Verify Dual GPU Environment
# ==============================================================================

import subprocess
import os

print("="*70)
print("üîç SPLIT-GPU ENVIRONMENT CHECK")
print("="*70)

result = subprocess.run(
    ["nvidia-smi", "--query-gpu=index,name,memory.total,memory.free", "--format=csv,noheader"],
    capture_output=True, text=True
)

gpus = result.stdout.strip().split('\n')
print(f"\nüìä Detected {len(gpus)} GPU(s):")
for gpu in gpus:
    print(f"   {gpu}")

if len(gpus) >= 2:
    print("\n‚úÖ Dual T4 ready for split-GPU operation!")
    print("   GPU 0 ‚Üí llama-server (GGUF model inference)")
    print("   GPU 1 ‚Üí RAPIDS/Graphistry (architecture visualization)")
else:
    print("\n‚ö†Ô∏è Need 2 GPUs for split operation")


üîç SPLIT-GPU ENVIRONMENT CHECK

üìä Detected 2 GPU(s):
   0, Tesla T4, 15360 MiB, 14913 MiB
   1, Tesla T4, 15360 MiB, 14913 MiB

‚úÖ Dual T4 ready for split-GPU operation!
   GPU 0 ‚Üí llama-server (GGUF model inference)
   GPU 1 ‚Üí RAPIDS/Graphistry (architecture visualization)


In [2]:
# ==============================================================================
# Step 2: Install llamatelemetry v0.1.0
# ==============================================================================
print("üì¶ Installing dependencies...")

# Install llamatelemetry v0.1.0
!pip install -q https://github.com/llamatelemetry/llamatelemetry/releases/download/v0.1.0/llamatelemetry-v0.1.0-source.tar.gz
#!pip install -q --no-cache-dir git+https://github.com/llamatelemetry/llamatelemetry.git@v0.1.0

# Install cuGraph for GPU-accelerated graph algorithms
!pip install -q --extra-index-url=https://pypi.nvidia.com "cugraph-cu12==25.6.*" "cudf-cu12==25.6.*"

# Install Graphistry for visualization
!pip install -q "graphistry[ai]"

# Install additional utilities
!pip install -q pyarrow pandas numpy scipy huggingface_hub

# Verify installations
import llamatelemetry
print(f"\n‚úÖ llamatelemetry {llamatelemetry.__version__} installed")

try:
    import cudf, cugraph
    print(f"‚úÖ cuDF {cudf.__version__}")
    print(f"‚úÖ cuGraph {cugraph.__version__}")
except ImportError as e:
    print(f"‚ö†Ô∏è RAPIDS: {e}")

try:
    import graphistry
    print(f"‚úÖ Graphistry {graphistry.__version__}")
except ImportError as e:
    print(f"‚ö†Ô∏è Graphistry: {e}")

üì¶ Installing dependencies...
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m763.5/763.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for llamatelemetry (pyproject.toml) ... [?25l[?25hdone
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m42.1/42.1 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take




üéØ llamatelemetry v0.1.0 First-Time Setup - Kaggle 2√ó T4 Multi-GPU

üéÆ GPU Detected: Tesla T4 (Compute 7.5)
  ‚úÖ Tesla T4 detected - Perfect for llamatelemetry v0.1.0!
üåê Platform: Colab

üì¶ Downloading Kaggle 2√ó T4 binaries (~961 MB)...
    Features: FlashAttention + Tensor Cores + Multi-GPU tensor-split

‚û°Ô∏è  Attempt 1: HuggingFace (llamatelemetry-v0.1.0-cuda12-kaggle-t4x2.tar.gz)
üì• Downloading v0.1.0 from HuggingFace Hub...
   Repo: waqasm86/llamatelemetry-binaries
   File: v0.1.0/llamatelemetry-v0.1.0-cuda12-kaggle-t4x2.tar.gz


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


v0.1.0/llamatelemetry-v0.1.0-cuda12-kagg(‚Ä¶):   0%|          | 0.00/1.40G [00:00<?, ?B/s]

üîê Verifying SHA256 checksum...
   ‚úÖ Checksum verified
üì¶ Extracting llamatelemetry-v0.1.0-cuda12-kaggle-t4x2.tar.gz...
Found 21 files in archive
Extracted 21 files to /root/.cache/llamatelemetry/extract_0.1.0
‚úÖ Extraction complete!
  Found bin/ and lib/ under /root/.cache/llamatelemetry/extract_0.1.0/llamatelemetry-v0.1.0-cuda12-kaggle-t4x2
  Copied 13 binaries to /usr/local/lib/python3.12/dist-packages/llamatelemetry/binaries/cuda12
  Copied 2 libraries to /usr/local/lib/python3.12/dist-packages/llamatelemetry/lib
‚úÖ Binaries installed successfully!


‚úÖ llamatelemetry 0.1.0 installed
‚úÖ cuDF 25.06.00
‚úÖ cuGraph 25.06.00
‚úÖ Graphistry 0.50.6


In [3]:
import matplotlib, numpy, pandas, requests, pyarrow

print("requests:", requests.__version__)
print("numpy:", numpy.__version__)
print("pandas:", pandas.__version__)
print("matplotplib:", matplotlib.__version__)
print("pyarrow:", pyarrow.__version__)

requests: 2.32.5
numpy: 2.0.2
pandas: 2.2.2
matplotplib: 3.10.0
pyarrow: 19.0.1


In [4]:
!pip install -q --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m66.8/66.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m405.7/405.7 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m310.8/310.8 kB[0m [31m299.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# First, downgrade to compatible versions
!pip install -q \
  opentelemetry-api==1.37.0 \
  opentelemetry-sdk==1.37.0 \
  opentelemetry-proto==1.37.0 \
  opentelemetry-exporter-otlp-proto-common==1.37.0 \
  opentelemetry-exporter-otlp-proto-grpc==1.37.0 \
  rich==13.9.4 \
  --upgrade-strategy=only-if-needed

# Also install the missing bigquery storage package
!pip install -q google-cloud-bigquery-storage==2.31.0 --upgrade-strategy=only-if-needed


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m242.4/242.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
gradio 5.49.1 requires pydantic<2.12,>=2.0, but you have pydantic 2.12.5 which is incompatible.
fastai 2.8.4 requires fastcore<1.9,>=1.8.0, but you have fastcore 1.11.3 which is incompatible.[0m[31m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m256.5/256.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [7]:
# ==============================================================================
# Step 3: Setup Secrets (Kaggle Secrets)
# ==============================================================================
import os
from kaggle_secrets import UserSecretsClient

secrets = UserSecretsClient()

# Grafana OTLP
GRAFANA_OTLP_ENDPOINT = secrets.get_secret("GRAFANA_OTLP_ENDPOINT").rstrip("/")
GRAFANA_OTLP_BASIC_B64 = secrets.get_secret("GRAFANA_OTLP_BASIC_B64")
GRAFANA_OTLP_INSTANCE_ID = secrets.get_secret("GRAFANA_OTLP_INSTANCE_ID")
GRAFANA_OTLP_TOKEN = secrets.get_secret("GRAFANA_OTLP_TOKEN")

# HuggingFace
HF_TOKEN = secrets.get_secret("HF_TOKEN")

# Graphistry
#GRAPHISTRY_USERNAME = secrets.get_secret("Graphistry_Username")
GRAPHISTRY_PERSONAL_KEY_ID = secrets.get_secret("Graphistry_Personal_Key_ID")
GRAPHISTRY_PERSONAL_KEY_SECRET = secrets.get_secret("Graphistry_Personal_Secret_Key")

# Export OTel env vars for SDK auto-config (explicit v1 paths)
os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf"
os.environ["OTEL_EXPORTER_OTLP_LOGS_ENDPOINT"] = f"{GRAFANA_OTLP_ENDPOINT}/v1/logs"
os.environ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = f"{GRAFANA_OTLP_ENDPOINT}/v1/traces"
os.environ["OTEL_EXPORTER_OTLP_METRICS_ENDPOINT"] = f"{GRAFANA_OTLP_ENDPOINT}/v1/metrics"
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic%20{GRAFANA_OTLP_BASIC_B64}"
os.environ["OTEL_TRACES_EXPORTER"] = "otlp"
os.environ["OTEL_METRICS_EXPORTER"] = "otlp"

# Login/register
from huggingface_hub import login
import graphistry

login(HF_TOKEN)

graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    personal_key_id=GRAPHISTRY_PERSONAL_KEY_ID,
    personal_key_secret=GRAPHISTRY_PERSONAL_KEY_SECRET,
)


<graphistry.pygraphistry.GraphistryClient at 0x7d2390ec22a0>

In [8]:
### **Step 4: OpenTelemetry Setup (Grafana OTLP, Silent)**

import logging
from opentelemetry import trace, metrics
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

# Hard-silence OTel logs
logging.getLogger().setLevel(logging.CRITICAL)
logging.getLogger("opentelemetry").setLevel(logging.CRITICAL)
logging.getLogger("opentelemetry").propagate = False

# Shut down any previous providers to stop old exporters
try:
    trace.get_tracer_provider().shutdown()
except Exception:
    pass
try:
    metrics.get_meter_provider().shutdown()
except Exception:
    pass

# Normalize endpoint
GRAFANA_OTLP_ENDPOINT = GRAFANA_OTLP_ENDPOINT.rstrip("/")

# Define service resource with GPU context
resource = Resource.create({
    "service.name": "llamatelemetry-inference",
    "service.version": "0.1.0",
    "service.instance.id": "kaggle-t4-worker-1",
    "deployment.environment": "kaggle-notebook",
    "host.name": "kaggle-gpu-0",
    "gpu.model": "Tesla T4",
    "gpu.memory.total": 15360,  # MB
    "gpu.compute_capability": "7.5",
})

# Create tracer provider with resource (NO console exporter)
tracer_provider = TracerProvider(resource=resource)

# OTLP exporter to Grafana (explicit traces endpoint)
span_exporter = OTLPSpanExporter(
    endpoint=f"{GRAFANA_OTLP_ENDPOINT}/v1/traces",
    headers={
        "Authorization": f"Basic {GRAFANA_OTLP_BASIC_B64}",
        "Content-Type": "application/x-protobuf",
    },
)
tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))

trace.set_tracer_provider(tracer_provider)
tracer = trace.get_tracer(__name__)

# Grafana sanity check (silent)
with tracer.start_as_current_span("grafana.sanity") as span:
    span.set_attribute("check", "ok")


In [10]:
# **Setup MeterProvider (Grafana OTLP, Silent)**

from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter

# OTLP metric exporter to Grafana (silent)
metric_exporter = OTLPMetricExporter(
    endpoint=f"{GRAFANA_OTLP_ENDPOINT}/v1/metrics",
    headers={"Authorization": f"Basic {GRAFANA_OTLP_BASIC_B64}"},
)
metric_reader = PeriodicExportingMetricReader(
    metric_exporter,
    export_interval_millis=5000,
)

meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
metrics.set_meter_provider(meter_provider)
meter = metrics.get_meter(__name__)

# Create custom instruments
request_counter = meter.create_counter(
    name="llm.requests.total",
    description="Total number of LLM requests",
    unit="1",
)

latency_histogram = meter.create_histogram(
    name="llm.request.duration",
    description="LLM request latency",
    unit="ms",
)

token_histogram = meter.create_histogram(
    name="llm.tokens.total",
    description="Token usage per request",
    unit="{token}",
)


In [11]:
### **Step 5: Simplified Model Loading**

from huggingface_hub import hf_hub_download
from llamatelemetry.server import ServerManager
import os

# Create models directory
os.makedirs("/kaggle/working/models", exist_ok=True)

# Download from a confirmed working repository
print("Downloading model...")
model_path = hf_hub_download(
    repo_id="bartowski/Qwen2.5-3B-Instruct-GGUF",
    filename="Qwen2.5-3B-Instruct-Q4_K_M.gguf",
    local_dir="/kaggle/working/models",
)
print(f"‚úì Model downloaded: {model_path}")

# Check GPUs
import torch
print(f"\nFound {torch.cuda.device_count()} GPUs:")
for i in range(torch.cuda.device_count()):
    print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")

# Start server
server = ServerManager(server_url="http://127.0.0.1:8090")

# Minimal working configuration
server.start_server(
    model_path=model_path,
    gpu_layers=99,
    tensor_split="1.0",  # Use only first GPU
    port=8090,
    host="127.0.0.1",
    ctx_size=4096,
    batch_size=512,
)

print("\n‚úì Server running on http://127.0.0.1:8090")
print("‚úì GPU 0: Used for LLM")
print("‚úì GPU 1: Free for Graphistry")

Downloading model...


Qwen2.5-3B-Instruct-Q4_K_M.gguf:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

‚úì Model downloaded: /kaggle/working/models/Qwen2.5-3B-Instruct-Q4_K_M.gguf

Found 2 GPUs:
  GPU 0: Tesla T4
  GPU 1: Tesla T4
GPU Check:
  Platform: kaggle
  GPU: Tesla T4
  Compute Capability: 7.5
  Status: ‚úì Compatible
Starting llama-server...
  Executable: /usr/local/lib/python3.12/dist-packages/llamatelemetry/binaries/cuda12/llama-server
  Model: Qwen2.5-3B-Instruct-Q4_K_M.gguf
  GPU Layers: 99
  Context Size: 4096
  Server URL: http://127.0.0.1:8090
Waiting for server to be ready...... ‚úì Ready in 3.0s

‚úì Server running on http://127.0.0.1:8090
‚úì GPU 0: Used for LLM
‚úì GPU 1: Free for Graphistry


In [12]:
### **Step 6: Instrumented Inference (Silent)**

from llamatelemetry.api import LlamaCppClient
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
import time

class InstrumentedLLMClient:
    """LLM client with OpenTelemetry instrumentation"""

    def __init__(self, base_url: str, tracer, meter):
        self.client = LlamaCppClient(base_url)
        self.tracer = tracer
        self.request_counter = request_counter
        self.latency_histogram = latency_histogram
        self.token_histogram = token_histogram

    def chat_completion(self, messages: list, **kwargs):
        model = kwargs.get("model", "unknown")
        max_tokens = kwargs.get("max_tokens", 100)
        temperature = kwargs.get("temperature", 0.7)

        with self.tracer.start_as_current_span(
            name=f"llm.chat.{model}",
            kind=trace.SpanKind.CLIENT,
        ) as span:
            try:
                span.set_attribute("llm.system", "llama.cpp")
                span.set_attribute("llm.model", model)
                span.set_attribute("llm.request.max_tokens", max_tokens)
                span.set_attribute("llm.request.temperature", temperature)
                span.set_attribute("llm.request.messages", len(messages))

                start_time = time.time()
                response = self.client.chat.completions.create(
                    messages=messages,
                    **kwargs
                )
                latency_ms = (time.time() - start_time) * 1000

                finish_reason = response.choices[0].finish_reason
                content = response.choices[0].message.content

                span.set_attribute("llm.response.finish_reason", finish_reason)
                span.set_attribute("llm.response.length", len(content))

                self.request_counter.add(
                    1,
                    attributes={
                        "model": model,
                        "finish_reason": finish_reason,
                        "status": "success",
                    }
                )
                self.latency_histogram.record(
                    latency_ms,
                    attributes={"model": model, "status": "success"}
                )

                if hasattr(response, 'usage'):
                    input_tokens = getattr(response.usage, 'prompt_tokens', 0)
                    output_tokens = getattr(response.usage, 'completion_tokens', 0)

                    span.set_attribute("llm.usage.input_tokens", input_tokens)
                    span.set_attribute("llm.usage.output_tokens", output_tokens)

                    self.token_histogram.record(
                        input_tokens,
                        attributes={"model": model, "token_type": "input"}
                    )
                    self.token_histogram.record(
                        output_tokens,
                        attributes={"model": model, "token_type": "output"}
                    )

                span.set_status(Status(StatusCode.OK))
                return response

            except Exception as e:
                span.set_status(Status(StatusCode.ERROR, str(e)))
                span.record_exception(e)
                self.request_counter.add(
                    1,
                    attributes={
                        "model": model,
                        "status": "error",
                        "error_type": type(e).__name__,
                    }
                )
                raise

# Initialize instrumented client
llm = InstrumentedLLMClient("http://127.0.0.1:8090", tracer, meter)


In [20]:
#** Step 7: Generate Sample Requests**

# Collect telemetry data for visualization
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry import trace

memory_exporter = InMemorySpanExporter()
tracer_provider.add_span_processor(SimpleSpanProcessor(memory_exporter))

import random
import time

# Wrap batches in a parent span so child spans have parents
with tracer.start_as_current_span("llm.batch.requests"):
    response = llm.chat_completion(
        messages=[{"role": "user", "content": "What is CUDA?"}],
        max_tokens=100,
        temperature=0.7,
    )
    print(f"Response: {response.choices[0].message.content}")

    prompts = [
        "Explain transformer architecture",
        "What is quantization in LLMs?",
        "How does FlashAttention work?",
        "Describe the attention mechanism",
        "What is GGUF format?",
    ]

    responses = []
    for i, prompt in enumerate(prompts):
        print(f"Request {i+1}/{len(prompts)}: {prompt[:50]}...")
        resp = llm.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=random.randint(50, 150),
            temperature=random.uniform(0.5, 0.9),
        )
        responses.append(resp)
        time.sleep(0.5)

    print(f"Completed {len(responses)} requests")

with tracer.start_as_current_span("llm.batch.test"):
    for i in range(10):
        llm.chat_completion(
            messages=[{"role": "user", "content": f"Test request {i}"}],
            max_tokens=50,
        )

try:
    tracer_provider.force_flush()
except Exception:
    pass

finished_spans = memory_exporter.get_finished_spans()
print(f"Captured {len(finished_spans)} spans")


Response: CUDA (Compute Unified Device Architecture) is a parallel computing platform and application programming interface (API) model created by NVIDIA. It allows developers to use a GPU for general-purpose processing in addition to its traditional role of accelerating graphics processing.

Key points about CUDA:

1. Purpose: CUDA enables the execution of thousands or millions of threads in parallel, which can be significantly faster than CPU-based approaches when handling tasks that are well-suited to such an architecture.

2. Target Hardware: The main target hardware for
Request 1/5: Explain transformer architecture...
Request 2/5: What is quantization in LLMs?...
Request 3/5: How does FlashAttention work?...
Request 4/5: Describe the attention mechanism...
Request 5/5: What is GGUF format?...
Completed 5 requests
Captured 18 spans


In [32]:
# (intentionally left empty)


In [21]:
### Step 8: Transform Spans to Graph Data**

# Prefer RAPIDS device selection over CuPy
import os
import rmm
rmm.reinitialize(devices=[1])

import cudf

span_data = []
for span in finished_spans:
    span_data.append({
        "span_id": format(span.context.span_id, "016x"),
        "parent_span_id": format(span.parent.span_id, "016x") if span.parent else None,
        "trace_id": format(span.context.trace_id, "032x"),
        "name": span.name,
        "start_time": span.start_time,
        "end_time": span.end_time,
        "duration_ms": (span.end_time - span.start_time) / 1_000_000,  # nanoseconds to ms
        "status": span.status.status_code.name,
        "attributes": dict(span.attributes) if span.attributes else {},
    })

df_spans = cudf.DataFrame(span_data)
print(f"Span DataFrame shape: {df_spans.shape}")
print(df_spans.head())

# Create edges (parent-child relationships). If no parents, fallback to ordering per trace.
if len(df_spans) > 0:
    df_edges = df_spans[df_spans["parent_span_id"].notnull()][
        ["parent_span_id", "span_id", "trace_id"]
    ].rename(columns={
        "parent_span_id": "source",
        "span_id": "destination",
    })

    if len(df_edges) == 0:
        # Fallback: connect spans by start_time across all spans
        df_sorted = df_spans.sort_values(["start_time"])
        df_sorted["next_span_id"] = df_sorted["span_id"].shift(-1)
        df_edges = df_sorted[["span_id", "next_span_id", "trace_id"]].rename(columns={
            "span_id": "source",
            "next_span_id": "destination",
        })
        df_edges = df_edges[df_edges["destination"].notnull()]
else:
    df_edges = cudf.DataFrame(columns=["source", "destination", "trace_id"])

print(f"Edges DataFrame shape: {df_edges.shape}")


Span DataFrame shape: (18, 9)
            span_id    parent_span_id                          trace_id  \
0  42fa297f74cdc67f  b87f1a3c74e81ffd  045221a1646f2503df62cc6aa606b8d3   
1  d513e6752768e944  b87f1a3c74e81ffd  045221a1646f2503df62cc6aa606b8d3   
2  5d73c7963936917f  b87f1a3c74e81ffd  045221a1646f2503df62cc6aa606b8d3   
3  d4cbe780a9801fd6  b87f1a3c74e81ffd  045221a1646f2503df62cc6aa606b8d3   
4  aae43e0a41962af8  b87f1a3c74e81ffd  045221a1646f2503df62cc6aa606b8d3   

               name           start_time             end_time  duration_ms  \
0  llm.chat.unknown  1770541334689255810  1770541336263532014  1574.276204   
1  llm.chat.unknown  1770541336263772564  1770541337629603191  1365.830627   
2  llm.chat.unknown  1770541338130001282  1770541339643591060  1513.589778   
3  llm.chat.unknown  1770541340144027093  1770541341231449896  1087.422803   
4  llm.chat.unknown  1770541341731882169  1770541343596870025  1864.987856   

  status                                         a

In [23]:
# Step 9 First ensure you have actual edges
import os, json
import graphistry

if len(df_edges) > 0:
    g = graphistry.edges(df_edges, "source", "destination")
    g = g.nodes(df_spans, "span_id")
    g = g.bind(point_title="name", edge_title="trace_id")
    g = g.layout_settings(play=0)
    url = g.plot(render=False)
    print(f"Graph URL: {url}")

    os.makedirs("/kaggle/working", exist_ok=True)
    with open("/kaggle/working/graphistry_trace_url.json", "w", encoding="utf-8") as f:
        json.dump({"trace_graph_url": url}, f, indent=2)
else:
    print("No edges in df_edges! (No spans captured)")


Graph URL: https://hub.graphistry.com/graph/graph.html?dataset=ca8434889f9f4d1396df59d4fe798aca&type=arrow&viztoken=e85c7e34-1fee-40a6-8f24-7cea7873096d&usertag=28c8414b-pygraphistry-0.50.6&splashAfter=1770541497&info=true&play=0


In [24]:
# Step 10 Create Trace Graph Visualization**
import os, json
import graphistry

if len(df_edges) > 0:
    g = graphistry.edges(df_edges, "source", "destination")
    g = g.nodes(df_spans, "span_id")
    g = g.bind(
        point_title="name",
        point_size="duration_ms",
        point_color="status",
        edge_title="trace_id",
    )

    g = g.layout_settings(play=0)

    # Only encode if status exists
    if "status" in df_spans.columns:
        g = g.encode_point_color("status", categorical_mapping={
            "OK": "#4CAF50",
            "ERROR": "#F44336",
            "UNSET": "#9E9E9E",
        }, as_categorical=True)

    url = g.plot(render=False)
    print(f"üîó Trace Graph Dashboard: {url}")

    os.makedirs("/kaggle/working", exist_ok=True)
    with open("/kaggle/working/graphistry_trace_graph_url.json", "w", encoding="utf-8") as f:
        json.dump({"trace_graph_dashboard_url": url}, f, indent=2)
else:
    print("Skipping Graphistry plot: no edges available")


üîó Trace Graph Dashboard: https://hub.graphistry.com/graph/graph.html?dataset=7700674cf576440e854913a06997f59b&type=arrow&viztoken=5385d4a7-1d29-4e0e-a9af-0b12c0218c26&usertag=28c8414b-pygraphistry-0.50.6&splashAfter=1770541581&info=true&play=0


In [26]:
### **Step 11: Metrics Dashboards with Plotly (GPU 1) (5 min)**

import cudf
import rmm
import pandas as pd

# Ensure GPU 1 for RAPIDS operations
rmm.reinitialize(devices=[1])

metrics_data = []
for span in finished_spans:
    attrs = span.attributes or {}
    metrics_data.append({
        "timestamp": span.start_time,  # keep as ns int
        "duration_ms": (span.end_time - span.start_time) / 1_000_000,
        "model": attrs.get("llm.model", "unknown"),
        "input_tokens": attrs.get("llm.usage.input_tokens", 0),
        "output_tokens": attrs.get("llm.usage.output_tokens", 0),
        "total_tokens": attrs.get("llm.usage.input_tokens", 0) + attrs.get("llm.usage.output_tokens", 0),
        "status": span.status.status_code.name,
    })

df_metrics = cudf.DataFrame(metrics_data)
if len(df_metrics) == 0:
    print("No spans available for Plotly metrics")
else:
    df_metrics = df_metrics.sort_values("timestamp")
    print(f"Metrics DataFrame shape: {df_metrics.shape}")

    # Convert to pandas for Plotly rendering
    dfp = df_metrics.to_pandas()
    dfp["timestamp"] = pd.to_datetime(dfp["timestamp"], unit="ns")

    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            "Request Latency Distribution",
            "Token Usage Over Time",
            "Tokens per Request (Input vs Output)",
            "Request Rate Over Time"
        ),
        specs=[
            [{"type": "histogram"}, {"type": "scatter"}],
            [{"type": "scatter"}, {"type": "scatter"}],
        ]
    )

    fig.add_trace(
        go.Histogram(x=dfp["duration_ms"], nbinsx=20, name="Latency (ms)"),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x=dfp["timestamp"], y=dfp["total_tokens"], mode="lines+markers", name="Total Tokens"),
        row=1, col=2
    )

    fig.add_trace(
        go.Scatter(x=dfp["timestamp"], y=dfp["input_tokens"], mode="lines", name="Input Tokens"),
        row=2, col=1
    )
    fig.add_trace(
        go.Scatter(x=dfp["timestamp"], y=dfp["output_tokens"], mode="lines", name="Output Tokens"),
        row=2, col=1
    )

    fig.add_trace(
        go.Scatter(x=dfp["timestamp"], y=[1]*len(dfp), mode="lines+markers", name="Requests"),
        row=2, col=2
    )

    fig.update_layout(height=700, showlegend=True, title_text="LLM Metrics Dashboard")
    fig.show()


Metrics DataFrame shape: (18, 7)


In [20]:

Key Learnings

### **1. OpenTelemetry Integration**
#- ‚úÖ Full instrumentation with traces, metrics, and logs
#- ‚úÖ Semantic conventions for GenAI workloads
#- ‚úÖ Custom resource attributes for GPU context
#- ‚úÖ Flexible export to multiple backends

### **2. Trace Visualization**
#- ‚úÖ Parent-child span relationships as interactive graphs
#- ‚úÖ Request flow waterfall diagrams
#- ‚úÖ Error propagation visualization
#- ‚úÖ GPU-accelerated graph analytics with Graphistry

### **3. Metrics Monitoring**
#- ‚úÖ Request latency tracking
#- ‚úÖ Token usage analysis
#- ‚úÖ Throughput monitoring
#- ‚úÖ Real-time dashboards with Plotly

### **4. Production Patterns**
#- ‚úÖ Context propagation for distributed tracing
#- ‚úÖ Batch export for performance
#- ‚úÖ Error handling and exception recording
#- ‚úÖ Resource attribution for multi-GPU environments

---

## Next Steps

#- **Notebook 15:** Real-time performance monitoring with live metrics
#- **Notebook 16:** End-to-end production observability stack
#- Integrate with external collectors (Jaeger, Tempo, DataDog)
#- Add custom span processors for filtering/enrichment
#- Implement sampling strategies for high-volume workloads


#**üéØ Objectives Achieved:**
#‚úÖ CUDA Inference (GPU 0)
#‚úÖ LLM Observability (GPU 0)
#‚úÖ Graphistry + Plotly Visualizations (GPU 1)


SyntaxError: invalid syntax (3664757259.py, line 1)