# llamatelemetry v0.1.0 Quick Start (Kaggle Dual T4)

This notebook is intended to run **only on Kaggle**, with the accelerator set to GPU T4 x2.

- Platform: Kaggle dual Tesla T4
- Models: 1B-5B GGUF (Q4_K_M recommended)
- Distribution: GitHub Releases (no PyPI)


In [None]:
# Step 1: Install llamatelemetry v0.1.0
!pip install -q --no-cache-dir --force-reinstall git+https://github.com/llamatelemetry/llamatelemetry.git@v0.1.0


In [None]:
# Step 2: Verify GPUs
# Queries nvidia-smi for each GPU's name, total memory and compute capability
# and prints the result as CSV. This notebook targets Kaggle's dual Tesla T4
# setup, so two T4 rows are expected.
!nvidia-smi --query-gpu=name,memory.total,compute_cap --format=csv


In [None]:
# Step 3: Download a small GGUF model from the Hugging Face Hub
from huggingface_hub import hf_hub_download

# What to fetch and where to put it: the Q4_K_M GGUF file from the
# unsloth/gemma-3-1b-it-GGUF repo, stored under the Kaggle working directory.
download_spec = dict(
    repo_id='unsloth/gemma-3-1b-it-GGUF',
    filename='gemma-3-1b-it-Q4_K_M.gguf',
    local_dir='/kaggle/working/models',
)
model_path = hf_hub_download(**download_spec)

# Show the path returned by the download; later cells reuse `model_path`.
print(model_path)


In [None]:
# Step 4: Start llama-server on GPU 0 (split-GPU)
from llamatelemetry.server import ServerManager

server = ServerManager()

# Launch options gathered in one place so they are easy to tweak.
# NOTE(review): tensor_split='1.0,0.0' presumably places the whole model on
# the first GPU and leaves the second one free; confirm against ServerManager.
launch_options = {
    'model_path': model_path,
    'gpu_layers': 99,
    'tensor_split': '1.0,0.0',
    'flash_attn': 1,
}
server.start_server(**launch_options)


In [None]:
# Step 5: Run inference
import llamatelemetry

# Prompt sent to the model in this quick-start example.
question = 'What is AI?'

engine = llamatelemetry.InferenceEngine()
# NOTE(review): auto_start=False presumably tells the engine not to launch its
# own server because one was already started in Step 4; confirm against the
# InferenceEngine API.
engine.load_model(model_path, auto_start=False)

# Request a completion for the prompt with max_tokens=100 and print its text.
result = engine.infer(question, max_tokens=100)
generated_text = result.text
print(generated_text)


In [None]:
# Step 6: Cleanup
# Stop the llama-server started in Step 4, using the same ServerManager
# instance, then print a confirmation message.
server.stop_server()
print('Server stopped')
