## Step 1: Check GPU Environment

In [None]:
import subprocess

print("="*70)
print("üîç GPU ENVIRONMENT CHECK")
print("="*70)

# Check GPUs
!nvidia-smi --query-gpu=index,name,memory.total --format=csv

# CUDA version
print("\nüìä CUDA Version:")
!nvcc --version | grep release

print("\n‚úÖ Environment ready for Unsloth + llamatelemetry")

## Step 2: Install Unsloth and llamatelemetry

In [None]:
%%time
print("üì¶ Installing Unsloth and llamatelemetry...")

# Install Unsloth (fast installation)
!pip install -q unsloth

# Install llamatelemetry v0.1.0 (force fresh install to ensure correct binaries)
!pip install -q --no-cache-dir --force-reinstall git+https://github.com/llamatelemetry/llamatelemetry.git@v0.1.0

# Additional dependencies
!pip install -q datasets trl

# Verify installations
import llamatelemetry
print(f"\n‚úÖ llamatelemetry {llamatelemetry.__version__} installed")

try:
    from unsloth import FastLanguageModel
    print("‚úÖ Unsloth installed")
except ImportError as e:
    print(f"‚ö†Ô∏è Unsloth import issue: {e}")

## Step 3: Load Base Model with Unsloth

We'll use Gemma-3 1B as it's fast to fine-tune on T4.

In [None]:
%%time
from unsloth import FastLanguageModel
import torch

# Section banner for the model-loading step.
print("=" * 70, "üì• LOADING BASE MODEL WITH UNSLOTH", "=" * 70, sep="\n")

# Checkpoint to fine-tune and the sequence length passed to the loader.
model_name = "unsloth/gemma-3-1b-it"  # Small model for demo
max_seq_length = 2048

print(f"\nüì• Loading {model_name}...")

# Load the model and its tokenizer with 4-bit weights (used for training)
# and float16 as the requested dtype.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=torch.float16,
)

# Summarise the configuration that was just loaded.
print(
    "\n‚úÖ Model loaded!",
    f"   Model: {model_name}",
    f"   Max Sequence Length: {max_seq_length}",
    "   Precision: 4-bit",
    sep="\n",
)

## Step 4: Add LoRA Adapters

In [None]:
# Section banner for the LoRA step.
print("=" * 70, "üîß ADDING LORA ADAPTERS", "=" * 70, sep="\n")

# Wrap the base model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    r=16,                # LoRA rank
    lora_alpha=32,       # LoRA alpha
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Count parameters by summing element counts: first only those that require
# gradients, then all of them.
# NOTE(review): with 4-bit weights, numel() may count packed storage elements
# rather than logical weights, which would skew the percentage — confirm.
trainable = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)
total = sum(map(lambda p: p.numel(), model.parameters()))

print("\n‚úÖ LoRA adapters added!")
print(
    f"   Trainable params: {trainable:,} ({100*trainable/total:.2f}%)",
    f"   Total params: {total:,}",
    sep="\n",
)

## Step 5: Prepare Training Dataset

In [None]:
from datasets import load_dataset

# Section banner for the dataset step.
print("=" * 70, "üìä PREPARING TRAINING DATASET", "=" * 70, sep="\n")

# Keep the demo small: only the first 500 rows of the train split
# (records are in Alpaca format).
dataset = load_dataset("yahma/alpaca-cleaned", split="train[:500]")

# Report the row count, then show the first record as a sample.
print(
    f"\nüìä Dataset loaded: {len(dataset)} examples",
    "\nüìã Sample data:",
    sep="\n",
)
print(dataset[0])

# Format for training (Alpaca prompt format)
def format_alpaca(example):
    """Render one Alpaca-style record as a single training prompt.

    Args:
        example: Mapping that may contain the keys "instruction", "input"
            and "output"; any missing key is treated as an empty string.

    Returns:
        A dict with one key, "text", holding the prompt. The prompt always
        has an "### Instruction:" and a "### Response:" section; an
        "### Input:" section is placed between them only when the record's
        input is truthy. Sections are separated by one blank line.
    """
    instruction = example.get("instruction", "")
    input_text = example.get("input", "")
    output = example.get("output", "")

    # Build the prompt section by section; the optional input goes in the middle.
    sections = [f"### Instruction:\n{instruction}"]
    if input_text:
        sections.append(f"### Input:\n{input_text}")
    sections.append(f"### Response:\n{output}")

    return {"text": "\n\n".join(sections)}

# Run format_alpaca over every example of the dataset.
dataset = dataset.map(function=format_alpaca)
print("\n‚úÖ Dataset formatted for training")

## Step 6: Train with SFTTrainer

In [None]:
%%time
from trl import SFTTrainer, SFTConfig

# Section banner for the training step.
print("=" * 70, "üèãÔ∏è TRAINING MODEL", "=" * 70, sep="\n")

# Short demo run; raise max_steps for real training.
training_args = SFTConfig(
    output_dir="./unsloth_output",
    seed=42,
    # Batching
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule
    max_steps=30,  # Quick demo - use more for real training
    warmup_steps=5,
    learning_rate=2e-4,
    # Precision / optimizer
    fp16=True,
    optim="adamw_8bit",
    # Logging
    logging_steps=5,
)

# The trainer receives the LoRA-wrapped model, the formatted dataset and the
# tokenizer as its processing class.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
)

# Echo the key hyperparameters before the run starts.
print(
    "\nüèãÔ∏è Starting training...",
    f"   Batch size: {training_args.per_device_train_batch_size}",
    f"   Max steps: {training_args.max_steps}",
    f"   Learning rate: {training_args.learning_rate}",
    sep="\n",
)

trainer.train()

print("\n‚úÖ Training complete!")

## Step 7: Export to GGUF Format

This is the key step: converting the fine-tuned Unsloth model into a llama.cpp-compatible GGUF file.

In [None]:
import os

print("="*70)
print("üì¶ EXPORTING TO GGUF FORMAT")
print("="*70)

# Directory that receives the exported GGUF file(s); created if missing.
OUTPUT_DIR = "/kaggle/working/gguf_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Unsloth's built-in GGUF export
print("\nüì¶ Exporting to GGUF with Q4_K_M quantization...")

model.save_pretrained_gguf(
    OUTPUT_DIR,
    tokenizer,
    quantization_method="q4_k_m",  # K-quant for best quality/size
)

# Collect the exported files. os.listdir() returns names in arbitrary order,
# so sort them to make the report deterministic.
gguf_files = sorted(f for f in os.listdir(OUTPUT_DIR) if f.endswith('.gguf'))

if gguf_files:
    print(f"\n‚úÖ GGUF export complete!")
    print(f"   Output directory: {OUTPUT_DIR}")
    print(f"   Files: {gguf_files}")
    # Report the size of every exported file instead of an arbitrary one.
    for gguf_name in gguf_files:
        gguf_path = os.path.join(OUTPUT_DIR, gguf_name)
        size_mb = os.path.getsize(gguf_path) / (1024**2)
        print(f"   Size ({gguf_name}): {size_mb:.1f} MB")
else:
    # Do not claim success when the export left nothing to deploy.
    print(f"\n‚ùå No GGUF file found in {OUTPUT_DIR}")

## Step 8: Clear GPU Memory Before Inference

In [None]:
import gc
import torch

print("üßπ Clearing GPU memory...")

# Delete training objects
del model
del trainer
del tokenizer

# Clear CUDA cache
gc.collect()
torch.cuda.empty_cache()

print("\nüìä GPU Memory After Cleanup:")
!nvidia-smi --query-gpu=index,memory.used,memory.free --format=csv

print("\n‚úÖ GPU memory cleared for inference")

## Step 9: Deploy with llamatelemetry

In [None]:
from llamatelemetry.server import ServerManager
import os

print("="*70)
print("üöÄ DEPLOYING FINE-TUNED MODEL WITH LLAMATELEMETRY")
print("="*70)

# Find the GGUF file. A missing directory is treated like an empty one so the
# "not found" branch below is reached instead of a FileNotFoundError.
# os.listdir() order is arbitrary, hence the sort.
OUTPUT_DIR = "/kaggle/working/gguf_output"
gguf_files = sorted(
    f
    for f in (os.listdir(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else [])
    if f.endswith('.gguf')
)

if not gguf_files:
    print("‚ùå No GGUF file found!")
else:
    # The export directory may hold more than one .gguf file; prefer the
    # Q4_K_M quantization that the export step requested and only fall back
    # to the first file (alphabetically) when no name mentions it.
    q4_files = [f for f in gguf_files if "q4_k_m" in f.lower()]
    gguf_path = os.path.join(OUTPUT_DIR, (q4_files or gguf_files)[0])
    print(f"üì• Loading: {gguf_path}")

    # Start server
    server = ServerManager()
    print("\nüöÄ Starting llama-server...")
    server.start_server(
        model_path=gguf_path,
        host="127.0.0.1",
        port=8080,
        gpu_layers=99,
        ctx_size=2048,
        flash_attention=True,
    )

    # Wait up to 60 seconds for the server to report healthy.
    if server.check_server_health(timeout=60):
        print("\n‚úÖ Fine-tuned model deployed!")
        print(f"   API endpoint: http://127.0.0.1:8080")
    else:
        print("\n‚ùå Server failed to start")

## Step 10: Test Your Fine-Tuned Model

In [None]:
from llamatelemetry.api.client import LlamaCppClient

# Section banner for the completion test.
print("=" * 70, "üß™ TESTING FINE-TUNED MODEL", "=" * 70, sep="\n")

# Client pointed at the local server endpoint.
client = LlamaCppClient(base_url="http://127.0.0.1:8080")

# Wrap each instruction in the Alpaca layout used for training, leaving the
# response section open for the model to complete.
test_prompts = [
    f"### Instruction:\n{instruction}\n\n### Response:"
    for instruction in (
        "Explain what machine learning is.",
        "Write a short poem about coding.",
        "What are the benefits of GPU acceleration?",
    )
]

for i, prompt in enumerate(test_prompts, start=1):
    # Show the test number and the first 50 characters of the prompt.
    print(f"\nüîπ Test {i}:", f"   Prompt: {prompt[:50]}...", sep="\n")

    response = client.completion(
        prompt=prompt,
        max_tokens=100,
        temperature=0.7,
        stop=["###", "\n\n"],  # Stop at next section
    )

    # Print at most 200 characters of the stripped first choice.
    print(f"   Response: {response.choices[0].text.strip()[:200]}")

## Step 11: Compare with Chat API

In [None]:
# Section banner for the chat test.
print("=" * 70, "üí¨ CHAT COMPLETION TEST", "=" * 70, sep="\n")

# Same client as the completion test, this time through the chat interface
# with a system message followed by a user message.
response = client.chat.create(
    temperature=0.7,
    max_tokens=150,
    messages=[
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content="What did you learn during fine-tuning?"),
    ],
)

# Content of the first returned choice.
print("\nüí¨ Response:")
print(response.choices[0].message.content)

# Token counts reported by the response's usage object.
print("\nüìä Usage:")
print(
    f"   Prompt tokens: {response.usage.prompt_tokens}",
    f"   Completion tokens: {response.usage.completion_tokens}",
    sep="\n",
)

## Step 12: Save Model for Later Use

In [None]:
import shutil
import os

print("="*70)
print("üíæ SAVING MODEL")
print("="*70)

# Locate the exported GGUF file(s). A missing directory is treated like an
# empty one; os.listdir() order is arbitrary, hence the sort.
OUTPUT_DIR = "/kaggle/working/gguf_output"
gguf_files = sorted(
    f
    for f in (os.listdir(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else [])
    if f.endswith('.gguf')
)

if gguf_files:
    # The destination name advertises Q4_K_M, so copy the file whose name
    # mentions that quantization; fall back to the first file only when
    # none does.
    q4_files = [f for f in gguf_files if "q4_k_m" in f.lower()]
    src = os.path.join(OUTPUT_DIR, (q4_files or gguf_files)[0])
    dst = "/kaggle/working/my-finetuned-model-Q4_K_M.gguf"

    shutil.copy(src, dst)

    print(f"\n‚úÖ Model saved: {dst}")
    print(f"   Size: {os.path.getsize(dst) / (1024**2):.1f} MB")
    print(f"\nüí° To use this model later:")
    print(f"   from llamatelemetry.server import ServerManager")
    print(f"   server = ServerManager()")
    # The hint uses start_server(), the method this notebook calls to launch
    # the server.
    print(f"   server.start_server(model_path='{dst}')")
else:
    # Tell the reader why nothing was saved instead of finishing silently.
    print("‚ùå No GGUF file found!")

## Step 13: Cleanup

In [None]:
print("üõë Stopping server...")
server.stop_server()

print("\n‚úÖ Server stopped")
print("\nüìä Final GPU Status:")
!nvidia-smi --query-gpu=index,memory.used,memory.free --format=csv

## 📚 Summary

### Complete Workflow:
1. ✅ Installed Unsloth + llamatelemetry
2. ✅ Loaded base model with 4-bit quantization
3. ✅ Added LoRA adapters for efficient training
4. ✅ Fine-tuned on custom dataset
5. ✅ Exported to GGUF (Q4_K_M)
6. ✅ Deployed with llamatelemetry llama-server
7. ✅ Ran inference on fine-tuned model

### Key llamatelemetry + Unsloth Integration:

```python
from llamatelemetry.unsloth import export_to_llamatelemetry

# After Unsloth training
export_to_llamatelemetry(
    model=model,
    tokenizer=tokenizer,
    output_path="my-model.gguf",
    quant_type="Q4_K_M"
)
```

---

**Next:** [06-split-gpu-graphistry](06-split-gpu-graphistry-llamatelemetry-v0.1.0.ipynb)