# Fine-Tuning with Unsloth + GGUF Export

**Duration:** ~30 min | **Platform:** Kaggle dual Tesla T4

This notebook shows how to fine-tune a model with **Unsloth**, export it to GGUF,
and deploy it with llamatelemetry — all traced with OpenTelemetry.

### What you'll learn
1. Load a base model with Unsloth + LoRA adapters
2. Prepare a dataset (Alpaca format)
3. Train with SFTTrainer (traced with `@workflow`)
4. Export to GGUF with Q4_K_M quantization
5. Deploy and test the fine-tuned model

In [None]:
!pip install -q git+https://github.com/llamatelemetry/llamatelemetry.git@v1.2.0
!pip install -q unsloth trl datasets

## Initialize SDK with Tracing

Enable tracing so every step of the fine-tuning pipeline is observable.

In [None]:
import llamatelemetry

# Initialise the SDK with the service name "finetune-workflow" before any of
# the decorated pipeline steps below are defined or called.
llamatelemetry.init(service_name="finetune-workflow")
# Echo the SDK version so the notebook output records what was installed.
print(f"llamatelemetry {llamatelemetry.version()} — tracing enabled")

## Load Base Model

Use Unsloth's `FastLanguageModel` for 2× faster loading, then attach LoRA adapters with `get_peft_model`.

In [None]:
@llamatelemetry.task(name="load-base-model")
def load_model():
    """Load the 4-bit ``unsloth/gemma-3-1b-it`` checkpoint and attach LoRA adapters.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the object returned
        by ``FastLanguageModel.get_peft_model`` and ``tokenizer`` comes from
        ``FastLanguageModel.from_pretrained``.
    """
    from unsloth import FastLanguageModel

    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/gemma-3-1b-it",
        max_seq_length=2048,
        dtype=None,  # auto-detect
        load_in_4bit=True,
    )

    # Names of the modules that receive LoRA adapters.
    lora_targets = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    model = FastLanguageModel.get_peft_model(
        base_model,
        r=16,
        target_modules=lora_targets,
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
    )

    # Count only the parameters that will actually receive gradients.
    trainable_params = sum(
        param.numel() for param in model.parameters() if param.requires_grad
    )
    print(f"Model loaded with LoRA (r=16, {trainable_params:,} trainable params)")
    return model, tokenizer

model, tokenizer = load_model()

## Prepare Dataset

Format the dataset in Alpaca style: instruction → input → output.

In [None]:
@llamatelemetry.task(name="prepare-dataset")
def prepare_dataset(tokenizer):
    """Build the training set: 1000 Alpaca rows rendered into a ``text`` column.

    Args:
        tokenizer: Tokenizer whose ``eos_token`` is appended to every rendered
            example.

    Returns:
        The mapped dataset; each row gains a ``text`` field holding the
        filled-in prompt followed by the EOS token.
    """
    from datasets import load_dataset

    prompt_template = (
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{instruction}\n"
        "\n"
        "### Input:\n"
        "{input}\n"
        "\n"
        "### Response:\n"
        "{output}"
    )

    def to_text(row):
        # Fill the three template slots from the row's own columns.
        rendered = prompt_template.format(
            instruction=row["instruction"],
            input=row["input"],
            output=row["output"],
        )
        return {"text": rendered + tokenizer.eos_token}

    raw_rows = load_dataset("yahma/alpaca-cleaned", split="train[:1000]")
    dataset = raw_rows.map(to_text)
    print(f"Dataset prepared: {len(dataset)} examples")
    return dataset

dataset = prepare_dataset(tokenizer)

## Train with SFT

The `@workflow` decorator creates a parent span that groups all training steps.

In [None]:
@llamatelemetry.workflow(name="sft-training")
def train(model, tokenizer, dataset):
    """Run a short (30-step) SFT pass and return the trained model.

    Args:
        model: Model to fine-tune; it is handed to ``SFTTrainer`` and returned.
        tokenizer: Tokenizer passed through to ``SFTTrainer``.
        dataset: Training dataset passed as ``train_dataset``.

    Returns:
        The same ``model`` object that was passed in.
    """
    from trl import SFTTrainer
    from transformers import TrainingArguments

    # NOTE(review): `trl` is installed unpinned; the `tokenizer=` and
    # `max_seq_length=` keyword arguments of SFTTrainer have changed across
    # TRL releases — confirm they are accepted by the installed version.
    training_args = TrainingArguments(
        output_dir="outputs",
        max_steps=30,
        warmup_steps=5,
        learning_rate=2e-4,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        fp16=True,
        optim="adamw_8bit",
        logging_steps=5,
    )
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        args=training_args,
        max_seq_length=2048,
    )

    # Pull the two summary numbers out of the training result for reporting.
    metrics = trainer.train().metrics
    final_loss = metrics["train_loss"]
    steps_per_second = metrics["train_steps_per_second"]
    print(f"Training complete: {final_loss:.4f} loss, {steps_per_second:.1f} steps/s")
    return model

model = train(model, tokenizer, dataset)

## Export to GGUF

Merge the LoRA weights back into the base model and export the result to GGUF with Q4_K_M quantization.

In [None]:
@llamatelemetry.task(name="gguf-export")
def export_gguf(model, tokenizer):
    """Export the fine-tuned model to GGUF and return the path of the exported file.

    The export itself is delegated to ``model.save_pretrained_gguf``. The
    output directory is then scanned for ``.gguf`` files; a file whose name
    contains the requested quantization method is preferred, so that any
    other ``.gguf`` file in the same directory is not picked by accident.

    Args:
        model: Fine-tuned model exposing ``save_pretrained_gguf``.
        tokenizer: Tokenizer passed through to ``save_pretrained_gguf``.

    Returns:
        Path to the selected ``.gguf`` file inside the output directory.

    Raises:
        FileNotFoundError: If the output directory is missing or contains no
            ``.gguf`` file after the export.
    """
    import os

    output_dir = "outputs/gemma-3-1b-finetuned-gguf"
    quantization_method = "q4_k_m"
    model.save_pretrained_gguf(
        output_dir,
        tokenizer,
        quantization_method=quantization_method,
    )

    # os.listdir returns entries in arbitrary order; sort so the choice is
    # deterministic across runs and filesystems.
    gguf_files = sorted(f for f in os.listdir(output_dir) if f.endswith(".gguf"))
    if not gguf_files:
        raise FileNotFoundError(
            f"No .gguf file found in {output_dir!r} after export"
        )

    # Prefer the file named after the requested quantization; fall back to the
    # first .gguf file if the naming scheme does not include it.
    quantized_files = [f for f in gguf_files if quantization_method in f.lower()]
    chosen_file = (quantized_files or gguf_files)[0]

    gguf_path = os.path.join(output_dir, chosen_file)
    size_mb = os.path.getsize(gguf_path) / (1024 * 1024)
    print(f"Exported: {gguf_path} ({size_mb:.0f} MB)")
    return gguf_path

gguf_path = export_gguf(model, tokenizer)

## Deploy and Test

Load the fine-tuned GGUF model into llama-server and test inference.

In [None]:
from llamatelemetry.llama import ServerManager, LlamaCppClient

# Start a server on the exported GGUF file and block until it reports ready.
# `mgr` is kept as a notebook global because the cleanup cell stops it later.
mgr = ServerManager()
# ctx_size matches the max_seq_length (2048) used when loading and training.
mgr.start_server(model_path=gguf_path, gpu_layers=99, ctx_size=2048)
mgr.wait_until_ready(timeout=60)  # presumably seconds — confirm

# NOTE(review): the port 8090 is hard-coded here while start_server() is not
# given a port — confirm 8090 is ServerManager's default.
client = LlamaCppClient(base_url="http://127.0.0.1:8090")

# Test the fine-tuned model
test_prompts = [
    "Explain what machine learning is in one sentence.",
    "Write a Python function that reverses a string.",
    "What is the capital of France?",
]

# Send each prompt as a single-turn chat request and print the first choice.
for prompt in test_prompts:
    resp = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=128, temperature=0.7,
    )
    print(f"Q: {prompt}")
    print(f"A: {resp.choices[0].message.content}\n")

## Monitor GPU and Cleanup

In [None]:
# GPU status after fine-tuning + deployment
# Prints one line per entry returned by llamatelemetry.gpu.snapshot().
for s in llamatelemetry.gpu.snapshot():
    print(f"GPU {s.gpu_id}: {s.mem_used_mb}/{s.mem_total_mb} MB ({s.utilization_pct}% util)")

# Stop the server started in the deploy cell before shutting the SDK down.
mgr.stop_server()
# NOTE(review): presumably flushes pending telemetry — confirm.
llamatelemetry.shutdown()
print("\nFine-tuning pipeline complete.")