This notebook debugs an Unsloth + QLoRA fine-tuning run in Colab before the same setup is tested on Nautilus.

In [None]:
!pip install -U unsloth
!pip install transformers==4.57.1 trl accelerate peft bitsandbytes
!pip install sentencepiece einops timm qwen-vl-utils

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive; force_remount=True requests a fresh mount
# even when one is already present.
drive.mount("/content/drive", force_remount=True)

# PyTorch CUDA allocator option, exported here, ahead of the cell that imports torch.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [None]:
import json
import torch
from unsloth import FastVisionModel
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from transformers import AutoProcessor

# --- Locations on the mounted Google Drive ---------------------------------

# Base checkpoint that gets fine-tuned.
MODEL_PATH = "/content/drive/MyDrive/VLM_MODELS/Qwen2.5-VL-7B-Instruct"

# JSONL files holding the training and validation splits.
TRAINING_DATA, VAL_DATA = (
    "/content/drive/MyDrive/model_datasets/train2.jsonl",
    "/content/drive/MyDrive/model_datasets/valid2.jsonl",
)

# Destination for training output (checkpoints, saved model, tokenizer files).
OUTPUT_DIR = "/content/drive/MyDrive/output/waste_detection_unsloth"

# Create the output directory up front; an already-existing directory is fine.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)


In [None]:
# Options for loading the base checkpoint.
base_load_options = {
    "load_in_4bit": True,                    # 4-bit weights: required for QLoRA
    "use_gradient_checkpointing": "unsloth",
}
model, tokenizer = FastVisionModel.from_pretrained(MODEL_PATH, **base_load_options)

# Which parts of the network receive LoRA adapters: everything is enabled.
adapter_targets = {
    "finetune_vision_layers": True,
    "finetune_language_layers": True,
    "finetune_attention_modules": True,
    "finetune_mlp_modules": True,
}

# LoRA hyperparameters.
adapter_hyperparams = {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "bias": "none",
    "random_state": 42,
}

model = FastVisionModel.get_peft_model(model, **adapter_targets, **adapter_hyperparams)

# NOTE(review): `processor` is not referenced by any later cell visible in this
# notebook (the trainer and collator receive `tokenizer`) - confirm it is still needed.
processor = AutoProcessor.from_pretrained(
    MODEL_PATH, trust_remote_code=True, local_files_only=True
)

# Switch the model into training mode before the trainer is built.
FastVisionModel.for_training(model)

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a UTF-8 text file containing one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines carry no record; skipping them avoids a
        # JSONDecodeError on files with stray empty lines.
        return [json.loads(line) for line in f if line.strip()]

# Read both splits fully into memory as plain Python lists (training split first).
train_dataset, val_dataset = load_jsonl(TRAINING_DATA), load_jsonl(VAL_DATA)

# Quick sanity check on the split sizes.
print(f"Train: {len(train_dataset)} | Valid: {len(val_dataset)}")

In [None]:
# Collator that turns raw samples into model inputs.
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Training hyperparameters, kept in a named object so they can be inspected
# independently of the trainer.
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    # Batching: 1 sample per device, gradients accumulated over 8 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Schedule.
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_steps=50,
    # Logging and checkpointing.
    logging_steps=10,
    save_steps=50,
    save_total_limit=3,
    # Optimizer.
    optim="adamw_8bit",
    weight_decay=0.01,
    # Presumably these three settings hand the untouched samples to the custom
    # collator instead of letting the trainer preprocess them - TODO confirm.
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    max_length=2048,
    # No external experiment tracker; fixed seed.
    report_to="none",
    seed=42,
)

# NOTE(review): `eval_dataset` is supplied but no evaluation strategy is configured
# here - confirm whether evaluation passes are expected to run during training.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

# Persist the trained model first, then the tokenizer files, both into OUTPUT_DIR.
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(OUTPUT_DIR)

print("Training complete.")

In [None]:
# Print a peak-memory summary for device 0; nothing is printed when CUDA is unavailable.
if torch.cuda.is_available():
    gpu = torch.cuda.get_device_properties(0)
    bytes_per_gib = 1024**3
    rule = "="*70

    # Assemble the whole report first, then emit it with a single print call.
    report_lines = [
        "\n" + rule,
        "FINAL GPU MEMORY STATS (UNSLOTH)",
        rule,
        f"GPU Name:             {gpu.name}",
        f"Total Memory:         {gpu.total_memory / bytes_per_gib:.2f} GB",
        f"Max Reserved:         {torch.cuda.max_memory_reserved() / bytes_per_gib:.2f} GB",
        f"Max Allocated:        {torch.cuda.max_memory_allocated() / bytes_per_gib:.2f} GB",
        rule,
    ]
    print("\n".join(report_lines))