### News

Placeholder

### Installation

In [None]:
%%capture
# Install Unsloth and dependencies
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes# Placeholder

In [None]:
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import json

In [None]:
# --- Dataset location and loading ---
# Google Colab: uncomment to upload the file through the browser instead.
# from google.colab import files
# uploaded = files.upload()
# dataset_path = list(uploaded.keys())[0]

# Kaggle: point this at the dataset attached to the notebook, e.g.
# dataset_path = "/kaggle/input/your-dataset/finetuning_dataset.jsonl"
dataset_path = "/kaggle/input/nlqueries-sql-charts/finetuning_dataset.jsonl"

print(f"Dataset path: {dataset_path}")

# Load the JSONL file into `dataset`; the SFTTrainer cell passes this name as
# its train_dataset, so it has to exist before that cell runs.
dataset = load_dataset("json", data_files=dataset_path, split="train")
print(f"Loaded {len(dataset):,} examples; columns: {dataset.column_names}")

# The trainer is configured with dataset_text_field="text". The schema of the
# JSONL file is not visible from this notebook, so flag a missing column early
# instead of letting tokenization fail later.
if "text" not in dataset.column_names:
    print(
        'WARNING: the dataset has no "text" column. Add a formatting step that '
        'builds one (e.g. with dataset.map) before creating the trainer.'
    )

In [None]:
# --- Base model and loading options ---
# Checkpoint to fine-tune. Smaller models train faster; other options:
#   "unsloth/Qwen2.5-0.5B-Instruct"  - 0.5B params, very fast
#   "unsloth/Qwen2.5-1.5B-Instruct"  - 1.5B params, good balance
#   "unsloth/Llama-3.2-1B-Instruct"  - 1B params, good quality
#   "unsloth/Mistral-7B-v0.3"        - 7B params, better quality, slower
model_name = "unsloth/Qwen3-1.7B"

# Any length works; Unsloth auto-supports RoPE scaling internally.
max_seq_length = 2048
# None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = None
# 4-bit quantization reduces memory usage.
load_in_4bit = True

# Echo the chosen settings so they are visible in the cell output.
for _label, _value in (
    ("Selected model", model_name),
    ("Max sequence length", max_seq_length),
    ("4-bit quantization", load_in_4bit),
):
    print(f"{_label}: {_value}")

In [None]:
# Load the base model and its tokenizer with the options chosen in the
# configuration cell.
loader_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

# Total parameter count, reported in millions.
total_params_millions = sum(weight.numel() for weight in model.parameters()) / 1e6

print("✓ Model loaded successfully!")
print(f"Model size: {total_params_millions:.1f}M parameters")

In [None]:
# LoRA adapter settings, collected in one place so they are easy to tweak.
lora_settings = {
    # LoRA rank - higher = more capacity but slower (8, 16, 32, 64)
    "r": 16,
    # Attention and MLP projection layers that receive adapters.
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # LoRA scaling factor
    "lora_alpha": 16,
    # Dropout for LoRA layers (0 = no dropout)
    "lora_dropout": 0,
    # Bias training ("none", "all", "lora_only")
    "bias": "none",
    # Longer training but less memory
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
    "use_rslora": False,
    "loftq_config": None,
}

model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Count only the parameters that will receive gradients, in millions.
trainable_params_millions = (
    sum(weight.numel() for weight in model.parameters() if weight.requires_grad) / 1e6
)

print("✓ LoRA configuration applied!")
print(f"Trainable parameters: {trainable_params_millions:.1f}M")

### Unsloth


In [None]:
# LoRA adapters should be attached to the base model exactly once. An earlier
# cell in this notebook already calls get_peft_model on `model`; calling it a
# second time with different settings (r / lora_alpha below) is not a valid way
# to re-configure an already-wrapped model and breaks Restart & Run All.
# Only attach adapters here when none are present yet.
if getattr(model, "peft_config", None):
    print(
        "LoRA adapters are already attached to `model`; "
        "keeping the existing configuration."
    )
else:
    model = FastLanguageModel.get_peft_model(
        model,
        r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 32,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )

Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
from trl import SFTTrainer, SFTConfig

# Hyperparameters for a short supervised fine-tuning run.
sft_args = SFTConfig(
    # Column of the dataset that holds the training text.
    dataset_text_field = "text",
    # 2 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # Capped at 60 steps; use num_train_epochs = 1 instead for one full pass.
    max_steps = 60,
    # Reduce to 2e-5 for long training runs.
    learning_rate = 2e-4,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.001,
    lr_scheduler_type = "linear",
    seed = 3407,
    # No experiment tracker; set to TrackIO/WandB etc. to enable one.
    report_to = "none",
)

# No evaluation set is configured; pass one as eval_dataset to enable evaluation.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = sft_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

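Unsloth also provides a `train_on_responses_only` helper (in `unsloth.chat_templates`) that trains only on the assistant outputs and ignores the loss on the user's inputs, which helps increase the accuracy of finetunes. **Note:** this notebook does not currently apply it, so the run below computes the loss over the whole `text` field.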

In [None]:
# Record the GPU baseline before training starts. The "final memory and time
# stats" cell reads `start_gpu_memory` and `max_memory`, and no earlier cell
# defines them; the baseline cannot be recovered once training has run.
gpu_stats = torch.cuda.get_device_properties(0)
# Peak memory reserved so far (model weights etc.), in GB.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Total memory of GPU 0, in GB.
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Run fine-tuning; the returned object carries the run metrics
# (e.g. 'train_runtime') used by the stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 66,060,288 of 4,088,528,384 (1.62% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.2592
2,1.1944
3,1.7378
4,1.3096
5,0.9034
6,0.9969
7,0.5301
8,0.8768
9,0.7028
10,0.6516


In [None]:
# @title Show final memory and time stats
def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (1024-based), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Peak reserved memory overall, and the share attributable to training
# (peak minus the baseline recorded before training).
used_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Wall-clock training time as reported by the trainer.
train_seconds = trainer_stats.metrics['train_runtime']

report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))