In [None]:
import gc
import itertools
import os
import shutil

import unsloth  # keep ahead of trl/transformers, as in the original import order
import pandas as pd
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

## Model Configuration

In [None]:
# Presumably the Ollama tag of the target model; not referenced by any other visible cell.
# NOTE(review): this is a Qwen2.5 tag while HF_MODEL_NAME is a Qwen2 checkpoint - confirm the two are meant to differ.
OLLAMA_MODEL_NAME = "qwen2.5:1.5b"
# Hugging Face checkpoint passed as model_name to FastLanguageModel.from_pretrained.
HF_MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
# Passed as max_seq_length to both the model loader and SFTTrainer.
MAX_SEQ_LENGTH = 2048

In [None]:

# JSONL files read with load_dataset("json", ...) in the data-preparation cell.
TRAIN_PATH = "/content/train_dataset_500.jsonl"
# TEST_PATH is read by the data-preparation cell but was never defined, so a
# fresh-kernel run failed with NameError.
# NOTE(review): the file name below mirrors TRAIN_PATH and is an assumption -
# TODO confirm the actual held-out file.
TEST_PATH = "/content/test_dataset_500.jsonl"


## Input Format

In [None]:
# Load the base checkpoint once up front. The tokenizer is what the formatting
# function below uses; the model handle is deleted again at the end of this cell.
model_stub, tokenizer = FastLanguageModel.from_pretrained(
    model_name=HF_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,          # no explicit dtype requested
    load_in_4bit=True,
)


def formatting_prompts_func(examples):
    """Render every conversation of a batch to one chat-template string.

    Args:
        examples: Batched mapping whose "messages" entry holds one
            conversation per example; each conversation is handed unchanged
            to ``tokenizer.apply_chat_template``.

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation, in
        input order.

    Uses the module-level ``tokenizer``. Strings are returned untokenized and
    without a trailing generation prompt.
    """
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        for conversation in examples["messages"]
    ]
    return {"text": rendered}


# Read both JSONL files. Each is requested with split="train" because it is
# loaded from its own data_files argument.
raw_train_dataset = load_dataset("json", data_files=TRAIN_PATH, split="train")
raw_test_dataset = load_dataset("json", data_files=TEST_PATH, split="train")


# Replace the "messages" column with a single rendered "text" column, which is
# the field SFTTrainer is pointed at via dataset_text_field.
print("Formatting Train Dataset...")
train_dataset = raw_train_dataset.map(formatting_prompts_func, batched = True, remove_columns = ["messages"])
print("Formatting Test Dataset...")
test_dataset = raw_test_dataset.map(formatting_prompts_func, batched = True, remove_columns = ["messages"])


# The model stub is no longer needed once the text is rendered. Collect
# garbage BEFORE emptying the CUDA cache: empty_cache() can only hand back
# blocks that are already free, so running it first leaves memory held by
# not-yet-collected objects on the GPU.
del model_stub
gc.collect()
torch.cuda.empty_cache()
print("Data prepared and memory cleared.")

==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Formatting Train Dataset...
Formatting Test Dataset...
Data prepared and memory cleared.


## Parameter Search

In [None]:
#PARAMETER GRID
# Each hyperparameter maps to the candidate values to try; every combination
# becomes one training run.
param_grid = {
    "learning_rate": [2e-4, 1e-4, 5e-5],
    "lora_r": [8, 16, 32],
    "epochs": [1],
}

# Cartesian product of the grid: one {name: value} dict per run.
keys = tuple(param_grid.keys())
values = tuple(param_grid.values())
search_space = [
    dict(zip(keys, combination))
    for combination in itertools.product(*values)
]

# Bookkeeping filled in by the search loop: per-run records plus the best
# (lowest eval loss) configuration seen so far.
results, best_loss, best_params = [], float('inf'), None

print(f"Starting Grid Search with {len(search_space)} combinations...")

# Train and score one LoRA configuration per grid entry, starting every run
# from a freshly loaded base model so runs do not influence each other.
for i, params in enumerate(search_space):
    print(f"\n=== RUN {i+1}/{len(search_space)} | LR: {params['learning_rate']} | Rank: {params['lora_r']} ===")

    # A. Fresh 4-bit base model (also rebinds the module-level tokenizer).
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = HF_MODEL_NAME,
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = None,
        load_in_4bit = True,
    )

    # B. Apply LoRA Config
    model = FastLanguageModel.get_peft_model(
        model,
        r = params['lora_r'],
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = params['lora_r'] * 2, # Auto-scale alpha
        lora_dropout = 0.05,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 42,
    )

    # C. Train
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_dataset,
        eval_dataset = test_dataset,
        dataset_text_field = "text",
        max_seq_length = MAX_SEQ_LENGTH,
        dataset_num_proc = 2,
        packing = False,
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_steps = 5,
            num_train_epochs = params['epochs'],
            learning_rate = params['learning_rate'],
            # Exactly one of fp16/bf16 is enabled, depending on GPU support.
            fp16 = not torch.cuda.is_bf16_supported(),
            bf16 = torch.cuda.is_bf16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 42,
            output_dir = f"temp_output_run_{i}",
            save_strategy = "no",  # search runs are throwaway; no checkpoints
            eval_strategy = "epoch"
        ),
    )

    trainer.train()

    # D. Capture Results
    eval_metrics = trainer.evaluate()
    final_loss = eval_metrics['eval_loss']
    print(f"--> Run {i+1} Result: Eval Loss = {final_loss}")

    results.append({**params, "eval_loss": final_loss})

    if final_loss < best_loss:
        best_loss = final_loss
        # Copy so the winner cannot be changed through search_space later.
        best_params = dict(params)

    # E. Release this run's model and trainer before the next iteration.
    # Otherwise the old model stays referenced while the next one loads, so
    # two models sit on the GPU at once. Collect first, then empty the cache.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()


# Show Results
# One row per run, ranked by eval loss (lowest first), followed by the winner.
results_df = pd.DataFrame(results)
ranked_results = results_df.sort_values(by="eval_loss")
print("\n=== GRID SEARCH COMPLETE ===")
print(ranked_results)
print(f"\nWINNER: {best_params}")

Starting Grid Search with 9 combinations...

=== RUN 1/9 | LR: 0.0002 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.3 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)
[34m[1mwandb[0m: Currently logged in as: [33mayush7[0m ([33mayush7-international-institute-of-information-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.6903,0.712268


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


--> Run 1 Result: Eval Loss = 0.712268054485321

=== RUN 2/9 | LR: 0.0002 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.6367,0.664056


--> Run 2 Result: Eval Loss = 0.6640557646751404

=== RUN 3/9 | LR: 0.0002 | Rank: 32 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Epoch,Training Loss,Validation Loss
1,0.6021,0.631217


--> Run 3 Result: Eval Loss = 0.6312170028686523

=== RUN 4/9 | LR: 0.0001 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Epoch,Training Loss,Validation Loss
1,0.8547,0.86748


--> Run 4 Result: Eval Loss = 0.8674800395965576

=== RUN 5/9 | LR: 0.0001 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.7215,0.747685


--> Run 5 Result: Eval Loss = 0.7476854920387268

=== RUN 6/9 | LR: 0.0001 | Rank: 32 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Epoch,Training Loss,Validation Loss
1,0.6506,0.677944


--> Run 6 Result: Eval Loss = 0.6779436469078064

=== RUN 7/9 | LR: 5e-05 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Epoch,Training Loss,Validation Loss
1,1.4973,1.479064


--> Run 7 Result: Eval Loss = 1.4790643453598022

=== RUN 8/9 | LR: 5e-05 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.959,0.957152


--> Run 8 Result: Eval Loss = 0.9571517705917358

=== RUN 9/9 | LR: 5e-05 | Rank: 32 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Epoch,Training Loss,Validation Loss
1,0.7496,0.775297


--> Run 9 Result: Eval Loss = 0.775297224521637

=== GRID SEARCH COMPLETE ===
   learning_rate  lora_r  epochs  eval_loss
2        0.00020      32       1   0.631217
1        0.00020      16       1   0.664056
5        0.00010      32       1   0.677944
0        0.00020       8       1   0.712268
4        0.00010      16       1   0.747685
8        0.00005      32       1   0.775297
3        0.00010       8       1   0.867480
7        0.00005      16       1   0.957152
6        0.00005       8       1   1.479064

WINNER: {'learning_rate': 0.0002, 'lora_r': 32, 'epochs': 1}


## Training using best parameters

In [None]:
# FINAL TRAINING
# Retrain from a freshly loaded base model with the winning grid-search
# learning rate and LoRA rank, for FINAL_EPOCHS epochs.
FINAL_OUTPUT_DIR = "qwen_1_5b_rag_best_model"
FINAL_EPOCHS = 3

print(f"\nTraining Final Model with Best Params: {best_params}")


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=HF_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,
)

# Apply Best LoRA Params (alpha is twice the rank, as in the search runs)
best_rank = best_params['lora_r']
model = FastLanguageModel.get_peft_model(
    model,
    r=best_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=best_rank * 2,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Exactly one of fp16/bf16 is enabled, depending on GPU support.
use_bf16 = torch.cuda.is_bf16_supported()
final_training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=FINAL_EPOCHS,
    learning_rate=best_params['learning_rate'],
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    output_dir=FINAL_OUTPUT_DIR,
)

#  Final Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=False,
    args=final_training_args,
)

trainer.train()


# Merge the trained LoRA adapters into the base weights and write the result
# to FINAL_OUTPUT_DIR. save_method was "json", which is not a recognised save
# mode; "merged_16bit" matches the 16-bit merge reported in this cell's log.
print("Merging LoRA adapters into full model...")
model.save_pretrained_merged(FINAL_OUTPUT_DIR, tokenizer, save_method = "merged_16bit")
print(f"Model saved to {FINAL_OUTPUT_DIR}")


Training Final Model with Best Params: {'learning_rate': 0.0002, 'lora_r': 32, 'epochs': 1}
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 3 | Total steps = 150
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Step,Training Loss
1,2.7761
2,2.6937
3,2.4663
4,2.2279
5,2.118
6,2.0666
7,1.7223
8,1.4179
9,1.0437
10,0.9542


Merging LoRA adapters into full model...


config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [01:04<00:00, 64.12s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [01:00<00:00, 60.31s/it]


Unsloth: Merge process complete. Saved to `/content/qwen_1_5b_rag_best_model`
Model saved to qwen_1_5b_rag_best_model


In [None]:

# Zip the merged model directory and copy the archive into Google Drive.
# NOTE(review): assumes Drive is mounted at /content/drive (the usual Colab
# mount point) - adjust DRIVE_ROOT if it is mounted elsewhere.
DRIVE_ROOT = "/content/drive/MyDrive"
drive_export_dir = "/content/drive/MyDrive/colab_model_exports"

# Fail fast when Drive is not mounted. Otherwise makedirs would create the
# folder on local disk and the cell would report success for a copy that never
# reached Drive.
if not os.path.isdir(DRIVE_ROOT):
    raise RuntimeError(
        f"Google Drive does not appear to be mounted ({DRIVE_ROOT} not found). "
        "Mount it before exporting the model."
    )
os.makedirs(drive_export_dir, exist_ok=True)

# Lets this cell run on its own when the training cell was skipped.
if 'FINAL_OUTPUT_DIR' not in locals():
    FINAL_OUTPUT_DIR = "qwen_1_5b_rag_best_model"

if not os.path.isdir(FINAL_OUTPUT_DIR):
    raise FileNotFoundError(f"Model directory not found: {FINAL_OUTPUT_DIR}")

# make_archive appends ".zip" itself and returns the path it wrote; reuse that
# path rather than rebuilding the file name by hand.
zip_name = FINAL_OUTPUT_DIR
archive_path = shutil.make_archive(zip_name, 'zip', FINAL_OUTPUT_DIR)

# SAVING to Drive
# Only the archive's base name is joined onto the export folder, so a nested
# FINAL_OUTPUT_DIR does not produce a destination in a missing sub-folder.
destination_path = os.path.join(drive_export_dir, os.path.basename(archive_path))
shutil.copy(archive_path, destination_path)

print(f"Success! Best model saved to: {destination_path}")

Success! Best model saved to: /content/drive/MyDrive/colab_model_exports/qwen_1_5b_rag_best_model.zip
