In [None]:
# Install Unsloth, Xformers, and other dependencies
!pip install "unsloth[kaggle-new]"
!pip install "trl" "peft" "accelerate" "bitsandbytes" "triton"
!pip install "evaluate" "rouge_score" "tqdm"

import unsloth
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel
import gc # Garbage collection for memory management

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# --- CONFIGURATION ---
# NOTE(review): OLLAMA_MODEL_NAME is a Qwen2.5 tag while HF_MODEL_NAME is a
# Qwen2 checkpoint — confirm that the two are really meant to differ.
MAX_SEQ_LENGTH = 2048
HF_MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
OLLAMA_MODEL_NAME = "qwen2.5:1.5b"

# --- DATASET LOCATIONS ---
TEST_PATH = "/content/test_dataset_100.jsonl"
TRAIN_PATH = "/content/train_dataset_400.jsonl"

# --- 1. LOAD TOKENIZER (data prep only) ---
# from_pretrained hands back a (model, tokenizer) pair. Only the tokenizer is
# needed for formatting; the model half is kept under the name `model_stub`.
_tokenizer_load_kwargs = dict(
    model_name=HF_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,
)
model_stub, tokenizer = FastLanguageModel.from_pretrained(**_tokenizer_load_kwargs)

# --- 2. PREPARE DATASETS ---
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    Uses the module-level ``tokenizer``; no tokenization is performed and no
    generation prompt is appended.

    Args:
        examples: Batch mapping whose "messages" entry holds one conversation
            per example (whatever ``tokenizer.apply_chat_template`` accepts).

    Returns:
        dict with a single "text" key: the rendered strings, in input order.
    """
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        for conversation in examples["messages"]
    ]
    return {"text": rendered}

# Load Raw Data
# Both files are read with the "json" loader; a single data file is exposed
# under the split name "train", which is why the test file also uses split="train".
raw_train_dataset = load_dataset("json", data_files=TRAIN_PATH, split="train")
raw_test_dataset = load_dataset("json", data_files=TEST_PATH, split="train")

# Format Data
# The "messages" column is replaced by the rendered "text" column.
print("Formatting Train Dataset...")
train_dataset = raw_train_dataset.map(formatting_prompts_func, batched = True, remove_columns = ["messages"])
print("Formatting Test Dataset...")
test_dataset = raw_test_dataset.map(formatting_prompts_func, batched = True, remove_columns = ["messages"])

# --- 3. CLEANUP ---
# Delete the stub model to free GPU memory for the upcoming loop.
# Order matters: run the garbage collector first so objects kept alive only by
# reference cycles are destroyed, then release PyTorch's cached CUDA blocks.
del model_stub
gc.collect()
torch.cuda.empty_cache()
print("Data prepared and memory cleared.")

==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Formatting Train Dataset...
Formatting Test Dataset...
Data prepared and memory cleared.


In [None]:
import itertools
import pandas as pd
import torch
import gc
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# --- HYPERPARAMETER GRID ---
# Three learning rates x three LoRA ranks x one epoch setting = 9 runs.
param_grid = dict(
    learning_rate=[2e-4, 1e-4, 5e-5],
    lora_r=[8, 16, 32],
    epochs=[1],
)

# Cartesian product of all option lists, one dict per combination.
# The last key varies fastest (rank cycles within each learning rate).
keys = tuple(param_grid)
values = tuple(param_grid.values())
search_space = [dict(zip(keys, combo)) for combo in itertools.product(*values)]

# Bookkeeping filled in by the grid-search loop
best_params = None
best_loss = float("inf")
results = []

print(f"Starting Grid Search with {len(search_space)} combinations...")

# Grid search: for every hyperparameter combination, train a fresh LoRA adapter
# on train_dataset and record the loss reported by trainer.evaluate().
# NOTE(review): eval_dataset is test_dataset, so the "winner" is selected on the
# test split — consider holding out a separate validation split for selection.
for i, params in enumerate(search_space):
    print(f"\n=== RUN {i+1}/{len(search_space)} | LR: {params['learning_rate']} | Rank: {params['lora_r']} ===")

    # A. Load Fresh Model (Must reload to reset weights)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = HF_MODEL_NAME,
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = None,
        load_in_4bit = True,
    )

    # B. Apply LoRA Config
    # NOTE(review): the recorded output warns that dropout != 0 prevents Unsloth
    # from fast-patching the LoRA matrices ("causing a performance hit").
    model = FastLanguageModel.get_peft_model(
        model,
        r = params['lora_r'],
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = params['lora_r'] * 2, # Auto-scale alpha
        lora_dropout = 0.05,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 42,
    )

    # C. Train
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_dataset,
        eval_dataset = test_dataset,
        dataset_text_field = "text",
        max_seq_length = MAX_SEQ_LENGTH,
        dataset_num_proc = 2,
        packing = False,
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_steps = 5,
            num_train_epochs = params['epochs'],
            learning_rate = params['learning_rate'],
            # Use bf16 when the GPU supports it, otherwise fall back to fp16
            fp16 = not torch.cuda.is_bf16_supported(),
            bf16 = torch.cuda.is_bf16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 42,
            output_dir = f"temp_output_run_{i}",
            save_strategy = "no",  # no checkpoints for throw-away search runs
            eval_strategy = "epoch"
        ),
    )

    trainer.train()

    # D. Capture Results
    eval_metrics = trainer.evaluate()
    final_loss = eval_metrics['eval_loss']
    print(f"--> Run {i+1} Result: Eval Loss = {final_loss}")

    results.append({**params, "eval_loss": final_loss})

    # Strict "<": on a tie the earlier combination stays the winner
    if final_loss < best_loss:
        best_loss = final_loss
        best_params = params

    # E. CRITICAL: Clear Memory
    # Drop the references, let the garbage collector destroy anything kept alive
    # only by reference cycles, and only then release PyTorch's cached CUDA blocks.
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

# Show Results (sorted so the lowest eval loss comes first)
results_df = pd.DataFrame(results)
print("\n=== GRID SEARCH COMPLETE ===")
print(results_df.sort_values(by="eval_loss"))
print(f"\nWINNER: {best_params}")

Starting Grid Search with 9 combinations...

=== RUN 1/9 | LR: 0.0002 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.3 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)
[34m[1mwandb[0m: Currently logged in as: [33mayush7[0m ([33mayush7-international-institute-of-information-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.6903,0.712268


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


--> Run 1 Result: Eval Loss = 0.712268054485321

=== RUN 2/9 | LR: 0.0002 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.6367,0.664056


--> Run 2 Result: Eval Loss = 0.6640557646751404

=== RUN 3/9 | LR: 0.0002 | Rank: 32 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Epoch,Training Loss,Validation Loss
1,0.6021,0.631217


--> Run 3 Result: Eval Loss = 0.6312170028686523

=== RUN 4/9 | LR: 0.0001 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Epoch,Training Loss,Validation Loss
1,0.8547,0.86748


--> Run 4 Result: Eval Loss = 0.8674800395965576

=== RUN 5/9 | LR: 0.0001 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.7215,0.747685


--> Run 5 Result: Eval Loss = 0.7476854920387268

=== RUN 6/9 | LR: 0.0001 | Rank: 32 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Epoch,Training Loss,Validation Loss
1,0.6506,0.677944


--> Run 6 Result: Eval Loss = 0.6779436469078064

=== RUN 7/9 | LR: 5e-05 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Epoch,Training Loss,Validation Loss
1,1.4973,1.479064


--> Run 7 Result: Eval Loss = 1.4790643453598022

=== RUN 8/9 | LR: 5e-05 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.959,0.957152


--> Run 8 Result: Eval Loss = 0.9571517705917358

=== RUN 9/9 | LR: 5e-05 | Rank: 32 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Epoch,Training Loss,Validation Loss
1,0.7496,0.775297


--> Run 9 Result: Eval Loss = 0.775297224521637

=== GRID SEARCH COMPLETE ===
   learning_rate  lora_r  epochs  eval_loss
2        0.00020      32       1   0.631217
1        0.00020      16       1   0.664056
5        0.00010      32       1   0.677944
0        0.00020       8       1   0.712268
4        0.00010      16       1   0.747685
8        0.00005      32       1   0.775297
3        0.00010       8       1   0.867480
7        0.00005      16       1   0.957152
6        0.00005       8       1   1.479064

WINNER: {'learning_rate': 0.0002, 'lora_r': 32, 'epochs': 1}


In [None]:
# --- FINAL TRAINING ---
# Retrain once with the winning grid-search parameters, for more epochs than the
# 1-epoch search runs.
FINAL_EPOCHS = 3
FINAL_OUTPUT_DIR = "qwen_1_5b_rag_best_model"

print(f"\nTraining Final Model with Best Params: {best_params}")

# 1. Start again from freshly loaded 4-bit base weights
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=HF_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,
)

# 2. Attach LoRA adapters sized by the winning rank (alpha = 2 * rank)
final_rank = best_params['lora_r']
model = FastLanguageModel.get_peft_model(
    model,
    r=final_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=final_rank * 2,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# 3. Build the trainer
# bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()
final_training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=FINAL_EPOCHS,
    learning_rate=best_params['learning_rate'],
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    output_dir=FINAL_OUTPUT_DIR,
)
# An eval_dataset is supplied, but no eval_strategy is set in the arguments above.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=False,
    args=final_training_args,
)

trainer.train()

# 4. Merge and Save Locally
# save_method should be one of Unsloth's documented modes ("merged_16bit",
# "merged_4bit", "lora"); "json" is not among them. The recorded run reported
# "Merging weights into 16bit", so "merged_16bit" states that behaviour explicitly.
print("Merging LoRA adapters into full model...")
model.save_pretrained_merged(FINAL_OUTPUT_DIR, tokenizer, save_method = "merged_16bit")
print(f"Model saved to {FINAL_OUTPUT_DIR}")


Training Final Model with Best Params: {'learning_rate': 0.0002, 'lora_r': 32, 'epochs': 1}
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 3 | Total steps = 150
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 36,929,536 of 1,580,643,840 (2.34% trained)


Step,Training Loss
1,2.7761
2,2.6937
3,2.4663
4,2.2279
5,2.118
6,2.0666
7,1.7223
8,1.4179
9,1.0437
10,0.9542


Merging LoRA adapters into full model...


config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [01:04<00:00, 64.12s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [01:00<00:00, 60.31s/it]


Unsloth: Merge process complete. Saved to `/content/qwen_1_5b_rag_best_model`
Model saved to qwen_1_5b_rag_best_model


In [None]:
import os
import shutil
from google.colab import drive, files

# Make sure Google Drive is actually mounted before writing to it. Without a
# mount, os.makedirs below would just create an ordinary local folder at this
# path and the "export" would never reach Drive.
DRIVE_MOUNT_POINT = "/content/drive"
if not os.path.ismount(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT)

# Define where in Drive you want to save it
drive_export_dir = "/content/drive/MyDrive/colab_model_exports"
os.makedirs(drive_export_dir, exist_ok=True)

# Zip the model folder
# Ensure FINAL_OUTPUT_DIR is defined (it should be set in Cell 4)
if 'FINAL_OUTPUT_DIR' not in locals():
    FINAL_OUTPUT_DIR = "qwen_1_5b_rag_best_model" # Default fallback if Cell 4 wasn't run

# make_archive appends ".zip" itself, so zip_name is the archive name without extension
zip_name = f"{FINAL_OUTPUT_DIR}"
shutil.make_archive(zip_name, 'zip', FINAL_OUTPUT_DIR)

# Copy to Drive (the local archive is kept)
destination_path = os.path.join(drive_export_dir, f"{zip_name}.zip")
shutil.copy(f"{zip_name}.zip", destination_path)

print(f"Success! Best model saved to: {destination_path}")

Success! Best model saved to: /content/drive/MyDrive/colab_model_exports/qwen_1_5b_rag_best_model.zip


-

-

In [None]:
import torch
import gc
from unsloth import FastLanguageModel

# 1. Drop the trainer, if one exists, to free up VRAM (optimizer states, gradients)
try:
    del trainer
except NameError:
    pass  # no trainer in this session — nothing to release

# 2. Collect garbage, then release cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

# 3. Put the in-memory `model` (expected to be left over from the final-training
#    cell) into Unsloth's inference mode
FastLanguageModel.for_inference(model)

print("Memory cleared and Model switched to Inference Mode!")

Memory cleared and Model switched to Inference Mode!


In [None]:
from datasets import load_dataset
from tqdm import tqdm

# Load the test dataset (using the path defined in Cell 2)
# If TEST_PATH is missing, uncomment the line below:
# TEST_PATH = "/content/test_dataset_100.jsonl"
dataset = load_dataset("json", data_files=TEST_PATH, split="train")

# One record per test example: prompt messages, generated text, reference answer.
# NOTE(review): this reuses the name `results` from the grid-search cell.
results = []

print(f"Starting inference on {len(dataset)} examples...")

# Loop through the test set
for item in tqdm(dataset):
    messages = item["messages"]

    # We strip the assistant's answer from the input to test the model
    # (Assuming the last message is the assistant's target answer).
    # An empty message list falls through to the "N/A" branch instead of raising.
    if messages and messages[-1]["role"] == "assistant":
        input_messages = messages[:-1] # Input is everything UP TO the answer
        ground_truth = messages[-1]["content"]
    else:
        input_messages = messages
        ground_truth = "N/A"

    # Prepare inputs using the tokenizer.
    # return_dict=True also yields the attention mask; the recorded run warned
    # that it "cannot be inferred from input because pad token is same as eos token".
    encoded = tokenizer.apply_chat_template(
        input_messages,
        tokenize = True,
        add_generation_prompt = True, # Crucial: tells model "it's your turn"
        return_tensors = "pt",
        return_dict = True,
    ).to("cuda")
    prompt_length = encoded["input_ids"].shape[1]

    # Generate
    outputs = model.generate(
        input_ids = encoded["input_ids"],
        attention_mask = encoded["attention_mask"],
        max_new_tokens = 256,   # Adjust if your answers need to be longer
        use_cache = True,
        # temperature / top_p only take effect when sampling is enabled,
        # so request it explicitly rather than relying on the model's defaults.
        do_sample = True,
        temperature = 0.7,
        top_p = 0.9,
    )

    # Keep only the newly generated tokens (everything after the prompt) and
    # drop special tokens while decoding. This avoids splitting the decoded
    # text on the "<|im_start|>assistant" marker, which breaks if that marker
    # occurs inside the prompt or the answer.
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)

    # Save
    results.append({
        "prompt": input_messages,
        "generated_response": response_text,
        "ground_truth": ground_truth
    })

print("\nInference Complete!")

Starting inference on 100 examples...


  0%|          | 0/100 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 100/100 [13:43<00:00,  8.24s/it]


Inference Complete!





In [None]:
import json
import shutil
import os
from google.colab import drive

# 1. Print up to 3 examples to check quality.
# Slicing (instead of range(3)) avoids an IndexError when fewer than three
# predictions are available.
print("\n--- PREVIEW ---")
for example_no, res in enumerate(results[:3], start=1):
    print(f"\nExample {example_no}:")
    # The last prompt message is shown as the user turn
    print(f"Prompt (User): {res['prompt'][-1]['content']}")
    print(f"Generated:     {res['generated_response']}")
    print(f"Actual:        {res['ground_truth']}")
    print("-" * 20)

# 2. Save prediction file locally (one JSON object per line)
output_filename = "test_predictions.jsonl"
with open(output_filename, "w") as f:
    for res in results:
        f.write(json.dumps(res) + "\n")

# 3. Export to Google Drive
# Mount Drive first if needed; otherwise os.makedirs would create an ordinary
# local folder at this path and the copy would never reach Drive.
DRIVE_MOUNT_POINT = "/content/drive"
if not os.path.ismount(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT)

drive_export_dir = "/content/drive/MyDrive/colab_model_exports"
os.makedirs(drive_export_dir, exist_ok=True)

dest_path = os.path.join(drive_export_dir, output_filename)
shutil.copy(output_filename, dest_path)

print(f"\nPredictions saved to Drive: {dest_path}")


--- PREVIEW ---

Example 1:
Prompt (User): CONTEXT:
[Chunk 1]: TYPE: small molecule. CREATED: 2010-09-15. UPDATED: 2020-06-12. GROUPS: investigational. DESCRIPTION: cytoplasm. AVERAGE-MASS: 478.484. MONOISOTOPIC-MASS: 478.117412475. UNII: QKY617BBX5. CAS-NUMBER: 343321-96-0. STATE: solid. TARGETS: BE0003728, BE0000071. PDB-ENTRIES: 2p54...

QUESTION: 
What are the pharmacological effects properties of GW-590735?
Generated:     <thought>
The user asks about the pharmacological effects of GW-590735.
Chunk 1 may contain additional context.
I will merge these facts into a concise answer.
</thought>
TYPE: small molecule. CREATED: 2010-09-15. UPDATED: 2020-06-12. GROUPS: investigational. DESCRIPTION: cytoplasm. AVERAGE-MASS: 478.484. MONOISOTOPIC-MASS: 478.117412475. UNII: QKY617BBX5. CAS...
Actual:        <thought>
The user asks about the pharmacological effects of GW-590735.
Chunk 1 may contain additional context.
I will merge these facts into a concise answer.
</thought>
TYPE: small mole

-

-

-

-

-

In [None]:
# 1. Install necessary libraries
!pip install "unsloth[kaggle-new]"
!pip install "trl" "peft" "accelerate" "bitsandbytes" "triton"
!pip install "evaluate" "rouge_score" "tqdm"

# 2. Import standard libraries
import os
import gc
import shutil
import itertools
import pandas as pd
import torch
from google.colab import drive

# 3. Import AI/ML libraries
from unsloth import FastLanguageModel
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments

# 4. Mount Google Drive
# Mounted at /content/drive; the export steps in this notebook write to
# /content/drive/MyDrive/colab_model_exports.
drive.mount('/content/drive')

Collecting unsloth[kaggle-new]
  Downloading unsloth-2025.11.3-py3-none-any.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.11.4 (from unsloth[kaggle-new])
  Downloading unsloth_zoo-2025.11.4-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth[kaggle-new])
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[kaggle-new])
  Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth[kaggle-new])
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth[kaggle-new])
  Downloading datasets-4.3.0-py3-none-

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=11657802c22ea367ddeba2c60e4f94a52047e6e4b6c94df5fe5cde1dac69c55b
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.6 rouge_score-0.1.2
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will n

In [None]:
# --- CONFIGURATION ---
MAX_SEQ_LENGTH = 2048
MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct"

# *** DATASET FILES ***
TEST_FILE  = "/content/data/test_dataset.jsonl"
TRAIN_FILE = "/content/data/train_dataset.jsonl"

# --- 1. LOAD TOKENIZER ---
# from_pretrained returns a (model, tokenizer) pair. Only the tokenizer is
# needed for formatting; the model half is kept under the name `model_stub`.
_stub_load_kwargs = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=True,
)
model_stub, tokenizer = FastLanguageModel.from_pretrained(**_stub_load_kwargs)

# --- 2. FORMATTING FUNCTION ---
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    Args:
        examples: Batched mapping with a "messages" entry; each element is one
            conversation handed unchanged to ``tokenizer.apply_chat_template``.

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation, in
        input order.
    """
    # tokenize=False asks for text rather than token ids; no generation
    # prompt is requested.
    return {
        "text": [
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
            for conversation in examples["messages"]
        ]
    }

# --- 3. LOAD & PROCESS DATASETS ---
print(f"Loading training data from: {TRAIN_FILE}")
raw_train = load_dataset("json", data_files=TRAIN_FILE, split="train")

print(f"Loading test data from: {TEST_FILE}")
# The test file is also requested with split="train": it is loaded from a
# single file via data_files, exactly like the training file above.
raw_test = load_dataset("json", data_files=TEST_FILE, split="train")

# Replace the raw "messages" column with the rendered "text" column.
train_dataset = raw_train.map(formatting_prompts_func, batched=True, remove_columns=["messages"])
test_dataset  = raw_test.map(formatting_prompts_func, batched=True, remove_columns=["messages"])

# --- 4. CLEANUP ---
# Free up VRAM for the upcoming grid search.
# Order matters: run the garbage collector first so that objects kept alive
# only by reference cycles are actually destroyed, and only then ask the CUDA
# caching allocator to hand the now-unused blocks back to the driver.
del model_stub
gc.collect()
torch.cuda.empty_cache()
print("Data ready. Memory cleared.")

==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Loading training data from: /content/data/train_dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Loading test data from: /content/data/test_dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Data ready. Memory cleared.


In [None]:
# --- DEFINE PARAMETER GRID ---
# Search space: 3 learning rates x 2 LoRA ranks x 1 epoch setting = 6 runs.
param_grid = {
    "learning_rate": [2e-4, 1e-4, 5e-5],
    "lora_r": [8, 16],
    "epochs": [1],  # a single epoch keeps each trial short
}

# Expand the grid into one {name: value} dict per combination.
keys = tuple(param_grid.keys())
values = tuple(param_grid.values())
search_space = list(
    map(lambda combo: dict(zip(keys, combo)), itertools.product(*values))
)

# Accumulators filled in by the search loop.
results, best_loss, best_params = [], float("inf"), None

print(f"Starting Grid Search with {len(search_space)} combinations...")

# Train one LoRA adapter per hyper-parameter combination and keep track of the
# configuration with the lowest evaluation loss.
# NOTE(review): eval_dataset is `test_dataset`, so the "best" configuration is
# selected on the test data itself — consider a separate validation split.
for i, params in enumerate(search_space):
    print(f"\n=== RUN {i+1}/{len(search_space)} | LR: {params['learning_rate']} | Rank: {params['lora_r']} ===")

    # Pre-bind both names so the cleanup in `finally` works even when model
    # loading or trainer construction raises.
    model = trainer = None
    try:
        # 1. Load Fresh Model (each run starts from the untouched base weights)
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = MODEL_NAME,
            max_seq_length = MAX_SEQ_LENGTH,
            dtype = None,
            load_in_4bit = True,
        )

        # 2. Apply LoRA Configuration (alpha is tied to the rank: alpha = 2 * r)
        model = FastLanguageModel.get_peft_model(
            model,
            r = params['lora_r'],
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            lora_alpha = params['lora_r'] * 2,
            lora_dropout = 0.05,
            bias = "none",
            use_gradient_checkpointing = "unsloth",
            random_state = 3407,
        )

        # 3. Setup Trainer
        trainer = SFTTrainer(
            model = model,
            tokenizer = tokenizer,
            train_dataset = train_dataset,
            eval_dataset = test_dataset,
            dataset_text_field = "text",
            max_seq_length = MAX_SEQ_LENGTH,
            dataset_num_proc = 2,
            packing = False,
            args = TrainingArguments(
                per_device_train_batch_size = 2,
                gradient_accumulation_steps = 4,
                warmup_steps = 5,
                num_train_epochs = params['epochs'],
                learning_rate = params['learning_rate'],
                # Exactly one of fp16 / bf16 is enabled, depending on GPU support.
                fp16 = not torch.cuda.is_bf16_supported(),
                bf16 = torch.cuda.is_bf16_supported(),
                logging_steps = 1,
                optim = "adamw_8bit",
                weight_decay = 0.01,
                lr_scheduler_type = "linear",
                seed = 3407,
                output_dir = f"temp_output_run_{i}",
                save_strategy = "no",  # trial checkpoints are not kept
                eval_strategy = "epoch"
            ),
        )

        # 4. Train & Evaluate
        trainer.train()
        metrics = trainer.evaluate()
        final_loss = metrics['eval_loss']

        print(f"--> Run {i+1} Finished. Validation Loss: {final_loss}")

        # 5. Store Results
        results.append({**params, "eval_loss": final_loss})

        if final_loss < best_loss:
            best_loss = final_loss
            best_params = params
    finally:
        # 6. Cleanup Memory — also on failure, so an aborted run does not keep
        # its model on the GPU. Collect garbage first so cyclic references are
        # broken before the CUDA cache is emptied; the reverse order leaves
        # that memory cached.
        del model, trainer
        gc.collect()
        torch.cuda.empty_cache()

# --- REPORT ---
# Tabulate every trial, lowest evaluation loss first, then name the winner.
print("\n====== GRID SEARCH RESULTS ======")
results_df = pd.DataFrame(results)
print(results_df.sort_values("eval_loss", ascending=True))
print(f"\n🏆 BEST PARAMETERS: {best_params}")

Starting Grid Search with 6 combinations...

=== RUN 1/6 | LR: 0.0002 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.3 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/400 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mayush7[0m ([33mayush7-international-institute-of-information-technology[0m) to [32mhttps://api.

[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.7822,0.880708


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


--> Run 1 Finished. Validation Loss: 0.8807081580162048

=== RUN 2/6 | LR: 0.0002 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.7018,0.813029


--> Run 2 Finished. Validation Loss: 0.8130291700363159

=== RUN 3/6 | LR: 0.0001 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Epoch,Training Loss,Validation Loss
1,0.9242,1.019352


--> Run 3 Finished. Validation Loss: 1.019351840019226

=== RUN 4/6 | LR: 0.0001 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,0.8117,0.909166


--> Run 4 Finished. Validation Loss: 0.909165620803833

=== RUN 5/6 | LR: 5e-05 | Rank: 8 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 9,232,384 of 1,552,946,688 (0.59% trained)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,1.558,1.623318


--> Run 5 Finished. Validation Loss: 1.623317837715149

=== RUN 6/6 | LR: 5e-05 | Rank: 16 ===
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Epoch,Training Loss,Validation Loss
1,1.0267,1.114377


--> Run 6 Finished. Validation Loss: 1.1143767833709717

   learning_rate  lora_r  epochs  eval_loss
1        0.00020      16       1   0.813029
0        0.00020       8       1   0.880708
3        0.00010      16       1   0.909166
2        0.00010       8       1   1.019352
5        0.00005      16       1   1.114377
4        0.00005       8       1   1.623318

🏆 BEST PARAMETERS: {'learning_rate': 0.0002, 'lora_r': 16, 'epochs': 1}


In [None]:
# --- FINAL CONFIGURATION ---
FINAL_OUTPUT_DIR = "qwen_1_5b_final_model"
FINAL_EPOCHS = 3 # Increase epochs for the production model

# Fallback if you skipped the grid-search cell. Looking the name up through
# globals() avoids the NameError that a bare `best_params is None` test raises
# when the variable was never defined in this kernel.
best_params = globals().get("best_params")
if best_params is None:
    best_params = {"learning_rate": 2e-4, "lora_r": 16}

print(f"\nTraining Final Model with: {best_params} for {FINAL_EPOCHS} epochs...")

# 1. Load Model (fresh base weights, 4-bit quantized)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = True,
)

# 2. Apply Best LoRA Settings (alpha = 2 * r)
model = FastLanguageModel.get_peft_model(
    model,
    r = best_params['lora_r'],
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = best_params['lora_r'] * 2,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

# 3. Final Trainer
# NOTE(review): output_dir is the same directory the merged model is written
# to below, so trainer checkpoints end up next to the exported weights.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = FINAL_EPOCHS,
        learning_rate = best_params['learning_rate'],
        # Exactly one of fp16 / bf16 is enabled, depending on GPU support.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = FINAL_OUTPUT_DIR,
    ),
)

trainer.train()

# 4. Merge & Save
# "merged_16bit" is the documented Unsloth value for merging the LoRA adapter
# into 16-bit weights; the previous value "json" is not a documented option.
print("Merging and saving final model...")
model.save_pretrained_merged(FINAL_OUTPUT_DIR, tokenizer, save_method = "merged_16bit")
print(f"Model saved locally to {FINAL_OUTPUT_DIR}")


Training Final Model with: {'learning_rate': 0.0002, 'lora_r': 16, 'epochs': 1} for 3 epochs...
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 3 | Total steps = 150
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss
1,2.6644
2,2.6571
3,2.7952
4,2.544
5,2.3205
6,2.2641
7,2.1384
8,1.9438
9,1.9142
10,1.6866


Merging and saving final model...


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:32<00:00, 32.68s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:35<00:00, 35.14s/it]


Unsloth: Merge process complete. Saved to `/content/qwen_1_5b_final_model`
Model saved locally to qwen_1_5b_final_model


In [None]:
# Export folder on the mounted Drive; created on first use.
drive_folder = "/content/drive/MyDrive/colab_model_exports"
os.makedirs(name=drive_folder, exist_ok=True)

# The archive is named after the model directory it contains.
zip_name = str(FINAL_OUTPUT_DIR)
print(f"Zipping {zip_name}...")

# Writes "<zip_name>.zip" with the contents of FINAL_OUTPUT_DIR at its root.
shutil.make_archive(base_name=zip_name, format="zip", root_dir=FINAL_OUTPUT_DIR)

# Copy the archive into the Drive folder under the same file name.
destination = os.path.join(drive_folder, "{}.zip".format(zip_name))
shutil.copy("{}.zip".format(zip_name), destination)

print(f"✅ Success! Model saved to: {destination}")

Zipping qwen_1_5b_final_model...
✅ Success! Model saved to: /content/drive/MyDrive/colab_model_exports/qwen_1_5b_final_model.zip


-

In [None]:
# 1. Install standard libraries
!pip install transformers torch datasets tqdm

import os
import torch
import shutil
import json
import re
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

# 2. Mount Google Drive
# The model archive read by the following cells lives under
# /content/drive/MyDrive, so the mount has to happen before they run.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- CONFIGURATION ---
# Path to your zipped merged model in Drive
zip_path = "/content/drive/MyDrive/colab_model_exports/qwen_1_5b_final_model.zip"
local_model_dir = "cpu_inference_model"
test_data_path = "/content/data/test_dataset.jsonl" # Ensure this matches your file path

# 1. Unzip the model
if os.path.exists(zip_path):
    print(f"Unzipping model to {local_model_dir}...")
    # Pure-Python extraction instead of `!unzip`: a corrupt or unreadable
    # archive now raises, so the success message below is only printed after a
    # real extraction. Existing files in the target directory are overwritten.
    shutil.unpack_archive(zip_path, local_model_dir, format="zip")
    print("Model unzipped successfully.")
else:
    # Deliberately non-fatal: the message tells the user which path to fix.
    print("❌ Zip file not found! Check the path.")

Unzipping model to cpu_inference_model...
Model unzipped successfully.


In [None]:
import os
import shutil
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Configuration
local_model_dir = "cpu_inference_model"
base_model_name = "Qwen/Qwen2.5-1.5B-Instruct" # Fallback to get config
checkpoint_dir = os.path.join(local_model_dir, "checkpoint-150")
_root_config = os.path.join(local_model_dir, "config.json")

print(f"🔍 Diagnosing {local_model_dir}...")

# --- FIX 1: reuse a config.json stored inside the checkpoint folder ---
config_found = False
if os.path.exists(checkpoint_dir):
    print(f"📂 checking {checkpoint_dir}...")
    config_found = "config.json" in os.listdir(checkpoint_dir)
    if config_found:
        print("✅ Found config.json in checkpoint! Moving it to root...")
        shutil.copy(os.path.join(checkpoint_dir, "config.json"), _root_config)

# --- FIX 2: otherwise fetch the base model's config when none exists ---
if not (config_found or os.path.exists(_root_config)):
    print("⚠️ config.json still missing. Downloading default from Hugging Face...")
    try:
        config = AutoConfig.from_pretrained(base_model_name)
        config.save_pretrained(local_model_dir)
    except Exception as download_error:
        print(f"❌ Failed to download config: {download_error}")
    else:
        print("✅ Default config downloaded and saved.")

# --- STEP 3: RELOAD MODEL ---
print("\n🔄 Attempting to load model again on CPU...")

try:
    # Tokenizer first, then the weights in float32 on the CPU.
    tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
    model = AutoModelForCausalLM.from_pretrained(
        local_model_dir,
        device_map="cpu",
        torch_dtype=torch.float32,
        trust_remote_code=True
    )
except Exception as load_error:
    # Best effort: report the problem instead of aborting the cell.
    print(f"\n❌ STILL FAILING: {load_error}")
    print("Suggestion: The model might be corrupted. You may need to re-run the 'Final Training' cell to save it correctly.")
else:
    print("\n✅ SUCCESS: Model loaded on CPU!")

🔍 Diagnosing cpu_inference_model...
📂 checking cpu_inference_model/checkpoint-150...
⚠️ config.json still missing. Downloading default from Hugging Face...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

✅ Default config downloaded and saved.

🔄 Attempting to load model again on CPU...


`torch_dtype` is deprecated! Use `dtype` instead!



✅ SUCCESS: Model loaded on CPU!


In [None]:
import os
from google.colab import files

# --- DIAGNOSTIC CHECK ---
expected_path = "/content/data/test_dataset.jsonl"
likely_fallback = "/content/test_dataset.jsonl"

print("🔍 Looking for file...")

# Probe the two known locations in priority order and stop at the first hit.
_located = next(
    (path for path in (expected_path, likely_fallback) if os.path.exists(path)),
    None,
)

if _located is None:
    print("❌ File not found anywhere!")
    print("📂 Please upload 'test_dataset.jsonl' now:")

    # Ask for an interactive upload and use the first uploaded file name.
    uploaded = files.upload()
    filename = next(iter(uploaded))
    test_data_path = os.path.abspath(filename)
    print(f"✅ File uploaded! Path set to: {test_data_path}")
else:
    if _located == expected_path:
        print(f"✅ Found file at: {expected_path}")
    else:
        print("⚠️ File not found in 'data' folder, but FOUND in root!")
        print(f"✅ Updating path to: {likely_fallback}")
    test_data_path = _located

# --- UPDATE & RUN INFERENCE ---
# From here on only the resolved `test_data_path` is used.
print(f"\n🚀 Starting inference using: {test_data_path}")

from datasets import load_dataset
from tqdm import tqdm
import torch

# Load dataset with the CORRECTED path
dataset = load_dataset("json", data_files=test_data_path, split="train")
results = []

print(f"Loaded {len(dataset)} examples. Running inference...")


def _strip_thought(text):
    """Return the part of ``text`` after the last ``</thought>`` tag, stripped.

    When the tag is absent the whole text is returned, also stripped, so
    predictions and references are normalized the same way.
    """
    return text.split("</thought>")[-1].strip()


for item in tqdm(dataset):
    messages = item["messages"]

    # 1. Prepare Input
    # A trailing assistant turn is the reference answer: hold it out of the
    # prompt and keep only its post-</thought> part as ground truth.
    if messages[-1]["role"] == "assistant":
        input_messages = messages[:-1]
        ground_truth = _strip_thought(messages[-1]["content"])
    else:
        input_messages = messages
        ground_truth = "N/A"

    # 2. Tokenize
    # The tensor is passed to generate() as-is, with no explicit device move.
    inputs = tokenizer.apply_chat_template(
        input_messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # 3. Generate
    # NOTE(review): sampling is enabled and no seed is set in this cell, so
    # predictions can differ between runs.
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=512,
            use_cache=True,
            temperature=0.6,
            top_p=0.9,
            do_sample=True
        )

    # 4. Decode & Clean
    # The decoded sequence still contains the prompt; keep only what follows
    # the last assistant header and drop the end-of-turn marker.
    decoded_output = tokenizer.batch_decode(outputs)[0]
    full_response = decoded_output.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "")

    # Extract Answer (same normalization as the ground truth)
    final_answer = _strip_thought(full_response)

    # 5. Save
    results.append({
        "prompt": input_messages,
        "final_answer": final_answer,
        "ground_truth": ground_truth
    })

print("\n✅ Inference Complete!")

🔍 Looking for file...
❌ File not found anywhere!
📂 Please upload 'test_dataset.jsonl' now:


Saving test_dataset_100.jsonl to test_dataset_100.jsonl
✅ File uploaded! Path set to: /content/test_dataset_100.jsonl

🚀 Starting inference using: /content/test_dataset_100.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Loaded 100 examples. Running inference...


  0%|          | 0/100 [00:00<?, ?it/s]


NameError: name 'tokenizer' is not defined

In [None]:
# 1. Preview the first two predictions (fewer if there are not that many).
print("\n--- PREVIEW ---")
for _sample in results[:2]:
    print(f"Generated: {_sample['final_answer'][:100]}...") # Truncated for display
    print(f"Target:    {_sample['ground_truth'][:100]}...")
    print("-" * 20)

# 2. Save locally as JSON Lines: one record per line.
output_filename = "cpu_predictions.jsonl"
with open(output_filename, "w") as f:
    f.writelines(json.dumps(res) + "\n" for res in results)

# 3. Copy to Drive
drive_folder = "/content/drive/MyDrive/colab_model_exports"
dest_path = os.path.join(drive_folder, output_filename)
shutil.copy(output_filename, dest_path)

print(f"\n✅ Predictions saved to: {dest_path}")

-

-

-

-

In [None]:
# Mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive are reachable from this session.
from google.colab import drive
drive.mount('/content/drive')

# Task
The Qwen2-1.5B model has been trained with the best hyperparameters, saved to Google Drive at `/content/drive/MyDrive/colab_model_exports/qwen_1_5b_rag_best_model.zip`, and inference has been performed on the test dataset with predictions saved to `/content/drive/MyDrive/colab_model_exports/test_predictions.jsonl`.

## Create Data Directory

### Subtask:
Create a directory `/content/data/` if it doesn't already exist, to store the new datasets.


**Reasoning**:
The subtask is to create a directory. I will use the `os` module to create the directory `/content/data/` as specified in the instructions, ensuring `exist_ok=True` to avoid errors if it already exists.



In [None]:
import os

# Folder that will hold the exported dataset files.
data_dir = "/content/data/"

# exist_ok=True keeps the cell re-runnable when the folder is already there.
os.makedirs(name=data_dir, exist_ok=True)

print("Directory '{}' created or already exists.".format(data_dir))

Directory '/content/data/' created or already exists.


## Save Train Dataset

### Subtask:
Save the provided training dataset to `/content/data/train_dataset.jsonl`.


**Reasoning**:
The subtask requires saving the `raw_train_dataset` to a specified path in JSONL format. The `to_json` method with `lines=True` is appropriate for this.



In [None]:
# Writes the training set as JSON Lines (one record per line).
# Depends on `raw_train_dataset` already existing in the kernel; when it does
# not, this cell fails with the NameError captured in the output below.
output_path = "/content/data/train_dataset.jsonl"
raw_train_dataset.to_json(output_path, orient="records", lines=True)
print(f"Training dataset saved to {output_path}")

NameError: name 'raw_train_dataset' is not defined

**Reasoning**:
The previous cell failed because `raw_train_dataset` was not defined in the current execution context. To fix this, I need to reload the raw training dataset using `load_dataset` and the `TRAIN_PATH` which is available in the environment.



In [None]:
from datasets import load_dataset

# Source and destination of the training split.
TRAIN_PATH = "/content/train_dataset_400.jsonl"
output_path = "/content/data/train_dataset.jsonl"

# Reload the raw file so this cell does not rely on earlier kernel state,
# then re-export it as JSON Lines.
raw_train_dataset = load_dataset("json", split="train", data_files=TRAIN_PATH)
raw_train_dataset.to_json(output_path, lines=True, orient="records")
print("Training dataset saved to {}".format(output_path))

Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Training dataset saved to /content/data/train_dataset.jsonl


## Save Test Dataset

### Subtask:
Save the provided test dataset to `/content/data/test_dataset.jsonl`.


**Reasoning**:
The subtask requires saving the `raw_test_dataset` to a specified path in JSONL format. Similar to the previous step for the training data, I will ensure `raw_test_dataset` is loaded and then use the `to_json` method with `lines=True`.



In [None]:
from datasets import load_dataset

# Source and destination of the test split.
TEST_PATH = "/content/test_dataset_100.jsonl"
output_path = "/content/data/test_dataset.jsonl"

# Reload the raw file so this cell does not rely on earlier kernel state,
# then re-export it as JSON Lines.
raw_test_dataset = load_dataset("json", split="train", data_files=TEST_PATH)
raw_test_dataset.to_json(output_path, lines=True, orient="records")
print("Test dataset saved to {}".format(output_path))

Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Test dataset saved to /content/data/test_dataset.jsonl


## Provide Dataset Paths

### Subtask:
Output the full paths to the saved train and test datasets for the user.


**Reasoning**:
The subtask requires printing the full paths of the saved training and test datasets. I will define these paths as strings and then print them to the console for the user.



In [None]:
# Final locations of the exported datasets.
train_dataset_path = "/content/data/train_dataset.jsonl"
test_dataset_path = "/content/data/test_dataset.jsonl"

# One line per path, training set first.
print(
    f"Saved Training Dataset Path: {train_dataset_path}",
    f"Saved Test Dataset Path: {test_dataset_path}",
    sep="\n",
)

Saved Training Dataset Path: /content/data/train_dataset.jsonl
Saved Test Dataset Path: /content/data/test_dataset.jsonl


## Final Task

### Subtask:
Confirm that the new datasets have been saved and their paths provided.


## Summary:

### Data Analysis Key Findings
*   A new directory, `/content/data/`, was successfully created to store the processed datasets.
*   The training dataset, `raw_train_dataset`, was successfully loaded from `/content/train_dataset_400.jsonl` and saved to `/content/data/train_dataset.jsonl`.
*   The test dataset, `raw_test_dataset`, was loaded from `/content/test_dataset_100.jsonl` and saved to `/content/data/test_dataset.jsonl`. The captured output shows `Generating train split: 0 examples`, but that is only the first frame of the progress bar (the training load printed the same line), so on its own it does not show that the input file was empty.
*   The full paths to the saved datasets are:
    *   Training Dataset: `/content/data/train_dataset.jsonl`
    *   Test Dataset: `/content/data/test_dataset.jsonl`

### Insights or Next Steps
*   Confirm the row count of `/content/test_dataset_100.jsonl` directly (for example with `len(raw_test_dataset)`) rather than relying on the progress bar, whose captured `0 examples` line is only its initial frame.
*   The new training and test datasets are now available at the specified paths for subsequent data processing or model evaluation tasks.
