In [None]:
# Standard library
import gc
import json
import os
import shutil

# Unsloth is imported before transformers/trl so that its patches are applied
# to those libraries when they load.
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# Third-party
import evaluate
import optuna
import torch
from datasets import load_dataset
from google.colab import drive
from rouge_score import rouge_scorer
from tqdm.auto import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
)
from trl import SFTTrainer

In [None]:

from google.colab import drive
# Mount Google Drive so the merged model can be persisted outside the Colab VM.
try:
    drive.mount('/content/drive')
except Exception as e:
    # Catch Exception (not a bare `except:`) so KeyboardInterrupt/SystemExit
    # still propagate, and report the cause instead of hiding it.
    print(f"Google Drive mount failed: {e}")

Mounted at /content/drive


# Model and Path Configurations

In [None]:
# MODEL
# Hugging Face Hub id of the base checkpoint loaded throughout this notebook.
HF_MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
# Passed as `max_seq_length` to the 4-bit model loads and to both SFTTrainer instances.
MAX_SEQ_LENGTH = 2048

In [None]:

# PATH CONFIGURATION
# JSON training file read by load_and_format_data(); records must carry a "messages" field.
TRAIN_DATA_FILE = "/content/train_data_final"
# Output directory given to the TrainingArguments used during hyperparameter search.
OUTPUT_DIR = "temporary_adapter_dir"
# Google Drive destination of the merged 16-bit model.
MODEL_DIR_FINAL = "/content/drive/MyDrive/qwen_raft_final"

# Global variables
# All three are assigned by load_and_format_data() and remain None until it has run.
global_tokenizer = None
global_train_dataset = None
global_test_dataset = None

# Input Data Formation (Chat template)

In [None]:

# Data Formation
def load_and_format_data(train_path):
    """Load the chat dataset, render it with the ChatML template and split it.

    Assigns the module-level ``global_tokenizer``, ``global_train_dataset``
    and ``global_test_dataset``; nothing is returned.

    Args:
        train_path: Path of the JSON data file. Every record must have a
            ``"messages"`` field that the tokenizer's chat template can
            render (presumably a list of role/content dicts; verify against
            the data file).

    Raises:
        Exception: Whatever ``load_dataset`` raised. The diagnostic is printed
            and the error is re-raised, because continuing would leave the
            dataset globals as ``None`` and make a later cell fail with an
            unrelated-looking error.
    """
    global global_tokenizer, global_train_dataset, global_test_dataset

    # Tokenizer: the loader returns (model, tokenizer); only the tokenizer is
    # kept, so the model is dropped again straight away.
    model_stub, global_tokenizer = FastLanguageModel.from_pretrained(
        model_name = HF_MODEL_NAME,
        max_seq_length = MAX_SEQ_LENGTH,
    )
    del model_stub
    # Collect first so the discarded model's tensors are actually released
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

    # Qwen ChatML template
    global_tokenizer = get_chat_template(
        global_tokenizer,
        chat_template = "chatml",
        mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"}
    )

    def formatting_prompts_func(examples):
        """Render each conversation of a batch into one training string."""
        rendered = [
            global_tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
            )
            for messages in examples["messages"]
        ]
        return { "text" : rendered }

    try:
        raw_dataset = load_dataset("json", data_files = train_path, split = "train")
    except Exception as e:
        print(f"Error loading data from {train_path}: {e}. Please check the path.")
        raise

    # 95% train and 5% evaluation (fixed seed keeps the split reproducible)
    split_dataset = raw_dataset.train_test_split(test_size=0.05, seed=42)
    raw_train = split_dataset["train"]
    raw_eval = split_dataset["test"]

    # Only the rendered "text" column is kept; the raw "messages" column is dropped.
    global_train_dataset = raw_train.map(formatting_prompts_func, batched = True, remove_columns = ["messages"])
    global_test_dataset = raw_eval.map(formatting_prompts_func, batched = True, remove_columns = ["messages"])

    print(f"Data split and formatting complete. Training Examples: {len(global_train_dataset)}, Evaluation Examples: {len(global_test_dataset)}")

# Preparing datasets Globally
load_and_format_data(TRAIN_DATA_FILE)

==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Data split and formatting complete. Training Examples: 475, Evaluation Examples: 25


In [None]:


# Model Initialization
def model_init(trial = None):
    """Create a fresh 4-bit base model with LoRA adapters attached.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``number`` (an
            Optuna trial or ``DummyTrial``). ``None`` is accepted and selects
            ``lora_alpha = 16`` with a seed offset of ``0``, so the function
            can also be called outside of a search.

    Returns:
        The PEFT-wrapped model returned by ``FastLanguageModel.get_peft_model``.
    """
    # Collect before emptying the cache: memory of a model left over from a
    # previous call can only be released once its Python objects are gone.
    gc.collect()
    torch.cuda.empty_cache()

    if trial is None:
        lora_alpha_val = 16
        seed_offset = 0
    else:
        lora_alpha_val = trial.suggest_categorical("lora_alpha", [16, 32])
        seed_offset = trial.number

    model, _ = FastLanguageModel.from_pretrained(
        model_name = HF_MODEL_NAME,
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = None,
        load_in_4bit = True,
        device_map = "auto",
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = lora_alpha_val,
        lora_dropout = 0.05,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        # A different adapter-initialisation seed for every trial.
        random_state = 42 + seed_offset,
    )
    return model

# Training Arguments
# Probe the GPU once: bf16 is used when it is supported, fp16 otherwise.
_bf16_supported = torch.cuda.is_bf16_supported()

base_training_args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    report_to = "none",
    seed = 42,
    # Batching
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Precision
    bf16 = _bf16_supported,
    fp16 = not _bf16_supported,
    # Optimiser and schedule
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    num_train_epochs = 1,
    logging_steps = 10,
    # HPT settings: evaluation and checkpointing share the same 50-step cadence,
    # and the checkpoint with the best eval_loss is restored when training ends.
    # NOTE(review): the run log shows 60 optimiser steps per epoch, so a 1-epoch
    # trial is only evaluated at step 50 - confirm that is intended.
    eval_strategy = "steps",
    eval_steps = 50,
    save_strategy = "steps",
    save_steps = 50,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
)

# Best HyperParameter Search

In [None]:

# Hyperparameter Search

print("\n Starting Hyperparameter Search")

def compute_objective(metrics):
    """Return the value the search optimises: the ``"eval_loss"`` entry of ``metrics``."""
    eval_loss = metrics["eval_loss"]
    return eval_loss


class DummyTrial:
    """Stand-in trial object that always answers with the first / lowest option.

    It is passed to ``model_init`` to build a model before any real search
    trial exists.
    """

    def __init__(self, number=0):
        self.number = number

    def suggest_categorical(self, name, choices):
        """Return the first entry of ``choices``; ``name`` is ignored."""
        first_choice = choices[0]
        return first_choice

    def suggest_float(self, name, low, high, log=False):
        """Return the lower bound ``low``; the other arguments are ignored."""
        return low

    def suggest_int(self, name, low, high):
        """Return the lower bound ``low``; the other arguments are ignored."""
        return low

# A concrete model is needed to construct the trainer; once `model_init` is
# attached below, every trial loads its own fresh model instead.
initial_model = model_init(DummyTrial())

# Trainer settings
trainer = SFTTrainer(
    model = initial_model,
    tokenizer = global_tokenizer,
    train_dataset = global_train_dataset,
    eval_dataset = global_test_dataset,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = False,
    args = base_training_args,
)

trainer.model_init = model_init

best_trial = trainer.hyperparameter_search(
    direction = "minimize",
    backend = "optuna",
    # Only TrainingArguments fields belong here. `lora_alpha` is suggested
    # inside `model_init`, which records it on the trial, so it still shows up
    # in `best_trial.hyperparameters`; listing it here as well only triggers
    # the "no corresponding field in TrainingArguments" warning on every trial.
    hp_space = lambda x: {
        "learning_rate": x.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "num_train_epochs": x.suggest_int("num_train_epochs", 1, 2),
    },
    compute_objective = compute_objective,
    n_trials = 7, # Number of Trials
    # Extra keyword arguments are forwarded to Optuna: a seeded sampler makes
    # the sequence of sampled configurations reproducible, and gc_after_trial
    # runs a garbage collection between trials to release the previous model.
    sampler = optuna.samplers.TPESampler(seed = 42),
    gc_after_trial = True,
)

print("\nHyperparameter Search Complete.")
print(f"Best Trial Parameters: {best_trial.hyperparameters}")


--- Starting Hyperparameter Search (Optuna) ---
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/475 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/25 [00:00<?, ? examples/s]

[I 2025-11-24 16:38:55,241] A new study created in memory with name: no-name-f9887bdf-4742-4966-8c42-c121407e38ff
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 2 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,2.3281,2.218557
100,1.9702,1.975728


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
[I 2025-11-24 16:46:47,216] Trial 0 finished with value: 1.9757282733917236 and parameters: {'learning_rate': 8.052677395240362e-05, 'num_train_epochs': 2, 'lora_alpha': 16}. Best is trial 0 with value: 1.9757282733917236.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,0.8468,0.876388


[I 2025-11-24 16:50:53,744] Trial 1 finished with value: 0.8763884902000427 and parameters: {'learning_rate': 0.0004385251777942248, 'num_train_epochs': 1, 'lora_alpha': 32}. Best is trial 1 with value: 0.8763884902000427.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,0.9682,1.003006


[I 2025-11-24 16:55:00,517] Trial 2 finished with value: 1.0030059814453125 and parameters: {'learning_rate': 0.00017792978707833625, 'num_train_epochs': 1, 'lora_alpha': 16}. Best is trial 1 with value: 0.8763884902000427.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 2 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,2.2045,2.119349
100,2.0098,2.021261


[I 2025-11-24 17:02:42,955] Trial 3 finished with value: 2.02126145362854 and parameters: {'learning_rate': 1.777326729594054e-05, 'num_train_epochs': 2, 'lora_alpha': 16}. Best is trial 1 with value: 0.8763884902000427.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 2 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,0.9307,0.919944
100,0.8403,0.924981


[I 2025-11-24 17:10:21,479] Trial 4 finished with value: 0.9249810576438904 and parameters: {'learning_rate': 0.0001606407737712337, 'num_train_epochs': 2, 'lora_alpha': 32}. Best is trial 1 with value: 0.8763884902000427.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,0.953,0.941771


[I 2025-11-24 17:14:29,399] Trial 5 finished with value: 0.9417714476585388 and parameters: {'learning_rate': 0.0002741518522982801, 'num_train_epochs': 1, 'lora_alpha': 16}. Best is trial 1 with value: 0.8763884902000427.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,0.8761,0.899995


[I 2025-11-24 17:18:37,980] Trial 6 finished with value: 0.8999946713447571 and parameters: {'learning_rate': 0.00044813673453595487, 'num_train_epochs': 1, 'lora_alpha': 16}. Best is trial 1 with value: 0.8763884902000427.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,0.9206,0.996535


[I 2025-11-24 17:22:07,272] Trial 7 pruned. 
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,1.7368,1.691195


[I 2025-11-24 17:25:32,913] Trial 8 pruned. 
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 2 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Step,Training Loss,Validation Loss
50,2.1869,2.138711


[I 2025-11-24 17:28:54,833] Trial 9 pruned. 



Hyperparameter Search Complete.
Best Trial Parameters: {'learning_rate': 0.0004385251777942248, 'num_train_epochs': 1, 'lora_alpha': 32}


# Training on the Best Parameters Found by the Hyperparameter Search

In [None]:



print("\nFinal Training with Best Hyperparameters")

# Applying the best parameters found. The learning rate and lora_alpha are
# required, so they are indexed directly and a missing key fails here rather
# than as an obscure error deeper in the training stack.
best_params = best_trial.hyperparameters
best_lr = best_params["learning_rate"]
best_epochs = best_params.get("num_train_epochs", base_training_args.num_train_epochs)
best_lora_alpha = best_params["lora_alpha"]

# Release every search-time object that still references a model before the
# final model is loaded. `pop` (instead of `del`) keeps the cell re-runnable.
for _stale_name in ("trainer", "initial_model"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()


model, _ = FastLanguageModel.from_pretrained(
    model_name = HF_MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = True,
    device_map = "auto",
)

best_model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = best_lora_alpha,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
)

# Final Trainer
final_training_args = TrainingArguments(
    output_dir = "final_output",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = best_epochs,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 42,
    learning_rate = best_lr,
    # No evaluation or checkpointing in the final run; the model is saved
    # explicitly afterwards.
    eval_strategy = "no",
    save_strategy = "no",
    # Same as the search configuration: without this the run stops at an
    # interactive Weights & Biases API-key prompt.
    report_to = "none",
)

final_trainer = SFTTrainer(
    model = best_model,
    tokenizer = global_tokenizer,
    train_dataset = global_train_dataset,
    eval_dataset = global_test_dataset,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = False,
    args = final_training_args,
)

# Training the model fully using the best settings
final_trainer.train()


--- 7. Final Training with Best Hyperparameters ---
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33muttamhamsaraj24[0m ([33muttamhamsaraj24-iiit-bangalore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
10,2.0256
20,0.9687
30,0.9164
40,0.8413
50,0.8871
60,0.8352


TrainOutput(global_step=60, training_loss=1.0790509700775146, metrics={'train_runtime': 354.4906, 'train_samples_per_second': 1.34, 'train_steps_per_second': 0.169, 'total_flos': 2082495812434944.0, 'train_loss': 1.0790509700775146, 'epoch': 1.0})

## Saving Model into Google Drive

In [None]:

# Saving the models into Google Drive
# If the destination's parent folder does not exist (for example because the
# Drive mount failed earlier), stop here: otherwise the model would be written
# to the temporary Colab disk while the messages below claim it is on Drive.
_save_parent = os.path.dirname(os.path.abspath(MODEL_DIR_FINAL))
if not os.path.isdir(_save_parent):
    raise FileNotFoundError(
        f"Destination folder {_save_parent} does not exist. Is Google Drive mounted?"
    )

print(f"\nMerging and saving final merged model to Google Drive at {MODEL_DIR_FINAL}...")

best_model.save_pretrained_merged(
    MODEL_DIR_FINAL,
    global_tokenizer,
    save_method = "merged_16bit", # Saving the full merged model weights
)
print(f"Model saved successfully to Google Drive at {MODEL_DIR_FINAL}")


Merging LoRA adapters and saving final merged model to Google Drive at /content/drive/MyDrive/qwen_raft_final...


config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:51<00:00, 51.77s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [01:37<00:00, 97.14s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/qwen_raft_final`
Model saved successfully to Google Drive at /content/drive/MyDrive/qwen_raft_final


## Pushing the Model to Hugging Face (Model ID: utt24/qwen_1.5B_Bayesian)

In [None]:
# Hugging Face Login
from huggingface_hub import logout, login
logout() # Log out any potentially stale session

# login() is called without arguments, so the token is entered interactively
# and never appears in the notebook source.
print("\nWrite Access Token :")
login()
print("DONE")

Not logged in!



Please re-enter your Write Access Token now:


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

DONE


In [None]:
# GOOGLE DRIVE MOUNT
try:
    print("Mounting Google Drive...")
    drive.mount('/content/drive')
except Exception as e:
    print(f"Google Drive mount failed: {e}")

#  model path on Google Drive
# Kept as a literal so this cell can be run in a later session without the
# training cells; it must match the directory the merged model was saved to.
DRIVE_MODEL_PATH = "/content/drive/MyDrive/qwen_raft_final"

# Configuration for the Hub
HUB_REPO_ID = "utt24/qwen_1.5B_Bayesian"
COMMIT_MESSAGE = "Final merge of Qwen 1.5B"


print(f"\nLoading merged model from Drive ({DRIVE_MODEL_PATH})...")

# Fail early with a clear message when the merged model is not where expected.
if not os.path.exists(DRIVE_MODEL_PATH):
    raise FileNotFoundError(f"Model directory not found at {DRIVE_MODEL_PATH}. Check your Drive path.")

try:
    model = AutoModelForCausalLM.from_pretrained(
        DRIVE_MODEL_PATH,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(DRIVE_MODEL_PATH, trust_remote_code=True)

except Exception as e:
    print(f"Error during model loading: {e}")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

# Pushing Model
print(f"\nPushing model and tokenizer to Hugging Face Hub as: {HUB_REPO_ID}")


model.push_to_hub(
    HUB_REPO_ID,
    commit_message=COMMIT_MESSAGE,
    safe_serialization=True
)

# `safe_serialization` only concerns model weight files, so it is not passed
# for the tokenizer upload.
tokenizer.push_to_hub(
    HUB_REPO_ID,
    commit_message=COMMIT_MESSAGE,
)

print("\n Model successfully uploaded to Hugging Face Hub!")
print(f"You can now access it using the ID: {HUB_REPO_ID}")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Loading merged model from Drive (/content/drive/MyDrive/qwen_raft_final)...





Pushing model and tokenizer to Hugging Face Hub as: utt24/qwen_1.5B_Bayesian




Saving checkpoint shards:   0%|          | 0/1 [00:00<?, ?it/s]