In [None]:
!pip install optuna
!pip install evaluate
!pip install rouge_score
import os
import torch
import json
import shutil
from tqdm.auto import tqdm
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from google.colab import drive
import optuna
import gc
import evaluate
from rouge_score import rouge_scorer
from transformers import TrainingArguments, HfArgumentParser

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=17f32ea92f5fdacf3ab49e96eb673342236f79b86729a22f9cd7faeb71fc59a9
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:

# List the Drive root to confirm the mount. This only succeeds after
# drive.mount('/content/drive') has run, so execute the mount cell first.
!ls "/content/drive/MyDrive/"


ls: cannot access '/content/drive/MyDrive/': No such file or directory


In [None]:

from google.colab import drive

# Mount Google Drive; the merged model is written under /content/drive/MyDrive later on.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts still
    # propagate, and surface the reason so a failed mount is diagnosable before
    # the long training run starts.
    print(f"Google Drive mount failed: {mount_error}")

Mounted at /content/drive


# Model and Path Configurations

In [None]:
# MODEL
# Hugging Face Hub ID of the base checkpoint handed to FastLanguageModel.from_pretrained.
HF_MODEL_NAME = "google/gemma-2b-it"
# Forwarded as `max_seq_length` to the model loader and to SFTTrainer
# (presumably counted in tokens — confirm against the Unsloth/TRL docs).
MAX_SEQ_LENGTH = 3000

In [None]:
# PATH CONFIGURATION
# Training data file; read with the `datasets` "json" loader in load_and_format_data().
TRAIN_DATA_FILE = "/content/train_data_final"
# `output_dir` of the TrainingArguments used during the hyperparameter search.
OUTPUT_DIR = "temporary_adapter_dir"
# Destination of the merged final model; lives on Google Drive, so Drive must be mounted.
MODEL_DIR_FINAL = "/content/drive/MyDrive/gemma_2b_bayesian_final"

# Global variables
# Placeholders filled in by load_and_format_data(); None until that function runs.
global_tokenizer = None
global_train_dataset = None
global_test_dataset = None

# Input Data Formation (Chat template)

In [None]:
# Chat template

# Data Formation
def load_and_format_data(train_path):
    """Load the tokenizer, then split and chat-format the training data.

    Fills the module-level ``global_tokenizer``, ``global_train_dataset`` and
    ``global_test_dataset``. In both datasets the ``messages`` column is
    replaced by a ``text`` column rendered with the tokenizer's chat template.

    Args:
        train_path: Location of the JSON data handed to ``load_dataset``;
            every record needs a ``messages`` field.

    Returns:
        None. If the data cannot be loaded, the error is printed and the
        dataset globals are left untouched.
    """
    global global_tokenizer, global_train_dataset, global_test_dataset

    # Only the tokenizer is kept from this load; the model half is dropped
    # immediately and its cached GPU memory released.
    unused_model, global_tokenizer = FastLanguageModel.from_pretrained(model_name = HF_MODEL_NAME)
    del unused_model
    torch.cuda.empty_cache()

    # NOTE(review): no `chat_template` is passed, so Unsloth's default template
    # applies — confirm that is the intended format for this Gemma checkpoint.
    role_mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"}
    global_tokenizer = get_chat_template(global_tokenizer, mapping = role_mapping)

    def formatting_prompts_func(examples):
        """Render every conversation in the batch to a single training string."""
        return {
            "text": [
                global_tokenizer.apply_chat_template(
                    conversation,
                    tokenize=False,
                    add_generation_prompt=False,
                )
                for conversation in examples["messages"]
            ]
        }

    def _render_split(split):
        """Apply the chat template to one split, dropping the raw messages."""
        return split.map(formatting_prompts_func, batched = True, remove_columns = ["messages"])

    try:
        raw_dataset = load_dataset("json", data_files = train_path, split = "train")
    except Exception as e:
        print(f"Error loading data from {train_path}: {e}. Please check the path.")
        return

    # Hold out 5% for evaluation; the fixed seed keeps the split reproducible.
    parts = raw_dataset.train_test_split(test_size=0.05, seed=42)
    global_train_dataset = _render_split(parts["train"])
    global_test_dataset = _render_split(parts["test"])

    print(f"Data split and formatting complete. Training Examples: {len(global_train_dataset)}, Evaluation Examples: {len(global_test_dataset)}")

# Prepare the tokenizer and both datasets once, for every later cell.
load_and_format_data(TRAIN_DATA_FILE)

==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Data split and formatting complete. Training Examples: 475, Evaluation Examples: 25


In [None]:
# Model Initialization
def model_init(trial):
    """Build a fresh 4-bit base model wrapped with LoRA adapters for one trial.

    Args:
        trial: The Optuna trial, or any object exposing ``suggest_categorical``
            and ``number``. May be ``None``: the Hugging Face Trainer passes
            ``None`` when ``train()`` runs outside a hyperparameter search. In
            that case the first ``lora_alpha`` choice and trial number 0 are used.

    Returns:
        The PEFT-wrapped model returned by ``FastLanguageModel.get_peft_model``.
    """
    # Collect unreachable objects from the previous trial first; otherwise
    # empty_cache() cannot hand their GPU memory back.
    gc.collect()
    torch.cuda.empty_cache()

    lora_alpha_choices = [16, 32]
    if trial is None:
        # No search in progress: fall back to deterministic defaults instead of
        # failing with AttributeError on `None.suggest_categorical`.
        lora_alpha_val = lora_alpha_choices[0]
        trial_number = 0
    else:
        # Same parameter name as in the search space, so the trial yields one
        # consistent value for both call sites.
        lora_alpha_val = trial.suggest_categorical("lora_alpha", lora_alpha_choices)
        trial_number = trial.number

    # The tokenizer returned here is discarded; the shared global one is used.
    model, _ = FastLanguageModel.from_pretrained(
        model_name = HF_MODEL_NAME,
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = None,
        load_in_4bit = True,
        device_map = "auto",
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = lora_alpha_val,
        lora_dropout = 0.05,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        # Offset the seed by the trial number so each trial gets its own adapter initialisation.
        random_state = 42 + trial_number,
    )
    return model

# Training Arguments
# Settings shared by every hyperparameter-search trial. learning_rate and
# num_train_epochs are overridden per trial by the search space.
_bf16_available = torch.cuda.is_bf16_supported()

base_training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Effective batch size = 2 x 4 = 8 sequences per optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,
    # Use bf16 where the GPU supports it, otherwise fall back to fp16.
    fp16=not _bf16_available,
    bf16=_bf16_available,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    # HPT settings: evaluate and checkpoint on the same 50-step cadence so the
    # best checkpoint (lowest eval_loss) can be reloaded at the end of a trial.
    eval_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True,
    save_strategy="steps",
    save_steps=50,
    metric_for_best_model="eval_loss",
    report_to="none",
)

# Best HyperParameter Search

In [None]:
# Hyperparameter Search

print("\n Starting Hyperparameter Search")


def compute_objective(metrics):
    """Return the scalar the hyperparameter search optimises.

    Args:
        metrics: Mapping of evaluation metric names to values; must contain
            the key ``"eval_loss"``.

    Returns:
        The value stored under ``"eval_loss"``.

    Raises:
        KeyError: If ``"eval_loss"` is missing from ``metrics``.
    """
    eval_loss = metrics["eval_loss"]
    return eval_loss


class DummyTrial:
    """Minimal stand-in for a search trial that always picks the first/lowest option.

    It lets ``model_init`` be called once before any real search trial exists.
    """

    def __init__(self, number=0):
        # Mirrors the `number` attribute that model_init reads from a trial.
        self.number = number

    def suggest_categorical(self, name, choices):
        """Return the first entry of ``choices``; ``name`` is ignored."""
        first_choice = choices[0]
        return first_choice

    def suggest_float(self, name, low, high, log=False):
        """Return the lower bound ``low``; ``name``, ``high`` and ``log`` are ignored."""
        return low

    def suggest_int(self, name, low, high):
        """Return the lower bound ``low``; ``name`` and ``high`` are ignored."""
        return low

# Build one model up front so the trainer can be constructed; every search
# trial then receives its own fresh model through `model_init`.
initial_model = model_init(DummyTrial())

# Trainer settings
trainer = SFTTrainer(
    model=initial_model,
    tokenizer=global_tokenizer,
    train_dataset=global_train_dataset,
    eval_dataset=global_test_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=False,
    args=base_training_args,
)

# Attached after construction rather than passed to the constructor.
trainer.model_init = model_init


def _search_space(trial):
    """Hyperparameters sampled for each search trial.

    ``lora_alpha`` is not a TrainingArguments field; it is consumed by
    ``model_init``, which asks the trial for the same parameter name.
    """
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 2),
        "lora_alpha": trial.suggest_categorical("lora_alpha", [16, 32]),
    }


best_trial = trainer.hyperparameter_search(
    hp_space=_search_space,
    compute_objective=compute_objective,
    direction="minimize",
    backend="optuna",
    n_trials=5,  # Number of Trials
)

print("\nHyperparameter Search Complete.")
print(f"Best Trial Parameters: {best_trial.hyperparameters}")


 Starting Hyperparameter Search
==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


[I 2025-11-25 13:19:36,645] A new study created in memory with name: no-name-aca5f312-0035-4950-b6d0-dd7268a0a0ea
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)


Step,Training Loss,Validation Loss
50,2.6963,2.527023


Unsloth: Not an error, but GemmaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
[I 2025-11-25 13:24:35,240] Trial 0 finished with value: 2.5270233154296875 and parameters: {'learning_rate': 8.355156912459298e-05, 'num_train_epochs': 1, 'lora_alpha': 32}. Best is trial 0 with value: 2.5270233154296875.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)


Step,Training Loss,Validation Loss
50,1.1723,1.180718


[I 2025-11-25 13:29:41,193] Trial 1 finished with value: 1.1807175874710083 and parameters: {'learning_rate': 0.00012967066776320223, 'num_train_epochs': 1, 'lora_alpha': 16}. Best is trial 1 with value: 1.1807175874710083.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)


Step,Training Loss,Validation Loss
50,4.4458,3.966071


[I 2025-11-25 13:34:48,280] Trial 2 finished with value: 3.9660708904266357 and parameters: {'learning_rate': 1.0874105323864794e-05, 'num_train_epochs': 1, 'lora_alpha': 32}. Best is trial 1 with value: 1.1807175874710083.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 2 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)


Step,Training Loss,Validation Loss
50,0.9862,0.952391
100,1.0568,1.099942


[I 2025-11-25 13:44:35,682] Trial 3 finished with value: 1.0999417304992676 and parameters: {'learning_rate': 0.0003802158466338411, 'num_train_epochs': 2, 'lora_alpha': 16}. Best is trial 3 with value: 1.0999417304992676.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)


Step,Training Loss,Validation Loss
50,2.6649,2.530387


[I 2025-11-25 13:49:44,465] Trial 4 finished with value: 2.5303869247436523 and parameters: {'learning_rate': 2.3575535390606036e-05, 'num_train_epochs': 1, 'lora_alpha': 32}. Best is trial 3 with value: 1.0999417304992676.



Hyperparameter Search Complete.
Best Trial Parameters: {'learning_rate': 0.0003802158466338411, 'num_train_epochs': 2, 'lora_alpha': 16}


# Training on the Best Parameters Found by Hyperparameter Search

In [None]:
#Training on Best parameters found using HyperParameter Search

print("\nFinal Training with Best Hyperparameters")

# Applying the best parameters found.
# learning_rate and lora_alpha are required, so index them directly: a missing
# key fails here with a clear KeyError instead of passing None into
# TrainingArguments / get_peft_model. num_train_epochs keeps its fallback.
best_params = best_trial.hyperparameters
best_lr = best_params["learning_rate"]
best_epochs = best_params.get("num_train_epochs", base_training_args.num_train_epochs)
best_lora_alpha = best_params["lora_alpha"]

# Release the search-time trainer AND the model that was built for it, so their
# GPU memory is free before the final model is loaded. pop(..., None) instead
# of `del` keeps this cell re-runnable.
for stale_name in ("trainer", "initial_model"):
    globals().pop(stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Fresh 4-bit base model; the tokenizer returned here is discarded in favour of
# the shared, chat-templated global_tokenizer.
model, _ = FastLanguageModel.from_pretrained(
    model_name = HF_MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = None,
    load_in_4bit = True,
    device_map = "auto",
)

# Same LoRA layout as in the search; only lora_alpha comes from the best trial.
best_model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = best_lora_alpha,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
)

# Final Trainer
final_training_args = TrainingArguments(
    output_dir = "final_output",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = best_epochs,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 42,
    learning_rate = best_lr,
    # No evaluation or checkpointing during the final run.
    eval_strategy = "no",
    save_strategy = "no",
    # Match the search configuration: without this, the installed wandb
    # integration blocks the run on an interactive API-key prompt.
    report_to = "none",
)

final_trainer = SFTTrainer(
    model = best_model,
    tokenizer = global_tokenizer,
    train_dataset = global_train_dataset,
    eval_dataset = global_test_dataset,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = False,
    args = final_training_args,
)

# Training the model fully using the best settings
final_trainer.train()


Final Training with Best Hyperparameters
==((====))==  Unsloth 2025.11.3: Fast Gemma patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 475 | Num Epochs = 2 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33muttamhamsaraj24[0m ([33muttamhamsaraj24-iiit-bangalore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
10,5.856
20,2.0492
30,1.1037
40,0.976
50,0.9856
60,0.9205
70,0.8923
80,0.8775
90,0.8485
100,0.841


TrainOutput(global_step=120, training_loss=1.4153919696807862, metrics={'train_runtime': 648.8346, 'train_samples_per_second': 1.464, 'train_steps_per_second': 0.185, 'total_flos': 6066550581952512.0, 'train_loss': 1.4153919696807862, 'epoch': 2.0})

## Saving Model into Google Drive

In [None]:
# Saving the models into Google Drive
print(f"\nMerging and saving final merged model to Google Drive at {MODEL_DIR_FINAL}...")

# "merged_16bit" saves the full merged model weights rather than only the adapter.
merged_save_method = "merged_16bit"
best_model.save_pretrained_merged(MODEL_DIR_FINAL, global_tokenizer, save_method=merged_save_method)

print(f"Model saved successfully to Google Drive at {MODEL_DIR_FINAL}")


Merging and saving final merged model to Google Drive at /content/drive/MyDrive/gemma_2b_bayesian_final...


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/5.01G [00:00<?, ?B/s]

## Pushing Model to Hugging Face (Model ID - utt24/qwen_1.5B_Bayesian)

In [None]:
# Hugging Face Login
# Hugging Face Login
from huggingface_hub import logout, login
# Log out first — presumably so the login below asks for a new token instead of
# reusing previously stored credentials (confirm against huggingface_hub docs).
logout()

print("\nWrite Access Token :")
# Called without arguments, so the token is entered interactively and never
# appears in the notebook source. A token with write access is needed to push.
login()
print("DONE")