# Experimentation with Google's Gemma3 models

### Fine Tuning Components
- Supervised Fine Tuning (SFT)
- Low Rank Adaptations (LoRA)

### Dataset
- Stamford's question answering dataset (link: `https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset/data`)

### Environment Setup

In [1]:
!pip3 install -q -U bitsandbytes
!pip3 install -q -U peft
!pip3 install -q -U trl
!pip3 install -q -U accelerate
!pip3 install -q -U datasets
!pip3 install -q -U transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip freeze | grep bitsandbytes
!pip freeze | grep peft
!pip freeze | grep trl
!pip freeze | grep accelerate
!pip freeze | grep datasets
!pip freeze | grep transformers

bitsandbytes==0.45.5
peft==0.15.2
fastrlock==0.8.2
trl==0.16.1
accelerate==1.6.0
datasets==3.5.0
tensorflow-datasets==4.9.7
vega-datasets==0.9.0
sentence-transformers==3.3.1
transformers==4.51.3


In [3]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
from transformers import Trainer, TrainingArguments

In [4]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

hf_token_name = "HF_TOKEN_EG"
hf_key = UserSecretsClient().get_secret(hf_token_name)
print(f"Successfully loaded {hf_token_name}!")

login(token = hf_key)
print(f"Login with {hf_token_name} complete!")

Successfully loaded HF_TOKEN_EG!
Login with HF_TOKEN_EG complete!


### Load Data

In [5]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-question-answering-dataset/train-v1.1.json
/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json


In [6]:
from datasets import load_dataset

# Load SQuAD dataset
dataset = load_dataset("squad")

# Use only a portion of the dataset
subset_size = 2500  # Adjust this value to control the subset size
train_subset = dataset["train"].shuffle(seed=42).select(range(subset_size))
validation_subset = dataset["validation"].shuffle(seed=42).select(range(subset_size))

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

### Load Model

In [7]:
# Adjust precision and attention based on GPU
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
    !pip install -qqq flash-attn
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"
    
# BitsAndBytes configuration for memory-efficient model loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

model_name = "google/gemma-2-2b-it"  
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="cuda:0", # do i need to make a custom device mapping dict (future for Gemma3)?
    attn_implementation=attn_implementation,
    offload_folder="./offload"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [8]:
# Apply LoRA configuration
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    r=16,                         
    lora_alpha=32,                
    lora_dropout=0.1,
    bias="none"
)

# Add LoRA adapters to the model
model = get_peft_model(model, peft_config)

# # Freeze all parameters except LoRA parameters
# for name, param in model.named_parameters():
#     if "lora" not in name:
#         param.requires_grad = False  

### Data Preprocessing

In [9]:
def preprocess_function(examples):
    # Tokenize the question
    inputs = tokenizer(
        examples["question"], 
        max_length=256, 
        truncation=True, 
        padding="max_length"
    )
    
    # Extract the first answer for each example in the batch
    answers = [ans["text"][0] if len(ans["text"]) > 0 else "" for ans in examples["answers"]]
    
    # Tokenize the answers
    outputs = tokenizer(
        answers, 
        max_length=256, 
        truncation=True, 
        padding="max_length"
    )
    
    # Assign tokenized outputs as labels
    inputs["labels"] = outputs["input_ids"]
    return inputs

In [10]:
# Tokenize our train and validation datasets

tokenized_train_subset = train_subset.map(preprocess_function, batched=True)
tokenized_validation_subset = validation_subset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [11]:
print(tokenized_train_subset[0]) # check one sample

{'id': '573173d8497a881900248f0c', 'title': 'Egypt', 'context': 'The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.', 'question': 'What percentage of Egyptians polled support death penalty for those leaving Islam?', 'answers': {'text': ['84%'], 'answer_start': [468]}, 'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

### Training

In [12]:
# Disable W&B for this run only
os.environ["WANDB_MODE"] = "disabled"

In [13]:
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=tokenized_train_subset,
#     eval_dataset=tokenized_validation_subset,
#     args=transformers.TrainingArguments(
#         output_dir="./results",
#         eval_strategy="epoch",
#         per_device_train_batch_size=1,
#         gradient_accumulation_steps=4,
#         warmup_steps=2,
#         max_steps=100,
#         learning_rate=2e-4, #"5e-5"
#         fp16=True,
#         optim="paged_adamw_8bit",
#         save_strategy="epoch",
#         logging_dir="./logs",
#         push_to_hub=False,
#         weight_decay=0.01,
#         run_name="gemma2-testrun"
#     ),
#     peft_config=peft_config,
# )

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Updated from evaluation_strategy
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=100,
    weight_decay=0.01,
    fp16=True,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    logging_dir="./logs",
    push_to_hub=False,
    run_name="gemma2-testrun",  # Optional custom run name for W&B
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_subset,
    eval_dataset=tokenized_validation_subset,
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
0,No log,0.531646


TrainOutput(global_step=100, training_loss=4.390454406738281, metrics={'train_runtime': 838.7968, 'train_samples_per_second': 0.477, 'train_steps_per_second': 0.119, 'total_flos': 1256622863155200.0, 'train_loss': 4.390454406738281, 'epoch': 0.16})