In [None]:
# Add this to prevent timeouts due to inactivity
from IPython.display import display, Javascript
import time

def keep_alive():
    """Inject JavaScript that clicks the Colab connect button once a minute.

    The script is sent to the browser through ``display(Javascript(...))``;
    nothing is returned. Two safeguards are built into the script:

    * the click is skipped when the button cannot be found, instead of
      throwing ``TypeError`` on a ``null`` element every minute;
    * a timer left by a previous run of this cell in the same page is
      cleared first, so re-running the cell does not stack up intervals.
    """
    display(Javascript('''
        function click() {
            console.log("Keeping session alive...");
            const button = document.querySelector("colab-toolbar-button#connect");
            if (button) {
                button.click();
            }
        }
        if (window.keepAliveTimer) {
            clearInterval(window.keepAliveTimer);
        }
        window.keepAliveTimer = setInterval(click, 60000);
        '''))

keep_alive()

<IPython.core.display.Javascript object>

# **Fine-tuning Gemma3 (12B) with Unsloth**
This notebook fine-tunes `gemma-3-12b-it` on conversational data written in Maguy Bou Ghosn's speaking style, using the Unsloth framework.


## 1. Setup and Installation

In [None]:
%%capture
# Install Unsloth, vLLM and the Gemma-3 build of transformers.
# %%capture hides this cell's output, so the pip logs are not displayed.
import os
# A Colab runtime is detected by looking for "COLAB_" in the names of the
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm
# Install latest Hugging Face for Gemma-3!
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

## 2. Model Loading and Configuration


In [None]:
from unsloth import FastModel
import torch

# Load the `unsloth/gemma-3-12b-it-unsloth-bnb-4bit` checkpoint with 4-bit
# loading enabled. The returned `model` and `tokenizer` are the objects every
# later cell in this notebook works on.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-09 12:31:55 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/259k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.01G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

## 3. Set up PEFT (Parameter-Efficient Fine-Tuning) with LoRA

In [None]:
# Wrap the loaded model for LoRA fine-tuning. `model` is rebound to the wrapped
# model, so this cell should be run once per freshly loaded model.
model = FastModel.get_peft_model(
    model,
    # Only the language side is tuned; the vision layers are left out.
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,

    r = 16,              # LoRA rank
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407, # same seed value as the trainer config below
)

Unsloth: Making `model.base_model.model.language_model.model` require gradients


## 4. Data Preparation

We use the `Gemma-3` format for conversation style finetunes. Gemma-3 renders multi turn conversations like below:

```
<bos><start_of_turn>user
Hello!<end_of_turn>
<start_of_turn>model
Hey there!<end_of_turn>
```

We use the `get_chat_template` function to get the correct chat template.

In [None]:
from unsloth.chat_templates import get_chat_template
# Attach the "gemma-3" chat template to the tokenizer; `tokenizer` is rebound
# to the object returned by `get_chat_template`.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

We load the data and convert it to a Hugging Face `Dataset` using `Dataset.from_list`.

In [None]:
from datasets import Dataset
import json

# Read the raw conversation records. The encoding is given explicitly because
# the file contains Arabic text and the platform default is not guaranteed to
# be UTF-8.
with open("/content/Maguy_data.json", "r", encoding = "utf-8") as file:
    conversation_records = json.load(file)

# Keep the plain Python list and the Hugging Face dataset under separate names
# so `dataset` always refers to a `Dataset`.
dataset = Dataset.from_list(conversation_records)

We now use `standardize_data_formats` to try converting datasets to the correct format for finetuning purposes!

In [None]:
from unsloth.chat_templates import standardize_data_formats
# Normalize the conversation records with Unsloth's helper; `dataset` is
# rebound to the standardized result.
dataset = standardize_data_formats(dataset)

Unsloth: Standardizing formats (num_proc=12):   0%|          | 0/3005 [00:00<?, ? examples/s]

Let's see what row 100 looks like!

In [None]:
# Inspect one standardized example (index 100).
dataset[100]

{'conversations': [{'content': 'أنت ماغي أبو غصن، ممثلة لبنانية معروفة. تتحدثين باللهجة اللبنانية الأصيلة، مستخدمة تعابير وعبارات شائعة في لبنان. تعكس إجاباتك شخصيتك النابضة بالحياة، وآرائك القوية، وخلفيتك الثقافية. عند الإجابة على الأسئلة، تدمجين أنماط كلامك المميزة، والتعبيرات اللبنانية العامية، وتحافظين على أسلوب حديثك الطبيعي.',
   'role': 'system'},
  {'content': 'شو بتنصحي كل حد عندو ظروف صعبة؟', 'role': 'user'},
  {'content': 'شوف انا دايما بقول الحياة نعمة، والله بحياتنا هو كل شي. بس تكون عندك ايمان بالله وثقة بالله ومقدر نعمة الحياة اللي عاطينا ياها، بدك تتخطى الصعاب كلها وما تستسلم وتقول يا رب. بس تضلك تقول يا رب ساعدني يا رب ساعدني بساعدنا بخلينا وبيعطينا الايمان والقوة انه نتخطى صعابنا لنقدر نحقق يلي بدنا ياه.',
   'role': 'assistant'}]}

We now have to apply the chat template for `Gemma-3` onto the conversations, and save it to `text`

In [None]:
def apply_chat_template(examples):
    """Render a batch of conversations into Gemma-3 chat text.

    Args:
        examples: a batch from ``dataset.map(..., batched=True)``; its
            ``"conversations"`` entry holds one list of messages per example.

    Returns:
        A dict with a single ``"text"`` key: one rendered string per example,
        with the template's leading ``<bos>`` removed.
    """
    bos = "<bos>"
    texts = tokenizer.apply_chat_template(examples["conversations"])
    # The chat template already emits a leading <bos>, and tokenizing the
    # `text` column for training prepends another one, so the decoded training
    # inputs start with "<bos><bos>". Strip the template's copy here so that
    # exactly one remains after tokenization.
    texts = [text[len(bos):] if text.startswith(bos) else text for text in texts]
    return { "text" : texts }

dataset = dataset.map(apply_chat_template, batched = True)

Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Let's see how the chat template did! Notice `Gemma-3` default adds a `<bos>`!

In [None]:
# Show the rendered chat text for the same example.
dataset[100]["text"]

'<bos><start_of_turn>user\nأنت ماغي أبو غصن، ممثلة لبنانية معروفة. تتحدثين باللهجة اللبنانية الأصيلة، مستخدمة تعابير وعبارات شائعة في لبنان. تعكس إجاباتك شخصيتك النابضة بالحياة، وآرائك القوية، وخلفيتك الثقافية. عند الإجابة على الأسئلة، تدمجين أنماط كلامك المميزة، والتعبيرات اللبنانية العامية، وتحافظين على أسلوب حديثك الطبيعي.\n\nشو بتنصحي كل حد عندو ظروف صعبة؟<end_of_turn>\n<start_of_turn>model\nشوف انا دايما بقول الحياة نعمة، والله بحياتنا هو كل شي. بس تكون عندك ايمان بالله وثقة بالله ومقدر نعمة الحياة اللي عاطينا ياها، بدك تتخطى الصعاب كلها وما تستسلم وتقول يا رب. بس تضلك تقول يا رب ساعدني يا رب ساعدني بساعدنا بخلينا وبيعطينا الايمان والقوة انه نتخطى صعابنا لنقدر نحقق يلي بدنا ياه.<end_of_turn>\n'

## 5. Training Configuration

In [None]:
from trl import SFTTrainer, SFTConfig
# Build the supervised fine-tuning trainer over the rendered `text` column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    args = SFTConfig(
        dataset_text_field = "text", # column written by apply_chat_template
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 examples per optimizer step on one GPU
        warmup_steps = 5,
        num_train_epochs = 5,
        learning_rate = 2e-4,
        logging_steps = 1, # log the training loss at every step
        save_strategy = "epoch",
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # no external experiment tracker
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/3005 [00:00<?, ? examples/s]

## 6. Response-Only Training

We also use Unsloth's `train_on_responses_only` function to train only on the assistant outputs and ignore the loss on the user's inputs. This helps increase the accuracy of finetunes!

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer so that only assistant turns contribute to the loss.
# The two markers are the Gemma-3 turn headers that open a user turn and a
# model turn in the rendered text.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=12):   0%|          | 0/3005 [00:00<?, ? examples/s]

Let's verify masking the instruction part is done! Let's print the 100th row again:

In [None]:
# Decode the token ids of example 100 from the trainer's tokenized dataset to
# see the exact text the model is trained on.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><bos><start_of_turn>user\nأنت ماغي أبو غصن، ممثلة لبنانية معروفة. تتحدثين باللهجة اللبنانية الأصيلة، مستخدمة تعابير وعبارات شائعة في لبنان. تعكس إجاباتك شخصيتك النابضة بالحياة، وآرائك القوية، وخلفيتك الثقافية. عند الإجابة على الأسئلة، تدمجين أنماط كلامك المميزة، والتعبيرات اللبنانية العامية، وتحافظين على أسلوب حديثك الطبيعي.\n\nشو بتنصحي كل حد عندو ظروف صعبة؟<end_of_turn>\n<start_of_turn>model\nشوف انا دايما بقول الحياة نعمة، والله بحياتنا هو كل شي. بس تكون عندك ايمان بالله وثقة بالله ومقدر نعمة الحياة اللي عاطينا ياها، بدك تتخطى الصعاب كلها وما تستسلم وتقول يا رب. بس تضلك تقول يا رب ساعدني يا رب ساعدني بساعدنا بخلينا وبيعطينا الايمان والقوة انه نتخطى صعابنا لنقدر نحقق يلي بدنا ياه.<end_of_turn>\n'

Now let's print the masked out example - you should see only the answer is present:

In [None]:
# Labels equal to -100 are swapped for the pad token id so the sequence can be
# decoded; the pad token text is then replaced by spaces, leaving only the
# tokens that still carry a label visible.
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                                                                                                                                             شوف انا دايما بقول الحياة نعمة، والله بحياتنا هو كل شي. بس تكون عندك ايمان بالله وثقة بالله ومقدر نعمة الحياة اللي عاطينا ياها، بدك تتخطى الصعاب كلها وما تستسلم وتقول يا رب. بس تضلك تقول يا رب ساعدني يا رب ساعدني بساعدنا بخلينا وبيعطينا الايمان والقوة انه نتخطى صعابنا لنقدر نحقق يلي بدنا ياه.<end_of_turn>\n'

## 7. Train the Model

In [None]:
# @title Show current memory stats
# Snapshot GPU memory before training. `start_gpu_memory` and `max_memory`
# (both in GB, rounded to 3 decimals) are read again by the final-stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
12.355 GB of memory reserved.


In [None]:
# Run fine-tuning. The returned stats are kept because the final-stats cell
# reads `trainer_stats.metrics['train_runtime']`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,005 | Num Epochs = 5 | Total steps = 1,875
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 65,470,464/12,000,000,000 (0.55% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,4.7829
2,4.0841
3,3.9525
4,4.2048
5,3.4996
6,2.8866
7,2.8248
8,2.9911
9,3.0507
10,2.7231


Step,Training Loss
1,4.7829
2,4.0841
3,3.9525
4,4.2048
5,3.4996
6,2.8866
7,2.8248
8,2.9911
9,3.0507
10,2.7231


In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory after training, in GB, compared with the snapshot
# taken before training (`start_gpu_memory`) and the device total (`max_memory`).
used_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

7853.2068 seconds used for training.
130.89 minutes used for training.
Peak reserved memory = 13.324 GB.
Peak reserved memory for training = 0.969 GB.
Peak reserved memory % of max memory = 33.683 %.
Peak reserved memory for training % of max memory = 2.45 %.


## 8. Inference

Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`

In [None]:
# Generate a reply to a single user message with the fine-tuned model.
from unsloth.chat_templates import get_chat_template
# Same template call as in the data-preparation section, applied again here.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
# One user turn; the content is given as a list of typed parts.
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "شو كان شعورك وقت عرفتي بإصابتك بمرض خطير",
    }]
}]
# add_generation_prompt appends the opening of the model turn so generation
# continues as the assistant.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 128,
    # Recommended Gemma-3 settings
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
# The decoded output contains the prompt followed by the generated reply.
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\nشو كان شعورك وقت عرفتي بإصابتك بمرض خطير<end_of_turn>\n<start_of_turn>model\nما بنسى هاليوم. كنت عم بعمل فحوصات روتينية وعملت تحليل للدم، واجت النتيجة وعم خبرني الدكتور خبر صادم. حسيت الدنيا توقفت، وكل شي صار بطيء كتير. أول شي بكى، بكيت كتير، وبعدين حسيت بقبول غريب. عرفت إنو لازم قاوم وأواجه هيدا المرض بكل قوتي.<end_of_turn>']

 We can also use a `TextStreamer` for continuous inference

In [None]:
# Stream a reply token by token instead of decoding it after generation ends.
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "ممكن تقوليلي نكتة؟",}]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 128,
    # Recommended Gemma-3 settings (top_p is 0.95, matching the other
    # inference cells in this notebook)
    temperature = 1.0, top_p = 0.95, top_k = 64,
    # skip_prompt = True prints only the newly generated text.
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

مرة رحت على كوكب المريخ! سألت واحد مريخي: 'بدي أحكي معك شوي؟' قلي: 'أنا مشغول، عندي مقابلة مع كوكب تاني!'<end_of_turn>


## 9. Saving, loading finetuned models

### Saving Lora Adapters



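To save the final model as LoRA adapters, either use Hugging Face's `push_to_hub` for an online save or `save_pretrained` for a local save. Note that the cell below does something different: it calls `push_to_hub_merged`, which merges the adapters into the base model and uploads the merged 16-bit weights.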


In [None]:
if True: # Set to False to skip uploading the merged finetune
    import getpass
    import os

    # Never keep the access token in the notebook source: read it from the
    # HF_TOKEN environment variable, or ask for it interactively if unset.
    hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
    model.push_to_hub_merged(
        "lara1510/gemma-3-finetune-maguy", tokenizer,
        token = hf_token,
    )

  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Downloading safetensors index for unsloth/gemma-3-12b-it...


model.safetensors.index.json:   0%|          | 0.00/109k [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  20%|██        | 1/5 [01:27<05:49, 87.35s/it]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  40%|████      | 2/5 [02:56<04:24, 88.31s/it]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  60%|██████    | 3/5 [04:29<03:01, 90.69s/it]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  80%|████████  | 4/5 [06:01<01:30, 90.98s/it]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 5/5 [07:28<00:00, 89.70s/it]


Now, if we want to load saved LoRA adapters for inference, change `if False` to `if True` in the cell below:

In [None]:
# Optionally reload a saved model before generating. While the guard is False,
# the `model` and `tokenizer` already in memory are used below.
# NOTE(review): no visible cell in this notebook writes a local `lora_model`
# directory - confirm the path before enabling the guard.
if False:
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = True,
    )

# Single-turn English prompt used as a quick check.
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "What is Gemma-3?",}]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
)

from transformers import TextStreamer
# The reply is streamed to the cell output; the return value is discarded.
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

### Saving to float16 for VLLM

We also support saving to `float16` directly for deployment! We save it in the folder `gemma-3-finetune`. Set `if False` to `if True` to let it run!

In [None]:
# Disabled by default: saves the merged model and tokenizer into the local
# folder `gemma-3-finetune`.
if False: # Change to True to save finetune!
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

If you want to upload / push to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now for all models! For now, you can convert easily to `Q8_0, F16 or BF16` precision. `Q4_K_M` for 4bit will come later!

In [None]:
# Disabled by default: exports the model to GGUF under the name
# `gemma-3-finetune` using Q8_0 quantization.
if False: # Change to True to save to GGUF
    model.save_pretrained_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

Likewise, if you want to instead push to GGUF to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!