In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl=0.19.1 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
%%capture
# Upgrade timm to the latest release (this cell installs timm, not transformers)
!pip install --no-deps --upgrade timm # Only for Gemma 3N

## Unsloth

In [None]:
# Load the 4-bit Gemma 3N instruct checkpoint and its tokenizer through Unsloth.
from unsloth import FastModel
import torch

# Reference list of pre-quantized checkpoints. The call below passes its model
# name as a literal and does not read from this list.
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit",
    # Pretrained models
    "unsloth/gemma-3n-E4B-unsloth-bnb-4bit",
    "unsloth/gemma-3n-E2B-unsloth-bnb-4bit",

    # Other Gemma 3 quants
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# `model` and `tokenizer` become notebook-level globals used by every later cell.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
    dtype = None, # None for auto detection
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

## Gemma3n Inference

In [None]:
from transformers import TextStreamer
# Helper function for inference
def do_gemma_3n_inference(messages, max_new_tokens = 128):
    """Stream a sampled reply to ``messages`` using the global model/tokenizer.

    Args:
        messages: Chat messages passed to ``tokenizer.apply_chat_template``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The generated text is emitted through a ``TextStreamer``; the
        value returned by ``model.generate`` is discarded.
    """
    # Render the chat template straight to tensors and move them to the GPU.
    prompt_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True, # Must add for generation
        tokenize = True,
        return_dict = True,
        return_tensors = "pt",
    ).to("cuda")

    # skip_prompt=True so only the newly generated text is streamed.
    reply_streamer = TextStreamer(tokenizer, skip_prompt = True)

    model.generate(
        **prompt_inputs,
        max_new_tokens = max_new_tokens,
        temperature = 1.0,
        top_p = 0.95,
        top_k = 64,
        streamer = reply_streamer,
    )

In [None]:
# Single-turn text prompt used to check the base model before finetuning.
messages = [{
    "role": "user",
    "content": [{ "type" : "text",
                  "text" : "Write a poem about sloths." }]
}]
do_gemma_3n_inference(messages)

## Finetuning Gemma3n

In [None]:
# Wrap the loaded model with LoRA adapters; `model` is rebound to the PEFT model.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 16,           # Larger = higher accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0.05,
    bias = "none",
    random_state = 3407,
)

#### Data Preprocessing

We now use the `Gemma-3` format for conversation style finetunes. We use the [hotel_conversations_118](https://huggingface.co/datasets/tyanfarm/hotel_conversations_118) dataset, which is in ShareGPT style. Gemma-3 renders multi-turn conversations as shown below:

```
<bos><start_of_turn>user
Hello!<end_of_turn>
<start_of_turn>model
Hey there!<end_of_turn>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3, phi4, qwen2.5, gemma3` and more.

In [None]:
# Attach the Gemma-3 chat template to the tokenizer (rebinds the global `tokenizer`).
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

#### Raw dataset

In [None]:
# Load the train split of the hotel conversations dataset from the Hub.
from datasets import load_dataset
dataset = load_dataset("tyanfarm/hotel_conversations_118", split = "train")

# Show the first raw record to see the original schema.
print(dataset[0])

#### Standard Formatted dataset

In [None]:
# Pass the dataset through Unsloth's standardizer (rebinds the global `dataset`).
from unsloth.chat_templates import standardize_data_formats
dataset = standardize_data_formats(dataset)

# Inspect one standardized record.
dataset[50]

#### Gemma-3 Formatted dataset

We now have to apply the chat template for `Gemma-3` onto the conversations, and save it to `text`. We remove the `<bos>` token using `removeprefix('<bos>')` since we're finetuning. The Processor will add this token before training and the model expects only one.

In [None]:
def formatting_prompts_func(examples, indices):
    """Render a batch of conversations into chat-template text.

    Args:
        examples: Batch mapping with a ``"conversations"`` list.
        indices: Row indices aligned with the conversations (used only in
            error messages).

    Returns:
        ``{"text": [...]}`` with one string per conversation. The leading
        ``<bos>`` is stripped from each rendered string. A conversation that
        fails to render is reported on stdout and yields ``""`` so that the
        output stays aligned with the input batch.
    """
    rendered = []
    for row_idx, conversation in zip(indices, examples["conversations"]):
        try:
            templated = tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
            rendered.append(templated.removeprefix('<bos>'))
        except Exception as err:
            print(f"❌ Error at index {row_idx}: {err}")
            print(f"Hotel data has error: {str(conversation)}")
            rendered.append("")  # Or re-raise here to stop at the first bad row
    return {"text": rendered}

# Add the `text` column; with_indices=True supplies the row indices that
# formatting_prompts_func uses in its error messages.
dataset = dataset.map(formatting_prompts_func, batched = True, with_indices=True)

In [None]:
# Inspect the same row as before, now including the rendered `text` field.
dataset[50]

## Train the model

In [None]:
# Build the supervised fine-tuning trainer over the `text` column produced above.
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60, # Short demo run; see num_train_epochs above for a full pass
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

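Let's inspect a tokenized training example by decoding row 100 of the trainer's dataset. Note that no response-only masking step is applied earlier in this notebook, so the instruction part is not masked out here. Notice how the sample only has a single `<bos>` as expected!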

In [None]:
# Decode the token ids of training row 100 back to text for a visual check.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

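Now let's print the masked out example - you should see only the answer is present. The cell below is left commented out because it reads a `labels` column with masked entries (`-100`), which presumably requires a response-only masking step to be applied first: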

In [None]:
# tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

Show current memory stats

In [None]:
# @title Show current memory stats
# Record the GPU properties and the peak reserved memory before training.
# Values are converted from bytes by dividing by 1024 three times and rounded
# to 3 decimals. `start_gpu_memory` and `max_memory` are reused by the
# post-training stats cell.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

## Start Training

In [None]:
# Run training; the returned stats (including `metrics`) are read by the next cell.
trainer_stats = trainer.train()

Show final memory & time stats

In [None]:
# @title Show final memory and time stats
# Compare the peak reserved GPU memory after training with the pre-training
# baseline (`start_gpu_memory`) and the device total (`max_memory`).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Difference between the peak now and the baseline recorded before training.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Push to HuggingFace

#### LoRA Adapters

In [None]:
import os

from huggingface_hub import login

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when it is unset or empty, pass None so that `login()`
# prompts for the token interactively instead of submitting an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from huggingface_hub import HfApi

# NOTE: despite its name, `dataset_name` is the id of the repo the LoRA
# adapters are pushed to with `model.push_to_hub` and later reloaded from with
# `FastModel.from_pretrained`. The variable name is kept because later cells
# reference it.
dataset_name = "tyanfarm/gemma-3n-hotels-conversational-118-finetuned"
api = HfApi()
# Create a *model* repo so it matches where the adapters are pushed; creating
# a dataset repo here would leave an unused dataset repository behind.
# exist_ok=True keeps the cell re-runnable when the repo already exists.
api.create_repo(repo_id=dataset_name, repo_type="model", exist_ok=True)

In [None]:
# Save the model and tokenizer to the local "gemma-3n" directory.
model.save_pretrained("gemma-3n")
tokenizer.save_pretrained("gemma-3n")

# Push to huggingface
# `dataset_name` holds the target repo id defined in the repo-creation cell.
model.push_to_hub(dataset_name)
tokenizer.push_to_hub(dataset_name)

#### Merged Full Model

In [None]:
# Reload the finetuned checkpoint from the Hub and run one sampled generation.
# NOTE: the guard is always True, so this rebinds the global `model` and
# `tokenizer` every time the cell runs.
if True:
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name = "tyanfarm/gemma-3n-hotels-conversational-118-finetuned", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = True,
    )

messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "What is Gemma-3N?",}]
}]
# Render the prompt to tensors and move them to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to("cuda")

from transformers import TextStreamer
# Stream only the generated text (skip_prompt=True); the return value is unused.
_ = model.generate(
    **inputs,
    max_new_tokens = 128, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

In [None]:
if True: # Upload is enabled; set to False to skip uploading the merged finetune
    model.push_to_hub_merged(
        "tyanfarm/gemma-3N-finetune", tokenizer,
    )

In [None]:
if True: # Saving is enabled; set to False to skip saving the merged finetune locally
    model.save_pretrained_merged("gemma-3N-finetune", tokenizer)
#### GGUF Conversion

In [None]:
import os

from huggingface_hub import login

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when it is unset or empty, pass None so that `login()`
# prompts for the token interactively instead of submitting an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
import os

from huggingface_hub import snapshot_download

# Download the entire model repository from the Hugging Face Hub.
snapshot_download(
    repo_id="tyanfarm/gemma-3N-finetune",  # Replace with your model's name on the Hub
    local_dir="gemma-3N-finetune-gguf",  # Local directory the model is stored in
    # Token for private repos: read from HF_TOKEN instead of being hard-coded.
    # When unset or empty, None is passed rather than an empty string, leaving
    # huggingface_hub to apply its default credential handling.
    token=os.environ.get("HF_TOKEN") or None,
)

In [None]:
# Create the model repo that receives the GGUF export; `api` is the HfApi
# instance created in the earlier repo-creation cell.
api.create_repo(repo_id="tyanfarm/gemma-3n-hotels-conversational-118-finetuned-gguf", repo_type="model", exist_ok=True)

In [None]:
if True: # GGUF export is enabled; set to False to skip saving to GGUF
    model.save_pretrained_gguf(
        "gemma-3N-finetune",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

In [None]:
# Upload the Q8_0 GGUF export to the repo created above.
# NOTE(review): the first positional argument is presumably the local
# directory used for the export -- confirm against the Unsloth documentation.
model.push_to_hub_gguf(
    "gemma-3N-finetune-gguf",
    quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
    repo_id = "tyanfarm/gemma-3n-hotels-conversational-118-finetuned-gguf",
)