# 安裝

In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth



# 載入模型

In [None]:
# NOTE(review): unsloth is imported before torch here; presumably it needs to load
# first to apply its patches — confirm before reordering these imports.
from unsloth import FastModel
import torch

# Load the `unsloth/gemma-3-4b-it` checkpoint together with its tokenizer.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048, # Context length to use.
    load_in_4bit = True,  # 4-bit quantization.
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.5: Fast Gemma3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


# PEFT

In [None]:
# Wrap the loaded model with LoRA (Low-Rank Adaptation) adapters via FastModel.
model = FastModel.get_peft_model(
    model,

    # --- Which parts of the network are fine-tuned ---
    # Vision layers (the image-processing side of a multimodal model): off.
    finetune_vision_layers = False,
    # Language layers (the core text-processing layers): on.
    finetune_language_layers = True,
    # Attention modules (self-attention): on.
    finetune_attention_modules = True,
    # MLP modules (feature transformation): on.
    finetune_mlp_modules = True,

    # --- Core LoRA hyperparameters ---
    # Rank of the low-rank matrices. Larger means more capacity but more memory.
    # Typical range is 4-64; 8-16 is the usual recommendation.
    r = 8,
    # LoRA scaling factor (scaling = lora_alpha / r).
    # The author's guidance is to set it to 1-2x the rank.
    lora_alpha = 8,
    # Dropout rate applied to the LoRA layers (0-1); 0 disables dropout.
    lora_dropout = 0,

    # --- Other settings ---
    # Bias handling: "none" trains no bias terms (saves memory, recommended),
    # "all" trains every bias term, "lora_only" trains only LoRA-related biases.
    bias = "none",
    # Random seed, so runs are reproducible.
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


# 準備資料

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the "gemma-3" chat template to the tokenizer so conversations can be rendered with it.
tokenizer = get_chat_template(tokenizer, chat_template = "gemma-3")

In [None]:
from datasets import load_dataset

# Load the `train` split of the mlabonne/FineTome-100k dataset.
dataset = load_dataset(
    "mlabonne/FineTome-100k",
    split = "train",
)

In [None]:
# Inspect one raw record: each conversation turn is a dict with 'from' / 'value' keys.
dataset[100]

{'conversations': [{'from': 'human',
   'value': 'What is the modulus operator in programming and how can I use it to calculate the modulus of two given numbers?'},
  {'from': 'gpt',
   'value': 'In programming, the modulus operator is represented by the \'%\' symbol. It calculates the remainder when one number is divided by another. To calculate the modulus of two given numbers, you can use the modulus operator in the following way:\n\n```python\n# Calculate the modulus\nModulus = a % b\n\nprint("Modulus of the given numbers is: ", Modulus)\n```\n\nIn this code snippet, the variables \'a\' and \'b\' represent the two given numbers for which you want to calculate the modulus. By using the modulus operator \'%\', we calculate the remainder when \'a\' is divided by \'b\'. The result is then stored in the variable \'Modulus\'. Finally, the modulus value is printed using the \'print\' statement.\n\nFor example, if \'a\' is 10 and \'b\' is 4, the modulus calculation would be 10 % 4, which e

# 資料標準化

In [None]:
from unsloth.chat_templates import standardize_data_formats
# Convert the conversations to role/content messages (the record shown before this
# step uses 'from'/'value' keys; afterwards it uses 'role'/'content').
dataset = standardize_data_formats(dataset)

In [None]:
# Inspect the same record after standardization: turns now carry 'content' / 'role' keys.
dataset[100]

{'conversations': [{'content': 'What is the modulus operator in programming and how can I use it to calculate the modulus of two given numbers?',
   'role': 'user'},
  {'content': 'In programming, the modulus operator is represented by the \'%\' symbol. It calculates the remainder when one number is divided by another. To calculate the modulus of two given numbers, you can use the modulus operator in the following way:\n\n```python\n# Calculate the modulus\nModulus = a % b\n\nprint("Modulus of the given numbers is: ", Modulus)\n```\n\nIn this code snippet, the variables \'a\' and \'b\' represent the two given numbers for which you want to calculate the modulus. By using the modulus operator \'%\', we calculate the remainder when \'a\' is divided by \'b\'. The result is then stored in the variable \'Modulus\'. Finally, the modulus value is printed using the \'print\' statement.\n\nFor example, if \'a\' is 10 and \'b\' is 4, the modulus calculation would be 10 % 4, which equals 2. Theref

將 Gemma-3 的對話模板套用到這些對話中，並儲存為文字。由於我們要進行微調，因此會用 `removeprefix('<bos>')` 移除 `<bos>` token。這是因為 Processor（預處理器）會在訓練前自動加上這個 token，而模型預期輸入中只能有一個 `<bos>`。


In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-formatted string.

    Args:
        examples: A batch as passed by ``dataset.map(..., batched=True)``; its
            "conversations" entry holds one list of messages per example.

    Returns:
        A dict with a single "text" key: one rendered string per conversation.
    """
    rendered = []
    for conversation in examples["conversations"]:
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize = False,
            add_generation_prompt = False,
        )
        # Strip the leading <bos>: the processor is expected to add its own at
        # tokenization time, and the input must contain only one.
        rendered.append(prompt.removeprefix('<bos>'))
    return {"text": rendered}


# Add the rendered "text" column to the dataset, then show one rendered example.
dataset = dataset.map(formatting_prompts_func, batched = True)
dataset[100]["text"]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

'<start_of_turn>user\nWhat is the modulus operator in programming and how can I use it to calculate the modulus of two given numbers?<end_of_turn>\n<start_of_turn>model\nIn programming, the modulus operator is represented by the \'%\' symbol. It calculates the remainder when one number is divided by another. To calculate the modulus of two given numbers, you can use the modulus operator in the following way:\n\n```python\n# Calculate the modulus\nModulus = a % b\n\nprint("Modulus of the given numbers is: ", Modulus)\n```\n\nIn this code snippet, the variables \'a\' and \'b\' represent the two given numbers for which you want to calculate the modulus. By using the modulus operator \'%\', we calculate the remainder when \'a\' is divided by \'b\'. The result is then stored in the variable \'Modulus\'. Finally, the modulus value is printed using the \'print\' statement.\n\nFor example, if \'a\' is 10 and \'b\' is 4, the modulus calculation would be 10 % 4, which equals 2. Therefore, the ou

# 訓練模型

In [None]:
from trl import SFTTrainer, SFTConfig

# Hyperparameters for the supervised fine-tuning (SFT) run.
training_args = SFTConfig(
    # --- Data ---
    # Name of the dataset column that holds the training text.
    dataset_text_field = "text",
    # CPU processes used for dataset preprocessing (tune to the core count, usually 2-8).
    dataset_num_proc = 2,

    # --- Batching and memory ---
    # Per-device batch size. Larger is faster but uses more memory; start small (1, 2, 4...).
    per_device_train_batch_size = 2,
    # Gradient accumulation emulates a larger batch when memory is limited:
    # effective batch = batch size x accumulation steps x number of GPUs (here 2 x 4 = 8).
    gradient_accumulation_steps = 4,

    # --- Training length ---
    # Hard cap on training steps; takes precedence over num_train_epochs.
    # 30 is a quick test; use hundreds to thousands for a real run, or drop
    # max_steps and set num_train_epochs (e.g. 1 = one full pass over the data).
    max_steps = 30,

    # --- Learning rate ---
    # Step size of parameter updates. 2e-4 suits short fine-tunes; the author suggests
    # lowering to 2e-5 for long runs. LoRA commonly uses 1e-4 to 5e-4.
    learning_rate = 2e-4,
    # Steps over which the learning rate ramps up (typically 5-10% of total steps).
    warmup_steps = 5,
    # Schedule: "linear" decay (common), "cosine" decay, or "constant".
    lr_scheduler_type = "linear",

    # --- Optimizer ---
    # "adamw_8bit" = 8-bit AdamW (saves memory); alternatives: "adamw_torch", "sgd".
    optim = "adamw_8bit",
    # Weight decay (L2 regularization), usually 0.01-0.1; 0 disables it.
    weight_decay = 0.01,

    # --- Monitoring and logging ---
    # Log every N steps. 1 is handy for debugging; use 10/50/100 for real runs.
    logging_steps = 1,
    # Experiment tracker: "none", "wandb", "tensorboard", or a list of several.
    report_to = "none",

    # --- Randomness ---
    # Random seed, so runs are reproducible.
    seed = 3407,
)

# Build the SFT trainer from the LoRA model, its tokenizer and the formatted dataset.
trainer = SFTTrainer(
    model = model,
    # Must match the model.
    tokenizer = tokenizer,
    train_dataset = dataset,
    # No evaluation set is used in this notebook (optional).
    eval_dataset = None,
    args = training_args,
)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

使用 `train_on_responses_only` 方法，只針對 assistant 的輸出計算 loss，並忽略使用者輸入的部分，有助於提升微調的準確性！

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Markers that open the user turn and the model turn in the rendered training text.
user_turn_marker = "<start_of_turn>user\n"
model_turn_marker = "<start_of_turn>model\n"

# Restrict the loss to the model's responses; the label check in the following
# cells shows the user turn masked out.
trainer = train_on_responses_only(
    trainer,
    instruction_part = user_turn_marker,
    response_part = model_turn_marker,
)


Map (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Decode the token ids of one processed training example to check the final model input.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\nWhat is the modulus operator in programming and how can I use it to calculate the modulus of two given numbers?<end_of_turn>\n<start_of_turn>model\nIn programming, the modulus operator is represented by the \'%\' symbol. It calculates the remainder when one number is divided by another. To calculate the modulus of two given numbers, you can use the modulus operator in the following way:\n\n```python\n# Calculate the modulus\nModulus = a % b\n\nprint("Modulus of the given numbers is: ", Modulus)\n```\n\nIn this code snippet, the variables \'a\' and \'b\' represent the two given numbers for which you want to calculate the modulus. By using the modulus operator \'%\', we calculate the remainder when \'a\' is divided by \'b\'. The result is then stored in the variable \'Modulus\'. Finally, the modulus value is printed using the \'print\' statement.\n\nFor example, if \'a\' is 10 and \'b\' is 4, the modulus calculation would be 10 % 4, which equals 2. Therefore, t

In [None]:
# Visualize which tokens are trained on: label positions equal to -100 are swapped
# for the pad token before decoding, then the decoded pad tokens become spaces.
sample_labels = trainer.train_dataset[100]["labels"]
visible_ids = [
    tokenizer.pad_token_id if token_id == -100 else token_id
    for token_id in sample_labels
]
tokenizer.decode(visible_ids).replace(tokenizer.pad_token, " ")

'                               In programming, the modulus operator is represented by the \'%\' symbol. It calculates the remainder when one number is divided by another. To calculate the modulus of two given numbers, you can use the modulus operator in the following way:\n\n```python\n# Calculate the modulus\nModulus = a % b\n\nprint("Modulus of the given numbers is: ", Modulus)\n```\n\nIn this code snippet, the variables \'a\' and \'b\' represent the two given numbers for which you want to calculate the modulus. By using the modulus operator \'%\', we calculate the remainder when \'a\' is divided by \'b\'. The result is then stored in the variable \'Modulus\'. Finally, the modulus value is printed using the \'print\' statement.\n\nFor example, if \'a\' is 10 and \'b\' is 4, the modulus calculation would be 10 % 4, which equals 2. Therefore, the output of the above code would be:\n\n```\nModulus of the given numbers is: 2\n```\n\nThis means that the modulus of 10 and 4 is 2.<end_of_t

# 顯示目前記憶體用量

In [None]:
# Snapshot GPU memory before training; the post-training report subtracts
# `start_gpu_memory` to isolate the memory attributable to training.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)  # total device memory, GB
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)  # peak reserved so far, GB
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.59 GB of memory reserved.


In [None]:
# Run the fine-tuning loop; the returned object's `metrics` dict is read by the
# timing/memory report later in the notebook.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 14,901,248/4,000,000,000 (0.37% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,1.2249
2,1.6779
3,1.7519
4,1.3687
5,1.1273
6,1.4517
7,0.7816
8,1.1506
9,0.9262
10,0.8212


# 顯示記憶體用量

In [None]:

# Peak reserved GPU memory after training (GB), and the part added since the
# pre-training snapshot, each also expressed as a share of total device memory.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Assemble the report first, then emit it in one go (one line per entry).
report_lines = (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
print("\n".join(report_lines))

657.3066 seconds used for training.
10.96 minutes used for training.
Peak reserved memory = 12.789 GB.
Peak reserved memory for training = 7.199 GB.
Peak reserved memory % of max memory = 86.758 %.
Peak reserved memory for training % of max memory = 48.837 %.


## 批次輸出

In [None]:
from unsloth.chat_templates import get_chat_template
# NOTE(review): this repeats the chat-template setup done during data preparation;
# presumably idempotent — confirm before removing.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "請接續這個序列: 1, 1, 2, 3, 5, 8,",
    }]
}]
# Render the conversation to a prompt *string*. `tokenize = False` is passed
# explicitly (as in the training-data formatting) because the text is tokenized
# separately below; relying on the default would break if it returned token ids.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # End the prompt with the model-turn header.
)
# Generate the whole reply at once, then decode prompt + reply together.
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64,
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\n請接續這個序列: 1, 1, 2, 3, 5, 8,<end_of_turn>\n<start_of_turn>model\n這是一個經典的斐波那契序列，其中的元素之和是先前元素的兩個元素（除錯的元素除錯的元素除錯的元素除錯的元素）。\n以此法則，接下來的元素為 13：13。<end_of_turn>']

## 串流輸出

In [None]:
from transformers import TextStreamer

messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "為何天空是藍色的？請用繁體中文回答。",}]
}]
# Render the conversation to a prompt *string*. `tokenize = False` is passed
# explicitly (as in the training-data formatting) because the text is tokenized
# separately below; relying on the default would break if it returned token ids.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Required for generation.
)

# Stream tokens to the cell output as they are produced; the return value is discarded.
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs.
    # Recommended Gemma-3 sampling settings.
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

天空之所以呈現藍色，主要是由於一種叫做「瑞利散射」的現象。

以下解釋了瑞利散射的原因：

1. 太陽光，在穿過大氣層時會與空氣中的分子（主要為氮和氧）相互作用。這種相互作用會讓光子分散


# 儲存模型

In [None]:
# Write the fine-tuned model and the tokenizer to the local "gemma-3" directory.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the LoRA
# adapter weights, not the merged base model — confirm before relying on it.
model.save_pretrained("gemma-3")
tokenizer.save_pretrained("gemma-3")

['gemma-3/processor_config.json']