## 1. Choose GPUs

In [21]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

## 2. Import Libraries

In [22]:
from unsloth import FastLanguageModel
import torch
from transformers.utils import logging
logging.set_verbosity_info()

## 3. Basic Settings

In [23]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.bfloat16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs. 
fourbit_models = [
    "unsloth/llama-3-8b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # If you have a huggingface token, you can pass it here.
    device_map = "auto",
)


loading configuration file config.json from cache at /home/jpzhao/.cache/huggingface/hub/models--unsloth--llama-3-8b-bnb-4bit/snapshots/f20166737ba79ae1129e83194c84e2ec0013bf1b/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128255,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "l

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4080. Num GPUs = 2. Max memory: 15.695 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


loading configuration file config.json from cache at /home/jpzhao/.cache/huggingface/hub/models--unslothai--other/snapshots/43d9e0f2f19a5d7836895f648dc0e762816acf77/config.json
loading configuration file config.json from cache at /home/jpzhao/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/config.json
loading configuration file config.json from cache at /home/jpzhao/.cache/huggingface/hub/models--unslothai--vram-16/snapshots/9703344699da71a2bb9f17e575eb918c8f6cb349/config.json
loading configuration file config.json from cache at /home/jpzhao/.cache/huggingface/hub/models--unslothai--2/snapshots/e5ac17024788aa1d3d34c829ddaab52525828d5f/config.json
loading configuration file config.json from cache at /home/jpzhao/.cache/huggingface/hub/models--unsloth--llama-3-8b-bnb-4bit/snapshots/f20166737ba79ae1129e83194c84e2ec0013bf1b/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
 

PackageNotFoundError: No package metadata was found for bitsandbytes

查看模型在GPU上的分布

In [4]:
print(model.hf_device_map)

# 可以打印一下实际环境：
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"Device {i}: {torch.cuda.get_device_name(i)}")

{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'model.rotary_emb': 1, 'lm_head': 1}
2
Device 0: NVIDIA GeForce RTX 4080
Device 1: NVIDIA RTX A6000


## 4. Parameters for Finetuning (LoRA)

In [5]:
# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], # modules to fine tune
    lora_alpha = 16, # The scaling factor for finetuning.
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes, and is faster!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context(Options include True, False and "unsloth". We suggest "unsloth" since we reduce memory usage by an extra 30% and support extremely long context finetunes.You can read up here: https://unsloth.ai/blog/long-context for more details.)
    random_state = 3407, # The number to determine deterministic runs. 
    use_rslora = False, # We support rank stabilized LoRA (RS-LoRA). Advanced feature to set the lora_alpha = 16 automatically.
    loftq_config = None, # And LoftQ. Advanced feature to initialize the LoRA matrices to the top r singular vectors of the weights. Can improve accuracy somewhat, but can make memory usage explode at the start.
)


Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## 5. Dataset for Finetuning

In [6]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

In [7]:
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map: 100%|██████████| 100000/100000 [00:07<00:00, 12610.48 examples/s]


In [8]:
dataset[5]["conversations"]

[{'content': 'How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?',
  'role': 'user'},
 {'content': 'Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.',
  'role': 'assistant'}]

In [9]:
dataset[5]["text"]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAstronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.<|

## 6. Train the Model

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

PyTorch: setting up devices
PyTorch: setting up devices
Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 100000/100000 [01:15<00:00, 1326.48 examples/s]
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [11]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# 这里的作用是把训练数据中的 labels 进行处理，让模型只在“assistant 回复”部分计算 loss，其它地方（比如 prompt 部分）都会标记为 -100，这样就不会在这些地方计算梯度。

Map (num_proc=96): 100%|██████████| 100000/100000 [00:01<00:00, 65779.62 examples/s]


In [12]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAstronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight rel

In [13]:
input_ids = trainer.train_dataset[5]["input_ids"]
labels = trainer.train_dataset[5]["labels"]

# 把 -100 替换为占位符字符（比如 '▁' 表示 ignore），其他的 decode 成文本
decoded = "".join(
    tokenizer.decode([input_ids[i]]) if labels[i] != -100 else "▁"
    for i in range(len(labels))
)

print("可视化 labels 中哪里是被 mask 的：")
print(decoded)

可视化 labels 中哪里是被 mask 的：
▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.<|eot_id|>


In [14]:
def inspect_sample(trainer, tokenizer, index=0):
    input_ids = trainer.train_dataset[index]["input_ids"]
    labels = trainer.train_dataset[index]["labels"]

    print("📌 原始文本（input_ids 解码）:")
    print(tokenizer.decode(input_ids))

    print("\n📌 实际用于训练的文本（只 decode 非 -100 的 labels）:")
    # 用空格的 token ID 替换掉 -100 方便 decode
    space = tokenizer(" ", add_special_tokens=False).input_ids[0]
    decoded_labels = tokenizer.decode([space if x == -100 else x for x in labels])
    print(decoded_labels)

    print("\n📌 哪些位置是 -100（忽略训练）:")
    print([i for i, x in enumerate(labels) if x == -100])

    print("\n📌 可视化标注（-100 位置显示为 ▓，其余 decode）:")
    visual = ""
    for i in range(len(labels)):
        if labels[i] == -100:
            visual += "▓"  # 你也可以用别的字符
        else:
            visual += tokenizer.decode([labels[i]])
    print(visual)

inspect_sample(trainer, tokenizer, index=5)

📌 原始文本（input_ids 解码）:
<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line 

In [15]:
# 使用 tokenizer 将一个空格 " " 编码为 token ID。
# add_special_tokens=False 的意思是 不要加入 [CLS], [SEP] 等特殊 token。
# .input_ids[0] 拿到的是空格的 第一个（也是唯一一个）token ID。
# 所以这一行的作用是获取空格 " " 的 token ID，存到变量 space 里。
space = tokenizer(" ", add_special_tokens = False).input_ids[0]

# 遍历 trainer.train_dataset[5]["labels"]，也就是第 5 个样本的 labels。
# 对于每一个 x，如果 x == -100，就替换成 space（空格的 token ID）。
# 一般 -100 是用于 忽略计算 loss 的 padding 或 masked token。
# 否则就保留原来的 x。
# 然后用 tokenizer.decode(...) 把这些 token IDs 解码成文本。
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

# 这两行代码是为了：
# 把某个训练样本的 labels 中 Mask 掉的部分替换为一个空格的 token ID，然后还原出人类可读的文本。
# 这样做的目的是：让你能更方便地查看 labels 的实际内容，即模型预期生成的文本

'                                                                  Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.<|eot_id|>'

In [16]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4080. Max memory = 15.695 GB.
2.176 GB of memory reserved.


In [17]:
trainer_stats = trainer.train()

The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: source, score, text, conversations. If source, score, text, conversations are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 2
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


SystemError: PY_SSIZE_T_CLEAN macro must be defined for '#' formats