<a href="https://colab.research.google.com/github/wakusoftware/Building-Machine-Learning-Pipelines-on-AWS/blob/main/Qwen2_5_QLoRA%2C_LoRA%2C_and_Full_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*More details in this article: [Qwen2.5 QLoRA, LoRA and Full Fine-tuning on Your Computer](https://newsletter.kaitchup.com/p/qwen25-qlora-lora-and-full-fine-tuning)*

This notebook shows how to fine-tune Qwen2.5 with QLoRA, LoRA, and without LoRA (full fine-tuning).

GPU memory requirements:
* QLoRA: 18.5 GB (can be lower than 16 GB if you reduce the batch size)
* LoRA: 19.8 GB
* Full fine-tuning: 37.8 GB

#QLoRA Fine-tuning

We need to install the following packages:

In [None]:
!pip install --upgrade transformers bitsandbytes peft accelerate datasets trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.11-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3

I first ran this test to check whether Qwen2.5 supports "add_eos_token". Result: It doesn't. We need to add the EOS token manually.


Check this article for more information: [My LLM Can't Stop Generating, How to Fix It?](https://kaitchup.substack.com/p/my-llm-cant-stop-generating-how-to)

In [None]:
from transformers import AutoTokenizer, set_seed
set_seed(1234)  # For reproducibility

prompt = "### Human: Hello!### Assistant: Hello!"
model_id = "Qwen/Qwen2.5-7B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
print("\nEOS token: "+tokenizer.eos_token)
print("Example of a tokenized sentence without add_eos_token:")
print(tokenizer.decode(tokenizer(prompt)['input_ids'], skip_special_tokens=False))

tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
print("\nExample of a tokenized sentence with add_eos_token:")
print(tokenizer.decode(tokenizer(prompt)['input_ids'], skip_special_tokens=False))

if tokenizer.eos_token_id not in tokenizer(prompt)['input_ids']:
  print("\n\nadd_eos_token is not supported. Consider adding the EOS token manually.")
else:
  print("\n\nadd_eos_token is supported.")


EOS token: <|im_end|>
Example of a tokenized sentence without add_eos_token:
### Human: Hello!### Assistant: Hello!

Example of a tokenized sentence with add_eos_token:
### Human: Hello!### Assistant: Hello!


add_eos_token is not supported. Consider adding the EOS token manually.



Code

In [None]:
import torch, os, multiprocessing
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    set_seed
)
from trl import SFTTrainer, SFTConfig

set_seed(1234)

#use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
  os.system('pip install flash_attn')
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

model_name = "Qwen/Qwen2.5-7B"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 151645
tokenizer.padding_side = 'left'

ds = load_dataset("timdettmers/openassistant-guanaco")

#Add the EOS token
def process(row):
    row["text"] = row["text"]+tokenizer.eos_token
    return row

ds = ds.map(
    process,
    num_proc= multiprocessing.cpu_count(),
    load_from_cache_file=False,
)
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map={"": 0}, attn_implementation=attn_implementation
)


model = prepare_model_for_kbit_training(model, gradient_checkpointing_kwargs={'use_reentrant':True})

peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

training_arguments = SFTConfig(
        output_dir="./Qwen2.5_7B_QLoRA",
        eval_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=8,
        log_level="debug",
        save_strategy="epoch",
        logging_steps=25,
        learning_rate=1e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        eval_steps=25,
        num_train_epochs=1,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        dataset_text_field="text",
        max_seq_length=512,
)

trainer = SFTTrainer(
        model=model,
        train_dataset=ds['train'],
        eval_dataset=ds['test'],
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
)


#--code by Unsloth: https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=pCqnaKmlO1U9

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_ = trainer.train()


used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_trainer= round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
trainer_percentage = round(used_memory_for_trainer/max_memory*100, 3)
print(f"{trainer_.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_trainer} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {trainer_percentage} %.")
print("-----")
#----

Repo card metadata block was not found. Setting CardData to empty.
  self.pid = os.fork()


Map (num_proc=12):   0%|          | 0/9846 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/518 [00:00<?, ? examples/s]

### Human: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.### Assistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.

Recent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargaining power, leading

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   8%|7         | 315M/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2FlashAttention2(
          (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Using auto half precision backend
Currently training with a batch size of: 8


GPU = NVIDIA L4. Max memory = 22.168 GB.
11.123 GB of memory reserved.


***** Running training *****
  Num examples = 9,846
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 307
  Number of trainable parameters = 40,370,176
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.
Detected flash_attn version: 2.6.3
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
25,1.3681,1.331752
50,1.2907,1.306315
75,1.3054,1.300161
100,1.2803,1.297927
125,1.2819,1.295724
150,1.2534,1.294384
175,1.269,1.292779
200,1.2722,1.291578
225,1.2842,1.290858
250,1.2858,1.290337



***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8

***** Running Evaluation *****
  Num examples = 518
  Batch size = 8
Saving model checkpoint to ./Qwen2.5_7B_QLoRA/checkpoint-307
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwe

12558.6702 seconds used for training.
209.31 minutes used for training.
Peak reserved memory = 18.529 GB.
Peak reserved memory for training = 7.406 GB.
Peak reserved memory % of max memory = 83.584 %.
Peak reserved memory for training % of max memory = 33.409 %.
-----


#LoRA Fine-tuning (fp32)

In [None]:
import torch, os, multiprocessing
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed
)
from trl import SFTTrainer, SFTConfig

set_seed(1234)

#use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
  os.system('pip install flash_attn')
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

model_name = "Qwen/Qwen2.5-7B"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 151645
tokenizer.padding_side = 'left'

ds = load_dataset("timdettmers/openassistant-guanaco")

#Add the EOS token
def process(row):
    row["text"] = row["text"]+tokenizer.eos_token
    return row

ds = ds.map(
    process,
    num_proc= multiprocessing.cpu_count(),
    load_from_cache_file=False,
)

model = AutoModelForCausalLM.from_pretrained(
          model_name, device_map={"": 0}, attn_implementation=attn_implementation
)


model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

training_arguments = SFTConfig(
        output_dir="./Qwen2.5_7B_LoRA",
        eval_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        per_device_eval_batch_size=2,
        log_level="debug",
        save_strategy="epoch",
        logging_steps=25,
        learning_rate=1e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        eval_steps=25,
        num_train_epochs=1,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        dataset_text_field="text",
        max_seq_length=512,
)

trainer = SFTTrainer(
        model=model,
        train_dataset=ds['train'],
        eval_dataset=ds['test'],
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
)


#--code by Unsloth: https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=pCqnaKmlO1U9

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_ = trainer.train()


used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_trainer= round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
trainer_percentage = round(used_memory_for_trainer/max_memory*100, 3)
print(f"{trainer_.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_trainer} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {trainer_percentage} %.")
print("-----")
#----

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/395 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


openassistant_best_replies_train.jsonl:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

openassistant_best_replies_eval.jsonl:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9846 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/518 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=12):   0%|          | 0/9846 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/518 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen2ForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen2Model is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attentio

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Using auto half precision backend
Currently training with a batch size of: 2


GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
32.23 GB of memory reserved.


***** Running training *****
  Num examples = 9,846
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 307
  Number of trainable parameters = 40,370,176
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Detected flash_attn version: 2.6.3
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
25,1.3893,1.356049
50,1.299,1.32719
75,1.3176,1.321658
100,1.2985,1.319919
125,1.2874,1.318496
150,1.2675,1.3167
175,1.2809,1.315382
200,1.283,1.314178
225,1.2895,1.31355
250,1.3037,1.31321



***** Running Evaluation *****
  Num examples = 518
  Batch size = 2
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples

3037.5464 seconds used for training.
50.63 minutes used for training.
Peak reserved memory = 36.873 GB.
Peak reserved memory for training = 4.643 GB.
Peak reserved memory % of max memory = 93.198 %.
Peak reserved memory for training % of max memory = 11.735 %.
-----


#LoRA Fine-tuning (bf16)

In [None]:
import torch, os, multiprocessing
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed
)
from trl import SFTTrainer, SFTConfig

set_seed(1234)

#use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
  os.system('pip install flash_attn')
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

model_name = "Qwen/Qwen2.5-7B"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 151645
tokenizer.padding_side = 'left'

ds = load_dataset("timdettmers/openassistant-guanaco")

#Add the EOS token
def process(row):
    row["text"] = row["text"]+tokenizer.eos_token
    return row

ds = ds.map(
    process,
    num_proc= multiprocessing.cpu_count(),
    load_from_cache_file=False,
)

model = AutoModelForCausalLM.from_pretrained(
          model_name, device_map={"": 0}, attn_implementation=attn_implementation, torch_dtype=torch.bfloat16
)


model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=16,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

training_arguments = SFTConfig(
        output_dir="./Qwen2.5_7B_LoRAbf16",
        eval_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        per_device_eval_batch_size=2,
        log_level="debug",
        save_strategy="epoch",
        logging_steps=25,
        learning_rate=1e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        eval_steps=25,
        num_train_epochs=1,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        dataset_text_field="text",
        max_seq_length=512,
)

trainer = SFTTrainer(
        model=model,
        train_dataset=ds['train'],
        eval_dataset=ds['test'],
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
)


#--code by Unsloth: https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=pCqnaKmlO1U9

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_ = trainer.train()


used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_trainer= round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
trainer_percentage = round(used_memory_for_trainer/max_memory*100, 3)
print(f"{trainer_.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_trainer} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {trainer_percentage} %.")
print("-----")
#----

tokenizer_config.json:   0%|          | 0.00/7.22k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
  self.pid = os.fork()


Map (num_proc=12):   0%|          | 0/9846 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/518 [00:00<?, ? examples/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Using auto half precision backend
Currently training with a batch size of: 2


GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
16.182 GB of memory reserved.


***** Running training *****
  Num examples = 9,846
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 307
  Number of trainable parameters = 40,370,176
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Detected flash_attn version: 2.6.3
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
25,1.391,1.363859
50,1.3007,1.327253
75,1.3178,1.321641
100,1.2985,1.320056
125,1.2875,1.318537
150,1.2674,1.31679
175,1.2809,1.315547
200,1.283,1.314359
225,1.2896,1.313438
250,1.3039,1.313375



***** Running Evaluation *****
  Num examples = 518
  Batch size = 2
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples = 518
  Batch size = 2

***** Running Evaluation *****
  Num examples

2622.0278 seconds used for training.
43.7 minutes used for training.
Peak reserved memory = 19.762 GB.
Peak reserved memory for training = 3.58 GB.
Peak reserved memory % of max memory = 49.949 %.
Peak reserved memory for training % of max memory = 9.049 %.
-----


#Full fine-tuning

In [None]:
import torch, os, multiprocessing
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed
)
from trl import SFTTrainer, SFTConfig

set_seed(1234)

#use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
  os.system('pip install flash_attn')
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float32
  attn_implementation = 'sdpa'

model_name = "Qwen/Qwen2.5-7B"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|im_end|>"
tokenizer.pad_token_id = 151645
tokenizer.padding_side = 'left'

ds = load_dataset("timdettmers/openassistant-guanaco")

#Add the EOS token
def process(row):
    row["text"] = row["text"]+tokenizer.eos_token
    return row

ds = ds.map(
    process,
    num_proc= multiprocessing.cpu_count(),
    load_from_cache_file=False,
)

model = AutoModelForCausalLM.from_pretrained(
          model_name, device_map={"": 0}, attn_implementation=attn_implementation, torch_dtype=compute_dtype
)


model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})


training_arguments = SFTConfig(
        output_dir="./Qwen2.5_7B_FFT2",
        eval_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        per_device_eval_batch_size=2,
        log_level="debug",
        save_strategy="epoch",
        logging_steps=25,
        learning_rate=1e-5,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        eval_steps=25,
        num_train_epochs=1,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        dataset_text_field="text",
        max_seq_length=512,
)

trainer = SFTTrainer(
        model=model,
        train_dataset=ds['train'],
        eval_dataset=ds['test'],
        tokenizer=tokenizer,
        args=training_arguments,
)


#--code by Unsloth: https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=pCqnaKmlO1U9

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_ = trainer.train()


used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_trainer= round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
trainer_percentage = round(used_memory_for_trainer/max_memory*100, 3)
print(f"{trainer_.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_trainer} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {trainer_percentage} %.")
print("-----")
#----

ModuleNotFoundError: No module named 'datasets'