In [1]:
!pip install trl peft bitsandbytes accelerate -q

In [1]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from peft import AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, AutoTokenizer
from trl import SFTTrainer


# Run configuration: every tunable value used by the steps below lives here.

# --- Model and data -----------------------------------------------------
model_name = "distilgpt2"
dataset_name = "timdettmers/openassistant-guanaco"
dataset_text_field = "text"  # dataset column handed to SFTTrainer

# --- Quantization (the two flags are mutually exclusive; checked in Step 1)
load_in_8bit, load_in_4bit = False, True

# --- LoRA adapter -------------------------------------------------------
use_peft = True
peft_lora_r, peft_lora_alpha = 64, 16

# --- Optimisation -------------------------------------------------------
batch_size = 16                    # forwarded as per_device_train_batch_size
gradient_accumulation_steps = 16
learning_rate = 1.41e-5
num_train_epochs = 3
max_steps = -1                     # presumably "no step cap"; see TrainingArguments docs

# --- Logging and output -------------------------------------------------
logging_steps = 1
output_dir = "output"

# Step 1: Load the model
# Guard first: the two quantization modes cannot be requested together.
if load_in_8bit and load_in_4bit:
    raise ValueError(
        "You can't load the model in 8 bits and 4 bits at the same time")

# Start from the unquantized defaults (everything left to from_pretrained) ...
quantization_config = None
device_map = None
torch_dtype = None

# ... and override them only when one of the quantized modes is selected.
if load_in_8bit or load_in_4bit:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
    )
    # The empty-string key maps the whole model onto GPU index 0.
    device_map = {"": 0}
    torch_dtype = torch.bfloat16

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map=device_map,
    quantization_config=quantization_config,
    trust_remote_code=False,
)

# Step 2: Load the dataset
# Only the "train" split is requested.
dataset = load_dataset(dataset_name, split="train")

# Step 3: Define the training arguments
# All values come straight from the configuration constants defined above.
training_args = TrainingArguments(
    output_dir=output_dir,
    # schedule length
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    # batching
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optimiser and logging
    learning_rate=learning_rate,
    logging_steps=logging_steps,
)

# Step 4: Define the LoraConfig
# peft_config is a LoraConfig when use_peft is set and None otherwise; it is
# later handed to SFTTrainer as-is.
peft_config = (
    LoraConfig(
        task_type="CAUSAL_LM",
        bias="none",
        lora_alpha=peft_lora_alpha,
        r=peft_lora_r,
    )
    if use_peft
    else None
)

# Step 5: Define the Trainer
# Wire together the loaded model, the dataset (and which column holds the
# text), the training arguments, and the optional LoRA configuration.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field=dataset_text_field,
    peft_config=peft_config,
    args=training_args,
)

# Step 6: Train, then persist the result.
trainer.train()

# Save whatever the trainer ended up holding (trainer.model) under
# <output_dir>/final_checkpoints.
final_checkpoints_dir = os.path.join(output_dir, "final_checkpoints")
trainer.model.save_pretrained(final_checkpoints_dir)

# Step 7: Merge the adapter into the base weights.
# This only makes sense when an adapter was trained: with use_peft disabled the
# checkpoint saved above is not an adapter checkpoint, so the reload through
# AutoPeftModelForCausalLM is skipped.
if use_peft:
    # Free memory for merging weights. Deleting `model` alone is not enough:
    # `trainer` still references the model, so drop both names and force a
    # collection before asking CUDA to release its cached blocks.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Reload the saved adapter checkpoint and fold it into a single model.
    model = AutoPeftModelForCausalLM.from_pretrained(
        final_checkpoints_dir, device_map="auto", torch_dtype=torch.bfloat16)
    model = model.merge_and_unload()

    # Write the merged model using safetensors serialization.
    output_merged_dir = os.path.join(output_dir, "final_merged_checkpoint")
    model.save_pretrained(output_merged_dir, safe_serialization=True)

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/yousif/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: Loading binary /home/yousif/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn("The installed version of bitsandbytes was compiled without GPU support. "
Found cached dataset json (/home/yousif/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-c93588435bc90172/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
Using pad_token, but it is not set yet.
Loading cached processed dataset at /home/yousif/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-c93588435bc90172/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-7ae36c9dee7b8196.arrow
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 9846
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 81912576
  0%|          | 0/5 [00:00<?, ?it/s]You're using a GPT2

{'loss': 2.9879, 'learning_rate': 1.128e-05, 'epoch': 0.0}


 40%|████      | 2/5 [00:04<00:05,  1.95s/it]

{'loss': 4.5747, 'learning_rate': 8.46e-06, 'epoch': 0.0}
