In [1]:
!pip install trl peft bitsandbytes accelerate -q

In [1]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from peft import AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, AutoTokenizer
from trl import SFTTrainer


# Define the arguments
# All run settings are collected here; the numbered steps below only read them.

model_name = "distilgpt2"
dataset_name = "timdettmers/openassistant-guanaco"
load_in_8bit = False
load_in_4bit = True
use_peft = True
peft_lora_r = 64
peft_lora_alpha = 16
output_dir = "output"
batch_size = 16
gradient_accumulation_steps = 16
learning_rate = 1.41e-5
logging_steps = 1
num_train_epochs = 3
dataset_text_field = "text"
# NOTE(review): -1 presumably means "no step cap" so num_train_epochs governs —
# confirm against the TrainingArguments documentation.
max_steps = -1

# Step 1: Load the model
# The two quantization modes are mutually exclusive; either one pins the whole
# model to device 0 and loads it with bfloat16 as the torch dtype.
if load_in_8bit and load_in_4bit:
    raise ValueError(
        "You can't load the model in 8 bits and 4 bits at the same time")
elif load_in_8bit or load_in_4bit:
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit
    )
    # This means: fit the entire model on the GPU:0
    device_map = {"": 0}
    torch_dtype = torch.bfloat16
else:
    # No quantization: leave placement and dtype to the library defaults.
    device_map = None
    quantization_config = None
    torch_dtype = None

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map=device_map,
    trust_remote_code=False,
    torch_dtype=torch_dtype,
)

# Step 2: Load the dataset
dataset = load_dataset(dataset_name, split="train")

# Step 3: Define the training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
)

# Step 4: Define the LoraConfig
if use_peft:
    peft_config = LoraConfig(
        r=peft_lora_r,
        lora_alpha=peft_lora_alpha,
        bias="none",
        task_type="CAUSAL_LM",
    )
else:
    peft_config = None

# Step 5: Define the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field=dataset_text_field,
    peft_config=peft_config,
)

trainer.train()

# Step 6: Save what was trained (the adapter when use_peft is set, else the model)
final_checkpoints_dir = os.path.join(output_dir, "final_checkpoints")
trainer.model.save_pretrained(final_checkpoints_dir)

# Step 7: Produce the final standalone checkpoint
if use_peft:
    # Free memory for merging weights. The trainer keeps its own reference to
    # the model, so deleting only `model` would release nothing; drop both,
    # collect, and only then ask CUDA to return the cached blocks.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

    # Reload base model + adapter in bfloat16 and fold the LoRA weights in.
    model = AutoPeftModelForCausalLM.from_pretrained(
        final_checkpoints_dir, device_map="auto", torch_dtype=torch.bfloat16)
    model = model.merge_and_unload()
else:
    # No adapter was trained, so there is nothing to merge and the checkpoint
    # saved above is not an adapter checkpoint that AutoPeftModelForCausalLM
    # could load. Export the trained model itself.
    model = trainer.model

output_merged_dir = os.path.join(output_dir, "final_merged_checkpoint")
model.save_pretrained(output_merged_dir, safe_serialization=True)

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/yousif/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA SETUP: Loading binary /home/yousif/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn("The installed version of bitsandbytes was compiled without GPU support. "
Found cached dataset json (/home/yousif/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-c93588435bc90172/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
Using pad_token, but it is not set yet.
Loading cached processed dataset at /home/yousif/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-c93588435bc90172/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-7ae36c9dee7b8196.arrow
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 9846
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 81912576
  0%|          | 0/5 [00:00<?, ?it/s]You're using a GPT2

{'loss': 2.9879, 'learning_rate': 1.128e-05, 'epoch': 0.0}


 40%|████      | 2/5 [00:04<00:05,  1.95s/it]

{'loss': 4.5747, 'learning_rate': 8.46e-06, 'epoch': 0.0}


In [1]:
import torch
import os
from huggingface_hub import login
import json
from datasets import load_dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from tqdm import tqdm
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, AutoTokenizer
import datetime
from trl import SFTTrainer
tqdm.pandas()


def get_config_value(config, key, default_value):
    """Return ``config[key]``, or ``default_value`` if it is missing or None.

    Args:
        config: Mapping of configuration entries.
        key: Entry to look up.
        default_value: Value returned when ``key`` is absent from ``config``
            or is present with the value ``None``.

    Returns:
        The stored value (including falsy values such as ``0`` or ``False``),
        or ``default_value``.
    """
    if key not in config:
        return default_value
    if config[key] is None:
        return default_value
    return config[key]


# Read the run configuration from disk. Entries that are absent or null fall
# back to the defaults given in Step 1.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

# Step 1: Load arguments
print("Step 1: Load arguments")

# Quantization switches
load_in_4bit = get_config_value(config, "load_in_4bit", True)
load_in_8bit = get_config_value(config, "load_in_8bit", False)

# Model and filesystem locations
model_name = get_config_value(config, "model_name", "gpt2")
dataset_dir = get_config_value(config, "dataset_dir", "./data")
output_dir = get_config_value(config, "output_dir", "./output")

# Optimisation hyper-parameters
batch_size = get_config_value(config, "batch_size", 64)
gradient_accumulation_steps = get_config_value(config, "gradient_accumulation_steps", 16)
learning_rate = get_config_value(config, "learning_rate", 1.41e-5)
logging_steps = get_config_value(config, "logging_steps", 1)
num_train_epochs = get_config_value(config, "num_train_epochs", 3)
max_steps = get_config_value(config, "max_steps", -1)

# LoRA / PEFT settings
use_peft = get_config_value(config, "use_peft", False)
peft_lora_r = get_config_value(config, "peft_lora_r", 64)
peft_lora_alpha = get_config_value(config, "peft_lora_alpha", 16)

# Hub settings; the repo name defaults to "<model_name>-<today as YYYY_MM_DD>"
hf_token = get_config_value(config, "hf_token", None)
default_repo_name = model_name + "-" + datetime.datetime.now().strftime("%Y_%m_%d")
hf_repo_name = get_config_value(config, "hf_repo_name", default_repo_name)


# Step 2: Prepare model config
print("Step 2: Prepare model config")
# 4-bit is checked first, so it wins when both switches are enabled.
if load_in_4bit:
    quantization_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit)
elif load_in_8bit:
    quantization_config = BitsAndBytesConfig(load_in_8bit=load_in_8bit)
else:
    quantization_config = None
# bfloat16 is requested only for the quantized paths; otherwise no dtype is passed.
torch_dtype = None if quantization_config is None else torch.bfloat16
device_map = "auto"

# Step 3: Load the model
print("Step 3: Load the model")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map=device_map,
    quantization_config=quantization_config,
    trust_remote_code=False,
)

# Step 4: Load the dataset
print("Step 4: Load the dataset")
# The "text" builder is pointed at dataset_dir; only the train split is requested.
dataset = load_dataset("text", split="train", data_dir=dataset_dir)
print(dataset)

# # Step 5: Define the training arguments
# print("Step 5: Define the training arguments")
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     per_device_train_batch_size=batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     learning_rate=learning_rate,
#     logging_steps=logging_steps,
#     num_train_epochs=num_train_epochs,
#     max_steps=max_steps,
# )

# # Step 6: Define the LoraConfig, if using PEFT
# print("Step 6: Define the LoraConfig, if using PEFT")
# peft_config = None
# if use_peft:
#     peft_config = LoraConfig(
#         r=peft_lora_r,
#         lora_alpha=peft_lora_alpha,
#         bias="none",
#         task_type="CAUSAL_LM",
#     )

# # Step 7: Define the Trainer and train the model
# print("Step 7: Define the Trainer and train the model")
# trainer = SFTTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset,
#     dataset_text_field="text",
#     peft_config=peft_config,
# )
# trainer.train()

# # Step 8: Save the model locally
# print("Step 8: Save the model locally")
# final_checkpoints_dir = os.path.join(output_dir, "train_checkpoints")
# trainer.model.save_pretrained(final_checkpoints_dir)

# # Free memory for merging weights
# del model
# torch.cuda.empty_cache()
# model = AutoPeftModelForCausalLM.from_pretrained(
#     final_checkpoints_dir, device_map=device_map, torch_dtype=torch.bfloat16)

# # merge lora weights, if using PEFT
# if use_peft:
#     model = model.merge_and_unload()
# output_merged_dir = os.path.join(output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

# # Step 9: Push the model to the Hub
# if hf_token:
#     print("Step 9: Push the model to the Hub")
#     login(token=hf_token)
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     tokenizer.push_to_hub(hf_repo_name)
# model.push_to_hub(hf_repo_name)

# print("Done!")


  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/yousif/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
Step 1: Load arguments
Step 2: Prepare model config
Step 3: Load the model


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Found cached dataset text (/home/yousif/.cache/huggingface/datasets/text/default-3eea2c3487e8f778/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


Step 4: Load the dataset
Dataset({
    features: ['text'],
    num_rows: 101
})


In [8]:
# # read all files in dataset_dir
# import os

# # get all file names
# file_names = os.listdir(dataset_dir)
# # add path to each file name
# file_names = [os.path.join(dataset_dir, file_name) for file_name in file_names]


# dataset = load_dataset("text", data_files=file_names, split="train")
# dataset['text']

Found cached dataset text (/home/yousif/.cache/huggingface/datasets/text/default-7d3605ac53331206/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


['from peft import LoraConfig',
 'from peft import AutoPeftModelForCausalLM',
 'import torch',
 'import os',
 'from datasets import load_dataset',
 'from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, AutoTokenizer',
 '',
 'from trl import SFTTrainer',
 '',
 '',
 '# Define the arguments',
 '',
 'model_name = "distilgpt2"',
 'dataset_name = "timdettmers/openassistant-guanaco"',
 'load_in_8bit = False',
 'load_in_4bit = True',
 'use_peft = True',
 'peft_lora_r = 64',
 'peft_lora_alpha = 16',
 'output_dir = "output"',
 'batch_size = 16',
 'gradient_accumulation_steps = 16',
 'learning_rate = 1.41e-5',
 'logging_steps = 1',
 'num_train_epochs = 3',
 'dataset_text_field = "text"',
 'max_steps = -1',
 '',
 '# Step 1: Load the model',
 'if load_in_8bit and load_in_4bit:',
 '    raise ValueError(',
 '        "You can\'t load the model in 8 bits and 4 bits at the same time")',
 'elif load_in_8bit or load_in_4bit:',
 '    quantization_config = BitsAndBytesConfig

In [8]:

from transformers import AutoModelForCausalLM
import torch 
import datetime
import os


# Checkpoint directory named for the 2023_08_10 run
# (presumably written by an earlier training run — confirm it exists before running).
output_dir = "./output"
final_checkpoints_dir = os.path.join(output_dir, "train_checkpoint-on-2023_08_10")

# Reload the saved weights in bfloat16. Left as the cell's last expression so
# the notebook displays the returned model's repr.
AutoModelForCausalLM.from_pretrained(
    final_checkpoints_dir,
    torch_dtype=torch.bfloat16,
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/yousif/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro