# Fine Tune Bloomz with Prompt Tuning


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Installations

In [2]:
!pip install -q peft==0.4.0
!pip install -q transformers
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Imports

In [3]:
import os
import time


from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit, PeftModel, PeftConfig

## 3. Load data
We are going to fine-tune our model on a dataset called Abirate/english_quotes containing exclusively inspirational English quotes, with the hopes of using the fine-tuned version to generate more quotes later!

In [4]:
data = load_dataset("Abirate/english_quotes", split="train")
print(data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['quote', 'author', 'tags'],
    num_rows: 2508
})


In [5]:
data[0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator']}

## 4. Model
We’ll be using bloomz model for our foundation causal LM for generating text. This model is trained on multi-lingual dataset.

In [6]:
model_name = "bigscience/bloomz-560m"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda:0")

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Before doing any fine-tuning, we will ask the model to generate a new phrase to the following input sentence.

In [51]:
i = tokenizer("Motivation is ", return_tensors="pt", padding=True)

# Generate text using the pre-trained foundation model based on the provided input_ids and attention_mask.
outputs = model.generate(
    input_ids=i["input_ids"].to("cuda:0"),
    attention_mask=i["attention_mask"].to("cuda:0"),
    max_new_tokens=10,
    eos_token_id=tokenizer.eos_token_id
)

decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(decoded_output)

['Motivation is  a key factor in the success of a business.']


Output is not that good. So, we going to fine tune the model over english quotes dataset.

## 5. Data Preparation

In [9]:
# Tokenize the quotes in the dataset using the specified tokenizer
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
data = data.select(range(100)) # we are only using 100 samples for the shake of simplicity.

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [10]:
data[4]

{'quote': '“A room without books is like a body without a soul.”',
 'author': 'Marcus Tullius Cicero',
 'tags': ['books', 'simile', 'soul'],
 'input_ids': [119533,
  22630,
  7160,
  38695,
  632,
  3269,
  267,
  12364,
  7160,
  267,
  23037,
  17,
  982],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
data

Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 100
})

## 6. Prompt-Tuning
Prompt tuning allows both random and initialization of soft prompts or also known as virtual tokens.Now, we will start with random initialization, where all we provide is the length of the virtual prompt.

In [12]:
# Create a configuration for prompt tuning using the PromptTuningConfig class
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.RANDOM,
    num_virtual_tokens= 10,
    tokenizer_name_or_path=model_name
)

# Get a PeftModel using the specified foundation_model and prompt tuning configuration
peft_model = get_peft_model(model, peft_config)

# Print the trainable parameters of the PeftModel
print(peft_model.print_trainable_parameters())

trainable params: 10,240 || all params: 559,224,832 || trainable%: 0.0018311060979495275
None


PEFT allows us to drastically reduce the number of trainable parameters. Now, we can proceed with using TrainingArguments to define our fine-tuning configurations.

In [13]:
# Define the output directory for storing Peft model outputs
output_directory = os.path.join("/content/working_dir", "peft_outputs")

# Create the working directory if it doesn't exist
if not os.path.exists("/content/working_dir"):
    os.mkdir("/content/working_dir")

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.mkdir(output_directory)

# Define training arguments for the Peft model
training_args = TrainingArguments(
    num_train_epochs=50,
    auto_find_batch_size=True,
    output_dir=output_directory,
    learning_rate=3e-2,
    logging_strategy="epoch"
)

Specifically, we will be using DataCollatorForLanguageModeling which will additionally pad the inputs to the maximum length of a batch since the inputs can have variable lengths.

In [14]:
# Enable gradient checkpointing in the Peft model's configuration
peft_model.config.gradient_checkpointing = True

# Create a Trainer instance for training the Peft model
trainer = Trainer(
    model=peft_model,  # We pass in the PEFT version of the foundation model, bloomz-560M
    args=training_args,  # Training arguments specifying output directory, GPU usage, batch size, etc.
    train_dataset=data,  # Training dataset
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)  # mlm=False indicates not to use masked language modeling
)

# Start the training process
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Step,Training Loss
25,3.0937
50,2.9876
75,2.9412
100,2.8698
125,2.8783
150,2.8473
175,2.8635
200,2.7918
225,2.8212
250,2.7471


TrainOutput(global_step=1250, training_loss=2.690864923095703, metrics={'train_runtime': 368.3358, 'train_samples_per_second': 13.575, 'train_steps_per_second': 3.394, 'total_flos': 692924244393984.0, 'train_loss': 2.690864923095703, 'epoch': 50.0})

Now we can simply use SFTTrainer which is provided by trl from HuggingFace to start the training.

## 5. Save the model

In [15]:
# Record the current time for creating a unique Peft model path
time_now = time.time()

# Create a path for saving the Peft model using the output directory and timestamp
peft_model_path = os.path.join(output_directory, f"bloomz_{time_now}")

# Save the trained Peft model to the specified path
trainer.model.save_pretrained(peft_model_path)

## 6. Inference
You can load the model from the path that you have saved to before, and ask the model to generate text based on our input before!

In [16]:
# Load the trained Peft model from the specified path using the PeftModel class
loaded_model = PeftModel.from_pretrained(
    model,  # The base model to be used for prompt tuning
    peft_model_path,   # The path where the trained Peft model is saved
    is_trainable=False  # Indicates that the loaded model should not be trainable
)

Let’s generate the output of the loaded model.

In [55]:
# Generate text using the loaded Peft model based on the provided input_ids and attention_mask.
loaded_model_outputs = loaded_model.generate(
    input_ids=i["input_ids"].to("cuda:0"),
    attention_mask=i["attention_mask"].to("cuda:0"),
    max_new_tokens=9,
    eos_token_id=tokenizer.eos_token_id
)

# Decode the generated token IDs into human-readable text.
decoded_output = tokenizer.batch_decode(loaded_model_outputs, skip_special_tokens=True)

# Print the decoded output, which represents the generated text.
print(decoded_output)

['Motivation is  one of the most important things in life.']


Let’s compare our fine tuned randomly initialized model with the text initialization method.

Notice that all we are changing is the prompt_tuning_init setting and we are also providing a concise text prompt.

In [22]:
# Create a configuration for text-based prompt tuning using the PromptTuningConfig class
text_peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Generate inspirational quotes",  # Provides a starter for the model to begin searching for the best embeddings
    num_virtual_tokens=10,
    tokenizer_name_or_path=model_name
)

model = AutoModelForCausalLM.from_pretrained(model_name)

In [23]:
# Get a PeftModel using the specified foundation_model and text-based prompt tuning configuration
text_peft_model = get_peft_model(model, text_peft_config)

# Print the trainable parameters of the Text-based PeftModel
print(text_peft_model.print_trainable_parameters())

trainable params: 10,240 || all params: 559,224,832 || trainable%: 0.0018311060979495275
None


In [24]:
# Create a Trainer instance for training the Peft model
text_trainer = Trainer(
    model=text_peft_model,  # We pass in the PEFT version of the foundation model, bloomz-560M
    args=training_args,  # Training arguments specifying output directory, GPU usage, batch size, etc.
    train_dataset=data,  # Training dataset
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)  # mlm=False indicates not to use masked language modeling
)

# Start the training process
text_trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


Step,Training Loss


Step,Training Loss
50,2.8596
100,2.804
150,2.7724
200,2.7252
250,2.7469
300,2.6633
350,2.694
400,2.6431
450,2.6426
500,2.6079


Checkpoint destination directory /content/working_dir/peft_outputs/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/working_dir/peft_outputs/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2500, training_loss=2.4129670806884764, metrics={'train_runtime': 319.1977, 'train_samples_per_second': 15.664, 'train_steps_per_second': 7.832, 'total_flos': 470400971685888.0, 'train_loss': 2.4129670806884764, 'epoch': 50.0})

Then we’ll perform the same as we did perform for our comparison by saving, loading and generating output from the text initialization model.

In [67]:
# Generate output
i = tokenizer("Give motivation ", return_tensors="pt", padding=True)
text_outputs = text_peft_model.generate(
    input_ids=i["input_ids"].to("cuda:0"),
    attention_mask=i["attention_mask"].to("cuda:0"),
    max_new_tokens=15,
    eos_token_id=tokenizer.eos_token_id
)

print(tokenizer.batch_decode(text_outputs, skip_special_tokens=True))

['Give motivation 伦·茨威格, "Reading is the best way to learn."']


## 7. Conclusion
This notebook was made for the purpose of education and learning, that's why we taken only 100 samples. To get better result, you have to use all the dataset with different parameters. May:

- Use small learning rate.
- Use large epoch size.
- Use large size