### Bloom 1b1 finetuning with PEFT p_tuning

Load libraries

In [1]:
from datasets import Dataset, load_dataset
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, TrainerCallback

Load the dataset. We use the first 8,000 training examples of the cleaned Alpaca dataset (`yahma/alpaca-cleaned`).

In [2]:
# Fetch only the first 8,000 rows of the train split to keep the run short.
ds = load_dataset(
    "yahma/alpaca-cleaned",
    split="train[:8000]",
)
ds

Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 8000
})

In [3]:
# Peek at the first three records; the slice comes back as a dict mapping each column name to a list of values.
ds[:3]

{'output': ['1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.',
  'The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the add

Pre-process dataset using the tokenizer from the pretrained model  
Load the tokenizer

In [4]:
# Load the tokenizer that matches the base checkpoint, then display it to check its special tokens and padding side.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
tokenizer

BloomTokenizerFast(name_or_path='bigscience/bloom-1b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

Define the data-processing function and map it over the dataset

In [5]:
def process_func(example, max_length=256):
    """Tokenize one instruction record into a causal-LM training example.

    The prompt is built as "Human: <instruction>", a newline and "<input>"
    (surrounding whitespace stripped), followed by a blank line and
    "Assistant: ". The target is "<output>" followed by the tokenizer's EOS
    token. Prompt and target are tokenized separately and concatenated.

    Args:
        example: Mapping with string fields "instruction", "input" and "output".
        max_length: Maximum number of tokens kept per example. Longer sequences
            are cut from the right, which may drop the EOS token. ``None``
            disables truncation.

    Returns:
        dict with "input_ids", "attention_mask" and "labels" lists of equal
        length. Label positions that belong to the prompt are set to -100 so
        that only the response tokens are supervised.
    """
    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    prompt = tokenizer(prompt_text)
    response = tokenizer(example["output"] + tokenizer.eos_token)

    input_ids = prompt["input_ids"] + response["input_ids"]
    attention_mask = prompt["attention_mask"] + response["attention_mask"]
    # Mask the prompt so only the response contributes to the labels.
    labels = [-100] * len(prompt["input_ids"]) + response["input_ids"]

    # Slicing is a no-op for sequences already within the limit, so no length check is needed.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

# Tokenize every record and drop the raw text columns so only the returned fields remain.
tokenized_ds = ds.map(
    process_func,
    remove_columns=ds.column_names,
)
tokenized_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8000
})

Inspect the processed dataset

In [6]:
# Sanity check: decode the full token sequence of the second example (prompt + response + EOS).
tokenizer.decode(tokenized_ds[1]["input_ids"])

'Human: What are the three primary colors?\n\nAssistant: The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB).</s>'

In [7]:
# Decode only the supervised positions: the -100 placeholders that mask the prompt are skipped.
tokenizer.decode([token_id for token_id in tokenized_ds[1]["labels"] if token_id != -100])

'The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB).</s>'

Load the pre-trained model BLOOM-1b1

In [8]:
# Load the base causal-LM checkpoint that will later be wrapped for prompt tuning.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-1b1",
    low_cpu_mem_usage=True,
)
model  # inspect model structure

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1536)
    (word_embeddings_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1536, out_features=4608, bias=True)
          (dense): Linear(in_features=1536, out_features=1536, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  )
  (

Set the configuration for P-tuning using the PEFT library

In [9]:
import peft
from peft import PromptEncoderConfig, TaskType, get_peft_model, PromptEncoderReparameterizationType

# P-tuning setup: 10 virtual tokens whose embeddings are produced by an MLP prompt encoder.
# NOTE(review): the trainable-parameter count reported for the wrapped model (4,213,248)
# equals a 10x1536 embedding plus three Linear layers (1536->1024->1024->1536), so
# encoder_num_layers=5 (and possibly encoder_dropout) appears not to affect the MLP
# variant -- confirm against the installed PEFT version.
config = PromptEncoderConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=10,
    encoder_reparameterization_type=PromptEncoderReparameterizationType.MLP,
    encoder_hidden_size=1024,
    encoder_num_layers=5,
    encoder_dropout=0.1,
)
#config

Create the model for fine-tuning

In [10]:
# Wrap the base model with the prompt-encoder configuration defined above.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell on its own
# passes the already-wrapped model back in -- re-run the model-loading cell first.
model = get_peft_model(model, config)
model



PeftModelForCausalLM(
  (base_model): BloomForCausalLM(
    (transformer): BloomModel(
      (word_embeddings): Embedding(250880, 1536)
      (word_embeddings_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
      (h): ModuleList(
        (0-23): 24 x BloomBlock(
          (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
          (self_attention): BloomAttention(
            (query_key_value): Linear(in_features=1536, out_features=4608, bias=True)
            (dense): Linear(in_features=1536, out_features=1536, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
          (mlp): BloomMLP(
            (dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
            (gelu_impl): BloomGelu()
            (dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
          )
        )
      

In [11]:
# Report the trainable vs. total parameter counts of the wrapped model.
model.print_trainable_parameters()

trainable params: 4,213,248 || all params: 1,069,527,552 || trainable%: 0.39393543365211037


Set trainer configuration arguments

In [12]:

# One sample per forward pass with 8 accumulation steps; on a single device the 8,000 rows
# therefore yield 1,000 optimizer steps for the single epoch.
args = TrainingArguments(
    output_dir="./chatbot",  # checkpoints are written under this folder
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    logging_steps=10,  # report the loss every 10 steps
    save_steps=20,  # write a checkpoint every 20 steps
)

Create trainer

In [13]:
# Assemble the trainer; the collator pads each batch to its longest sequence (padding=True).
trainer = Trainer(
    args=args,
    model=model,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    train_dataset=tokenized_ds,
)

# define a callback function for logging the losses to a text file
class LossLoggingCallback(TrainerCallback):
    """Record every reported training loss in memory and append it to a text file.

    Attributes:
        output_dir: Directory in which ``losses.txt`` is written (opened in append mode).
        losses: Loss values collected so far, in the order they were logged.
    """

    def __init__(self, output_dir):
        """Store the target directory and start with an empty loss history."""
        self.output_dir = output_dir
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Persist the loss from a log event as a "<global_step>: <loss>" line.

        Log events that carry no ``loss`` entry are ignored.
        """
        # `logs` defaults to None, and a membership test on None raises TypeError.
        if not logs or 'loss' not in logs:
            return
        loss = logs['loss']
        self.losses.append(loss)
        with open(f"{self.output_dir}/losses.txt", "a") as f:
            f.write(f"{state.global_step}: {loss}\n")

# Register the loss logger; losses.txt is appended to in the current working directory.
trainer.add_callback(LossLoggingCallback(output_dir="./"))

# The former `dataloader_config = DataLoaderConfiguration(...)` statement was dropped:
# the name is never imported in this notebook (NameError on Run All) and the resulting
# object was not passed to the trainer or used anywhere else.


Training

In [14]:
trainer.train()

  0%|          | 0/1000 [00:00<?, ?it/s]

{'loss': 2.3789, 'grad_norm': 0.9479593634605408, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.01}
{'loss': 2.3549, 'grad_norm': 1.5464972257614136, 'learning_rate': 4.9e-05, 'epoch': 0.02}
{'loss': 2.1671, 'grad_norm': 2.3337299823760986, 'learning_rate': 4.85e-05, 'epoch': 0.03}
{'loss': 2.1884, 'grad_norm': 2.6896445751190186, 'learning_rate': 4.8e-05, 'epoch': 0.04}
{'loss': 2.2034, 'grad_norm': 2.3402836322784424, 'learning_rate': 4.75e-05, 'epoch': 0.05}
{'loss': 1.9327, 'grad_norm': 2.5278422832489014, 'learning_rate': 4.7e-05, 'epoch': 0.06}
{'loss': 1.9221, 'grad_norm': 1.3716206550598145, 'learning_rate': 4.6500000000000005e-05, 'epoch': 0.07}
{'loss': 2.0053, 'grad_norm': 7.409725666046143, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.08}
{'loss': 2.0212, 'grad_norm': 1.6158283948898315, 'learning_rate': 4.55e-05, 'epoch': 0.09}
{'loss': 1.8959, 'grad_norm': 2.48403000831604, 'learning_rate': 4.5e-05, 'epoch': 0.1}
{'loss': 1.7994, 'grad_norm': 1.459035158157348

TrainOutput(global_step=1000, training_loss=1.8794574642181396, metrics={'train_runtime': 1007.4501, 'train_samples_per_second': 7.941, 'train_steps_per_second': 0.993, 'train_loss': 1.8794574642181396, 'epoch': 1.0})

Test finetuned model

In [15]:
# Put every module in inference mode before generating; nothing above switches the
# model out of training mode after trainer.train().
model.eval()
# Sampling (do_sample=True) is stochastic; seed the RNGs so a re-run reproduces the text.
transformers.set_seed(42)

# Build the prompt in the same "Human: ...\n\nAssistant: " layout used by process_func.
prompt = "Human: {}\n{}".format("How to prepare an exam？", "").strip() + "\n\nAssistant: "
# Move the encoded prompt to whichever device the model currently lives on.
ipt = tokenizer(prompt, return_tensors="pt").to(model.device)
print(ipt)
generated = model.generate(**ipt, max_length=256, do_sample=True)
print(tokenizer.decode(generated[0], skip_special_tokens=True))

{'input_ids': tensor([[114330,     29,   7535,    427,  52615,    660,  19728,   2498,    603,
           9096,  61339,     29,    210]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
Human: How to prepare an exam？

Assistant: Prepare for an exam using the following exam preparations. Take all the practice subjects you need so that you can practice it consistently. Consider the exam series you took; study topics in accordance with its subjects; and prepare for the exam in the most effective way possible.

There are many exam preparation resources for the preparation of any exam. For exam preparation, I suggest you access the following resources:
