# Step 1: Clone the PEFT library and install dependencies


In [30]:
!pip install datasets
!git clone https://github.com/tsachiblau/peft_CPT.git peft
!pip install -e ./peft

import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(restart=True)


fatal: destination path 'peft' already exists and is not an empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/bin/bash: pip: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'status': 'ok', 'restart': True}

# Step 2: Import libraries

In [1]:
import torch
from transformers import AutoModelForCausalLM
from peft import CPTConfig, get_peft_model
from torch.utils.data import Dataset
from typing import List, Union, Any, Dict
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset

# Default value of the `max_length` argument of CPTDataset.
MAX_INPUT_LENGTH = 1024
# Hub checkpoint id used to load the tokenizer and passed on to CPTConfig.
tokenizer_name_or_path = 'bigscience/bloom-1b7'

# Step 3: Load the Tokenizer

In [4]:
# Load the tokenizer of the checkpoint named by `tokenizer_name_or_path`.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name_or_path,
    cache_dir='.',  # downloaded files are cached in the current working directory
    padding_side='right',
    # NOTE(review): permits running code shipped with the checkpoint repo — confirm it is required here
    trust_remote_code=True
)


# Step 4: Load the Dataset and Add String Labels

In [5]:

# Load the 'sst2' configuration of the 'glue' dataset; its 'train' and
# 'validation' splits are used below.
dataset = load_dataset('glue', 'sst2')

def add_string_labels(example):
    """Add a textual sentiment label to a single example.

    Args:
        example: mutable mapping that contains an integer-like 'label' entry.

    Returns:
        The same mapping object, with 'label_text' set to "positive" when
        'label' equals 1 and to "negative" for every other value.
    """
    if example['label'] == 1:
        label_text = "positive"
    else:
        label_text = "negative"
    example['label_text'] = label_text
    return example

# Keep only the first 4 training examples and the first 20 validation
# examples, and add the 'label_text' field to each of them.
train_dataset = dataset['train'].select(range(4)).map(add_string_labels)
test_dataset = dataset['validation'].select(range(20)).map(add_string_labels)


# Step 5: Define the CPT Dataset Class


In [6]:
class CPTDataset(Dataset):
    """Pre-tokenized dataset of templated (sentence, label) examples.

    Every sample is laid out as

        input template | intra separator | output template | EOS

    and stored as three parallel lists of equal length: token ids, an
    all-ones attention mask and a per-token type mask.

    Type-mask codes:
        0 - intra separator and EOS token
        1 - literal text of the input template
        2 - the input sentence
        3 - literal text of the output template
        4 - the label text

    Args:
        samples: iterable of mappings with 'sentence' and 'label_text' keys.
        tokenizer: callable returning a mapping with an "input_ids" entry;
            must expose `eos_token_id`.
        template: dict with the keys 'input' and 'output' (each must contain
            exactly one '{}' placeholder), 'intra_seperator' and
            'inter_seperator'.
        max_length: stored on the instance; this class does not truncate or
            pad sequences to it.
    """

    def __init__(self, samples, tokenizer, template, max_length=MAX_INPUT_LENGTH):
        self.template = template
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.attention_mask = []
        self.input_ids = []
        self.input_type_mask = []
        # Tokenized separator between samples; kept for users of the dataset,
        # it is not inserted into the sequences built here.
        self.inter_seperator_ids = self._get_input_ids(template['inter_seperator'])

        for record in tqdm(samples):
            sentence, label_text = record['sentence'], record['label_text']
            ids, attention, token_types = self.preprocess_sentence(sentence, label_text)

            self.input_ids.append(ids)
            self.attention_mask.append(attention)
            self.input_type_mask.append(token_types)

    def _get_input_ids(self, text):
        """Tokenize `text` without special tokens and return its "input_ids"."""
        encoding = self.tokenizer(text, add_special_tokens=False)
        return encoding["input_ids"]

    def preprocess_sentence(self, input_text, label):
        """Tokenize one example and tag every token with its type code.

        Args:
            input_text: the raw sentence.
            label: the label rendered as text.

        Returns:
            Tuple `(input_ids, attention_mask, input_type_mask)` of three
            equally long lists.

        Raises:
            ValueError: if the 'input' or 'output' template does not split
                into exactly two parts around '{}'.
        """
        encode = self._get_input_ids

        # Each segment is (token ids, type code); concatenating them in order
        # yields the final sequence and its type mask.
        input_prefix, input_suffix = self.template['input'].split('{}')
        segments = [
            (encode(input_prefix), 1),
            (encode(input_text), 2),
            (encode(input_suffix), 1),
            (encode(self.template['intra_seperator']), 0),
        ]

        output_prefix, output_suffix = self.template['output'].split('{}')
        segments.extend([
            (encode(output_prefix), 3),
            (encode(label), 4),
            (encode(output_suffix), 3),
        ])

        # Terminate with EOS only when the tokenizer defines one.
        eos_id = self.tokenizer.eos_token_id
        segments.append(([eos_id] if eos_id is not None else [], 0))

        input_ids = []
        input_type_mask = []
        for token_ids, type_code in segments:
            input_ids = input_ids + token_ids
            input_type_mask = input_type_mask + [type_code] * len(token_ids)

        # No padding is produced here, so every position is attended to.
        attention_mask = [1] * len(input_ids)

        assert len(input_type_mask) == len(input_ids) == len(attention_mask)

        return input_ids, attention_mask, input_type_mask

    def __len__(self):
        """Number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the token ids, attention mask and type mask of sample `idx`."""
        fields = ("input_ids", "attention_mask", "input_type_mask")
        return {field: getattr(self, field)[idx] for field in fields}


# Prompt layout consumed by CPTDataset:
#   'input' / 'output'  - templates split around '{}'; the sentence / label
#                         text is tokenized separately and placed in the gap
#   'intra_seperator'   - tokenized between the input part and the output part
#   'inter_seperator'   - tokenized once and stored on the dataset instance
# NOTE: the key spelling ("seperator") is what CPTDataset looks up; keep it.
templates = {
    'input': 'input: {}',
    'intra_seperator': ' ',
    'output': 'output: {}',
    'inter_seperator': '\n'
}

# Tokenize the selected training examples with the templates above.
CPT_train_dataset = CPTDataset(train_dataset, tokenizer, templates)

100%|██████████| 4/4 [00:00<00:00, 728.56it/s]


# Step 6: Create Context

In [7]:
# Concatenate all training samples into a single context sequence.
context_ids = []
context_attention_mask = []
context_input_type_mask = []
# Offset added to the non-zero type codes of the current sample; it grows by
# 4 per sample so that every sample gets its own range of type codes, while
# code 0 is left unchanged.
first_type_mask = 0

for i in range(len(CPT_train_dataset)):
    sample = CPT_train_dataset[i]
    context_ids.extend(sample['input_ids'])
    context_attention_mask.extend(sample['attention_mask'])
    context_input_type_mask.extend(
        type_code + first_type_mask if type_code > 0 else 0
        for type_code in sample['input_type_mask']
    )
    first_type_mask += 4

print(len(context_ids), context_ids)
print(len(context_attention_mask), context_attention_mask)
print(len(context_input_type_mask), context_input_type_mask)


80 [8684, 29, 210, 41587, 2084, 9999, 940, 1485, 368, 123643, 32643, 210, 210, 19308, 29, 210, 111017, 2, 8684, 29, 210, 43453, 654, 40501, 630, 3804, 12378, 376, 380, 13430, 210, 210, 19308, 29, 210, 111017, 2, 8684, 29, 210, 19562, 141046, 3776, 26702, 530, 14016, 2893, 7747, 17303, 40704, 3638, 7384, 9670, 210, 210, 19308, 29, 210, 96675, 2, 8684, 29, 210, 3842, 7849, 105708, 999, 85816, 427, 17398, 368, 5025, 36387, 210, 210, 19308, 29, 210, 111017, 2]
80 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
80 [1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 3, 3, 3, 4, 0, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 7, 7, 7, 8, 0, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 0, 11, 11, 11, 12, 0, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 15, 15, 15, 16, 0]


# Step 7: Load Base Model and Configure CPT

In [8]:
# Load the base causal LM from the same checkpoint id as the tokenizer, so the
# two cannot drift apart when the checkpoint is changed in the config cell.
base_model = AutoModelForCausalLM.from_pretrained(
    tokenizer_name_or_path,
    cache_dir='.',  # downloaded weights are cached in the current working directory
    torch_dtype=torch.float16,
    device_map='auto',
    trust_remote_code=True,
    local_files_only=False,
)

# CPT configuration: the concatenated training samples built in Step 6 are
# passed as the context, one virtual token per context token.
# NOTE(review): the meaning of the `opt_*` hyperparameters is defined by the
# CPT implementation in the cloned PEFT fork — confirm the values there.
config = CPTConfig(
    CPT_token_ids=context_ids,
    CPT_mask=context_attention_mask,
    CPT_tokens_type_mask=context_input_type_mask,
    CPT_prompt_tuning_init="TEXT",
    num_virtual_tokens=len(context_ids),

    opt_weighted_loss_type='decay',
    opt_loss_decay_factor=0.95,
    opt_projection_epsilon=0.2,
    opt_projection_format_epsilon=0.1,

    tokenizer_name_or_path=tokenizer_name_or_path,
)

# Wrap the base model with the CPT adapter described by `config`.
model = get_peft_model(base_model, config)

# Step 8: Configuring Collate Function

In [9]:
class CPTDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    """Collator that right-pads CPT samples with zeros and stacks them.

    Each example is a dict with the list-valued keys "input_ids",
    "attention_mask" and "input_type_mask", and optionally "sample_mask".
    The returned batch holds those three fields as tensors plus "labels"
    (a copy of "input_ids"); when `training` is False it also holds a
    zero-padded "sample_mask" tensor.

    Args:
        tokenizer: tokenizer forwarded to the parent collator.
        training: when False, "sample_mask" is included in the batch.
        mlm: forwarded to the parent collator.
    """

    def __init__(self, tokenizer, training=True, mlm=False):
        super().__init__(tokenizer, mlm=mlm)
        self.training = training
        # NOTE(review): the original author flagged this call as unexplained
        # ("mk check why needed"); torch_call below pads with the literal 0,
        # not with the pad token id — confirm whether it is still required.
        self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Pad the examples to a common length and convert them to tensors.

        Note that "sample_mask" entries are removed from the example dicts
        that are passed in.
        """
        # Collected in example order; examples without the key are skipped.
        sample_masks = [ex.pop("sample_mask") for ex in examples if "sample_mask" in ex.keys()]

        target_len = max(len(ex["input_ids"]) for ex in examples)

        def right_pad(values, fill=0):
            return values + [fill] * (target_len - len(values))

        batch = {
            field: torch.tensor([right_pad(ex[field]) for ex in examples])
            for field in ("input_ids", "attention_mask", "input_type_mask")
        }

        # Zero tensor shaped like the padded ids; row r is overwritten with
        # the r-th collected sample mask.
        padded_sample_mask = torch.zeros_like(batch["input_ids"].long())
        for row, row_mask in enumerate(sample_masks):
            padded_sample_mask[row, : len(row_mask)] = row_mask

        batch["labels"] = batch["input_ids"].clone()
        if not self.training:
            batch["sample_mask"] = padded_sample_mask

        return batch



# Step 9: Training

In [10]:
# Training configuration: batch size 1 for 25 epochs, logging every step,
# mixed-precision (fp16) training and no checkpoint saving.
training_args = TrainingArguments(
    output_dir='../.',  # Where the model predictions and checkpoints will be written
    use_cpu=False,  # Do not force execution on the CPU
    auto_find_batch_size=False,  # Use the fixed batch size below instead of searching for one automatically
    learning_rate=1e-4,  # Higher learning rate than full Fine-Tuning
    logging_steps=1,
    per_device_train_batch_size=1,
    save_total_limit=1,
    # NOTE(review): presumably needed so that 'input_type_mask' is not dropped before it reaches the model — confirm
    remove_unused_columns=False,
    num_train_epochs=25,
    fp16=True,
    save_strategy='no'
)

trainer = Trainer(
    model=model,  # The PEFT-wrapped model returned by get_peft_model (base checkpoint: bigscience/bloom-1b7)
    args=training_args,  # The args for the training.
    train_dataset=CPT_train_dataset,  # The dataset used to train the model.
    data_collator=CPTDataCollatorForLanguageModeling(tokenizer, training=True, mlm=False)
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


TrainOutput(global_step=100, training_loss=0.6106603369908408, metrics={'train_runtime': 12.1518, 'train_samples_per_second': 8.229, 'train_steps_per_second': 8.229, 'total_flos': 14503280640000.0, 'train_loss': 0.6106603369908408, 'epoch': 25.0})

# Step 10: Evaluate the Model


In [11]:
# Evaluate on the first test examples: for each one, compare the logits of the
# candidate label tokens at the first label position and print the prediction
# next to the ground truth.
model.eval()
test_dataset = test_dataset.select_columns(['sentence', 'label_text'])
CPT_test_dataset = CPTDataset(test_dataset, tokenizer, templates)
device = model.device

# Token ids of the candidate answers, in the order [negative, positive].
# They do not change between samples, so tokenize them once up front.
# Assumes each candidate is a single token for this tokenizer (the nested
# list must be rectangular to become a tensor).
all_labels = torch.tensor(
    [tokenizer(label_name, add_special_tokens=False)["input_ids"] for label_name in ['negative', 'positive']],
    dtype=torch.long,
    device=device,
).view(-1,)

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for i in range(min(10, len(CPT_test_dataset))):
        sample = CPT_test_dataset[i]
        input_ids, input_type_mask = sample['input_ids'], sample['input_type_mask']

        ids_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).view(1, -1)
        type_mask_tensor = torch.tensor(input_type_mask, dtype=torch.long, device=device).view(1, -1)

        # Hand the model its own copies of the labels and the type mask, so the
        # tensors reused below stay untouched even if the forward pass
        # modifies its inputs in place.
        outputs = model(
            input_ids=ids_tensor,
            labels=ids_tensor.clone(),
            input_type_mask=type_mask_tensor.clone()
        )

        # Keep only the logits of the last len(input_ids) - 1 positions and
        # align them with the next-token targets.
        shifted_logits = outputs.logits[..., :-1, :].contiguous().to(model.dtype)[0, -len(input_ids) + 1:]
        shift_labels = ids_tensor[0, 1:].contiguous()
        shifted_input_type_mask = type_mask_tensor[..., 1:].contiguous()

        # Type code 4 marks the label tokens.
        mask = shifted_input_type_mask.view(-1,) == 4
        logit = shifted_logits[mask]
        label = shift_labels[mask]

        # Index 0 -> 'negative', index 1 -> 'positive' (order of `all_labels`).
        prediction = logit[0, all_labels].argmax()
        prediction_text = 'negative' if prediction == 0 else 'positive'
        print('Sentence: {} \n \t The prediction is: {}\n \t The GT is {}'.format(tokenizer.decode(input_ids), prediction_text, tokenizer.decode(label)))


100%|██████████| 20/20 [00:00<00:00, 971.09it/s]
Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Sentence: input: it 's a charming and often affecting journey .  output: positive</s> 
 	 The prediction is: positive
 	 The GT is positive
Sentence: input: unflinchingly bleak and desperate  output: negative</s> 
 	 The prediction is: negative
 	 The GT is negative
Sentence: input: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .  output: positive</s> 
 	 The prediction is: positive
 	 The GT is positive
Sentence: input: the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales .  output: positive</s> 
 	 The prediction is: positive
 	 The GT is positive
Sentence: input: it 's slow -- very , very slow .  output: negative</s> 
 	 The prediction is: negative
 	 The GT is negative
Sentence: input: although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women .  output: positive</s> 
 	 The prediction is: negative
 	 The GT is p