# Fine Tune T5 with Prefix Tuning on Sentiment Analysis
Prefix tuning is an additive method where only a sequence of continuous task-specific vectors is attached to the beginning of the input, or prefix. Only the prefix parameters are optimized and added to the hidden states in every layer of the model. The tokens of the input sequence can still attend to the prefix as virtual tokens. As a result, prefix tuning stores 1000x fewer parameters than a fully finetuned model, which means you can use one large language model for many tasks.


## 1. Installations

In [1]:
!pip install -q peft transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

## 2. Imports

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

import torch
import os

In [3]:
# The model weights and the tokenizer come from the same checkpoint.
model_name_or_path = tokenizer_name_or_path = "t5-large"

## 3. Load data
We are going to fine-tune our model on the sentences_allagree subset of the financial_phrasebank dataset. This dataset contains financial news categorized by sentiment.

In [4]:
# The dataset ships with a single "train" split, so carve a 10% hold-out
# set out of it ourselves.
dataset = load_dataset("financial_phrasebank", "sentences_allagree")

# Fix the seed so the split (and therefore every metric reported further
# down) is the same on each Restart & Run All.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# Rename the hold-out split from "test" to "validation".
dataset["validation"] = dataset.pop("test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/171k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [5]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2037
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 227
    })
})

In [6]:
# Class names indexed by the integer values stored in the "label" column.
classes = dataset["train"].features["label"].names


def add_text_label(batch):
    """Return a `text_label` column holding the class name of every label id in the batch."""
    return {"text_label": [classes[label_id] for label_id in batch["label"]]}


dataset = dataset.map(add_text_label, batched=True, num_proc=1)
dataset

Map:   0%|          | 0/2037 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'text_label'],
        num_rows: 2037
    })
    validation: Dataset({
        features: ['sentence', 'label', 'text_label'],
        num_rows: 227
    })
})

In [7]:
# Look at one training example, including the newly added text_label field.
dataset["train"][0]

{'sentence': 'Work on the assignment has already started and is due for completion in spring 2011 .',
 'label': 1,
 'text_label': 'neutral'}

## 4. Preprocess Dataset

In [8]:
# Load the tokenizer from the dedicated tokenizer setting (previously the
# model name was used here and `tokenizer_name_or_path` was never read).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [9]:
text_column = "sentence"
label_column = "text_label"
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of sentences and their textual labels.

    Args:
        examples: A batch from the dataset; the `text_column` and
            `label_column` entries are read from it.

    Returns:
        The tokenizer output for the sentences (padded/truncated to
        `max_length` tokens) with an extra "labels" entry holding the
        tokenized targets (padded/truncated to 2 tokens), in which every
        padding id has been replaced by -100.
    """
    shared_kwargs = dict(padding="max_length", truncation=True, return_tensors="pt")

    model_inputs = tokenizer(examples[text_column], max_length=max_length, **shared_kwargs)
    label_ids = tokenizer(examples[label_column], max_length=2, **shared_kwargs)["input_ids"]

    # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
    # padded label positions do not contribute to the loss.
    model_inputs["labels"] = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)
    return model_inputs

In [10]:
# The raw columns are dropped after tokenization so that only the
# tokenizer's outputs (plus "labels") remain in the processed splits.
raw_columns = dataset["train"].column_names

# Caching is bypassed, so the tokenizer is re-run on every execution.
processed_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=raw_columns,
    num_proc=1,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/2037 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/227 [00:00<?, ? examples/s]

In [11]:
# Display one tokenized training example to sanity-check the tokenizer output.
processed_datasets["train"][2]

{'input_ids': [26861,
  3159,
  4532,
  2861,
  3,
  1454,
  13,
  48,
  349,
  3,
  6,
  8,
  4080,
  4776,
  271,
  4157,
  57,
  8,
  349,
  3,
  31,
  7,
  843,
  1652,
  3,
  5,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

Create a DataLoader from the train and validation datasets. Set pin_memory=True to speed up the data transfer to the GPU during training if the samples in your dataset are on a CPU.

In [12]:
batch_size = 16

train_dataset = processed_datasets["train"]
val_dataset = processed_datasets["validation"]

# Settings shared by the training and the evaluation loader.
loader_kwargs = dict(
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

# Only the training data is shuffled; the evaluation loader keeps the
# dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(val_dataset, **loader_kwargs)

## 4. Model
We'll use the T5-large model because it is a sequence-to-sequence model, which lets us generate the sentiment label as text.

In [13]:
# Prefix-tuning setup for a seq2seq model, with 20 virtual tokens.
peft_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    num_virtual_tokens=20,
)

# Load the base checkpoint and wrap it with the PEFT config.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 983,040 || all params: 738,651,136 || trainable%: 0.13308583065659835


In [19]:
lr = 1e-2
num_epochs = 10

# The schedule spans every batch of every epoch (the training loop steps
# the scheduler once per batch), decaying linearly with no warmup.
total_training_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

## 5. Training

In [20]:
# Train on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

num_train_batches = len(train_dataloader)
num_eval_batches = len(eval_dataloader)

for epoch in range(num_epochs):
    # --- training pass: one optimizer and scheduler step per batch ---
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # --- evaluation pass: no gradients, collect loss and decoded predictions ---
    model.eval()
    eval_loss = 0
    eval_preds = []
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            eval_loss += outputs.loss.detach().float()
            # Highest-scoring token id at each label position, decoded to text.
            predicted_ids = torch.argmax(outputs.logits, -1).detach().cpu().numpy()
            eval_preds.extend(tokenizer.batch_decode(predicted_ids, skip_special_tokens=True))

    # Mean loss per batch, and its exponential reported as perplexity.
    train_epoch_loss = total_loss / num_train_batches
    train_ppl = torch.exp(train_epoch_loss)
    eval_epoch_loss = eval_loss / num_eval_batches
    eval_ppl = torch.exp(eval_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 128/128 [01:04<00:00,  1.98it/s]
100%|██████████| 15/15 [00:06<00:00,  2.32it/s]


epoch=0: train_ppl=tensor(1.0874, device='cuda:0') train_epoch_loss=tensor(0.0838, device='cuda:0') eval_ppl=tensor(1.0645, device='cuda:0') eval_epoch_loss=tensor(0.0625, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.02it/s]
100%|██████████| 15/15 [00:06<00:00,  2.31it/s]


epoch=1: train_ppl=tensor(1.0695, device='cuda:0') train_epoch_loss=tensor(0.0672, device='cuda:0') eval_ppl=tensor(1.0685, device='cuda:0') eval_epoch_loss=tensor(0.0662, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.02it/s]
100%|██████████| 15/15 [00:06<00:00,  2.32it/s]


epoch=2: train_ppl=tensor(1.0651, device='cuda:0') train_epoch_loss=tensor(0.0630, device='cuda:0') eval_ppl=tensor(1.0504, device='cuda:0') eval_epoch_loss=tensor(0.0492, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.02it/s]
100%|██████████| 15/15 [00:06<00:00,  2.31it/s]


epoch=3: train_ppl=tensor(1.0592, device='cuda:0') train_epoch_loss=tensor(0.0575, device='cuda:0') eval_ppl=tensor(1.0548, device='cuda:0') eval_epoch_loss=tensor(0.0533, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.01it/s]
100%|██████████| 15/15 [00:06<00:00,  2.31it/s]


epoch=4: train_ppl=tensor(1.0465, device='cuda:0') train_epoch_loss=tensor(0.0455, device='cuda:0') eval_ppl=tensor(1.0559, device='cuda:0') eval_epoch_loss=tensor(0.0544, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.01it/s]
100%|██████████| 15/15 [00:06<00:00,  2.31it/s]


epoch=5: train_ppl=tensor(1.0420, device='cuda:0') train_epoch_loss=tensor(0.0412, device='cuda:0') eval_ppl=tensor(1.0521, device='cuda:0') eval_epoch_loss=tensor(0.0507, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.01it/s]
100%|██████████| 15/15 [00:06<00:00,  2.32it/s]


epoch=6: train_ppl=tensor(1.0403, device='cuda:0') train_epoch_loss=tensor(0.0395, device='cuda:0') eval_ppl=tensor(1.0511, device='cuda:0') eval_epoch_loss=tensor(0.0498, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.02it/s]
100%|██████████| 15/15 [00:06<00:00,  2.32it/s]


epoch=7: train_ppl=tensor(1.0419, device='cuda:0') train_epoch_loss=tensor(0.0410, device='cuda:0') eval_ppl=tensor(1.0510, device='cuda:0') eval_epoch_loss=tensor(0.0497, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.01it/s]
100%|██████████| 15/15 [00:06<00:00,  2.31it/s]


epoch=8: train_ppl=tensor(1.0381, device='cuda:0') train_epoch_loss=tensor(0.0373, device='cuda:0') eval_ppl=tensor(1.0451, device='cuda:0') eval_epoch_loss=tensor(0.0441, device='cuda:0')


100%|██████████| 128/128 [01:03<00:00,  2.01it/s]
100%|██████████| 15/15 [00:06<00:00,  2.31it/s]

epoch=9: train_ppl=tensor(1.0395, device='cuda:0') train_epoch_loss=tensor(0.0388, device='cuda:0') eval_ppl=tensor(1.0494, device='cuda:0') eval_epoch_loss=tensor(0.0482, device='cuda:0')





## 6. Model Evaluation

In [21]:
# Compare the predictions collected during the last evaluation pass with the
# reference labels, ignoring surrounding whitespace.
true_labels = dataset["validation"]["text_label"]
matches = [pred.strip() == true.strip() for pred, true in zip(eval_preds, true_labels)]

correct = sum(matches)
total = len(matches)
accuracy = correct / total * 100

print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['validation']['text_label'][:10]=}")

accuracy=96.91629955947137 % on the evaluation dataset
eval_preds[:10]=['positive', 'positive', 'positive', 'neutral', 'neutral', 'negative', 'positive', 'neutral', 'neutral', 'negative']
dataset['validation']['text_label'][:10]=['positive', 'positive', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'negative']


## 7. Share Model

In [22]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; the upload in the next cell relies on
# being authenticated.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
peft_model_id = "surajkarki/t5-large_prefix_tuning"

# Reuse the id defined above instead of repeating the literal, and pass
# `token=True` (the replacement for the deprecated `use_auth_token`) so the
# token stored by notebook_login() is used for the upload.
model.push_to_hub(peft_model_id, token=True)



README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/surajkarki/t5-large_prefix_tuning/commit/8cc6e940d23c93557fba7722b304c74661674f06', commit_message='Upload model', commit_description='', oid='8cc6e940d23c93557fba7722b304c74661674f06', pr_url=None, pr_revision=None, pr_num=None)

## 8. Inference

In [25]:
from peft import PeftConfig, PeftModel

peft_model_id = "surajkarki/t5-large_prefix_tuning"

# The adapter config records which base checkpoint the adapter belongs to.
config = PeftConfig.from_pretrained(peft_model_id)

# Load that base checkpoint and attach the adapter weights from the Hub.
# Note: this rebinds `model`, replacing the model trained above.
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path),
    peft_model_id,
)

adapter_config.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

In [34]:
def predictions(inputs):
    """Generate a sentiment label for each input sentence.

    Uses the notebook-level `model`, `tokenizer` and `device`. The model is
    moved to `device` and put in eval mode as a side effect.

    Args:
        inputs: A single sentence (str) or an iterable of sentences.

    Returns:
        dict mapping each input sentence to the decoded generated text.
        Duplicate sentences collapse into a single entry.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(inputs, str):
        inputs = [inputs]

    # Loop-invariant setup: do it once instead of on every sentence.
    model.to(device)
    model.eval()

    results = {}
    with torch.no_grad():
        for text in inputs:
            input_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(device)
            generated = model.generate(input_ids=input_ids, max_new_tokens=10)
            # One sentence in, one sequence out: take the first decoded string.
            results[text] = tokenizer.batch_decode(
                generated.detach().cpu().numpy(), skip_special_tokens=True
            )[0]

    return results

In [38]:
# Three sample financial-news sentences to run through the reloaded model.
news = [
    "The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .",
    "One of the challenges in the oil production in the North Sea is scale formation that can plug pipelines and halt production .",
    "Pharmaceuticals group Orion Corp reported a fall in its third-quarter earnings that were hit by larger expenditures on R&D and marketing ."
]

# `result` maps each sentence to the label text generated for it.
result = predictions(news)
print(result)

{"The Lithuanian beer market made up 14.41 million liters in January , a rise of 0.8 percent from the year-earlier figure , the Lithuanian Brewers ' Association reporting citing the results from its members .": 'positive', 'One of the challenges in the oil production in the North Sea is scale formation that can plug pipelines and halt production .': 'negative', 'Pharmaceuticals group Orion Corp reported a fall in its third-quarter earnings that were hit by larger expenditures on R&D and marketing .': 'negative'}
