In [6]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [7]:
from peft import LoraConfig, get_peft_model, TaskType

In [8]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [9]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)


model_name = 'google/flan-t5-small'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the base model with get_peft_model() to get a trainable PeftModel
peft_model = get_peft_model(original_model, lora_config)

In [10]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    # JS: const prompt = dialogueList.map(dialogue => startPrompt + dialogue + endPrompt);
    example['prompt'] = prompt
    # Why padding is Max Length?
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    # Element's 'input_ids' and 'label' keys have the tokenised prompt and response.
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# print(tokenized_datasets['train'][200])
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary', 'prompt'])
print(tokenized_datasets['train'][200])


{'input_ids': [12198, 1635, 1737, 8, 826, 3634, 5, 1713, 345, 13515, 536, 4663, 10, 363, 103, 25, 241, 12, 214, 81, 140, 58, 1713, 345, 13515, 357, 4663, 10, 571, 81, 39, 2705, 3187, 44, 1900, 58, 1713, 345, 13515, 536, 4663, 10, 37, 1348, 2769, 13, 66, 82, 2996, 19, 756, 11989, 5, 1713, 345, 13515, 357, 4663, 10, 86, 84, 1426, 410, 25, 129, 8, 2030, 6784, 58, 1713, 345, 13515, 536, 4663, 10, 86, 17082, 27, 530, 3, 9, 3, 3916, 5, 1713, 345, 13515, 357, 4663, 10, 2114, 25, 1204, 136, 19397, 58, 1713, 345, 13515, 536, 4663, 10, 2163, 6, 27, 43, 6, 11, 386, 648, 16, 792, 5, 1713, 345, 13515, 357, 4663, 10, 2114, 25, 118, 3, 9, 853, 2488, 58, 1713, 345, 13515, 536, 4663, 10, 27, 43, 118, 3, 9, 853, 26864, 7, 1208, 16, 1567, 13, 2116, 21, 192, 203, 5, 1713, 345, 13515, 357, 4663, 10, 3963, 25, 1715, 16, 136, 1886, 1087, 58, 1713, 345, 13515, 536, 4663, 10, 27, 47, 46, 30277, 7, 372, 1144, 16, 1900, 5, 1713, 345, 13515, 357, 4663, 10, 363, 2600, 33, 25, 207, 44, 58, 1713, 345, 13515, 536, 46

In [11]:
# Train like a Usual model once the LoRa 
output_dir = f'./peft-dialogue-summary-training'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=100,
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [12]:
peft_trainer.train()

Step,Training Loss


TrainOutput(global_step=1558, training_loss=2.463961208279846, metrics={'train_runtime': 536.9157, 'train_samples_per_second': 23.207, 'train_steps_per_second': 2.902, 'total_flos': 2368874804674560.0, 'train_loss': 2.463961208279846, 'epoch': 1.0})

In [13]:
peft_model_path="./peft-dialogue-summary-training/checkpoint-1500"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./peft-dialogue-summary-training/checkpoint-1500\\tokenizer_config.json',
 './peft-dialogue-summary-training/checkpoint-1500\\special_tokens_map.json',
 './peft-dialogue-summary-training/checkpoint-1500\\tokenizer.json')

In [14]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [17]:
index = 201
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']
dash_line = '-'.join('' for x in range(100))

device = "cpu"
print(device)
original_model.to(device)
peft_model.to(device)


prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)



peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

cpu
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Where to, miss?
#Person2#: Hi! Crenshaw and Hawthorne, at the Holiday Inn that is on that corner.
#Person1#: Sure thing. So, where are you flying in from?
#Person2#: From China.
#Person1#: Really? You don't look very Chinese to me, if you don't mind me saying so.
#Person2#: It's fine. I am actually from Mexico. I was in China on a business trip, visiting some local companies that manufacture bathroom products.
#Person1#: Wow sounds interesting! Excuse me if I am being a bit nosy but, how old are you?
#Person2#: Don't you know it's rude to ask a lady her age?
#Person1#: Don't get me wrong! It's just that you seem so young and already doing business overseas!
#Person2#: Well thank you! In that case, I am 26 years old, and what about yourself?
#Person1#: I am 40 years old and was born and raised here in the good old U. S o