# **Low Rank Adaptation and Parameter Efficient Finetuning of HuggingFace Flan-T5 LLMs on Text Summarisation**

### Import Relevant Dependencies

In [1]:
import os, warnings, torch, json, random, gc
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments, 
    pipeline
)
from datasets import load_dataset, concatenate_datasets
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    TaskType,
    PeftModel,
    PeftConfig
)
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable

warnings.filterwarnings("ignore")

os.environ["LLM_REPOSITORY"] = "google/flan-t5-small"
os.environ["TOKENIZER_REPOSITORY"] = "google/flan-t5-small"
os.environ["EMBEDDINGS_MODEL"] = "all-MiniLM-L12-v2"
os.environ["MAX_TOKENS"] = "4096"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["TOKENS_DATA_PATH"] = F"data/doc_summary_{os.environ['TOKENIZER_REPOSITORY'].split('/')[-1]}_tokens"
os.environ["SUMMARY_DATA_PATH"] = "data/doc_summary_pair.json"
os.makedirs(os.environ["DATASET_PATH"], exist_ok=True)
os.makedirs(os.environ["TOKENS_DATA_PATH"], exist_ok=True)

## **DATA PREPARATION**

### Split Dataset into Training, Validation and Testing Sets

In [2]:
TRAIN_SIZE = 0.8
VALIDATION_SIZE = 0.1
TEST_SIZE = 0.1

with open(os.environ["SUMMARY_DATA_PATH"]) as f:
    doc_summary_data = json.load(f)
f.close()

random.shuffle(doc_summary_data)
train_size = int(len(doc_summary_data) * TRAIN_SIZE)
val_size = int(len(doc_summary_data) * VALIDATION_SIZE)
test_size = int(len(doc_summary_data) * TEST_SIZE)

train_data = doc_summary_data[:train_size]
val_data = doc_summary_data[train_size:train_size+val_size]
test_data = doc_summary_data[train_size+val_size:]

data_list = [
    ("train", train_data),
    ("validation", val_data),
    ("test", test_data),
]

for data_tuple in data_list:
    with open(os.path.join(os.environ["DATASET_PATH"], f"{data_tuple[0]}.json"), "w") as f:
        json.dump(data_tuple[1], f, indent=4)
    f.close()

del doc_summary_data, train_data, val_data, test_data, data_list, data_tuple, train_size, val_size, test_size
gc.collect()

119

### Load Dataset into DictDataset Format to be modelled by the HuggingFace LLM

In [3]:
dataset = load_dataset(path=os.environ["DATASET_PATH"])
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")
print(f"Test dataset size: {len(dataset['test'])}")
dataset["train"][random.randint(0, len(dataset["train"]))]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train dataset size: 4000
Validation dataset size: 500
Test dataset size: 500


{'document': 'NCT Number: NCT03317405\nStudy Title: Phase I Trial of Endoxifen Gel Versus Placebo in Women Undergoing Breast Surgery\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03317405\nAcronym: unknown\nStudy Status: ACTIVE_NOT_RECRUITING\nBrief Summary: This is a randomized, double-blind, Phase I trial with dose escalation from 10 mg endoxifen (ENX) gel daily (5mg per breast) to 20 mg daily (10mg per breast) in women undergoing mastectomy. Endoxifen hydrochloride may treat or reduce the risk of breast cancer.\nStudy Results: YES\nConditions: Breast Ductal Carcinoma In Situ|Breast Lobular Carcinoma In Situ|Stage 0 Breast Cancer AJCC v6 and v7|Stage I Breast Cancer AJCC v7|Stage IA Breast Cancer AJCC v7|Stage IB Breast Cancer AJCC v7|Stage II Breast Cancer AJCC v6 and v7|Stage IIA Breast Cancer AJCC v6 and v7|Stage IIB Breast Cancer AJCC v6 and v7|Stage III Breast Cancer AJCC v7|Stage IIIA Breast Cancer AJCC v7|Stage IIIB Breast Cancer AJCC v7|Stage IIIC Breast Cancer AJCC v7

### Load Corresponding LLM Tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    os.environ["TOKENIZER_REPOSITORY"],
    model_max_length=int(os.environ["MAX_TOKENS"])
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

### Use Tokenizer Object to retreive the Maximum Source (Text) and Target (Summary) Tokens in the Data

In [5]:
concatenated_dataset = concatenate_datasets(
    [dataset["train"], dataset["validation"], dataset["test"]]
)
tokenized_inputs = concatenated_dataset.map(
    lambda x: tokenizer(x["document"], truncation=True), batched=True, remove_columns=["document", "summary"])

tokenized_targets = concatenated_dataset.map(
    lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["document", "summary"])

max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])

print(f"Max source length: {max_source_length}")
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Max source length: 428
Max target length: 220


### Tokenize Dataset and Persist Tokens to Disk Memory

In [6]:
def preprocess_function(sample, max_source_length: int, max_target_length: int, padding: str="max_length"):
    inputs = [f"summarize this document: {item}"  for item in sample["document"]]

    model_inputs = tokenizer(
        inputs, 
        max_length=max_source_length, 
        padding=padding, 
        truncation=True,
    )
    labels = tokenizer(
        text_target=sample["summary"], 
        max_length=max_target_length,
        padding=padding, 
        truncation=True,
    )

    # replace pad tokens with -100, this is because pad tokens hold no semantic meaning, and thus should not 
    # contribute to the loss value, -100 is choosen because it is unlikely be be an ID to any given token in 
    # tokenizer
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

preprocess_lambda = lambda dataset : preprocess_function(dataset, max_source_length, max_target_length)
tokenized_dataset = dataset.map(preprocess_lambda, batched=True, remove_columns=["document", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "train"))
tokenized_dataset["validation"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "validation"))
tokenized_dataset["test"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "test"))

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

## **MODEL PREPARATION**

### Load 8bits quantized HuggingFace LLM to Memory

In [7]:
# define model
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=os.environ["LLM_REPOSITORY"],
    load_in_8bit=True,
    device_map="auto"
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Define Low Rank Adaptation Configurations Object and apply to Loaded LLM for Parameter Efficient Finetuning

In [8]:
# Define LoRA Config 
lora_config = LoraConfig(
 r=16, 
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862001038515747


### Define Data Collator Object

In [9]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

## **MODEL FINETUNING / TRAINING**

### Define Seq2SeqTrainer Object and Commence LoRA Finetuning

In [10]:
OUTPUT_DIR = F"lora-{os.environ['LLM_REPOSITORY'].split('/')[-1]}"
NUM_EPOCHS = 20
LEARNING_RATE = 1e-3

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
	auto_find_batch_size=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # to be set to True for inference

# finetune model
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.7775
1000,1.5219
1500,1.4304
2000,1.3659
2500,1.3202
3000,1.2866
3500,1.2525
4000,1.2248
4500,1.2013
5000,1.1786


TrainOutput(global_step=10000, training_loss=1.2296947509765626, metrics={'train_runtime': 7760.7051, 'train_samples_per_second': 10.308, 'train_steps_per_second': 1.289, 'total_flos': 1.26902992896e+16, 'train_loss': 1.2296947509765626, 'epoch': 20.0})

### Persist LoRA Model Weights to Disk Memory

In [11]:
# Save our LoRA model & tokenizer results
PEFT_MODEL_ID=f"{os.environ['LLM_REPOSITORY'].split('/')[-1]}_finetuned_results"
trainer.model.save_pretrained(PEFT_MODEL_ID)
tokenizer.save_pretrained(PEFT_MODEL_ID)

('flan-t5-small_finetuned_results/tokenizer_config.json',
 'flan-t5-small_finetuned_results/special_tokens_map.json',
 'flan-t5-small_finetuned_results/spiece.model',
 'flan-t5-small_finetuned_results/added_tokens.json',
 'flan-t5-small_finetuned_results/tokenizer.json')

## **MODEL EVALUATION**

### Load LoRA Weights from Disk to Perform Inference on Test Dataset

In [12]:
# Load peft config for pre-trained checkpoint etc. 
config = PeftConfig.from_pretrained(PEFT_MODEL_ID)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,  load_in_8bit=True,  device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, PEFT_MODEL_ID, device_map="auto")

### Use Loaded Model and Tokenizer to Instantiate a Langchain HuggingFacePipeline object

In [13]:
# switch model to eval mode
model.eval()

# define model pipeline
hgf_pipeline = pipeline(
    task="text2text-generation", 
    model=model, 
    tokenizer=tokenizer,
    temperature=0.1, 
    max_length=int(os.environ["MAX_TOKENS"]),
    top_p=0.15,
    top_k=0,
    repetition_penalty=1.1,
)

llm = HuggingFacePipeline(pipeline=hgf_pipeline)

The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


### Generating Summaries from Documents in Test Data

In [14]:
summary_chain = load_summarize_chain(llm, chain_type="map_reduce")

# summarise first 5 documents in the testing data
predicted_summaries = []
n_docs = 5
for i, document in enumerate(dataset["test"]["document"][:n_docs]):
    document = Document(page_content=document)
    summary = summary_chain.run([document])
    print(f"Document: {document}\n")
    print(f"SUMARY: {summary}\n\n")
    predicted_summaries.append(summary)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Document: page_content='NCT Number: NCT03049449\nStudy Title: T Cells Expressing a Fully-Human Anti-CD30 Chimeric Antigen Receptor for Treating CD30-Expressing Lymphomas\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03049449\nAcronym: unknown\nStudy Status: COMPLETED\nBrief Summary: Background:' metadata={}

SUMARY: A study has been conducted to determine the effectiveness of a fully-human anti-CD30 chimeric antigen receptor for treating CD30-expressing lymphomas.


Document: page_content='Interventions: DRUG: Ibrutinib|OTHER: Laboratory Biomarker Analysis\nPrimary Outcome Measures: Incidence of relapsed leukemia, Incidence of relapsed leukemia defined as \\> 5% leukemic blasts detected in bone marrow or peripheral blood. Participants will also be considered to have relapsed leukemia if they receive any active treatment for progressive leukemia after allogeneic HCT, even if they have \\< 5% leukemic blasts. Withdrawal of immunosuppression alone is not considered an active treatm

## **PERFORMANCE MEASUREMENT**

### Compare Generated Summaries to Target Summaries with the Rouge Score and the Cosine Similarity Metric

In [15]:
rouge = Rouge()
embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"])
embeddings_model.to(os.environ["DEVICE"])
target_summaries = dataset["test"]["summary"][:n_docs]

for i, (predicted_summary, target_summary) in enumerate(zip(predicted_summaries, target_summaries)):
    pred_embeddings, target_embeddings = (
        embeddings_model.encode(predicted_summary).reshape(1, -1),
        embeddings_model.encode(target_summary).reshape(1, -1)
    )
    cos_similarity = cosine_similarity(target_embeddings, pred_embeddings)
    rouge_scores = rouge.get_scores(predicted_summary, target_summary)
    print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
    print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n\n")


Downloading (…)5dded/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d81d5dded/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)81d5dded/config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ded/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5dded/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading (…)dded/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)4d81d5dded/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1d5dded/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Cosine similarity for summary 1: 0.6672851 

Rouge scores for summary 1: {'rouge-1': {'r': 0.4642857142857143, 'p': 0.65, 'f': 0.5416666618055557}, 'rouge-2': {'r': 0.21428571428571427, 'p': 0.3157894736842105, 'f': 0.2553191441195111}, 'rouge-l': {'r': 0.4642857142857143, 'p': 0.65, 'f': 0.5416666618055557}} 


Cosine similarity for summary 2: 0.735101 

Rouge scores for summary 2: {'rouge-1': {'r': 0.3870967741935484, 'p': 0.46153846153846156, 'f': 0.42105262661742077}, 'rouge-2': {'r': 0.14285714285714285, 'p': 0.17857142857142858, 'f': 0.1587301537918873}, 'rouge-l': {'r': 0.3225806451612903, 'p': 0.38461538461538464, 'f': 0.3508771880209296}} 


Cosine similarity for summary 3: 0.7573644 

Rouge scores for summary 3: {'rouge-1': {'r': 0.4444444444444444, 'p': 0.75, 'f': 0.5581395302109249}, 'rouge-2': {'r': 0.1875, 'p': 0.375, 'f': 0.2499999955555556}, 'rouge-l': {'r': 0.4074074074074074, 'p': 0.6875, 'f': 0.5116279023039481}} 


Cosine similarity for summary 4: 0.8874081 

Rouge 