# **Low Rank Adaptation and Parameter Efficient Finetuning of HuggingFace Alpaca LLMs on Text Summarisation**

### Import Relevant Dependencies

In [1]:
import os, warnings, torch, json, random, gc
from tqdm.notebook import tqdm
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    DataCollatorForSeq2Seq, 
    Trainer, 
    TrainingArguments, 
)
from datasets import load_dataset, concatenate_datasets
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    TaskType,
    PeftModel,
    PeftConfig
)
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable

warnings.filterwarnings("ignore")

os.environ["LLM_REPOSITORY"] = "chavinlo/alpaca-native"
os.environ["TOKENIZER_REPOSITORY"] = "chavinlo/alpaca-native"
os.environ["EMBEDDINGS_MODEL"] = "all-MiniLM-L12-v2"
os.environ["MAX_TOKENS"] = "4096"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["TOKENS_DATA_PATH"] = F"data/doc_summary_{os.environ['TOKENIZER_REPOSITORY'].split('/')[-1]}_tokens"
os.environ["SUMMARY_DATA_PATH"] = "data/doc_summary_pair.json"
os.makedirs(os.environ["DATASET_PATH"], exist_ok=True)
os.makedirs(os.environ["TOKENS_DATA_PATH"], exist_ok=True)

## **DATA PREPARATION**

### Split Dataset into Training, Validation and Testing Sets

In [2]:
TRAIN_SIZE = 0.8
VALIDATION_SIZE = 0.1
TEST_SIZE = 0.1

with open(os.environ["SUMMARY_DATA_PATH"]) as f:
    doc_summary_data = json.load(f)
f.close()

random.shuffle(doc_summary_data)
train_size = int(len(doc_summary_data) * TRAIN_SIZE)
val_size = int(len(doc_summary_data) * VALIDATION_SIZE)
test_size = int(len(doc_summary_data) * TEST_SIZE)

train_data = doc_summary_data[:train_size]
val_data = doc_summary_data[train_size:train_size+val_size]
test_data = doc_summary_data[train_size+val_size:]

data_list = [
    ("train", train_data),
    ("validation", val_data),
    ("test", test_data),
]

for data_tuple in data_list:
    _file_path = os.path.join(os.environ["DATASET_PATH"], f"{data_tuple[0]}.json")
    if not os.path.exists(_file_path):
        with open(_file_path, "w") as f:
            json.dump(data_tuple[1], f, indent=4)
        f.close()

del doc_summary_data, train_data, val_data, test_data, data_list, data_tuple, train_size, val_size, test_size
gc.collect()

99

### Load Dataset into DictDataset Format to be modelled by the HuggingFace LLM

In [3]:
dataset = load_dataset(path=os.environ["DATASET_PATH"])
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")
print(f"Test dataset size: {len(dataset['test'])}")
dataset["train"][random.randint(0, len(dataset["train"]))]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train dataset size: 4000
Validation dataset size: 500
Test dataset size: 500


{'summary': 'This study is trying to find the highest dose of a drug that can be tolerated by monitoring the occurrence of severe toxicities related to the drug. The maximum tolerated dose will be determined based on when these toxicities occur within 28 days of the first vaccine injection.',
 'id': 1127,
 'document': 'DL2, and 23 months and 13 days for DL3.|Number of Participants With Dose-Limiting Toxicities (DLT), Dose-limiting toxicity (DLT) will be defined as any one of the following: Any grade ≥ 3 hematologic toxicity or grade ≥ 3 non-hematologic toxicity that is possibly, probably, or definitely related to study drug, except transient (≤ 48 hour) grade 3 fatigue, local reactions, flu-like symptoms, fever, headache, and laboratory abnormalities that are not associated with organ pathology. Also any ≥ grade 2 allergic and ≥ grade 2 autoimmune reaction(s) (except endocrine-related immune toxicity) will be defined as a DLT. Any grade 3 autoimmune endocrine-related toxicity that has 

### Load Corresponding LLM Tokenizer

In [4]:
tokenizer = LlamaTokenizer.from_pretrained(
    os.environ["TOKENIZER_REPOSITORY"],
    model_max_length=int(os.environ["MAX_TOKENS"])
)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


### Use Tokenizer Object to retreive the Maximum Source (Text) and Target (Summary) Tokens in the Data

In [5]:
concatenated_dataset = concatenate_datasets(
    [dataset["train"], dataset["validation"], dataset["test"]]
)
tokenized_inputs = concatenated_dataset.map(
    lambda x: tokenizer(x["document"], truncation=True), batched=True, remove_columns=["document", "summary"])

tokenized_targets = concatenated_dataset.map(
    lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["document", "summary"])

max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]]) + 64
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]]) + 64

print(f"Max source length: {max_source_length}")
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Max source length: 550
Max target length: 317


### Tokenize Dataset and Persist Tokens to Disk Memory

In [6]:
LABEL_PAD_TOKEN_ID = -100
TRAIN_ON_INPUT = False
generate_prompt = lambda document : (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
    f"Generate a concise summary of this document:\n\n DOCUMENT: \n{document} \n\n SUMMARY:"
)
def preprocess_function(
    sample, 
    max_seq_length: int, 
    padding: str="max_length", 
    train_on_input: bool=False):
    
    _input = f"{generate_prompt(sample['document'])} {sample['summary']}"
    tokenization_result = tokenizer(
        _input,
        max_length=max_seq_length, 
        padding=False,
        truncation=True,
    )
    input_tokens = tokenization_result["input_ids"].copy()
    label_tokens = tokenization_result["input_ids"].copy()
    
    if not train_on_input:
        prompt_tokens = tokenizer(
            generate_prompt(sample['document']),
            max_length=max_seq_length, 
            padding=False,
            truncation=True,
        )["input_ids"]
        prompt_tokens_len = len(prompt_tokens)
        input_tokens = input_tokens[:prompt_tokens_len]
        label_tokens = ([LABEL_PAD_TOKEN_ID]*prompt_tokens_len) + label_tokens[prompt_tokens_len:]
        input_tokens = input_tokens + [tokenizer.pad_token_id]*(len(label_tokens)-prompt_tokens_len)
        
    else:
        input_tokens.append(tokenizer.eos_token_id)
        
    tokenization_result["input_ids"] = input_tokens
    tokenization_result["labels"] = label_tokens
    tokenization_result["attention_mask"] = [1]*len(input_tokens)
    return tokenization_result

preprocess_lambda = lambda dataset : preprocess_function(
    dataset, 
    sum([max_source_length, max_target_length]),
    train_on_input=TRAIN_ON_INPUT,
)
# batched arg must be set to False for this to work properly
tokenized_dataset = dataset.map(preprocess_lambda, batched=False, remove_columns=["document", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "train"))
tokenized_dataset["validation"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "validation"))
tokenized_dataset["test"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "test"))

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

## **MODEL PREPARATION**

### Load 8bits quantized HuggingFace LLM to Memory

In [7]:
# define model
model = LlamaForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.environ["LLM_REPOSITORY"],
    load_in_8bit=True,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Define Low Rank Adaptation Configurations Object and apply to Loaded LLM for Parameter Efficient Finetuning

In [8]:
# Define LoRA Config 
lora_config = LoraConfig(
 r=16, 
 lora_alpha=32,
 target_modules=["q_proj", "v_proj"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.CAUSAL_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,812,416 || trainable%: 0.12433438908285782


### Define Data Collator Object

In [9]:
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=LABEL_PAD_TOKEN_ID,
    pad_to_multiple_of=8,
     padding=True,
)

## **MODEL FINETUNING / TRAINING**

### Define Seq2SeqTrainer Object and Commence LoRA Finetuning

In [None]:
OUTPUT_DIR = F"lora-{os.environ['LLM_REPOSITORY'].split('/')[-1]}"
NUM_EPOCHS = 20
LEARNING_RATE = 1e-3

# Define training args
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
	auto_find_batch_size=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # to be set to True for inference

# finetune model
trainer.train()

Step,Training Loss
500,4.8462
1000,4.4306
1500,4.2045
2000,3.9932
2500,3.7804
3000,3.5668
3500,3.3361
4000,3.0951
4500,2.8409
5000,2.5784


### Persist LoRA Model Weights to Disk Memory

In [15]:
# Save our LoRA model & tokenizer results
PEFT_MODEL_ID=f"{os.environ['LLM_REPOSITORY'].split('/')[-1]}_finetuned_results"
trainer.model.save_pretrained(PEFT_MODEL_ID)
tokenizer.save_pretrained(PEFT_MODEL_ID)

# # delete model and tokenizer from memory
del model, tokenizer
gc.collect()
torch.cuda.empty_cache()

## **MODEL EVALUATION**

### Load LoRA Weights from Disk to Perform Inference on Test Dataset

In [16]:
# Load peft config for pre-trained checkpoint etc. 
config = PeftConfig.from_pretrained(PEFT_MODEL_ID)

# load base LLM model and tokenizer
model = LlamaForCausalLM.from_pretrained(config.base_model_name_or_path,  load_in_8bit=True,  device_map="auto")
tokenizer = LlamaTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, PEFT_MODEL_ID, device_map="auto", torch_dtype=torch.float32)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Generating Summaries from Documents in Test Data

In [17]:
predicted_summaries = []
n_docs = 5
input_documents = dataset["test"]["document"][:n_docs]
target_summaries = dataset["test"]["summary"][:n_docs]

model.eval()
for i, document in enumerate(input_documents):
    prompt = generate_prompt(document)
    prompt_tokens = tokenizer(
        prompt,
        return_tensors="pt"
    )
    summary_tokens = model.generate(
        **prompt_tokens,
        return_dict_in_generate=True,
        temperature=0.1,
        top_p=0.15,
        top_k=0,
        repetition_penalty=1.1,
        max_new_tokens=256,
    )
    summary = tokenizer.decode(summary_tokens.sequences[0], skip_special_tokens=True)
    print(f"Document: {document}\n")
    print(f"SUMARY: {summary.replace(prompt, '').replace('</s>', '')}\n\n")
    predicted_summaries.append(summary)

Document: States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, Georgia, 30265, Unit

## **PERFORMANCE MEASUREMENT**

### Compare Generated Summaries to Target Summaries with the Rouge Score and the Cosine Similarity Metric

In [18]:
rouge = Rouge()
embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"])
embeddings_model.to(os.environ["DEVICE"])
_zipped_data = zip(input_documents, predicted_summaries, target_summaries)

for i, (document, predicted_summary, target_summary) in enumerate(_zipped_data):
    prompt = generate_prompt(document)
    pred_embeddings, target_embeddings = (
        embeddings_model.encode(predicted_summary.replace(prompt, '').replace('</s>', '')).reshape(1, -1),
        embeddings_model.encode(target_summary.replace(prompt, '').replace('</s>', '')).reshape(1, -1)
    )
    cos_similarity = cosine_similarity(target_embeddings, pred_embeddings)
    rouge_scores = rouge.get_scores(predicted_summary, target_summary)
    print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
    print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n\n")


Cosine similarity for summary 1: 0.7862063 

Rouge scores for summary 1: {'rouge-1': {'r': 0.72, 'p': 0.12, 'f': 0.20571428326530614}, 'rouge-2': {'r': 0.35714285714285715, 'p': 0.047619047619047616, 'f': 0.08403361136925362}, 'rouge-l': {'r': 0.56, 'p': 0.09333333333333334, 'f': 0.15999999755102043}} 


Cosine similarity for summary 2: 0.67372495 

Rouge scores for summary 2: {'rouge-1': {'r': 0.5238095238095238, 'p': 0.09482758620689655, 'f': 0.16058393901006981}, 'rouge-2': {'r': 0.22727272727272727, 'p': 0.02824858757062147, 'f': 0.05025125431479011}, 'rouge-l': {'r': 0.5238095238095238, 'p': 0.09482758620689655, 'f': 0.16058393901006981}} 


Cosine similarity for summary 3: 0.747568 

Rouge scores for summary 3: {'rouge-1': {'r': 0.6, 'p': 0.06060606060606061, 'f': 0.11009174145273967}, 'rouge-2': {'r': 0.05263157894736842, 'p': 0.0034129692832764505, 'f': 0.00641025526647785}, 'rouge-l': {'r': 0.5, 'p': 0.050505050505050504, 'f': 0.09174311759952868}} 


Cosine similarity for sum