# Fine Tune for LLMs

In [1]:
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.16.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## FLAN-T5 model

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

### Evaluate the Original Model
#### Load model

In [5]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
# define a function to count the trainable and all parameters in a pytorch model
def print_number_of_trainable_model_parameters(model):
    trainable = sum([param.numel() for _, param in model.named_parameters() if param.requires_grad])
    all = sum([param.numel() for _, param in model.named_parameters()])
    return f"trainable model parameters: {trainable}\nall model parameters: {all}\npercentage of trainable model parameters: {100 * trainable / all:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


#### Load dataset

In [4]:
dataset = load_dataset('knkarthick/dialogsum')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

#### Test the Model with Zero Shot Inferencing

Test the model with the zero shot inferencing. The model extracted some information from the dialogue, but the summary obtained was not complete and comparable to the baseline human base summary 

In [12]:
# select a dialogue with index = 200
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

# construct the prompt
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# tokenize and input the prompt and output the summary
inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

# format the output
dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

### Full Fine Tune of Flan-T5 Model by Human Labelled Summaries
#### Preparation for Full Fine Tuning
  * construct the prompts including instruction as the start prompt, dialogue and end prompt
  * tokenize the prompts as 'input_ids' column and the labelled summary as 'labels' column in the dataset
  * The processing is defined and executed in `tokenize_function` that will process each example in the training, validation and test datasets
  * after processing all examples, filter out id, topic, dialogue and summary columns. Only keep `input_ids` and labels columns for fine tuning

In [19]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

# The tokenize_function traverse the training, validation and test datasets in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [20]:
# only select a subset of the datasets for training and validation and test to reduce the time and resources for training
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


##### Set Up TrainingArguments and Trainer to Fine Tune the Model

In [29]:

output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

trainer.train()

#### Evaluate the Model Qualitatively (Human Evaluation)

To evaluate the fine tuned model, select a dialogue, its predicted summary and the corresponding human baseline summary and compare the results. Compared to original_model, the summary of the fully trained instruct_model is much closer to the human baseline summary

* For comparison, first download the fully fine tuned T5 model using the entire traing dataset from s3 
```shell
aws s3 cp --recursive s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/ ./flan-dialogue-summary-checkpoint/
```
* Now load the load model

In [15]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./flan-dialogue-summary-checkpoint", torch_dtype=torch.bfloat16)

* Select the dialogue with index of 200 to review the generated summary

In [21]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1#: I'm thinking of upgrading my computer.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.


#### Evaluate the Model Quantitatively (with ROUGE Metric)

We can evaluate the generated summaries with the human baseline summaries by the [ROUGE metric](https://en.wikipedia.org/wiki/ROUGE_(metric)) by comparing the words from the two summriess. Results showed that an increase in the ROOUGE metric in summaries after fine-tuning.

In the next cell, we define a ModelEval class to compare two models by calcuating the ROGUE metrics of the model generated summaries on input dialogues based on the corresponding human generated baseline summaries as the reference.

In [62]:
from typing import Dict, List
class ModelEval:
    rouge = evaluate.load('rouge')
    def __init__(self, model_1_name, model_1, model_2_name, model_2, tokenizer, dialogues, ref_summaries):

        # make sure the dialogues and referenc summaries have the same lengths
        assert(len(dialogues) == len(ref_summaries))        

        # define the prompt template to embed the dialogue and generate prompt to the model
        self.prompt_temp = """
            Summarize the following conversation.
        
            {diag}
        
            Summary: """
        
        self.model_1_name, self.model_1 = model_1_name, model_1
        self.model_2_name, self.model_2 = model_2_name, model_2
        self.tokenizer = tokenizer
        self.ref_summaries = ref_summaries
        
        # generate prompts from dialogues, and the model generated summaries from each prompts for the two models in ocmparison
        self.dialogues = [self.prompt_temp.format(diag=diag) for diag in dialogues]
        self.model_1_summaries = [self.__class__.get_prediction(self.model_1, self.tokenizer, diag) for diag in self.dialogues]
        self.model_2_summaries = [self.__class__.get_prediction(self.model_2, self.tokenizer, diag) for diag in self.dialogues]      
       

    @classmethod
    def get_prediction(cls, model, tokenizer, dialogue) -> List[str]:
        input_ids = tokenizer(dialogue, return_tensors="pt").input_ids
        output_ids = model.generate(input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))[0]
        return tokenizer.decode(output_ids, skip_special_tokens=True)     

    @classmethod
    def eval_rogue(cls, pred_summaries, ref_summaries) -> float:
        return cls.rouge.compute(
            predictions=pred_summaries,
            references=ref_summaries,
            use_aggregator=True,
            use_stemmer=True,
        )
        
    def compare_models(self) -> Dict[str, float]:
        """
        calculate the ROGUE metrics for the two models in comparison and print out improvements in ROGUE metrics        
        """
        # calculate the ROGUE scores for the two models
        model_1_score = self.__class__.eval_rogue(self.model_1_summaries, self.ref_summaries)
        model_2_score = self.__class__.eval_rogue(self.model_2_summaries, self.ref_summaries)

        # calculate the improvement in ROGUE metrics and print out improvments
        improvement = (np.array(list(model_2_score.values())) - np.array(list(model_1_score.values())))
        print(f"the improvement of {self.model_2_name} compared to {self.model_1_name} are the following")
        for key, value in zip(model_1_score.keys(), improvement):
            print(f'{key}: {value*100:.2f}%')
        return {self.model_1_name: model_1_score, self.model_2_name: model_2_score}
    
    

In [59]:
# compare the two models
dialogues = dataset['test'][0:3]['dialogue']
ref_summaries = dataset['test'][0:3]['summary']
model_1_name, model_1 = "original_model", original_model
model_2_name, model_2 = "instruct_model", instruct_model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

meval = ModelEval(model_1_name, model_1, model_2_name, model_2, tokenizer, [dialogues[0]], [ref_summaries[0]])
print(meval.compare_models())


the improvement of instruct_model compared to original_model are the following
rouge1: 21.11%
rouge2: 15.91%
rougeL: 10.00%
rougeLsum: 10.00%
{'original_model': {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}, 'instruct_model': {'rouge1': 0.37777777777777777, 'rouge2': 0.1590909090909091, 'rougeL': 0.26666666666666666, 'rougeLsum': 0.26666666666666666}}


### Perform Parameter Efficient Fine-Tuning (PEFT)

**Parameter Efficient Fine-Tuning (PEFT)** fine-tuning is a form of instruction fine-tuning can usually obtain comparable results to the full fine-tuning, but with much less paramters to tune, which is much more efficient than full fine-tuning.

Here, we use **Low-Rank Adaptation (LoRA)** as a PEFT technology to fine tuen the FLAN-T5 model. LoRA is usually used to fine-tune a model for a specific task. Note that LoRA doesn't change the original LLM model. When serving the inference requests, we combine the original LLM model with the newly-trained “LoRA adapter” as shown in the example. The LoRA adapter is only a several % of the original LLM size (MBs vs GBs).

#### Setup the PEFT/LoRA model for Fine-Tuning
* define the LoraConfig for Flan-T5. For config paramters of commonly used llms, refer to [this link for reference](https://github.com/sematic-ai/sematic/blob/main/sematic/examples/summarization_finetune/__main__.py)
* Note the rank (`r`) defines the rank/dimension of the adapter to be trained.

In [22]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [31]:
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


#### Train Model Using PEFT/LoRA

In [41]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

peft_trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

#### Use PEFT (LoRA) Fine Tuned Model with Base Model for Inference
* The key point is that the base model weights are frozen during LoRA fine tune
* The code demonstrate how to combine base and PEFT/LoRA fine tuned models to summarize dialogues
* basically, you load both models using `PeftModel.from_pretrained()` function, and define the data type
* the `torch_dtype` is set at torch.bfloat16, and `is_trainable` is False, since we will use the model for prediction, not for training

In [43]:
# use peft and the base model to summarize dialogues
from peft import PeftModel, PeftConfig

# define the base model to fine tune
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base, 
                                       './peft-dialogue-summary-checkpoint-from-s3/', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [44]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable model parameters: 0.00%


#### Evaluate the Model Qualitatively (Human Evaluation)
* the PEFT/LoRA fine tuned model generated more relavant summary compared to original model
* Now lets evaluate the model by ROGUE metrics

In [45]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1#: I'm thinking of upgrading my computer.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.
---------------------------------------------------------------------------------------------------
PEFT MODEL: #Person1# recommends adding a painting program to #Person2#'s software and upgrading hardware. #Person2# also wants to upgrade the hardware because it's outdated now.


### Evaluate the Model Quantitatively (with ROUGE Metric)
* Perform inferences for the sample of the test dataset (only 10 dialogues and summaries to save time)
* Here we use our ModeEval class object for model evaluation, as shown in Full Fine Tune section

In [64]:
def get_prediction_peft(dialogue, tokenizer):
    prompt = f"""
        Summarize the following conversation.

        {dialogue}

        Summary: """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    return peft_model_text_output    

'#Person1# asks Ms. Dawson to take a dictation to all employees by this afternoon. Ms. Dawson tells #Person1# that all office communications are restricted to email correspondence and official memos. #Person1# wants to change the communication methods and asks Ms. Dawson to get the memo typed up and distributed to all employees before 4 pm.'

In [73]:
dialogues = dataset['test'][0:5]['dialogue']
ref_summaries = dataset['test'][0:5]['summary']

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
peft_pred_summaries = [get_prediction_peft(dia, tokenizer) for dia in dialogues]
origin_pred_summaries = [ModelEval.get_prediction(original_model, tokenizer, dia) for dia in dialogues]

In [74]:
original_rogue = ModelEval.eval_rogue(origin_pred_summaries, ref_summaries)
peft_rogue = ModelEval.eval_rogue(peft_pred_summaries, ref_summaries)

In [75]:
improvement = (np.array(list(peft_rogue.values())) - np.array(list(original_rogue.values())))
print(f"the improvement of peft_model compared to original_model are the following")
for key, value in zip(peft_rogue.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')
print("rogue for peft")
print(peft_rogue)
print("rogue for original")
print(original_rogue)

the improvement of peft_model compared to original_model are the following
rouge1: 11.33%
rouge2: 2.65%
rougeL: 8.17%
rougeLsum: 8.38%
rogue for peft
{'rouge1': 0.34193513803269904, 'rouge2': 0.1022864276796861, 'rougeL': 0.27097500453912726, 'rougeLsum': 0.2728511771470072}
rogue for original
{'rouge1': 0.22858627858627859, 'rouge2': 0.0758237689744539, 'rougeL': 0.18922567498726442, 'rougeLsum': 0.18905242481401419}
