In [2]:
%pip install -U datasets==2.17.0

%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet



In [3]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

### Load Dataset and LLM

In [4]:
# Hub identifier of the dialogue-summarization dataset used throughout the notebook.
huggingface_dataset_name = "knkarthick/dialogsum"

# Loads every split of the dataset (train / validation / test, as the output below shows).
dataset = load_dataset(huggingface_dataset_name)

# Bare expression so the notebook displays the splits, their columns and row counts.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [5]:
# Checkpoint that serves both as the zero-shot baseline and as the starting point for fine-tuning.
model_name = 'google/flan-t5-base'

# The model and its tokenizer are loaded from the same checkpoint name.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [6]:
# param: In the context of PyTorch, param usually refers to a tensor or a parameter of a model. It is an instance of torch.Tensor or torch.nn.Parameter.
# numel(): This method is a member function of torch.Tensor that returns the total number of elements in the tensor.
# numel() counts all the elements in the tensor. For example, if you have a tensor with shape (2, 3, 4), then numel() would return 2 * 3 * 4 = 24,
# because the tensor contains 24 elements in total.


def print_number_of_trainable_model_parameters(model):
  """Build a short report of how many of a model's parameters are trainable.

  Despite the name, nothing is printed here: the report is returned as a
  string and the caller decides whether to print it.

  Args:
    model: object whose ``named_parameters()`` yields ``(name, tensor)`` pairs.

  Returns:
    A multi-line string with the trainable element count, the total element
    count, and the trainable share as a percentage with two decimals.
  """
  # Collect (element count, is trainable) once per parameter tensor.
  sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
  total = sum(count for count, _ in sizes)
  trainable = sum(count for count, needs_grad in sizes if needs_grad)
  share = 100 * trainable / total
  return f"""  trainable model parameters: {trainable}
  all model parameters: {total}
  percentage of trainable model parameters: {share:.2f}%
  """

# The helper returns its report as a string instead of printing it, hence the explicit print.
print(print_number_of_trainable_model_parameters(original_model))

  trainable model parameters: 247577856
  all model parameters: 247577856
  percentage of trainable model parameters: 100.00%
  


### Test the Model with Zero Shot Inferencing

In [7]:
index = 200

# Fetch the test example once and read both fields from it.
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

# return_tensors='pt' makes the tokenizer return PyTorch tensors, which is what generate() is given.
inputs = tokenizer(prompt, return_tensors='pt')

# generate() returns a batch of token-id sequences; only one prompt was passed, so
# the first (and only) sequence is decoded. skip_special_tokens=True drops tokens
# such as padding / end-of-sequence from the decoded text.
generated_ids = original_model.generate(inputs['input_ids'], max_new_tokens=200)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Separator line of 99 dashes, reused by later cells.
dash_line = '-' * 99

sections = [
    f"INPUT PROMPT: \n{prompt}",
    f"BASELINE HUMAN SUMMARY: \n{summary}\n",
    f"MODEL GENERATION - ZERO SHOT: \n{output}",
]
for section in sections:
  print(dash_line)
  print(section)

---------------------------------------------------------------------------------------------------
INPUT PROMPT: 

Summarize the following conversation

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

### Perform Full Fine-Tuning

### Preprocess the Dialog-Summary Dataset

In [8]:
def tokenize_function(example):
  """Tokenize a batch of dialogue/summary rows into model inputs and labels.

  Written for ``Dataset.map(..., batched=True)``: ``example`` is a dict of
  columns, so ``example['dialogue']`` and ``example['summary']`` are lists.

  Args:
    example: batch dict containing the 'dialogue' and 'summary' columns.

  Returns:
    The same dict with two added columns:
      'input_ids': token ids of the prompt built around each dialogue.
      'labels': token ids of each summary, with padding positions set to -100.
  """
  start_prompt = 'Summarize the following conversation.\n\n'
  # Ends with "Summary: " so the training prompt carries the same "Summary:" cue
  # as the prompts used at inference time elsewhere in the notebook.
  end_prompt = '\n\nSummary: '
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]

  # padding='max_length': pad every sequence to the tokenizer's maximum length so
  #   all rows have the same length.
  # truncation=True: cut sequences that exceed that maximum length.
  # return_tensors="pt": return PyTorch tensors; .input_ids picks the token ids.
  example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors="pt").input_ids

  labels = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors="pt").input_ids
  # Padding positions must not contribute to the training loss: -100 is the index
  # the Hugging Face seq2seq loss ignores (see the T5 training documentation).
  labels[labels == tokenizer.pad_token_id] = -100
  example['labels'] = labels

  return example

# `dataset` holds three splits (train, validation, test). Mapping over the whole
# DatasetDict tokenizes all of them, handing rows to tokenize_function in batches.
# Afterwards only the tokenized columns are kept; the raw text and metadata
# columns are dropped.
raw_columns = ['id', 'topic', 'dialogue', 'summary']
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
# Subsample to keep the demo fast: in each split, keep only the rows whose index is a
# multiple of 100 (0, 100, 200, ...). Per the shapes printed in the next cell this leaves
# 125 / 5 / 15 rows for train / validation / test.
# NOTE(review): this rebinds `tokenized_datasets`, so running the cell a second time filters
# the already-filtered data again; re-run the tokenization cell first if it must be repeated.
tokenized_datasets = tokenized_datasets.filter(lambda example, index : index % 100 == 0, with_indices = True)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
# Report the shape of every split, then show the DatasetDict itself.
print(f"Shapes of the datasets:")
for label, split in (("Training", 'train'), ("Validation", 'validation'), ("Test", 'test')):
  print(f"{label}: {tokenized_datasets[split].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


### Fine-Tune the Model with the Preprocessed Dataset

In [11]:
# Timestamped output directory, so each run of this cell points at a new location.
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

# TrainingArguments collects the settings for training with the Hugging Face
# transformers Trainer (learning rate, number of epochs, logging, ...).
# weight_decay = 0.01: applies weight-decay regularization in the optimizer, which
# penalizes large weights to help prevent overfitting.
# logging_steps = 1: emit a training log entry after every step. Useful for monitoring,
# but noisy if training runs for many steps.
# NOTE(review): both num_train_epochs and max_steps are set. Per the TrainingArguments
# documentation a positive max_steps takes precedence, so this configuration is expected to
# stop after a single training step (a smoke test rather than a real fine-tune) — confirm
# that this is intended.
training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = 1e-5,
    num_train_epochs = 1,
    weight_decay = 0.01,
    logging_steps = 1,
    max_steps = 1
)

# NOTE(review): the Trainer receives `original_model` itself rather than a copy, so after
# training the name `original_model` presumably no longer refers to untouched pretrained
# weights — keep this in mind when it is later used as the "original" baseline.
trainer = Trainer(
    model = original_model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation']
)

In [None]:
# Run the training loop with the arguments given to the Trainer.
trainer.train()



### Evaluate the Model Qualitatively (Human Evaluation)

In [None]:
# Load the fully fine-tuned ("instruct") model from a local checkpoint directory, in bfloat16.
# NOTE(review): nothing in the visible notebook writes ./flan-dialogue-summary-checkpoint
# (the Trainer's output_dir is a timestamped ./dialogue-summary-training-* directory), so this
# directory has to be provided separately before the cell can run — confirm where it comes from.
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./flan-dialogue-summary-checkpoint", torch_dtype=torch.bfloat16)

In [None]:
# Compare the two models on a single test example against the human-written summary.
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
human_baseline_summary = test_example['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Same decoding settings for both models so the outputs are comparable.
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

# The Trainer may have placed original_model on an accelerator while instruct_model was
# loaded separately, so the ids are sent to each model's own device before generating.
original_model_outputs = original_model.generate(
    input_ids=input_ids.to(original_model.device),
    generation_config=generation_config,
)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(
    input_ids=input_ids.to(instruct_model.device),
    generation_config=generation_config,
)
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

### Evaluate The Model Quantitatively (with ROUGE Metric)

In [None]:
# Load the ROUGE metric through the `evaluate` library; used below to score generated summaries.
rouge = evaluate.load('rouge')

In [None]:
# Generate summaries for the first 10 test dialogues with both models and
# collect them next to the human-written references.
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

# Same decoding settings for both models so the scores are comparable.
generation_config = GenerationConfig(max_new_tokens = 200)

for dialogue in dialogues:
  # The prompt lines are deliberately unindented: whitespace inside the f-string is
  # part of the prompt, and this keeps it in the same form as the other prompts in
  # the notebook ("Summarize the following conversation." ... "Summary: ").
  prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

  # Tokenize the prompt for THIS dialogue; both models must receive these ids
  # (not ids left over from an earlier cell).
  input_ids = tokenizer(prompt, return_tensors = "pt").input_ids

  # Each model may live on a different device, so move the ids to the model's own device.
  original_model_outputs = original_model.generate(
      input_ids = input_ids.to(original_model.device),
      generation_config = generation_config,
  )
  original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens = True)
  original_model_summaries.append(original_model_text_output)

  instruct_model_outputs = instruct_model.generate(
      input_ids = input_ids.to(instruct_model.device),
      generation_config = generation_config,
  )
  instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens = True)
  instruct_model_summaries.append(instruct_model_text_output)

# One row per dialogue: human reference, original-model summary, instruct-model summary.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

In [None]:
# Options shared by both evaluations.
# use_aggregator=True: per the ROUGE metric card, return aggregated scores (one value per
#   ROUGE type such as rouge1 / rouge2 / rougeL, computed over all prediction/reference
#   pairs) instead of a list of per-example scores. It does not merge the different ROUGE
#   types into a single number.
# use_stemmer=True: stem words before matching, so that variants such as "run", "runs"
#   and "running" are counted as the same word.
rouge_options = dict(use_aggregator=True, use_stemmer=True)

# References are sliced to the number of predictions so both lists have equal length.
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[:len(original_model_summaries)],
    **rouge_options,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[:len(instruct_model_summaries)],
    **rouge_options,
)

for title, results in (
    ('ORIGINAL MODEL:', original_model_results),
    ('INSTRUCT MODEL:', instruct_model_results),
):
  print(title)
  print(results)

In [12]:
print("Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL")

# Element-wise difference between the two models' scores, in the order the
# result dicts list their metrics.
instruct_scores = np.array(list(instruct_model_results.values()))
original_scores = np.array(list(original_model_results.values()))
improvement = instruct_scores - original_scores

# Scores are fractions, so multiply by 100 to report percentage points.
for metric_name, delta in zip(instruct_model_results, improvement):
  print(f'{metric_name}: {delta*100:.2f}%')




### Perform Parameter Efficient Fine-Tuning (PEFT)

### Setup the PEFT/LoRA model for Fine-Tuning