In [None]:
# Dataset library, pinned to an exact version; -U lets pip replace a preinstalled copy.
%pip install -U datasets==2.17.0

# Upgrade pip itself before installing the remaining pinned packages.
%pip install --upgrade pip
# PyTorch and torchdata at pinned versions; --quiet trims the installer output.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# Hugging Face modelling, evaluation (ROUGE) and parameter-efficient fine-tuning
# packages, all pinned so the environment is reproducible.
%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [5]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

### Load Dataset and LLM

In [6]:
# Hugging Face Hub identifier of the dialogue-summarization dataset used throughout.
huggingface_dataset_name = "knkarthick/dialogsum"

# Loads every split of the dataset; the result is a DatasetDict
# (train / validation / test, as shown in the cell output).
dataset = load_dataset(huggingface_dataset_name)

# Bare expression so the notebook displays the splits, their columns and row counts.
dataset

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
# Hub identifier of the pretrained checkpoint; the model and tokenizer must come
# from the same checkpoint so that token ids match.
model_name = 'google/flan-t5-base'

# Pretrained sequence-to-sequence model, kept under the name `original_model`,
# and its matching tokenizer.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [28]:
# param: In the context of PyTorch, param usually refers to a tensor or a parameter of a model. It is an instance of torch.Tensor or torch.nn.Parameter.
# numel(): This method is a member function of torch.Tensor that returns the total number of elements in the tensor.
# numel() counts all the elements in the tensor. For example, if you have a tensor with shape (2, 3, 4), then numel() would return 2 * 3 * 4 = 24,
# because the tensor contains 24 elements in total.


def print_number_of_trainable_model_parameters(model):
  """Summarize how many of a model's parameters are trainable.

  Despite its name, this function prints nothing: it returns the summary as a
  string and leaves displaying it to the caller.

  Args:
    model: Any object exposing ``named_parameters()`` that yields
      ``(name, parameter)`` pairs, where each parameter provides ``numel()``
      and a ``requires_grad`` flag (e.g. a ``torch.nn.Module``).

  Returns:
    A multi-line string reporting the trainable parameter count, the total
    parameter count, and the trainable share as a percentage with two
    decimals. A model without any parameters reports ``0.00%``.
  """
  trainable_model_params = 0
  all_model_params = 0
  for _, param in model.named_parameters():
    element_count = param.numel()
    all_model_params += element_count
    if param.requires_grad:
      trainable_model_params += element_count

  # Guard the division: a model with no parameters would otherwise raise
  # ZeroDivisionError instead of producing a summary.
  if all_model_params:
    trainable_percentage = 100 * trainable_model_params / all_model_params
  else:
    trainable_percentage = 0.0

  return (
      f"  trainable model parameters: {trainable_model_params}\n"
      f"  all model parameters: {all_model_params}\n"
      f"  percentage of trainable model parameters: {trainable_percentage:.2f}%\n"
      "  "
  )

# The helper returns its summary as a string, so print() is what actually displays it.
print(print_number_of_trainable_model_parameters(original_model))

  trainable model parameters: 247577856
  all model parameters: 247577856
  percentage of trainable model parameters: 100.00%
  


### Test the Model with Zero-Shot Inference

In [37]:
# Probe the un-tuned model on a single example from the test split.
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

# Instruction-style prompt: the task description, the dialogue, then a "Summary:" cue.
prompt = f"\nSummarize the following conversation\n\n{dialogue}\n\nSummary:\n"

# Encode the prompt as PyTorch tensors ('pt'), the format passed to generate().
inputs = tokenizer(prompt, return_tensors='pt')

# generate() returns a batch of token-id sequences; only one prompt was supplied,
# so the first (and only) row is the one decoded below.
generated_ids = original_model.generate(inputs['input_ids'], max_new_tokens=200)

# skip_special_tokens=True drops padding / end-of-sequence markers from the text.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Visual separator: a run of 99 dashes.
dash_line = '-' * 99
print(dash_line)
print(f"INPUT PROMPT: \n{prompt}")
print(dash_line)
print(f"BASELINE HUMAN SUMMARY: \n{summary}\n")
print(dash_line)
print(f"MODEL GENERATION - ZERO SHOT: \n{output}")

---------------------------------------------------------------------------------------------------
INPUT PROMPT: 

Summarize the following conversation

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

### Perform Full Fine-Tuning

### Preprocess the Dialog-Summary Dataset

In [38]:
def tokenize_function(example):
  """Tokenize a batch of dialogue/summary pairs for seq2seq fine-tuning.

  Intended for ``dataset.map(..., batched=True)``: ``example['dialogue']`` is
  iterated as a list of strings and ``example['summary']`` is tokenized as a
  batch. Each dialogue is wrapped in an instruction prompt before tokenizing.
  Relies on the notebook-level ``tokenizer``.

  Args:
    example: Batch mapping with 'dialogue' and 'summary' entries.

  Returns:
    The same mapping with two entries added: 'input_ids' (token ids of the
    prompts) and 'labels' (token ids of the reference summaries).
  """
  start_prompt = 'Summarize the following conversation.\n\n'
  # The cue previously read '\n\nSummary' (no colon), so fine-tuning prompts did
  # not end with the same "Summary:" cue that the zero-shot prompt uses.
  end_prompt = '\n\nSummary: '
  prompts = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]

  # padding='max_length' pads every sequence to the tokenizer's maximum length so
  # all rows have the same size; truncation=True cuts sequences that exceed it;
  # return_tensors='pt' returns PyTorch tensors; .input_ids extracts the token ids.
  example['input_ids'] = tokenizer(
      prompts, padding = 'max_length', truncation = True, return_tensors = "pt"
  ).input_ids
  # NOTE(review): labels are padded with the tokenizer's pad id rather than -100,
  # so padded positions presumably contribute to the training loss - confirm
  # whether they should be masked before training.
  example['labels'] = tokenizer(
      example['summary'], padding = 'max_length', truncation = True, return_tensors = "pt"
  ).input_ids

  return example

# `dataset` holds three splits (train, validation, test); map() applies
# tokenize_function to each of them, one batch of rows at a time.
# The raw text and metadata columns are then dropped, leaving only the
# tokenized 'input_ids' and 'labels'.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
).remove_columns(['id', 'topic', 'dialogue', 'summary'])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): filter() is called without a predicate. Per the `datasets`
# documentation the default predicate keeps every row, so this line appears to
# be a no-op placeholder - presumably a subsampling predicate was intended;
# confirm and supply it.
tokenized_datasets = tokenized_datasets.filter()