## **Installing the required libraries**

In [2]:
!pip install transformers datasets evaluate py7zr rouge_score torch accelerate -U

Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl (547 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting py7zr
  Downloading py7zr-0.21.1-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.4 MB/s[0m eta [3

In [3]:
from transformers import T5Tokenizer
from datasets import load_dataset

## **Loading the samsum dataset**

In [4]:
dataset = load_dataset('samsum')  # Loading the dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

The repository for samsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/samsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

## **Dataset Structure**

In [5]:
print("Structure of the dataset is:")
print(dataset)

Structure of the dataset is:
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})


## **showing the dailogue and it's summary**

In [6]:
print('Dialogue:', dataset['train']['dialogue'][0])
print('_____________________________________________________________________\n')
print('Summary:', dataset['train']['summary'][0])

Dialogue: Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
_____________________________________________________________________

Summary: Amanda baked cookies and will bring Jerry some tomorrow.


## **t5-small Tokenizer**

In [7]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')  # Loading the t5-small from the T5Tokenizer

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## **pre-processing**

In [9]:
# pre-processing function creation
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize summaries
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=150, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)   # mapping the samsum dataset with the created preprocessing function

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

## **Loading the Model**




In [11]:
from transformers import T5ForConditionalGeneration

# Loading t5-small model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
def summarize_text(text, model, tokenizer, max_length=150, min_length=40, num_beams=4):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=num_beams, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [13]:
text = '''Siva lived in the small coastal village of Mangala Puram, where the sea was both a friend and a foe. The village thrived on fishing, and every family had at least one member who ventured into the deep waters daily. Siva was different. He loved the sea but was more fascinated by the mysteries it held beneath its surface.

From a young age, Siva spent hours watching the waves crash against the rocks, dreaming of the day he could dive into the depths. His father, a seasoned fisherman, wanted Siva to follow in his footsteps, but Siva's heart was set on becoming a marine biologist. He was particularly intrigued by the stories of the sunken ship, the "Golden Pearl," which lay somewhere off the coast of Mangala Puram.

One summer, Siva's determination paid off. He received a scholarship to study marine biology at a prestigious university. He worked hard, absorbing every bit of knowledge he could about marine ecosystems, underwater archaeology, and diving techniques. After graduation, he returned to Mangala Puram, eager to explore the secrets beneath the waves.

Siva assembled a team of local divers, including his childhood friend, Ravi. Together, they began their quest to find the "Golden Pearl." They mapped the sea floor, studied old maritime records, and dived at every possible location. Months turned into years, but Siva's resolve never wavered.

One day, after an intense storm, Siva noticed something unusual on the sea floor during a dive. It was a large, encrusted object that looked like part of a ship's hull. With his team's help, Siva carefully excavated the site. To their amazement, they found the remnants of the "Golden Pearl," along with a treasure trove of artifacts.

The discovery brought fame and fortune to Mangala Puram. Siva's find became a significant archaeological site, attracting tourists and researchers from around the world. More importantly, it validated Siva's dreams and hard work. The village, once known only for its fishermen, was now celebrated for its rich history and Siva's incredible discovery.

Siva continued his research, dedicating his life to uncovering the secrets of the sea. His journey inspired many young villagers to pursue their passions, proving that with determination and hard work, even the wildest dreams could come true.
'''

In [14]:
summarize_text(text, model, tokenizer, max_length=150, min_length=40, num_beams=4)

'the "golden pearl" was a sunken ship off the coast of Mangala Puram. he spent hours watching the waves crash against the rocks, dreaming of the day he could dive into the depths. his father, a seasoned fisherman, wanted him to follow in his footsteps.'

## **Fine-Tuning the Model**





In [21]:
import torch
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
import evaluate

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

# Training the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3803,0.344125
2,0.3716,0.336938
3,0.3636,0.332871
4,0.3592,0.332321
5,0.3607,0.331042


TrainOutput(global_step=18415, training_loss=0.3686716693398618, metrics={'train_runtime': 4166.624, 'train_samples_per_second': 17.679, 'train_steps_per_second': 4.42, 'total_flos': 9969277096427520.0, 'train_loss': 0.3686716693398618, 'epoch': 5.0})

## **saving the model**

In [22]:
model.save_pretrained('fine-tuned-abstractive/')
tokenizer.save_pretrained('fine-tuned-abstractive/')

('fine-tuned-abstractive/tokenizer_config.json',
 'fine-tuned-abstractive/special_tokens_map.json',
 'fine-tuned-abstractive/spiece.model',
 'fine-tuned-abstractive/added_tokens.json')

## **Evaluating the model**

In [23]:
from datasets import load_metric

In [24]:
rouge=load_metric('rouge')

In [25]:
from transformers import T5ForConditionalGeneration

# Loading the fine-tuned model
local_model = T5ForConditionalGeneration.from_pretrained('fine-tuned-abstractive/')

def evaluate(data, idx, local_tokenizer):
    inputs=local_tokenizer('summarization: '+data['dialogue'][idx],return_tensors='pt')
    summary=local_model.generate(inputs['input_ids'],max_length=64) # Use the loaded model
    summary=local_tokenizer.decode(summary[0],skip_special_tokens=True)
    return summary

In [30]:
predicted_summary = [evaluate(dataset['validation'], i, tokenizer) for i in range(10)]

In [31]:
scores=rouge.compute(references=dataset['validation']['dialogue'][:10],predictions=predicted_summary)

## **Rouge Scores**

In [33]:
for key in scores:
    print(f"{key}: {scores[key].high}")

rouge1: Score(precision=0.8457111359643189, recall=0.3345209725873078, fmeasure=0.442208369480505)
rouge2: Score(precision=0.5562528510312466, recall=0.2222090926265939, fmeasure=0.28804964834435565)
rougeL: Score(precision=0.7565472428248429, recall=0.30642550565106536, fmeasure=0.39660653427677967)
rougeLsum: Score(precision=0.8323487168024253, recall=0.3467631676208023, fmeasure=0.4501571017787591)
