## Installing Libraries

In [16]:
!nvidia-smi

Sun Jun 16 10:15:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0              29W /  70W |   2291MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [17]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [19]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Collecting accelerate
  Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Installing collected packages: accelerate
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
peft 0.4.0 requires transformers, which is not installed.
trl 0.4.7 requires transformers>=4.18.0, which is not installed.[0m[31m
[0mSuccessfully installed accelerate-0.31.0
[0mFound existing installation: accelerate 0.31.0
Uninstalling accelerate-0.31.0:
  Successfully uninstalled accelerate-0.31.0
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-0.31.0 trans

In [1]:
from transformers import pipeline,set_seed
from datasets import load_dataset,load_from_disk
import matplotlib.pyplot as plt
from datasets import load_dataset
import pandas as pd
from datasets import load_dataset,load_metric

from transformers import AutoModelForSeq2SeqLM,AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
device ="cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Defining Model,Tokenizer and Loading Dataset
##### Using Same model (pegasus) for training & Encoding

In [3]:
model_ckpt="google/pegasus-cnn_dailymail"

tokenizer=AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus=AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
## Download & unzip the data

# dataset = load_dataset("Samsung/samsum")
# !wget https://github.com/vaibahvk808/samsum_summarizer_data/blob/main/summarizer-data.zip
# !unzip summarizer-data

# path =/content/drive/MyDrive/Code 2024/Datasets/summarizer-data/samsum_dataset

In [4]:
dataset_samsum=load_from_disk("/content/drive/MyDrive/Code 2024/Datasets/summarizer-data/samsum_dataset")

In [5]:
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [6]:
split_lengths=[len(dataset_samsum[split]) for split in dataset_samsum]

print(f"Split lenghts:{split_lengths}")
print(f"Features:{dataset_samsum['train'].column_names}")
print("\Dialogue:")

print(dataset_samsum["test"][1]["dialogue"])

print("\nSummary:")
print(dataset_samsum["test"][1]["summary"])

Split lenghts:[14732, 819, 818]
Features:['id', 'dialogue', 'summary']
\Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


## Encoding the Input & Output text for training & Evalulation

In [7]:
def convert_examples_to_features(example_batch):
    input_encodings=tokenizer(example_batch['dialogue'],max_length=1024,truncation=True)
    with tokenizer.as_target_tokenizer():
      target_encodings=tokenizer(example_batch['summary'],max_length=18,truncation=True)

    return{
        'input_ids':input_encodings['input_ids'],
        'attention_mask':input_encodings['attention_mask'],
        'labels':target_encodings['input_ids']
    }

In [8]:
dataset_samsum_pt=dataset_samsum.map(convert_examples_to_features,batched=True)

In [9]:
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

## Training the Model - Defining the Hyper-parameter of model  & assigning training and  evaluation dataset


In [10]:
# Training
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator=DataCollatorForSeq2Seq(tokenizer,model=model_pegasus)

In [None]:
# ! pip install -U accelerate
# ! pip install -U transformers
# import accelerate
# import transformers

# transformers.__version__
# !pip install transformers[torch]

In [11]:
from transformers import TrainingArguments,Trainer

trainer_args=TrainingArguments(
    output_dir='pegasus-samsum',num_train_epochs=1,warmup_steps=500,
    per_device_train_batch_size=1,per_device_eval_batch_size=1,
    weight_decay=0.01,logging_steps=10,
    eval_strategy='steps',eval_steps=500,save_steps=1e6,
    gradient_accumulation_steps=16
)

In [12]:
## Adding Test here as Training has lot of examples , just to understand the process. In reality train should be passed
trainer=Trainer(model=model_pegasus,args=trainer_args,
                tokenizer=tokenizer,data_collator=seq2seq_data_collator,
                train_dataset=dataset_samsum_pt["test"], ## train should be passed here
                eval_dataset=dataset_samsum_pt["validation"])

In [13]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=51, training_loss=3.3507661211724376, metrics={'train_runtime': 187.4568, 'train_samples_per_second': 4.369, 'train_steps_per_second': 0.272, 'total_flos': 313450454089728.0, 'train_loss': 3.3507661211724376, 'epoch': 0.9963369963369964})

## Evaluation of the Model - Using rouge score for Text Summarization

In [20]:
# Evaluation
def generate_batch_sized_chunks(list_of_elements,batch_size):
  """split the dataset into smaller batches that we can process simulataneously
  Yield successive batch-suzed chunks from list_of_elements."""
  for i in range(0,len(list_of_elements),batch_size):
    yield list_of_elements[i:i+batch_size]

def calculate_metric_on_test_ds(dataset,metric,model,tokenizer,batch_size=16,device=device,column_text="article",column_summary="highlights"):
  article_batches=list(generate_batch_sized_chunks(dataset[column_text],batch_size))
  target_batches=list(generate_batch_sized_chunks(dataset[column_summary],batch_size))

  for article_batch,target_batch in tqdm(zip(article_batches,target_batches),total=len(article_batches)):
    inputs=tokenizer(article_batch,max_length=1024,truncation=True,padding="max_length",return_tensors="pt")
    summaries=model.generate(input_ids=inputs["input_ids"].to(device),
                             attention_mask=inputs["attention_mask"].to(device),
                             length_penalty=0.8,num_beams=8,max_length=128)

    #We decode the generated texts, replace the token, and add teh decoded texts with the reference to the metric.
    decoded_summaries=[tokenizer.decode(s,skip_special_tokens=True,
                                        clean_up_tokenization_spaces=True)
                        for s in summaries]
    decoded_summaries=[d.replace(""," ") for d in decoded_summaries]

    metric.add_batch(predictions=decoded_summaries,references=target_batch)

  # finally compute & return the ROGUE Score
  score = metric.compute()
  return score

In [18]:
rouge_names=["rouge1","rouge2","rougeL","rougeLsum"]
rouge_metric=load_metric('rouge')

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [23]:
## Doing on 10 test dataset just to save time, in reality do it on complete test data
score=calculate_metric_on_test_ds(
    dataset_samsum['test'][0:10],rouge_metric,trainer.model,tokenizer,batch_size=2,column_text='dialogue',column_summary='summary'
)

rouge_dict=dict((rn,score[rn].mid.fmeasure)for rn in rouge_names)

pd.DataFrame(rouge_dict,index=[f'pegasus'])

100%|██████████| 5/5 [00:22<00:00,  4.58s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.018348,0.0,0.01827,0.018042


## Saving the model & Tokenizer

In [24]:
#Save Model
model_pegasus.save_pretrained("pegaus-samsum-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [27]:
# Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
## Predictions

## Doing Prediction on sample output for QC

In [1]:
## Prediction
# Load tokenizer
tokenizer=AutoTokenizer.from_pretrained("/content/tokenizer")

gen_kwargs={"length_penalty":0.8,"num_beams":8,"max_length":128}

sample_text=dataset_samsum["test"][0]["dialogue"]
reference=dataset_samsum["test"][0]["summary"]
pipe=pipeline("summarization",model="pegaus-samsum-model",tokenizer=tokenizer)


print("Dialogue")
print(sample_text)

print("\n Reference Summary")
print(reference)

print("\n Model Summary:")
print(pipe(sample_text,**gen_kwargs)[0]["summary_text"])

NameError: name 'AutoTokenizer' is not defined