# Installing Dependencies

In [1]:
!pip install transformers gdown rouge rouge_score -q

In [2]:
!gdown --id 1QGknCbFF7C5IKQ69VlyhiCxVTn39sF30

Downloading...
From: https://drive.google.com/uc?id=1QGknCbFF7C5IKQ69VlyhiCxVTn39sF30
To: /kaggle/working/filtered_news_data.csv
100%|██████████████████████████████████████| 16.6M/16.6M [00:00<00:00, 65.8MB/s]


# Checking For GPU

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

# Loading Data

In [4]:
import pandas as pd
news_data = pd.read_csv('/kaggle/working/filtered_news_data.csv')
news_data.head()

Unnamed: 0.1,Unnamed: 0,text,ctext,text_len,ctext_len,text_sent_count,text_word_count,ctext_sent_count,ctext_word_count,preprocessed_text,preprocessed_ctext
0,0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,358,2313,2,62,16,413,the administration of union territory daman an...,the daman and diu administration on wednesday ...
1,2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...,398,2112,3,70,18,379,the indira gandhi institute of medical science...,the indira gandhi institute of medical science...
2,4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...,366,3249,3,67,8,569,hotels in maharashtra will train their staff t...,hotels in mumbai and other indian cities are t...
3,5,A 32-year-old man on Wednesday was found hangi...,An alleged suspect in a kidnapping case was fo...,347,2247,3,63,23,440,a 32yearold man on wednesday was found hanging...,an alleged suspect in a kidnapping case was fo...
4,6,The Delhi High Court reduced the compensation ...,"In an interesting ruling, the Delhi high court...",361,2367,3,70,11,444,the delhi high court reduced the compensation ...,in an interesting ruling the delhi high court ...


# Preparing Data

In [5]:
news_data = news_data[['preprocessed_text', 'preprocessed_ctext']]
news_data.columns = ['summary', 'news']
news_data['news'] = 'summarize: ' + news_data['news']
news_data.head()

Unnamed: 0,summary,news
0,the administration of union territory daman an...,summarize: the daman and diu administration on...
1,the indira gandhi institute of medical science...,summarize: the indira gandhi institute of medi...
2,hotels in maharashtra will train their staff t...,summarize: hotels in mumbai and other indian c...
3,a 32yearold man on wednesday was found hanging...,summarize: an alleged suspect in a kidnapping ...
4,the delhi high court reduced the compensation ...,summarize: in an interesting ruling the delhi ...


# Converting Pandas Dataframe to Hugging Face Dataset

## Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

train_news, test_news = train_test_split(news_data, test_size=0.1, shuffle = True)
print("No. of Train and Validation Datapoints: ",len(train_news))
print("No. of Test Datapoints: ",len(test_news))

No. of Train and Validation Datapoints:  3524
No. of Test Datapoints:  392


In [7]:
# Saving the global pandas version of train and test dataframes
global_train_news = train_news
global_test_news = test_news

In [38]:
# Saving global train and test data as csv for backup
global_train_news.to_csv('/kaggle/working/train_news.csv', index=False)
global_test_news.to_csv('/kaggle/working/test_news.csv', index=False)

In [8]:
# Converting train set to hugging face dataset
import datasets
from datasets import Dataset, DatasetDict

train_news = datasets.Dataset.from_pandas(train_news)
train_news

Dataset({
    features: ['summary', 'news', '__index_level_0__'],
    num_rows: 3524
})

In [9]:
# Cleaning hugging face dataset
train_news = train_news.remove_columns(["__index_level_0__"])
train_news

Dataset({
    features: ['summary', 'news'],
    num_rows: 3524
})

In [10]:
# split train into train and val
train_news = train_news.train_test_split(test_size=0.2, shuffle=True)
train_news

DatasetDict({
    train: Dataset({
        features: ['summary', 'news'],
        num_rows: 2819
    })
    test: Dataset({
        features: ['summary', 'news'],
        num_rows: 705
    })
})

In [11]:
# Fitting into dataset dict
train_val_dataset = DatasetDict({
    'train': train_news["train"],
    'val': train_news['test']})

# T5 Small Modeling

## Loading Model and Tokenizer

In [12]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
t5_small_model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
t5_small_tokenizer = T5Tokenizer.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def prepare_dataset(data):
    inputs = data["news"]
    model_inputs = t5_small_tokenizer(inputs, max_length=512, truncation=True)
    labels = t5_small_tokenizer(text_target=data["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
t5_small_tokenized_data = train_val_dataset.map(prepare_dataset, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
t5_small_tokenized_data

DatasetDict({
    train: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2819
    })
    val: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 705
    })
})

## Data Collator

In [16]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer= t5_small_tokenizer, model=t5_small_model)

2024-03-29 09:34:28.395890: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 09:34:28.396043: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 09:34:28.572345: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Compute Metrics

In [17]:
from rouge import Rouge
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = t5_small_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, t5_small_tokenizer.pad_token_id)
    decoded_labels = t5_small_tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = Rouge().get_scores(decoded_preds, decoded_labels, avg=True, ignore_empty=True)

    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)

    return result

## Setting Training Arguments

In [18]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small-news-sum-fine",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

## Setting up Trainer

In [19]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = t5_small_model,
    args = training_args,
    train_dataset = t5_small_tokenized_data["train"],
    eval_dataset = t5_small_tokenized_data["val"],
    tokenizer = t5_small_tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Training

In [20]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
1,No log,1.799671,"{'r': 0.15908538565382108, 'p': 0.6461251282659629, 'f': 0.2527181961734281}","{'r': 0.06735686067093552, 'p': 0.3541911665722681, 'f': 0.11203090086144164}","{'r': 0.14640446701355142, 'p': 0.5958883047594268, 'f': 0.23267297486728808}"
2,No log,1.755471,"{'r': 0.16144930472795405, 'p': 0.6513760132915664, 'f': 0.2561248130643206}","{'r': 0.06945204879884524, 'p': 0.3615431482484053, 'f': 0.11532792625730404}","{'r': 0.14864695049714508, 'p': 0.6009034987389511, 'f': 0.23589322556949116}"
3,1.952600,1.739642,"{'r': 0.16259322276324512, 'p': 0.6545823934559862, 'f': 0.25785767200736864}","{'r': 0.07020101198635677, 'p': 0.36497510045695325, 'f': 0.11657172018558735}","{'r': 0.14941851327848202, 'p': 0.6029044696785627, 'f': 0.2370518674419431}"
4,1.952600,1.729928,"{'r': 0.16303760528221423, 'p': 0.6549508381110389, 'f': 0.25841781212514314}","{'r': 0.07054610267107185, 'p': 0.36668576185785234, 'f': 0.11712465550530764}","{'r': 0.14986077105974668, 'p': 0.6042206573151508, 'f': 0.23769349478649066}"
5,1.952600,1.726962,"{'r': 0.1629178931044338, 'p': 0.6559093988568335, 'f': 0.2583224821375902}","{'r': 0.07065762114180346, 'p': 0.3682244418539791, 'f': 0.11735158230760047}","{'r': 0.14986402840367127, 'p': 0.6056132796652199, 'f': 0.23778793843267182}"




TrainOutput(global_step=885, training_loss=1.922504179356462, metrics={'train_runtime': 623.3571, 'train_samples_per_second': 22.611, 'train_steps_per_second': 1.42, 'total_flos': 1907642691747840.0, 'train_loss': 1.922504179356462, 'epoch': 5.0})

## Saving Model

In [21]:
# save the model
model_path = "t5-small-news-sum-fine"
trainer.save_model(model_path)
t5_small_tokenizer.save_pretrained(model_path)

('t5-small-news-sum-fine/tokenizer_config.json',
 't5-small-news-sum-fine/special_tokens_map.json',
 't5-small-news-sum-fine/spiece.model',
 't5-small-news-sum-fine/added_tokens.json')

## Model Inferencing

### Loading Model and Tokenizer

In [22]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained("/kaggle/working/t5-small-news-sum-fine").to(device)
tokenizer = T5Tokenizer.from_pretrained("/kaggle/working/t5-small-news-sum-fine")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Create Pipeline

In [27]:
from transformers import pipeline
t5_small_summarizer = pipeline("summarization", model = model,tokenizer = tokenizer, device=1)

### Inference on some instances

In [28]:
for i in range(3):
    summary = t5_small_summarizer(test_news['news'].iloc[i])
    print("Reference Summary:- ", test_news['summary'].iloc[i])
    print("\nGenerated Summary:- ",summary[0]['summary_text'])
    print("=="*50)
    print()

Reference Summary:-  the indian space research organisation isro is set to launch record 103 satellites in a single flight in february first week . as many as 100 of the satellites to be launched are from foreign nations . earlier isro had said it will launch 83 satellites but the launch got delayed due to the addition of more satellites an official said .

Generated Summary:-  the indian space research organisation isro is set to launch 103 satellites in one go using its workhorse pslvc37 in the first week of february . as many as 100 of the satellites set for launch belong to foreign nations including the united states and germany .



Your max_length is set to 200, but your input_length is only 197. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)


Reference Summary:-  women and children will be allowed to use toilet facilities at hotels and restaurants including fivestars in south delhi for free from may 1 . security concerns coupled with the fact that some restaurants and bars dont allow single males made us keep men out of the scheme for now national restaurants association head riaz amlani said .

Generated Summary:-  south delhi municipal corporation sdmc commissioner puneet kumar goel said that hotels restaurants and eateries in the area will open their toilets to public use from may 1 . however men have been kept of out of the scheme due to privacy issues cited by various hotel associations .

Reference Summary:-  an image of a pregnant us president donald trump being hugged by russian president vladimir putin from behind was projected on buildings in new york along with lovethroughhate . the image was a part of an ad campaign by the dating app hater . were just trying to make people laugh . through humour hate can turn in

## Testing on Test Set

In [30]:
def generate_summary_t5(text):
    summary = t5_small_summarizer(text)
    return summary[0]['summary_text']

In [31]:
test_news['t5_small_summary'] = test_news['news'].apply(lambda x: generate_summary_t5(x))

Your max_length is set to 200, but your input_length is only 197. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)
Your max_length is set to 200, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 200, but your input_length is only 170. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 200, but your input_length is only 142. Si

In [32]:
test_news

Unnamed: 0,summary,news,t5_small_summary
1492,the indian space research organisation isro is...,summarize: the indian space research organisat...,the indian space research organisation isro is...
3070,women and children will be allowed to use toil...,summarize: starting monday women and children ...,south delhi municipal corporation sdmc commiss...
895,an image of a pregnant us president donald tru...,summarize: the image of russian president vlad...,the image of russian president vladimir putin ...
3867,banking operations across the country came to ...,summarize: banking operations across the count...,banking operations across the country came to ...
3707,indian national congress could manage to win o...,summarize: six out of 10 assembly seats in ame...,six out of 10 assembly seats in amethi and rae...
...,...,...,...
1350,sasi gangadharan a semiparalysed man from kera...,summarize: eighteen years ago sasi gangadharan...,sasi gangadharan a coconut tree climber from t...
1384,usbased researchers have identified that the h...,summarize: washington jan 10 pti the human app...,the human appendix a narrow pouch that project...
2960,aap mla kapil mishra who was sacked on saturda...,summarize: rumblings in the aam aadmi party co...,delhi water minister kapil mishra has been sac...
1119,a child rights commission in tamil nadu has re...,summarize: after a complaint filed by a child ...,tamil nadu commission for protection of child ...


## Evaluate on Test Data

In [33]:
from rouge_score import rouge_scorer
import numpy as np

def evaluate_test_summaries(news_data):
    scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'])
    rouge_scores = []

    for idx, row in test_news.iterrows():
        scores = scorer.score(target=row['summary'], prediction=row['t5_small_summary'])
        rouge_scores.append(scores)

    avg_rouge1_precision = np.mean([score['rouge1'].precision for score in rouge_scores])
    avg_rouge1_recall = np.mean([score['rouge1'].recall for score in rouge_scores])
    avg_rouge1_f1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])

    avg_rouge2_precision = np.mean([score['rouge2'].precision for score in rouge_scores])
    avg_rouge2_recall = np.mean([score['rouge2'].recall for score in rouge_scores])
    avg_rouge2_f1 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])

    avg_rougeL_precision = np.mean([score['rougeL'].precision for score in rouge_scores])
    avg_rougeL_recall = np.mean([score['rougeL'].recall for score in rouge_scores])
    avg_rougeL_f1 = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    print("Average ROUGE-1 Precision: ", avg_rouge1_precision)
    print("Average ROUGE-1 Recall: ", avg_rouge1_recall)
    print("Average ROUGE-1 F1-Score: ", avg_rouge1_f1)
    print("Average ROUGE-2 Precision: ", avg_rouge2_precision)
    print("Average ROUGE-2 Recall: ", avg_rouge2_recall)
    print("Average ROUGE-2 F1-Score: ", avg_rouge2_f1)
    print("Average ROUGE-L Precision: ", avg_rougeL_precision)
    print("Average ROUGE-L Recall: ", avg_rougeL_recall)
    print("Average ROUGE-L F1-Score: ", avg_rougeL_f1)

In [34]:
print("For T5-Small (FineTuned):- ")
print("\nEvaluation for the test summary: \n")
evaluate_test_summaries(test_news)

For T5-Small (FineTuned):- 

Evaluation for the test summary: 

Average ROUGE-1 Precision:  0.47245357485299827
Average ROUGE-1 Recall:  0.43645387053163986
Average ROUGE-1 F1-Score:  0.4480127211825521
Average ROUGE-2 Precision:  0.2405866159498558
Average ROUGE-2 Recall:  0.22256570251633492
Average ROUGE-2 F1-Score:  0.22822762309677963
Average ROUGE-L Precision:  0.351249264194184
Average ROUGE-L Recall:  0.3238438765256885
Average ROUGE-L F1-Score:  0.33267878929317674


## Download the Saved Model

### Create Zip File

In [35]:
import shutil
# Define the directory containing your model files
model_directory = "/kaggle/working/t5-small-news-sum-fine"
# Define the name for your zip file
zip_file_name = "t5-small-news-sum-fine"
# Create a zip file containing the model directory
shutil.make_archive(zip_file_name, 'zip', model_directory)

'/kaggle/working/t5-small-news-sum-fine.zip'

### Generate Downloadable Link

In [36]:
!ls

filtered_news_data.csv	t5-small-news-sum-fine	t5-small-news-sum-fine.zip


In [37]:
from IPython.display import FileLink
FileLink(r't5-small-news-sum-fine.zip')

# Empty Cache Memory

In [40]:
# Free up memory
import torch
torch.cuda.empty_cache()

# T5 Base Modeling

## Loading Tokenizer and Model

In [41]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
t5_base_model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
t5_base_tokenizer = T5Tokenizer.from_pretrained("t5-base")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [42]:
def prepare_dataset(data):
    inputs = data["news"]
    model_inputs = t5_base_tokenizer(inputs, max_length=512, truncation=True)
    labels = t5_base_tokenizer(text_target=data["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [46]:
t5_base_tokenized_data = train_val_dataset.map(prepare_dataset, batched=True)
t5_base_tokenized_data

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2819
    })
    val: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 705
    })
})

## Data Collator

In [43]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer= t5_base_tokenizer, model=t5_base_model)

## Compute Metrics

In [44]:
from rouge import Rouge
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = t5_base_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, t5_base_tokenizer.pad_token_id)
    decoded_labels = t5_base_tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = Rouge().get_scores(decoded_preds, decoded_labels, avg=True, ignore_empty=True)

    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)

    return result

## Setting Training Arguments

In [45]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="t5-base-news-sum-fine",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

## Setting up Trainer

In [47]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = t5_base_model,
    args = training_args,
    train_dataset = t5_base_tokenized_data["train"],
    eval_dataset = t5_base_tokenized_data["val"],
    tokenizer = t5_base_tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Training

In [48]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
1,No log,1.450725,"{'r': 0.1628415189764626, 'p': 0.6634320626529636, 'f': 0.2588926368388458}","{'r': 0.07117565265418536, 'p': 0.37584502319170643, 'f': 0.11846411765684098}","{'r': 0.1497600295477841, 'p': 0.6116983906095302, 'f': 0.23817615055031585}"
2,No log,1.416676,"{'r': 0.167401705669421, 'p': 0.6765066846887878, 'f': 0.2656676487797007}","{'r': 0.0749656884820431, 'p': 0.3914811624198356, 'f': 0.12454612905039124}","{'r': 0.15474660702986653, 'p': 0.6263578785512459, 'f': 0.24564630690342165}"
3,1.581000,1.404616,"{'r': 0.16809011236165894, 'p': 0.6834468572265822, 'f': 0.26700664848010097}","{'r': 0.07567568020110897, 'p': 0.39971589354880954, 'f': 0.12591480599258173}","{'r': 0.15541884195389366, 'p': 0.6332211311842105, 'f': 0.24695705663618991}"
4,1.581000,1.394844,"{'r': 0.1678356918497007, 'p': 0.6834247020073057, 'f': 0.2666485805808682}","{'r': 0.0762194738425002, 'p': 0.40338205726503623, 'f': 0.12683671761161106}","{'r': 0.155688883473455, 'p': 0.635313086417593, 'f': 0.24742679487326824}"
5,1.581000,1.393673,"{'r': 0.16785097776704277, 'p': 0.683698504249193, 'f': 0.266688874276523}","{'r': 0.07615434736587069, 'p': 0.4033418472780176, 'f': 0.12674555284057912}","{'r': 0.15581402880410963, 'p': 0.636027570908673, 'f': 0.2476394822238726}"




TrainOutput(global_step=885, training_loss=1.5368183373057909, metrics={'train_runtime': 1826.9657, 'train_samples_per_second': 7.715, 'train_steps_per_second': 0.484, 'total_flos': 8583261467443200.0, 'train_loss': 1.5368183373057909, 'epoch': 5.0})

## Saving Model

In [49]:
# save the model
model_path = "t5-base-news-sum-fine"
trainer.save_model(model_path)
t5_base_tokenizer.save_pretrained(model_path)

('t5-base-news-sum-fine/tokenizer_config.json',
 't5-base-news-sum-fine/special_tokens_map.json',
 't5-base-news-sum-fine/spiece.model',
 't5-base-news-sum-fine/added_tokens.json')

## Model Inferencing

### Loading Model and Tokenizer

In [50]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained("/kaggle/working/t5-base-news-sum-fine").to(device)
tokenizer = T5Tokenizer.from_pretrained("/kaggle/working/t5-base-news-sum-fine")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Create Pipeline

In [51]:
from transformers import pipeline
t5_base_summarizer = pipeline("summarization", model = model,tokenizer = tokenizer, device=1)

### Inference on some instances

In [52]:
for i in range(3):
    summary = t5_base_summarizer(test_news['news'].iloc[i])
    print("Reference Summary:- ", test_news['summary'].iloc[i])
    print("\nGenerated Summary:- ",summary[0]['summary_text'])
    print("=="*50)
    print()

Reference Summary:-  the indian space research organisation isro is set to launch record 103 satellites in a single flight in february first week . as many as 100 of the satellites to be launched are from foreign nations . earlier isro had said it will launch 83 satellites but the launch got delayed due to the addition of more satellites an official said .

Generated Summary:-  the indian space research organisation isro is all set to launch a record 103 satellites in one go using its workhorse pslvc37 in the first week of february . we are making a century by launching over 100 satellites at one go said s somnath director of the liquid propulsion systems centre . the launch was delayed by a week with the addition of 20 more foreign satellites .



Your max_length is set to 200, but your input_length is only 197. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)


Reference Summary:-  women and children will be allowed to use toilet facilities at hotels and restaurants including fivestars in south delhi for free from may 1 . security concerns coupled with the fact that some restaurants and bars dont allow single males made us keep men out of the scheme for now national restaurants association head riaz amlani said .

Generated Summary:-  women and children will no longer have to go looking for public washrooms to relieve themselves after a long day of shopping at south delhi markets . simply walking into the nearest restaurant and conveying their need to the staff will make posh toilets accessible to them completely free of cost . however men have been kept of out of the scheme due to privacy issues cited by various hotel associations .

Reference Summary:-  an image of a pregnant us president donald trump being hugged by russian president vladimir putin from behind was projected on buildings in new york along with lovethroughhate . the image wa

## Testing on Test Set

In [53]:
def generate_summary_t5(text):
    summary = t5_base_summarizer(text)
    return summary[0]['summary_text']

In [54]:
test_news['t5_base_summary'] = test_news['news'].apply(lambda x: generate_summary_t5(x))

Your max_length is set to 200, but your input_length is only 197. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)
Your max_length is set to 200, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 200, but your input_length is only 170. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 200, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)


In [55]:
test_news

Unnamed: 0,summary,news,t5_small_summary,t5_base_summary
1492,the indian space research organisation isro is...,summarize: the indian space research organisat...,the indian space research organisation isro is...,the indian space research organisation isro is...
3070,women and children will be allowed to use toil...,summarize: starting monday women and children ...,south delhi municipal corporation sdmc commiss...,women and children will no longer have to go l...
895,an image of a pregnant us president donald tru...,summarize: the image of russian president vlad...,the image of russian president vladimir putin ...,the image of russian president vladimir putin ...
3867,banking operations across the country came to ...,summarize: banking operations across the count...,banking operations across the country came to ...,10 lakh bankers staged a strike against the go...
3707,indian national congress could manage to win o...,summarize: six out of 10 assembly seats in ame...,six out of 10 assembly seats in amethi and rae...,six out of 10 assembly seats in amethi and rae...
...,...,...,...,...
1350,sasi gangadharan a semiparalysed man from kera...,summarize: eighteen years ago sasi gangadharan...,sasi gangadharan a coconut tree climber from t...,sasi gangadharan a coconut tree climber from t...
1384,usbased researchers have identified that the h...,summarize: washington jan 10 pti the human app...,the human appendix a narrow pouch that project...,the human appendix a narrow pouch that project...
2960,aap mla kapil mishra who was sacked on saturda...,summarize: rumblings in the aam aadmi party co...,delhi water minister kapil mishra has been sac...,sacked delhi water minister kapil mishra has m...
1119,a child rights commission in tamil nadu has re...,summarize: after a complaint filed by a child ...,tamil nadu commission for protection of child ...,tamil nadu commission for protection of child ...


## Evaluate on Test Data

In [56]:
from rouge_score import rouge_scorer
import numpy as np

def evaluate_test_summaries(news_data):
    scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'])
    rouge_scores = []

    for idx, row in test_news.iterrows():
        scores = scorer.score(target=row['summary'], prediction=row['t5_base_summary'])
        rouge_scores.append(scores)

    avg_rouge1_precision = np.mean([score['rouge1'].precision for score in rouge_scores])
    avg_rouge1_recall = np.mean([score['rouge1'].recall for score in rouge_scores])
    avg_rouge1_f1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])

    avg_rouge2_precision = np.mean([score['rouge2'].precision for score in rouge_scores])
    avg_rouge2_recall = np.mean([score['rouge2'].recall for score in rouge_scores])
    avg_rouge2_f1 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])

    avg_rougeL_precision = np.mean([score['rougeL'].precision for score in rouge_scores])
    avg_rougeL_recall = np.mean([score['rougeL'].recall for score in rouge_scores])
    avg_rougeL_f1 = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    print("Average ROUGE-1 Precision: ", avg_rouge1_precision)
    print("Average ROUGE-1 Recall: ", avg_rouge1_recall)
    print("Average ROUGE-1 F1-Score: ", avg_rouge1_f1)
    print("Average ROUGE-2 Precision: ", avg_rouge2_precision)
    print("Average ROUGE-2 Recall: ", avg_rouge2_recall)
    print("Average ROUGE-2 F1-Score: ", avg_rouge2_f1)
    print("Average ROUGE-L Precision: ", avg_rougeL_precision)
    print("Average ROUGE-L Recall: ", avg_rougeL_recall)
    print("Average ROUGE-L F1-Score: ", avg_rougeL_f1)

In [57]:
print("For T5-Base (FineTuned):- ")
print("\nEvaluation for the test summary: \n")
evaluate_test_summaries(test_news)

For T5-Base (FineTuned):- 

Evaluation for the test summary: 

Average ROUGE-1 Precision:  0.4912017558071222
Average ROUGE-1 Recall:  0.47417111536210593
Average ROUGE-1 F1-Score:  0.47822674414523353
Average ROUGE-2 Precision:  0.2626294725939422
Average ROUGE-2 Recall:  0.25308497289355675
Average ROUGE-2 F1-Score:  0.25524545642826096
Average ROUGE-L Precision:  0.3699302665876597
Average ROUGE-L Recall:  0.35630072322131695
Average ROUGE-L F1-Score:  0.35959601725547086


## Download Saved Model

### Create Zip File

In [58]:
import shutil
# Define the directory containing your model files
model_directory = "/kaggle/working/t5-base-news-sum-fine"
# Define the name for your zip file
zip_file_name = "t5-base-news-sum-fine"
# Create a zip file containing the model directory
shutil.make_archive(zip_file_name, 'zip', model_directory)

'/kaggle/working/t5-base-news-sum-fine.zip'

### Create Downloadable Link

In [59]:
!ls

filtered_news_data.csv	   t5-small-news-sum-fine      train_news.csv
t5-base-news-sum-fine	   t5-small-news-sum-fine.zip
t5-base-news-sum-fine.zip  test_news.csv


In [60]:
from IPython.display import FileLink
FileLink(r't5-base-news-sum-fine.zip')

# Empty Cache Memory

In [61]:
# Free up memory
torch.cuda.empty_cache()

# BART Base Modeling

## Loading Model and Tokenizer

In [63]:
from transformers import BartTokenizer, BartForConditionalGeneration
bart_base_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base').to(device)
bart_base_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

In [64]:
def prepare_dataset(data):
    inputs = data["news"]
    model_inputs = bart_base_tokenizer(inputs, max_length=512, truncation=True)
    labels = bart_base_tokenizer(text_target=data["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [65]:
bart_base_tokenized_data = train_val_dataset.map(prepare_dataset, batched=True)
bart_base_tokenized_data

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2819
    })
    val: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 705
    })
})

## Data Collator

In [66]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer= bart_base_tokenizer, model=bart_base_model)

## Compute Metrics

In [67]:
from rouge import Rouge
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = bart_base_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, bart_base_tokenizer.pad_token_id)
    decoded_labels = bart_base_tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = Rouge().get_scores(decoded_preds, decoded_labels, avg=True, ignore_empty=True)

    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)

    return result

## Setting Training Argumenets

In [68]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="bart-base-news-sum-fine",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

## Setting up Trainer

In [69]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = bart_base_model,
    args = training_args,
    train_dataset = bart_base_tokenized_data["train"],
    eval_dataset = bart_base_tokenized_data["val"],
    tokenizer = bart_base_tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Training

In [70]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
1,No log,1.793528,"{'r': 0.17877187503616232, 'p': 0.6633735408891868, 'f': 0.27982430999463015}","{'r': 0.0829149543039276, 'p': 0.3943548768016855, 'f': 0.1361744735159913}","{'r': 0.16620701152975154, 'p': 0.6180734767249171, 'f': 0.26027416255917174}"
2,No log,1.739008,"{'r': 0.18265433424218153, 'p': 0.6780211738847544, 'f': 0.2859900037578004}","{'r': 0.08602156109003656, 'p': 0.40866643600686176, 'f': 0.14128906702050947}","{'r': 0.17033086390546642, 'p': 0.6333834882958795, 'f': 0.2667910912576665}"
3,2.014600,1.716352,"{'r': 0.18433033008073849, 'p': 0.6831720305374879, 'f': 0.2884914537446572}","{'r': 0.08732257963721683, 'p': 0.41543694485183835, 'f': 0.14345128232652557}","{'r': 0.17161168434433222, 'p': 0.6368711952535485, 'f': 0.26865579090312675}"
4,2.014600,1.714038,"{'r': 0.1841942211796001, 'p': 0.6849368068078965, 'f': 0.2885075271700781}","{'r': 0.08774068854957644, 'p': 0.4188095218414369, 'f': 0.1442362018384003}","{'r': 0.17148308222547035, 'p': 0.6386522675164735, 'f': 0.26868306047593005}"
5,2.014600,1.708896,"{'r': 0.18391563528804872, 'p': 0.6831923261460189, 'f': 0.2880104927303693}","{'r': 0.08708510728351558, 'p': 0.41517203049117946, 'f': 0.14312410321474003}","{'r': 0.1709196568193276, 'p': 0.6360569591952574, 'f': 0.26776069266995317}"


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=885, training_loss=1.8980897030587924, metrics={'train_runtime': 988.1505, 'train_samples_per_second': 14.264, 'train_steps_per_second': 0.896, 'total_flos': 4291144177582080.0, 'train_loss': 1.8980897030587924, 'epoch': 5.0})

## Saving Model

In [71]:
# save the model
model_path = "bart-base-news-sum-fine"
trainer.save_model(model_path)
bart_base_tokenizer.save_pretrained(model_path)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('bart-base-news-sum-fine/tokenizer_config.json',
 'bart-base-news-sum-fine/special_tokens_map.json',
 'bart-base-news-sum-fine/vocab.json',
 'bart-base-news-sum-fine/merges.txt',
 'bart-base-news-sum-fine/added_tokens.json')

## Model Inferencing

### Loading Model and Tokenizer

In [72]:
from transformers import BartForConditionalGeneration, BartTokenizer
model = BartForConditionalGeneration.from_pretrained("/kaggle/working/bart-base-news-sum-fine").to(device)
tokenizer = BartTokenizer.from_pretrained("/kaggle/working/bart-base-news-sum-fine")

### Create pipeline

In [73]:
from transformers import pipeline
bart_base_summarizer = pipeline("summarization", model = model,tokenizer = tokenizer, device=1)

### Inference on some instance

In [74]:
for i in range(3):
    summary = bart_base_summarizer(test_news['news'].iloc[i])
    print("Reference Summary:- ", test_news['summary'].iloc[i])
    print("\nGenerated Summary:- ",summary[0]['summary_text'])
    print("=="*50)
    print()

Reference Summary:-  the indian space research organisation isro is set to launch record 103 satellites in a single flight in february first week . as many as 100 of the satellites to be launched are from foreign nations . earlier isro had said it will launch 83 satellites but the launch got delayed due to the addition of more satellites an official said .

Generated Summary:-  the indian space research organisation isro is set to launch 103 satellites in one go using its workhorse pslvc37 in the first week of february . as many as 100 of the satellites set for launch this year belong to foreign nations including the united states and germany . we are making a century by launching over 100 satellites at one go said s somnath .

Reference Summary:-  women and children will be allowed to use toilet facilities at hotels and restaurants including fivestars in south delhi for free from may 1 . security concerns coupled with the fact that some restaurants and bars dont allow single males mad

## Testing on Test Set

In [76]:
def generate_summary_bart_base(text):
    summary = bart_base_summarizer(text)
    return summary[0]['summary_text']

In [77]:
test_news['bart_base_summary'] = test_news['news'].apply(lambda x: generate_summary_bart_base(x))

Your max_length is set to 128, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)
Your max_length is set to 128, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 128, but your input_length is only 60. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your max_length is set to 128, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
You

In [78]:
test_news

Unnamed: 0,summary,news,t5_small_summary,t5_base_summary,bart_base_summary
1492,the indian space research organisation isro is...,summarize: the indian space research organisat...,the indian space research organisation isro is...,the indian space research organisation isro is...,the indian space research organisation isro is...
3070,women and children will be allowed to use toil...,summarize: starting monday women and children ...,south delhi municipal corporation sdmc commiss...,women and children will no longer have to go l...,south delhi municipal corporation sdmc commiss...
895,an image of a pregnant us president donald tru...,summarize: the image of russian president vlad...,the image of russian president vladimir putin ...,the image of russian president vladimir putin ...,the image of russian president vladimir putin ...
3867,banking operations across the country came to ...,summarize: banking operations across the count...,banking operations across the country came to ...,10 lakh bankers staged a strike against the go...,banking operations across the country came to ...
3707,indian national congress could manage to win o...,summarize: six out of 10 assembly seats in ame...,six out of 10 assembly seats in amethi and rae...,six out of 10 assembly seats in amethi and rae...,bjp president amit shah on monday said that si...
...,...,...,...,...,...
1350,sasi gangadharan a semiparalysed man from kera...,summarize: eighteen years ago sasi gangadharan...,sasi gangadharan a coconut tree climber from t...,sasi gangadharan a coconut tree climber from t...,a coconut tree climber from keralas thiruvanan...
1384,usbased researchers have identified that the h...,summarize: washington jan 10 pti the human app...,the human appendix a narrow pouch that project...,the human appendix a narrow pouch that project...,a new study has found that the human appendix ...
2960,aap mla kapil mishra who was sacked on saturda...,summarize: rumblings in the aam aadmi party co...,delhi water minister kapil mishra has been sac...,sacked delhi water minister kapil mishra has m...,former delhi water minister kapil mishra has m...
1119,a child rights commission in tamil nadu has re...,summarize: after a complaint filed by a child ...,tamil nadu commission for protection of child ...,tamil nadu commission for protection of child ...,the tamil nadu commission for protection of ch...


## Evaluate on Test Data

In [79]:
from rouge_score import rouge_scorer
import numpy as np

def evaluate_test_summaries(news_data):
    scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'])
    rouge_scores = []

    for idx, row in test_news.iterrows():
        scores = scorer.score(target=row['summary'], prediction=row['bart_base_summary'])
        rouge_scores.append(scores)

    avg_rouge1_precision = np.mean([score['rouge1'].precision for score in rouge_scores])
    avg_rouge1_recall = np.mean([score['rouge1'].recall for score in rouge_scores])
    avg_rouge1_f1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])

    avg_rouge2_precision = np.mean([score['rouge2'].precision for score in rouge_scores])
    avg_rouge2_recall = np.mean([score['rouge2'].recall for score in rouge_scores])
    avg_rouge2_f1 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])

    avg_rougeL_precision = np.mean([score['rougeL'].precision for score in rouge_scores])
    avg_rougeL_recall = np.mean([score['rougeL'].recall for score in rouge_scores])
    avg_rougeL_f1 = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    print("Average ROUGE-1 Precision: ", avg_rouge1_precision)
    print("Average ROUGE-1 Recall: ", avg_rouge1_recall)
    print("Average ROUGE-1 F1-Score: ", avg_rouge1_f1)
    print("Average ROUGE-2 Precision: ", avg_rouge2_precision)
    print("Average ROUGE-2 Recall: ", avg_rouge2_recall)
    print("Average ROUGE-2 F1-Score: ", avg_rouge2_f1)
    print("Average ROUGE-L Precision: ", avg_rougeL_precision)
    print("Average ROUGE-L Recall: ", avg_rougeL_recall)
    print("Average ROUGE-L F1-Score: ", avg_rougeL_f1)

In [80]:
print("For Bart-Base (FineTuned):- ")
print("\nEvaluation for the test summary: \n")
evaluate_test_summaries(test_news)

For Bart-Base (FineTuned):- 

Evaluation for the test summary: 

Average ROUGE-1 Precision:  0.47284096062155256
Average ROUGE-1 Recall:  0.49304117074314413
Average ROUGE-1 F1-Score:  0.47979170748525635
Average ROUGE-2 Precision:  0.2579776674100123
Average ROUGE-2 Recall:  0.2697483137697911
Average ROUGE-2 F1-Score:  0.2621123065377172
Average ROUGE-L Precision:  0.3548961679574369
Average ROUGE-L Recall:  0.3697732336015233
Average ROUGE-L F1-Score:  0.35998367685178156


## Download Saved Model

### Create Zip File

In [81]:
import shutil
# Define the directory containing your model files
model_directory = "/kaggle/working/bart-base-news-sum-fine"
# Define the name for your zip file
zip_file_name = "bart-base-news-sum-fine"
# Create a zip file containing the model directory
shutil.make_archive(zip_file_name, 'zip', model_directory)

'/kaggle/working/bart-base-news-sum-fine.zip'

### Generate Downloadable Link

In [82]:
!ls

bart-base-news-sum-fine      t5-small-news-sum-fine
bart-base-news-sum-fine.zip  t5-small-news-sum-fine.zip
filtered_news_data.csv	     test_news.csv
t5-base-news-sum-fine	     train_news.csv
t5-base-news-sum-fine.zip


In [83]:
from IPython.display import FileLink
FileLink(r'bart-base-news-sum-fine.zip')

# Empty Cache Memory

In [84]:
# Free up memory
torch.cuda.empty_cache()

# BART Large Modeling

## Loading Model and Tokenizer

In [85]:
from transformers import BartTokenizer, BartForConditionalGeneration
bart_large_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
bart_large_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [86]:
def prepare_dataset(data):
    inputs = data["news"]
    model_inputs = bart_large_tokenizer(inputs, max_length=512, truncation=True)
    labels = bart_large_tokenizer(text_target=data["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [87]:
bart_large_tokenized_data = train_val_dataset.map(prepare_dataset, batched=True)
bart_large_tokenized_data

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2819
    })
    val: Dataset({
        features: ['summary', 'news', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 705
    })
})

## Data Collator

In [88]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer= bart_large_tokenizer, model=bart_large_model)

## Compute Metrics

In [89]:
from rouge import Rouge
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = bart_large_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, bart_large_tokenizer.pad_token_id)
    decoded_labels = bart_large_tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = Rouge().get_scores(decoded_preds, decoded_labels, avg=True, ignore_empty=True)

    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)

    return result

## Setting Training Arguments

In [90]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="bart-large-news-sum-fine",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

## Setting up Trainer

In [91]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model = bart_large_model,
    args = training_args,
    train_dataset = bart_large_tokenized_data["train"],
    eval_dataset = bart_large_tokenized_data["val"],
    tokenizer = bart_large_tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Training

In [92]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
1,No log,1.466648,"{'r': 0.5513922726518424, 'p': 0.49765570977049306, 'f': 0.5203058915324223}","{'r': 0.30436257294222074, 'p': 0.26819716891540835, 'f': 0.28357309855627255}","{'r': 0.5030421308027588, 'p': 0.4542066066608883, 'f': 0.4747898388590857}"
2,No log,1.447289,"{'r': 0.5479768915889259, 'p': 0.495532934738148, 'f': 0.5179448770220181}","{'r': 0.303188275117596, 'p': 0.2674060799628405, 'f': 0.28281248116638374}","{'r': 0.49876597127797895, 'p': 0.4511902347444576, 'f': 0.4715208198355528}"
3,1.322300,1.466044,"{'r': 0.5582892879550762, 'p': 0.49033888336391523, 'f': 0.5198464124648229}","{'r': 0.3088090992622457, 'p': 0.26405719408063544, 'f': 0.2834688467630489}","{'r': 0.5089111927117953, 'p': 0.44705309446249314, 'f': 0.4739243377073237}"
4,1.322300,1.497634,"{'r': 0.55141837580872, 'p': 0.49024787679390586, 'f': 0.5168736806972768}","{'r': 0.3024620376720677, 'p': 0.2622582569880217, 'f': 0.2797803502404694}","{'r': 0.500987746720903, 'p': 0.4456745105318896, 'f': 0.46973820969771496}"
5,1.322300,1.527477,"{'r': 0.549482603818736, 'p': 0.4924769306470366, 'f': 0.5171717309063367}","{'r': 0.30107894552286635, 'p': 0.26399058914475143, 'f': 0.28014563269123727}","{'r': 0.49879938453296546, 'p': 0.4473449307521337, 'f': 0.4696280620352501}"


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=885, training_loss=1.1490949210474046, metrics={'train_runtime': 4327.3487, 'train_samples_per_second': 3.257, 'train_steps_per_second': 0.205, 'total_flos': 1.5251438830485504e+16, 'train_loss': 1.1490949210474046, 'epoch': 5.0})

## Save Model

In [93]:
# save the model
model_path = "bart-large-news-sum-fine"
trainer.save_model(model_path)
bart_large_tokenizer.save_pretrained(model_path)

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('bart-large-news-sum-fine/tokenizer_config.json',
 'bart-large-news-sum-fine/special_tokens_map.json',
 'bart-large-news-sum-fine/vocab.json',
 'bart-large-news-sum-fine/merges.txt',
 'bart-large-news-sum-fine/added_tokens.json')

## Model Inferencing 

#### Load Trained Model and Tokenizer

In [94]:
from transformers import BartForConditionalGeneration, BartTokenizer
model = BartForConditionalGeneration.from_pretrained("/kaggle/working/bart-large-news-sum-fine").to(device)
tokenizer = BartTokenizer.from_pretrained("/kaggle/working/bart-large-news-sum-fine")

### Create Pipeline

In [95]:
from transformers import pipeline
bart_large_summarizer = pipeline("summarization", model = model,tokenizer = tokenizer, device=1)

### Inference on some instances

In [96]:
for i in range(3):
    summary = bart_large_summarizer(test_news['news'].iloc[i])
    print("Reference Summary:- ", test_news['summary'].iloc[i])
    print("\nGenerated Summary:- ",summary[0]['summary_text'])
    print("=="*50)
    print()

Reference Summary:-  the indian space research organisation isro is set to launch record 103 satellites in a single flight in february first week . as many as 100 of the satellites to be launched are from foreign nations . earlier isro had said it will launch 83 satellites but the launch got delayed due to the addition of more satellites an official said .

Generated Summary:-  the indian space research organisation isro is all set to launch a record 103 satellites in one go using its workhorse pslvc37 in the first week of february . as many as 100 of the satellites set for launch belong to foreign nations including the united states and germany . we are making a century by launching over 100 satellites at one go said an isro official .

Reference Summary:-  women and children will be allowed to use toilet facilities at hotels and restaurants including fivestars in south delhi for free from may 1 . security concerns coupled with the fact that some restaurants and bars dont allow single

## Testing on Test Set

In [97]:
def generate_summary_bart_large(text):
    summary = bart_large_summarizer(text)
    return summary[0]['summary_text']

In [98]:
test_news['bart_large_summary'] = test_news['news'].apply(lambda x: generate_summary_bart_large(x))

Your max_length is set to 142, but your input_length is only 92. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)
Your max_length is set to 142, but your input_length is only 137. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 142, but your input_length is only 132. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)
Your max_length is set to 142, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)
Y

## Evaluate on Test Data

In [99]:
from rouge_score import rouge_scorer
import numpy as np

def evaluate_test_summaries(news_data):
    scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL'])
    rouge_scores = []

    for idx, row in test_news.iterrows():
        scores = scorer.score(target=row['summary'], prediction=row['bart_large_summary'])
        rouge_scores.append(scores)

    avg_rouge1_precision = np.mean([score['rouge1'].precision for score in rouge_scores])
    avg_rouge1_recall = np.mean([score['rouge1'].recall for score in rouge_scores])
    avg_rouge1_f1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])

    avg_rouge2_precision = np.mean([score['rouge2'].precision for score in rouge_scores])
    avg_rouge2_recall = np.mean([score['rouge2'].recall for score in rouge_scores])
    avg_rouge2_f1 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])

    avg_rougeL_precision = np.mean([score['rougeL'].precision for score in rouge_scores])
    avg_rougeL_recall = np.mean([score['rougeL'].recall for score in rouge_scores])
    avg_rougeL_f1 = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

    print("Average ROUGE-1 Precision: ", avg_rouge1_precision)
    print("Average ROUGE-1 Recall: ", avg_rouge1_recall)
    print("Average ROUGE-1 F1-Score: ", avg_rouge1_f1)
    print("Average ROUGE-2 Precision: ", avg_rouge2_precision)
    print("Average ROUGE-2 Recall: ", avg_rouge2_recall)
    print("Average ROUGE-2 F1-Score: ", avg_rouge2_f1)
    print("Average ROUGE-L Precision: ", avg_rougeL_precision)
    print("Average ROUGE-L Recall: ", avg_rougeL_recall)
    print("Average ROUGE-L F1-Score: ", avg_rougeL_f1)

In [100]:
print("For Bart-Large (FineTuned):- ")
print("\nEvaluation for the test summary: \n")
evaluate_test_summaries(test_news)

For Bart-Large (FineTuned):- 

Evaluation for the test summary: 

Average ROUGE-1 Precision:  0.48548995236352377
Average ROUGE-1 Recall:  0.5499142414029417
Average ROUGE-1 F1-Score:  0.5140280644783877
Average ROUGE-2 Precision:  0.2712282359526478
Average ROUGE-2 Recall:  0.30684569606784096
Average ROUGE-2 F1-Score:  0.2869888696243616
Average ROUGE-L Precision:  0.3635404133469843
Average ROUGE-L Recall:  0.41113379651930215
Average ROUGE-L F1-Score:  0.3846138362117907


## Download Saved Model

### Create Zip File 

In [107]:
import shutil
shutil.rmtree("/kaggle/working/bart-base-news-sum-fine")

In [103]:
import shutil
# Define the directory containing your model files
model_directory = "/kaggle/working/bart-large-news-sum-fine"
# Define the name for your zip file
zip_file_name = "bart-large-news-sum-fine"
# Create a zip file containing the model directory
shutil.make_archive(zip_file_name, 'zip', model_directory)

'/kaggle/working/bart-large-news-sum-fine.zip'

### Get Downloadable Link

In [104]:
!ls

bart-base-news-sum-fine       t5-base-news-sum-fine.zip
bart-base-news-sum-fine.zip   t5-small-news-sum-fine.zip
bart-large-news-sum-fine      test_news.csv
bart-large-news-sum-fine.zip  train_news.csv
filtered_news_data.csv


In [105]:
from IPython.display import FileLink
FileLink(r'bart-large-news-sum-fine.zip')

# Saving all Test Summaries

In [108]:
test_news.to_csv('/kaggle/working/all_test_news_summaries.csv', index=False)