In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, \
    Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, get_scheduler
import evaluate
import nltk
from nltk.tokenize import sent_tokenize
import warnings
warnings.simplefilter(action='ignore')

In [18]:
import os
cwd = os.getcwd()
cwd

'C:\\Users\\svvlk\\Documents\\python\\personal\\news_sum'

In [19]:
!pipreqs C:\\Users\\svvlk\\Documents\\python\\personal\\news_sum

INFO: Not scanning for jupyter notebooks.
Traceback (most recent call last):
  File "C:\Users\svvlk\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\svvlk\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\svvlk\AppData\Local\Programs\Python\Python310\Scripts\pipreqs.exe\__main__.py", line 7, in <module>
  File "C:\Users\svvlk\AppData\Local\Programs\Python\Python310\lib\site-packages\pipreqs\pipreqs.py", line 609, in main
    init(args)
  File "C:\Users\svvlk\AppData\Local\Programs\Python\Python310\lib\site-packages\pipreqs\pipreqs.py", line 533, in init
    candidates = get_all_imports(
  File "C:\Users\svvlk\AppData\Local\Programs\Python\Python310\lib\site-packages\pipreqs\pipreqs.py", line 136, in get_all_imports
    contents = read_file_content(file_name, encoding)
  File "C:\Users\svvlk\AppData\Local\Programs\

In [3]:
data = pd.read_csv('news_summary.csv', encoding='cp437')
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4396 entries, 0 to 4513
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   author     4396 non-null   object
 1   date       4396 non-null   object
 2   headlines  4396 non-null   object
 3   read_more  4396 non-null   object
 4   text       4396 non-null   object
 5   ctext      4396 non-null   object
dtypes: object(6)
memory usage: 240.4+ KB


In [4]:
# headlines - column containing headlines which will be used as reference summarizations
# ctext - column containing full texts of news articles
# taking a look at the average lengths of both

def length(text):
    return len(text.split())

print('Mean headline length (words):', data['headlines'].apply(length).mean())
print('Mean text length (words):', data['ctext'].apply(length).mean())

Mean headline length (words): 9.300045495905369
Mean text length (words): 342.9438125568699


In [5]:
# splitting the data into train, val, and test, and converting it into Dataset format

train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

train_data = data[:train_size]
val_data = data[train_size:train_size+val_size]
test_data = data[train_size+val_size:]

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

dataset

DatasetDict({
    train: Dataset({
        features: ['author', 'date', 'headlines', 'read_more', 'text', 'ctext', '__index_level_0__'],
        num_rows: 3516
    })
    validation: Dataset({
        features: ['author', 'date', 'headlines', 'read_more', 'text', 'ctext', '__index_level_0__'],
        num_rows: 439
    })
    test: Dataset({
        features: ['author', 'date', 'headlines', 'read_more', 'text', 'ctext', '__index_level_0__'],
        num_rows: 441
    })
})

In [6]:
# loading the model tokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [7]:
# creating tokenization function with length limits for headlines and texts

max_input_length = 512
max_target_length = 30

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["ctext"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["headlines"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# tokenizing the datasets

tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3516 [00:00<?, ? examples/s]

Map:   0%|          | 0/439 [00:00<?, ? examples/s]

Map:   0%|          | 0/441 [00:00<?, ? examples/s]

## Calculate baseline metrics

As a textual data for evaluating the baseline model I will be using 3 first sentences of each article, applying the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric, consisting of 'rouge1'(unigram overlap), 'rouge2' (bigram overlap),
'rougeL'(longest overlap in a sentence), and 'rougeLsum'(longest overlap in a paragraph).

In [11]:
# loading ROUGE metric

rouge_score = evaluate.load("rouge")

In [12]:
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


print(three_sentence_summary(dataset["train"][1]["ctext"]))

From her special numbers to TV?appearances, Bollywood actor Malaika Arora Khan has managed to carve her own identity.
The actor, who made her debut in the Hindi film industry with the blockbuster debut opposite Shah Rukh Khan in Chaiyya Chaiyya from Dil Se (1998), is still remembered for the song.
However, for trolls, she is a woman first and what matters right now is that she divorced a ?rich man?.


In [13]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["ctext"]]
    return metric.compute(predictions=summaries, references=dataset["headlines"])

In [14]:
# getting baseline metrics

score = evaluate_baseline(dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
rouge_dict

{'rouge1': 12.73, 'rouge2': 4.24, 'rougeL': 10.32, 'rougeLsum': 11.15}

## Train and evaluate the model

In [15]:
# logging in to Hugging Face Hub

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
# loading the pre-trained Seq2Seq model and the data collator

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [16]:
# setting arguments 

batch_size = 8
num_train_epochs = 8
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
output_dir = "mt5-small-finetuned-news-summary-kaggle"

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,       # calculate ROUGE for every epoch
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [17]:
# function for computing ROUGE metrics

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels= np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
       predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [18]:
# removing columns containing strings

tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)

In [19]:
# defining Trainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
# training the model

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msvetaku[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,3.995636,14.9001,3.3613,13.48,13.4691
2,8.318300,3.155007,17.9773,5.9638,16.7289,16.6792
3,8.318300,2.894964,21.3253,7.3866,19.5114,19.5167
4,4.045700,2.808666,25.1652,9.4202,22.7342,22.7302
5,4.045700,2.737482,25.5974,9.4123,23.0271,23.0383
6,3.650500,2.709118,25.9273,9.3421,23.2037,23.1651
7,3.650500,2.694949,26.2777,9.8465,23.6534,23.6262
8,3.517500,2.69075,26.7556,10.1226,24.052,23.9879


TrainOutput(global_step=1760, training_loss=4.879849246957085, metrics={'train_runtime': 1513.5565, 'train_samples_per_second': 18.584, 'train_steps_per_second': 1.163, 'total_flos': 1.487267130507264e+16, 'train_loss': 4.879849246957085, 'epoch': 8.0})

In [21]:
# evaluating the model

trainer.evaluate()

{'eval_loss': 2.6907496452331543,
 'eval_rouge1': 26.7556,
 'eval_rouge2': 10.1226,
 'eval_rougeL': 24.052,
 'eval_rougeLsum': 23.9879,
 'eval_runtime': 21.0318,
 'eval_samples_per_second': 20.873,
 'eval_steps_per_second': 1.331,
 'epoch': 8.0}

In [None]:
# pushing to Hugging Face Hub

trainer.push_to_hub(commit_message="Training complete", tags="summarization")

## Summarize test data

In [22]:
from transformers import pipeline

hub_model_id = "svetaku/mt5-small-finetuned-news-summary-kaggle"
summarizer = pipeline("summarization", model=hub_model_id)

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [24]:
# function to get a summary of an article with index idx

def print_summary(idx):
    review = dataset["test"][idx]["ctext"]
    title = dataset["test"][idx]["headlines"]
    summary = summarizer(dataset["test"][idx]["ctext"])[0]["summary_text"]
    print(f"'>>> Article: {review}'")
    print(f"\n'>>> Headline: {title}'")
    print(f"\n'>>> Summary: {summary}'")

In [26]:
print_summary(20)

'>>> Article: The Indian Army, after consultations with the Defence Ministry, is considering cutting down the numbers of its sahayaks or 'buddies' by 25 percent. This would translate to around 10,000 jawans as there are currently about 40,000 sahayaks in the Indian Army. The move would come in the wake of the controversy over the tasks Indian Army sahayaks perform for officers and junior commissioned officers.Under the army's 'buddy's system, sahayak jawans are attached to officers and junior commissioned officers. A sahayak's tasks include working with the officer or JCO for army-related duties. According to top army sources, the 10,000 sahayak jawans will be replaced by civilians. The civilian substitutes will be employed for officers in static formations such as the Army Headquarters or units in the Delhi area and not operational locations like battalions, brigades and division and corps headquarters. Once their civilian replacements are in the place, the 10,000 sahayak jawans will 