In [None]:
import csv
!pip install datasets
from datasets import load_dataset
import pandas as pd
!pip install sentencepiece
!pip install datasets
!pip install transformers
from datasets import load_dataset
!pip install torch
import torch as th
from transformers import T5Tokenizer, T5Model,T5ForConditionalGeneration,T5Config
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW
import torch
from transformers import Trainer, TrainingArguments
from datasets import load_dataset


# DATASET LOADING

In [None]:

# CSV file that provides the 'text' and 'headlines' columns read by the cells below.
dataset_file = '/news_summary_more.csv'

# Fixed seed so the train/validation split is identical on every
# "Restart Kernel -> Run All"; without it the split is reshuffled each run.
SPLIT_SEED = 42

# The CSV loader exposes the whole file as a single 'train' split.
raw_dataset = load_dataset('csv', data_files=dataset_file, split='train')

# Hold out 10% of the rows for validation. `dataset` is kept as the name of
# the resulting train/test mapping, as in the original cell.
dataset = raw_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = dataset['train']
val_dataset = dataset['test']

# CREATING TOKENIZER AND MODEL

In [None]:
# One checkpoint name shared by the tokenizer, the config and the weights.
checkpoint = "t5-small"

tokenizer = T5Tokenizer.from_pretrained(checkpoint)
config = T5Config.from_pretrained(checkpoint)
# The explicitly loaded config is handed to the model constructor.
model = T5ForConditionalGeneration.from_pretrained(checkpoint, config=config)


# FINDING MAXIMUM LENGTH OF TEXTS AFTER TOKENIZATION

In [None]:
# Measure the longest tokenized headline and the longest tokenized article in
# the training split; these lengths are used later as the padding sizes.
sumry = list(train_dataset['headlines'])
txt = list(train_dataset['text'])

# No padding and no truncation, so every sequence keeps its real token count.
sumry_t = tokenizer(sumry, padding=False, truncation=False)
txt_t = tokenizer(txt, padding=False, truncation=False)

# NOTE: despite the names, max_source is measured on the headlines and
# max_target on the article text; the *_pad variables below map them correctly.
max_source = max((len(ids) for ids in sumry_t['input_ids']), default=0)
max_target = max((len(ids) for ids in txt_t['input_ids']), default=0)

max_txt_pad = max_target
max_smry_pad = max_source


# DATASET CREATION FOR TRAINING

In [None]:

class GSMDataset(th.utils.data.Dataset):
    """Torch dataset of (article, headline) pairs tokenized once at construction.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(list_of_str, padding='max_length', max_length=..., truncation=True)``
            that returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
            Its ``pad_token_id`` attribute, when present and not ``None``,
            identifies the padding positions to mask in the labels; otherwise
            id ``0`` is assumed (the value that was previously hard-coded).
        dataset: Object supporting ``len()`` and column access through
            ``dataset['headlines']`` and ``dataset['text']``.
        pad_text: Length every article is padded/truncated to.
        pad_sum: Length every headline is padded/truncated to.
        loss_on_prefix: Stored on the instance; not used anywhere in this class.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    tensors, which are the keyword names passed on to the model.
    """

    # Value written over padding positions in the labels so that the loss
    # skips them (see the T5ForConditionalGeneration `labels` documentation).
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, dataset, pad_text, pad_sum, loss_on_prefix=True):
        self.examples = dataset
        self.pad_text = pad_text
        self.pad_sum = pad_sum
        self.sumry = list(self.examples['headlines'])
        self.txt = list(self.examples['text'])
        # The whole split is tokenized eagerly; __getitem__ only indexes the result.
        self.txts = tokenizer(self.txt, padding='max_length', max_length=self.pad_text, truncation=True)
        self.sums = tokenizer(self.sumry, padding='max_length', max_length=self.pad_sum, truncation=True)

        # Ask the tokenizer for its padding id instead of assuming it is 0.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        self.pad_token_id = 0 if pad_id is None else pad_id

        self.loss_on_prefix = loss_on_prefix

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the model inputs and masked labels for example ``idx``."""
        txts_tokens = th.tensor(self.txts["input_ids"][idx])
        txts_att = th.tensor(self.txts["attention_mask"][idx])
        # Replace padding ids in the target with IGNORE_INDEX.
        sumry_tokens = th.tensor([
            self.IGNORE_INDEX if tok == self.pad_token_id else tok
            for tok in self.sums["input_ids"][idx]
        ])

        return dict(input_ids=txts_tokens, attention_mask=txts_att, labels=sumry_tokens)

In [None]:
# Both splits share the padding lengths measured on the training split.
train_d = GSMDataset(
    tokenizer=tokenizer,
    dataset=train_dataset,
    pad_text=max_txt_pad,
    pad_sum=max_smry_pad,
)
eval_d = GSMDataset(
    tokenizer=tokenizer,
    dataset=val_dataset,
    pad_text=max_txt_pad,
    pad_sum=max_smry_pad,
)

# TRAINING ARGUMENTS INITIALIZATION AND TRAINING

In [None]:
output_dir = '/content/sample_data'

training_args = TrainingArguments(
    output_dir=output_dir,
    run_name='run_name',  # Wandb run name

    # --- optimisation ---
    num_train_epochs=12,
    learning_rate=0.001,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # --- evaluation ---
    evaluation_strategy='steps',  # Run evaluation every eval_steps
    eval_steps=1000,  # How often to run evaluation on the validation set
    eval_accumulation_steps=1,  # Number of eval steps to keep on the GPU (the higher, the more vRAM used)
    prediction_loss_only=True,  # Compute only the loss, no other metrics; uses less RAM

    # --- logging ---
    logging_steps=1000,  # How often to log the loss to wandb
    logging_first_step=False,  # Whether to also log the very first training step to wandb

    # --- checkpointing (save_steps is a multiple of eval_steps) ---
    save_steps=10000,  # How often to save a checkpoint
    save_total_limit=1,  # Maximum number of checkpoints to keep
    load_best_model_at_end=True,  # Whether to load the best model found at each evaluation
    metric_for_best_model="loss",  # Use the loss to pick the best model
    greater_is_better=False,  # The best model has the lowest loss, not the highest

    # --- data handling ---
    remove_unused_columns=True,  # Removes unused columns from the dataset
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_d,
    eval_dataset=eval_d,
)

# Fine-tune, then write the resulting model under <output_dir>/model.
trainer.train()
trainer.save_model(output_dir + '/model')

# PREDICTING

In [None]:
# Reload the fine-tuned weights from the directory that the training cell
# wrote with trainer.save_model(output_dir + '/model'). The previous path
# ('/content') is not where the model was saved.
model = T5ForConditionalGeneration.from_pretrained(output_dir + '/model')

# NOTE(review): `text` is not defined in any visible cell; it must hold the
# article to summarize before this cell is run.
# NOTE(review): 127 is hard-coded here while training pads articles to
# max_txt_pad -- presumably the value from an earlier run; confirm they agree.
tokenized_text = tokenizer(str(text), padding='max_length', max_length=127, truncation=True)

# Wrap the single example in a list so both tensors carry a batch dimension of 1.
source_ids = th.tensor([tokenized_text['input_ids']])
source_mask = th.tensor([tokenized_text['attention_mask']])

# Beam-search decoding of the summary.
generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask, 
        max_length=127,
        num_beams=5,
        repetition_penalty=1, 
        length_penalty=1, 
        early_stopping=True,
        no_repeat_ngram_size=2
    )

# Decode the first (and only) generated sequence back to plain text.
pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)