## T5 model

### importing the librariers

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

### importing the datasets

In [6]:
df_train = pd.read_csv('../../Datasets/without_ctext/df_train.csv')
df_train.drop('Unnamed: 0', axis = 1, inplace = True)
df_train.head()

Unnamed: 0,headlines,text
0,Chhattisgarh to start ambulance service for cows,Chhattisgarh Chief Minister Raman Singh on Sun...
1,Trucks dumping debris on wetlands seized in Mu...,Mumbai Police on Saturday seized ten trucks an...
2,Modi pays homage to Indian World War I heroes ...,PM Narendra Modi visited the Haifa Indian Ceme...
3,Delhi's domesticated elephants may be shifted ...,A forest department report submitted to the Hi...
4,Ranchi civic body uses 'Sholay' climax to prom...,Ranchi Municipal Corporation has used movie Sh...


In [7]:
df_eval = pd.read_csv('../../Datasets/without_ctext/df_eval.csv')
df_eval.drop('Unnamed: 0', axis = 1, inplace = True)
df_eval.head()

Unnamed: 0,headlines,text
0,Delhi taxi driver returns lost bag with valuab...,"Debendra Kapri, a 24-year-old taxi driver, has..."
1,Recall what happened in 1971: Venkaiah Naidu t...,NDA Vice Presidential nominee Venkaiah Naidu o...
2,"Bihar minister abuses PM Modi, calls him a dacoit",A video of Bihar's Minister for Excise and Pro...
3,6 arrested for blackmailing makers over Baahub...,The cyber crime police of Hyderabad have arres...
4,Indrani forged Peter?s signature on bank docum...,Sheena Bora murder case accused Indrani Mukerj...


In [45]:
df_test = pd.read_csv('../../Datasets/without_ctext/df_test.csv')
df_test.drop('Unnamed: 0', axis = 1, inplace = True)
df_test.head()

Unnamed: 0,headlines,text
0,Ex-Australian PM sends signed bat to Modi thro...,Former Australian PM Tony Abbott has sent a si...
1,Nearly 400 judicial officers transferred in Ut...,The Allahabad High Court has transferred aroun...
2,"Big B complains about Vodafone on Twitter, RJi...",Actor Amitabh Bachchan on Tuesday took to Twit...
3,No interference in Jayalalithaa's treatment: A...,Apollo Hospitals on Tuesday said that there wa...
4,Varun's pants tear while dancing with contesta...,Actor Varun Dhawan's pants got torn while he w...


In [12]:
print(f"{len(df_train['headlines'][0])} :: {len(df_train['text'][0])}")

48 :: 359


In [9]:
df_train = Dataset.from_pandas(df_train)
df_train

Dataset({
    features: ['headlines', 'text'],
    num_rows: 3000
})

In [10]:
df_eval = Dataset.from_pandas(df_eval)
df_test = Dataset.from_pandas(df_test)

### model and mapping

In [11]:
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [25]:
def preprocess(data) : 
    model_inputs = tokenizer(
        data['text'],
        max_length = 512,
        truncation = True,
    )
    labels = tokenizer(
        data['headlines'],
        max_length = 60,
        truncation = True
    )

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

In [26]:
train_data = df_train.map(preprocess, batched = True)
test_data = df_test.map(preprocess, batched = True)
eval_data = df_eval.map(preprocess, batched = True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map: 100%|██████████| 3000/3000 [00:00<00:00, 8741.91 examples/s]
Map: 100%|██████████| 595/595 [00:00<00:00, 9751.41 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 9863.13 examples/s]


### training arguments and model training

In [18]:
datacollator = DataCollatorForSeq2Seq(tokenizer, model = model)

In [20]:
training_args = TrainingArguments(
    output_dir = './results_without_ctext',
    eval_strategy = 'epoch',
    learning_rate = 2e-5,
    num_train_epochs = 1,
    per_device_eval_batch_size = 64,
    per_device_train_batch_size = 64,
    do_predict = True,
    save_total_limit = 2,
    logging_dir = './logs_without_ctext'
)

In [28]:
trainer = Trainer(
    args = training_args,
    model = model,
    train_dataset = train_data,
    eval_dataset = eval_data,
    tokenizer = tokenizer,
    data_collator = datacollator
)

  trainer = Trainer(


In [29]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,3.003681




TrainOutput(global_step=47, training_loss=3.395263671875, metrics={'train_runtime': 480.274, 'train_samples_per_second': 6.246, 'train_steps_per_second': 0.098, 'total_flos': 92442679836672.0, 'train_loss': 3.395263671875, 'epoch': 1.0})

### testing the model on test_data and the rouge score

In [34]:
generated_summary = []

for i in range(0, 595) : 
    sample_data = test_data['text'][i]
    inputs = tokenizer([sample_data], return_tensors = 'pt', truncation = True, max_length = 512)
    summary_ids = model.generate(inputs['input_ids'], max_length = 20, min_length = 10, length_penalty = 1.0)

    print(f"{[i]} :: generated : {tokenizer.decode(summary_ids[0], skip_special_tokens = True)}")
    print(f"reference summary : {test_data['headlines'][i]}")

    generated_summary.append(tokenizer.decode(summary_ids[0], skip_special_tokens = True))

    print('-'*30)

[0] :: generated : Abbott has signed a cricket bat for PM Narendra Modi. reportedly through an
reference summary : Ex-Australian PM sends signed bat to Modi through Indian man
------------------------------
[1] :: generated : Die Allahabad High Court hat reshuffled around 400 judicial officers,
reference summary : Nearly 400 judicial officers transferred in Uttar Pradesh
------------------------------
[2] :: generated : Amitabh Bachchan took to twitter to complain about Vodafone's network. the
reference summary : Big B complains about Vodafone on Twitter, RJio offers SIM
------------------------------
[3] :: generated : said that there is "no interference" in the treatment given to former Tamil Nadu
reference summary : No interference in Jayalalithaa's treatment: Apollo Hospital
------------------------------
[4] :: generated : Varun Dhawan's pants got torn while he was dancing with 
reference summary : Varun's pants tear while dancing with contestant on TV show
-----------------------

In [35]:
from evaluate import load

In [46]:
ref_summary = df_test['headlines']

In [48]:
rouge = load('rouge')
results = rouge.compute(predictions = generated_summary, references = ref_summary)
print(results)

{'rouge1': 0.3236402974028537, 'rouge2': 0.13996237050864452, 'rougeL': 0.28820584269050264, 'rougeLsum': 0.28752708684184636}
