## Pegasus model

### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


### Importing the data (without `ctext`)

In [2]:
# Read the training split of the "without_ctext" dataset and preview its first rows.
df_train = pd.read_csv(filepath_or_buffer='../../Datasets/without_ctext/df_train.csv')
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,headlines,text
0,0,Chhattisgarh to start ambulance service for cows,Chhattisgarh Chief Minister Raman Singh on Sun...
1,1,Trucks dumping debris on wetlands seized in Mu...,Mumbai Police on Saturday seized ten trucks an...
2,2,Modi pays homage to Indian World War I heroes ...,PM Narendra Modi visited the Haifa Indian Ceme...
3,3,Delhi's domesticated elephants may be shifted ...,A forest department report submitted to the Hi...
4,4,Ranchi civic body uses 'Sholay' climax to prom...,Ranchi Municipal Corporation has used movie Sh...


In [3]:
# Remove the 'Unnamed: 0' column, then display the resulting frame.
df_train = df_train.drop(columns='Unnamed: 0')
df_train

Unnamed: 0,headlines,text
0,Chhattisgarh to start ambulance service for cows,Chhattisgarh Chief Minister Raman Singh on Sun...
1,Trucks dumping debris on wetlands seized in Mu...,Mumbai Police on Saturday seized ten trucks an...
2,Modi pays homage to Indian World War I heroes ...,PM Narendra Modi visited the Haifa Indian Ceme...
3,Delhi's domesticated elephants may be shifted ...,A forest department report submitted to the Hi...
4,Ranchi civic body uses 'Sholay' climax to prom...,Ranchi Municipal Corporation has used movie Sh...
...,...,...
2995,HC stays criminal proceedings against SRK in s...,Gujarat High Court has put an interim stay on ...
2996,Samajwadi Party still belongs to me: Mulayam S...,Samajwadi Party supremo Mulayam Singh Yadav on...
2997,Doctors say abortion not possible for 10-yr-ol...,A Chandigarh hospital's doctors ruled out the ...
2998,Logon ka kaam hai kehna: Sonakshi Sinha on mar...,"Actress Sonakshi Sinha, while reacting to repo..."


In [4]:
# Evaluation split: read the CSV and remove the 'Unnamed: 0' column in a single chain.
df_eval = (
    pd.read_csv('../../Datasets/without_ctext/df_eval.csv')
    .drop(columns='Unnamed: 0')
)
df_eval

Unnamed: 0,headlines,text
0,Delhi taxi driver returns lost bag with valuab...,"Debendra Kapri, a 24-year-old taxi driver, has..."
1,Recall what happened in 1971: Venkaiah Naidu t...,NDA Vice Presidential nominee Venkaiah Naidu o...
2,"Bihar minister abuses PM Modi, calls him a dacoit",A video of Bihar's Minister for Excise and Pro...
3,6 arrested for blackmailing makers over Baahub...,The cyber crime police of Hyderabad have arres...
4,Indrani forged Peter?s signature on bank docum...,Sheena Bora murder case accused Indrani Mukerj...
...,...,...
796,Delhi civic bodies to jail people for littering,Delhi civic bodies reportedly plan to make the...
797,Azhar Ali 1st Pak Test batsman to hit 2 200+ s...,Azhar Ali became Pakistan's first Test batsman...
798,Congress leader shot dead in Maharashtra's Bhi...,A Congress leader was shot dead by a couple of...
799,Disha shares pics with rumoured boyfriend Tige...,Actress Disha Patani has shared pictures on In...


In [5]:
# Test split: read the CSV and remove the 'Unnamed: 0' column in a single chain.
df_test = (
    pd.read_csv('../../Datasets/without_ctext/df_test.csv')
    .drop(columns='Unnamed: 0')
)
df_test

Unnamed: 0,headlines,text
0,Ex-Australian PM sends signed bat to Modi thro...,Former Australian PM Tony Abbott has sent a si...
1,Nearly 400 judicial officers transferred in Ut...,The Allahabad High Court has transferred aroun...
2,"Big B complains about Vodafone on Twitter, RJi...",Actor Amitabh Bachchan on Tuesday took to Twit...
3,No interference in Jayalalithaa's treatment: A...,Apollo Hospitals on Tuesday said that there wa...
4,Varun's pants tear while dancing with contesta...,Actor Varun Dhawan's pants got torn while he w...
...,...,...
590,43% employees are in the unorganised sector: Govt,Labour Minister Bandaru Dattatreya has said th...
591,Dileep arrested in Malayalam actress abduction...,Actor Dileep was arrested on Monday in connect...
592,Ex-Prez Pranab Mukherjee joins Twitter as @Cit...,Former President Pranab Mukherjee made a perso...
593,WhatsApp to bring back old text status,A beta version of WhatsApp shows that the mess...


In [6]:
# Convert each pandas frame into a `Dataset`, rebinding the same three names.
df_train, df_eval, df_test = (
    Dataset.from_pandas(frame) for frame in (df_train, df_eval, df_test)
)

### Model building and parameters

In [None]:
# Checkpoint name shared by the model and its tokenizer.
model_name = 'google/pegasus-cnn_dailymail'

# Both objects are loaded from the same pretrained checkpoint.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
def preprocess(examples):
    """Tokenize articles and their reference headlines for seq2seq training.

    Parameters
    ----------
    examples : mapping
        Must provide a ``"text"`` entry (model input) and a ``"headlines"``
        entry (target). Works with a batch (lists of strings, as passed by
        ``Dataset.map(..., batched=True)``) or a single example.

    Returns
    -------
    The tokenizer output for ``"text"`` (truncated/padded to 512 tokens) with an
    extra ``"labels"`` entry: the token ids of ``"headlines"`` (truncated/padded
    to 64 tokens) in which every padding position is set to ``-100``.

    Notes
    -----
    The labels keep their fixed length of 64, but padding positions must not
    hold the pad-token id: they would be scored as real targets and the loss
    would mostly reward predicting padding. ``-100`` is the index the loss
    ignores.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["headlines"],
        max_length=64,
        truncation=True,
        padding="max_length"
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # Swap pad-token ids for the ignore index so they do not count in the loss.
        return [tok if tok != pad_id else -100 for tok in ids]

    label_ids = labels["input_ids"]
    # A batch yields a list of id lists; a single example yields one flat list.
    is_batch = bool(label_ids) and isinstance(label_ids[0], (list, tuple))
    if is_batch:
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

In [14]:
# Run `preprocess` over the three splits in batched mode (train, then eval, then test).
train_data, eval_data, test_data = (
    split.map(preprocess, batched = True)
    for split in (df_train, df_eval, df_test)
)

Map: 100%|██████████| 3000/3000 [00:02<00:00, 1309.48 examples/s]
Map: 100%|██████████| 801/801 [00:00<00:00, 1306.37 examples/s]
Map: 100%|██████████| 595/595 [00:00<00:00, 1340.32 examples/s]


In [15]:
# Batch collator built from the tokenizer and the model loaded earlier.
datacollator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model = model)

In [16]:
# Run configuration for the fine-tuning job, grouped by concern.
training_args = TrainingArguments(
    # where results and logs are written
    output_dir = './results_without_ctext',
    logging_dir = './logs_without_ctext',
    save_total_limit = 2,
    # optimisation schedule
    num_train_epochs = 1,
    learning_rate = 2e-5,
    # batch sizes per device
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    # evaluation / prediction switches
    eval_strategy = 'epoch',
    do_predict = True
)

In [17]:
# Assemble the trainer from the model, run configuration, tokenized splits and collator.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword is
# deprecated and makes this call emit a warning.
trainer = Trainer(
    model = model,
    args = training_args,
    processing_class = tokenizer,
    train_dataset = train_data,
    eval_dataset = eval_data,
    data_collator = datacollator
)

  trainer = Trainer(


In [18]:
# Launch fine-tuning with the trainer configured above.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


: 