## Text summarization

In [2]:
import pandas as pd
import numpy as np
from datasets import Dataset

### training data

In [None]:
df = pd.read_csv('news_summary.csv')
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [7]:
df.drop(['author', 'date', 'read_more', 'ctext'], axis = 1, inplace = True)
df.head()

Unnamed: 0,headlines,text
0,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...
1,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...
3,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...


In [9]:
print(f"{len(df['headlines'][1])} :: {len(df['text'][1])}")

60 :: 361


In [10]:
df_train = Dataset.from_pandas(df)

In [11]:
df_train

Dataset({
    features: ['headlines', 'text'],
    num_rows: 4514
})

### testing data

In [12]:
df_2 = pd.read_csv('news_summary_more.csv')
df_2.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [14]:
df_2 = df_2.head(500)

In [15]:
df_2

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...
...,...,...
495,WhatsApp testing additional 'Media' menu featu...,"According to WABetaInfo, WhatsApp is testing n..."
496,"Priyanka Gandhi has bipolar disorder, beats up...",Talking about Priyanka Gandhi Vadra entering a...
497,Shivpal Yadav to contest 2019 LS polls from UP...,Pragatisheel Samajwadi Party (Lohia) chief Shi...
498,"Nearly 4,500 cases pending per High Court judg...","According to the National Judicial Data Grid, ..."


In [19]:
print(f"{len(df_2['headlines'][0])} :: {len(df_2['text'][0])}")

65 :: 375


In [20]:
df_test = Dataset.from_pandas(df_2)
df_test

Dataset({
    features: ['headlines', 'text'],
    num_rows: 500
})

### loading the transformer and the tokens

In [21]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [24]:
model_name = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [25]:
max_input_len = 512
max_target_len = 90

In [26]:
def preprocessor(data) : 
    model_inputs = tokenizer(
        data['headlines'], max_length = max_input_len, truncation = True
    )

    labels = tokenizer(
        data['text'], max_length = max_target_len, truncation = True
    )

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

In [28]:
train_data = df_train.map(preprocessor, batched = True)
test_data = df_test.map(preprocessor, batched = True)

Map: 100%|██████████| 4514/4514 [00:00<00:00, 7292.78 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 10883.33 examples/s]


### training parameters and model training

In [30]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

In [29]:
training_args = TrainingArguments(
    output_dir = './results_3',
    eval_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_eval_batch_size = 64,
    per_device_train_batch_size = 64,
    weight_decay = 0.01,
    save_total_limit = 2,
    num_train_epochs = 1,
    do_predict = True,
    logging_dir = './logs_3'
)

In [31]:
trainer = Trainer(
    args = training_args,
    model = model,
    tokenizer = tokenizer,
    eval_dataset = train_data,
    train_dataset = test_data,
    data_collator = data_collator
)

  trainer = Trainer(


In [32]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,3.010814




TrainOutput(global_step=8, training_loss=4.109134674072266, metrics={'train_runtime': 474.8007, 'train_samples_per_second': 1.053, 'train_steps_per_second': 0.017, 'total_flos': 7893228994560.0, 'train_loss': 4.109134674072266, 'epoch': 1.0})

### outputs for the new data and evaluation

In [78]:
df = pd.read_csv('news_summary_more.csv')
df = df.iloc[600 : 1001, :]

In [79]:
df.head()

Unnamed: 0,headlines,text
600,Humbled by this recognition: Actor Mohanlal on Padma Bhushan,"Malayalam actor Mohanlal, who was conferred with the Padma Bhushan award on Friday, said that he was ""humbled by this recognition"". ""[I] am eternally grateful to one and all who have been part of this worldly journey of mine,"" the actor added. The five-time National Film Award winning actor was earlier honoured with the Padma Shri in 2001."
601,"Prateik, Sanya host Gatsby themed wedding reception","Actor Prateik Babbar and his wife Sanya Sagar hosted a wedding reception in Mumbai on Friday, the theme of which was inspired by American author F Scott Fitzgerald's 1925 novel 'The Great Gatsby'. The couple arrived at the reception in a vintage red car. Prateik and Sanya tied the knot in Lucknow on Thursday."
602,I've become more popular since 'Race 3' memes: Daisy Shah,"Speaking about being trolled for her dialogue, ""My business is my business, none of your business"" in the film 'Race 3', actress Daisy Shah said that the memes made her more popular. ""[E]ither you take it positively or you take it negatively, and I take it positively,"" the actress added. She further said she was ""very happy"" with the outcome."
603,FIR against Zubeen for alleged 'unconstitutional' Bharat Ratna remark,"A case has been filed against singer Zubeen Garg for allegedly using 'unparliamentary' language defaming the Bharat Ratna, Indiaâs highest civilian honour, in an audio clip doing rounds on WhatsApp. An FIR was lodged against Garg by Satya Ranjan Borah, State Vice President BJP, Kisan Morcha, Assam. ""The way he's been behaving cannot be accepted by our society,"" Borah said."
604,Sonu to face legal action for running hotel without licence,"Brihanmumbai Municipal Corporation (BMC) is likely to take legal action against Sonu Sood for converting a residential building into a hotel without clearance from the municipal body. Sood reportedly sent a proposal to the municipal body in June 2018, which was rejected as it didn't comply with its norms. BMC is yet to receive an amended proposal from the actor."


In [81]:
print(f"{len(df['headlines'][600])} :: {len(df['text'][600])}")

60 :: 341


In [47]:
new_df_test = Dataset.from_pandas(df)

In [48]:
new_df_test

Dataset({
    features: ['headlines', 'text'],
    num_rows: 401
})

In [49]:
new_test_data = new_df_test.map(preprocessor, batched = True)

Map: 100%|██████████| 401/401 [00:00<00:00, 8105.11 examples/s]


In [50]:
for i in range(0, 10) : 
    sample_text = new_test_data['text'][i]
    inputs = tokenizer([sample_text], return_tensors = 'pt', truncation = True, max_length = 512)
    summary_ids = model.generate(inputs["input_ids"], max_length = 60, min_length = 30, length_penalty = 2.0)

    print(f"genertaed summary : {tokenizer.decode(summary_ids[0], skip_special_tokens = True)}")
    print(f"Reference summary : {new_test_data['headlines'][i]}")

    print('-'*40)

genertaed summary : Malayalam actor Mohanlal, who was conferred with the Padma Bhushan award on Friday, said that he was "humbled by this recognition". "[I] am eternally grateful to one and all who have been part of this worldly journey of mine," the
Reference summary : Humbled by this recognition: Actor Mohanlal on Padma Bhushan
----------------------------------------
genertaed summary : Actor Prateik Babbar and his wife Sanya Sagar hosted a wedding reception in Mumbai on Friday, the theme of which was inspired by American author F Scott Fitzgerald's 1925 novel 'The Great Gatsby'. The couple arrived at the reception in a vintage red car. Pr
Reference summary : Prateik, Sanya host Gatsby themed wedding reception
----------------------------------------
genertaed summary : Speaking about being trolled for her dialogue, "My business is my business, none of your business" in the film 'Race 3', actress Daisy Shah said that the memes made her more popular. "[E]ither you take it positively 