## Text summarization

### data loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('news_summary.csv')
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [3]:
df.drop(['author', 'date', 'read_more', 'ctext'], axis = 1, inplace = True)
df.head()

Unnamed: 0,headlines,text
0,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...
1,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...
3,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...


In [5]:
df.shape

(4514, 2)

In [4]:
df.isnull().sum()

headlines    0
text         0
dtype: int64

### spliting the data

In [6]:
df_train = df.head(3000)
df_eval = df.iloc[3001:4000, :]
df_test = df.tail(513)

In [10]:
for i in range(0, 10) : 
    print(f"{len(df['headlines'][i])} :: {len(df['text'][i])}")

60 :: 358
60 :: 361
52 :: 398
56 :: 368
60 :: 366
60 :: 347
59 :: 361
60 :: 331
59 :: 370
48 :: 311


In [11]:
from datasets import Dataset

In [12]:
df_train = Dataset.from_pandas(df_train)
df_test = Dataset.from_pandas(df_test)
df_eval = Dataset.from_pandas(df_eval)

In [13]:
df_train

Dataset({
    features: ['headlines', 'text'],
    num_rows: 3000
})

In [14]:
df_eval

Dataset({
    features: ['headlines', 'text'],
    num_rows: 999
})

In [16]:
df_test

Dataset({
    features: ['headlines', 'text'],
    num_rows: 513
})

### importing the models

In [17]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [18]:
model_name = 'facebook/bart-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
max_input_len = 512
max_target_len = 60

In [20]:
datacollator = DataCollatorForSeq2Seq(model = model, tokenizer = tokenizer)

In [21]:
def preprocessor(data) : 
    model_inputs = tokenizer(
        data['headlines'], max_length = max_input_len, truncation = True
    )

    labels = tokenizer(
        data['text'], max_length = max_target_len, truncation = True
    )

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

In [22]:
train_data = df_train.map(preprocessor, batched = True)
eval_data = df_eval.map(preprocessor, batched = True)
test_data = df_test.map(preprocessor, batched = True)

Map: 100%|██████████| 3000/3000 [00:00<00:00, 6483.54 examples/s]
Map: 100%|██████████| 999/999 [00:00<00:00, 12757.18 examples/s]
Map: 100%|██████████| 513/513 [00:00<00:00, 12038.17 examples/s]


### training arguments and trainer

In [23]:
training_args = TrainingArguments(
    output_dir = './results_5',
    eval_strategy = 'epoch',
    learning_rate = 2e-5,
    num_train_epochs = 1,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 64,
    do_predict = True,
    save_total_limit = 2,
    logging_dir = './logs_5'
)

In [24]:
trainer = Trainer(
    model = model, 
    args = training_args,
    tokenizer = tokenizer,
    train_dataset = train_data,
    eval_dataset = eval_data,
    data_collator = datacollator
)

  trainer = Trainer(


In [25]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,2.666879




TrainOutput(global_step=47, training_loss=3.4099192517869015, metrics={'train_runtime': 695.1531, 'train_samples_per_second': 4.316, 'train_steps_per_second': 0.068, 'total_flos': 39185094574080.0, 'train_loss': 3.4099192517869015, 'epoch': 1.0})

### checking the model on test data

In [28]:
for i in range(0, 5) : 
    sample_text = test_data['text'][i]
    inputs = tokenizer([sample_text], return_tensors = 'pt', truncation = True, max_length = 512)
    summary_ids = model.generate(inputs['input_ids'], max_length = 20, min_length = 10, length_penalty = 2.0)

    print(f"generated summary : {tokenizer.decode(summary_ids[0], skip_special_tokens = True)}")
    print(f"reference summary  : {test_data['headlines'][i]}")

    print('-'*40)

generated summary : Indian cricket captain Virat Kohli was conferred with the Padma Shri Award by
reference summary  : Virat Kohli receives Padma Shri at Rashtrapati Bhavan
----------------------------------------
generated summary : The government will seek data on deposits made by an individual during the demonetisation period
reference summary  : Govt to seek info on deposits during note ban in I-T returns
----------------------------------------
generated summary : The stamping of hand baggage will be stopped at Delhi, Mumbai, Kolkata
reference summary  : Stamping of hand baggage to end at 7 airports from April 1
----------------------------------------
generated summary : Following Supreme Court's ban on vehicles with BS-III emission norms, auto firms are
reference summary  : Auto industry stuck with vehicles of ?20,000 cr post SC ban
----------------------------------------
generated summary : Late actor Raj Kapoor's 'Mera Naam Joker' and 'Sang
reference summary  : Mera Naam Joke

### Rouge score

In [29]:
from evaluate import load

In [31]:
df_test_2 = df.tail(50)

In [33]:
ref_summary = df_test_2.headlines.head(50)

In [35]:
type(ref_summary)

pandas.core.series.Series

In [37]:
test_data_2 = Dataset.from_pandas(df_test_2)

In [38]:
test_data_2

Dataset({
    features: ['headlines', 'text'],
    num_rows: 50
})

In [39]:
generated_summary = []

for i in range(0, len(df_test_2)) : 
    sample_text = test_data_2['text'][i]
    
    inputs = tokenizer([sample_text], return_tensors = 'pt', truncation = True, max_length = 512)
    summary_ids = model.generate(inputs['input_ids'], max_length = 20, min_length = 10, length_penalty = 2.0)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens = True)

    generated_summary.append(summary)

In [42]:
type(generated_summary)

list

In [46]:
type(generated_summary)

pandas.core.series.Series

In [44]:
generated_summary = pd.Series(generated_summary)

In [47]:
rouge = load('rouge')

In [54]:
results = rouge.compute(predictions = generated_summary, references = ref_summary)
print(results)

{'rouge1': 0.3092470365510412, 'rouge2': 0.11346996476101334, 'rougeL': 0.27003400620483, 'rougeLsum': 0.2708632775437124}
