## Imports

In [None]:
import os
import time
import glob
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
# Show the contents of the Summaries folder (absolute Kaggle path; the path
# variables defined in the next cell use the relative '../input' form).
os.listdir('/kaggle/input/bbc-news-summary/BBC News Summary/Summaries')

In [None]:
# Dataset locations and the category sub-folders to load from each of them.
articles_path = '../input/bbc-news-summary/BBC News Summary/News Articles'
summaries_path = '../input/bbc-news-summary/BBC News Summary/Summaries'
categories_list = [
    'politics',
    'sport',
    'tech',
    'entertainment',
    'business',
]

In [None]:
def read_files_from_folders(articles_path,summaries_path,categories_list =['tech','sport'], encoding = "ISO-8859-1"):
    """Load article/summary text pairs for every requested category.

    For each category, the ``*.txt`` files in ``<articles_path>/<category>``
    and ``<summaries_path>/<category>`` are read and paired by sorted file
    name.

    Args:
        articles_path: root folder holding one sub-folder of articles per category.
        summaries_path: root folder holding one sub-folder of summaries per category.
        categories_list: category sub-folder names to load (never modified here).
        encoding: text encoding used to read every file.

    Returns:
        A tuple ``(articles, summaries, categories)`` of equally long lists,
        or ``None`` (after printing a message) as soon as a category has a
        different number of article and summary files.
    """
    articles = []
    summaries = []
    categories = []
    for category in categories_list:
        # glob returns files in arbitrary (filesystem) order. Sort both lists
        # so that position i refers to the same file name on both sides;
        # otherwise an article could be paired with another article's summary.
        article_paths = sorted(glob.glob(os.path.join(articles_path, category, '*.txt')))
        summary_paths = sorted(glob.glob(os.path.join(summaries_path, category, '*.txt')))

        print(f'found {len(article_paths)} file in articles/{category} folder, {len(summary_paths)} file in summaries/{category} folder')

        if len(article_paths) != len(summary_paths):
            print('number of files is not equal')
            return

        for article_file, summary_file in zip(article_paths, summary_paths):
            categories.append(category)
            with open(article_file, mode='r', encoding=encoding) as file:
                articles.append(file.read())

            with open(summary_file, mode='r', encoding=encoding) as file:
                summaries.append(file.read())

    print(f'total {len(articles)} file in articles folders, {len(summaries)} file in summaries folders')
    return articles, summaries, categories

In [None]:
# Load every category into three parallel lists: article texts, summary texts
# and the category label of each pair.
articles, summaries, categories = read_files_from_folders(articles_path, summaries_path, categories_list)

In [None]:
# One row per document: article text, reference summary and category label.
df = pd.DataFrame({'articles':articles,'summaries': summaries, 'categories' : categories},)
df

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Register a padding token and a separator token with the tokenizer; NewsData
# uses them to pad sequences and to delimit the article from its summary.
special_tokens = {'pad_token':'<|pad|>','sep_token':'<|sep|>'}
# add_special_tokens returns how many tokens were added to the vocabulary.
num_add_toks = tokenizer.add_special_tokens(special_tokens)



In [None]:
# Row positions whose article and summary together tokenize to 1024 tokens or
# more; those pairs are removed from the frame.
to_drop = [
    position
    for position, (article, summary) in enumerate(zip(df['articles'], df['summaries']))
    if len(tokenizer.encode(article)) + len(tokenizer.encode(summary)) >= 1024
]

df = df.drop(df.index[to_drop])

In [None]:
# Re-inspect the frame after the over-long pairs have been dropped.
df.info()

In [None]:
# Sanity check: print the token ids produced for a short string.
print(tokenizer.encode("Hello world my name"))

## Dataset, DataLoader

In [None]:
class NewsData(Dataset):
    """Fixed-length token sequences laid out as ``article <sep> summary <pad>...``.

    Args:
        dataframe: frame with ``articles`` and ``summaries`` text columns.
        tokenizer: tokenizer exposing ``encode``, ``pad_token`` and ``sep_token``.
        max_len: length every returned sequence is padded (or truncated) to.
    """

    def __init__(self, dataframe, tokenizer, max_len=1024):
        super().__init__()
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'article': LongTensor[max_len], 'sum_idx': int}``.

        ``sum_idx`` is the number of article tokens, i.e. the position of the
        separator token inside ``article``.
        """
        # Read from the frame this dataset was built with, not from a
        # module-level ``df`` that may have been rebound or filtered since.
        row = self.df.iloc[index]
        article_ids = self.tokenizer.encode(row.articles)
        content = (
            article_ids
            + self.tokenizer.encode(self.tokenizer.sep_token)
            + self.tokenizer.encode(row.summaries)
        )
        # Keep every sample at exactly max_len tokens so batches stack and the
        # model's position limit is respected.
        content = content[:self.max_len]
        text = self.tokenizer.encode(self.tokenizer.pad_token) * self.max_len
        text[:len(content)] = content
        text = torch.tensor(text)
        sample = {'article': text, 'sum_idx': len(article_ids)}
        return sample

In [None]:
# Peek at the raw text of one article (row position 13).
(df.iloc[13]).articles

In [None]:
# Wrap the filtered frame in the torch Dataset consumed by train() and the sampling helpers.
news_dataset = NewsData(df,tokenizer)

## Hyperparameters

In [None]:
# Load the pretrained GPT-2 language-model head; it is fine-tuned below.
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Special tokens were added to the tokenizer, so the embedding table is
# resized to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

model.to(device)

In [None]:
import argparse
from datetime import datetime
import os
import time

import numpy as np
from transformers import GPT2LMHeadModel,AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter
import torch
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tnrange, tqdm_notebook
import random



# Training configuration. argparse is used as a typed settings container:
# parse_args([]) ignores the notebook's own command line, so every option
# takes the default declared here.
parser = argparse.ArgumentParser()
parser.add_argument("--lr",default=5e-5, type=float, help="learning rate")
parser.add_argument("--seed",default=42, type=int,  help="seed to replicate results")
parser.add_argument("--n_gpu",default=1, type=int,  help="no of gpu available")
parser.add_argument("--gradient_accumulation_steps",default=2, type=int, help="gradient_accumulation_steps")
parser.add_argument("--batch_size",default=1, type=int,  help="batch_size")
parser.add_argument("--num_workers",default=4, type=int,  help="num of cpus available")
# Fall back to the CPU when no GPU is visible instead of unconditionally
# selecting CUDA, which fails at the first tensor transfer on CPU-only hosts.
parser.add_argument("--device",default=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), help="torch.device object")
parser.add_argument("--num_train_epochs",default=1, type=int,  help="no of epochs of training")
parser.add_argument("--output_dir",default='./output', type=str,  help="path to save evaluation results")
parser.add_argument("--model_dir",default='./weights', type=str,  help="path to save trained model")
parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.")
parser.add_argument("--root_dir",default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.")
parser.add_argument("--ids_file",default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes")
args = parser.parse_args([])
print(args)

In [None]:
# Mean training loss of every optimizer update, appended to by train().
loss_arr = []
def train(args, model, tokenizer, train_dataset, ignore_index):
    """Fine-tune the language model on ``article <sep> summary`` sequences.

    Only the summary tokens (everything after the separator) contribute to the
    loss. Per-update learning rate and loss are written to TensorBoard under
    ``./output/logs`` and the loss is also appended to the module-level
    ``loss_arr``. Every 10 updates a couple of samples are generated and
    printed.

    Args:
        args: namespace providing ``batch_size``, ``num_workers``, ``lr``,
            ``seed``, ``n_gpu``, ``device``, ``num_train_epochs``,
            ``gradient_accumulation_steps`` and ``max_grad_norm``.
        model: GPT-2 language model to fine-tune (modified in place).
        tokenizer: tokenizer matching ``model``; used for sample generation.
        train_dataset: dataset yielding ``{'article': LongTensor, 'sum_idx': int}``.
        ignore_index: label id excluded from the loss (the padding token id).
    """
    writer = SummaryWriter('./output/logs')
    train_sampler = RandomSampler(train_dataset)
    train_dl = DataLoader(train_dataset,sampler=train_sampler,batch_size=args.batch_size,num_workers=args.num_workers)
    loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation
    optimizer = AdamW(model.parameters(),lr=args.lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,100,80000)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = tnrange(int(args.num_train_epochs), desc="Epoch")

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    try:
        for _ in train_iterator:
            epoch_iterator = tqdm_notebook(train_dl, desc="Training")
            for step, batch in enumerate(epoch_iterator):
                inputs = batch['article'].to(args.device)
                sum_idx = batch['sum_idx'].to(args.device)
                model.train()
                logits = model(inputs)[0]

                # Only consider the loss on the reference summary, like seq2seq
                # models: every position up to and including the separator
                # (index sum_idx) is masked out. Masking per row, instead of
                # slicing with the sum_idx tensor, also works for batch sizes
                # above 1 where each row has its own separator position.
                labels = inputs.clone()
                positions = torch.arange(labels.size(-1), device=labels.device)
                labels[positions.unsqueeze(0) <= sum_idx.unsqueeze(1)] = ignore_index

                # The logit at position t predicts the token at position t + 1.
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                loss = loss/args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Clip once, on the fully accumulated gradient, right
                    # before it is applied.
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1
                    # Each micro-batch loss was already divided by the number
                    # of accumulation steps, so the sum over one update is the
                    # mean loss of that update.
                    step_loss = tr_loss - logging_loss
                    writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
                    writer.add_scalar('loss', step_loss, global_step)
                    logging_loss = tr_loss
                    print("loss:", step_loss, end='\n\n')
                    loss_arr.append(step_loss)
                    if step + 1 == args.gradient_accumulation_steps:
                        print('After 1st update: ', end='\n\n')

                if (step + 1) % (10*args.gradient_accumulation_steps) == 0:
                    generate_sample(train_dataset, tokenizer, model, num=2, eval_step=False,device=args.device)
    finally:
        # Flush pending events and release the log file even if training is interrupted.
        writer.close()


In [None]:
from matplotlib import pyplot as plt
# Plot the losses collected in loss_arr; the list is only filled by train(),
# so this cell shows an empty plot unless it is run after training.
plt.plot(loss_arr,'r-')

In [None]:
# Padding positions are excluded from the loss through ignore_index.
ignore_idx = tokenizer.pad_token_id
start = time.time()
train(args, model, tokenizer, news_dataset, ignore_idx)
# Wall-clock training time in minutes.
print('total time: ', (time.time()-start)/60, " minutes", end='\n\n')

In [None]:
# Print generated vs. reference summaries for the first two dataset items.
generate_sample(news_dataset, tokenizer, model, num=2, eval_step=False,device=args.device)



## To Assess Outputs

In [None]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Mask a 1-D logits tensor in place with top-k and/or nucleus (top-p) filtering.

    Args:
        logits: 1-D tensor of unnormalised scores; it is modified in place.
        top_k: when > 0, keep only the ``top_k`` highest-scoring entries.
        top_p: when > 0.0, keep the smallest set of highest-scoring entries
            whose cumulative probability exceeds ``top_p``.
        filter_value: value written into every removed entry.

    Returns:
        The same ``logits`` tensor that was passed in.
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Everything scoring strictly below the k-th best entry is dropped.
        kth_best = torch.topk(logits, top_k).values[-1]
        logits.masked_fill_(logits < kth_best, filter_value)

    if top_p > 0.0:
        ranked_logits, ranked_indices = torch.sort(logits, descending=True)
        cumulative = F.softmax(ranked_logits, dim=-1).cumsum(dim=-1)

        # Entries whose cumulative probability is above the threshold ...
        beyond_nucleus = cumulative > top_p
        # ... shifted right by one so the first entry crossing the threshold
        # is kept and the best entry is never removed.
        drop = torch.cat((beyond_nucleus.new_zeros(1), beyond_nucleus[:-1]))

        logits[ranked_indices[drop]] = filter_value
    return logits


def sample_seq(model, context, length, device, temperature=1, top_k=0, top_p=0.0):
    """Autoregressively sample ``length`` tokens that continue ``context``.

    Args:
        model: language model called as ``model(input_ids=...)`` whose first
            output is the logits tensor.
        context: sequence of prompt token ids.
        length: number of tokens to generate.
        device: device the token tensors are created on.
        temperature: divisor applied to the next-token logits before sampling.
        top_k: forwarded to ``top_k_top_p_filtering``.
        top_p: forwarded to ``top_k_top_p_filtering``.

    Returns:
        LongTensor of shape ``(1, len(context) + length)`` holding the prompt
        followed by the sampled tokens.
    """
    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0)
    # The model can only attend over a limited number of positions; feed it at
    # most that many trailing tokens so long prompts do not overflow the
    # position embeddings while generating.
    max_positions = getattr(getattr(model, 'config', None), 'n_positions', None)
    # Sample without dropout even when called in the middle of training, and
    # put the model back into its previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in tnrange(length):
                window = generated if max_positions is None else generated[:, -max_positions:]
                outputs = model(input_ids=window)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
                next_token_logits = outputs[0][0, -1, :] / temperature
                filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    finally:
        model.train(was_training)
    return generated


def beam_search(model, context, length, beam_size, device, temperature=1):
    """Generate ``beam_size`` continuations of ``context`` with beam search.

    Args:
        model: language model called as ``model(input_ids=...)`` whose first
            output is the logits tensor.
        context: sequence of prompt token ids.
        length: number of tokens in every returned continuation.
        beam_size: number of hypotheses kept after every step.
        device: device the token tensors are created on.
        temperature: divisor applied to the next-token logits.

    Returns:
        ``(scores, sequences)``: ``scores`` is a tensor with the probability of
        each hypothesis (product of its token probabilities) and ``sequences``
        is the matching list of generated token-id lists, best first.
    """
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0)
    # Feed at most as many trailing tokens as the model has positions for.
    max_positions = getattr(getattr(model, 'config', None), 'n_positions', None)

    def _next_log_probs(input_ids):
        # Log-probabilities of the token following ``input_ids``.
        if max_positions is not None:
            input_ids = input_ids[:, -max_positions:]
        logits = model(input_ids=input_ids)[0][0, -1, :] / temperature
        return F.log_softmax(logits, dim=-1)

    # Search without dropout and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            log_probs = _next_log_probs(context)
            vocab_size = log_probs.size(-1)
            # Scores are kept as log-probabilities: products of many
            # probabilities underflow to zero long before `length` tokens.
            scores, indices = torch.topk(log_probs, beam_size)
            sequences = [[token] for token in indices.tolist()]
            for _ in tnrange(length-1):
                candidates = torch.stack([
                    scores[j] + _next_log_probs(
                        torch.cat((context, torch.tensor([sequence], dtype=torch.long, device=device)), dim=1)
                    )
                    for j, sequence in enumerate(sequences)
                ]).view(-1)
                scores, flat_indices = torch.topk(candidates, beam_size)
                # A flat index encodes (parent beam, token). Extend the parent
                # hypothesis the candidate actually came from, not simply the
                # hypothesis at the same rank.
                sequences = [
                    sequences[flat_index // vocab_size] + [flat_index % vocab_size]
                    for flat_index in flat_indices.tolist()
                ]
    finally:
        model.train(was_training)
    return scores.exp(), sequences


def generate_beam_sample(data, tokenizer, model, num=1, length=100, beam_size=3, device=torch.device('cuda')):
    """Print the reference summary and beam-search summaries for the first ``num`` items.

    Args:
        data: dataset whose items are ``{'article': LongTensor, 'sum_idx': int}``
            with the separator token stored at position ``sum_idx``.
        tokenizer: tokenizer used to turn token ids back into text.
        model: language model passed on to ``beam_search``.
        num: number of dataset items (starting at index 0) to process.
        length: number of tokens to generate per hypothesis.
        beam_size: number of hypotheses to generate and print.
        device: device the model inputs are created on.
    """
    for sample_idx in range(num):
        sample = data[sample_idx]
        idx = sample['sum_idx']
        # Keep the separator (position idx) in the prompt, as generate_sample
        # does: the model is trained to emit the summary right after it.
        context = sample['article'][:idx+1].tolist()
        summary = sample['article'][idx+1:][:100].tolist()
        scores, sequences = beam_search(model, context, length, beam_size, device)
        print('actual_summary', end='\n\n')
        print(tokenizer.decode(summary), end='\n\n')
        for rank, (score, sequence) in enumerate(zip(scores, sequences), start=1):
            text = tokenizer.convert_ids_to_tokens(sequence,skip_special_tokens=True)
            text = tokenizer.convert_tokens_to_string(text)
            print("generated_summary-{} and Score is {}.".format(rank, score), end='\n\n')
            print(text, end='\n\n')


def generate_sample(data, tokenizer, model, num=1, eval_step=False, length=100, temperature=1, top_k=10, top_p=0.5, device=torch.device('cuda')):
    """Sample a summary for each of the first ``num`` items and print it.

    Args:
        data: dataset whose items are ``{'article': LongTensor, 'sum_idx': int}``
            with the separator token stored at position ``sum_idx``.
        tokenizer: tokenizer used to turn token ids back into text.
        model: language model passed on to ``sample_seq``.
        num: number of dataset items (starting at index 0) to process.
        eval_step: when false, the reference summary is printed as well.
        length: number of tokens to sample.
        temperature: sampling temperature passed to ``sample_seq``.
        top_k: top-k cut-off passed to ``sample_seq``.
        top_p: nucleus cut-off passed to ``sample_seq``.
        device: device the model inputs are created on.
    """
    for sample_idx in range(num):
        sample = data[sample_idx]
        idx = sample['sum_idx']
        # Prompt = article tokens plus the separator at position idx.
        context = sample['article'][:idx+1].tolist()
        summary = sample['article'][idx+1:][:100].tolist()
        generated_text = sample_seq(model, context, length, device, temperature, top_k, top_p)
        # Drop the prompt; keep only the newly sampled tokens.
        generated_text = generated_text[0, len(context):].tolist()
        text = tokenizer.convert_ids_to_tokens(generated_text,skip_special_tokens=True)
        text = tokenizer.convert_tokens_to_string(text)
        if not eval_step:
            print('new_article', end='\n\n')
            print(tokenizer.decode(context), end='\n\n')
            print("generated_summary", end='\n\n')
            print(text, end='\n\n')
            print('actual_summary', end='\n\n')
            print(tokenizer.decode(summary), end='\n\n')
        else:
            print(tokenizer.decode(context), end='\n\n')
            print("generated_summary", end='\n\n')
            # The header used to be printed without the generated text itself.
            print(text, end='\n\n')

In [None]:
# Inspect one example: the token ids before position sum_idx (the article
# part of the sequence) and how many of them there are.
sample = news_dataset.__getitem__(1)
idx = sample['sum_idx']
context = ((sample['article'])[:idx]).tolist()
len(context)


In [None]:
# Free-text passage (not part of the dataset) used to try the model on unseen input.
test_sentence = ''' In first half of 20th century, factions of Indian National Congress continued to remain getting identified with "Hindu politics" and ideas of a Hindu nation. The word "Hindu", throughout history, had been used as an inclusive description that lacked a definition and was used to refer to the native traditions and people of India. It was only in the late 18th century that the word "Hindu" came to be used extensively with religious connotation, while still being used as a synecdoche describing the indigenous traditions. Hindu nationalist ideologies and political languages were very diverse both linguistically and socially. Since Hinduism does not represent an identifiable religious group, the terms such as 'Hindu nationalism', 'Hindu', are considered problematic in the case of religious and nationalism discourse. As Hindus were identifiable as a homogeneous community, some individual Congress leaders were able to induce a symbolism with "Hindu" meaning inside the general stance of a secular nationalism.[12][13]

The diversity of Indian cultural groups and moderate positions of Hindu nationalism have sometimes made it regarded as cultural nationalism than a religious one.'''

In [None]:
# Summarise the free-text passage with the fine-tuned model. The prompt must
# end with the separator token: training sequences are laid out as
# article + separator + summary, so without it the model is asked to continue
# the article rather than to start a summary.
context = tokenizer.encode(test_sentence) + tokenizer.encode(tokenizer.sep_token)
generated_text = sample_seq(model, context, 100, device, 1, 10, 0.5)
# Keep only the newly sampled tokens and turn them back into text.
generated_text = generated_text[0, len(context):].tolist()
text = tokenizer.convert_ids_to_tokens(generated_text,skip_special_tokens=True)
text = tokenizer.convert_tokens_to_string(text)
print(text)

In [None]:
# Provides the `newspaper` module (Article) imported further down.
!pip install newspaper3k

In [None]:
# Provides the GoogleNews client used in the last cell.
!pip install pygooglenews 

In [None]:
from transformers import pipeline
import os

## Setting to use the 0th GPU
# NOTE(review): torch has already been imported and used in earlier cells;
# confirm that changing this variable at this point still has an effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# No model name is passed, so the library picks its default summarization
# checkpoint. This is a separate model from the GPT-2 fine-tuned above.
summarizer = pipeline("summarization")

from newspaper import Article



In [None]:
# NOTE(review): toi_article is only assigned inside the news-fetching loop in
# the last cell, so this cell raises NameError unless that cell has run first.
# Only the first 3423 characters of the article text are summarised.
summary_text = summarizer(toi_article.text[:3423], max_length=200, min_length=150, do_sample=False)[0]['summary_text']
print(summary_text)

In [None]:
# Same passage as used for the GPT-2 sampling test, without the citation
# markers and the blank line, summarised here with the pipeline for comparison.
test_sentence = ''' In first half of 20th century, factions of Indian National Congress continued to remain getting identified with "Hindu politics" and ideas of a Hindu nation. The word "Hindu", throughout history, had been used as an inclusive description that lacked a definition and was used to refer to the native traditions and people of India. It was only in the late 18th century that the word "Hindu" came to be used extensively with religious connotation, while still being used as a synecdoche describing the indigenous traditions. Hindu nationalist ideologies and political languages were very diverse both linguistically and socially. Since Hinduism does not represent an identifiable religious group, the terms such as 'Hindu nationalism', 'Hindu', are considered problematic in the case of religious and nationalism discourse. As Hindus were identifiable as a homogeneous community, some individual Congress leaders were able to induce a symbolism with "Hindu" meaning inside the general stance of a secular nationalism.
The diversity of Indian cultural groups and moderate positions of Hindu nationalism have sometimes made it regarded as cultural nationalism than a religious one.'''

# Deterministic decoding (do_sample=False) with a summary of 5 to 100 tokens.
summary_text = summarizer(test_sentence, max_length=100, min_length=5, do_sample=False)[0]['summary_text']
print(summary_text)

In [None]:
from pygooglenews import GoogleNews
from newspaper.article import ArticleException

# Summaries of the fetched news articles. Note that this rebinds the
# `summaries` name that earlier held the dataset's reference summaries.
summaries = []
gn = GoogleNews()
search = gn.search("modi")
for item in search["entries"]:
    # A feed entry's "links" field is a list of link dicts; Article needs the
    # URL string, which the entry exposes as "link".
    l = item["link"]
    toi_article = Article(l, language="en") # en for English
    try:
        toi_article.download()
        toi_article.parse()
    except ArticleException:
        # Skip articles that cannot be fetched or parsed instead of aborting the whole run.
        continue
    if not toi_article.text.strip():
        # Nothing to summarise (e.g. the page had no extractable body text).
        continue
    summaries.append(summarizer(toi_article.text[:3423], max_length=200, min_length=150, do_sample=False)[0]['summary_text'])