<a href="https://colab.research.google.com/github/thakkar-hiren/News-Summarisation/blob/main/6_News_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Summarization Using BERT Embeddings and TextRank Algorithm
This file contains an implementation of text summarization using BERT (Bidirectional Encoder Representations from Transformers) embeddings and the TextRank algorithm. The process begins by loading a dataset containing news articles and preprocessed text. The pre-trained BERT model and tokenizer are then loaded from the Hugging Face Transformers library. The text is tokenized into sentences, and BERT embeddings are obtained for each sentence using the pre-trained model. Cosine similarity is utilized to construct a similarity matrix based on the BERT embeddings. The TextRank algorithm is applied to rank sentences based on their similarity scores, and the top-ranked sentences are selected to form the summary. The number of sentences in the summary is customizable, with options for generating summaries of 3 or 5 sentences. Additionally, the file includes evaluation functions to compute the average ROUGE (Recall-Oriented Understudy for Gisting Evaluation) scores, providing insights into the quality of the generated summaries. This implementation leverages the power of BERT embeddings to capture contextual information and generate informative summaries, making it suitable for various text summarization tasks across different domains.

# Installing Libraries

In [None]:
# gdown is the command-line downloader used in the next cell to fetch the dataset.
!pip install gdown

In [None]:
!gdown --id 1QGknCbFF7C5IKQ69VlyhiCxVTn39sF30

In [None]:
# Hugging Face Transformers provides the BertTokenizer and BertModel imported below.
!pip install transformers

# Loading Dataset

In [None]:
import os
import pandas as pd

# Keep using the Kaggle working directory when the file is there; otherwise
# (e.g. on Colab) fall back to the current working directory.
# NOTE(review): assumes the downloaded file is named 'filtered_news_data.csv' - confirm.
DATA_FILE = 'filtered_news_data.csv'
kaggle_path = os.path.join('/kaggle/working', DATA_FILE)
data_path = kaggle_path if os.path.exists(kaggle_path) else DATA_FILE
news_data = pd.read_csv(data_path)
news_data

In [None]:
import torch
from transformers import BertTokenizer, BertModel

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

# Loading Model

In [None]:
# Load the pre-trained BERT tokenizer and encoder from the same checkpoint,
# then move the encoder's weights onto the selected device.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model = model.to(device)

In [None]:
import nltk
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from the separate 'punkt_tab'
# resource, so fetch it as well; without it sent_tokenize raises a LookupError there.
nltk.download('punkt_tab')

In [None]:
# Obtain BERT embeddings for a sentence
def get_bert_embeddings(sentence):
    """Return the final-layer [CLS] embedding(s) for ``sentence`` as a CPU tensor.

    ``sentence`` is handed to the tokenizer unchanged. A single string is
    encoded as one sequence; a list of strings is treated by the Hugging Face
    tokenizer as a *batch* of independent sequences (one per list item), not as
    the tokens of one sentence.

    Args:
        sentence: Text (or batch of texts) to encode. Inputs longer than the
            tokenizer's maximum length are truncated.

    Returns:
        A 2-D tensor on the CPU with one row per encoded sequence, holding the
        last hidden state at position 0 of each sequence.
    """
    encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

    # Follow the model's actual device instead of re-deriving it, so the inputs
    # always land next to the weights even if the model was moved elsewhere.
    model_device = next(model.parameters()).device
    encoded = {key: value.to(model_device) for key, value in encoded.items()}

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**encoded)

    # Position 0 of every sequence is the [CLS] token; its hidden state is used
    # as the sequence representation (this is [CLS] pooling, not mean pooling).
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.to('cpu')

In [None]:
# Calculate similarity matrix based on BERT embeddings
def cosine_similarity_matrix(embeddings):
    """Return the pairwise cosine similarities between pooled embeddings.

    ``embeddings`` is first averaged over dimension 1, leaving one vector per
    entry along dimension 0; every such vector is then compared with every
    other one.

    Args:
        embeddings: Tensor with at least three dimensions whose first
            dimension indexes the items being compared.

    Returns:
        The square similarity matrix produced by scikit-learn's
        ``cosine_similarity``.
    """
    pooled = embeddings.mean(dim=1)
    return cosine_similarity(pooled, pooled)

In [None]:
# TextRank function
def textrank(similarity_matrix, damping_factor=0.85, n_iterations=250, tol=1e-8):
    """Score sentences with a damped power iteration (PageRank) over their similarities.

    The similarity matrix is turned into a row-stochastic transition matrix
    before iterating. Without that normalization the scores are multiplied by
    roughly ``damping_factor * row_sum`` on every step and overflow to ``inf``
    for articles with a few dozen sentences, which makes the ranking meaningless.

    Args:
        similarity_matrix: Square ``(n, n)`` array of pairwise sentence
            similarities. It is not modified.
        damping_factor: Probability of following an edge rather than jumping
            to a random sentence.
        n_iterations: Maximum number of power-iteration steps.
        tol: Stop early once the L1 change between two steps drops below this.

    Returns:
        A 1-D ``numpy`` array of length ``n`` with one score per sentence
        (empty when ``n`` is 0).
    """
    weights = np.asarray(similarity_matrix, dtype=float)
    n_sentences = weights.shape[0]
    if n_sentences == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return np.zeros(0)

    # Edge weights must be non-negative for the rows to form probability
    # distributions; cosine similarity can dip below zero.
    weights = np.clip(weights, 0.0, None)
    row_sums = weights.sum(axis=1, keepdims=True)
    # A sentence with no positive similarity keeps an all-zero row instead of
    # triggering a division by zero.
    row_sums[row_sums == 0] = 1.0
    # Column j of `transition` holds how sentence j splits its score.
    transition = (weights / row_sums).T

    ranks = np.full(n_sentences, 1.0 / n_sentences)  # Initialize page ranks
    teleport = (1 - damping_factor) / n_sentences

    for _ in range(n_iterations):
        new_ranks = teleport + damping_factor * transition.dot(ranks)
        converged = np.abs(new_ranks - ranks).sum() < tol
        ranks = new_ranks
        if converged:
            break
    return ranks

In [None]:
def pad_sentences(tokenized_sentences):
    """Right-pad every token list with ``'[PAD]'`` to the length of the longest one.

    Args:
        tokenized_sentences: Iterable of token sequences, one per sentence.

    Returns:
        A new list of new lists, all of equal length; the inputs are left
        untouched. An empty input yields an empty list.
    """
    # Materialize once so generators can be measured and then padded.
    tokenized_sentences = list(tokenized_sentences)
    # default=0 keeps an empty input from raising ValueError in max().
    max_length = max((len(tokens) for tokens in tokenized_sentences), default=0)
    return [
        list(tokens) + ['[PAD]'] * (max_length - len(tokens))
        for tokens in tokenized_sentences
    ]

In [None]:
# Summarization function
def summarize_article(article_text, nos=3):
    """Build an extractive summary from the ``nos`` highest-ranked sentences.

    The article is split into sentences, each sentence is embedded with BERT,
    the sentences are ranked with TextRank over their cosine similarities, and
    the winners are joined in their original order.

    Args:
        article_text: Article to summarize.
        nos: Number of sentences wanted in the summary.

    Returns:
        The selected sentences joined by single spaces. The whole article's
        sentences are returned when it has no more than ``nos`` of them, and
        an empty string when the article has no sentences or ``nos`` is not
        positive.
    """
    # Split once and reuse; re-tokenizing per selected sentence is wasted work.
    sentences = sent_tokenize(article_text)

    # Guard nos <= 0 explicitly: a slice like [-0:] would select every sentence.
    if nos <= 0 or not sentences:
        return ""
    # Every sentence would be selected anyway, so skip the model entirely.
    if len(sentences) <= nos:
        return " ".join(sentences)

    # Pass each raw sentence string to the encoder. Passing a list of word-piece
    # tokens instead makes the tokenizer treat every token as its own one-word
    # sequence, which discards context and lets '[PAD]' entries dilute the result.
    sentence_embeddings = torch.stack(
        [get_bert_embeddings(sentence) for sentence in sentences], dim=0
    )

    similarity_matrix = cosine_similarity_matrix(sentence_embeddings)
    ranks = textrank(similarity_matrix)

    # argsort is ascending, so the last `nos` indices are the best-ranked ones;
    # sorting them restores the article's reading order.
    top_sentence_indices = ranks.argsort()[-nos:]
    return " ".join(sentences[i] for i in sorted(top_sentence_indices))

In [None]:
# Generate the default 3-sentence summary for every article
news_data['predictedSummary_3'] = news_data['preprocessed_ctext'].apply(summarize_article)

In [None]:
# Generate a 5-sentence summary for every article
news_data['predictedSummary_5'] = news_data['preprocessed_ctext'].apply(summarize_article, nos=5)

# Model Evaluation

In [None]:
# rouge_score supplies the rouge_scorer module used by the evaluation functions below.
!pip install rouge_score

In [None]:
# Model evaluation function
from rouge_score import rouge_scorer

def evaluate_summaries_3(news_data):
    """Print average ROUGE-1, ROUGE-2 and ROUGE-L scores for the 3-sentence summaries.

    Every row's ``predictedSummary_3`` is scored against its
    ``preprocessed_text``; precision, recall and F-measure are then averaged
    over all rows for each ROUGE type.

    Args:
        news_data: DataFrame with ``preprocessed_text`` and
            ``predictedSummary_3`` columns.

    Returns:
        None. The nine averages are printed, one per line.
    """
    rouge_labels = {'rouge1': 'ROUGE-1', 'rouge2': 'ROUGE-2', 'rougeL': 'ROUGE-L'}
    metric_labels = {'precision': 'Precision', 'recall': 'Recall', 'fmeasure': 'F1-Score'}

    scorer = rouge_scorer.RougeScorer(rouge_types=list(rouge_labels))
    per_article = [
        scorer.score(target=row['preprocessed_text'], prediction=row['predictedSummary_3'])
        for _, row in news_data.iterrows()
    ]

    # Compute every average before printing anything, grouped by ROUGE type.
    report = []
    for rouge_key, rouge_label in rouge_labels.items():
        for attribute, metric_label in metric_labels.items():
            values = [getattr(scores[rouge_key], attribute) for scores in per_article]
            report.append((f"Average {rouge_label} {metric_label}: ", np.mean(values)))

    for caption, average in report:
        print(caption, average)

In [None]:
def evaluate_summaries_5(news_data):
    """Print average ROUGE-1, ROUGE-2 and ROUGE-L scores for the 5-sentence summaries.

    Every row's ``predictedSummary_5`` is scored against its
    ``preprocessed_text``; precision, recall and F-measure are then averaged
    over all rows for each ROUGE type.

    Args:
        news_data: DataFrame with ``preprocessed_text`` and
            ``predictedSummary_5`` columns.

    Returns:
        None. The nine averages are printed, one per line.
    """
    rouge_labels = {'rouge1': 'ROUGE-1', 'rouge2': 'ROUGE-2', 'rougeL': 'ROUGE-L'}
    metric_labels = {'precision': 'Precision', 'recall': 'Recall', 'fmeasure': 'F1-Score'}

    scorer = rouge_scorer.RougeScorer(rouge_types=list(rouge_labels))
    per_article = [
        scorer.score(target=row['preprocessed_text'], prediction=row['predictedSummary_5'])
        for _, row in news_data.iterrows()
    ]

    # Compute every average before printing anything, grouped by ROUGE type.
    report = []
    for rouge_key, rouge_label in rouge_labels.items():
        for attribute, metric_label in metric_labels.items():
            values = [getattr(scores[rouge_key], attribute) for scores in per_article]
            report.append((f"Average {rouge_label} {metric_label}: ", np.mean(values)))

    for caption, average in report:
        print(caption, average)

In [None]:
# Report the ROUGE averages for both summary lengths, separated by a blank gap.
print("For BERT:- ")
reports = (
    ("Evaluation for the summary of 3 sentences: \n", evaluate_summaries_3),
    ("Evaluation for the summary of 5 sentences: \n", evaluate_summaries_5),
)
for position, (heading, evaluate) in enumerate(reports):
    if position:
        # Separator printed only between reports, not before the first one.
        print("\n")
    print(heading)
    evaluate(news_data)

In [None]:
# Show the 3-sentence summary stored under index label 0.
news_data.loc[0, 'predictedSummary_3']

In [None]:
# Show the 5-sentence summary stored under index label 0.
news_data.loc[0, 'predictedSummary_5']